def punctured_neighbourhood(
    target_word: str,
    word_to_int: dict,
    word_embeddings_norm: np.ndarray,
    neighbourhood_size: int,
    word_embeddings_pairwise_dists: np.ndarray,
    ann_instance: ApproxNN,
) -> np.ndarray:
    """
    Finds a punctured neighbourhood around a target word, i.e. its nearest
    neighbours excluding the target word itself. Distances are Euclidean
    distances between normalized word embeddings (which rank neighbours the
    same way as cosine distances).

    Parameters
    ----------
    target_word : str
        Target word (w).
    word_to_int : dict of str and int
        Dictionary mapping from word to its integer representation.
    word_embeddings_norm : np.ndarray
        Normalized word embeddings.
    neighbourhood_size : int
        Neighbourhood size (n).
    word_embeddings_pairwise_dists : np.ndarray
        Pairwise distances between word embeddings.
    ann_instance : ApproxNN
        Approximate nearest neighbour (ANN) instance, built on the word embeddings.
        If specified (not None), the ANN index is used to find punctured neighbourhoods.

    Returns
    -------
    neighbouring_word_embeddings : np.ndarray
        Neighbouring word embeddings of `target_word`, excluding the word itself.
    """
    # Find neighbouring words (excluding the target word itself)
    target_word_int = word_to_int[target_word]
    if ann_instance is not None:
        neighbourhood_sorted_indices = ann_instance.search(
            query_vector=word_embeddings_norm[target_word_int],
            k_neighbours=neighbourhood_size,
            excluded_neighbour_indices=[target_word_int],
        )
    else:
        if word_embeddings_pairwise_dists is not None:
            neighbourhood_distances = word_embeddings_pairwise_dists[target_word_int]
        else:
            neighbourhood_distances = vector_to_matrix_distance(
                u=word_embeddings_norm[target_word_int],
                m=word_embeddings_norm,
                metric=fastdist.euclidean,
                metric_name="euclidean",
            )
        # Skip the first index, which is the target word itself (distance 0)
        neighbourhood_sorted_indices = np.argsort(neighbourhood_distances)[
            1 : neighbourhood_size + 1
        ]
    neighbouring_word_embeddings = word_embeddings_norm[neighbourhood_sorted_indices]
    return neighbouring_word_embeddings
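# Below is a minimal usage sketch of `punctured_neighbourhood` (not part of the
# original pipeline): it builds a tiny synthetic vocabulary, normalizes the
# vectors and passes precomputed pairwise distances instead of an ANN index.
# The toy words and dimensions are illustrative assumptions only.
def _example_punctured_neighbourhood() -> None:
    rng = np.random.default_rng(seed=0)
    toy_words = ["apple", "banana", "cherry", "date", "elderberry"]
    toy_word_to_int = {word: i for i, word in enumerate(toy_words)}
    toy_embeddings = rng.normal(size=(len(toy_words), 8))
    toy_embeddings_norm = toy_embeddings / np.linalg.norm(
        toy_embeddings, axis=1
    ).reshape(-1, 1)

    # Pairwise Euclidean distances between the normalized vectors
    toy_pairwise_dists = np.linalg.norm(
        toy_embeddings_norm[:, None, :] - toy_embeddings_norm[None, :, :], axis=-1
    )

    neighbours = punctured_neighbourhood(
        target_word="apple",
        word_to_int=toy_word_to_int,
        word_embeddings_norm=toy_embeddings_norm,
        neighbourhood_size=3,
        word_embeddings_pairwise_dists=toy_pairwise_dists,
        ann_instance=None,
    )
    print(neighbours.shape)  # Expected: (3, 8)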
def get_knn_func_data_points(
    data_points: np.ndarray,
    pairwise_distances: np.ndarray = None,
    approx_nn: ApproxNN = None,
    metric: Callable = fastdist.euclidean,
    metric_name: str = "euclidean",
) -> KnnFunc:
    """
    Gets a K-nearest neighbour callable for data points, used in `compute_gad`.

    Parameters
    ----------
    data_points : np.ndarray
        Data points.
    pairwise_distances : np.ndarray, optional
        Pairwise distances of data points (defaults to None).
    approx_nn : ApproxNN, optional
        ApproxNN instance.
    metric : Callable, optional
        fastdist metric; only required if `pairwise_distances` and `approx_nn`
        are None (defaults to fastdist.euclidean).
    metric_name : str, optional
        String name of the `metric` callable (defaults to "euclidean").

    Returns
    -------
    knn_func : KnnFunc
        K-nearest neighbour callable for data points.
    """
    if approx_nn is not None:
        return lambda point_idx, k_neighbours: approx_nn.search(
            query_vector=data_points[point_idx],
            k_neighbours=k_neighbours,
            excluded_neighbour_indices=[point_idx],
            return_distances=True,
        )
    elif pairwise_distances is not None:
        return lambda point_idx, k_neighbours: get_nearest_neighbours(
            distances=pairwise_distances[point_idx],
            k_neighbours=k_neighbours,
        )
    else:
        return lambda point_idx, k_neighbours: get_nearest_neighbours(
            distances=fastdist.vector_to_matrix_distance(
                u=data_points[point_idx],
                m=data_points,
                metric=metric,
                metric_name=metric_name,
            ),
            k_neighbours=k_neighbours,
        )
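# Minimal usage sketch of `get_knn_func_data_points` (illustrative only): the
# returned callable is queried with a point index and a neighbour count, here
# backed by precomputed pairwise distances on synthetic data. The exact return
# value (indices and distances) is whatever `get_nearest_neighbours` yields.
def _example_get_knn_func_data_points() -> None:
    rng = np.random.default_rng(seed=0)
    toy_points = rng.normal(size=(100, 10))
    toy_pairwise_dists = np.linalg.norm(
        toy_points[:, None, :] - toy_points[None, :, :], axis=-1
    )

    knn_func = get_knn_func_data_points(
        data_points=toy_points,
        pairwise_distances=toy_pairwise_dists,
    )

    # Query the 5 nearest neighbours of point 0
    knn_result = knn_func(0, 5)
    print(knn_result)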
def postprocess_word2vec_embeddings(
    model_training_output_dir: str,
    model_name: str,
    dataset_name: str,
    vocab_size: int,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Applies post-processing to trained word2vec word embeddings:
    - Saves normalized word embeddings
    - Creates approximate nearest-neighbour indices using Annoy and ScaNN

    Parameters
    ----------
    model_training_output_dir : str
        word2vec model training output directory.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    vocab_size : int
        Size of the vocabulary to use, -1 denotes all words.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method.
        Higher scaling => higher precision.
    """
    # Load output from training word2vec
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_training_output_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )
    last_embedding_weights = w2v_training_output["last_embedding_weights"]

    use_full_vocab = False
    if vocab_size == -1:
        vocab_size = last_embedding_weights.shape[0]
        use_full_vocab = True

    # Define filepaths
    last_embedding_weights_filepath = w2v_training_output[
        "last_embedding_weights_filepath"
    ]
    last_embedding_weights_filepath_no_ext = Path(last_embedding_weights_filepath).stem
    if use_full_vocab:
        last_embedding_weights_normalized_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_normalized.npy",
        )
    else:
        last_embedding_weights_normalized_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_{vocab_size}_normalized.npy",
        )
    if use_full_vocab:
        model_annoy_index_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_annoy_index.ann",
        )
        model_scann_artifacts_dir = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_scann_artifacts",
        )
    else:
        model_annoy_index_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_{vocab_size}_annoy_index.ann",
        )
        model_scann_artifacts_dir = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_{vocab_size}_scann_artifacts",
        )

    # Normalize word embeddings and save to file
    if not isfile(last_embedding_weights_normalized_filepath):
        print("Normalizing word embeddings and saving to file...")

        # Normalize word embeddings
        if use_full_vocab:
            last_embedding_weights_in_vocab = last_embedding_weights
        else:
            last_embedding_weights_in_vocab = last_embedding_weights[:vocab_size]
        last_embedding_weights_normalized = (
            last_embedding_weights_in_vocab
            / np.linalg.norm(last_embedding_weights_in_vocab, axis=1).reshape(-1, 1)
        )
        np.save(
            last_embedding_weights_normalized_filepath,
            last_embedding_weights_normalized,
        )
        print("Done!")
    else:
        last_embedding_weights_normalized = np.load(
            last_embedding_weights_normalized_filepath
        )

    annoy_index_created = isfile(model_annoy_index_filepath)
    scann_instance_created = isdir(model_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        # Add word embeddings to index and build it
        if use_full_vocab:
            last_embedding_weights_normalized_in_vocab = (
                last_embedding_weights_normalized
            )
        else:
            last_embedding_weights_normalized_in_vocab = (
                last_embedding_weights_normalized[:vocab_size]
            )
        if not isfile(model_annoy_index_filepath):
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=last_embedding_weights_normalized_in_vocab,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(model_annoy_index_filepath)
        if not isdir(model_scann_artifacts_dir):
            scann_instance = ApproxNN(ann_alg="scann")
            scann_instance.build(
                data=last_embedding_weights_normalized_in_vocab,
                distance_measure="dot_product",
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            scann_instance.save(model_scann_artifacts_dir)
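# Example invocation of `postprocess_word2vec_embeddings` (a sketch; the
# directory, model and dataset names as well as the Annoy/ScaNN parameters are
# placeholders, not values prescribed by this code base):
def _example_postprocess_word2vec_embeddings() -> None:
    postprocess_word2vec_embeddings(
        model_training_output_dir="output/word2vec_training/word2vec_enwiki",
        model_name="word2vec",
        dataset_name="enwiki",
        vocab_size=-1,  # -1 => use the full vocabulary
        annoy_index_n_trees=500,
        scann_num_leaves_scaling=5,
    )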
def load_model_training_output(
    model_training_output_dir: str,
    model_name: str,
    dataset_name: str,
    word_embeddings_mmap_mode: str = "r",
    return_normalized_embeddings: bool = False,
    return_annoy_instance: bool = False,
    annoy_instance_prefault: bool = False,
    return_scann_instance: bool = False,
    return_scann_instance_filepath: bool = False,
) -> dict:
    """
    Loads and returns a dict object containing output from word2vec training.

    Parameters
    ----------
    model_training_output_dir : str
        word2vec model training output directory.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    word_embeddings_mmap_mode : str, optional
        Memmap mode to use when loading last word embedding weights
        (defaults to "r", or read).
    return_normalized_embeddings : bool, optional
        Whether or not to return last embedding weights, normalized,
        if they are present (defaults to False).
    return_annoy_instance : bool, optional
        Whether or not to return the Annoy index fit on last embedding weights,
        if it is present (defaults to False).
    annoy_instance_prefault : bool, optional
        Whether or not to enable the `prefault` option when loading the Annoy index.
        `return_annoy_instance` must be set to True to have an effect
        (defaults to False).
    return_scann_instance : bool, optional
        Whether or not to return the ScaNN instance fit on the last embedding
        weights, if it is present (defaults to False).
    return_scann_instance_filepath : bool, optional
        Whether or not to return the filepath of the ScaNN instance fit on the
        last word embedding weights, if it is present (defaults to False).

    Returns
    -------
    model_training_output : dict
        Dictionary containing output from word2vec training.
    """
    # Get filepaths of the model output
    checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
        output_dir=model_training_output_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )

    # Get last word embeddings from training
    last_embedding_weights_filepath = checkpoint_filepaths_dict[
        "intermediate_embedding_weight_filepaths"
    ][-1]
    last_embedding_weights = np.load(
        last_embedding_weights_filepath, mmap_mode=word_embeddings_mmap_mode
    )

    # Get word counts from tokenizer of word2vec model
    with open(
        checkpoint_filepaths_dict["train_word_counts_filepath"], "r"
    ) as word_counts_file:
        word_counts = np.array(
            [int(word_count) for word_count in word_counts_file.read().split("\n")]
        )

    # Get array of words and word_to_int lookup dictionary
    with open(checkpoint_filepaths_dict["train_words_filepath"], "r") as words_file:
        words = np.array(words_file.read().split("\n"))
    word_to_int = {word: i for i, word in enumerate(words)}

    # Normalized embedding weights
    last_embedding_weights_normalized = None
    if (
        return_normalized_embeddings
        and "intermediate_embedding_weight_normalized_filepaths"
        in checkpoint_filepaths_dict
    ):
        last_embedding_weights_normalized = np.load(
            checkpoint_filepaths_dict[
                "intermediate_embedding_weight_normalized_filepaths"
            ][-1],
            mmap_mode="r",
        )

    # Annoy index
    last_embedding_weights_annoy_instance = None
    if (
        return_annoy_instance
        and "intermediate_embedding_weight_annoy_index_filepaths"
        in checkpoint_filepaths_dict
    ):
        last_embedding_weights_annoy_instance = ApproxNN(ann_alg="annoy")
        last_embedding_weights_annoy_instance.load(
            ann_path=checkpoint_filepaths_dict[
                "intermediate_embedding_weight_annoy_index_filepaths"
            ][-1],
            annoy_data_dimensionality=last_embedding_weights.shape[1],
            annoy_mertic="euclidean",
            annoy_prefault=annoy_instance_prefault,
        )

    # ScaNN instance
    last_embedding_weights_scann_instance = None
    last_embedding_weights_scann_instance_filepath = None
    if "intermediate_embedding_weight_scann_artifact_dirs" in checkpoint_filepaths_dict:
        scann_instance_filepath = checkpoint_filepaths_dict[
            "intermediate_embedding_weight_scann_artifact_dirs"
        ][-1]
        if return_scann_instance:
            last_embedding_weights_scann_instance = ApproxNN(ann_alg="scann")
            last_embedding_weights_scann_instance.load(
                ann_path=scann_instance_filepath
            )
        if return_scann_instance_filepath:
            last_embedding_weights_scann_instance_filepath = scann_instance_filepath

    return {
        "last_embedding_weights": last_embedding_weights,
        "last_embedding_weights_filepath": last_embedding_weights_filepath,
        "last_embedding_weights_normalized": last_embedding_weights_normalized,
        "last_embedding_weights_annoy_instance": last_embedding_weights_annoy_instance,
        "last_embedding_weights_scann_instance": last_embedding_weights_scann_instance,
        "last_embedding_weights_scann_instance_filepath": last_embedding_weights_scann_instance_filepath,
        "words": words,
        "word_to_int": word_to_int,
        "word_counts": word_counts,
    }
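# Sketch of how the dict returned by `load_model_training_output` is typically
# consumed (the directory, model and dataset names below are placeholders;
# replace them with your own training output):
def _example_load_model_training_output() -> None:
    w2v_training_output = load_model_training_output(
        model_training_output_dir="output/word2vec_training/word2vec_enwiki",
        model_name="word2vec",
        dataset_name="enwiki",
        return_normalized_embeddings=True,
    )
    last_embedding_weights = w2v_training_output["last_embedding_weights"]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]
    print(last_embedding_weights.shape, len(words), word_to_int[words[0]])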
def preprocess_google_news( raw_data_dir: str, output_dir: str, annoy_index_n_trees: int, scann_num_leaves_scaling: int, ) -> None: """ Downloads and preprocessed external word embeddings from [1]. Parameters ---------- raw_data_dir : str Path to the raw data directory (where files will be downloaded to). output_dir : str Output directory to save processed data. annoy_index_n_trees : int Number of trees to pass to Annoys build method. More trees => higher precision. scann_num_leaves_scaling : int Number of leaves scaling to pass to ScaNNs build method. Higher scaling => higher precision. References ---------- .. [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality (https://arxiv.org/pdf/1310.4546.pdf). In Proceedings of NIPS, 2013. """ # Ensure output directory exists output_dir = join(output_dir, "GoogleNews") makedirs(output_dir, exist_ok=True) # Define filepaths google_news_vectors_zip_raw_download_url = "https://filesender.uninett.no/download.php?token=b0aea55e-72a7-4ac0-9409-8d5dbb322505&files_ids=645861" google_news_vectors_zip_raw_filename = "GoogleNews-vectors-negative300.bin.gz" google_news_vectors_zip_raw_filepath = join( raw_data_dir, google_news_vectors_zip_raw_filename ) google_news_vectors_bin_raw_filepath = join( raw_data_dir, "GoogleNews-vectors-negative300.bin" ) google_news_words_filepath = join( output_dir, "GoogleNews-vectors-negative300_words.txt" ) google_news_vectors_filepath = join( output_dir, "GoogleNews-vectors-negative300.npy" ) google_news_normalized_vectors_filepath = join( output_dir, "GoogleNews-vectors-negative300_normalized.npy" ) google_news_vectors_annoy_index_filepath = join( output_dir, "GoogleNews-vectors-negative300_annoy_index.ann" ) google_news_vectors_scann_artifacts_dir = join( output_dir, "GoogleNews-vectors-negative300_scann_artifacts" ) # -- GoogleNews-vectors-negative300.bin.gz -- if not isfile(google_news_vectors_zip_raw_filepath): print(f"Downloading {google_news_vectors_zip_raw_filename}...") download_from_url( url=google_news_vectors_zip_raw_download_url, destination_filepath=google_news_vectors_zip_raw_filepath, ) print("Done!") if not isfile(google_news_vectors_bin_raw_filepath): print(f"Extracting {google_news_vectors_zip_raw_filename}...") with gzip.GzipFile(google_news_vectors_zip_raw_filepath, "rb") as gzip_file_raw: with open(google_news_vectors_bin_raw_filepath, "wb") as gzip_file_output: gzip_file_output.write(gzip_file_raw.read()) print("Done!") # Parse vectors from binary file and save result should_load_vectors = ( not isfile(google_news_words_filepath) or not isfile(google_news_vectors_filepath) or not isfile(google_news_normalized_vectors_filepath) ) if should_load_vectors: google_news_word_embeddings, google_news_words = load_word2vec_binary_format( word2vec_filepath=google_news_vectors_bin_raw_filepath, tqdm_enabled=True, ) # Save words if not isfile(google_news_words_filepath): with open(google_news_words_filepath, "w") as file: for i, word in enumerate(google_news_words): if i > 0: file.write("\n") file.write(word) # Save word embeddings if not isfile(google_news_vectors_filepath): np.save(google_news_vectors_filepath, google_news_word_embeddings) # Save normalized word embeddings google_news_word_embeddings_normalized = None if not isfile(google_news_normalized_vectors_filepath): google_news_word_embeddings_normalized = ( google_news_word_embeddings / np.linalg.norm(google_news_word_embeddings, axis=1).reshape(-1, 1) ) 
np.save( google_news_normalized_vectors_filepath, google_news_word_embeddings_normalized, ) annoy_index_created = isfile(google_news_vectors_annoy_index_filepath) scann_instance_created = isdir(google_news_vectors_scann_artifacts_dir) if not annoy_index_created or not scann_instance_created: if google_news_word_embeddings_normalized is None: google_news_word_embeddings_normalized = np.load( google_news_normalized_vectors_filepath ) if not annoy_index_created: ann_index_annoy = ApproxNN(ann_alg="annoy") ann_index_annoy.build( data=google_news_word_embeddings_normalized, annoy_n_trees=annoy_index_n_trees, distance_measure="euclidean", ) ann_index_annoy.save(google_news_vectors_annoy_index_filepath) if not scann_instance_created: ann_index_scann = ApproxNN(ann_alg="scann") ann_index_scann.build( data=google_news_word_embeddings_normalized, scann_num_leaves_scaling=scann_num_leaves_scaling, ) ann_index_scann.save(google_news_vectors_scann_artifacts_dir)
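# Sketch of loading the artifacts written by `preprocess_google_news` (assumes
# the function above has already been run; the `output_dir` value is a
# placeholder). The Annoy load arguments mirror how `ApproxNN.load` is called
# elsewhere in this code base.
def _example_load_google_news_artifacts(output_dir: str = "data/processed") -> None:
    google_news_dir = join(output_dir, "GoogleNews")
    word_embeddings_normalized = np.load(
        join(google_news_dir, "GoogleNews-vectors-negative300_normalized.npy"),
        mmap_mode="r",
    )
    with open(
        join(google_news_dir, "GoogleNews-vectors-negative300_words.txt"), "r"
    ) as words_file:
        words = np.array(words_file.read().split("\n"))

    ann_index_annoy = ApproxNN(ann_alg="annoy")
    ann_index_annoy.load(
        ann_path=join(
            google_news_dir, "GoogleNews-vectors-negative300_annoy_index.ann"
        ),
        annoy_data_dimensionality=word_embeddings_normalized.shape[1],
        annoy_mertic="euclidean",
        annoy_prefault=False,
    )
    print(len(words), word_embeddings_normalized.shape)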
def preprocess_fasttext_tps(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method.
        Higher scaling => higher precision.

    References
    ----------
    .. [1] Alexander Jakubowski, Milica Gašić, & Marcus Zibrowius (2020).
       Topology of Word Embeddings: Singularities Reflect Polysemy.
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "fastTextTPS")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    env_config = dotenv_values(join("..", ".env"))
    tps_fasttext_model_filesender_token = env_config[
        "TPS_FASTTEXT_MODEL_FILESENDER_TOKEN"
    ]
    tps_fasttext_model_filesender_token_files_ids = env_config[
        "TPS_FASTTEXT_MODEL_FILESENDER_TOKEN_FILES_IDS"
    ]
    tps_fasttext_model_url = f"https://filesender.uninett.no/download.php?token={tps_fasttext_model_filesender_token}&files_ids={tps_fasttext_model_filesender_token_files_ids}"
    tps_fasttext_model_name = "fastText.TPS.300d"
    tps_fasttext_model_raw_filepath = join(
        raw_data_dir, f"{tps_fasttext_model_name}.bin"
    )
    tps_fasttext_model_words_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_words.txt"
    )
    tps_fasttext_model_vectors_filepath = join(
        output_dir, f"{tps_fasttext_model_name}.npy"
    )
    tps_fasttext_model_vectors_normalized_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_normalized.npy"
    )
    tps_fasttext_model_annoy_index_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_annoy_index.ann"
    )
    tps_fasttext_model_scann_artifacts_dir = join(
        output_dir, f"{tps_fasttext_model_name}_scann_artifacts"
    )

    if not isfile(tps_fasttext_model_raw_filepath):
        print(f"Downloading {tps_fasttext_model_name}...")
        download_from_url(
            url=tps_fasttext_model_url,
            destination_filepath=tps_fasttext_model_raw_filepath,
        )
        print("Done!")

    # Load output from trained fastText model
    fasttext_model = fasttext.load_model(tps_fasttext_model_raw_filepath)
    fasttext_model_words = fasttext_model.words
    fasttext_model_embedding_weights = np.zeros(
        (len(fasttext_model_words), fasttext_model.get_dimension())
    )
    for i, word in enumerate(fasttext_model.words):
        fasttext_model_embedding_weights[i] = fasttext_model.get_word_vector(word)

    # Save words
    if not isfile(tps_fasttext_model_words_filepath):
        with open(tps_fasttext_model_words_filepath, "w") as file:
            for i, word in enumerate(fasttext_model.words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(tps_fasttext_model_vectors_filepath):
        np.save(tps_fasttext_model_vectors_filepath, fasttext_model_embedding_weights)

    # Save normalized word embeddings
    fasttext_model_embedding_weights_normalized = None
    if not isfile(tps_fasttext_model_vectors_normalized_filepath):
        fasttext_model_embedding_weights_normalized = (
            fasttext_model_embedding_weights
            / np.linalg.norm(fasttext_model_embedding_weights, axis=1).reshape(-1, 1)
        )
        np.save(
            tps_fasttext_model_vectors_normalized_filepath,
            fasttext_model_embedding_weights_normalized,
        )

    annoy_index_created = isfile(tps_fasttext_model_annoy_index_filepath)
    scann_instance_created = isdir(tps_fasttext_model_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if fasttext_model_embedding_weights_normalized is None:
            fasttext_model_embedding_weights_normalized = np.load(
                tps_fasttext_model_vectors_normalized_filepath
            )
        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=fasttext_model_embedding_weights_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(tps_fasttext_model_annoy_index_filepath)
        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=fasttext_model_embedding_weights_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(tps_fasttext_model_scann_artifacts_dir)
def preprocess_fasttext( raw_data_dir: str, output_dir: str, annoy_index_n_trees: int, scann_num_leaves_scaling: int, ) -> None: """ Downloads and preprocessed external word embeddings from [1]. Parameters ---------- raw_data_dir : str Path to the raw data directory (where files will be downloaded to). output_dir : str Output directory to save processed data. annoy_index_n_trees : int Number of trees to pass to Annoys build method. More trees => higher precision. scann_num_leaves_scaling : int Number of leaves scaling to pass to ScaNNs build method. Higher scaling => higher precision. References ---------- .. [1] Grave, E., Bojanowski, P., Gupta, P., Joulin, A., & Mikolov, T. (2018). Learning Word Vectors for 157 Languages. In Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018). """ # Ensure output directory exists output_dir = join(output_dir, "fastText") makedirs(output_dir, exist_ok=True) # Define constants fasttext_data_filename = "cc.en.300.vec" fasttext_vectors_url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{fasttext_data_filename}.gz" fasttext_word_vectors_raw_gzip_filepath = join( raw_data_dir, f"{fasttext_data_filename}.gz" ) fasttext_word_vectors_raw_txt_filepath = join(raw_data_dir, fasttext_data_filename) fasttext_word_vectors_words_filepath = join( output_dir, f"{fasttext_data_filename}_words.txt" ) fasttext_word_vectors_filepath = join(output_dir, f"{fasttext_data_filename}.npy") fasttext_word_vectors_normalized_filepath = join( output_dir, f"{fasttext_data_filename}_normalized.npy" ) fasttext_word_vectors_annoy_index_filepath = join( output_dir, f"{fasttext_data_filename}_annoy_index.ann" ) fasttext_word_vectors_scann_artifacts_dir = join( output_dir, f"{fasttext_data_filename}_scann_artifacts" ) if not isfile(fasttext_word_vectors_raw_gzip_filepath): print(f"Downloading {fasttext_data_filename}...") download_from_url( url=fasttext_vectors_url, destination_filepath=fasttext_word_vectors_raw_gzip_filepath, ) print("Done!") if not isfile(fasttext_word_vectors_raw_txt_filepath): print(f"Extracting {fasttext_data_filename}...") with gzip.GzipFile( fasttext_word_vectors_raw_gzip_filepath, "rb" ) as gzip_file_raw: with open(fasttext_word_vectors_raw_txt_filepath, "wb") as gzip_file_output: gzip_file_output.write(gzip_file_raw.read()) print("Done!") # Parse vectors from text file and save result should_load_vectors = ( not isfile(fasttext_word_vectors_words_filepath) or not isfile(fasttext_word_vectors_filepath) or not isfile(fasttext_word_vectors_normalized_filepath) ) if should_load_vectors: fasttext_word_embeddings, fasttext_words = load_word_embeddings_text_format( word_embeddings_text_filepath=fasttext_word_vectors_raw_txt_filepath, first_line_header=True, tqdm_enabled=True, ) # Save words if not isfile(fasttext_word_vectors_words_filepath): with open(fasttext_word_vectors_words_filepath, "w") as file: for i, word in enumerate(fasttext_words): if i > 0: file.write("\n") file.write(word) # Save word embeddings if not isfile(fasttext_word_vectors_filepath): np.save(fasttext_word_vectors_filepath, fasttext_word_embeddings) # Save normalized word embeddings fasttext_word_embeddings_normalized = None if not isfile(fasttext_word_vectors_normalized_filepath): fasttext_word_embeddings_normalized = fasttext_word_embeddings / np.linalg.norm( fasttext_word_embeddings, axis=1 ).reshape(-1, 1) np.save( fasttext_word_vectors_normalized_filepath, fasttext_word_embeddings_normalized, ) annoy_index_created = 
isfile(fasttext_word_vectors_annoy_index_filepath) scann_instance_created = isdir(fasttext_word_vectors_scann_artifacts_dir) if not annoy_index_created or not scann_instance_created: if fasttext_word_embeddings_normalized is None: fasttext_word_embeddings_normalized = np.load( fasttext_word_vectors_normalized_filepath ) if not annoy_index_created: ann_index_annoy = ApproxNN(ann_alg="annoy") ann_index_annoy.build( data=fasttext_word_embeddings_normalized, annoy_n_trees=annoy_index_n_trees, distance_measure="euclidean", ) ann_index_annoy.save(fasttext_word_vectors_annoy_index_filepath) if not scann_instance_created: ann_index_scann = ApproxNN(ann_alg="scann") ann_index_scann.build( data=fasttext_word_embeddings_normalized, scann_num_leaves_scaling=scann_num_leaves_scaling, ) ann_index_scann.save(fasttext_word_vectors_scann_artifacts_dir)
def preprocess_glove( raw_data_dir: str, output_dir: str, annoy_index_n_trees: int, scann_num_leaves_scaling: int, ) -> None: """ Downloads and preprocessed external word embeddings from [1]. Parameters ---------- raw_data_dir : str Path to the raw data directory (where files will be downloaded to). output_dir : str Output directory to save processed data. annoy_index_n_trees : int Number of trees to pass to Annoys build method. More trees => higher precision. scann_num_leaves_scaling : int Number of leaves scaling to pass to ScaNNs build method. Higher scaling => higher precision. References ---------- .. [1] Jeffrey Pennington, Richard Socher, & Christopher D. Manning (2014). GloVe: Global Vectors for Word Representation. In Empirical Methods in Natural Language Processing (EMNLP) (pp. 1532–1543). """ # Ensure output directory exists output_dir = join(output_dir, "GloVe") makedirs(output_dir, exist_ok=True) # Define constants glove_data_filename = "glove.840B.300d" glove_word_vectors_url = f"http://nlp.stanford.edu/data/{glove_data_filename}.zip" glove_word_vectors_raw_zip_filepath = join( raw_data_dir, f"{glove_data_filename}.zip" ) glove_word_vectors_raw_txt_filename = f"{glove_data_filename}.txt" glove_word_vectors_raw_txt_filepath = join( raw_data_dir, glove_word_vectors_raw_txt_filename ) glove_word_vectors_words_filepath = join( output_dir, f"{glove_data_filename}_words.txt" ) glove_word_vectors_filepath = join(output_dir, f"{glove_data_filename}.npy") glove_word_vectors_normalized_filepath = join( output_dir, f"{glove_data_filename}_normalized.npy" ) glove_word_vectors_annoy_index_filepath = join( output_dir, f"{glove_data_filename}_annoy_index.ann" ) glove_word_vectors_scann_artifacts_dir = join( output_dir, f"{glove_data_filename}_scann_artifacts" ) if not isfile(glove_word_vectors_raw_zip_filepath): print(f"Downloading {glove_data_filename}...") download_from_url( url=glove_word_vectors_url, destination_filepath=glove_word_vectors_raw_zip_filepath, ) print("Done!") if not isfile(glove_word_vectors_raw_txt_filepath): print(f"Extracting {glove_data_filename}...") with zipfile.ZipFile(glove_word_vectors_raw_zip_filepath, "r") as zip_ref: zip_ref.extractall(raw_data_dir) print("Done!") # Parse vectors from text file and save result should_load_vectors = ( not isfile(glove_word_vectors_words_filepath) or not isfile(glove_word_vectors_filepath) or not isfile(glove_word_vectors_normalized_filepath) ) if should_load_vectors: glove_word_embeddings, glove_words = load_word_embeddings_text_format( word_embeddings_text_filepath=glove_word_vectors_raw_txt_filepath, first_line_header=False, tqdm_enabled=True, ) # Save words if not isfile(glove_word_vectors_words_filepath): with open(glove_word_vectors_words_filepath, "w") as file: for i, word in enumerate(glove_words): if i > 0: file.write("\n") file.write(word) # Save word embeddings if not isfile(glove_word_vectors_filepath): np.save(glove_word_vectors_filepath, glove_word_embeddings) # Save normalized word embeddings glove_word_embeddings_normalized = None if not isfile(glove_word_vectors_normalized_filepath): glove_word_embeddings_normalized = glove_word_embeddings / np.linalg.norm( glove_word_embeddings, axis=1 ).reshape(-1, 1) np.save( glove_word_vectors_normalized_filepath, glove_word_embeddings_normalized, ) annoy_index_created = isfile(glove_word_vectors_annoy_index_filepath) scann_instance_created = isdir(glove_word_vectors_scann_artifacts_dir) if not annoy_index_created or not scann_instance_created: if 
glove_word_embeddings_normalized is None: glove_word_embeddings_normalized = np.load( glove_word_vectors_normalized_filepath ) if not annoy_index_created: ann_index_annoy = ApproxNN(ann_alg="annoy") ann_index_annoy.build( data=glove_word_embeddings_normalized, annoy_n_trees=annoy_index_n_trees, distance_measure="euclidean", ) ann_index_annoy.save(glove_word_vectors_annoy_index_filepath) if not scann_instance_created: ann_index_scann = ApproxNN(ann_alg="scann") ann_index_scann.build( data=glove_word_embeddings_normalized, scann_num_leaves_scaling=scann_num_leaves_scaling, ) ann_index_scann.save(glove_word_vectors_scann_artifacts_dir)
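# A small driver sketch that runs all four external-embedding preprocessing
# steps with shared directories (the directory names and the Annoy/ScaNN
# parameters are placeholders, not values prescribed by this code base):
def _example_preprocess_external_word_embeddings(
    raw_data_dir: str = "data/raw",
    output_dir: str = "data/processed",
    annoy_index_n_trees: int = 500,
    scann_num_leaves_scaling: int = 5,
) -> None:
    for preprocess_func in (
        preprocess_google_news,
        preprocess_fasttext_tps,
        preprocess_fasttext,
        preprocess_glove,
    ):
        preprocess_func(
            raw_data_dir=raw_data_dir,
            output_dir=output_dir,
            annoy_index_n_trees=annoy_index_n_trees,
            scann_num_leaves_scaling=scann_num_leaves_scaling,
        )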
def evaluate_word2vec(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    sswr_dataset_filepath: str,
    msr_dataset_filepath: str,
    pad_dataset_filepath: str,
    vocab_size: int,
    approx_nn_path: str,
    approx_nn_alg: str,
    top_n_prediction: int,
    output_dir: str,
) -> None:
    """
    Evaluates a word2vec model on the SSWR, MSR and PAD test analogy datasets.

    Parameters
    ----------
    model_dir : str
        Directory of the model to evaluate.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    sswr_dataset_filepath : str
        Filepath of the SSWR test dataset.
    msr_dataset_filepath : str
        Filepath of the MSR test dataset.
    pad_dataset_filepath : str
        Filepath of the PAD test dataset.
    vocab_size : int
        Vocabulary size to use when evaluating on the test datasets.
    approx_nn_path : str
        Filepath of an ApproxNN instance, built on the word embeddings.
    approx_nn_alg : str
        Algorithm of the ApproxNN instance.
    top_n_prediction : int
        Top-N prediction to use during evaluation.
    output_dir : str
        Output directory to save evaluation results.
    """
    # Load output from training word2vec
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )
    last_embedding_weights = w2v_training_output["last_embedding_weights"]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]

    # Append date/time to output directory.
    output_dir = join(output_dir, datetime.now().strftime("%d-%b-%Y_%H-%M-%S"))
    makedirs(output_dir, exist_ok=True)

    # Load ApproxNN instance
    approx_nn = None
    if approx_nn_path != "":
        approx_nn = ApproxNN(ann_alg=approx_nn_alg)
        load_args = {}
        if approx_nn_alg == "annoy":
            load_args["annoy_data_dimensionality"] = last_embedding_weights.shape[1]
            load_args["annoy_mertic"] = "euclidean"
            load_args["annoy_prefault"] = True
        approx_nn.load(approx_nn_path, **load_args)

    # SSWR
    print("--- Evaluating SSWR ---")
    sswr_accuracies = evaluate_model_word_analogies(
        analogies_filepath=sswr_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )

    # Compute average semantic and syntactic accuracies
    sswr_categories = list(sswr_accuracies.keys())
    sswr_semantic_categories = sswr_categories[:5]
    sswr_syntactic_categories = sswr_categories[5:-1]
    sswr_semantic_avg_acc = np.mean(
        [sswr_accuracies[cat] for cat in sswr_semantic_categories]
    )
    sswr_syntactic_avg_acc = np.mean(
        [sswr_accuracies[cat] for cat in sswr_syntactic_categories]
    )
    sswr_accuracies["semantic_avg"] = sswr_semantic_avg_acc
    sswr_accuracies["syntactic_avg"] = sswr_syntactic_avg_acc

    save_analogies_accuracies_to_file("sswr", output_dir, sswr_accuracies)
    print(sswr_accuracies)

    # MSR
    print("--- Evaluating MSR ---")
    msr_accuracies = evaluate_model_word_analogies(
        analogies_filepath=msr_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )
    save_analogies_accuracies_to_file("msr", output_dir, msr_accuracies)
    print(msr_accuracies)

    # PAD
    print("--- Evaluating PAD ---")
    pad_accuracies = evaluate_model_word_analogies(
        analogies_filepath=pad_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )
    save_analogies_accuracies_to_file("pad", output_dir, pad_accuracies)
    print(pad_accuracies)
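# Example invocation of `evaluate_word2vec` (a sketch: every filepath below,
# including the analogy dataset file names and extensions, is a placeholder;
# point them at your own model output, test datasets and prebuilt ApproxNN index):
def _example_evaluate_word2vec() -> None:
    evaluate_word2vec(
        model_dir="output/word2vec_training/word2vec_enwiki",
        model_name="word2vec",
        dataset_name="enwiki",
        sswr_dataset_filepath="data/analogies/sswr.joblib",
        msr_dataset_filepath="data/analogies/msr.joblib",
        pad_dataset_filepath="data/analogies/pad.joblib",
        vocab_size=-1,
        approx_nn_path="output/word2vec_training/word2vec_enwiki/annoy_index.ann",
        approx_nn_alg="annoy",
        top_n_prediction=1,
        output_dir="output/word2vec_eval",
    )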
def topological_polysemy_pipeline( semeval_word_senses_filepath: str, word2vec_semeval_model_dir: str, word2vec_enwiki_model_dir: str, word2vec_google_news_model_dir: str, glove_model_dir: str, fasttext_model_dir: str, fasttext_tps_model_dir: str, tps_neighbourhood_sizes: str, num_top_k_words_frequencies: int, cyclo_octane_data_filepath: str, henneberg_data_filepath: str, custom_point_cloud_neighbourhood_size: int, output_dir: str, ) -> None: """ Computes the topological polysemy of various word embeddings and data sets. Saves results in output dir with some additional plots. Parameters ---------- semeval_word_senses_filepath : str Filepath of the SemEval-2010 task 14 word senses word2vec_semeval_model_dir : str Directory of the SemEval-2010 task 14 word2vec model. word2vec_enwiki_model_dir : str Directory of the enwiki word2vec model. word2vec_google_news_model_dir : str Directory of the Google News 3M word2vec model glove_model_dir : str Directory of the GloVe model. fasttext_model_dir : str Directory of the fastText model. fasttext_tps_model_dir : str Directory of the TPS fastText model. tps_neighbourhood_sizes : str Neighbourhood sizes to use when computing TPS (e.g. 50, 60). num_top_k_words_frequencies : int Number of top words to use when computing TPS scores vs. word frequencies. cyclo_octane_data_filepath : str Filepath of the cyclo-octane dataset. henneberg_data_filepath : str Filepath of the Henneberg dataset. custom_point_cloud_neighbourhood_size : int Neighbourhood size to use when computing TPS for custom point clouds. output_dir : str Output directory to save results. """ # Ensure output directory exists makedirs(output_dir, exist_ok=True) # Load SemEval-2010 task 14 word senses semeval_word_senses: dict = joblib.load(semeval_word_senses_filepath) semeval_target_words = np.array(list(semeval_word_senses["all"].keys())) semeval_target_word_gs_clusters = np.array( list(semeval_word_senses["all"].values()) ) # Parse strings into int tps_neighbourhood_sizes = [int(n_size) for n_size in tps_neighbourhood_sizes] # -- Compute TPS for word embeddings (SemEval and enwiki) -- for dataset_name, model_dir in zip( ["semeval_2010_task_14", "enwiki"], [word2vec_semeval_model_dir, word2vec_enwiki_model_dir], ): # Load word embeddings print(f"Loading {dataset_name} word embeddings...") w2v_training_output = load_model_training_output( model_training_output_dir=model_dir, model_name="word2vec", dataset_name=dataset_name, return_normalized_embeddings=True, return_scann_instance=True, ) last_embedding_weights_normalized = w2v_training_output[ "last_embedding_weights_normalized" ] last_embedding_weights_scann_instance = w2v_training_output[ "last_embedding_weights_scann_instance" ] words = w2v_training_output["words"] word_to_int = w2v_training_output["word_to_int"] word_counts = w2v_training_output["word_counts"] print("Done!") print("Computing TPS for word embeddings...") tps_word_embeddings( word_embeddings_name=dataset_name, neighbourhood_sizes=tps_neighbourhood_sizes, semeval_target_words=semeval_target_words, semeval_target_words_gs_clusters=semeval_target_word_gs_clusters, word_embeddings_normalized=last_embedding_weights_normalized, word_to_int=word_to_int, word_vocabulary=words, num_top_k_words_frequencies=num_top_k_words_frequencies, output_dir=output_dir, word_counts=word_counts, ann_instance=last_embedding_weights_scann_instance, ) del last_embedding_weights_scann_instance print("Done!") # -- Compute TPS for external word embeddings -- # Prepare constants external_word_embeddings = [ 
( "google_news_3m", "GoogleNews-vectors-negative300", word2vec_google_news_model_dir, ), ( "glove_cc_840b_300d", "glove.840B.300d", glove_model_dir, ), ( "fasttext_cc_300d", "cc.en.300.vec", fasttext_model_dir, ), ( "fasttext_tps_300d", "fastText.TPS.300d", fasttext_tps_model_dir, ), ] # Compute TPS for each external word embeddings for word_embeddings_name, model_name, model_dir in external_word_embeddings: # Prepare filepaths model_normalized_weights_filepath = join( model_dir, f"{model_name}_normalized.npy" ) model_words_filepath = join(model_dir, f"{model_name}_words.txt") model_scann_artifacts_dir = join(model_dir, f"{model_name}_scann_artifacts") # Load data print(f"Loading {model_name} data...") model_weights_normalized = np.load( model_normalized_weights_filepath, mmap_mode="r" ) with open(model_words_filepath, "r") as words_file: model_words = np.array(words_file.read().split("\n")) model_approx_nn = ApproxNN(ann_alg="scann") model_approx_nn.load(ann_path=model_scann_artifacts_dir) print("Done!") print(f"Computing TPS for {model_name} word embeddings...") tps_word_embeddings( word_embeddings_name=word_embeddings_name, neighbourhood_sizes=tps_neighbourhood_sizes, semeval_target_words=semeval_target_words, semeval_target_words_gs_clusters=semeval_target_word_gs_clusters, word_embeddings_normalized=model_weights_normalized, word_to_int={word: i for i, word in enumerate(model_words)}, word_vocabulary=model_words, num_top_k_words_frequencies=num_top_k_words_frequencies, output_dir=output_dir, ann_instance=model_approx_nn, ) del model_approx_nn print("Done!") # -- Compute TPS for custom point clouds -- for point_cloud_name, point_cloud_filepath in zip( ["cyclo_octane", "henneberg"], [cyclo_octane_data_filepath, henneberg_data_filepath], ): # Load and prepare data for TPS point_cloud = pd.read_csv(point_cloud_filepath, header=None).values point_cloud_normalized = point_cloud / np.linalg.norm( point_cloud, axis=1 ).reshape(-1, 1) point_cloud_pairwise_dists = euclidean_distances(point_cloud) # Compute TPS scores num_points = len(point_cloud) tps_scores = np.zeros(num_points) print(f"Computing TPS scores for {point_cloud_name}...") for point_index in tqdm(range(num_points)): tps_score = tps_point_cloud( point_index=point_index, neighbourhood_size=custom_point_cloud_neighbourhood_size, point_cloud_normalized=point_cloud_normalized, point_cloud_pairwise_dists=point_cloud_pairwise_dists, ) tps_scores[point_index] = tps_score # Save result point_cloud_output_dir = join(output_dir, point_cloud_name) makedirs(point_cloud_output_dir, exist_ok=True) np.save( join( point_cloud_output_dir, f"tps_scores_{custom_point_cloud_neighbourhood_size}.npy", ), tps_scores, )
def similar_words(
    weights: np.ndarray,
    word_to_int: Dict[str, int],
    words: np.ndarray,
    ann_instance: ApproxNN = None,
    top_n: int = 10,
    positive_words: Optional[List[str]] = None,
    negative_words: Optional[List[str]] = None,
    vocab_size: int = -1,
    return_similarity_score: bool = True,
) -> List[Union[Tuple, str]]:
    """
    Finds the most similar words of a linear combination of positively and
    negatively contributing words.

    Parameters
    ----------
    weights : np.ndarray
        Numpy matrix (vocabulary size, embedding dim) containing word vectors.
    word_to_int : dict of str and int
        Dictionary mapping from word to its integer representation.
    words : np.ndarray
        Numpy array containing words from the vocabulary.
    ann_instance : ApproxNN, optional
        ApproxNN instance, built on word embeddings (defaults to None).
    top_n : int, optional
        Number of similar words (defaults to 10).
    positive_words : list of str, optional
        List of words contributing positively (defaults to empty list).
    negative_words : list of str, optional
        List of words contributing negatively (defaults to empty list).
    vocab_size : int, optional
        Vocabulary size to use, e.g., only the most common `vocab_size` words are
        taken into account (defaults to -1, meaning all words).
    return_similarity_score : bool, optional
        Whether or not to return the cosine similarity score (`ann_instance` must
        be set to None to have an effect).

    Returns
    -------
    If `return_similarity_score` is True, then
        pairs : list of tuples of str and float
            List of `top_n` similar words and their cosine similarities.
    else:
        closest_words : list of str
            List of `top_n` similar words.
    """
    # Default values
    if positive_words is None:
        positive_words = []
    if negative_words is None:
        negative_words = []

    # Restrict vocabulary
    if vocab_size > 0:
        weights = weights[:vocab_size]
        words = words[:vocab_size]

    # Create query word vector
    query_word_vec = np.zeros((weights.shape[1],), dtype=np.float64)
    query_word_vec += np.array(
        [get_word_vec(pos_word, word_to_int, weights) for pos_word in positive_words]
    ).sum(axis=0)
    query_word_vec -= np.array(
        [get_word_vec(neg_word, word_to_int, weights) for neg_word in negative_words]
    ).sum(axis=0)

    # Create indices list of query words to exclude from search
    exclude_words_indices = [
        word_to_int[word] for word in positive_words + negative_words
    ]

    # Find closest words
    if ann_instance is None:
        # Use cosine similarity to find similar words
        cos_sims = fastdist.cosine_vector_to_matrix(query_word_vec, weights)
        sorted_indices = cos_sims.argsort()[::-1]
        sorted_indices = [
            idx for idx in sorted_indices if idx not in exclude_words_indices
        ]
    else:
        query_word_vec_norm = query_word_vec / np.linalg.norm(query_word_vec)
        sorted_indices = ann_instance.search(
            query_vector=query_word_vec_norm,
            k_neighbours=top_n,
            excluded_neighbour_indices=exclude_words_indices,
        )

    # Filter top words/similarities
    top_words = words[sorted_indices][:top_n]

    # Create word similarity pairs
    if return_similarity_score and ann_instance is None:
        top_sims = cos_sims[sorted_indices][:top_n]
        result = list(zip(top_words, top_sims))
    else:
        result = top_words

    return result
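# Classic analogy query sketch using `similar_words` ("king" - "man" + "woman").
# It assumes `weights`, `word_to_int` and `words` come from a trained model,
# loaded here via `load_model_training_output` with placeholder names:
def _example_similar_words() -> None:
    w2v_training_output = load_model_training_output(
        model_training_output_dir="output/word2vec_training/word2vec_enwiki",
        model_name="word2vec",
        dataset_name="enwiki",
    )
    result = similar_words(
        weights=w2v_training_output["last_embedding_weights"],
        word_to_int=w2v_training_output["word_to_int"],
        words=w2v_training_output["words"],
        positive_words=["king", "woman"],
        negative_words=["man"],
        top_n=5,
    )
    print(result)  # List of (word, cosine similarity) pairs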
def prepare_num_word_meanings_supervised_data( model_dir: str, model_name: str, dataset_name: str, id_estimation_num_neighbours: list, semeval_2010_14_word_senses_filepath: str, tps_neighbourhood_sizes: list, raw_data_dir: str, output_dir: str, ) -> None: """ Prepares data for the supervised word meanings prediction task. Parameters ---------- model_dir : str Directory of the model to load. model_name : str Name of the trained word2vec model. dataset_name : str Name of the dataset the model is trained on. id_estimation_num_neighbours : list Number of neighbours to use when estimating intrinsic dimension for each word semeval_2010_14_word_senses_filepath : str Filepath of SemEval-2010 task 14 word senses joblib dict. tps_neighbourhood_sizes : list List of TPS neighbourhood sizes. raw_data_dir : str Directory where raw data will be saved to. output_dir: str Output directory. """ # Convert list arguments to int tps_neighbourhood_sizes = [ int(n_size) for n_size in tps_neighbourhood_sizes ] id_estimation_num_neighbours = [ int(num_neighbours) for num_neighbours in id_estimation_num_neighbours ] # Prepare directory constants and create raw data dir for caching data files task_id = f"wme_{model_name}_{dataset_name}" # wme = word meaning estimation task_raw_data_dir = join(raw_data_dir, task_id) task_raw_data_tps_dir = join(task_raw_data_dir, "tps") makedirs(task_raw_data_dir, exist_ok=True) # Load word embeddings from model print("Loading word embeddings...") w2v_training_output = load_model_training_output( model_training_output_dir=model_dir, model_name=model_name, dataset_name=dataset_name, return_normalized_embeddings=True, return_scann_instance_filepath=True, ) last_embedding_weights_normalized = w2v_training_output[ "last_embedding_weights_normalized"] last_embedding_weights_scann_instance_filepath = w2v_training_output[ "last_embedding_weights_scann_instance_filepath"] words = w2v_training_output["words"] word_to_int = w2v_training_output["word_to_int"] print("Done!") # Prepare SemEval-2010 task 14 data semeval_2010_14_word_senses = joblib.load( semeval_2010_14_word_senses_filepath) semeval_target_words = np.array( list(semeval_2010_14_word_senses["all"].keys())) semeval_target_words_in_vocab_filter = [ i for i, word in enumerate(semeval_target_words) if word in word_to_int ] semeval_target_words_in_vocab = semeval_target_words[ semeval_target_words_in_vocab_filter] semeval_gs_clusters = np.array( list(semeval_2010_14_word_senses["all"].values())) semeval_gs_clusters_in_vocab = semeval_gs_clusters[ semeval_target_words_in_vocab_filter] semeval_2010_14_word_senses_in_vocab = { word: gs_meanings for word, gs_meanings in zip(semeval_target_words_in_vocab, semeval_gs_clusters_in_vocab) } # (1) -- Find words in Wordnet that are in the word2vec model's vocabulary -- words_to_num_meanings_filepath = join(task_raw_data_dir, "words_to_num_meanings.joblib") if not isfile(words_to_num_meanings_filepath): words_to_num_meanings = semeval_2010_14_word_senses_in_vocab.copy() print("Finding words in vocabulary with #Wordnet synsets > 0") for word in tqdm(words): if word in semeval_target_words_in_vocab: continue num_synsets = len(wn.synsets(word)) if num_synsets > 0: words_to_num_meanings[word] = num_synsets joblib.dump(words_to_num_meanings, words_to_num_meanings_filepath) else: words_to_num_meanings = joblib.load(words_to_num_meanings_filepath) print("Loaded words_to_num_meanings!") data_words = np.array(list(words_to_num_meanings.keys())) data_words_no_semeval = [ word for word in data_words if word 
not in semeval_target_words_in_vocab ] data_word_to_int = {word: i for i, word in enumerate(data_words)} # Filter out word embeddings using Wordnet words (data_words) data_words_to_full_vocab_ints = np.array( [word_to_int[word] for word in data_words]) # (2) -- Compute TPS_n for train/test words -- makedirs(task_raw_data_tps_dir, exist_ok=True) tps_scores_filepaths = [ join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_scores.npy") for tps_neighbourhood_size in tps_neighbourhood_sizes ] tps_pds_filepaths = [ join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_pds.npy") for tps_neighbourhood_size in tps_neighbourhood_sizes ] for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip( tps_neighbourhood_sizes, tps_scores_filepaths, tps_pds_filepaths): if isfile(tps_scores_filepath) and isfile(tps_pds_filepath): continue print( f"Computing TPS scores using neighbourhood size {tps_neighbourhood_size}..." ) # Load ScaNN instance scann_instance = ApproxNN(ann_alg="scann") scann_instance.load( ann_path=last_embedding_weights_scann_instance_filepath) # Compute TPS tps_scores_ns, tps_pds_ns = tps_multiple( target_words=data_words, word_to_int=word_to_int, neighbourhood_size=tps_neighbourhood_size, word_embeddings_normalized=last_embedding_weights_normalized, ann_instance=scann_instance, return_persistence_diagram=True, n_jobs=-1, progressbar_enabled=True, ) # Save result print("Saving TPS result...") np.save(tps_scores_filepath, tps_scores_ns) np.save(tps_pds_filepath, tps_pds_ns) print("Done!") # Free resources del scann_instance # (3) -- Compute GAD -- gad_dir = join(task_raw_data_dir, "gad") makedirs(gad_dir, exist_ok=True) gad_params = [ (25, 250), (25, 500), (25, 750), (25, 1000), # ---------- (50, 250), (50, 500), (50, 750), (50, 1000), # ---------- (100, 1000), (100, 1250), (100, 1500), (100, 1750), (100, 2000), # ---------- (150, 1000), (150, 1250), (150, 1500), (150, 1750), (150, 2000), # ---------- (200, 1000), (200, 1250), (200, 1500), (200, 1750), (200, 2000), ] gad_categories = {"P_man": 0, "P_int": 1, "P_bnd": 2} for inner_param, outer_param in gad_params: gad_id = f"gad_knn_{inner_param}_{outer_param}" gad_filepath = join(gad_dir, f"{gad_id}.joblib") if isfile(gad_filepath): continue print(f"-- {gad_id} -- ") # Load ScaNN instance approx_nn = ApproxNN(ann_alg="scann") approx_nn.load(ann_path=last_embedding_weights_scann_instance_filepath) # Compute features gad_result = compute_gad( data_points=last_embedding_weights_normalized, data_point_ints=data_words_to_full_vocab_ints, manifold_dimension=2, data_points_approx_nn=approx_nn, use_knn_annulus=True, knn_annulus_inner=inner_param, knn_annulus_outer=outer_param, return_annlus_persistence_diagrams=True, progressbar_enabled=True, n_jobs=-1, ) print( "P_man:", len(gad_result["P_man"]), "P_int:", len(gad_result["P_int"]), "P_bnd:", len(gad_result["P_bnd"]), ) joblib.dump(gad_result, gad_filepath, protocol=4) # Free resources del approx_nn # (4) -- Estimate the intrinsic dimension (ID) for each word vector -- words_estimated_ids_dir = join(task_raw_data_dir, "estimated_ids") id_estimators: List[Tuple[str, GlobalEstimator, dict]] = [ ("lpca", est_ids.lPCA, {}), ("knn", est_ids.KNN, {}), ("twonn", est_ids.TwoNN, {}), ("mle", est_ids.MLE, {}), ("tle", est_ids.TLE, {}), ] makedirs(words_estimated_ids_dir, exist_ok=True) for id_estimator_name, id_estimator_cls, id_estimator_params in id_estimators: for num_neighbours in id_estimation_num_neighbours: estimated_ids_filepath = join( words_estimated_ids_dir, 
f"{id_estimator_name}_{num_neighbours}.npy") if isfile(estimated_ids_filepath): continue print( f"Estimating IDs using {id_estimator_cls.__name__} with {num_neighbours} neighbours..." ) id_estimator = id_estimator_cls(**id_estimator_params) estimated_ids = id_estimator.fit_predict_pw( X=last_embedding_weights_normalized[ data_words_to_full_vocab_ints], n_neighbors=num_neighbours, n_jobs=-1, ) # estimated_ids = estimated_ids_full[data_words_to_full_vocab_ints] print("Done! Saving to file...") np.save(estimated_ids_filepath, estimated_ids) # (5) -- Create features from GAD result to speed up combining of data -- gad_features_dir = join(task_raw_data_dir, "gad_features") makedirs(gad_features_dir, exist_ok=True) for inner_param, outer_param in gad_params: gad_id = f"gad_knn_{inner_param}_{outer_param}" gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy") if isfile(gad_features_filepath): continue print(f"Creating GAD features for {gad_id}...") # Load GAD result gad_result_filepath = join(gad_dir, f"{gad_id}.joblib") gad_result = joblib.load(gad_result_filepath) # Features from GAD (P_man, P_int, P_bnd) gad_features = np.zeros((len(data_words_to_full_vocab_ints), 3), dtype=int) for i, word_int in enumerate(tqdm(data_words_to_full_vocab_ints)): for gad_category, gad_category_idx in gad_categories.items(): if word_int in gad_result[gad_category]: gad_features[i, gad_category_idx] = 1 # Save GAD features np.save(gad_features_filepath, gad_features) # (6) -- Vectorize persistence diagrams from GAD features -- gad_features_pd_vectorized_dir = join(task_raw_data_dir, "gad_features_pd_vectorized") gad_features_pd_vectorized_size = 5 gad_features_pd_vectorized_size_flat = gad_features_pd_vectorized_size**2 makedirs(gad_features_pd_vectorized_dir, exist_ok=True) for inner_param, outer_param in gad_params: gad_id = f"gad_knn_{inner_param}_{outer_param}" gad_features_pd_vecs_filepath = join(gad_features_pd_vectorized_dir, f"{gad_id}.npy") if isfile(gad_features_pd_vecs_filepath): continue print(f"Vectorizing GAD features for {gad_id}...") # Load GAD features gad_result_filepath = join(gad_dir, f"{gad_id}.joblib") gad_result = joblib.load(gad_result_filepath) # Use PersistenceImage to vectorize persistence diagrams gad_features_pd_vecs = np.zeros((len(data_words_to_full_vocab_ints), gad_features_pd_vectorized_size_flat)) for i, point_index in enumerate(tqdm(data_words_to_full_vocab_ints)): # Get persistence diagram and create a range such that we get a square image from PersistenceImager gad_features_pd = gad_result["annulus_pds"][point_index] if len(gad_features_pd) == 0: gad_features_pd_vecs[i] = np.zeros( gad_features_pd_vectorized_size_flat, dtype=int) continue births, deaths = gad_features_pd.T persistence = deaths - births square_min = min(births.min(), persistence.min()) square_max = max(births.max(), persistence.max()) square_range = (square_min, square_max) pixel_size = (square_max - square_min) / gad_features_pd_vectorized_size # Vectorize persistence diagram pimgr = PersistenceImager(birth_range=square_range, pers_range=square_range, pixel_size=pixel_size) pd_vec = pimgr.transform(gad_features_pd) gad_features_pd_vecs[i] = pd_vec.flatten() # Save persistence image vectors to file np.save(gad_features_pd_vecs_filepath, gad_features_pd_vecs) # (7) -- Combine data into data (features and labels) for WME task -- word_meaning_train_data_filepath = join(output_dir, "word_meaning_train_data.csv") word_meaning_test_data_filepath = join(output_dir, "word_meaning_test_data.csv") 
word_meaning_semeval_test_data_filepath = join( output_dir, "word_meaning_semeval_test_data.csv") if (not isfile(word_meaning_train_data_filepath) or not isfile(word_meaning_test_data_filepath) or not isfile(word_meaning_semeval_test_data_filepath)): # -- Load data for creating features -- # Load estimated IDs from file words_estimated_ids = { f"{id_estimator_name}_{num_neighbours}": np.load( join(words_estimated_ids_dir, f"{id_estimator_name}_{num_neighbours}.npy")) for num_neighbours in id_estimation_num_neighbours for id_estimator_name, _, _ in id_estimators } print("Loaded estimated IDs!") # Load GAD features gad_features_dict = {} for inner_param, outer_param in gad_params: gad_id = f"gad_knn_{inner_param}_{outer_param}" # Load GAD features gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy") gad_features_dict[gad_id] = np.load(gad_features_filepath) print("Loaded GAD features!") # Load TPS features tps_scores = {} tps_pds = {} for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip( tps_neighbourhood_sizes, tps_scores_filepaths, tps_pds_filepaths): tps_scores[tps_neighbourhood_size] = np.load(tps_scores_filepath) tps_pds[tps_neighbourhood_size] = np.load(tps_pds_filepath, allow_pickle=True) print("Loaded TPS features!") data_words_train, data_words_test = train_test_split( data_words_no_semeval, test_size=0.05, random_state=rng_seed) if not isfile(word_meaning_train_data_filepath): print("Preparing data for training...") train_data_df = create_word_meaning_model_data_features( target_words=data_words_train, word_to_int=data_word_to_int, tps_scores=tps_scores, tps_pds=tps_pds, tps_neighbourhood_sizes=tps_neighbourhood_sizes, words_estimated_ids=words_estimated_ids, words_to_meanings=words_to_num_meanings, gad_categories=gad_categories, gad_features_dict=gad_features_dict, ) train_data_df.to_csv(word_meaning_train_data_filepath, index=False) if not isfile(word_meaning_test_data_filepath): print("Preparing data for testing...") test_data_df = create_word_meaning_model_data_features( target_words=data_words_test, word_to_int=data_word_to_int, tps_scores=tps_scores, tps_pds=tps_pds, tps_neighbourhood_sizes=tps_neighbourhood_sizes, words_estimated_ids=words_estimated_ids, words_to_meanings=words_to_num_meanings, gad_categories=gad_categories, gad_features_dict=gad_features_dict, ) test_data_df.to_csv(word_meaning_test_data_filepath, index=False) if not isfile(word_meaning_semeval_test_data_filepath): print("Preparing data for external testing (SemEval)...") semeval_test_data_df = create_word_meaning_model_data_features( target_words=semeval_target_words_in_vocab, word_to_int=data_word_to_int, tps_scores=tps_scores, tps_pds=tps_pds, tps_neighbourhood_sizes=tps_neighbourhood_sizes, words_estimated_ids=words_estimated_ids, words_to_meanings=words_to_num_meanings, gad_categories=gad_categories, gad_features_dict=gad_features_dict, ) semeval_test_data_df.to_csv( word_meaning_semeval_test_data_filepath, index=False) else: train_data_df = pd.read_csv(word_meaning_train_data_filepath) test_data_df = pd.read_csv(word_meaning_test_data_filepath) semeval_test_data_df = pd.read_csv( word_meaning_semeval_test_data_filepath) print("Train", train_data_df) print("Test", test_data_df) print("SemEval test", semeval_test_data_df)
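# Sketch of inspecting the CSV files produced above (assumes
# `prepare_num_word_meanings_supervised_data` has been run with the same
# `output_dir`; the default directory name here is a placeholder):
def _example_inspect_word_meaning_data(output_dir: str = "output/wme_data") -> None:
    train_data_df = pd.read_csv(join(output_dir, "word_meaning_train_data.csv"))
    test_data_df = pd.read_csv(join(output_dir, "word_meaning_test_data.csv"))
    semeval_test_data_df = pd.read_csv(
        join(output_dir, "word_meaning_semeval_test_data.csv")
    )
    print(train_data_df.shape, test_data_df.shape, semeval_test_data_df.shape)
    print(train_data_df.columns.tolist())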