def load_model_training_output(
    model_training_output_dir: str,
    model_name: str,
    dataset_name: str,
    word_embeddings_mmap_mode: str = "r",
    return_normalized_embeddings: bool = False,
    return_annoy_instance: bool = False,
    annoy_instance_prefault: bool = False,
    return_scann_instance: bool = False,
    return_scann_instance_filepath: bool = False,
) -> dict:
    """
    Loads and returns a dict object containing output from word2vec training.

    Parameters
    ----------
    model_training_output_dir : str
        word2vec model training output directory.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    word_embeddings_mmap_mode : str, optional
        Memmap mode to use when loading the last word embedding weights
        (defaults to "r", i.e. read-only).
    return_normalized_embeddings : bool, optional
        Whether or not to return the last embedding weights, normalized,
        if they are present (defaults to False).
    return_annoy_instance : bool, optional
        Whether or not to return the Annoy index fit on the last embedding
        weights, if it is present (defaults to False).
    annoy_instance_prefault : bool, optional
        Whether or not to enable the `prefault` option when loading the Annoy
        index. `return_annoy_instance` must be set to True to have an effect
        (defaults to False).
    return_scann_instance : bool, optional
        Whether or not to return the ScaNN instance fit on the last embedding
        weights, if it is present (defaults to False).
    return_scann_instance_filepath : bool, optional
        Whether or not to return the filepath of the ScaNN instance fit on the
        last word embedding weights, if it is present (defaults to False).

    Returns
    -------
    model_training_output : dict
        Dictionary containing output from word2vec training.
    """
    # Get filepaths of the model output
    checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
        output_dir=model_training_output_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )

    # Get last word embeddings from training
    last_embedding_weights_filepath = checkpoint_filepaths_dict[
        "intermediate_embedding_weight_filepaths"
    ][-1]
    last_embedding_weights = np.load(
        last_embedding_weights_filepath, mmap_mode=word_embeddings_mmap_mode
    )

    # Get word counts from tokenizer of word2vec model
    with open(
        checkpoint_filepaths_dict["train_word_counts_filepath"], "r"
    ) as word_counts_file:
        word_counts = np.array(
            [int(word_count) for word_count in word_counts_file.read().split("\n")]
        )

    # Get array of words and word_to_int lookup dictionary
    with open(checkpoint_filepaths_dict["train_words_filepath"], "r") as words_file:
        words = np.array(words_file.read().split("\n"))
    word_to_int = {word: i for i, word in enumerate(words)}

    # Normalized embedding weights
    last_embedding_weights_normalized = None
    if (
        return_normalized_embeddings
        and "intermediate_embedding_weight_normalized_filepaths"
        in checkpoint_filepaths_dict
    ):
        last_embedding_weights_normalized = np.load(
            checkpoint_filepaths_dict[
                "intermediate_embedding_weight_normalized_filepaths"
            ][-1],
            mmap_mode="r",
        )

    # Annoy index
    last_embedding_weights_annoy_instance = None
    if (
        return_annoy_instance
        and "intermediate_embedding_weight_annoy_index_filepaths"
        in checkpoint_filepaths_dict
    ):
        last_embedding_weights_annoy_instance = ApproxNN(ann_alg="annoy")
        last_embedding_weights_annoy_instance.load(
            ann_path=checkpoint_filepaths_dict[
                "intermediate_embedding_weight_annoy_index_filepaths"
            ][-1],
            annoy_data_dimensionality=last_embedding_weights.shape[1],
            annoy_mertic="euclidean",
            annoy_prefault=annoy_instance_prefault,
        )

    # ScaNN instance
    last_embedding_weights_scann_instance = None
    last_embedding_weights_scann_instance_filepath = None
    if "intermediate_embedding_weight_scann_artifact_dirs" in checkpoint_filepaths_dict:
        scann_instance_filepath = checkpoint_filepaths_dict[
            "intermediate_embedding_weight_scann_artifact_dirs"
        ][-1]
        if return_scann_instance:
            last_embedding_weights_scann_instance = ApproxNN(ann_alg="scann")
            last_embedding_weights_scann_instance.load(
                ann_path=scann_instance_filepath
            )
        if return_scann_instance_filepath:
            last_embedding_weights_scann_instance_filepath = scann_instance_filepath

    return {
        "last_embedding_weights": last_embedding_weights,
        "last_embedding_weights_filepath": last_embedding_weights_filepath,
        "last_embedding_weights_normalized": last_embedding_weights_normalized,
        "last_embedding_weights_annoy_instance": last_embedding_weights_annoy_instance,
        "last_embedding_weights_scann_instance": last_embedding_weights_scann_instance,
        "last_embedding_weights_scann_instance_filepath": last_embedding_weights_scann_instance_filepath,
        "words": words,
        "word_to_int": word_to_int,
        "word_counts": word_counts,
    }
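

# A minimal usage sketch for `load_model_training_output`; the directory and
# dataset names below are hypothetical placeholders and should point to an
# actual training run before this is executed.
def _example_load_model_training_output() -> None:
    training_output = load_model_training_output(
        model_training_output_dir="output/word2vec_training",  # placeholder path
        model_name="word2vec",
        dataset_name="enwiki",
        return_normalized_embeddings=True,
    )

    # The returned dict exposes the last embedding weights (memory-mapped),
    # the vocabulary array, its word -> int lookup and, optionally, ANN artifacts.
    embeddings = training_output["last_embedding_weights"]
    words = training_output["words"]
    word_to_int = training_output["word_to_int"]
    print(embeddings.shape, len(words), word_to_int[words[0]])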


def topological_polysemy_pipeline(
    semeval_word_senses_filepath: str,
    word2vec_semeval_model_dir: str,
    word2vec_enwiki_model_dir: str,
    word2vec_google_news_model_dir: str,
    glove_model_dir: str,
    fasttext_model_dir: str,
    fasttext_tps_model_dir: str,
    tps_neighbourhood_sizes: str,
    num_top_k_words_frequencies: int,
    cyclo_octane_data_filepath: str,
    henneberg_data_filepath: str,
    custom_point_cloud_neighbourhood_size: int,
    output_dir: str,
) -> None:
    """
    Computes the topological polysemy (TPS) of various word embeddings and datasets
    and saves the results, along with some additional plots, in the output directory.

    Parameters
    ----------
    semeval_word_senses_filepath : str
        Filepath of the SemEval-2010 task 14 word senses.
    word2vec_semeval_model_dir : str
        Directory of the SemEval-2010 task 14 word2vec model.
    word2vec_enwiki_model_dir : str
        Directory of the enwiki word2vec model.
    word2vec_google_news_model_dir : str
        Directory of the Google News 3M word2vec model.
    glove_model_dir : str
        Directory of the GloVe model.
    fasttext_model_dir : str
        Directory of the fastText model.
    fasttext_tps_model_dir : str
        Directory of the TPS fastText model.
    tps_neighbourhood_sizes : str
        Neighbourhood sizes to use when computing TPS (e.g. 50, 60).
    num_top_k_words_frequencies : int
        Number of top words to use when computing TPS scores vs. word frequencies.
    cyclo_octane_data_filepath : str
        Filepath of the cyclo-octane dataset.
    henneberg_data_filepath : str
        Filepath of the Henneberg dataset.
    custom_point_cloud_neighbourhood_size : int
        Neighbourhood size to use when computing TPS for custom point clouds.
    output_dir : str
        Output directory to save results.
    """
    # Ensure output directory exists
    makedirs(output_dir, exist_ok=True)

    # Load SemEval-2010 task 14 word senses
    semeval_word_senses: dict = joblib.load(semeval_word_senses_filepath)
    semeval_target_words = np.array(list(semeval_word_senses["all"].keys()))
    semeval_target_word_gs_clusters = np.array(
        list(semeval_word_senses["all"].values())
    )

    # Parse strings into ints
    tps_neighbourhood_sizes = [int(n_size) for n_size in tps_neighbourhood_sizes]

    # -- Compute TPS for word embeddings (SemEval and enwiki) --
    for dataset_name, model_dir in zip(
        ["semeval_2010_task_14", "enwiki"],
        [word2vec_semeval_model_dir, word2vec_enwiki_model_dir],
    ):
        # Load word embeddings
        print(f"Loading {dataset_name} word embeddings...")
        w2v_training_output = load_model_training_output(
            model_training_output_dir=model_dir,
            model_name="word2vec",
            dataset_name=dataset_name,
            return_normalized_embeddings=True,
            return_scann_instance=True,
        )
        last_embedding_weights_normalized = w2v_training_output[
            "last_embedding_weights_normalized"
        ]
        last_embedding_weights_scann_instance = w2v_training_output[
            "last_embedding_weights_scann_instance"
        ]
        words = w2v_training_output["words"]
        word_to_int = w2v_training_output["word_to_int"]
        word_counts = w2v_training_output["word_counts"]
        print("Done!")

        print("Computing TPS for word embeddings...")
        tps_word_embeddings(
            word_embeddings_name=dataset_name,
            neighbourhood_sizes=tps_neighbourhood_sizes,
            semeval_target_words=semeval_target_words,
            semeval_target_words_gs_clusters=semeval_target_word_gs_clusters,
            word_embeddings_normalized=last_embedding_weights_normalized,
            word_to_int=word_to_int,
            word_vocabulary=words,
            num_top_k_words_frequencies=num_top_k_words_frequencies,
            output_dir=output_dir,
            word_counts=word_counts,
            ann_instance=last_embedding_weights_scann_instance,
        )
        del last_embedding_weights_scann_instance
        print("Done!")

    # -- Compute TPS for external word embeddings --
    # Prepare constants
    external_word_embeddings = [
        (
            "google_news_3m",
            "GoogleNews-vectors-negative300",
            word2vec_google_news_model_dir,
        ),
        (
            "glove_cc_840b_300d",
            "glove.840B.300d",
            glove_model_dir,
        ),
        (
            "fasttext_cc_300d",
            "cc.en.300.vec",
            fasttext_model_dir,
        ),
        (
            "fasttext_tps_300d",
            "fastText.TPS.300d",
            fasttext_tps_model_dir,
        ),
    ]

    # Compute TPS for each external word embedding
    for word_embeddings_name, model_name, model_dir in external_word_embeddings:

        # Prepare filepaths
        model_normalized_weights_filepath = join(
            model_dir, f"{model_name}_normalized.npy"
        )
        model_words_filepath = join(model_dir, f"{model_name}_words.txt")
        model_scann_artifacts_dir = join(model_dir, f"{model_name}_scann_artifacts")

        # Load data
        print(f"Loading {model_name} data...")
        model_weights_normalized = np.load(
            model_normalized_weights_filepath, mmap_mode="r"
        )
        with open(model_words_filepath, "r") as words_file:
            model_words = np.array(words_file.read().split("\n"))
        model_approx_nn = ApproxNN(ann_alg="scann")
        model_approx_nn.load(ann_path=model_scann_artifacts_dir)
        print("Done!")

        print(f"Computing TPS for {model_name} word embeddings...")
        tps_word_embeddings(
            word_embeddings_name=word_embeddings_name,
            neighbourhood_sizes=tps_neighbourhood_sizes,
            semeval_target_words=semeval_target_words,
            semeval_target_words_gs_clusters=semeval_target_word_gs_clusters,
            word_embeddings_normalized=model_weights_normalized,
            word_to_int={word: i for i, word in enumerate(model_words)},
            word_vocabulary=model_words,
            num_top_k_words_frequencies=num_top_k_words_frequencies,
            output_dir=output_dir,
            ann_instance=model_approx_nn,
        )
        del model_approx_nn
        print("Done!")

    # -- Compute TPS for custom point clouds --
    for point_cloud_name, point_cloud_filepath in zip(
        ["cyclo_octane", "henneberg"],
        [cyclo_octane_data_filepath, henneberg_data_filepath],
    ):
        # Load and prepare data for TPS
        point_cloud = pd.read_csv(point_cloud_filepath, header=None).values
        point_cloud_normalized = point_cloud / np.linalg.norm(
            point_cloud, axis=1
        ).reshape(-1, 1)
        point_cloud_pairwise_dists = euclidean_distances(point_cloud)

        # Compute TPS scores
        num_points = len(point_cloud)
        tps_scores = np.zeros(num_points)
        print(f"Computing TPS scores for {point_cloud_name}...")
        for point_index in tqdm(range(num_points)):
            tps_score = tps_point_cloud(
                point_index=point_index,
                neighbourhood_size=custom_point_cloud_neighbourhood_size,
                point_cloud_normalized=point_cloud_normalized,
                point_cloud_pairwise_dists=point_cloud_pairwise_dists,
            )
            tps_scores[point_index] = tps_score

        # Save result
        point_cloud_output_dir = join(output_dir, point_cloud_name)
        makedirs(point_cloud_output_dir, exist_ok=True)
        np.save(
            join(
                point_cloud_output_dir,
                f"tps_scores_{custom_point_cloud_neighbourhood_size}.npy",
            ),
            tps_scores,
        )
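

# A hedged sketch of invoking the full TPS pipeline. Every filepath and
# directory below is a hypothetical placeholder; the neighbourhood sizes
# mirror the "50, 60" example from the docstring and are passed as strings,
# since the pipeline parses them to ints itself.
def _example_topological_polysemy_pipeline() -> None:
    topological_polysemy_pipeline(
        semeval_word_senses_filepath="data/semeval_2010_14_word_senses.joblib",
        word2vec_semeval_model_dir="output/word2vec_semeval",
        word2vec_enwiki_model_dir="output/word2vec_enwiki",
        word2vec_google_news_model_dir="data/google_news_3m",
        glove_model_dir="data/glove",
        fasttext_model_dir="data/fasttext",
        fasttext_tps_model_dir="data/fasttext_tps",
        tps_neighbourhood_sizes=["50", "60"],  # parsed to ints inside the pipeline
        num_top_k_words_frequencies=10000,  # illustrative value
        cyclo_octane_data_filepath="data/cyclo_octane.csv",
        henneberg_data_filepath="data/henneberg.csv",
        custom_point_cloud_neighbourhood_size=50,
        output_dir="output/tps",
    )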


def evaluate_word2vec(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    sswr_dataset_filepath: str,
    msr_dataset_filepath: str,
    pad_dataset_filepath: str,
    vocab_size: int,
    approx_nn_path: str,
    approx_nn_alg: str,
    top_n_prediction: int,
    output_dir: str,
) -> None:
    """
    Evaluates a word2vec model on the SSWR, MSR and PAD analogy test datasets.

    Parameters
    ----------
    model_dir : str
        Directory of the model to evaluate.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    sswr_dataset_filepath : str
        Filepath of the SSWR test dataset.
    msr_dataset_filepath : str
        Filepath of the MSR test dataset.
    pad_dataset_filepath : str
        Filepath of the PAD test dataset.
    vocab_size : int
        Vocabulary size to use when evaluating on the test datasets.
    approx_nn_path : str
        Filepath of an ApproxNN instance, built on the word embeddings.
    approx_nn_alg : str
        Algorithm of the ApproxNN instance.
    top_n_prediction : int
        Which top-N prediction to use.
    output_dir : str
        Output directory to save evaluation results.
    """
    # Load output from training word2vec
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )
    last_embedding_weights = w2v_training_output["last_embedding_weights"]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]

    # Append date/time to output directory
    output_dir = join(output_dir, datetime.now().strftime("%d-%b-%Y_%H-%M-%S"))
    makedirs(output_dir, exist_ok=True)

    # Load ApproxNN instance
    approx_nn = None
    if approx_nn_path != "":
        approx_nn = ApproxNN(ann_alg=approx_nn_alg)
        load_args = {}
        if approx_nn_alg == "annoy":
            load_args["annoy_data_dimensionality"] = last_embedding_weights.shape[1]
            load_args["annoy_mertic"] = "euclidean"
            load_args["annoy_prefault"] = True
        approx_nn.load(approx_nn_path, **load_args)

    # SSWR
    print("--- Evaluating SSWR ---")
    sswr_accuracies = evaluate_model_word_analogies(
        analogies_filepath=sswr_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )

    # Compute average semantic and syntactic accuracies
    sswr_categories = list(sswr_accuracies.keys())
    sswr_semantic_categories = sswr_categories[:5]
    sswr_syntactic_categories = sswr_categories[5:-1]
    sswr_semantic_avg_acc = np.mean(
        [sswr_accuracies[cat] for cat in sswr_semantic_categories]
    )
    sswr_syntactic_avg_acc = np.mean(
        [sswr_accuracies[cat] for cat in sswr_syntactic_categories]
    )
    sswr_accuracies["semantic_avg"] = sswr_semantic_avg_acc
    sswr_accuracies["syntactic_avg"] = sswr_syntactic_avg_acc

    save_analogies_accuracies_to_file("sswr", output_dir, sswr_accuracies)
    print(sswr_accuracies)

    # MSR
    print("--- Evaluating MSR ---")
    msr_accuracies = evaluate_model_word_analogies(
        analogies_filepath=msr_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )
    save_analogies_accuracies_to_file("msr", output_dir, msr_accuracies)
    print(msr_accuracies)

    # PAD
    print("--- Evaluating PAD ---")
    pad_accuracies = evaluate_model_word_analogies(
        analogies_filepath=pad_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )
    save_analogies_accuracies_to_file("pad", output_dir, pad_accuracies)
    print(pad_accuracies)
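

# A minimal sketch of evaluating a trained word2vec model on the three analogy
# test sets. All filepaths and the vocabulary size are hypothetical
# placeholders; an empty `approx_nn_path` skips loading an ApproxNN index, as
# handled in `evaluate_word2vec` above.
def _example_evaluate_word2vec() -> None:
    evaluate_word2vec(
        model_dir="output/word2vec_enwiki",  # placeholder training output dir
        model_name="word2vec",
        dataset_name="enwiki",
        sswr_dataset_filepath="data/sswr_analogies",  # placeholder filepaths
        msr_dataset_filepath="data/msr_analogies",
        pad_dataset_filepath="data/pad_analogies",
        vocab_size=300000,  # placeholder vocabulary size
        approx_nn_path="",  # empty string -> evaluate without an ANN index
        approx_nn_alg="scann",
        top_n_prediction=1,
        output_dir="output/word2vec_eval",
    )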


def prepare_num_word_meanings_supervised_data(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    id_estimation_num_neighbours: list,
    semeval_2010_14_word_senses_filepath: str,
    tps_neighbourhood_sizes: list,
    raw_data_dir: str,
    output_dir: str,
) -> None:
    """
    Prepares data for the supervised word meanings prediction task.

    Parameters
    ----------
    model_dir : str
        Directory of the model to load.
    model_name : str
        Name of the trained word2vec model.
    dataset_name : str
        Name of the dataset the model is trained on.
    id_estimation_num_neighbours : list
        Number of neighbours to use when estimating the intrinsic dimension
        of each word vector.
    semeval_2010_14_word_senses_filepath : str
        Filepath of the SemEval-2010 task 14 word senses joblib dict.
    tps_neighbourhood_sizes : list
        List of TPS neighbourhood sizes.
    raw_data_dir : str
        Directory where raw data will be saved to.
    output_dir : str
        Output directory.
    """
    # Convert list arguments to int
    tps_neighbourhood_sizes = [int(n_size) for n_size in tps_neighbourhood_sizes]
    id_estimation_num_neighbours = [
        int(num_neighbours) for num_neighbours in id_estimation_num_neighbours
    ]

    # Prepare directory constants and create raw data dir for caching data files
    task_id = f"wme_{model_name}_{dataset_name}"  # wme = word meaning estimation
    task_raw_data_dir = join(raw_data_dir, task_id)
    task_raw_data_tps_dir = join(task_raw_data_dir, "tps")
    makedirs(task_raw_data_dir, exist_ok=True)

    # Load word embeddings from model
    print("Loading word embeddings...")
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
        return_normalized_embeddings=True,
        return_scann_instance_filepath=True,
    )
    last_embedding_weights_normalized = w2v_training_output[
        "last_embedding_weights_normalized"
    ]
    last_embedding_weights_scann_instance_filepath = w2v_training_output[
        "last_embedding_weights_scann_instance_filepath"
    ]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]
    print("Done!")

    # Prepare SemEval-2010 task 14 data
    semeval_2010_14_word_senses = joblib.load(semeval_2010_14_word_senses_filepath)
    semeval_target_words = np.array(list(semeval_2010_14_word_senses["all"].keys()))
    semeval_target_words_in_vocab_filter = [
        i for i, word in enumerate(semeval_target_words) if word in word_to_int
    ]
    semeval_target_words_in_vocab = semeval_target_words[
        semeval_target_words_in_vocab_filter
    ]
    semeval_gs_clusters = np.array(list(semeval_2010_14_word_senses["all"].values()))
    semeval_gs_clusters_in_vocab = semeval_gs_clusters[
        semeval_target_words_in_vocab_filter
    ]
    semeval_2010_14_word_senses_in_vocab = {
        word: gs_meanings
        for word, gs_meanings in zip(
            semeval_target_words_in_vocab, semeval_gs_clusters_in_vocab
        )
    }

    # (1) -- Find words in WordNet that are in the word2vec model's vocabulary --
    words_to_num_meanings_filepath = join(
        task_raw_data_dir, "words_to_num_meanings.joblib"
    )
    if not isfile(words_to_num_meanings_filepath):
        words_to_num_meanings = semeval_2010_14_word_senses_in_vocab.copy()
        print("Finding words in vocabulary with #WordNet synsets > 0...")
        for word in tqdm(words):
            if word in semeval_target_words_in_vocab:
                continue
            num_synsets = len(wn.synsets(word))
            if num_synsets > 0:
                words_to_num_meanings[word] = num_synsets
        joblib.dump(words_to_num_meanings, words_to_num_meanings_filepath)
    else:
        words_to_num_meanings = joblib.load(words_to_num_meanings_filepath)
        print("Loaded words_to_num_meanings!")
    data_words = np.array(list(words_to_num_meanings.keys()))
    data_words_no_semeval = [
        word for word in data_words if word not in semeval_target_words_in_vocab
    ]
    data_word_to_int = {word: i for i, word in enumerate(data_words)}

    # Filter out word embeddings using WordNet words (data_words)
    data_words_to_full_vocab_ints = np.array(
        [word_to_int[word] for word in data_words]
    )

    # (2) -- Compute TPS_n for train/test words --
    makedirs(task_raw_data_tps_dir, exist_ok=True)
    tps_scores_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_scores.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    tps_pds_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_pds.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
        tps_neighbourhood_sizes, tps_scores_filepaths, tps_pds_filepaths
    ):
        if isfile(tps_scores_filepath) and isfile(tps_pds_filepath):
            continue
        print(
            f"Computing TPS scores using neighbourhood size {tps_neighbourhood_size}..."
        )

        # Load ScaNN instance
        scann_instance = ApproxNN(ann_alg="scann")
        scann_instance.load(ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute TPS
        tps_scores_ns, tps_pds_ns = tps_multiple(
            target_words=data_words,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            ann_instance=scann_instance,
            return_persistence_diagram=True,
            n_jobs=-1,
            progressbar_enabled=True,
        )

        # Save result
        print("Saving TPS result...")
        np.save(tps_scores_filepath, tps_scores_ns)
        np.save(tps_pds_filepath, tps_pds_ns)
        print("Done!")

        # Free resources
        del scann_instance

    # (3) -- Compute GAD --
    gad_dir = join(task_raw_data_dir, "gad")
    makedirs(gad_dir, exist_ok=True)
    gad_params = [
        (25, 250),
        (25, 500),
        (25, 750),
        (25, 1000),
        # ----------
        (50, 250),
        (50, 500),
        (50, 750),
        (50, 1000),
        # ----------
        (100, 1000),
        (100, 1250),
        (100, 1500),
        (100, 1750),
        (100, 2000),
        # ----------
        (150, 1000),
        (150, 1250),
        (150, 1500),
        (150, 1750),
        (150, 2000),
        # ----------
        (200, 1000),
        (200, 1250),
        (200, 1500),
        (200, 1750),
        (200, 2000),
    ]
    gad_categories = {"P_man": 0, "P_int": 1, "P_bnd": 2}
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"
        gad_filepath = join(gad_dir, f"{gad_id}.joblib")
        if isfile(gad_filepath):
            continue
        print(f"-- {gad_id} --")

        # Load ScaNN instance
        approx_nn = ApproxNN(ann_alg="scann")
        approx_nn.load(ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute features
        gad_result = compute_gad(
            data_points=last_embedding_weights_normalized,
            data_point_ints=data_words_to_full_vocab_ints,
            manifold_dimension=2,
            data_points_approx_nn=approx_nn,
            use_knn_annulus=True,
            knn_annulus_inner=inner_param,
            knn_annulus_outer=outer_param,
            return_annlus_persistence_diagrams=True,
            progressbar_enabled=True,
            n_jobs=-1,
        )
        print(
            "P_man:",
            len(gad_result["P_man"]),
            "P_int:",
            len(gad_result["P_int"]),
            "P_bnd:",
            len(gad_result["P_bnd"]),
        )
        joblib.dump(gad_result, gad_filepath, protocol=4)

        # Free resources
        del approx_nn

    # (4) -- Estimate the intrinsic dimension (ID) for each word vector --
    words_estimated_ids_dir = join(task_raw_data_dir, "estimated_ids")
    id_estimators: List[Tuple[str, GlobalEstimator, dict]] = [
        ("lpca", est_ids.lPCA, {}),
        ("knn", est_ids.KNN, {}),
        ("twonn", est_ids.TwoNN, {}),
        ("mle", est_ids.MLE, {}),
        ("tle", est_ids.TLE, {}),
    ]
    makedirs(words_estimated_ids_dir, exist_ok=True)
    for id_estimator_name, id_estimator_cls, id_estimator_params in id_estimators:
        for num_neighbours in id_estimation_num_neighbours:
            estimated_ids_filepath = join(
                words_estimated_ids_dir, f"{id_estimator_name}_{num_neighbours}.npy"
            )
            if isfile(estimated_ids_filepath):
                continue
            print(
                f"Estimating IDs using {id_estimator_cls.__name__} with {num_neighbours} neighbours..."
            )
            id_estimator = id_estimator_cls(**id_estimator_params)
            estimated_ids = id_estimator.fit_predict_pw(
                X=last_embedding_weights_normalized[data_words_to_full_vocab_ints],
                n_neighbors=num_neighbours,
                n_jobs=-1,
            )
            # estimated_ids = estimated_ids_full[data_words_to_full_vocab_ints]
            print("Done! Saving to file...")
            np.save(estimated_ids_filepath, estimated_ids)

    # (5) -- Create features from GAD result to speed up combining of data --
    gad_features_dir = join(task_raw_data_dir, "gad_features")
    makedirs(gad_features_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"
        gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
        if isfile(gad_features_filepath):
            continue
        print(f"Creating GAD features for {gad_id}...")

        # Load GAD result
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Features from GAD (P_man, P_int, P_bnd)
        gad_features = np.zeros((len(data_words_to_full_vocab_ints), 3), dtype=int)
        for i, word_int in enumerate(tqdm(data_words_to_full_vocab_ints)):
            for gad_category, gad_category_idx in gad_categories.items():
                if word_int in gad_result[gad_category]:
                    gad_features[i, gad_category_idx] = 1

        # Save GAD features
        np.save(gad_features_filepath, gad_features)

    # (6) -- Vectorize persistence diagrams from GAD features --
    gad_features_pd_vectorized_dir = join(
        task_raw_data_dir, "gad_features_pd_vectorized"
    )
    gad_features_pd_vectorized_size = 5
    gad_features_pd_vectorized_size_flat = gad_features_pd_vectorized_size**2
    makedirs(gad_features_pd_vectorized_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"
        gad_features_pd_vecs_filepath = join(
            gad_features_pd_vectorized_dir, f"{gad_id}.npy"
        )
        if isfile(gad_features_pd_vecs_filepath):
            continue
        print(f"Vectorizing GAD features for {gad_id}...")

        # Load GAD features
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Use PersistenceImager to vectorize persistence diagrams
        gad_features_pd_vecs = np.zeros(
            (len(data_words_to_full_vocab_ints), gad_features_pd_vectorized_size_flat)
        )
        for i, point_index in enumerate(tqdm(data_words_to_full_vocab_ints)):

            # Get persistence diagram and create a range such that we get a
            # square image from PersistenceImager
            gad_features_pd = gad_result["annulus_pds"][point_index]
            if len(gad_features_pd) == 0:
                gad_features_pd_vecs[i] = np.zeros(
                    gad_features_pd_vectorized_size_flat, dtype=int
                )
                continue
            births, deaths = gad_features_pd.T
            persistence = deaths - births
            square_min = min(births.min(), persistence.min())
            square_max = max(births.max(), persistence.max())
            square_range = (square_min, square_max)
            pixel_size = (square_max - square_min) / gad_features_pd_vectorized_size

            # Vectorize persistence diagram
            pimgr = PersistenceImager(
                birth_range=square_range,
                pers_range=square_range,
                pixel_size=pixel_size,
            )
            pd_vec = pimgr.transform(gad_features_pd)
            gad_features_pd_vecs[i] = pd_vec.flatten()

        # Save persistence image vectors to file
        np.save(gad_features_pd_vecs_filepath, gad_features_pd_vecs)

    # (7) -- Combine data into features and labels for the WME task --
    word_meaning_train_data_filepath = join(output_dir, "word_meaning_train_data.csv")
    word_meaning_test_data_filepath = join(output_dir, "word_meaning_test_data.csv")
    word_meaning_semeval_test_data_filepath = join(
        output_dir, "word_meaning_semeval_test_data.csv"
    )
    if (
        not isfile(word_meaning_train_data_filepath)
        or not isfile(word_meaning_test_data_filepath)
        or not isfile(word_meaning_semeval_test_data_filepath)
    ):
        # -- Load data for creating features --
        # Load estimated IDs from file
        words_estimated_ids = {
            f"{id_estimator_name}_{num_neighbours}": np.load(
                join(
                    words_estimated_ids_dir,
                    f"{id_estimator_name}_{num_neighbours}.npy",
                )
            )
            for num_neighbours in id_estimation_num_neighbours
            for id_estimator_name, _, _ in id_estimators
        }
        print("Loaded estimated IDs!")

        # Load GAD features
        gad_features_dict = {}
        for inner_param, outer_param in gad_params:
            gad_id = f"gad_knn_{inner_param}_{outer_param}"
            gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
            gad_features_dict[gad_id] = np.load(gad_features_filepath)
        print("Loaded GAD features!")

        # Load TPS features
        tps_scores = {}
        tps_pds = {}
        for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
            tps_neighbourhood_sizes, tps_scores_filepaths, tps_pds_filepaths
        ):
            tps_scores[tps_neighbourhood_size] = np.load(tps_scores_filepath)
            tps_pds[tps_neighbourhood_size] = np.load(
                tps_pds_filepath, allow_pickle=True
            )
        print("Loaded TPS features!")

        data_words_train, data_words_test = train_test_split(
            data_words_no_semeval, test_size=0.05, random_state=rng_seed
        )
        if not isfile(word_meaning_train_data_filepath):
            print("Preparing data for training...")
            train_data_df = create_word_meaning_model_data_features(
                target_words=data_words_train,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            train_data_df.to_csv(word_meaning_train_data_filepath, index=False)
        else:
            # Reuse cached training data so the summary below is always defined
            train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        if not isfile(word_meaning_test_data_filepath):
            print("Preparing data for testing...")
            test_data_df = create_word_meaning_model_data_features(
                target_words=data_words_test,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            test_data_df.to_csv(word_meaning_test_data_filepath, index=False)
        else:
            test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        if not isfile(word_meaning_semeval_test_data_filepath):
            print("Preparing data for external testing (SemEval)...")
            semeval_test_data_df = create_word_meaning_model_data_features(
                target_words=semeval_target_words_in_vocab,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            semeval_test_data_df.to_csv(
                word_meaning_semeval_test_data_filepath, index=False
            )
        else:
            semeval_test_data_df = pd.read_csv(word_meaning_semeval_test_data_filepath)
    else:
        train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        semeval_test_data_df = pd.read_csv(word_meaning_semeval_test_data_filepath)

    print("Train", train_data_df)
    print("Test", test_data_df)
    print("SemEval test", semeval_test_data_df)
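

# A hedged sketch of preparing the word meaning estimation (WME) data files.
# Directories and filepaths are hypothetical placeholders, and the neighbour
# counts / neighbourhood sizes are illustrative values only; both lists are
# converted to ints inside the function.
def _example_prepare_num_word_meanings_supervised_data() -> None:
    prepare_num_word_meanings_supervised_data(
        model_dir="output/word2vec_enwiki",  # placeholder training output dir
        model_name="word2vec",
        dataset_name="enwiki",
        id_estimation_num_neighbours=["25", "50", "100"],
        semeval_2010_14_word_senses_filepath="data/semeval_2010_14_word_senses.joblib",
        tps_neighbourhood_sizes=["40", "50", "60"],
        raw_data_dir="data/raw",
        output_dir="output/wme",
    )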