# Imports assumed by the snippets below:
import numpy as np
from tqdm import tqdm
from persim import PersistenceImager, images_weights


def to_metric(self, pixel_size=0.1):
    # Collect all birth values and all finite persistence values across every
    # diagram, so a single common imager grid covers the whole dataset.
    births = [b for dgms in self.data for dgm in dgms for b, _ in dgm]
    tpers = [d - b for dgms in self.data for dgm in dgms for b, d in dgm if d < np.inf]
    pim = PersistenceImager(
        (min(births), max(max(births), pixel_size)),
        (min(tpers), max(max(tpers), pixel_size)),
        pixel_size=pixel_size,
    )
    # One flattened persistence image per diagram, one stacked row per sample.
    return np.vstack([
        np.stack([pim.transform(noinf(d)) for d in D]).ravel()
        for D in tqdm(self.data, desc='[ Persim')
    ])
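
# `noinf` is not defined in this snippet; from its use above it presumably
# strips points with infinite death before imaging, since PersistenceImager
# cannot place them on a finite grid. A minimal sketch under that assumption:
def noinf(dgm):
    # Keep only finite (birth, death) pairs of the diagram.
    dgm = np.asarray(dgm, dtype=float)
    return dgm[np.isfinite(dgm).all(axis=1)] if len(dgm) else dgm
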
def test_fit_diagram_list():
    persimgr = PersistenceImager(birth_range=(0, 1), pers_range=(0, 2), pixel_size=1)
    dgms = [
        np.array([[1, 2], [4, 8], [-1, 5.25]]),
        np.array([[1, 2], [2, 3], [3, 4]])
    ]
    persimgr.fit(dgms)
    np.testing.assert_equal(persimgr.pixel_size, 1)
    np.testing.assert_equal(persimgr._pixel_size, 1)
    np.testing.assert_equal(persimgr.birth_range, (-1, 4))
    np.testing.assert_equal(persimgr._birth_range, (-1, 4))
    np.testing.assert_equal(persimgr.pers_range, (0.625, 6.625))
    np.testing.assert_equal(persimgr._pers_range, (0.625, 6.625))
    np.testing.assert_equal(persimgr.height, 6)
    np.testing.assert_equal(persimgr._height, 6)
    np.testing.assert_equal(persimgr.width, 5)
    np.testing.assert_equal(persimgr._width, 5)
    np.testing.assert_equal(persimgr.resolution, (5, 6))
    np.testing.assert_equal(persimgr._resolution, (5, 6))
    np.testing.assert_array_equal(
        persimgr._ppnts, [0.625, 1.625, 2.625, 3.625, 4.625, 5.625, 6.625])
    np.testing.assert_array_equal(persimgr._bpnts, [-1., 0., 1., 2., 3., 4.])
def test_lists_of_lists(self):
    persimgr = PersistenceImager(birth_range=(0, 3), pers_range=(0, 3), pixel_size=1)
    dgm = [[0, 1], [1, 1], [3, 5]]
    img = persimgr.transform(dgm)
    np.testing.assert_equal(img.shape, (3, 3))
def test_n_pixels(self):
    persimgr = PersistenceImager(birth_range=(0, 5), pers_range=(0, 3), pixel_size=1)
    dgm = np.array([[0, 1], [1, 1], [3, 5]])
    img = persimgr.transform(dgm)
    np.testing.assert_equal(img.shape, (5, 3))

    img = persimgr.fit_transform(dgm)
    np.testing.assert_equal(img.shape, (3, 2))
def test_linear_ramp(self):
    persimgr = PersistenceImager(
        weight=images_weights.linear_ramp,
        weight_params={'low': 0.0, 'high': 5.0, 'start': 0.0, 'end': 1.0},
    )
    wf = persimgr.weight
    wf_params = persimgr.weight_params
    np.testing.assert_equal(wf(1, 0, **wf_params), 0)
    np.testing.assert_equal(wf(1, 1 / 5, **wf_params), 1)
    np.testing.assert_equal(wf(1, 1, **wf_params), 5)
    np.testing.assert_equal(wf(1, 2, **wf_params), 5)

    persimgr.weight_params = {'low': 0.0, 'high': 5.0, 'start': 0.0, 'end': 5.0}
    wf_params = persimgr.weight_params
    np.testing.assert_equal(wf(1, 0, **wf_params), 0)
    np.testing.assert_equal(wf(1, 1 / 5, **wf_params), 1 / 5)
    np.testing.assert_equal(wf(1, 1, **wf_params), 1)
    np.testing.assert_equal(wf(1, 5, **wf_params), 5)

    persimgr.weight_params = {'low': 0.0, 'high': 5.0, 'start': 1.0, 'end': 5.0}
    wf_params = persimgr.weight_params
    np.testing.assert_equal(wf(1, 0, **wf_params), 0)
    np.testing.assert_equal(wf(1, 1, **wf_params), 0)
    np.testing.assert_equal(wf(1, 5, **wf_params), 5)

    persimgr.weight_params = {'low': 1.0, 'high': 5.0, 'start': 1.0, 'end': 5.0}
    wf_params = persimgr.weight_params
    np.testing.assert_equal(wf(1, 0, **wf_params), 1)
    np.testing.assert_equal(wf(1, 1, **wf_params), 1)
    np.testing.assert_equal(wf(1, 2, **wf_params), 2)
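
# For reference, the weight checked above is piecewise linear in the
# persistence value y (this form is consistent with every assertion in
# test_linear_ramp):
#   w(y) = low                                               for y <= start
#   w(y) = low + (high - low) * (y - start) / (end - start)  for start < y < end
#   w(y) = high                                              for y >= end
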
def test_persistence(self):
    persimgr = PersistenceImager(weight=images_weights.persistence,
                                 weight_params={'n': 1.0})
    wf = persimgr.weight
    wf_params = persimgr.weight_params
    x = np.random.rand()
    np.testing.assert_equal(wf(1, x, **wf_params), x)

    persimgr.weight_params = {'n': 1.5}
    wf_params = persimgr.weight_params
    np.testing.assert_equal(wf(1, x, **wf_params), x**1.5)
def test_multiple_diagrams(self):
    persimgr = PersistenceImager(birth_range=(0, 5), pers_range=(0, 3), pixel_size=1)

    dgm1 = np.array([[0, 1], [1, 1], [3, 5]])
    dgm2 = np.array([[0, 1], [1, 1], [3, 6], [1, 1]])
    imgs = persimgr.transform([dgm1, dgm2])
    np.testing.assert_equal(len(imgs), 2)
    np.testing.assert_equal(imgs[0].shape, imgs[1].shape)

    imgs = persimgr.fit_transform([dgm1, dgm2])
    np.testing.assert_equal(len(imgs), 2)
    np.testing.assert_equal(imgs[0].shape, imgs[1].shape)
    np.testing.assert_equal(imgs[0].shape, (3, 3))
def test_zero_on_birthaxis(self):
    persimgr = PersistenceImager(
        weight=images_weights.linear_ramp,
        weight_params={'low': 0.0, 'high': 1.0, 'start': 0.0, 'end': 1.0},
    )
    wf = persimgr.weight
    wf_params = persimgr.weight_params
    np.testing.assert_equal(wf(1, 0, **wf_params), 0)

    persimgr = PersistenceImager(weight=images_weights.persistence,
                                 weight_params={'n': 2})
    wf = persimgr.weight
    wf_params = persimgr.weight_params
    np.testing.assert_equal(wf(1, 0, **wf_params), 0)
def test_mixed_pairs():
    """
    Regression test inspired by gh issue #3 by gh user muszyna25:
    integer diagrams returned nan values.

    This did not work: dgm = [[0, 2], [0, 6], [0, 8]]
    This one worked fine: dgm = [[0.0, 2.0], [0.0, 6.0], [0.0, 8.0]]
    """
    persimgr = PersistenceImager()
    dgm = [[0, 2], [0, 6], [0, 8]]
    dgm2 = [[0.0, 2.0], [0.0, 6.0], [0.0, 8.0]]
    dgm3 = [[0.0, 2], [0.0, 6.0], [0, 8.0e0]]
    res = persimgr.transform(dgm)
    res2 = persimgr.transform(dgm2)
    res3 = persimgr.transform(dgm3)
    np.testing.assert_array_equal(res, res2)
    np.testing.assert_array_equal(res, res3)
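
# The assertions above confirm that integer, float, and mixed pairs now agree.
# For versions affected by the issue, a simple workaround was to cast diagrams
# to float before transforming. A minimal sketch:
#
#   dgm = np.asarray([[0, 2], [0, 6], [0, 8]], dtype=float)
#   img = PersistenceImager().transform(dgm)
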
def test_empty_diagram_list():
    dgms1 = [np.array([[2, 3]]), np.zeros((0, 2))]
    persimgr1 = PersistenceImager(pixel_size=0.1)
    res1 = persimgr1.transform(dgms1)
    np.testing.assert_array_equal(res1[1], np.zeros((10, 10)))

    dgms2 = [np.zeros((0, 2)), np.array([[2, 3]])]
    persimgr2 = PersistenceImager(pixel_size=0.1)
    res2 = persimgr2.transform(dgms2)
    np.testing.assert_array_equal(res2[0], np.zeros((10, 10)))

    dgms3 = [np.zeros((0, 2)), np.zeros((0, 2))]
    persimgr3 = PersistenceImager(pixel_size=0.1)
    res3 = persimgr3.transform(dgms3)
    np.testing.assert_array_equal(res3[0], np.zeros((10, 10)))
    np.testing.assert_array_equal(res3[1], np.zeros((10, 10)))
def test_pixel_size_setter():
    persimgr = PersistenceImager(birth_range=(0, 1), pers_range=(0, 2), pixel_size=1)
    persimgr.pixel_size = .75
    np.testing.assert_equal(persimgr.pixel_size, .75)
    np.testing.assert_equal(persimgr._pixel_size, .75)
    np.testing.assert_equal(persimgr.birth_range, (-0.25, 1.25))
    np.testing.assert_equal(persimgr._birth_range, (-0.25, 1.25))
    np.testing.assert_equal(persimgr.pers_range, (-0.125, 2.125))
    np.testing.assert_equal(persimgr._pers_range, (-0.125, 2.125))
    np.testing.assert_equal(persimgr.height, 2.25)
    np.testing.assert_equal(persimgr._height, 2.25)
    np.testing.assert_equal(persimgr.width, 1.5)
    np.testing.assert_equal(persimgr._width, 1.5)
    np.testing.assert_equal(persimgr.resolution, (2, 3))
    np.testing.assert_equal(persimgr._resolution, (2, 3))
    np.testing.assert_array_equal(persimgr._ppnts, [-0.125, 0.625, 1.375, 2.125])
    np.testing.assert_array_equal(persimgr._bpnts, [-0.25, 0.5, 1.25])
def test_pers_range_setter():
    persimgr = PersistenceImager(birth_range=(0, 1), pers_range=(0, 2), pixel_size=1)
    persimgr.pers_range = (-1.5, 4.5)
    np.testing.assert_equal(persimgr.pixel_size, 1)
    np.testing.assert_equal(persimgr._pixel_size, 1)
    np.testing.assert_equal(persimgr.pers_range, (-1.5, 4.5))
    np.testing.assert_equal(persimgr._pers_range, (-1.5, 4.5))
    np.testing.assert_equal(persimgr.birth_range, (0, 1))
    np.testing.assert_equal(persimgr._birth_range, (0, 1))
    np.testing.assert_equal(persimgr.width, 1)
    np.testing.assert_equal(persimgr._width, 1)
    np.testing.assert_equal(persimgr.height, 6)
    np.testing.assert_equal(persimgr._height, 6)
    np.testing.assert_equal(persimgr.resolution, (1, 6))
    np.testing.assert_equal(persimgr._resolution, (1, 6))
    np.testing.assert_array_equal(persimgr._ppnts,
                                  [-1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5])
    np.testing.assert_array_equal(persimgr._bpnts, [0., 1.])
def test_birth_range_setter():
    persimgr = PersistenceImager(birth_range=(0, 1), pers_range=(0, 2), pixel_size=1)
    persimgr.birth_range = (0.0, 4.5)
    np.testing.assert_equal(persimgr.pixel_size, 1)
    np.testing.assert_equal(persimgr._pixel_size, 1)
    np.testing.assert_equal(persimgr.pers_range, (0, 2))
    np.testing.assert_equal(persimgr._pers_range, (0, 2))
    np.testing.assert_equal(persimgr.birth_range, (-.25, 4.75))
    np.testing.assert_equal(persimgr._birth_range, (-.25, 4.75))
    np.testing.assert_equal(persimgr.width, 5)
    np.testing.assert_equal(persimgr._width, 5)
    np.testing.assert_equal(persimgr.height, 2)
    np.testing.assert_equal(persimgr._height, 2)
    np.testing.assert_equal(persimgr.resolution, (5, 2))
    np.testing.assert_equal(persimgr._resolution, (5, 2))
    np.testing.assert_array_equal(persimgr._bpnts,
                                  [-0.25, 0.75, 1.75, 2.75, 3.75, 4.75])
    np.testing.assert_array_equal(persimgr._ppnts, [0., 1., 2.])
def construct_imager(param_dict):
    # Build an imager from a parameter dict and return it (the original body
    # constructed the object but never returned it).
    pimgr = PersistenceImager(**param_dict)
    return pimgr
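
# Hedged usage sketch of the helper above; it relies on the `return` added in
# construct_imager, and the parameter names mirror the PersistenceImager
# keywords exercised throughout these tests:
def test_construct_imager():
    pimgr = construct_imager(
        {'birth_range': (0, 2), 'pers_range': (0, 2), 'pixel_size': 1})
    # A (0, 2) x (0, 2) grid with pixel size 1 yields a 2 x 2 image.
    img = pimgr.transform(np.array([[0.5, 1.5]]))
    np.testing.assert_equal(img.shape, (2, 2))
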
def test_empty_diagram():
    dgm = np.zeros((0, 2))
    persimgr = PersistenceImager(pixel_size=0.1)
    res = persimgr.transform(dgm)
    # With the default (0, 1) birth and persistence ranges and pixel_size=0.1,
    # the image grid is 10 x 10.
    np.testing.assert_array_equal(res, np.zeros((10, 10)))
from os import makedirs
from os.path import isfile, join
from typing import List, Tuple

import joblib
import numpy as np
import pandas as pd
from nltk.corpus import wordnet as wn
from persim import PersistenceImager
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Project-specific helpers (load_model_training_output, ApproxNN, tps_multiple,
# compute_gad, est_ids, GlobalEstimator, create_word_meaning_model_data_features,
# rng_seed) are assumed to be importable from the surrounding codebase.


def prepare_num_word_meanings_supervised_data(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    id_estimation_num_neighbours: list,
    semeval_2010_14_word_senses_filepath: str,
    tps_neighbourhood_sizes: list,
    raw_data_dir: str,
    output_dir: str,
) -> None:
    """
    Prepares data for the supervised word meanings prediction task.

    Parameters
    ----------
    model_dir : str
        Directory of the model to load.
    model_name : str
        Name of the trained word2vec model.
    dataset_name : str
        Name of the dataset the model is trained on.
    id_estimation_num_neighbours : list
        Number of neighbours to use when estimating intrinsic dimension for each word.
    semeval_2010_14_word_senses_filepath : str
        Filepath of SemEval-2010 task 14 word senses joblib dict.
    tps_neighbourhood_sizes : list
        List of TPS neighbourhood sizes.
    raw_data_dir : str
        Directory where raw data will be saved to.
    output_dir : str
        Output directory.
    """
    # Convert list arguments to int
    tps_neighbourhood_sizes = [int(n_size) for n_size in tps_neighbourhood_sizes]
    id_estimation_num_neighbours = [
        int(num_neighbours) for num_neighbours in id_estimation_num_neighbours
    ]

    # Prepare directory constants and create raw data dir for caching data files
    task_id = f"wme_{model_name}_{dataset_name}"  # wme = word meaning estimation
    task_raw_data_dir = join(raw_data_dir, task_id)
    task_raw_data_tps_dir = join(task_raw_data_dir, "tps")
    makedirs(task_raw_data_dir, exist_ok=True)

    # Load word embeddings from model
    print("Loading word embeddings...")
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
        return_normalized_embeddings=True,
        return_scann_instance_filepath=True,
    )
    last_embedding_weights_normalized = w2v_training_output[
        "last_embedding_weights_normalized"
    ]
    last_embedding_weights_scann_instance_filepath = w2v_training_output[
        "last_embedding_weights_scann_instance_filepath"
    ]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]
    print("Done!")

    # Prepare SemEval-2010 task 14 data
    semeval_2010_14_word_senses = joblib.load(semeval_2010_14_word_senses_filepath)
    semeval_target_words = np.array(list(semeval_2010_14_word_senses["all"].keys()))
    semeval_target_words_in_vocab_filter = [
        i for i, word in enumerate(semeval_target_words) if word in word_to_int
    ]
    semeval_target_words_in_vocab = semeval_target_words[
        semeval_target_words_in_vocab_filter
    ]
    semeval_gs_clusters = np.array(list(semeval_2010_14_word_senses["all"].values()))
    semeval_gs_clusters_in_vocab = semeval_gs_clusters[
        semeval_target_words_in_vocab_filter
    ]
    semeval_2010_14_word_senses_in_vocab = {
        word: gs_meanings
        for word, gs_meanings in zip(
            semeval_target_words_in_vocab, semeval_gs_clusters_in_vocab
        )
    }

    # (1) -- Find words in Wordnet that are in the word2vec model's vocabulary --
    words_to_num_meanings_filepath = join(
        task_raw_data_dir, "words_to_num_meanings.joblib"
    )
    if not isfile(words_to_num_meanings_filepath):
        words_to_num_meanings = semeval_2010_14_word_senses_in_vocab.copy()
        print("Finding words in vocabulary with #Wordnet synsets > 0")
        for word in tqdm(words):
            if word in semeval_target_words_in_vocab:
                continue
            num_synsets = len(wn.synsets(word))
            if num_synsets > 0:
                words_to_num_meanings[word] = num_synsets
        joblib.dump(words_to_num_meanings, words_to_num_meanings_filepath)
    else:
        words_to_num_meanings = joblib.load(words_to_num_meanings_filepath)
        print("Loaded words_to_num_meanings!")
    data_words = np.array(list(words_to_num_meanings.keys()))
    data_words_no_semeval = [
        word for word in data_words if word not in semeval_target_words_in_vocab
    ]
    data_word_to_int = {word: i for i, word in enumerate(data_words)}

    # Filter out word embeddings using Wordnet words (data_words)
    data_words_to_full_vocab_ints = np.array(
        [word_to_int[word] for word in data_words]
    )

    # (2) -- Compute TPS_n for train/test words --
    makedirs(task_raw_data_tps_dir, exist_ok=True)
    tps_scores_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_scores.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    tps_pds_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_pds.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
        tps_neighbourhood_sizes, tps_scores_filepaths, tps_pds_filepaths
    ):
        if isfile(tps_scores_filepath) and isfile(tps_pds_filepath):
            continue
        print(
            f"Computing TPS scores using neighbourhood size {tps_neighbourhood_size}..."
        )

        # Load ScaNN instance
        scann_instance = ApproxNN(ann_alg="scann")
        scann_instance.load(ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute TPS
        tps_scores_ns, tps_pds_ns = tps_multiple(
            target_words=data_words,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            ann_instance=scann_instance,
            return_persistence_diagram=True,
            n_jobs=-1,
            progressbar_enabled=True,
        )

        # Save result
        print("Saving TPS result...")
        np.save(tps_scores_filepath, tps_scores_ns)
        np.save(tps_pds_filepath, tps_pds_ns)
        print("Done!")

        # Free resources
        del scann_instance

    # (3) -- Compute GAD --
    gad_dir = join(task_raw_data_dir, "gad")
    makedirs(gad_dir, exist_ok=True)
    gad_params = [
        (25, 250),
        (25, 500),
        (25, 750),
        (25, 1000),
        # ----------
        (50, 250),
        (50, 500),
        (50, 750),
        (50, 1000),
        # ----------
        (100, 1000),
        (100, 1250),
        (100, 1500),
        (100, 1750),
        (100, 2000),
        # ----------
        (150, 1000),
        (150, 1250),
        (150, 1500),
        (150, 1750),
        (150, 2000),
        # ----------
        (200, 1000),
        (200, 1250),
        (200, 1500),
        (200, 1750),
        (200, 2000),
    ]
    gad_categories = {"P_man": 0, "P_int": 1, "P_bnd": 2}
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"
        gad_filepath = join(gad_dir, f"{gad_id}.joblib")
        if isfile(gad_filepath):
            continue
        print(f"-- {gad_id} -- ")

        # Load ScaNN instance
        approx_nn = ApproxNN(ann_alg="scann")
        approx_nn.load(ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute features
        gad_result = compute_gad(
            data_points=last_embedding_weights_normalized,
            data_point_ints=data_words_to_full_vocab_ints,
            manifold_dimension=2,
            data_points_approx_nn=approx_nn,
            use_knn_annulus=True,
            knn_annulus_inner=inner_param,
            knn_annulus_outer=outer_param,
            return_annlus_persistence_diagrams=True,
            progressbar_enabled=True,
            n_jobs=-1,
        )
        print(
            "P_man:",
            len(gad_result["P_man"]),
            "P_int:",
            len(gad_result["P_int"]),
            "P_bnd:",
            len(gad_result["P_bnd"]),
        )
        joblib.dump(gad_result, gad_filepath, protocol=4)

        # Free resources
        del approx_nn

    # (4) -- Estimate the intrinsic dimension (ID) for each word vector --
    words_estimated_ids_dir = join(task_raw_data_dir, "estimated_ids")
    id_estimators: List[Tuple[str, GlobalEstimator, dict]] = [
        ("lpca", est_ids.lPCA, {}),
        ("knn", est_ids.KNN, {}),
        ("twonn", est_ids.TwoNN, {}),
        ("mle", est_ids.MLE, {}),
        ("tle", est_ids.TLE, {}),
    ]
    makedirs(words_estimated_ids_dir, exist_ok=True)
    for id_estimator_name, id_estimator_cls, id_estimator_params in id_estimators:
        for num_neighbours in id_estimation_num_neighbours:
            estimated_ids_filepath = join(
                words_estimated_ids_dir,
                f"{id_estimator_name}_{num_neighbours}.npy",
            )
            if isfile(estimated_ids_filepath):
                continue
            print(
                f"Estimating IDs using {id_estimator_cls.__name__} "
                f"with {num_neighbours} neighbours..."
            )
            id_estimator = id_estimator_cls(**id_estimator_params)
            estimated_ids = id_estimator.fit_predict_pw(
                X=last_embedding_weights_normalized[data_words_to_full_vocab_ints],
                n_neighbors=num_neighbours,
                n_jobs=-1,
            )
            # estimated_ids = estimated_ids_full[data_words_to_full_vocab_ints]
            print("Done! Saving to file...")
            np.save(estimated_ids_filepath, estimated_ids)

    # (5) -- Create features from GAD result to speed up combining of data --
    gad_features_dir = join(task_raw_data_dir, "gad_features")
    makedirs(gad_features_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"
        gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
        if isfile(gad_features_filepath):
            continue
        print(f"Creating GAD features for {gad_id}...")

        # Load GAD result
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # One-hot features from GAD categories (P_man, P_int, P_bnd)
        gad_features = np.zeros((len(data_words_to_full_vocab_ints), 3), dtype=int)
        for i, word_int in enumerate(tqdm(data_words_to_full_vocab_ints)):
            for gad_category, gad_category_idx in gad_categories.items():
                if word_int in gad_result[gad_category]:
                    gad_features[i, gad_category_idx] = 1

        # Save GAD features
        np.save(gad_features_filepath, gad_features)

    # (6) -- Vectorize persistence diagrams from GAD features --
    gad_features_pd_vectorized_dir = join(
        task_raw_data_dir, "gad_features_pd_vectorized"
    )
    gad_features_pd_vectorized_size = 5
    gad_features_pd_vectorized_size_flat = gad_features_pd_vectorized_size**2
    makedirs(gad_features_pd_vectorized_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"
        gad_features_pd_vecs_filepath = join(
            gad_features_pd_vectorized_dir, f"{gad_id}.npy"
        )
        if isfile(gad_features_pd_vecs_filepath):
            continue
        print(f"Vectorizing GAD features for {gad_id}...")

        # Load GAD result
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Use PersistenceImager to vectorize persistence diagrams
        gad_features_pd_vecs = np.zeros(
            (len(data_words_to_full_vocab_ints), gad_features_pd_vectorized_size_flat)
        )
        for i, point_index in enumerate(tqdm(data_words_to_full_vocab_ints)):

            # Get persistence diagram and create a range such that we get a
            # square image from PersistenceImager
            gad_features_pd = gad_result["annulus_pds"][point_index]
            if len(gad_features_pd) == 0:
                gad_features_pd_vecs[i] = np.zeros(
                    gad_features_pd_vectorized_size_flat, dtype=int
                )
                continue
            births, deaths = gad_features_pd.T
            persistence = deaths - births
            square_min = min(births.min(), persistence.min())
            square_max = max(births.max(), persistence.max())
            square_range = (square_min, square_max)
            pixel_size = (square_max - square_min) / gad_features_pd_vectorized_size

            # Vectorize persistence diagram
            pimgr = PersistenceImager(
                birth_range=square_range,
                pers_range=square_range,
                pixel_size=pixel_size,
            )
            pd_vec = pimgr.transform(gad_features_pd)
            gad_features_pd_vecs[i] = pd_vec.flatten()

        # Save persistence image vectors to file
        np.save(gad_features_pd_vecs_filepath, gad_features_pd_vecs)

    # (7) -- Combine data into data (features and labels) for WME task --
    word_meaning_train_data_filepath = join(output_dir, "word_meaning_train_data.csv")
    word_meaning_test_data_filepath = join(output_dir, "word_meaning_test_data.csv")
    word_meaning_semeval_test_data_filepath = join(
        output_dir, "word_meaning_semeval_test_data.csv"
    )
    if (
        not isfile(word_meaning_train_data_filepath)
        or not isfile(word_meaning_test_data_filepath)
        or not isfile(word_meaning_semeval_test_data_filepath)
    ):
        # -- Load data for creating features --
        # Load estimated IDs from file
        words_estimated_ids = {
            f"{id_estimator_name}_{num_neighbours}": np.load(
                join(
                    words_estimated_ids_dir,
                    f"{id_estimator_name}_{num_neighbours}.npy",
                )
            )
            for num_neighbours in id_estimation_num_neighbours
            for id_estimator_name, _, _ in id_estimators
        }
        print("Loaded estimated IDs!")

        # Load GAD features
        gad_features_dict = {}
        for inner_param, outer_param in gad_params:
            gad_id = f"gad_knn_{inner_param}_{outer_param}"
            gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
            gad_features_dict[gad_id] = np.load(gad_features_filepath)
        print("Loaded GAD features!")

        # Load TPS features
        tps_scores = {}
        tps_pds = {}
        for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
            tps_neighbourhood_sizes, tps_scores_filepaths, tps_pds_filepaths
        ):
            tps_scores[tps_neighbourhood_size] = np.load(tps_scores_filepath)
            tps_pds[tps_neighbourhood_size] = np.load(
                tps_pds_filepath, allow_pickle=True
            )
        print("Loaded TPS features!")

        data_words_train, data_words_test = train_test_split(
            data_words_no_semeval, test_size=0.05, random_state=rng_seed
        )
        # Read back any split that already exists on disk so the summary prints
        # below always have a DataFrame (the original only bound each variable
        # when its CSV was freshly created).
        if not isfile(word_meaning_train_data_filepath):
            print("Preparing data for training...")
            train_data_df = create_word_meaning_model_data_features(
                target_words=data_words_train,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            train_data_df.to_csv(word_meaning_train_data_filepath, index=False)
        else:
            train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        if not isfile(word_meaning_test_data_filepath):
            print("Preparing data for testing...")
            test_data_df = create_word_meaning_model_data_features(
                target_words=data_words_test,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            test_data_df.to_csv(word_meaning_test_data_filepath, index=False)
        else:
            test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        if not isfile(word_meaning_semeval_test_data_filepath):
            print("Preparing data for external testing (SemEval)...")
            semeval_test_data_df = create_word_meaning_model_data_features(
                target_words=semeval_target_words_in_vocab,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            semeval_test_data_df.to_csv(
                word_meaning_semeval_test_data_filepath, index=False
            )
        else:
            semeval_test_data_df = pd.read_csv(
                word_meaning_semeval_test_data_filepath
            )
    else:
        train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        semeval_test_data_df = pd.read_csv(word_meaning_semeval_test_data_filepath)
    print("Train", train_data_df)
    print("Test", test_data_df)
    print("SemEval test", semeval_test_data_df)
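
# Hedged invocation sketch of the pipeline above; every path and name below is
# a hypothetical placeholder, not a value from the original project:
if __name__ == "__main__":
    prepare_num_word_meanings_supervised_data(
        model_dir="output/word2vec_model",
        model_name="word2vec",
        dataset_name="enwiki",
        id_estimation_num_neighbours=[25, 50, 100],
        semeval_2010_14_word_senses_filepath="data/semeval_2010_14_word_senses.joblib",
        tps_neighbourhood_sizes=[50, 100],
        raw_data_dir="data/raw",
        output_dir="data/processed",
    )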