Code Example #1
File: tda.py Project: shirtd/tpers
    def to_metric(self, pixel_size=0.1):
        # Collect all birth values and the finite persistences (death - birth)
        # across every diagram in the dataset.
        births = [b for dgms in self.data for dgm in dgms for b, _ in dgm]
        tpers = [d - b for dgms in self.data for dgm in dgms for b, d in dgm if d < np.inf]
        # Size the imager so each range is at least one pixel wide.
        pim = PersistenceImager((min(births), max(max(births), pixel_size)),
                                (min(tpers), max(max(tpers), pixel_size)),
                                pixel_size=pixel_size)
        # Vectorize each diagram (infinite points removed by noinf) and stack
        # one flat feature row per element of self.data.
        return np.vstack([np.stack([pim.transform(noinf(d)) for d in D]).ravel()
                          for D in tqdm(self.data, desc='[ Persim')])
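The snippet assumes numpy as np, tqdm, persim's PersistenceImager, and a helper noinf that is not shown. A minimal sketch of what such a helper could look like (an assumption, not the project's code), dropping points with infinite death so transform only sees finite pairs:

import numpy as np

def noinf(dgm):
    # Hypothetical helper: keep only rows whose birth and death are both finite.
    dgm = np.asarray(dgm, dtype=float).reshape(-1, 2)
    return dgm[np.isfinite(dgm).all(axis=1)]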
Code Example #2
def test_fit_diagram_list():
    persimgr = PersistenceImager(birth_range=(0, 1),
                                 pers_range=(0, 2),
                                 pixel_size=1)
    dgms = [
        np.array([[1, 2], [4, 8], [-1, 5.25]]),
        np.array([[1, 2], [2, 3], [3, 4]])
    ]
    persimgr.fit(dgms)

    np.testing.assert_equal(persimgr.pixel_size, 1)
    np.testing.assert_equal(persimgr._pixel_size, 1)
    np.testing.assert_equal(persimgr.birth_range, (-1, 4))
    np.testing.assert_equal(persimgr._birth_range, (-1, 4))
    np.testing.assert_equal(persimgr.pers_range, (0.625, 6.625))
    np.testing.assert_equal(persimgr._pers_range, (0.625, 6.625))
    np.testing.assert_equal(persimgr.height, 6)
    np.testing.assert_equal(persimgr._height, 6)
    np.testing.assert_equal(persimgr.width, 5)
    np.testing.assert_equal(persimgr._width, 5)
    np.testing.assert_equal(persimgr.resolution, (5, 6))
    np.testing.assert_equal(persimgr._resolution, (5, 6))
    np.testing.assert_array_equal(
        persimgr._ppnts, [0.625, 1.625, 2.625, 3.625, 4.625, 5.625, 6.625])
    np.testing.assert_array_equal(persimgr._bpnts, [-1., 0., 1., 2., 3., 4.])
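The asserted ranges are consistent with a simple rule: fit takes the extreme births and persistences across all diagrams, then pads each range symmetrically until its length is an integer multiple of pixel_size. A quick arithmetic check of the asserted pers_range, assuming that rule:

import numpy as np

# Persistence values (death - birth) across both diagrams span [1, 6.25].
pers_min, pers_max = 1.0, 6.25
pixel_size = 1
span = pers_max - pers_min                        # 5.25
padded = np.ceil(span / pixel_size) * pixel_size  # 6.0
pad = (padded - span) / 2                         # 0.375 per side
print(pers_min - pad, pers_max + pad)             # 0.625 6.625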
Code Example #3
    def test_lists_of_lists(self):
        persimgr = PersistenceImager(birth_range=(0, 3),
                                     pers_range=(0, 3),
                                     pixel_size=1)
        dgm = [[0, 1], [1, 1], [3, 5]]
        img = persimgr.transform(dgm)

        np.testing.assert_equal(img.shape, (3, 3))
Code Example #4
    def test_n_pixels(self):
        persimgr = PersistenceImager(birth_range=(0, 5),
                                     pers_range=(0, 3),
                                     pixel_size=1)
        dgm = np.array([[0, 1], [1, 1], [3, 5]])
        img = persimgr.transform(dgm)

        np.testing.assert_equal(img.shape, (5, 3))

        img = persimgr.fit_transform(dgm)
        np.testing.assert_equal(img.shape, (3, 2))
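transform keeps the ranges given to the constructor (a 5 x 3 grid here), while fit_transform first refits the ranges to the diagram itself, which shrinks the image to 3 x 2. Checking the refit extents by hand:

import numpy as np

dgm = np.array([[0, 1], [1, 1], [3, 5]])
births = dgm[:, 0]
pers = dgm[:, 1] - dgm[:, 0]
print(births.min(), births.max())  # 0 3 -> width 3 at pixel_size=1
print(pers.min(), pers.max())      # 0 2 -> height 2 at pixel_size=1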
Code Example #5
    def test_linear_ramp(self):
        persimgr = PersistenceImager(weight=images_weights.linear_ramp,
                                     weight_params={
                                         'low': 0.0,
                                         'high': 5.0,
                                         'start': 0.0,
                                         'end': 1.0
                                     })

        wf = persimgr.weight
        wf_params = persimgr.weight_params

        np.testing.assert_equal(wf(1, 0, **wf_params), 0)
        np.testing.assert_equal(wf(1, 1 / 5, **wf_params), 1)
        np.testing.assert_equal(wf(1, 1, **wf_params), 5)
        np.testing.assert_equal(wf(1, 2, **wf_params), 5)

        persimgr.weight_params = {
            'low': 0.0,
            'high': 5.0,
            'start': 0.0,
            'end': 5.0
        }
        wf_params = persimgr.weight_params

        np.testing.assert_equal(wf(1, 0, **wf_params), 0)
        np.testing.assert_equal(wf(1, 1 / 5, **wf_params), 1 / 5)
        np.testing.assert_equal(wf(1, 1, **wf_params), 1)
        np.testing.assert_equal(wf(1, 5, **wf_params), 5)

        persimgr.weight_params = {
            'low': 0.0,
            'high': 5.0,
            'start': 1.0,
            'end': 5.0
        }
        wf_params = persimgr.weight_params

        np.testing.assert_equal(wf(1, 0, **wf_params), 0)
        np.testing.assert_equal(wf(1, 1, **wf_params), 0)
        np.testing.assert_equal(wf(1, 5, **wf_params), 5)

        persimgr.weight_params = {
            'low': 1.0,
            'high': 5.0,
            'start': 1.0,
            'end': 5.0
        }
        wf_params = persimgr.weight_params
        np.testing.assert_equal(wf(1, 0, **wf_params), 1)
        np.testing.assert_equal(wf(1, 1, **wf_params), 1)
        np.testing.assert_equal(wf(1, 2, **wf_params), 2)
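Taken together, the assertions pin down the weight function's shape: it depends only on the persistence coordinate, rising linearly from low at pers == start to high at pers == end, clamped to those values outside the interval. A reference sketch consistent with the tests (not persim's implementation):

def linear_ramp_ref(birth, pers, low=0.0, high=1.0, start=0.0, end=1.0):
    # Clamped linear interpolation in the persistence coordinate;
    # the birth coordinate is ignored.
    if pers <= start:
        return low
    if pers >= end:
        return high
    return low + (pers - start) * (high - low) / (end - start)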
Code Example #6
    def test_persistence(self):
        persimgr = PersistenceImager(weight=images_weights.persistence,
                                     weight_params={'n': 1.0})

        wf = persimgr.weight
        wf_params = persimgr.weight_params

        x = np.random.rand()
        np.testing.assert_equal(wf(1, x, **wf_params), x)

        persimgr.weight_params = {'n': 1.5}
        wf_params = persimgr.weight_params

        np.testing.assert_equal(wf(1, x, **wf_params), x**1.5)
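The implied weight here is simply the persistence coordinate raised to the power n; as a sketch consistent with the assertions:

def persistence_ref(birth, pers, n=1.0):
    # Power of the persistence coordinate; the birth coordinate is ignored.
    return pers ** n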
Code Example #7
    def test_multiple_diagrams(self):
        persimgr = PersistenceImager(birth_range=(0, 5),
                                     pers_range=(0, 3),
                                     pixel_size=1)

        dgm1 = np.array([[0, 1], [1, 1], [3, 5]])
        dgm2 = np.array([[0, 1], [1, 1], [3, 6], [1, 1]])
        imgs = persimgr.transform([dgm1, dgm2])

        np.testing.assert_equal(len(imgs), 2)
        np.testing.assert_equal(imgs[0].shape, imgs[1].shape)

        imgs = persimgr.fit_transform([dgm1, dgm2])
        np.testing.assert_equal(len(imgs), 2)
        np.testing.assert_equal(imgs[0].shape, imgs[1].shape)
        np.testing.assert_equal(imgs[0].shape, (3, 3))
Code Example #8
    def test_zero_on_birthaxis(self):
        persimgr = PersistenceImager(weight=images_weights.linear_ramp,
                                     weight_params={
                                         'low': 0.0,
                                         'high': 1.0,
                                         'start': 0.0,
                                         'end': 1.0
                                     })
        wf = persimgr.weight
        wf_params = persimgr.weight_params
        np.testing.assert_equal(wf(1, 0, **wf_params), 0)

        persimgr = PersistenceImager(weight=images_weights.persistence,
                                     weight_params={'n': 2})
        wf = persimgr.weight
        wf_params = persimgr.weight_params
        np.testing.assert_equal(wf(1, 0, **wf_params), 0)
Code Example #9
def test_mixed_pairs():
    """ This test is inspired by gh issue #3 by gh user muszyna25.
    Integer diagrams return nan values.
    This does not work: dgm = [[0, 2], [0, 6], [0, 8]];
    This one works fine: dgm = [[0.0, 2.0], [0.0, 6.0], [0.0, 8.0]];
    """
    persimgr = PersistenceImager()

    dgm = [[0, 2], [0, 6], [0, 8]]
    dgm2 = [[0.0, 2.0], [0.0, 6.0], [0.0, 8.0]]
    dgm3 = [[0.0, 2], [0.0, 6.0], [0, 8.0e0]]

    res = persimgr.transform(dgm)
    res2 = persimgr.transform(dgm2)
    res3 = persimgr.transform(dgm3)

    np.testing.assert_array_equal(res, res2)
    np.testing.assert_array_equal(res, res3)
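For callers stuck on a persim version that still exhibits the integer-diagram issue, a defensive cast to float is a reasonable workaround (a sketch, not part of the test suite):

import numpy as np
from persim import PersistenceImager

dgm = [[0, 2], [0, 6], [0, 8]]
# Cast to float64 before transforming so integer input never reaches the imager.
res = PersistenceImager().transform(np.asarray(dgm, dtype=np.float64))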
Code Example #10
def test_empty_diagram_list():
    dgms1 = [np.array([[2, 3]]), np.zeros((0, 2))]
    persimgr1 = PersistenceImager(pixel_size=0.1)
    res1 = persimgr1.transform(dgms1)
    np.testing.assert_array_equal(res1[1], np.zeros((10, 10)))

    dgms2 = [np.zeros((0, 2)), np.array([[2, 3]])]
    persimgr2 = PersistenceImager(pixel_size=0.1)
    res2 = persimgr2.transform(dgms2)
    np.testing.assert_array_equal(res2[0], np.zeros((10, 10)))

    dgms3 = [np.zeros((0, 2)), np.zeros((0, 2))]
    persimgr3 = PersistenceImager(pixel_size=0.1)
    res3 = persimgr3.transform(dgms3)
    np.testing.assert_array_equal(res3[0], np.zeros((10, 10)))
    np.testing.assert_array_equal(res3[1], np.zeros((10, 10)))
Code Example #11
def test_pixel_size_setter():
    persimgr = PersistenceImager(birth_range=(0, 1),
                                 pers_range=(0, 2),
                                 pixel_size=1)
    persimgr.pixel_size = .75

    np.testing.assert_equal(persimgr.pixel_size, .75)
    np.testing.assert_equal(persimgr._pixel_size, .75)
    np.testing.assert_equal(persimgr.birth_range, (-0.25, 1.25))
    np.testing.assert_equal(persimgr._birth_range, (-0.25, 1.25))
    np.testing.assert_equal(persimgr.pers_range, (-0.125, 2.125))
    np.testing.assert_equal(persimgr._pers_range, (-0.125, 2.125))
    np.testing.assert_equal(persimgr.height, 2.25)
    np.testing.assert_equal(persimgr._height, 2.25)
    np.testing.assert_equal(persimgr.width, 1.5)
    np.testing.assert_equal(persimgr._width, 1.5)
    np.testing.assert_equal(persimgr.resolution, (2, 3))
    np.testing.assert_equal(persimgr._resolution, (2, 3))
    np.testing.assert_array_equal(persimgr._ppnts,
                                  [-0.125, 0.625, 1.375, 2.125])
    np.testing.assert_array_equal(persimgr._bpnts, [-0.25, 0.5, 1.25])
Code Example #12
def test_pers_range_setter():
    persimgr = PersistenceImager(birth_range=(0, 1),
                                 pers_range=(0, 2),
                                 pixel_size=1)
    persimgr.pers_range = (-1.5, 4.5)

    np.testing.assert_equal(persimgr.pixel_size, 1)
    np.testing.assert_equal(persimgr._pixel_size, 1)
    np.testing.assert_equal(persimgr.pers_range, (-1.5, 4.5))
    np.testing.assert_equal(persimgr._pers_range, (-1.5, 4.5))
    np.testing.assert_equal(persimgr.birth_range, (0, 1))
    np.testing.assert_equal(persimgr._birth_range, (0, 1))
    np.testing.assert_equal(persimgr.width, 1)
    np.testing.assert_equal(persimgr._width, 1)
    np.testing.assert_equal(persimgr.height, 6)
    np.testing.assert_equal(persimgr._height, 6)
    np.testing.assert_equal(persimgr.resolution, (1, 6))
    np.testing.assert_equal(persimgr._resolution, (1, 6))
    np.testing.assert_array_equal(persimgr._ppnts,
                                  [-1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5])
    np.testing.assert_array_equal(persimgr._bpnts, [0., 1.])
Code Example #13
def test_birth_range_setter():
    persimgr = PersistenceImager(birth_range=(0, 1),
                                 pers_range=(0, 2),
                                 pixel_size=1)
    persimgr.birth_range = (0.0, 4.5)

    np.testing.assert_equal(persimgr.pixel_size, 1)
    np.testing.assert_equal(persimgr._pixel_size, 1)
    np.testing.assert_equal(persimgr.pers_range, (0, 2))
    np.testing.assert_equal(persimgr._pers_range, (0, 2))
    np.testing.assert_equal(persimgr.birth_range, (-.25, 4.75))
    np.testing.assert_equal(persimgr._birth_range, (-.25, 4.75))
    np.testing.assert_equal(persimgr.width, 5)
    np.testing.assert_equal(persimgr._width, 5)
    np.testing.assert_equal(persimgr.height, 2)
    np.testing.assert_equal(persimgr._height, 2)
    np.testing.assert_equal(persimgr.resolution, (5, 2))
    np.testing.assert_equal(persimgr._resolution, (5, 2))
    np.testing.assert_array_equal(persimgr._bpnts,
                                  [-0.25, 0.75, 1.75, 2.75, 3.75, 4.75])
    np.testing.assert_array_equal(persimgr._ppnts, [0., 1., 2.])
Code Example #14
def construct_imager(param_dict):
    # Build a PersistenceImager from a dict of keyword arguments and return it.
    pimgr = PersistenceImager(**param_dict)
    return pimgr
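A usage sketch (the parameter values are illustrative):

pimgr = construct_imager({
    'birth_range': (0, 1),
    'pers_range': (0, 2),
    'pixel_size': 0.5,
})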
Code Example #15
def test_empty_diagram():
    dgm = np.zeros((0, 2))
    persimgr = PersistenceImager(pixel_size=0.1)
    res = persimgr.transform(dgm)
    np.testing.assert_array_equal(res, np.zeros((10, 10)))
Code Example #16
def prepare_num_word_meanings_supervised_data(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    id_estimation_num_neighbours: list,
    semeval_2010_14_word_senses_filepath: str,
    tps_neighbourhood_sizes: list,
    raw_data_dir: str,
    output_dir: str,
) -> None:
    """
    Prepares data for the supervised word meanings prediction task.

    Parameters
    ----------
    model_dir : str
        Directory of the model to load.
    model_name : str
        Name of the trained word2vec model.
    dataset_name : str
        Name of the dataset the model is trained on.
    id_estimation_num_neighbours : list
        Number of neighbours to use when estimating the intrinsic dimension for each word.
    semeval_2010_14_word_senses_filepath : str
        Filepath of SemEval-2010 task 14 word senses joblib dict.
    tps_neighbourhood_sizes : list
        List of TPS neighbourhood sizes.
    raw_data_dir : str
        Directory where raw data will be saved.
    output_dir : str
        Output directory.
    """
    # Convert list arguments to int
    tps_neighbourhood_sizes = [
        int(n_size) for n_size in tps_neighbourhood_sizes
    ]
    id_estimation_num_neighbours = [
        int(num_neighbours) for num_neighbours in id_estimation_num_neighbours
    ]

    # Prepare directory constants and create raw data dir for caching data files
    task_id = f"wme_{model_name}_{dataset_name}"  # wme = word meaning estimation
    task_raw_data_dir = join(raw_data_dir, task_id)
    task_raw_data_tps_dir = join(task_raw_data_dir, "tps")
    makedirs(task_raw_data_dir, exist_ok=True)

    # Load word embeddings from model
    print("Loading word embeddings...")
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
        return_normalized_embeddings=True,
        return_scann_instance_filepath=True,
    )
    last_embedding_weights_normalized = w2v_training_output[
        "last_embedding_weights_normalized"]
    last_embedding_weights_scann_instance_filepath = w2v_training_output[
        "last_embedding_weights_scann_instance_filepath"]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]
    print("Done!")

    # Prepare SemEval-2010 task 14 data
    semeval_2010_14_word_senses = joblib.load(
        semeval_2010_14_word_senses_filepath)
    semeval_target_words = np.array(
        list(semeval_2010_14_word_senses["all"].keys()))
    semeval_target_words_in_vocab_filter = [
        i for i, word in enumerate(semeval_target_words) if word in word_to_int
    ]
    semeval_target_words_in_vocab = semeval_target_words[
        semeval_target_words_in_vocab_filter]
    semeval_gs_clusters = np.array(
        list(semeval_2010_14_word_senses["all"].values()))
    semeval_gs_clusters_in_vocab = semeval_gs_clusters[
        semeval_target_words_in_vocab_filter]
    semeval_2010_14_word_senses_in_vocab = {
        word: gs_meanings
        for word, gs_meanings in zip(semeval_target_words_in_vocab,
                                     semeval_gs_clusters_in_vocab)
    }

    # (1) -- Find words in WordNet that are in the word2vec model's vocabulary --
    words_to_num_meanings_filepath = join(task_raw_data_dir,
                                          "words_to_num_meanings.joblib")
    if not isfile(words_to_num_meanings_filepath):
        words_to_num_meanings = semeval_2010_14_word_senses_in_vocab.copy()
        print("Finding words in vocabulary with #Wordnet synsets > 0")
        for word in tqdm(words):
            if word in semeval_target_words_in_vocab:
                continue
            num_synsets = len(wn.synsets(word))
            if num_synsets > 0:
                words_to_num_meanings[word] = num_synsets
        joblib.dump(words_to_num_meanings, words_to_num_meanings_filepath)
    else:
        words_to_num_meanings = joblib.load(words_to_num_meanings_filepath)
        print("Loaded words_to_num_meanings!")
    data_words = np.array(list(words_to_num_meanings.keys()))
    data_words_no_semeval = [
        word for word in data_words
        if word not in semeval_target_words_in_vocab
    ]
    data_word_to_int = {word: i for i, word in enumerate(data_words)}

    # Map the WordNet words (data_words) to their indices in the full vocabulary
    data_words_to_full_vocab_ints = np.array(
        [word_to_int[word] for word in data_words])

    # (2) -- Compute TPS_n for train/test words --
    makedirs(task_raw_data_tps_dir, exist_ok=True)
    tps_scores_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_scores.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    tps_pds_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_pds.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
            tps_neighbourhood_sizes, tps_scores_filepaths, tps_pds_filepaths):
        if isfile(tps_scores_filepath) and isfile(tps_pds_filepath):
            continue
        print(
            f"Computing TPS scores using neighbourhood size {tps_neighbourhood_size}..."
        )

        # Load ScaNN instance
        scann_instance = ApproxNN(ann_alg="scann")
        scann_instance.load(
            ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute TPS
        tps_scores_ns, tps_pds_ns = tps_multiple(
            target_words=data_words,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            ann_instance=scann_instance,
            return_persistence_diagram=True,
            n_jobs=-1,
            progressbar_enabled=True,
        )

        # Save result
        print("Saving TPS result...")
        np.save(tps_scores_filepath, tps_scores_ns)
        np.save(tps_pds_filepath, tps_pds_ns)
        print("Done!")

        # Free resources
        del scann_instance

    # (3) -- Compute GAD --
    gad_dir = join(task_raw_data_dir, "gad")
    makedirs(gad_dir, exist_ok=True)
    gad_params = [
        (25, 250),
        (25, 500),
        (25, 750),
        (25, 1000),
        # ----------
        (50, 250),
        (50, 500),
        (50, 750),
        (50, 1000),
        # ----------
        (100, 1000),
        (100, 1250),
        (100, 1500),
        (100, 1750),
        (100, 2000),
        # ----------
        (150, 1000),
        (150, 1250),
        (150, 1500),
        (150, 1750),
        (150, 2000),
        # ----------
        (200, 1000),
        (200, 1250),
        (200, 1500),
        (200, 1750),
        (200, 2000),
    ]
    gad_categories = {"P_man": 0, "P_int": 1, "P_bnd": 2}
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"

        gad_filepath = join(gad_dir, f"{gad_id}.joblib")
        if isfile(gad_filepath):
            continue
        print(f"-- {gad_id} -- ")

        # Load ScaNN instance
        approx_nn = ApproxNN(ann_alg="scann")
        approx_nn.load(ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute features
        gad_result = compute_gad(
            data_points=last_embedding_weights_normalized,
            data_point_ints=data_words_to_full_vocab_ints,
            manifold_dimension=2,
            data_points_approx_nn=approx_nn,
            use_knn_annulus=True,
            knn_annulus_inner=inner_param,
            knn_annulus_outer=outer_param,
            return_annlus_persistence_diagrams=True,
            progressbar_enabled=True,
            n_jobs=-1,
        )
        print(
            "P_man:",
            len(gad_result["P_man"]),
            "P_int:",
            len(gad_result["P_int"]),
            "P_bnd:",
            len(gad_result["P_bnd"]),
        )
        joblib.dump(gad_result, gad_filepath, protocol=4)

        # Free resources
        del approx_nn

    # (4) -- Estimate the intrinsic dimension (ID) for each word vector --
    words_estimated_ids_dir = join(task_raw_data_dir, "estimated_ids")
    id_estimators: List[Tuple[str, GlobalEstimator, dict]] = [
        ("lpca", est_ids.lPCA, {}),
        ("knn", est_ids.KNN, {}),
        ("twonn", est_ids.TwoNN, {}),
        ("mle", est_ids.MLE, {}),
        ("tle", est_ids.TLE, {}),
    ]
    makedirs(words_estimated_ids_dir, exist_ok=True)
    for id_estimator_name, id_estimator_cls, id_estimator_params in id_estimators:
        for num_neighbours in id_estimation_num_neighbours:
            estimated_ids_filepath = join(
                words_estimated_ids_dir,
                f"{id_estimator_name}_{num_neighbours}.npy")
            if isfile(estimated_ids_filepath):
                continue

            print(
                f"Estimating IDs using {id_estimator_cls.__name__} with {num_neighbours} neighbours..."
            )
            id_estimator = id_estimator_cls(**id_estimator_params)
            estimated_ids = id_estimator.fit_predict_pw(
                X=last_embedding_weights_normalized[
                    data_words_to_full_vocab_ints],
                n_neighbors=num_neighbours,
                n_jobs=-1,
            )
            # estimated_ids = estimated_ids_full[data_words_to_full_vocab_ints]

            print("Done! Saving to file...")
            np.save(estimated_ids_filepath, estimated_ids)

    # (5) -- Create features from GAD result to speed up combining of data --
    gad_features_dir = join(task_raw_data_dir, "gad_features")
    makedirs(gad_features_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"

        gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
        if isfile(gad_features_filepath):
            continue
        print(f"Creating GAD features for {gad_id}...")

        # Load GAD result
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Features from GAD (P_man, P_int, P_bnd)
        gad_features = np.zeros((len(data_words_to_full_vocab_ints), 3),
                                dtype=int)
        for i, word_int in enumerate(tqdm(data_words_to_full_vocab_ints)):
            for gad_category, gad_category_idx in gad_categories.items():
                if word_int in gad_result[gad_category]:
                    gad_features[i, gad_category_idx] = 1

        # Save GAD features
        np.save(gad_features_filepath, gad_features)

    # (6) -- Vectorize persistence diagrams from GAD features --
    gad_features_pd_vectorized_dir = join(task_raw_data_dir,
                                          "gad_features_pd_vectorized")
    gad_features_pd_vectorized_size = 5
    gad_features_pd_vectorized_size_flat = gad_features_pd_vectorized_size**2
    makedirs(gad_features_pd_vectorized_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"
        gad_features_pd_vecs_filepath = join(gad_features_pd_vectorized_dir,
                                             f"{gad_id}.npy")
        if isfile(gad_features_pd_vecs_filepath):
            continue
        print(f"Vectorizing GAD features for {gad_id}...")

        # Load GAD features
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Use PersistenceImage to vectorize persistence diagrams
        gad_features_pd_vecs = np.zeros((len(data_words_to_full_vocab_ints),
                                         gad_features_pd_vectorized_size_flat))
        for i, point_index in enumerate(tqdm(data_words_to_full_vocab_ints)):

            # Get persistence diagram and create a range such that we get a square image from PersistenceImager
            gad_features_pd = gad_result["annulus_pds"][point_index]
            if len(gad_features_pd) == 0:
                gad_features_pd_vecs[i] = np.zeros(
                    gad_features_pd_vectorized_size_flat, dtype=int)
                continue

            births, deaths = gad_features_pd.T
            persistence = deaths - births
            square_min = min(births.min(), persistence.min())
            square_max = max(births.max(), persistence.max())
            square_range = (square_min, square_max)
            pixel_size = (square_max -
                          square_min) / gad_features_pd_vectorized_size

            # Vectorize persistence diagram
            pimgr = PersistenceImager(birth_range=square_range,
                                      pers_range=square_range,
                                      pixel_size=pixel_size)
            pd_vec = pimgr.transform(gad_features_pd)
            gad_features_pd_vecs[i] = pd_vec.flatten()

        # Save persistence image vectors to file
        np.save(gad_features_pd_vecs_filepath, gad_features_pd_vecs)

    # (7) -- Combine data into data (features and labels) for WME task --
    word_meaning_train_data_filepath = join(output_dir,
                                            "word_meaning_train_data.csv")
    word_meaning_test_data_filepath = join(output_dir,
                                           "word_meaning_test_data.csv")
    word_meaning_semeval_test_data_filepath = join(
        output_dir, "word_meaning_semeval_test_data.csv")
    if (not isfile(word_meaning_train_data_filepath)
            or not isfile(word_meaning_test_data_filepath)
            or not isfile(word_meaning_semeval_test_data_filepath)):
        # -- Load data for creating features --
        # Load estimated IDs from file
        words_estimated_ids = {
            f"{id_estimator_name}_{num_neighbours}": np.load(
                join(words_estimated_ids_dir,
                     f"{id_estimator_name}_{num_neighbours}.npy"))
            for num_neighbours in id_estimation_num_neighbours
            for id_estimator_name, _, _ in id_estimators
        }
        print("Loaded estimated IDs!")

        # Load GAD features
        gad_features_dict = {}
        for inner_param, outer_param in gad_params:
            gad_id = f"gad_knn_{inner_param}_{outer_param}"

            # Load GAD features
            gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
            gad_features_dict[gad_id] = np.load(gad_features_filepath)
        print("Loaded GAD features!")

        # Load TPS features
        tps_scores = {}
        tps_pds = {}
        for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
                tps_neighbourhood_sizes, tps_scores_filepaths,
                tps_pds_filepaths):
            tps_scores[tps_neighbourhood_size] = np.load(tps_scores_filepath)
            tps_pds[tps_neighbourhood_size] = np.load(tps_pds_filepath,
                                                      allow_pickle=True)
        print("Loaded TPS features!")

        data_words_train, data_words_test = train_test_split(
            data_words_no_semeval, test_size=0.05, random_state=rng_seed)
        if not isfile(word_meaning_train_data_filepath):
            print("Preparing data for training...")
            train_data_df = create_word_meaning_model_data_features(
                target_words=data_words_train,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            train_data_df.to_csv(word_meaning_train_data_filepath, index=False)
        if not isfile(word_meaning_test_data_filepath):
            print("Preparing data for testing...")
            test_data_df = create_word_meaning_model_data_features(
                target_words=data_words_test,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            test_data_df.to_csv(word_meaning_test_data_filepath, index=False)
        if not isfile(word_meaning_semeval_test_data_filepath):
            print("Preparing data for external testing (SemEval)...")
            semeval_test_data_df = create_word_meaning_model_data_features(
                target_words=semeval_target_words_in_vocab,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            semeval_test_data_df.to_csv(
                word_meaning_semeval_test_data_filepath, index=False)
    else:
        train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        semeval_test_data_df = pd.read_csv(
            word_meaning_semeval_test_data_filepath)
    print("Train", train_data_df)
    print("Test", test_data_df)
    print("SemEval test", semeval_test_data_df)