예제 #1
0
def load_model_training_output(
    model_training_output_dir: str,
    model_name: str,
    dataset_name: str,
    word_embeddings_mmap_mode: str = "r",
    return_normalized_embeddings: bool = False,
    return_annoy_instance: bool = False,
    annoy_instance_prefault: bool = False,
    return_scann_instance: bool = False,
    return_scann_instance_filepath: bool = False,
) -> dict:
    """
    Collects the artifacts of a word2vec training run into a single dict.

    The checkpoint filepaths are looked up with `get_model_checkpoint_filepaths`
    and the *last* entry of every per-checkpoint list is the one that is loaded.

    Parameters
    ----------
    model_training_output_dir : str
        Directory containing the output of the word2vec training.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model was trained on.
    word_embeddings_mmap_mode : str, optional
        `mmap_mode` passed to `np.load` for the last embedding weights
        (defaults to "r"). The normalized weights are always opened with "r".
    return_normalized_embeddings : bool, optional
        Load the normalized version of the last embedding weights, if a
        filepath for them is present (defaults to False).
    return_annoy_instance : bool, optional
        Load the Annoy index of the last embedding weights, if a filepath
        for it is present (defaults to False).
    annoy_instance_prefault : bool, optional
        Value of the `prefault` option used when loading the Annoy index.
        Only has an effect when `return_annoy_instance` is True
        (defaults to False).
    return_scann_instance : bool, optional
        Load the ScaNN instance of the last embedding weights, if its
        artifact directory is present (defaults to False).
    return_scann_instance_filepath : bool, optional
        Return the path of the ScaNN artifacts of the last embedding
        weights, if present (defaults to False).

    Returns
    -------
    model_training_output : dict
        Dictionary with the keys "last_embedding_weights",
        "last_embedding_weights_filepath",
        "last_embedding_weights_normalized",
        "last_embedding_weights_annoy_instance",
        "last_embedding_weights_scann_instance",
        "last_embedding_weights_scann_instance_filepath", "words",
        "word_to_int" and "word_counts". Optional entries that were not
        requested (or are not available) are None.
    """
    normalized_key = "intermediate_embedding_weight_normalized_filepaths"
    annoy_key = "intermediate_embedding_weight_annoy_index_filepaths"
    scann_key = "intermediate_embedding_weight_scann_artifact_dirs"

    # Resolve where the training run stored its files
    filepaths = get_model_checkpoint_filepaths(
        output_dir=model_training_output_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )

    # Embedding weights of the final checkpoint
    weights_filepath = filepaths["intermediate_embedding_weight_filepaths"][-1]
    weights = np.load(weights_filepath, mmap_mode=word_embeddings_mmap_mode)

    # Word counts: one integer per line
    with open(filepaths["train_word_counts_filepath"], "r") as counts_file:
        count_lines = counts_file.read().split("\n")
    word_counts = np.array([int(line) for line in count_lines])

    # Vocabulary: one word per line, plus the reverse (word -> row) lookup
    with open(filepaths["train_words_filepath"], "r") as vocab_file:
        words = np.array(vocab_file.read().split("\n"))
    word_to_int = dict(zip(words, range(len(words))))

    # Optional: normalized embedding weights
    weights_normalized = None
    if return_normalized_embeddings and normalized_key in filepaths:
        weights_normalized = np.load(filepaths[normalized_key][-1],
                                     mmap_mode="r")

    # Optional: Annoy index (needs the embedding dimensionality to load)
    annoy_instance = None
    if return_annoy_instance and annoy_key in filepaths:
        annoy_instance = ApproxNN(ann_alg="annoy")
        annoy_instance.load(
            ann_path=filepaths[annoy_key][-1],
            annoy_data_dimensionality=weights.shape[1],
            annoy_mertic="euclidean",
            annoy_prefault=annoy_instance_prefault,
        )

    # Optional: ScaNN instance and/or the directory holding its artifacts
    scann_instance = None
    scann_filepath = None
    if scann_key in filepaths:
        scann_artifact_dir = filepaths[scann_key][-1]
        if return_scann_instance:
            scann_instance = ApproxNN(ann_alg="scann")
            scann_instance.load(ann_path=scann_artifact_dir)
        if return_scann_instance_filepath:
            scann_filepath = scann_artifact_dir

    model_training_output = {
        "last_embedding_weights": weights,
        "last_embedding_weights_filepath": weights_filepath,
        "last_embedding_weights_normalized": weights_normalized,
        "last_embedding_weights_annoy_instance": annoy_instance,
        "last_embedding_weights_scann_instance": scann_instance,
        "last_embedding_weights_scann_instance_filepath": scann_filepath,
        "words": words,
        "word_to_int": word_to_int,
        "word_counts": word_counts,
    }
    return model_training_output
def topological_polysemy_pipeline(
    semeval_word_senses_filepath: str,
    word2vec_semeval_model_dir: str,
    word2vec_enwiki_model_dir: str,
    word2vec_google_news_model_dir: str,
    glove_model_dir: str,
    fasttext_model_dir: str,
    fasttext_tps_model_dir: str,
    tps_neighbourhood_sizes: str,
    num_top_k_words_frequencies: int,
    cyclo_octane_data_filepath: str,
    henneberg_data_filepath: str,
    custom_point_cloud_neighbourhood_size: int,
    output_dir: str,
) -> None:
    """
    Computes the topological polysemy of various word embeddings and data sets.
    Saves results in output dir with some additional plots.

    Parameters
    ----------
    semeval_word_senses_filepath : str
        Filepath of the SemEval-2010 task 14 word senses
    word2vec_semeval_model_dir : str
        Directory of the SemEval-2010 task 14 word2vec model.
    word2vec_enwiki_model_dir : str
        Directory of the enwiki word2vec model.
    word2vec_google_news_model_dir : str
        Directory of the Google News 3M word2vec model
    glove_model_dir : str
        Directory of the GloVe model.
    fasttext_model_dir : str
        Directory of the fastText model.
    fasttext_tps_model_dir : str
        Directory of the TPS fastText model.
    tps_neighbourhood_sizes : str or iterable
        Neighbourhood sizes to use when computing TPS. Either a single
        string with comma and/or whitespace separated sizes (e.g. "50, 60")
        or an iterable whose elements can be converted with `int`
        (e.g. ["50", "60"]).
    num_top_k_words_frequencies : int
        Number of top words to use when computing TPS scores vs. word frequencies.
    cyclo_octane_data_filepath : str
        Filepath of the cyclo-octane dataset.
    henneberg_data_filepath : str
        Filepath of the Henneberg dataset.
    custom_point_cloud_neighbourhood_size : int
        Neighbourhood size to use when computing TPS for custom point clouds.
    output_dir : str
        Output directory to save results.

    Raises
    ------
    ValueError
        If a neighbourhood size cannot be converted to an integer.
    """
    # Ensure output directory exists
    makedirs(output_dir, exist_ok=True)

    # Load SemEval-2010 task 14 word senses
    semeval_word_senses: dict = joblib.load(semeval_word_senses_filepath)
    semeval_target_words = np.array(list(semeval_word_senses["all"].keys()))
    semeval_target_word_gs_clusters = np.array(
        list(semeval_word_senses["all"].values())
    )

    # Parse neighbourhood sizes into ints. A plain string has to be split into
    # its items first; iterating it directly would yield single characters
    # ("50" -> 5, 0) instead of the intended sizes.
    if isinstance(tps_neighbourhood_sizes, str):
        tps_neighbourhood_sizes = tps_neighbourhood_sizes.replace(",", " ").split()
    tps_neighbourhood_sizes = [int(n_size) for n_size in tps_neighbourhood_sizes]

    # -- Compute TPS for word embeddings (SemEval and enwiki) --
    for dataset_name, model_dir in zip(
        ["semeval_2010_task_14", "enwiki"],
        [word2vec_semeval_model_dir, word2vec_enwiki_model_dir],
    ):
        # Load word embeddings
        print(f"Loading {dataset_name} word embeddings...")
        w2v_training_output = load_model_training_output(
            model_training_output_dir=model_dir,
            model_name="word2vec",
            dataset_name=dataset_name,
            return_normalized_embeddings=True,
            return_scann_instance=True,
        )
        last_embedding_weights_normalized = w2v_training_output[
            "last_embedding_weights_normalized"
        ]
        last_embedding_weights_scann_instance = w2v_training_output[
            "last_embedding_weights_scann_instance"
        ]
        words = w2v_training_output["words"]
        word_to_int = w2v_training_output["word_to_int"]
        word_counts = w2v_training_output["word_counts"]

        # The dict also references the ScaNN instance; drop it so that the
        # `del` after the TPS computation really removes the last reference.
        del w2v_training_output
        print("Done!")

        print("Computing TPS for word embeddings...")
        tps_word_embeddings(
            word_embeddings_name=dataset_name,
            neighbourhood_sizes=tps_neighbourhood_sizes,
            semeval_target_words=semeval_target_words,
            semeval_target_words_gs_clusters=semeval_target_word_gs_clusters,
            word_embeddings_normalized=last_embedding_weights_normalized,
            word_to_int=word_to_int,
            word_vocabulary=words,
            num_top_k_words_frequencies=num_top_k_words_frequencies,
            output_dir=output_dir,
            word_counts=word_counts,
            ann_instance=last_embedding_weights_scann_instance,
        )
        # Free the ScaNN instance before the next embeddings are loaded
        del last_embedding_weights_scann_instance
        print("Done!")

    # -- Compute TPS for external word embeddings --
    # Prepare constants: (result name, file name prefix, model directory)
    external_word_embeddings = [
        (
            "google_news_3m",
            "GoogleNews-vectors-negative300",
            word2vec_google_news_model_dir,
        ),
        (
            "glove_cc_840b_300d",
            "glove.840B.300d",
            glove_model_dir,
        ),
        (
            "fasttext_cc_300d",
            "cc.en.300.vec",
            fasttext_model_dir,
        ),
        (
            "fasttext_tps_300d",
            "fastText.TPS.300d",
            fasttext_tps_model_dir,
        ),
    ]

    # Compute TPS for each external word embeddings
    for word_embeddings_name, model_name, model_dir in external_word_embeddings:

        # Prepare filepaths
        model_normalized_weights_filepath = join(
            model_dir, f"{model_name}_normalized.npy"
        )
        model_words_filepath = join(model_dir, f"{model_name}_words.txt")
        model_scann_artifacts_dir = join(model_dir, f"{model_name}_scann_artifacts")

        # Load data
        print(f"Loading {model_name} data...")
        model_weights_normalized = np.load(
            model_normalized_weights_filepath, mmap_mode="r"
        )
        with open(model_words_filepath, "r") as words_file:
            model_words = np.array(words_file.read().split("\n"))
        model_approx_nn = ApproxNN(ann_alg="scann")
        model_approx_nn.load(ann_path=model_scann_artifacts_dir)
        print("Done!")

        # No word counts are passed for the external embeddings
        print(f"Computing TPS for {model_name} word embeddings...")
        tps_word_embeddings(
            word_embeddings_name=word_embeddings_name,
            neighbourhood_sizes=tps_neighbourhood_sizes,
            semeval_target_words=semeval_target_words,
            semeval_target_words_gs_clusters=semeval_target_word_gs_clusters,
            word_embeddings_normalized=model_weights_normalized,
            word_to_int={word: i for i, word in enumerate(model_words)},
            word_vocabulary=model_words,
            num_top_k_words_frequencies=num_top_k_words_frequencies,
            output_dir=output_dir,
            ann_instance=model_approx_nn,
        )
        del model_approx_nn
        print("Done!")

    # -- Compute TPS for custom point clouds --
    for point_cloud_name, point_cloud_filepath in zip(
        ["cyclo_octane", "henneberg"],
        [cyclo_octane_data_filepath, henneberg_data_filepath],
    ):
        # Load and prepare data for TPS: one point per CSV row (no header),
        # each row scaled to unit Euclidean norm.
        # NOTE(review): a row of all zeros would produce NaNs here.
        point_cloud = pd.read_csv(point_cloud_filepath, header=None).values
        point_cloud_normalized = point_cloud / np.linalg.norm(
            point_cloud, axis=1
        ).reshape(-1, 1)

        # Pairwise distances are computed on the unnormalized points
        point_cloud_pairwise_dists = euclidean_distances(point_cloud)

        # Compute TPS scores
        num_points = len(point_cloud)
        tps_scores = np.zeros(num_points)
        print(f"Computing TPS scores for {point_cloud_name}...")
        for point_index in tqdm(range(num_points)):
            tps_score = tps_point_cloud(
                point_index=point_index,
                neighbourhood_size=custom_point_cloud_neighbourhood_size,
                point_cloud_normalized=point_cloud_normalized,
                point_cloud_pairwise_dists=point_cloud_pairwise_dists,
            )
            tps_scores[point_index] = tps_score

        # Save result
        point_cloud_output_dir = join(output_dir, point_cloud_name)
        makedirs(point_cloud_output_dir, exist_ok=True)
        np.save(
            join(
                point_cloud_output_dir,
                f"tps_scores_{custom_point_cloud_neighbourhood_size}.npy",
            ),
            tps_scores,
        )
def evaluate_word2vec(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    sswr_dataset_filepath: str,
    msr_dataset_filepath: str,
    pad_dataset_filepath: str,
    vocab_size: int,
    approx_nn_path: str,
    approx_nn_alg: str,
    top_n_prediction: int,
    output_dir: str,
) -> None:
    """
    Evaluates a word2vec model on the SSWR, MSR and PAD test analogy datasets.

    The accuracies of every dataset are saved to a sub-directory of
    `output_dir` named after the current date/time and printed to stdout.

    Parameters
    ----------
    model_dir : str
        Directory of the model to evaluate.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    sswr_dataset_filepath : str
        Filepath of the SSWR test dataset.
    msr_dataset_filepath : str
        Filepath of the MSR test dataset.
    pad_dataset_filepath : str
        Filepath of the PAD test dataset.
    vocab_size : int
        Vocabulary size to use when evaluating on the test datasets.
    approx_nn_path : str
        Filepath of an ApproxNN instance, built on the word embeddings.
        An empty string disables the use of an ApproxNN instance.
    approx_nn_alg : str
        Algorithm of ApproxNN instance.
    top_n_prediction : int
        Which top-N prediction we would like to do.
    output_dir : str
        Output directory to save evaluation results.
    """
    # Fetch the trained embeddings together with the vocabulary lookups
    training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )
    embedding_weights = training_output["last_embedding_weights"]
    vocabulary = training_output["words"]
    word_to_int = training_output["word_to_int"]

    # Each run writes into its own timestamped sub-directory
    run_timestamp = datetime.now().strftime("%d-%b-%Y_%H-%M-%S")
    output_dir = join(output_dir, run_timestamp)
    makedirs(output_dir, exist_ok=True)

    # Optionally load an approximate nearest neighbour index
    approx_nn = None
    if approx_nn_path != "":
        approx_nn = ApproxNN(ann_alg=approx_nn_alg)
        if approx_nn_alg == "annoy":
            # Annoy needs extra arguments, among them the embedding dimensionality
            load_args = {
                "annoy_data_dimensionality": embedding_weights.shape[1],
                "annoy_mertic": "euclidean",
                "annoy_prefault": True,
            }
        else:
            load_args = {}
        approx_nn.load(approx_nn_path, **load_args)

    def run_analogy_evaluation(analogies_filepath: str):
        # All three datasets are evaluated with identical settings
        return evaluate_model_word_analogies(
            analogies_filepath=analogies_filepath,
            word_embeddings=embedding_weights,
            word_to_int=word_to_int,
            words=vocabulary,
            vocab_size=vocab_size,
            ann_instance=approx_nn,
            top_n=top_n_prediction,
        )

    # SSWR
    print("--- Evaluating SSWR ---")
    sswr_accuracies = run_analogy_evaluation(sswr_dataset_filepath)

    # Average accuracies: the first five categories are treated as semantic,
    # the remaining ones except the very last entry as syntactic.
    # NOTE(review): presumably the last entry is an overall average - confirm
    # against evaluate_model_word_analogies.
    category_names = list(sswr_accuracies.keys())
    semantic_scores = [sswr_accuracies[name] for name in category_names[:5]]
    syntactic_scores = [sswr_accuracies[name] for name in category_names[5:-1]]
    semantic_mean = np.mean(semantic_scores)
    syntactic_mean = np.mean(syntactic_scores)
    sswr_accuracies["semantic_avg"] = semantic_mean
    sswr_accuracies["syntactic_avg"] = syntactic_mean
    save_analogies_accuracies_to_file("sswr", output_dir, sswr_accuracies)
    print(sswr_accuracies)

    # MSR
    print("--- Evaluating MSR ---")
    msr_accuracies = run_analogy_evaluation(msr_dataset_filepath)
    save_analogies_accuracies_to_file("msr", output_dir, msr_accuracies)
    print(msr_accuracies)

    # PAD
    print("--- Evaluating PAD ---")
    pad_accuracies = run_analogy_evaluation(pad_dataset_filepath)
    save_analogies_accuracies_to_file("pad", output_dir, pad_accuracies)
    print(pad_accuracies)
예제 #4
0
def prepare_num_word_meanings_supervised_data(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    id_estimation_num_neighbours: list,
    semeval_2010_14_word_senses_filepath: str,
    tps_neighbourhood_sizes: list,
    raw_data_dir: str,
    output_dir: str,
) -> None:
    """
    Prepares data for the supervised word meanings prediction task.

    Every intermediate result (word/meaning lookup, TPS scores, GAD results,
    estimated intrinsic dimensions, GAD features and vectorized persistence
    diagrams) is cached as a file below `raw_data_dir` and only recomputed if
    its file is missing. The final train/test/SemEval-test CSV files are
    written to `output_dir`; files that already exist there are loaded
    instead of being recreated.

    Parameters
    ----------
    model_dir : str
        Directory of the model to load.
    model_name : str
        Name of the trained word2vec model.
    dataset_name : str
        Name of the dataset the model is trained on.
    id_estimation_num_neighbours : list
        Number of neighbours to use when estimating intrinsic dimension for each word
    semeval_2010_14_word_senses_filepath : str
        Filepath of SemEval-2010 task 14 word senses joblib dict.
    tps_neighbourhood_sizes : list
        List of TPS neighbourhood sizes.
    raw_data_dir : str
        Directory where raw data will be saved to.
    output_dir: str
        Output directory.
    """
    # Convert list arguments to int
    tps_neighbourhood_sizes = [
        int(n_size) for n_size in tps_neighbourhood_sizes
    ]
    id_estimation_num_neighbours = [
        int(num_neighbours) for num_neighbours in id_estimation_num_neighbours
    ]

    # Prepare directory constants and create raw data dir for caching data files
    task_id = f"wme_{model_name}_{dataset_name}"  # wme = word meaning estimation
    task_raw_data_dir = join(raw_data_dir, task_id)
    task_raw_data_tps_dir = join(task_raw_data_dir, "tps")
    makedirs(task_raw_data_dir, exist_ok=True)

    # Load word embeddings from model
    print("Loading word embeddings...")
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
        return_normalized_embeddings=True,
        return_scann_instance_filepath=True,
    )
    last_embedding_weights_normalized = w2v_training_output[
        "last_embedding_weights_normalized"]
    last_embedding_weights_scann_instance_filepath = w2v_training_output[
        "last_embedding_weights_scann_instance_filepath"]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]
    print("Done!")

    # Prepare SemEval-2010 task 14 data, restricted to words in the vocabulary
    semeval_2010_14_word_senses = joblib.load(
        semeval_2010_14_word_senses_filepath)
    semeval_target_words = np.array(
        list(semeval_2010_14_word_senses["all"].keys()))
    semeval_target_words_in_vocab_filter = [
        i for i, word in enumerate(semeval_target_words) if word in word_to_int
    ]
    semeval_target_words_in_vocab = semeval_target_words[
        semeval_target_words_in_vocab_filter]
    semeval_gs_clusters = np.array(
        list(semeval_2010_14_word_senses["all"].values()))
    semeval_gs_clusters_in_vocab = semeval_gs_clusters[
        semeval_target_words_in_vocab_filter]
    semeval_2010_14_word_senses_in_vocab = {
        word: gs_meanings
        for word, gs_meanings in zip(semeval_target_words_in_vocab,
                                     semeval_gs_clusters_in_vocab)
    }

    # Set for O(1) membership tests; `word in <numpy array>` would scan the
    # whole array for every vocabulary word.
    semeval_target_words_in_vocab_set = set(semeval_target_words_in_vocab)

    # (1) -- Find words in Wordnet that are in the word2vec model's vocabulary --
    words_to_num_meanings_filepath = join(task_raw_data_dir,
                                          "words_to_num_meanings.joblib")
    if not isfile(words_to_num_meanings_filepath):
        # SemEval words keep their SemEval values; all other words get the
        # number of Wordnet synsets (words without synsets are left out)
        words_to_num_meanings = semeval_2010_14_word_senses_in_vocab.copy()
        print("Finding words in vocabulary with #Wordnet synsets > 0")
        for word in tqdm(words):
            if word in semeval_target_words_in_vocab_set:
                continue
            num_synsets = len(wn.synsets(word))
            if num_synsets > 0:
                words_to_num_meanings[word] = num_synsets
        joblib.dump(words_to_num_meanings, words_to_num_meanings_filepath)
    else:
        words_to_num_meanings = joblib.load(words_to_num_meanings_filepath)
        print("Loaded words_to_num_meanings!")
    data_words = np.array(list(words_to_num_meanings.keys()))
    data_words_no_semeval = [
        word for word in data_words
        if word not in semeval_target_words_in_vocab_set
    ]
    data_word_to_int = {word: i for i, word in enumerate(data_words)}

    # Filter out word embeddings using Wordnet words (data_words):
    # index of every data word in the full vocabulary
    data_words_to_full_vocab_ints = np.array(
        [word_to_int[word] for word in data_words])

    # (2) -- Compute TPS_n for train/test words --
    makedirs(task_raw_data_tps_dir, exist_ok=True)
    tps_scores_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_scores.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    tps_pds_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_pds.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
            tps_neighbourhood_sizes, tps_scores_filepaths, tps_pds_filepaths):
        # Skip neighbourhood sizes whose results are already cached
        if isfile(tps_scores_filepath) and isfile(tps_pds_filepath):
            continue
        print(
            f"Computing TPS scores using neighbourhood size {tps_neighbourhood_size}..."
        )

        # Load ScaNN instance
        scann_instance = ApproxNN(ann_alg="scann")
        scann_instance.load(
            ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute TPS
        tps_scores_ns, tps_pds_ns = tps_multiple(
            target_words=data_words,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            ann_instance=scann_instance,
            return_persistence_diagram=True,
            n_jobs=-1,
            progressbar_enabled=True,
        )

        # Save result
        print("Saving TPS result...")
        np.save(tps_scores_filepath, tps_scores_ns)
        np.save(tps_pds_filepath, tps_pds_ns)
        print("Done!")

        # Free resources
        del scann_instance

    # (3) -- Compute GAD --
    gad_dir = join(task_raw_data_dir, "gad")
    makedirs(gad_dir, exist_ok=True)
    # (inner, outer) parameters of the k-NN annulus
    gad_params = [
        (25, 250),
        (25, 500),
        (25, 750),
        (25, 1000),
        # ----------
        (50, 250),
        (50, 500),
        (50, 750),
        (50, 1000),
        # ----------
        (100, 1000),
        (100, 1250),
        (100, 1500),
        (100, 1750),
        (100, 2000),
        # ----------
        (150, 1000),
        (150, 1250),
        (150, 1500),
        (150, 1750),
        (150, 2000),
        # ----------
        (200, 1000),
        (200, 1250),
        (200, 1500),
        (200, 1750),
        (200, 2000),
    ]
    # GAD category -> column index in the GAD feature matrix
    gad_categories = {"P_man": 0, "P_int": 1, "P_bnd": 2}
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"

        gad_filepath = join(gad_dir, f"{gad_id}.joblib")
        if isfile(gad_filepath):
            continue
        print(f"-- {gad_id} -- ")

        # Load ScaNN instance
        approx_nn = ApproxNN(ann_alg="scann")
        approx_nn.load(ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute features
        gad_result = compute_gad(
            data_points=last_embedding_weights_normalized,
            data_point_ints=data_words_to_full_vocab_ints,
            manifold_dimension=2,
            data_points_approx_nn=approx_nn,
            use_knn_annulus=True,
            knn_annulus_inner=inner_param,
            knn_annulus_outer=outer_param,
            return_annlus_persistence_diagrams=True,
            progressbar_enabled=True,
            n_jobs=-1,
        )
        print(
            "P_man:",
            len(gad_result["P_man"]),
            "P_int:",
            len(gad_result["P_int"]),
            "P_bnd:",
            len(gad_result["P_bnd"]),
        )
        joblib.dump(gad_result, gad_filepath, protocol=4)

        # Free resources
        del approx_nn

    # (4) -- Estimate the intrinsic dimension (ID) for each word vector --
    words_estimated_ids_dir = join(task_raw_data_dir, "estimated_ids")
    # (name used in filenames, estimator class, constructor keyword arguments)
    id_estimators: List[Tuple[str, GlobalEstimator, dict]] = [
        ("lpca", est_ids.lPCA, {}),
        ("knn", est_ids.KNN, {}),
        ("twonn", est_ids.TwoNN, {}),
        ("mle", est_ids.MLE, {}),
        ("tle", est_ids.TLE, {}),
    ]
    makedirs(words_estimated_ids_dir, exist_ok=True)
    for id_estimator_name, id_estimator_cls, id_estimator_params in id_estimators:
        for num_neighbours in id_estimation_num_neighbours:
            estimated_ids_filepath = join(
                words_estimated_ids_dir,
                f"{id_estimator_name}_{num_neighbours}.npy")
            if isfile(estimated_ids_filepath):
                continue

            print(
                f"Estimating IDs using {id_estimator_cls.__name__} with {num_neighbours} neighbours..."
            )
            # The estimator is fit on the embeddings of the data words only
            id_estimator = id_estimator_cls(**id_estimator_params)
            estimated_ids = id_estimator.fit_predict_pw(
                X=last_embedding_weights_normalized[
                    data_words_to_full_vocab_ints],
                n_neighbors=num_neighbours,
                n_jobs=-1,
            )

            print("Done! Saving to file...")
            np.save(estimated_ids_filepath, estimated_ids)

    # (5) -- Create features from GAD result to speed up combining of data --
    gad_features_dir = join(task_raw_data_dir, "gad_features")
    makedirs(gad_features_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"

        gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
        if isfile(gad_features_filepath):
            continue
        print(f"Creating GAD features for {gad_id}...")

        # Load GAD result
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Features from GAD (P_man, P_int, P_bnd): one indicator column per
        # category, set to 1 if the word's index is listed in that category
        gad_features = np.zeros((len(data_words_to_full_vocab_ints), 3),
                                dtype=int)
        for i, word_int in enumerate(tqdm(data_words_to_full_vocab_ints)):
            for gad_category, gad_category_idx in gad_categories.items():
                if word_int in gad_result[gad_category]:
                    gad_features[i, gad_category_idx] = 1

        # Save GAD features
        np.save(gad_features_filepath, gad_features)

    # (6) -- Vectorize persistence diagrams from GAD features --
    gad_features_pd_vectorized_dir = join(task_raw_data_dir,
                                          "gad_features_pd_vectorized")
    gad_features_pd_vectorized_size = 5
    gad_features_pd_vectorized_size_flat = gad_features_pd_vectorized_size**2
    makedirs(gad_features_pd_vectorized_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"
        gad_features_pd_vecs_filepath = join(gad_features_pd_vectorized_dir,
                                             f"{gad_id}.npy")
        if isfile(gad_features_pd_vecs_filepath):
            continue
        print(f"Vectorizing GAD features for {gad_id}...")

        # Load GAD features
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Use PersistenceImage to vectorize persistence diagrams
        gad_features_pd_vecs = np.zeros((len(data_words_to_full_vocab_ints),
                                         gad_features_pd_vectorized_size_flat))
        for i, point_index in enumerate(tqdm(data_words_to_full_vocab_ints)):

            # Get persistence diagram and create a range such that we get a square image from PersistenceImager
            gad_features_pd = gad_result["annulus_pds"][point_index]
            if len(gad_features_pd) == 0:
                # Empty diagram -> all-zero vector
                gad_features_pd_vecs[i] = np.zeros(
                    gad_features_pd_vectorized_size_flat, dtype=int)
                continue

            births, deaths = gad_features_pd.T
            persistence = deaths - births
            square_min = min(births.min(), persistence.min())
            square_max = max(births.max(), persistence.max())
            square_range = (square_min, square_max)
            pixel_size = (square_max -
                          square_min) / gad_features_pd_vectorized_size

            # Vectorize persistence diagram
            pimgr = PersistenceImager(birth_range=square_range,
                                      pers_range=square_range,
                                      pixel_size=pixel_size)
            pd_vec = pimgr.transform(gad_features_pd)
            gad_features_pd_vecs[i] = pd_vec.flatten()

        # Save persistence image vectors to file
        np.save(gad_features_pd_vecs_filepath, gad_features_pd_vecs)

    # (7) -- Combine data into data (features and labels) for WME task --
    # Make sure the output directory exists before any CSV is written to it
    # (an empty output_dir means the current working directory).
    if output_dir:
        makedirs(output_dir, exist_ok=True)
    word_meaning_train_data_filepath = join(output_dir,
                                            "word_meaning_train_data.csv")
    word_meaning_test_data_filepath = join(output_dir,
                                           "word_meaning_test_data.csv")
    word_meaning_semeval_test_data_filepath = join(
        output_dir, "word_meaning_semeval_test_data.csv")
    if (not isfile(word_meaning_train_data_filepath)
            or not isfile(word_meaning_test_data_filepath)
            or not isfile(word_meaning_semeval_test_data_filepath)):
        # -- Load data for creating features --
        # Load estimated IDs from file
        words_estimated_ids = {
            f"{id_estimator_name}_{num_neighbours}": np.load(
                join(words_estimated_ids_dir,
                     f"{id_estimator_name}_{num_neighbours}.npy"))
            for num_neighbours in id_estimation_num_neighbours
            for id_estimator_name, _, _ in id_estimators
        }
        print("Loaded estimated IDs!")

        # Load GAD features
        gad_features_dict = {}
        for inner_param, outer_param in gad_params:
            gad_id = f"gad_knn_{inner_param}_{outer_param}"

            # Load GAD features
            gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
            gad_features_dict[gad_id] = np.load(gad_features_filepath)
        print("Loaded GAD features!")

        # Load TPS features
        tps_scores = {}
        tps_pds = {}
        for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
                tps_neighbourhood_sizes, tps_scores_filepaths,
                tps_pds_filepaths):
            tps_scores[tps_neighbourhood_size] = np.load(tps_scores_filepath)
            tps_pds[tps_neighbourhood_size] = np.load(tps_pds_filepath,
                                                      allow_pickle=True)
        print("Loaded TPS features!")

        # SemEval words are held out as a separate external test set
        data_words_train, data_words_test = train_test_split(
            data_words_no_semeval, test_size=0.05, random_state=rng_seed)

        # Only missing files are (re)created; files that already exist are
        # loaded so that all three frames are defined for the summary below.
        if not isfile(word_meaning_train_data_filepath):
            print("Preparing data for training...")
            train_data_df = create_word_meaning_model_data_features(
                target_words=data_words_train,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            train_data_df.to_csv(word_meaning_train_data_filepath, index=False)
        else:
            train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        if not isfile(word_meaning_test_data_filepath):
            print("Preparing data for testing...")
            test_data_df = create_word_meaning_model_data_features(
                target_words=data_words_test,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            test_data_df.to_csv(word_meaning_test_data_filepath, index=False)
        else:
            test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        if not isfile(word_meaning_semeval_test_data_filepath):
            print("Preparing data for external testing (SemEval)...")
            semeval_test_data_df = create_word_meaning_model_data_features(
                target_words=semeval_target_words_in_vocab,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            semeval_test_data_df.to_csv(
                word_meaning_semeval_test_data_filepath, index=False)
        else:
            semeval_test_data_df = pd.read_csv(
                word_meaning_semeval_test_data_filepath)
    else:
        train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        semeval_test_data_df = pd.read_csv(
            word_meaning_semeval_test_data_filepath)
    print("Train", train_data_df)
    print("Test", test_data_df)
    print("SemEval test", semeval_test_data_df)