Example 1
def punctured_neighbourhood(
    target_word: str,
    word_to_int: dict,
    word_embeddings_norm: np.ndarray,
    neighbourhood_size: int,
    word_embeddings_pairwise_dists: np.ndarray,
    ann_instance: ApproxNN,
) -> np.ndarray:
    """
    Finds a punctured neighbourhood around a target word using
    cosine distances.

    Parameters
    ----------
    target_word : str
        Target word (w)
    word_to_int : dict of str and int
        Dictionary mapping from word to its integer representation.
    word_embeddings_norm : np.ndarray
        Normalized word embeddings
    neighbourhood_size : int
        Neighbourhood size (n)
    word_embeddings_pairwise_dists : np.ndarray
        Pairwise distances between word embeddings
    ann_instance : ApproxNN
        Approximate nearest neighbour (ANN) instance, built on the word embeddings.
        If specified, the ANN index is used to find punctured neighbourhoods.

    Returns
    -------
    neighbouring_word_embeddings : np.ndarray
        Neighbouring word embeddings of `target_word`, excluding
        the word itself
    """
    # Find neighbouring words (excluding the target word itself)
    target_word_int = word_to_int[target_word]
    if ann_instance is not None:
        neighbourhood_sorted_indices = ann_instance.search(
            query_vector=word_embeddings_norm[target_word_int],
            k_neighbours=neighbourhood_size,
            excluded_neighbour_indices=[target_word_int],
        )
    else:
        if word_embeddings_pairwise_dists is not None:
            neighbourhood_distances = word_embeddings_pairwise_dists[
                target_word_int]
        else:
            neighbourhood_distances = vector_to_matrix_distance(
                u=word_embeddings_norm[target_word_int],
                m=word_embeddings_norm,
                metric=fastdist.euclidean,
                metric_name="euclidean",
            )
        neighbourhood_sorted_indices = np.argsort(
            neighbourhood_distances)[1:neighbourhood_size + 1]
    neighbouring_word_embeddings = word_embeddings_norm[
        neighbourhood_sorted_indices]
    return neighbouring_word_embeddings
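A minimal usage sketch for punctured_neighbourhood on tiny synthetic data, assuming the function and its fastdist dependency are importable; the vocabulary, embeddings and neighbourhood size below are illustrative only.

import numpy as np

# Tiny illustrative vocabulary with random, row-normalized embeddings.
rng = np.random.default_rng(0)
words = ["bank", "river", "money", "water", "loan", "shore"]
word_to_int = {word: i for i, word in enumerate(words)}
embeddings = rng.normal(size=(len(words), 300))
embeddings_norm = embeddings / np.linalg.norm(embeddings, axis=1).reshape(-1, 1)

# With no ANN index and no precomputed distance matrix, distances are computed on the fly.
neighbour_vecs = punctured_neighbourhood(
    target_word="bank",
    word_to_int=word_to_int,
    word_embeddings_norm=embeddings_norm,
    neighbourhood_size=3,
    word_embeddings_pairwise_dists=None,
    ann_instance=None,
)
print(neighbour_vecs.shape)  # (3, 300): the 3 nearest neighbours, excluding "bank" itself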
Example 2
def get_knn_func_data_points(
    data_points: np.ndarray,
    pairwise_distances: np.ndarray = None,
    approx_nn: ApproxNN = None,
    metric: Callable = fastdist.euclidean,
    metric_name: str = "euclidean",
) -> KnnFunc:
    """
    Gets a K-nearest neighbour callable for data points, used in `compute_gad`.

    Parameters
    ----------
    data_points : np.ndarray
        Data points.
    pairwise_distances : np.ndarray, optional
        Pairwise distances of data points (defaults to None).
    approx_nn : ApproxNN, optional
        ApproxNN instance.
    metric : Callable, optional
        fastdist metric; only required if `pairwise_distances` and `approx_nn` are None
        (defaults to fastdist.euclidean).
    metric_name : str, optional
        String name of the `metric` callable (defaults to "euclidean").

    Returns
    -------
    knn_func : KnnFunc
        K-nearest neighbour callable for data points.
    """
    if approx_nn is not None:
        return lambda point_idx, k_neighbours: approx_nn.search(
            query_vector=data_points[point_idx],
            k_neighbours=k_neighbours,
            excluded_neighbour_indices=[point_idx],
            return_distances=True,
        )
    elif pairwise_distances is not None:
        return lambda point_idx, k_neighbours: get_nearest_neighbours(
            distances=pairwise_distances[point_idx],
            k_neighbours=k_neighbours,
        )
    else:
        return lambda point_idx, k_neighbours: get_nearest_neighbours(
            distances=fastdist.vector_to_matrix_distance(
                u=data_points[point_idx],
                m=data_points,
                metric=metric,
                metric_name=metric_name,
            ),
            k_neighbours=k_neighbours,
        )
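A brief sketch of how the returned callable is used: it takes a data point index and a neighbour count and, as in compute_gad, is assumed to return the neighbour indices together with their distances. The synthetic point cloud is illustrative.

import numpy as np
from fastdist import fastdist

# Synthetic point cloud (illustrative only).
rng = np.random.default_rng(0)
points = rng.normal(size=(100, 8))

# Fall back to on-the-fly euclidean distances (no pairwise matrix, no ApproxNN instance).
knn_func = get_knn_func_data_points(
    data_points=points,
    pairwise_distances=None,
    approx_nn=None,
    metric=fastdist.euclidean,
    metric_name="euclidean",
)

# Assumed return order: (neighbour indices, neighbour distances) of the 5 nearest points.
neighbour_indices, neighbour_dists = knn_func(0, 5)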
Example 3
def postprocess_word2vec_embeddings(
    model_training_output_dir: str,
    model_name: str,
    dataset_name: str,
    vocab_size: int,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Applies post-processing to trained word2vec word embeddings:
    - Saves normalized word embeddings
    - Creates approximate nearest-neighbour indices using Annoy and ScaNN

    Parameters
    ----------
    model_training_output_dir : str
        word2vec model training output directory.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    vocab_size : int
        Size of the vocabulary to use, -1 denotes all words.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.
    """
    # Load output from training word2vec
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_training_output_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )
    last_embedding_weights = w2v_training_output["last_embedding_weights"]

    use_full_vocab = False
    if vocab_size == -1:
        vocab_size = last_embedding_weights.shape[0]
        use_full_vocab = True

    # Define filepaths
    last_embedding_weights_filepath = w2v_training_output[
        "last_embedding_weights_filepath"]
    last_embedding_weights_filepath_no_ext = Path(
        last_embedding_weights_filepath).stem
    if use_full_vocab:
        last_embedding_weights_normalized_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_normalized.npy",
        )
    else:
        last_embedding_weights_normalized_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_{vocab_size}_normalized.npy",
        )
    if use_full_vocab:
        model_annoy_index_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_annoy_index.ann",
        )
        model_scann_artifacts_dir = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_scann_artifacts",
        )
    else:
        model_annoy_index_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_{vocab_size}_annoy_index.ann",
        )
        model_scann_artifacts_dir = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_{vocab_size}_scann_artifacts",
        )

    # Normalize word embeddings and save to file
    if not isfile(last_embedding_weights_normalized_filepath):
        print("Normalizing word embeddings and saving to file...")

        # Normalize word embeddings
        if use_full_vocab:
            last_embedding_weights_in_vocab = last_embedding_weights
        else:
            last_embedding_weights_in_vocab = last_embedding_weights[:
                                                                     vocab_size]
        last_embedding_weights_normalized = (
            last_embedding_weights_in_vocab / np.linalg.norm(
                last_embedding_weights_in_vocab, axis=1).reshape(-1, 1))
        np.save(
            last_embedding_weights_normalized_filepath,
            last_embedding_weights_normalized,
        )
        print("Done!")
    else:
        last_embedding_weights_normalized = np.load(
            last_embedding_weights_normalized_filepath)

    annoy_index_created = isfile(model_annoy_index_filepath)
    scann_instance_created = isdir(model_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:

        # Add word embeddings to index and build it
        if use_full_vocab:
            last_embedding_weights_normalized_in_vocab = (
                last_embedding_weights_normalized)
        else:
            last_embedding_weights_normalized_in_vocab = (
                last_embedding_weights_normalized[:vocab_size])

        if not isfile(model_annoy_index_filepath):
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=last_embedding_weights_normalized_in_vocab,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(model_annoy_index_filepath)

        if not isdir(model_scann_artifacts_dir):
            scann_instance = ApproxNN(ann_alg="scann")
            scann_instance.build(
                data=last_embedding_weights_normalized_in_vocab,
                distance_measure="dot_product",
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            scann_instance.save(model_scann_artifacts_dir)
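A possible invocation of the post-processing step; the output directory, model and dataset names are hypothetical, and the Annoy/ScaNN build parameters are illustrative values.

postprocess_word2vec_embeddings(
    model_training_output_dir="output/word2vec_enwiki",  # hypothetical path
    model_name="word2vec",
    dataset_name="enwiki",
    vocab_size=-1,                 # -1 => use the full vocabulary
    annoy_index_n_trees=500,       # illustrative; more trees => higher precision
    scann_num_leaves_scaling=5,    # illustrative; higher scaling => higher precision
)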
Example 4
def load_model_training_output(
    model_training_output_dir: str,
    model_name: str,
    dataset_name: str,
    word_embeddings_mmap_mode: str = "r",
    return_normalized_embeddings: bool = False,
    return_annoy_instance: bool = False,
    annoy_instance_prefault: bool = False,
    return_scann_instance: bool = False,
    return_scann_instance_filepath: bool = False,
) -> dict:
    """
    Loads and returns a dict object containing output from word2vec training

    Parameters
    ----------
    model_training_output_dir : str
        word2vec model training output directory
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    word_embeddings_mmap_mode : str, optional
        Memmap mode to use when loading last word embedding weights (defaults to "r", or read).
    return_normalized_embeddings : bool, optional
        Whether or not to return last embedding weights, normalized, if they
        are present (defaults to False).
    return_annoy_instance : bool, optional
        Whether or not to return Annoy index fit on last embedding weights, if they
        are present (defaults to False).
    annoy_instance_prefault : bool, optional
        Whether or not to enable the `prefault` option when loading the
        Annoy index. `return_annoy_instance` must be set to True for this
        to have an effect (defaults to False).
    return_scann_instance : bool, optional
        Whether or not to return the ScaNN instance fit on the last embedding weights,
        if they are present (defaults to False).
    return_scann_instance_filepath : bool, optional
        Whether or not to return the filepath of the ScaNN instance fit on the last word
        embedding weights, if they are present (defaults to False).

    Returns
    -------
    model_training_output : dict
        Dictionary containing output from word2vec training
    """

    # Get filepaths of the model output
    checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
        output_dir=model_training_output_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )

    # Get last word embeddings from training
    last_embedding_weights_filepath = checkpoint_filepaths_dict[
        "intermediate_embedding_weight_filepaths"][-1]
    last_embedding_weights = np.load(last_embedding_weights_filepath,
                                     mmap_mode=word_embeddings_mmap_mode)

    # Get word counts from tokenizer of word2vec model
    with open(checkpoint_filepaths_dict["train_word_counts_filepath"],
              "r") as word_counts_file:
        word_counts = np.array([
            int(word_count)
            for word_count in word_counts_file.read().split("\n")
        ])

    # Get array of words and word_to_int lookup dictionary
    with open(checkpoint_filepaths_dict["train_words_filepath"],
              "r") as words_file:
        words = np.array(words_file.read().split("\n"))
    word_to_int = {word: i for i, word in enumerate(words)}

    # Normalized embedding weights
    last_embedding_weights_normalized = None
    if (return_normalized_embeddings
            and "intermediate_embedding_weight_normalized_filepaths"
            in checkpoint_filepaths_dict):
        last_embedding_weights_normalized = np.load(
            checkpoint_filepaths_dict[
                "intermediate_embedding_weight_normalized_filepaths"][-1],
            mmap_mode="r",
        )

    # Annoy index
    last_embedding_weights_annoy_instance = None
    if (return_annoy_instance
            and "intermediate_embedding_weight_annoy_index_filepaths"
            in checkpoint_filepaths_dict):
        last_embedding_weights_annoy_instance = ApproxNN(ann_alg="annoy")
        last_embedding_weights_annoy_instance.load(
            ann_path=checkpoint_filepaths_dict[
                "intermediate_embedding_weight_annoy_index_filepaths"][-1],
            annoy_data_dimensionality=last_embedding_weights.shape[1],
            annoy_mertic="euclidean",
            annoy_prefault=annoy_instance_prefault,
        )

    # ScaNN instance
    last_embedding_weights_scann_instance = None
    last_embedding_weights_scann_instance_filepath = None
    if "intermediate_embedding_weight_scann_artifact_dirs" in checkpoint_filepaths_dict:
        scann_instance_filepath = checkpoint_filepaths_dict[
            "intermediate_embedding_weight_scann_artifact_dirs"][-1]
        if return_scann_instance:
            last_embedding_weights_scann_instance = ApproxNN(ann_alg="scann")
            last_embedding_weights_scann_instance.load(
                ann_path=scann_instance_filepath)
        if return_scann_instance_filepath:
            last_embedding_weights_scann_instance_filepath = scann_instance_filepath

    return {
        "last_embedding_weights": last_embedding_weights,
        "last_embedding_weights_filepath": last_embedding_weights_filepath,
        "last_embedding_weights_normalized": last_embedding_weights_normalized,
        "last_embedding_weights_annoy_instance":
        last_embedding_weights_annoy_instance,
        "last_embedding_weights_scann_instance":
        last_embedding_weights_scann_instance,
        "last_embedding_weights_scann_instance_filepath":
        last_embedding_weights_scann_instance_filepath,
        "words": words,
        "word_to_int": word_to_int,
        "word_counts": word_counts,
    }
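A short sketch of loading the training output and unpacking the pieces typically needed downstream; the directory, model and dataset names are hypothetical, while the dictionary keys are those returned above.

w2v_out = load_model_training_output(
    model_training_output_dir="output/word2vec_enwiki",  # hypothetical path
    model_name="word2vec",
    dataset_name="enwiki",
    return_normalized_embeddings=True,
    return_annoy_instance=True,
)
last_embedding_weights = w2v_out["last_embedding_weights"]  # memory-mapped (vocab size, dim) array
embeddings_norm = w2v_out["last_embedding_weights_normalized"]  # None if no normalized weights on disk
words = w2v_out["words"]
word_to_int = w2v_out["word_to_int"]
annoy_instance = w2v_out["last_embedding_weights_annoy_instance"]  # ApproxNN instance or None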
def preprocess_google_news(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean.
       Distributed Representations of Words and Phrases and their Compositionality
       (https://arxiv.org/pdf/1310.4546.pdf). In Proceedings of NIPS, 2013.
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "GoogleNews")
    makedirs(output_dir, exist_ok=True)

    # Define filepaths
    google_news_vectors_zip_raw_download_url = "https://filesender.uninett.no/download.php?token=b0aea55e-72a7-4ac0-9409-8d5dbb322505&files_ids=645861"
    google_news_vectors_zip_raw_filename = "GoogleNews-vectors-negative300.bin.gz"
    google_news_vectors_zip_raw_filepath = join(
        raw_data_dir, google_news_vectors_zip_raw_filename
    )
    google_news_vectors_bin_raw_filepath = join(
        raw_data_dir, "GoogleNews-vectors-negative300.bin"
    )
    google_news_words_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_words.txt"
    )
    google_news_vectors_filepath = join(
        output_dir, "GoogleNews-vectors-negative300.npy"
    )
    google_news_normalized_vectors_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_normalized.npy"
    )
    google_news_vectors_annoy_index_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_annoy_index.ann"
    )
    google_news_vectors_scann_artifacts_dir = join(
        output_dir, "GoogleNews-vectors-negative300_scann_artifacts"
    )

    # -- GoogleNews-vectors-negative300.bin.gz --
    if not isfile(google_news_vectors_zip_raw_filepath):
        print(f"Downloading {google_news_vectors_zip_raw_filename}...")
        download_from_url(
            url=google_news_vectors_zip_raw_download_url,
            destination_filepath=google_news_vectors_zip_raw_filepath,
        )
        print("Done!")

    if not isfile(google_news_vectors_bin_raw_filepath):
        print(f"Extracting {google_news_vectors_zip_raw_filename}...")
        with gzip.GzipFile(google_news_vectors_zip_raw_filepath, "rb") as gzip_file_raw:
            with open(google_news_vectors_bin_raw_filepath, "wb") as gzip_file_output:
                gzip_file_output.write(gzip_file_raw.read())
        print("Done!")

    # Parse vectors from binary file and save result
    should_load_vectors = (
        not isfile(google_news_words_filepath)
        or not isfile(google_news_vectors_filepath)
        or not isfile(google_news_normalized_vectors_filepath)
    )
    if should_load_vectors:
        google_news_word_embeddings, google_news_words = load_word2vec_binary_format(
            word2vec_filepath=google_news_vectors_bin_raw_filepath,
            tqdm_enabled=True,
        )

    # Save words
    if not isfile(google_news_words_filepath):
        with open(google_news_words_filepath, "w") as file:
            for i, word in enumerate(google_news_words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(google_news_vectors_filepath):
        np.save(google_news_vectors_filepath, google_news_word_embeddings)

    # Save normalized word embeddings
    google_news_word_embeddings_normalized = None
    if not isfile(google_news_normalized_vectors_filepath):
        google_news_word_embeddings_normalized = (
            google_news_word_embeddings
            / np.linalg.norm(google_news_word_embeddings, axis=1).reshape(-1, 1)
        )
        np.save(
            google_news_normalized_vectors_filepath,
            google_news_word_embeddings_normalized,
        )

    annoy_index_created = isfile(google_news_vectors_annoy_index_filepath)
    scann_instance_created = isdir(google_news_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if google_news_word_embeddings_normalized is None:
            google_news_word_embeddings_normalized = np.load(
                google_news_normalized_vectors_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=google_news_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(google_news_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=google_news_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(google_news_vectors_scann_artifacts_dir)
def preprocess_fasttext_tps(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Alexander Jakubowski, Milica Gašić, & Marcus Zibrowius (2020).
       Topology of Word Embeddings: Singularities Reflect Polysemy.
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "fastTextTPS")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    env_config = dotenv_values(join("..", ".env"))
    tps_fasttext_model_filesender_token = env_config[
        "TPS_FASTTEXT_MODEL_FILESENDER_TOKEN"
    ]
    tps_fasttext_model_filesender_token_files_ids = env_config[
        "TPS_FASTTEXT_MODEL_FILESENDER_TOKEN_FILES_IDS"
    ]
    tps_fasttext_model_url = f"https://filesender.uninett.no/download.php?token={tps_fasttext_model_filesender_token}&files_ids={tps_fasttext_model_filesender_token_files_ids}"
    tps_fasttext_model_name = "fastText.TPS.300d"
    tps_fasttext_model_raw_filepath = join(
        raw_data_dir, f"{tps_fasttext_model_name}.bin"
    )
    tps_fasttext_model_words_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_words.txt"
    )
    tps_fasttext_model_vectors_filepath = join(
        output_dir, f"{tps_fasttext_model_name}.npy"
    )
    tps_fasttext_model_vectors_normalized_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_normalized.npy"
    )
    tps_fasttext_model_annoy_index_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_annoy_index.ann"
    )
    tps_fasttext_model_scann_artifacts_dir = join(
        output_dir, f"{tps_fasttext_model_name}_scann_artifacts"
    )

    if not isfile(tps_fasttext_model_raw_filepath):
        print(f"Downloading {tps_fasttext_model_name}...")
        download_from_url(
            url=tps_fasttext_model_url,
            destination_filepath=tps_fasttext_model_raw_filepath,
        )
        print("Done!")

    # Load output from trained fastText model
    fasttext_model = fasttext.load_model(tps_fasttext_model_raw_filepath)
    fasttext_model_words = fasttext_model.words
    fasttext_model_embedding_weights = np.zeros(
        (len(fasttext_model_words), fasttext_model.get_dimension())
    )
    for i, word in enumerate(fasttext_model.words):
        fasttext_model_embedding_weights[i] = fasttext_model.get_word_vector(word)

    # Save words
    if not isfile(tps_fasttext_model_words_filepath):
        with open(tps_fasttext_model_words_filepath, "w") as file:
            for i, word in enumerate(fasttext_model.words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(tps_fasttext_model_vectors_filepath):
        np.save(tps_fasttext_model_vectors_filepath, fasttext_model_embedding_weights)

    # Save normalized word embeddings
    fasttext_model_embedding_weights_normalized = None
    if not isfile(tps_fasttext_model_vectors_normalized_filepath):
        fasttext_model_embedding_weights_normalized = (
            fasttext_model_embedding_weights
            / np.linalg.norm(fasttext_model_embedding_weights, axis=1).reshape(-1, 1)
        )
        np.save(
            tps_fasttext_model_vectors_normalized_filepath,
            fasttext_model_embedding_weights_normalized,
        )

    annoy_index_created = isfile(tps_fasttext_model_annoy_index_filepath)
    scann_instance_created = isdir(tps_fasttext_model_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if fasttext_model_embedding_weights_normalized is None:
            fasttext_model_embedding_weights_normalized = np.load(
                tps_fasttext_model_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=fasttext_model_embedding_weights_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(tps_fasttext_model_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=fasttext_model_embedding_weights_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(tps_fasttext_model_scann_artifacts_dir)
def preprocess_fasttext(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Grave, E., Bojanowski, P., Gupta, P., Joulin, A., & Mikolov, T. (2018).
       Learning Word Vectors for 157 Languages. In Proceedings of the International
       Conference on Language Resources and Evaluation (LREC 2018).
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "fastText")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    fasttext_data_filename = "cc.en.300.vec"
    fasttext_vectors_url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{fasttext_data_filename}.gz"
    fasttext_word_vectors_raw_gzip_filepath = join(
        raw_data_dir, f"{fasttext_data_filename}.gz"
    )
    fasttext_word_vectors_raw_txt_filepath = join(raw_data_dir, fasttext_data_filename)
    fasttext_word_vectors_words_filepath = join(
        output_dir, f"{fasttext_data_filename}_words.txt"
    )
    fasttext_word_vectors_filepath = join(output_dir, f"{fasttext_data_filename}.npy")
    fasttext_word_vectors_normalized_filepath = join(
        output_dir, f"{fasttext_data_filename}_normalized.npy"
    )
    fasttext_word_vectors_annoy_index_filepath = join(
        output_dir, f"{fasttext_data_filename}_annoy_index.ann"
    )
    fasttext_word_vectors_scann_artifacts_dir = join(
        output_dir, f"{fasttext_data_filename}_scann_artifacts"
    )

    if not isfile(fasttext_word_vectors_raw_gzip_filepath):
        print(f"Downloading {fasttext_data_filename}...")
        download_from_url(
            url=fasttext_vectors_url,
            destination_filepath=fasttext_word_vectors_raw_gzip_filepath,
        )
        print("Done!")

    if not isfile(fasttext_word_vectors_raw_txt_filepath):
        print(f"Extracting {fasttext_data_filename}...")
        with gzip.GzipFile(
            fasttext_word_vectors_raw_gzip_filepath, "rb"
        ) as gzip_file_raw:
            with open(fasttext_word_vectors_raw_txt_filepath, "wb") as gzip_file_output:
                gzip_file_output.write(gzip_file_raw.read())
        print("Done!")

    # Parse vectors from text file and save result
    should_load_vectors = (
        not isfile(fasttext_word_vectors_words_filepath)
        or not isfile(fasttext_word_vectors_filepath)
        or not isfile(fasttext_word_vectors_normalized_filepath)
    )
    if should_load_vectors:
        fasttext_word_embeddings, fasttext_words = load_word_embeddings_text_format(
            word_embeddings_text_filepath=fasttext_word_vectors_raw_txt_filepath,
            first_line_header=True,
            tqdm_enabled=True,
        )

    # Save words
    if not isfile(fasttext_word_vectors_words_filepath):
        with open(fasttext_word_vectors_words_filepath, "w") as file:
            for i, word in enumerate(fasttext_words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(fasttext_word_vectors_filepath):
        np.save(fasttext_word_vectors_filepath, fasttext_word_embeddings)

    # Save normalized word embeddings
    fasttext_word_embeddings_normalized = None
    if not isfile(fasttext_word_vectors_normalized_filepath):
        fasttext_word_embeddings_normalized = fasttext_word_embeddings / np.linalg.norm(
            fasttext_word_embeddings, axis=1
        ).reshape(-1, 1)
        np.save(
            fasttext_word_vectors_normalized_filepath,
            fasttext_word_embeddings_normalized,
        )

    annoy_index_created = isfile(fasttext_word_vectors_annoy_index_filepath)
    scann_instance_created = isdir(fasttext_word_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if fasttext_word_embeddings_normalized is None:
            fasttext_word_embeddings_normalized = np.load(
                fasttext_word_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=fasttext_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(fasttext_word_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=fasttext_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(fasttext_word_vectors_scann_artifacts_dir)
def preprocess_glove(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Jeffrey Pennington, Richard Socher, & Christopher D. Manning (2014).
       GloVe: Global Vectors for Word Representation. In Empirical Methods in Natural
       Language Processing (EMNLP) (pp. 1532–1543).
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "GloVe")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    glove_data_filename = "glove.840B.300d"
    glove_word_vectors_url = f"http://nlp.stanford.edu/data/{glove_data_filename}.zip"
    glove_word_vectors_raw_zip_filepath = join(
        raw_data_dir, f"{glove_data_filename}.zip"
    )
    glove_word_vectors_raw_txt_filename = f"{glove_data_filename}.txt"
    glove_word_vectors_raw_txt_filepath = join(
        raw_data_dir, glove_word_vectors_raw_txt_filename
    )
    glove_word_vectors_words_filepath = join(
        output_dir, f"{glove_data_filename}_words.txt"
    )
    glove_word_vectors_filepath = join(output_dir, f"{glove_data_filename}.npy")
    glove_word_vectors_normalized_filepath = join(
        output_dir, f"{glove_data_filename}_normalized.npy"
    )
    glove_word_vectors_annoy_index_filepath = join(
        output_dir, f"{glove_data_filename}_annoy_index.ann"
    )
    glove_word_vectors_scann_artifacts_dir = join(
        output_dir, f"{glove_data_filename}_scann_artifacts"
    )

    if not isfile(glove_word_vectors_raw_zip_filepath):
        print(f"Downloading {glove_data_filename}...")
        download_from_url(
            url=glove_word_vectors_url,
            destination_filepath=glove_word_vectors_raw_zip_filepath,
        )
        print("Done!")

    if not isfile(glove_word_vectors_raw_txt_filepath):
        print(f"Extracting {glove_data_filename}...")
        with zipfile.ZipFile(glove_word_vectors_raw_zip_filepath, "r") as zip_ref:
            zip_ref.extractall(raw_data_dir)
        print("Done!")

    # Parse vectors from text file and save result
    should_load_vectors = (
        not isfile(glove_word_vectors_words_filepath)
        or not isfile(glove_word_vectors_filepath)
        or not isfile(glove_word_vectors_normalized_filepath)
    )
    if should_load_vectors:
        glove_word_embeddings, glove_words = load_word_embeddings_text_format(
            word_embeddings_text_filepath=glove_word_vectors_raw_txt_filepath,
            first_line_header=False,
            tqdm_enabled=True,
        )

    # Save words
    if not isfile(glove_word_vectors_words_filepath):
        with open(glove_word_vectors_words_filepath, "w") as file:
            for i, word in enumerate(glove_words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(glove_word_vectors_filepath):
        np.save(glove_word_vectors_filepath, glove_word_embeddings)

    # Save normalized word embeddings
    glove_word_embeddings_normalized = None
    if not isfile(glove_word_vectors_normalized_filepath):
        glove_word_embeddings_normalized = glove_word_embeddings / np.linalg.norm(
            glove_word_embeddings, axis=1
        ).reshape(-1, 1)
        np.save(
            glove_word_vectors_normalized_filepath,
            glove_word_embeddings_normalized,
        )

    annoy_index_created = isfile(glove_word_vectors_annoy_index_filepath)
    scann_instance_created = isdir(glove_word_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if glove_word_embeddings_normalized is None:
            glove_word_embeddings_normalized = np.load(
                glove_word_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=glove_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(glove_word_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=glove_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(glove_word_vectors_scann_artifacts_dir)
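The four preprocess_* functions above share the same pattern: download and extract the raw vectors, save the vocabulary, the raw embeddings and their row-wise L2 normalization, then build Annoy and ScaNN indices on the normalized embeddings. A minimal sketch of that shared normalize-and-index step follows; the ApproxNN build arguments mirror the calls above, the synthetic data and parameter values are illustrative, and the output paths are hypothetical.

import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(1000, 300))  # synthetic (vocabulary size, dim) embeddings

# Row-wise L2 normalization, as done before saving the *_normalized.npy files above.
embeddings_norm = embeddings / np.linalg.norm(embeddings, axis=1).reshape(-1, 1)

# Annoy index built with euclidean distance on the normalized vectors.
ann_index_annoy = ApproxNN(ann_alg="annoy")
ann_index_annoy.build(
    data=embeddings_norm,
    annoy_n_trees=10,  # illustrative value
    distance_measure="euclidean",
)
ann_index_annoy.save("embeddings_annoy_index.ann")  # hypothetical output path

# ScaNN instance built with the default distance measure, as in the preprocess_* functions.
ann_index_scann = ApproxNN(ann_alg="scann")
ann_index_scann.build(
    data=embeddings_norm,
    scann_num_leaves_scaling=1,  # illustrative value
)
ann_index_scann.save("embeddings_scann_artifacts")  # hypothetical output directory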
def evaluate_word2vec(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    sswr_dataset_filepath: str,
    msr_dataset_filepath: str,
    pad_dataset_filepath: str,
    vocab_size: int,
    approx_nn_path: str,
    approx_nn_alg: str,
    top_n_prediction: int,
    output_dir: str,
) -> None:
    """
    Evaluates a word2vec model on the SSWR, MSR and PAD test analogy datasets.

    Parameters
    ----------
    model_dir : str
        Directory of the model to evaluate.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    sswr_dataset_filepath : str
        Filepath of the SSWR test dataset.
    msr_dataset_filepath : str
        Filepath of the MSR test dataset.
    pad_dataset_filepath : str
        Filepath of the PAD test dataset
    vocab_size : int
        Vocabulary size to use when evaluating on the test datasets.
    approx_nn_path : str
        Filepath of an ApproxNN instance, built on the word embeddings.
    approx_nn_alg : str
        Algorithm of ApproxNN instance.
    top_n_prediction : int
        N to use for top-N prediction when evaluating analogies (e.g. 1 for top-1 accuracy).
    output_dir : str
        Output directory to save evaluation results.
    """
    # Load output from training word2vec
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )
    last_embedding_weights = w2v_training_output["last_embedding_weights"]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]

    # Append date/time to output directory.
    output_dir = join(output_dir, datetime.now().strftime("%d-%b-%Y_%H-%M-%S"))
    makedirs(output_dir, exist_ok=True)

    # Load ApproxNN instance
    approx_nn = None
    if approx_nn_path != "":
        approx_nn = ApproxNN(ann_alg=approx_nn_alg)
        load_args = {}
        if approx_nn_alg == "annoy":
            load_args[
                "annoy_data_dimensionality"] = last_embedding_weights.shape[1]
            load_args["annoy_mertic"] = "euclidean"
            load_args["annoy_prefault"] = True
        approx_nn.load(approx_nn_path, **load_args)

    # SSWR
    print("--- Evaluating SSWR ---")
    sswr_accuracies = evaluate_model_word_analogies(
        analogies_filepath=sswr_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )

    # Compute average semantic and syntactic accuracies
    sswr_categories = list(sswr_accuracies.keys())
    sswr_semantic_categories = sswr_categories[:5]
    sswr_syntactic_categories = sswr_categories[5:-1]
    sswr_semantic_avg_acc = np.mean(
        [sswr_accuracies[cat] for cat in sswr_semantic_categories])
    sswr_syntactic_avg_acc = np.mean(
        [sswr_accuracies[cat] for cat in sswr_syntactic_categories])
    sswr_accuracies["semantic_avg"] = sswr_semantic_avg_acc
    sswr_accuracies["syntactic_avg"] = sswr_syntactic_avg_acc
    save_analogies_accuracies_to_file("sswr", output_dir, sswr_accuracies)
    print(sswr_accuracies)

    # MSR
    print("--- Evaluating MSR ---")
    msr_accuracies = evaluate_model_word_analogies(
        analogies_filepath=msr_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )
    save_analogies_accuracies_to_file("msr", output_dir, msr_accuracies)
    print(msr_accuracies)

    # PAD
    print("--- Evaluating PAD ---")
    pad_accuracies = evaluate_model_word_analogies(
        analogies_filepath=pad_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )
    save_analogies_accuracies_to_file("pad", output_dir, pad_accuracies)
    print(pad_accuracies)
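A possible invocation of the evaluation, with hypothetical file paths; leaving approx_nn_path as an empty string keeps approx_nn set to None, so the analogy evaluation runs without an ANN index.

evaluate_word2vec(
    model_dir="output/word2vec_enwiki",            # hypothetical paths throughout
    model_name="word2vec",
    dataset_name="enwiki",
    sswr_dataset_filepath="data/sswr_analogies.txt",
    msr_dataset_filepath="data/msr_analogies.txt",
    pad_dataset_filepath="data/pad_analogies.txt",
    vocab_size=30000,        # illustrative
    approx_nn_path="",       # empty string => no ApproxNN instance is loaded
    approx_nn_alg="scann",
    top_n_prediction=1,
    output_dir="output/analogy_eval",
)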
def topological_polysemy_pipeline(
    semeval_word_senses_filepath: str,
    word2vec_semeval_model_dir: str,
    word2vec_enwiki_model_dir: str,
    word2vec_google_news_model_dir: str,
    glove_model_dir: str,
    fasttext_model_dir: str,
    fasttext_tps_model_dir: str,
    tps_neighbourhood_sizes: list,
    num_top_k_words_frequencies: int,
    cyclo_octane_data_filepath: str,
    henneberg_data_filepath: str,
    custom_point_cloud_neighbourhood_size: int,
    output_dir: str,
) -> None:
    """
    Computes the topological polysemy of various word embeddings and data sets.
    Saves results to the output directory, along with some additional plots.

    Parameters
    ----------
    semeval_word_senses_filepath : str
        Filepath of the SemEval-2010 task 14 word senses
    word2vec_semeval_model_dir : str
        Directory of the SemEval-2010 task 14 word2vec model.
    word2vec_enwiki_model_dir : str
        Directory of the enwiki word2vec model.
    word2vec_google_news_model_dir : str
        Directory of the Google News 3M word2vec model
    glove_model_dir : str
        Directory of the GloVe model.
    fasttext_model_dir : str
        Directory of the fastText model.
    fasttext_tps_model_dir : str
        Directory of the TPS fastText model.
    tps_neighbourhood_sizes : list
        Neighbourhood sizes to use when computing TPS (e.g. 50, 60).
    num_top_k_words_frequencies : int
        Number of top words to use when computing TPS scores vs. word frequencies.
    cyclo_octane_data_filepath : str
        Filepath of the cyclo-octane dataset.
    henneberg_data_filepath : str
        Filepath of the Henneberg dataset.
    custom_point_cloud_neighbourhood_size : int
        Neighbourhood size to use when computing TPS for custom point clouds.
    output_dir : str
        Output directory to save results.
    """
    # Ensure output directory exists
    makedirs(output_dir, exist_ok=True)

    # Load SemEval-2010 task 14 word senses
    semeval_word_senses: dict = joblib.load(semeval_word_senses_filepath)
    semeval_target_words = np.array(list(semeval_word_senses["all"].keys()))
    semeval_target_word_gs_clusters = np.array(
        list(semeval_word_senses["all"].values())
    )

    # Parse strings into int
    tps_neighbourhood_sizes = [int(n_size) for n_size in tps_neighbourhood_sizes]

    # -- Compute TPS for word embeddings (SemEval and enwiki) --
    for dataset_name, model_dir in zip(
        ["semeval_2010_task_14", "enwiki"],
        [word2vec_semeval_model_dir, word2vec_enwiki_model_dir],
    ):
        # Load word embeddings
        print(f"Loading {dataset_name} word embeddings...")
        w2v_training_output = load_model_training_output(
            model_training_output_dir=model_dir,
            model_name="word2vec",
            dataset_name=dataset_name,
            return_normalized_embeddings=True,
            return_scann_instance=True,
        )
        last_embedding_weights_normalized = w2v_training_output[
            "last_embedding_weights_normalized"
        ]
        last_embedding_weights_scann_instance = w2v_training_output[
            "last_embedding_weights_scann_instance"
        ]
        words = w2v_training_output["words"]
        word_to_int = w2v_training_output["word_to_int"]
        word_counts = w2v_training_output["word_counts"]
        print("Done!")

        print("Computing TPS for word embeddings...")
        tps_word_embeddings(
            word_embeddings_name=dataset_name,
            neighbourhood_sizes=tps_neighbourhood_sizes,
            semeval_target_words=semeval_target_words,
            semeval_target_words_gs_clusters=semeval_target_word_gs_clusters,
            word_embeddings_normalized=last_embedding_weights_normalized,
            word_to_int=word_to_int,
            word_vocabulary=words,
            num_top_k_words_frequencies=num_top_k_words_frequencies,
            output_dir=output_dir,
            word_counts=word_counts,
            ann_instance=last_embedding_weights_scann_instance,
        )
        del last_embedding_weights_scann_instance
        print("Done!")

    # -- Compute TPS for external word embeddings --
    # Prepare constants
    external_word_embeddings = [
        (
            "google_news_3m",
            "GoogleNews-vectors-negative300",
            word2vec_google_news_model_dir,
        ),
        (
            "glove_cc_840b_300d",
            "glove.840B.300d",
            glove_model_dir,
        ),
        (
            "fasttext_cc_300d",
            "cc.en.300.vec",
            fasttext_model_dir,
        ),
        (
            "fasttext_tps_300d",
            "fastText.TPS.300d",
            fasttext_tps_model_dir,
        ),
    ]

    # Compute TPS for each external word embeddings
    for word_embeddings_name, model_name, model_dir in external_word_embeddings:

        # Prepare filepaths
        model_normalized_weights_filepath = join(
            model_dir, f"{model_name}_normalized.npy"
        )
        model_words_filepath = join(model_dir, f"{model_name}_words.txt")
        model_scann_artifacts_dir = join(model_dir, f"{model_name}_scann_artifacts")

        # Load data
        print(f"Loading {model_name} data...")
        model_weights_normalized = np.load(
            model_normalized_weights_filepath, mmap_mode="r"
        )
        with open(model_words_filepath, "r") as words_file:
            model_words = np.array(words_file.read().split("\n"))
        model_approx_nn = ApproxNN(ann_alg="scann")
        model_approx_nn.load(ann_path=model_scann_artifacts_dir)
        print("Done!")

        print(f"Computing TPS for {model_name} word embeddings...")
        tps_word_embeddings(
            word_embeddings_name=word_embeddings_name,
            neighbourhood_sizes=tps_neighbourhood_sizes,
            semeval_target_words=semeval_target_words,
            semeval_target_words_gs_clusters=semeval_target_word_gs_clusters,
            word_embeddings_normalized=model_weights_normalized,
            word_to_int={word: i for i, word in enumerate(model_words)},
            word_vocabulary=model_words,
            num_top_k_words_frequencies=num_top_k_words_frequencies,
            output_dir=output_dir,
            ann_instance=model_approx_nn,
        )
        del model_approx_nn
        print("Done!")

    # -- Compute TPS for custom point clouds --
    for point_cloud_name, point_cloud_filepath in zip(
        ["cyclo_octane", "henneberg"],
        [cyclo_octane_data_filepath, henneberg_data_filepath],
    ):
        # Load and prepare data for TPS
        point_cloud = pd.read_csv(point_cloud_filepath, header=None).values
        point_cloud_normalized = point_cloud / np.linalg.norm(
            point_cloud, axis=1
        ).reshape(-1, 1)
        point_cloud_pairwise_dists = euclidean_distances(point_cloud)

        # Compute TPS scores
        num_points = len(point_cloud)
        tps_scores = np.zeros(num_points)
        print(f"Computing TPS scores for {point_cloud_name}...")
        for point_index in tqdm(range(num_points)):
            tps_score = tps_point_cloud(
                point_index=point_index,
                neighbourhood_size=custom_point_cloud_neighbourhood_size,
                point_cloud_normalized=point_cloud_normalized,
                point_cloud_pairwise_dists=point_cloud_pairwise_dists,
            )
            tps_scores[point_index] = tps_score

        # Save result
        point_cloud_output_dir = join(output_dir, point_cloud_name)
        makedirs(point_cloud_output_dir, exist_ok=True)
        np.save(
            join(
                point_cloud_output_dir,
                f"tps_scores_{custom_point_cloud_neighbourhood_size}.npy",
            ),
            tps_scores,
        )
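A possible invocation of the full pipeline, with hypothetical paths; the neighbourhood sizes are passed as a list of strings and parsed to ints inside the function, and all values below are illustrative.

topological_polysemy_pipeline(
    semeval_word_senses_filepath="data/semeval_2010_14_word_senses.joblib",  # hypothetical paths throughout
    word2vec_semeval_model_dir="output/word2vec_semeval",
    word2vec_enwiki_model_dir="output/word2vec_enwiki",
    word2vec_google_news_model_dir="data/GoogleNews",
    glove_model_dir="data/GloVe",
    fasttext_model_dir="data/fastText",
    fasttext_tps_model_dir="data/fastTextTPS",
    tps_neighbourhood_sizes=["50", "60"],
    num_top_k_words_frequencies=10000,
    cyclo_octane_data_filepath="data/cyclo_octane.csv",
    henneberg_data_filepath="data/henneberg.csv",
    custom_point_cloud_neighbourhood_size=50,
    output_dir="output/tps",
)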
Example 11
def similar_words(
    weights: np.ndarray,
    word_to_int: Dict[str, int],
    words: np.ndarray,
    ann_instance: ApproxNN = None,
    top_n: int = 10,
    positive_words: Optional[List[str]] = None,
    negative_words: Optional[List[str]] = None,
    vocab_size: int = -1,
    return_similarity_score: bool = True,
) -> List[Union[Tuple, str]]:
    """
    Finds the most similar words of a linear combination of positively and negatively
    contributing words.

    Parameters
    ----------
    weights : np.ndarray
        Numpy matrix (vocabulary size, embedding dim) containing word vectors.
    word_to_int : dict of str and int
        Dictionary mapping from word to its integer representation.
    words : np.ndarray
        Numpy array containing words from the vocabulary.
    ann_instance : ApproxNN, optional
        ApproxNN instance, built on word embeddings (defaults to None).
    top_n : int, optional
        Number of similar words (defaults to 10).
    positive_words : list of str, optional
        List of words contributing positively (defaults to empty list).
    negative_words : list of str, optional
        List of words contributing negatively (defaults to empty list).
    vocab_size : int, optional
        Vocabulary size to use, i.e., only the most common `vocab_size` words are
        taken into account (defaults to -1, meaning all words).
    return_similarity_score : bool, optional
        Whether or not to return the cosine similarity score (`ann_instance`
        must be set to None to have an effect).

    Returns
    -------
    If return_similarity_score is True, then
        pairs : list of tuples of str and float
            List of `top_n` similar words and their cosine similarities.
    else:
        closest_words : list of str
            List of `top_n` similar words.
    """
    # Default values
    if positive_words is None:
        positive_words = []
    if negative_words is None:
        negative_words = []

    # Restrict vocabulary
    if vocab_size > 0:
        weights = weights[:vocab_size]
        words = words[:vocab_size]

    # Create query word vector
    query_word_vec = np.zeros((weights.shape[1], ), dtype=np.float64)
    query_word_vec += np.array([
        get_word_vec(pos_word, word_to_int, weights)
        for pos_word in positive_words
    ]).sum(axis=0)
    query_word_vec -= np.array([
        get_word_vec(neg_word, word_to_int, weights)
        for neg_word in negative_words
    ]).sum(axis=0)

    # Create indices list of query words to exclude from search
    exclude_words_indices = [
        word_to_int[word] for word in positive_words + negative_words
    ]

    # Find closest words
    if ann_instance is None:

        # Use cosine similarity to find similar words
        cos_sims = fastdist.cosine_vector_to_matrix(query_word_vec, weights)
        sorted_indices = cos_sims.argsort()[::-1]
        sorted_indices = [
            idx for idx in sorted_indices if idx not in exclude_words_indices
        ]
    else:
        query_word_vec_norm = query_word_vec / np.linalg.norm(query_word_vec)
        sorted_indices = ann_instance.search(
            query_vector=query_word_vec_norm,
            k_neighbours=top_n,
            excluded_neighbour_indices=exclude_words_indices,
        )

    # Filter top words/similarities
    top_words = words[sorted_indices][:top_n]

    # Create word similarity pairs
    if return_similarity_score and ann_instance is None:
        top_sims = cos_sims[sorted_indices][:top_n]
        result = list(zip(top_words, top_sims))
    else:
        result = top_words

    return result
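A usage sketch of similar_words with the classic analogy query; weights, words and word_to_int are assumed to come from load_model_training_output above, and without an ApproxNN instance the function computes exact cosine similarities and returns (word, similarity) pairs.

# weights, words and word_to_int as returned by load_model_training_output.
analogy_result = similar_words(
    weights=weights,
    word_to_int=word_to_int,
    words=words,
    positive_words=["king", "woman"],
    negative_words=["man"],
    top_n=5,
    vocab_size=30000,  # restrict to the 30000 most common words (illustrative)
)
for word, cosine_similarity in analogy_result:
    print(word, cosine_similarity)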
Example 12
def prepare_num_word_meanings_supervised_data(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    id_estimation_num_neighbours: list,
    semeval_2010_14_word_senses_filepath: str,
    tps_neighbourhood_sizes: list,
    raw_data_dir: str,
    output_dir: str,
) -> None:
    """
    Prepares data for the supervised word meanings prediction task.

    Parameters
    ----------
    model_dir : str
        Directory of the model to load.
    model_name : str
        Name of the trained word2vec model.
    dataset_name : str
        Name of the dataset the model is trained on.
    id_estimation_num_neighbours : list
        Number of neighbours to use when estimating intrinsic dimension for each word
    semeval_2010_14_word_senses_filepath : str
        Filepath of SemEval-2010 task 14 word senses joblib dict.
    tps_neighbourhood_sizes : list
        List of TPS neighbourhood sizes.
    raw_data_dir : str
        Directory where raw data will be saved to.
    output_dir: str
        Output directory.
    """
    # Convert list arguments to int
    tps_neighbourhood_sizes = [
        int(n_size) for n_size in tps_neighbourhood_sizes
    ]
    id_estimation_num_neighbours = [
        int(num_neighbours) for num_neighbours in id_estimation_num_neighbours
    ]

    # Prepare directory constants and create raw data dir for caching data files
    task_id = f"wme_{model_name}_{dataset_name}"  # wme = word meaning estimation
    task_raw_data_dir = join(raw_data_dir, task_id)
    task_raw_data_tps_dir = join(task_raw_data_dir, "tps")
    makedirs(task_raw_data_dir, exist_ok=True)

    # Load word embeddings from model
    print("Loading word embeddings...")
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
        return_normalized_embeddings=True,
        return_scann_instance_filepath=True,
    )
    last_embedding_weights_normalized = w2v_training_output[
        "last_embedding_weights_normalized"]
    last_embedding_weights_scann_instance_filepath = w2v_training_output[
        "last_embedding_weights_scann_instance_filepath"]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]
    print("Done!")

    # Prepare SemEval-2010 task 14 data
    semeval_2010_14_word_senses = joblib.load(
        semeval_2010_14_word_senses_filepath)
    semeval_target_words = np.array(
        list(semeval_2010_14_word_senses["all"].keys()))
    semeval_target_words_in_vocab_filter = [
        i for i, word in enumerate(semeval_target_words) if word in word_to_int
    ]
    semeval_target_words_in_vocab = semeval_target_words[
        semeval_target_words_in_vocab_filter]
    semeval_gs_clusters = np.array(
        list(semeval_2010_14_word_senses["all"].values()))
    semeval_gs_clusters_in_vocab = semeval_gs_clusters[
        semeval_target_words_in_vocab_filter]
    semeval_2010_14_word_senses_in_vocab = {
        word: gs_meanings
        for word, gs_meanings in zip(semeval_target_words_in_vocab,
                                     semeval_gs_clusters_in_vocab)
    }

    # (1) -- Find words in Wordnet that are in the word2vec model's vocabulary --
    words_to_num_meanings_filepath = join(task_raw_data_dir,
                                          "words_to_num_meanings.joblib")
    if not isfile(words_to_num_meanings_filepath):
        words_to_num_meanings = semeval_2010_14_word_senses_in_vocab.copy()
        print("Finding words in vocabulary with #Wordnet synsets > 0")
        for word in tqdm(words):
            if word in semeval_target_words_in_vocab:
                continue
            num_synsets = len(wn.synsets(word))
            if num_synsets > 0:
                words_to_num_meanings[word] = num_synsets
        joblib.dump(words_to_num_meanings, words_to_num_meanings_filepath)
    else:
        words_to_num_meanings = joblib.load(words_to_num_meanings_filepath)
        print("Loaded words_to_num_meanings!")
    data_words = np.array(list(words_to_num_meanings.keys()))
    data_words_no_semeval = [
        word for word in data_words
        if word not in semeval_target_words_in_vocab
    ]
    data_word_to_int = {word: i for i, word in enumerate(data_words)}

    # Filter out word embeddings using Wordnet words (data_words)
    data_words_to_full_vocab_ints = np.array(
        [word_to_int[word] for word in data_words])

    # (2) -- Compute TPS_n for train/test words --
    makedirs(task_raw_data_tps_dir, exist_ok=True)
    tps_scores_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_scores.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    tps_pds_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_pds.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
            tps_neighbourhood_sizes, tps_scores_filepaths, tps_pds_filepaths):
        if isfile(tps_scores_filepath) and isfile(tps_pds_filepath):
            continue
        print(
            f"Computing TPS scores using neighbourhood size {tps_neighbourhood_size}..."
        )

        # Load ScaNN instance
        scann_instance = ApproxNN(ann_alg="scann")
        scann_instance.load(
            ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute TPS
        tps_scores_ns, tps_pds_ns = tps_multiple(
            target_words=data_words,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            ann_instance=scann_instance,
            return_persistence_diagram=True,
            n_jobs=-1,
            progressbar_enabled=True,
        )

        # Save result
        print("Saving TPS result...")
        np.save(tps_scores_filepath, tps_scores_ns)
        np.save(tps_pds_filepath, tps_pds_ns)
        print("Done!")

        # Free resources
        del scann_instance

    # (3) -- Compute GAD --
    gad_dir = join(task_raw_data_dir, "gad")
    makedirs(gad_dir, exist_ok=True)
    gad_params = [
        (25, 250),
        (25, 500),
        (25, 750),
        (25, 1000),
        # ----------
        (50, 250),
        (50, 500),
        (50, 750),
        (50, 1000),
        # ----------
        (100, 1000),
        (100, 1250),
        (100, 1500),
        (100, 1750),
        (100, 2000),
        # ----------
        (150, 1000),
        (150, 1250),
        (150, 1500),
        (150, 1750),
        (150, 2000),
        # ----------
        (200, 1000),
        (200, 1250),
        (200, 1500),
        (200, 1750),
        (200, 2000),
    ]
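    # Each (inner, outer) pair above sets the inner and outer k-NN annulus sizes
    # (numbers of neighbours) passed to compute_gad below.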
    gad_categories = {"P_man": 0, "P_int": 1, "P_bnd": 2}
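    # compute_gad assigns each point to one of the three categories above; in the
    # geometric anomaly detection (GAD) literature these typically correspond to
    # manifold (P_man), intersection/singular (P_int) and boundary (P_bnd) points.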
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"

        gad_filepath = join(gad_dir, f"{gad_id}.joblib")
        if isfile(gad_filepath):
            continue
        print(f"-- {gad_id} -- ")

        # Load ScaNN instance
        approx_nn = ApproxNN(ann_alg="scann")
        approx_nn.load(ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute features
        gad_result = compute_gad(
            data_points=last_embedding_weights_normalized,
            data_point_ints=data_words_to_full_vocab_ints,
            manifold_dimension=2,
            data_points_approx_nn=approx_nn,
            use_knn_annulus=True,
            knn_annulus_inner=inner_param,
            knn_annulus_outer=outer_param,
            return_annlus_persistence_diagrams=True,
            progressbar_enabled=True,
            n_jobs=-1,
        )
        print(
            "P_man:",
            len(gad_result["P_man"]),
            "P_int:",
            len(gad_result["P_int"]),
            "P_bnd:",
            len(gad_result["P_bnd"]),
        )
        joblib.dump(gad_result, gad_filepath, protocol=4)

        # Free resources
        del approx_nn

    # (4) -- Estimate the intrinsic dimension (ID) for each word vector --
    words_estimated_ids_dir = join(task_raw_data_dir, "estimated_ids")
    id_estimators: List[Tuple[str, GlobalEstimator, dict]] = [
        ("lpca", est_ids.lPCA, {}),
        ("knn", est_ids.KNN, {}),
        ("twonn", est_ids.TwoNN, {}),
        ("mle", est_ids.MLE, {}),
        ("tle", est_ids.TLE, {}),
    ]
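    # Pointwise intrinsic-dimension estimators (local PCA, kNN, TwoNN, MLE and
    # TLE); each is run below for every neighbourhood size in
    # id_estimation_num_neighbours and the results are cached to disk.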
    makedirs(words_estimated_ids_dir, exist_ok=True)
    for id_estimator_name, id_estimator_cls, id_estimator_params in id_estimators:
        for num_neighbours in id_estimation_num_neighbours:
            estimated_ids_filepath = join(
                words_estimated_ids_dir,
                f"{id_estimator_name}_{num_neighbours}.npy")
            if isfile(estimated_ids_filepath):
                continue

            print(
                f"Estimating IDs using {id_estimator_cls.__name__} with {num_neighbours} neighbours..."
            )
            id_estimator = id_estimator_cls(**id_estimator_params)
            estimated_ids = id_estimator.fit_predict_pw(
                X=last_embedding_weights_normalized[
                    data_words_to_full_vocab_ints],
                n_neighbors=num_neighbours,
                n_jobs=-1,
            )

            print("Done! Saving to file...")
            np.save(estimated_ids_filepath, estimated_ids)

    # (5) -- Create binary features from the GAD results to speed up the later combination of data --
    gad_features_dir = join(task_raw_data_dir, "gad_features")
    makedirs(gad_features_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"

        gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
        if isfile(gad_features_filepath):
            continue
        print(f"Creating GAD features for {gad_id}...")

        # Load GAD result
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Features from GAD (P_man, P_int, P_bnd)
        gad_features = np.zeros((len(data_words_to_full_vocab_ints), 3),
                                dtype=int)
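        # Binary indicator per word and category: 1 if the word's vocabulary
        # index appears in the corresponding GAD point set.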
        for i, word_int in enumerate(tqdm(data_words_to_full_vocab_ints)):
            for gad_category, gad_category_idx in gad_categories.items():
                if word_int in gad_result[gad_category]:
                    gad_features[i, gad_category_idx] = 1

        # Save GAD features
        np.save(gad_features_filepath, gad_features)

    # (6) -- Vectorize the annulus persistence diagrams from the GAD results --
    gad_features_pd_vectorized_dir = join(task_raw_data_dir,
                                          "gad_features_pd_vectorized")
    gad_features_pd_vectorized_size = 5
    gad_features_pd_vectorized_size_flat = gad_features_pd_vectorized_size**2
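    # Each annulus persistence diagram is turned into a 5x5 persistence image
    # and flattened into a 25-dimensional feature vector.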
    makedirs(gad_features_pd_vectorized_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"
        gad_features_pd_vecs_filepath = join(gad_features_pd_vectorized_dir,
                                             f"{gad_id}.npy")
        if isfile(gad_features_pd_vecs_filepath):
            continue
        print(f"Vectorizing GAD features for {gad_id}...")

        # Load GAD result
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Use PersistenceImage to vectorize persistence diagrams
        gad_features_pd_vecs = np.zeros((len(data_words_to_full_vocab_ints),
                                         gad_features_pd_vectorized_size_flat))
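        # One row per data word; rows stay all-zero for words whose annulus
        # persistence diagram is empty.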
        for i, point_index in enumerate(tqdm(data_words_to_full_vocab_ints)):

            # Get the persistence diagram and build a square birth/persistence
            # range so that PersistenceImager produces a square image
            gad_features_pd = gad_result["annulus_pds"][point_index]
            if len(gad_features_pd) == 0:
                gad_features_pd_vecs[i] = np.zeros(
                    gad_features_pd_vectorized_size_flat, dtype=int)
                continue

            births, deaths = gad_features_pd.T
            persistence = deaths - births
            square_min = min(births.min(), persistence.min())
            square_max = max(births.max(), persistence.max())
            square_range = (square_min, square_max)
            pixel_size = (square_max -
                          square_min) / gad_features_pd_vectorized_size
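            # PersistenceImager (presumably from the persim package) works in
            # birth-persistence coordinates, hence the square range over births
            # and persistences above.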

            # Vectorize persistence diagram
            pimgr = PersistenceImager(birth_range=square_range,
                                      pers_range=square_range,
                                      pixel_size=pixel_size)
            pd_vec = pimgr.transform(gad_features_pd)
            gad_features_pd_vecs[i] = pd_vec.flatten()

        # Save persistence image vectors to file
        np.save(gad_features_pd_vecs_filepath, gad_features_pd_vecs)

    # (7) -- Combine features and labels into train/test data for the WME task --
    word_meaning_train_data_filepath = join(output_dir,
                                            "word_meaning_train_data.csv")
    word_meaning_test_data_filepath = join(output_dir,
                                           "word_meaning_test_data.csv")
    word_meaning_semeval_test_data_filepath = join(
        output_dir, "word_meaning_semeval_test_data.csv")
    if (not isfile(word_meaning_train_data_filepath)
            or not isfile(word_meaning_test_data_filepath)
            or not isfile(word_meaning_semeval_test_data_filepath)):
        # -- Load data for creating features --
        # Load estimated IDs from file
        words_estimated_ids = {
            f"{id_estimator_name}_{num_neighbours}": np.load(
                join(words_estimated_ids_dir,
                     f"{id_estimator_name}_{num_neighbours}.npy"))
            for num_neighbours in id_estimation_num_neighbours
            for id_estimator_name, _, _ in id_estimators
        }
        print("Loaded estimated IDs!")

        # Load GAD features
        gad_features_dict = {}
        for inner_param, outer_param in gad_params:
            gad_id = f"gad_knn_{inner_param}_{outer_param}"

            # Load GAD features
            gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
            gad_features_dict[gad_id] = np.load(gad_features_filepath)
        print("Loaded GAD features!")

        # Load TPS features
        tps_scores = {}
        tps_pds = {}
        for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
                tps_neighbourhood_sizes, tps_scores_filepaths,
                tps_pds_filepaths):
            tps_scores[tps_neighbourhood_size] = np.load(tps_scores_filepath)
            tps_pds[tps_neighbourhood_size] = np.load(tps_pds_filepath,
                                                      allow_pickle=True)
        print("Loaded TPS features!")

        data_words_train, data_words_test = train_test_split(
            data_words_no_semeval, test_size=0.05, random_state=rng_seed)
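        # 5% of the non-SemEval words are held out as an internal test set; the
        # SemEval target words below form a separate external test set.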
        if not isfile(word_meaning_train_data_filepath):
            print("Preparing data for training...")
            train_data_df = create_word_meaning_model_data_features(
                target_words=data_words_train,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            train_data_df.to_csv(word_meaning_train_data_filepath, index=False)
        else:
            train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        if not isfile(word_meaning_test_data_filepath):
            print("Preparing data for testing...")
            test_data_df = create_word_meaning_model_data_features(
                target_words=data_words_test,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            test_data_df.to_csv(word_meaning_test_data_filepath, index=False)
        else:
            test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        if not isfile(word_meaning_semeval_test_data_filepath):
            print("Preparing data for external testing (SemEval)...")
            semeval_test_data_df = create_word_meaning_model_data_features(
                target_words=semeval_target_words_in_vocab,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            semeval_test_data_df.to_csv(
                word_meaning_semeval_test_data_filepath, index=False)
        else:
            semeval_test_data_df = pd.read_csv(
                word_meaning_semeval_test_data_filepath)
    else:
        train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        semeval_test_data_df = pd.read_csv(
            word_meaning_semeval_test_data_filepath)
    print("Train", train_data_df)
    print("Test", test_data_df)
    print("SemEval test", semeval_test_data_df)