Example 1
def punctured_neighbourhood(
    target_word: str,
    word_to_int: dict,
    word_embeddings_norm: np.ndarray,
    neighbourhood_size: int,
    word_embeddings_pairwise_dists: np.ndarray,
    ann_instance: ApproxNN,
) -> np.ndarray:
    """
    Finds a punctured neighbourhood around a target word using
    cosine distances.

    Parameters
    ----------
    target_word : str
        Target word (w)
    word_to_int : dict of str and int
        Dictionary mapping from word to its integer representation.
    word_embeddings_norm : np.ndarray
        Normalized word embeddings
    neighbourhood_size : int
        Neighbourhood size (n)
    word_embeddings_pairwise_dists : np.ndarray
        Pairwise distances between word embeddings
    ann_instance : ApproxNN
        Approximate nearest neighbour (ANN) instance, built on the word embeddings.
        If specified, the ANN index is used to find punctured neighbourhoods.

    Returns
    -------
    neighbouring_word_embeddings : np.ndarray
        Neighbouring word embeddings of `target_word`, excluding
        the word itself
    """
    # Find neighbouring words (excluding the target word itself)
    target_word_int = word_to_int[target_word]
    if ann_instance is not None:
        neighbourhood_sorted_indices = ann_instance.search(
            query_vector=word_embeddings_norm[target_word_int],
            k_neighbours=neighbourhood_size,
            excluded_neighbour_indices=[target_word_int],
        )
    else:
        if word_embeddings_pairwise_dists is not None:
            neighbourhood_distances = word_embeddings_pairwise_dists[
                target_word_int]
        else:
            neighbourhood_distances = vector_to_matrix_distance(
                u=word_embeddings_norm[target_word_int],
                m=word_embeddings_norm,
                metric=fastdist.euclidean,
                metric_name="euclidean",
            )
        neighbourhood_sorted_indices = np.argsort(
            neighbourhood_distances)[1:neighbourhood_size + 1]
    neighbouring_word_embeddings = word_embeddings_norm[
        neighbourhood_sorted_indices]
    return neighbouring_word_embeddings
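A minimal usage sketch for punctured_neighbourhood on tiny synthetic data, assuming the function and its fastdist dependency are importable; the vocabulary, embeddings and neighbourhood size below are illustrative only.

import numpy as np

# Tiny illustrative vocabulary with random, row-normalized embeddings.
rng = np.random.default_rng(0)
words = ["bank", "river", "money", "water", "loan", "shore"]
word_to_int = {word: i for i, word in enumerate(words)}
embeddings = rng.normal(size=(len(words), 300))
embeddings_norm = embeddings / np.linalg.norm(embeddings, axis=1).reshape(-1, 1)

# With no ANN index and no precomputed distance matrix, distances are computed on the fly.
neighbour_vecs = punctured_neighbourhood(
    target_word="bank",
    word_to_int=word_to_int,
    word_embeddings_norm=embeddings_norm,
    neighbourhood_size=3,
    word_embeddings_pairwise_dists=None,
    ann_instance=None,
)
print(neighbour_vecs.shape)  # (3, 300): the 3 nearest neighbours, excluding "bank" itself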
Example 2
def get_knn_func_data_points(
    data_points: np.ndarray,
    pairwise_distances: np.ndarray = None,
    approx_nn: ApproxNN = None,
    metric: Callable = fastdist.euclidean,
    metric_name: str = "euclidean",
) -> KnnFunc:
    """
    Gets a K-nearest neighbour callable for data points, used in `compute_gad`.

    Parameters
    ----------
    data_points : np.ndarray
        Data points.
    pairwise_distances : np.ndarray, optional
        Pairwise distances of data points (defaults to None).
    approx_nn : ApproxNN, optional
        ApproxNN instance.
    metric : Callable, optional
        fastdist metric; only required if `pairwise_distances` and `approx_nn` are None
        (defaults to fastdist.euclidean).
    metric_name : str, optional
        String name of the `metric` callable (defaults to "euclidean").

    Returns
    -------
    knn_func : KnnFunc
        K-nearest neighbour callable for data points.
    """
    if approx_nn is not None:
        return lambda point_idx, k_neighbours: approx_nn.search(
            query_vector=data_points[point_idx],
            k_neighbours=k_neighbours,
            excluded_neighbour_indices=[point_idx],
            return_distances=True,
        )
    elif pairwise_distances is not None:
        return lambda point_idx, k_neighbours: get_nearest_neighbours(
            distances=pairwise_distances[point_idx],
            k_neighbours=k_neighbours,
        )
    else:
        return lambda point_idx, k_neighbours: get_nearest_neighbours(
            distances=fastdist.vector_to_matrix_distance(
                u=data_points[point_idx],
                m=data_points,
                metric=metric,
                metric_name=metric_name,
            ),
            k_neighbours=k_neighbours,
        )
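A brief sketch of how the returned callable is used: it takes a data point index and a neighbour count and, as in compute_gad, is assumed to return the neighbour indices together with their distances. The synthetic point cloud is illustrative.

import numpy as np
from fastdist import fastdist

# Synthetic point cloud (illustrative only).
rng = np.random.default_rng(0)
points = rng.normal(size=(100, 8))

# Fall back to on-the-fly euclidean distances (no pairwise matrix, no ApproxNN instance).
knn_func = get_knn_func_data_points(
    data_points=points,
    pairwise_distances=None,
    approx_nn=None,
    metric=fastdist.euclidean,
    metric_name="euclidean",
)

# Assumed return order: (neighbour indices, neighbour distances) of the 5 nearest points.
neighbour_indices, neighbour_dists = knn_func(0, 5)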
Example 3
def postprocess_word2vec_embeddings(
    model_training_output_dir: str,
    model_name: str,
    dataset_name: str,
    vocab_size: int,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Applies post-processing to trained word2vec word embeddings:
    - Saves normalized word embeddings
    - Creates approximate nearest-neighbour indices using Annoy and ScaNN

    Parameters
    ----------
    model_training_output_dir : str
        word2vec model training output directory.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    vocab_size : int
        Size of the vocabulary to use, -1 denotes all words.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.
    """
    # Load output from training word2vec
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_training_output_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )
    last_embedding_weights = w2v_training_output["last_embedding_weights"]

    use_full_vocab = False
    if vocab_size == -1:
        vocab_size = last_embedding_weights.shape[0]
        use_full_vocab = True

    # Define filepaths
    last_embedding_weights_filepath = w2v_training_output[
        "last_embedding_weights_filepath"]
    last_embedding_weights_filepath_no_ext = Path(
        last_embedding_weights_filepath).stem
    if use_full_vocab:
        last_embedding_weights_normalized_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_normalized.npy",
        )
    else:
        last_embedding_weights_normalized_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_{vocab_size}_normalized.npy",
        )
    if use_full_vocab:
        model_annoy_index_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_annoy_index.ann",
        )
        model_scann_artifacts_dir = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_scann_artifacts",
        )
    else:
        model_annoy_index_filepath = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_{vocab_size}_annoy_index.ann",
        )
        model_scann_artifacts_dir = join(
            model_training_output_dir,
            f"{last_embedding_weights_filepath_no_ext}_{vocab_size}_scann_artifacts",
        )

    # Normalize word embeddings and save to file
    if not isfile(last_embedding_weights_normalized_filepath):
        print("Normalizing word embeddings and saving to file...")

        # Normalize word embeddings
        if use_full_vocab:
            last_embedding_weights_in_vocab = last_embedding_weights
        else:
            last_embedding_weights_in_vocab = last_embedding_weights[:
                                                                     vocab_size]
        last_embedding_weights_normalized = (
            last_embedding_weights_in_vocab / np.linalg.norm(
                last_embedding_weights_in_vocab, axis=1).reshape(-1, 1))
        np.save(
            last_embedding_weights_normalized_filepath,
            last_embedding_weights_normalized,
        )
        print("Done!")
    else:
        last_embedding_weights_normalized = np.load(
            last_embedding_weights_normalized_filepath)

    annoy_index_created = isfile(model_annoy_index_filepath)
    scann_instance_created = isdir(model_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:

        # Add word embeddings to index and build it
        if use_full_vocab:
            last_embedding_weights_normalized_in_vocab = (
                last_embedding_weights_normalized)
        else:
            last_embedding_weights_normalized_in_vocab = (
                last_embedding_weights_normalized[:vocab_size])

        if not isfile(model_annoy_index_filepath):
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=last_embedding_weights_normalized_in_vocab,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(model_annoy_index_filepath)

        if not isdir(model_scann_artifacts_dir):
            scann_instance = ApproxNN(ann_alg="scann")
            scann_instance.build(
                data=last_embedding_weights_normalized_in_vocab,
                distance_measure="dot_product",
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            scann_instance.save(model_scann_artifacts_dir)
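A possible invocation of the post-processing step; the output directory, model and dataset names are hypothetical, and the Annoy/ScaNN build parameters are illustrative values.

postprocess_word2vec_embeddings(
    model_training_output_dir="output/word2vec_enwiki",  # hypothetical path
    model_name="word2vec",
    dataset_name="enwiki",
    vocab_size=-1,                 # -1 => use the full vocabulary
    annoy_index_n_trees=500,       # illustrative; more trees => higher precision
    scann_num_leaves_scaling=5,    # illustrative; higher scaling => higher precision
)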
Example 4
def load_model_training_output(
    model_training_output_dir: str,
    model_name: str,
    dataset_name: str,
    word_embeddings_mmap_mode: str = "r",
    return_normalized_embeddings: bool = False,
    return_annoy_instance: bool = False,
    annoy_instance_prefault: bool = False,
    return_scann_instance: bool = False,
    return_scann_instance_filepath: bool = False,
) -> dict:
    """
    Loads and returns a dict object containing output from word2vec training

    Parameters
    ----------
    model_training_output_dir : str
        word2vec model training output directory
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    word_embeddings_mmap_mode : str, optional
        Memmap mode to use when loading last word embedding weights (defaults to "r", or read).
    return_normalized_embeddings : bool, optional
        Whether or not to return last embedding weights, normalized, if they
        are present (defaults to False).
    return_annoy_instance : bool, optional
        Whether or not to return Annoy index fit on last embedding weights, if they
        are present (defaults to False).
    annoy_instance_prefault : bool, optional
        Whether or not to enable the `prefault` option when loading the
        Annoy index. `return_annoy_instance` must be set to True for this
        to have an effect (defaults to False).
    return_scann_instance : bool, optional
        Whether or not to return the ScaNN instance fit on the last embedding weights,
        if they are present (defaults to False).
    return_scann_instance_filepath : bool, optional
        Whether or not to return the filepath of the ScaNN instance fit on the last word
        embedding weights, if they are present (defaults to False).

    Returns
    -------
    model_training_output : dict
        Dictionary containing output from word2vec training
    """

    # Get filepaths of the model output
    checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
        output_dir=model_training_output_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )

    # Get last word embeddings from training
    last_embedding_weights_filepath = checkpoint_filepaths_dict[
        "intermediate_embedding_weight_filepaths"][-1]
    last_embedding_weights = np.load(last_embedding_weights_filepath,
                                     mmap_mode=word_embeddings_mmap_mode)

    # Get word counts from tokenizer of word2vec model
    with open(checkpoint_filepaths_dict["train_word_counts_filepath"],
              "r") as word_counts_file:
        word_counts = np.array([
            int(word_count)
            for word_count in word_counts_file.read().split("\n")
        ])

    # Get array of words and word_to_int lookup dictionary
    with open(checkpoint_filepaths_dict["train_words_filepath"],
              "r") as words_file:
        words = np.array(words_file.read().split("\n"))
    word_to_int = {word: i for i, word in enumerate(words)}

    # Normalized embedding weights
    last_embedding_weights_normalized = None
    if (return_normalized_embeddings
            and "intermediate_embedding_weight_normalized_filepaths"
            in checkpoint_filepaths_dict):
        last_embedding_weights_normalized = np.load(
            checkpoint_filepaths_dict[
                "intermediate_embedding_weight_normalized_filepaths"][-1],
            mmap_mode="r",
        )

    # Annoy index
    last_embedding_weights_annoy_instance = None
    if (return_annoy_instance
            and "intermediate_embedding_weight_annoy_index_filepaths"
            in checkpoint_filepaths_dict):
        last_embedding_weights_annoy_instance = ApproxNN(ann_alg="annoy")
        last_embedding_weights_annoy_instance.load(
            ann_path=checkpoint_filepaths_dict[
                "intermediate_embedding_weight_annoy_index_filepaths"][-1],
            annoy_data_dimensionality=last_embedding_weights.shape[1],
            annoy_mertic="euclidean",
            annoy_prefault=annoy_instance_prefault,
        )

    # ScaNN instance
    last_embedding_weights_scann_instance = None
    last_embedding_weights_scann_instance_filepath = None
    if "intermediate_embedding_weight_scann_artifact_dirs" in checkpoint_filepaths_dict:
        scann_instance_filepath = checkpoint_filepaths_dict[
            "intermediate_embedding_weight_scann_artifact_dirs"][-1]
        if return_scann_instance:
            last_embedding_weights_scann_instance = ApproxNN(ann_alg="scann")
            last_embedding_weights_scann_instance.load(
                ann_path=scann_instance_filepath)
        if return_scann_instance_filepath:
            last_embedding_weights_scann_instance_filepath = scann_instance_filepath

    return {
        "last_embedding_weights": last_embedding_weights,
        "last_embedding_weights_filepath": last_embedding_weights_filepath,
        "last_embedding_weights_normalized": last_embedding_weights_normalized,
        "last_embedding_weights_annoy_instance":
        last_embedding_weights_annoy_instance,
        "last_embedding_weights_scann_instance":
        last_embedding_weights_scann_instance,
        "last_embedding_weights_scann_instance_filepath":
        last_embedding_weights_scann_instance_filepath,
        "words": words,
        "word_to_int": word_to_int,
        "word_counts": word_counts,
    }
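A short sketch of loading the training output and unpacking the pieces typically needed downstream; the directory, model and dataset names are hypothetical, while the dictionary keys are those returned above.

w2v_out = load_model_training_output(
    model_training_output_dir="output/word2vec_enwiki",  # hypothetical path
    model_name="word2vec",
    dataset_name="enwiki",
    return_normalized_embeddings=True,
    return_annoy_instance=True,
)
last_embedding_weights = w2v_out["last_embedding_weights"]  # memory-mapped (vocab size, dim) array
embeddings_norm = w2v_out["last_embedding_weights_normalized"]  # None if no normalized weights on disk
words = w2v_out["words"]
word_to_int = w2v_out["word_to_int"]
annoy_instance = w2v_out["last_embedding_weights_annoy_instance"]  # ApproxNN instance or None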
def preprocess_google_news(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean.
       Distributed Representations of Words and Phrases and their Compositionality
       (https://arxiv.org/pdf/1310.4546.pdf). In Proceedings of NIPS, 2013.
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "GoogleNews")
    makedirs(output_dir, exist_ok=True)

    # Define filepaths
    google_news_vectors_zip_raw_download_url = "https://filesender.uninett.no/download.php?token=b0aea55e-72a7-4ac0-9409-8d5dbb322505&files_ids=645861"
    google_news_vectors_zip_raw_filename = "GoogleNews-vectors-negative300.bin.gz"
    google_news_vectors_zip_raw_filepath = join(
        raw_data_dir, google_news_vectors_zip_raw_filename
    )
    google_news_vectors_bin_raw_filepath = join(
        raw_data_dir, "GoogleNews-vectors-negative300.bin"
    )
    google_news_words_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_words.txt"
    )
    google_news_vectors_filepath = join(
        output_dir, "GoogleNews-vectors-negative300.npy"
    )
    google_news_normalized_vectors_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_normalized.npy"
    )
    google_news_vectors_annoy_index_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_annoy_index.ann"
    )
    google_news_vectors_scann_artifacts_dir = join(
        output_dir, "GoogleNews-vectors-negative300_scann_artifacts"
    )

    # -- GoogleNews-vectors-negative300.bin.gz --
    if not isfile(google_news_vectors_zip_raw_filepath):
        print(f"Downloading {google_news_vectors_zip_raw_filename}...")
        download_from_url(
            url=google_news_vectors_zip_raw_download_url,
            destination_filepath=google_news_vectors_zip_raw_filepath,
        )
        print("Done!")

    if not isfile(google_news_vectors_bin_raw_filepath):
        print(f"Extracting {google_news_vectors_zip_raw_filename}...")
        with gzip.GzipFile(google_news_vectors_zip_raw_filepath, "rb") as gzip_file_raw:
            with open(google_news_vectors_bin_raw_filepath, "wb") as gzip_file_output:
                gzip_file_output.write(gzip_file_raw.read())
        print("Done!")

    # Parse vectors from binary file and save result
    should_load_vectors = (
        not isfile(google_news_words_filepath)
        or not isfile(google_news_vectors_filepath)
        or not isfile(google_news_normalized_vectors_filepath)
    )
    if should_load_vectors:
        google_news_word_embeddings, google_news_words = load_word2vec_binary_format(
            word2vec_filepath=google_news_vectors_bin_raw_filepath,
            tqdm_enabled=True,
        )

    # Save words
    if not isfile(google_news_words_filepath):
        with open(google_news_words_filepath, "w") as file:
            for i, word in enumerate(google_news_words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(google_news_vectors_filepath):
        np.save(google_news_vectors_filepath, google_news_word_embeddings)

    # Save normalized word embeddings
    google_news_word_embeddings_normalized = None
    if not isfile(google_news_normalized_vectors_filepath):
        google_news_word_embeddings_normalized = (
            google_news_word_embeddings
            / np.linalg.norm(google_news_word_embeddings, axis=1).reshape(-1, 1)
        )
        np.save(
            google_news_normalized_vectors_filepath,
            google_news_word_embeddings_normalized,
        )

    annoy_index_created = isfile(google_news_vectors_annoy_index_filepath)
    scann_instance_created = isdir(google_news_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if google_news_word_embeddings_normalized is None:
            google_news_word_embeddings_normalized = np.load(
                google_news_normalized_vectors_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=google_news_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(google_news_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=google_news_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(google_news_vectors_scann_artifacts_dir)
def preprocess_fasttext_tps(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Alexander Jakubowski, Milica Gašić, & Marcus Zibrowius (2020).
       Topology of Word Embeddings: Singularities Reflect Polysemy.
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "fastTextTPS")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    env_config = dotenv_values(join("..", ".env"))
    tps_fasttext_model_filesender_token = env_config[
        "TPS_FASTTEXT_MODEL_FILESENDER_TOKEN"
    ]
    tps_fasttext_model_filesender_token_files_ids = env_config[
        "TPS_FASTTEXT_MODEL_FILESENDER_TOKEN_FILES_IDS"
    ]
    tps_fasttext_model_url = f"https://filesender.uninett.no/download.php?token={tps_fasttext_model_filesender_token}&files_ids={tps_fasttext_model_filesender_token_files_ids}"
    tps_fasttext_model_name = "fastText.TPS.300d"
    tps_fasttext_model_raw_filepath = join(
        raw_data_dir, f"{tps_fasttext_model_name}.bin"
    )
    tps_fasttext_model_words_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_words.txt"
    )
    tps_fasttext_model_vectors_filepath = join(
        output_dir, f"{tps_fasttext_model_name}.npy"
    )
    tps_fasttext_model_vectors_normalized_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_normalized.npy"
    )
    tps_fasttext_model_annoy_index_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_annoy_index.ann"
    )
    tps_fasttext_model_scann_artifacts_dir = join(
        output_dir, f"{tps_fasttext_model_name}_scann_artifacts"
    )

    if not isfile(tps_fasttext_model_raw_filepath):
        print(f"Downloading {tps_fasttext_model_name}...")
        download_from_url(
            url=tps_fasttext_model_url,
            destination_filepath=tps_fasttext_model_raw_filepath,
        )
        print("Done!")

    # Load output from trained fastText model
    fasttext_model = fasttext.load_model(tps_fasttext_model_raw_filepath)
    fasttext_model_words = fasttext_model.words
    fasttext_model_embedding_weights = np.zeros(
        (len(fasttext_model_words), fasttext_model.get_dimension())
    )
    for i, word in enumerate(fasttext_model.words):
        fasttext_model_embedding_weights[i] = fasttext_model.get_word_vector(word)

    # Save words
    if not isfile(tps_fasttext_model_words_filepath):
        with open(tps_fasttext_model_words_filepath, "w") as file:
            for i, word in enumerate(fasttext_model.words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(tps_fasttext_model_vectors_filepath):
        np.save(tps_fasttext_model_vectors_filepath, fasttext_model_embedding_weights)

    # Save normalized word embeddings
    fasttext_model_embedding_weights_normalized = None
    if not isfile(tps_fasttext_model_vectors_normalized_filepath):
        fasttext_model_embedding_weights_normalized = (
            fasttext_model_embedding_weights
            / np.linalg.norm(fasttext_model_embedding_weights, axis=1).reshape(-1, 1)
        )
        np.save(
            tps_fasttext_model_vectors_normalized_filepath,
            fasttext_model_embedding_weights_normalized,
        )

    annoy_index_created = isfile(tps_fasttext_model_annoy_index_filepath)
    scann_instance_created = isdir(tps_fasttext_model_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if fasttext_model_embedding_weights_normalized is None:
            fasttext_model_embedding_weights_normalized = np.load(
                tps_fasttext_model_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=fasttext_model_embedding_weights_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(tps_fasttext_model_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=fasttext_model_embedding_weights_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(tps_fasttext_model_scann_artifacts_dir)
def preprocess_fasttext(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Grave, E., Bojanowski, P., Gupta, P., Joulin, A., & Mikolov, T. (2018).
       Learning Word Vectors for 157 Languages. In Proceedings of the International
       Conference on Language Resources and Evaluation (LREC 2018).
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "fastText")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    fasttext_data_filename = "cc.en.300.vec"
    fasttext_vectors_url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{fasttext_data_filename}.gz"
    fasttext_word_vectors_raw_gzip_filepath = join(
        raw_data_dir, f"{fasttext_data_filename}.gz"
    )
    fasttext_word_vectors_raw_txt_filepath = join(raw_data_dir, fasttext_data_filename)
    fasttext_word_vectors_words_filepath = join(
        output_dir, f"{fasttext_data_filename}_words.txt"
    )
    fasttext_word_vectors_filepath = join(output_dir, f"{fasttext_data_filename}.npy")
    fasttext_word_vectors_normalized_filepath = join(
        output_dir, f"{fasttext_data_filename}_normalized.npy"
    )
    fasttext_word_vectors_annoy_index_filepath = join(
        output_dir, f"{fasttext_data_filename}_annoy_index.ann"
    )
    fasttext_word_vectors_scann_artifacts_dir = join(
        output_dir, f"{fasttext_data_filename}_scann_artifacts"
    )

    if not isfile(fasttext_word_vectors_raw_gzip_filepath):
        print(f"Downloading {fasttext_data_filename}...")
        download_from_url(
            url=fasttext_vectors_url,
            destination_filepath=fasttext_word_vectors_raw_gzip_filepath,
        )
        print("Done!")

    if not isfile(fasttext_word_vectors_raw_txt_filepath):
        print(f"Extracting {fasttext_data_filename}...")
        with gzip.GzipFile(
            fasttext_word_vectors_raw_gzip_filepath, "rb"
        ) as gzip_file_raw:
            with open(fasttext_word_vectors_raw_txt_filepath, "wb") as gzip_file_output:
                gzip_file_output.write(gzip_file_raw.read())
        print("Done!")

    # Parse vectors from text file and save result
    should_load_vectors = (
        not isfile(fasttext_word_vectors_words_filepath)
        or not isfile(fasttext_word_vectors_filepath)
        or not isfile(fasttext_word_vectors_normalized_filepath)
    )
    if should_load_vectors:
        fasttext_word_embeddings, fasttext_words = load_word_embeddings_text_format(
            word_embeddings_text_filepath=fasttext_word_vectors_raw_txt_filepath,
            first_line_header=True,
            tqdm_enabled=True,
        )

    # Save words
    if not isfile(fasttext_word_vectors_words_filepath):
        with open(fasttext_word_vectors_words_filepath, "w") as file:
            for i, word in enumerate(fasttext_words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(fasttext_word_vectors_filepath):
        np.save(fasttext_word_vectors_filepath, fasttext_word_embeddings)

    # Save normalized word embeddings
    fasttext_word_embeddings_normalized = None
    if not isfile(fasttext_word_vectors_normalized_filepath):
        fasttext_word_embeddings_normalized = fasttext_word_embeddings / np.linalg.norm(
            fasttext_word_embeddings, axis=1
        ).reshape(-1, 1)
        np.save(
            fasttext_word_vectors_normalized_filepath,
            fasttext_word_embeddings_normalized,
        )

    annoy_index_created = isfile(fasttext_word_vectors_annoy_index_filepath)
    scann_instance_created = isdir(fasttext_word_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if fasttext_word_embeddings_normalized is None:
            fasttext_word_embeddings_normalized = np.load(
                fasttext_word_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=fasttext_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(fasttext_word_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=fasttext_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(fasttext_word_vectors_scann_artifacts_dir)
def preprocess_glove(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Jeffrey Pennington, Richard Socher, & Christopher D. Manning (2014).
       GloVe: Global Vectors for Word Representation. In Empirical Methods in Natural
       Language Processing (EMNLP) (pp. 1532–1543).
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "GloVe")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    glove_data_filename = "glove.840B.300d"
    glove_word_vectors_url = f"http://nlp.stanford.edu/data/{glove_data_filename}.zip"
    glove_word_vectors_raw_zip_filepath = join(
        raw_data_dir, f"{glove_data_filename}.zip"
    )
    glove_word_vectors_raw_txt_filename = f"{glove_data_filename}.txt"
    glove_word_vectors_raw_txt_filepath = join(
        raw_data_dir, glove_word_vectors_raw_txt_filename
    )
    glove_word_vectors_words_filepath = join(
        output_dir, f"{glove_data_filename}_words.txt"
    )
    glove_word_vectors_filepath = join(output_dir, f"{glove_data_filename}.npy")
    glove_word_vectors_normalized_filepath = join(
        output_dir, f"{glove_data_filename}_normalized.npy"
    )
    glove_word_vectors_annoy_index_filepath = join(
        output_dir, f"{glove_data_filename}_annoy_index.ann"
    )
    glove_word_vectors_scann_artifacts_dir = join(
        output_dir, f"{glove_data_filename}_scann_artifacts"
    )

    if not isfile(glove_word_vectors_raw_zip_filepath):
        print(f"Downloading {glove_data_filename}...")
        download_from_url(
            url=glove_word_vectors_url,
            destination_filepath=glove_word_vectors_raw_zip_filepath,
        )
        print("Done!")

    if not isfile(glove_word_vectors_raw_txt_filepath):
        print(f"Extracting {glove_data_filename}...")
        with zipfile.ZipFile(glove_word_vectors_raw_zip_filepath, "r") as zip_ref:
            zip_ref.extractall(raw_data_dir)
        print("Done!")

    # Parse vectors from text file and save result
    should_load_vectors = (
        not isfile(glove_word_vectors_words_filepath)
        or not isfile(glove_word_vectors_filepath)
        or not isfile(glove_word_vectors_normalized_filepath)
    )
    if should_load_vectors:
        glove_word_embeddings, glove_words = load_word_embeddings_text_format(
            word_embeddings_text_filepath=glove_word_vectors_raw_txt_filepath,
            first_line_header=False,
            tqdm_enabled=True,
        )

    # Save words
    if not isfile(glove_word_vectors_words_filepath):
        with open(glove_word_vectors_words_filepath, "w") as file:
            for i, word in enumerate(glove_words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(glove_word_vectors_filepath):
        np.save(glove_word_vectors_filepath, glove_word_embeddings)

    # Save normalized word embeddings
    glove_word_embeddings_normalized = None
    if not isfile(glove_word_vectors_normalized_filepath):
        glove_word_embeddings_normalized = glove_word_embeddings / np.linalg.norm(
            glove_word_embeddings, axis=1
        ).reshape(-1, 1)
        np.save(
            glove_word_vectors_normalized_filepath,
            glove_word_embeddings_normalized,
        )

    annoy_index_created = isfile(glove_word_vectors_annoy_index_filepath)
    scann_instance_created = isdir(glove_word_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if glove_word_embeddings_normalized is None:
            glove_word_embeddings_normalized = np.load(
                glove_word_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=glove_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(glove_word_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=glove_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(glove_word_vectors_scann_artifacts_dir)
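The four preprocess_* functions above share the same pattern: download and extract the raw vectors, save the vocabulary, the raw embeddings and their row-wise L2 normalization, then build Annoy and ScaNN indices on the normalized embeddings. A minimal sketch of that shared normalize-and-index step follows; the ApproxNN build arguments mirror the calls above, the synthetic data and parameter values are illustrative, and the output paths are hypothetical.

import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(1000, 300))  # synthetic (vocabulary size, dim) embeddings

# Row-wise L2 normalization, as done before saving the *_normalized.npy files above.
embeddings_norm = embeddings / np.linalg.norm(embeddings, axis=1).reshape(-1, 1)

# Annoy index built with euclidean distance on the normalized vectors.
ann_index_annoy = ApproxNN(ann_alg="annoy")
ann_index_annoy.build(
    data=embeddings_norm,
    annoy_n_trees=10,  # illustrative value
    distance_measure="euclidean",
)
ann_index_annoy.save("embeddings_annoy_index.ann")  # hypothetical output path

# ScaNN instance built with the default distance measure, as in the preprocess_* functions.
ann_index_scann = ApproxNN(ann_alg="scann")
ann_index_scann.build(
    data=embeddings_norm,
    scann_num_leaves_scaling=1,  # illustrative value
)
ann_index_scann.save("embeddings_scann_artifacts")  # hypothetical output directory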
def evaluate_word2vec(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    sswr_dataset_filepath: str,
    msr_dataset_filepath: str,
    pad_dataset_filepath: str,
    vocab_size: int,
    approx_nn_path: str,
    approx_nn_alg: str,
    top_n_prediction: int,
    output_dir: str,
) -> None:
    """
    Evaluates a word2vec model on the SSWR, MSR and PAD test analogy datasets.

    Parameters
    ----------
    model_dir : str
        Directory of the model to evaluate.
    model_name : str
        Name of the trained model.
    dataset_name : str
        Name of the dataset the model is trained on.
    sswr_dataset_filepath : str
        Filepath of the SSWR test dataset.
    msr_dataset_filepath : str
        Filepath of the MSR test dataset.
    pad_dataset_filepath : str
        Filepath of the PAD test dataset
    vocab_size : int
        Vocabulary size to use when evaluating on the test datasets.
    approx_nn_path : str
        Filepath of an ApproxNN instance, built on the word embeddings.
    approx_nn_alg : str
        Algorithm of ApproxNN instance.
    top_n_prediction : int
        N to use for top-N prediction when evaluating analogies (e.g. 1 for top-1 accuracy).
    output_dir : str
        Output directory to save evaluation results.
    """
    # Load output from training word2vec
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
    )
    last_embedding_weights = w2v_training_output["last_embedding_weights"]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]

    # Append date/time to output directory.
    output_dir = join(output_dir, datetime.now().strftime("%d-%b-%Y_%H-%M-%S"))
    makedirs(output_dir, exist_ok=True)

    # Load ApproxNN instance
    approx_nn = None
    if approx_nn_path != "":
        approx_nn = ApproxNN(ann_alg=approx_nn_alg)
        load_args = {}
        if approx_nn_alg == "annoy":
            load_args[
                "annoy_data_dimensionality"] = last_embedding_weights.shape[1]
            load_args["annoy_mertic"] = "euclidean"
            load_args["annoy_prefault"] = True
        approx_nn.load(approx_nn_path, **load_args)

    # SSWR
    print("--- Evaluating SSWR ---")
    sswr_accuracies = evaluate_model_word_analogies(
        analogies_filepath=sswr_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )

    # Compute average semantic and syntactic accuracies
    sswr_categories = list(sswr_accuracies.keys())
    sswr_semantic_categories = sswr_categories[:5]
    sswr_syntactic_categories = sswr_categories[5:-1]
    sswr_semantic_avg_acc = np.mean(
        [sswr_accuracies[cat] for cat in sswr_semantic_categories])
    sswr_syntactic_avg_acc = np.mean(
        [sswr_accuracies[cat] for cat in sswr_syntactic_categories])
    sswr_accuracies["semantic_avg"] = sswr_semantic_avg_acc
    sswr_accuracies["syntactic_avg"] = sswr_syntactic_avg_acc
    save_analogies_accuracies_to_file("sswr", output_dir, sswr_accuracies)
    print(sswr_accuracies)

    # MSR
    print("--- Evaluating MSR ---")
    msr_accuracies = evaluate_model_word_analogies(
        analogies_filepath=msr_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )
    save_analogies_accuracies_to_file("msr", output_dir, msr_accuracies)
    print(msr_accuracies)

    # PAD
    print("--- Evaluating PAD ---")
    pad_accuracies = evaluate_model_word_analogies(
        analogies_filepath=pad_dataset_filepath,
        word_embeddings=last_embedding_weights,
        word_to_int=word_to_int,
        words=words,
        vocab_size=vocab_size,
        ann_instance=approx_nn,
        top_n=top_n_prediction,
    )
    save_analogies_accuracies_to_file("pad", output_dir, pad_accuracies)
    print(pad_accuracies)
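A possible invocation of the evaluation, with hypothetical file paths; leaving approx_nn_path as an empty string keeps approx_nn set to None, so the analogy evaluation runs without an ANN index.

evaluate_word2vec(
    model_dir="output/word2vec_enwiki",            # hypothetical paths throughout
    model_name="word2vec",
    dataset_name="enwiki",
    sswr_dataset_filepath="data/sswr_analogies.txt",
    msr_dataset_filepath="data/msr_analogies.txt",
    pad_dataset_filepath="data/pad_analogies.txt",
    vocab_size=30000,        # illustrative
    approx_nn_path="",       # empty string => no ApproxNN instance is loaded
    approx_nn_alg="scann",
    top_n_prediction=1,
    output_dir="output/analogy_eval",
)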
def topological_polysemy_pipeline(
    semeval_word_senses_filepath: str,
    word2vec_semeval_model_dir: str,
    word2vec_enwiki_model_dir: str,
    word2vec_google_news_model_dir: str,
    glove_model_dir: str,
    fasttext_model_dir: str,
    fasttext_tps_model_dir: str,
    tps_neighbourhood_sizes: list,
    num_top_k_words_frequencies: int,
    cyclo_octane_data_filepath: str,
    henneberg_data_filepath: str,
    custom_point_cloud_neighbourhood_size: int,
    output_dir: str,
) -> None:
    """
    Computes the topological polysemy of various word embeddings and data sets.
    Saves results to the output directory, along with some additional plots.

    Parameters
    ----------
    semeval_word_senses_filepath : str
        Filepath of the SemEval-2010 task 14 word senses
    word2vec_semeval_model_dir : str
        Directory of the SemEval-2010 task 14 word2vec model.
    word2vec_enwiki_model_dir : str
        Directory of the enwiki word2vec model.
    word2vec_google_news_model_dir : str
        Directory of the Google News 3M word2vec model
    glove_model_dir : str
        Directory of the GloVe model.
    fasttext_model_dir : str
        Directory of the fastText model.
    fasttext_tps_model_dir : str
        Directory of the TPS fastText model.
    tps_neighbourhood_sizes : list
        Neighbourhood sizes to use when computing TPS (e.g. 50, 60).
    num_top_k_words_frequencies : int
        Number of top words to use when computing TPS scores vs. word frequencies.
    cyclo_octane_data_filepath : str
        Filepath of the cyclo-octane dataset.
    henneberg_data_filepath : str
        Filepath of the Henneberg dataset.
    custom_point_cloud_neighbourhood_size : int
        Neighbourhood size to use when computing TPS for custom point clouds.
    output_dir : str
        Output directory to save results.
    """
    # Ensure output directory exists
    makedirs(output_dir, exist_ok=True)

    # Load SemEval-2010 task 14 word senses
    semeval_word_senses: dict = joblib.load(semeval_word_senses_filepath)
    semeval_target_words = np.array(list(semeval_word_senses["all"].keys()))
    semeval_target_word_gs_clusters = np.array(
        list(semeval_word_senses["all"].values())
    )

    # Parse strings into int
    tps_neighbourhood_sizes = [int(n_size) for n_size in tps_neighbourhood_sizes]

    # -- Compute TPS for word embeddings (SemEval and enwiki) --
    for dataset_name, model_dir in zip(
        ["semeval_2010_task_14", "enwiki"],
        [word2vec_semeval_model_dir, word2vec_enwiki_model_dir],
    ):
        # Load word embeddings
        print(f"Loading {dataset_name} word embeddings...")
        w2v_training_output = load_model_training_output(
            model_training_output_dir=model_dir,
            model_name="word2vec",
            dataset_name=dataset_name,
            return_normalized_embeddings=True,
            return_scann_instance=True,
        )
        last_embedding_weights_normalized = w2v_training_output[
            "last_embedding_weights_normalized"
        ]
        last_embedding_weights_scann_instance = w2v_training_output[
            "last_embedding_weights_scann_instance"
        ]
        words = w2v_training_output["words"]
        word_to_int = w2v_training_output["word_to_int"]
        word_counts = w2v_training_output["word_counts"]
        print("Done!")

        print("Computing TPS for word embeddings...")
        tps_word_embeddings(
            word_embeddings_name=dataset_name,
            neighbourhood_sizes=tps_neighbourhood_sizes,
            semeval_target_words=semeval_target_words,
            semeval_target_words_gs_clusters=semeval_target_word_gs_clusters,
            word_embeddings_normalized=last_embedding_weights_normalized,
            word_to_int=word_to_int,
            word_vocabulary=words,
            num_top_k_words_frequencies=num_top_k_words_frequencies,
            output_dir=output_dir,
            word_counts=word_counts,
            ann_instance=last_embedding_weights_scann_instance,
        )
        del last_embedding_weights_scann_instance
        print("Done!")

    # -- Compute TPS for external word embeddings --
    # Prepare constants
    external_word_embeddings = [
        (
            "google_news_3m",
            "GoogleNews-vectors-negative300",
            word2vec_google_news_model_dir,
        ),
        (
            "glove_cc_840b_300d",
            "glove.840B.300d",
            glove_model_dir,
        ),
        (
            "fasttext_cc_300d",
            "cc.en.300.vec",
            fasttext_model_dir,
        ),
        (
            "fasttext_tps_300d",
            "fastText.TPS.300d",
            fasttext_tps_model_dir,
        ),
    ]

    # Compute TPS for each external word embeddings
    for word_embeddings_name, model_name, model_dir in external_word_embeddings:

        # Prepare filepaths
        model_normalized_weights_filepath = join(
            model_dir, f"{model_name}_normalized.npy"
        )
        model_words_filepath = join(model_dir, f"{model_name}_words.txt")
        model_scann_artifacts_dir = join(model_dir, f"{model_name}_scann_artifacts")

        # Load data
        print(f"Loading {model_name} data...")
        model_weights_normalized = np.load(
            model_normalized_weights_filepath, mmap_mode="r"
        )
        with open(model_words_filepath, "r") as words_file:
            model_words = np.array(words_file.read().split("\n"))
        model_approx_nn = ApproxNN(ann_alg="scann")
        model_approx_nn.load(ann_path=model_scann_artifacts_dir)
        print("Done!")

        print(f"Computing TPS for {model_name} word embeddings...")
        tps_word_embeddings(
            word_embeddings_name=word_embeddings_name,
            neighbourhood_sizes=tps_neighbourhood_sizes,
            semeval_target_words=semeval_target_words,
            semeval_target_words_gs_clusters=semeval_target_word_gs_clusters,
            word_embeddings_normalized=model_weights_normalized,
            word_to_int={word: i for i, word in enumerate(model_words)},
            word_vocabulary=model_words,
            num_top_k_words_frequencies=num_top_k_words_frequencies,
            output_dir=output_dir,
            ann_instance=model_approx_nn,
        )
        del model_approx_nn
        print("Done!")

    # -- Compute TPS for custom point clouds --
    for point_cloud_name, point_cloud_filepath in zip(
        ["cyclo_octane", "henneberg"],
        [cyclo_octane_data_filepath, henneberg_data_filepath],
    ):
        # Load and prepare data for TPS
        point_cloud = pd.read_csv(point_cloud_filepath, header=None).values
        point_cloud_normalized = point_cloud / np.linalg.norm(
            point_cloud, axis=1
        ).reshape(-1, 1)
        point_cloud_pairwise_dists = euclidean_distances(point_cloud)

        # Compute TPS scores
        num_points = len(point_cloud)
        tps_scores = np.zeros(num_points)
        print(f"Computing TPS scores for {point_cloud_name}...")
        for point_index in tqdm(range(num_points)):
            tps_score = tps_point_cloud(
                point_index=point_index,
                neighbourhood_size=custom_point_cloud_neighbourhood_size,
                point_cloud_normalized=point_cloud_normalized,
                point_cloud_pairwise_dists=point_cloud_pairwise_dists,
            )
            tps_scores[point_index] = tps_score

        # Save result
        point_cloud_output_dir = join(output_dir, point_cloud_name)
        makedirs(point_cloud_output_dir, exist_ok=True)
        np.save(
            join(
                point_cloud_output_dir,
                f"tps_scores_{custom_point_cloud_neighbourhood_size}.npy",
            ),
            tps_scores,
        )
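A possible invocation of the full pipeline, with hypothetical paths; the neighbourhood sizes are passed as a list of strings and parsed to ints inside the function, and all values below are illustrative.

topological_polysemy_pipeline(
    semeval_word_senses_filepath="data/semeval_2010_14_word_senses.joblib",  # hypothetical paths throughout
    word2vec_semeval_model_dir="output/word2vec_semeval",
    word2vec_enwiki_model_dir="output/word2vec_enwiki",
    word2vec_google_news_model_dir="data/GoogleNews",
    glove_model_dir="data/GloVe",
    fasttext_model_dir="data/fastText",
    fasttext_tps_model_dir="data/fastTextTPS",
    tps_neighbourhood_sizes=["50", "60"],
    num_top_k_words_frequencies=10000,
    cyclo_octane_data_filepath="data/cyclo_octane.csv",
    henneberg_data_filepath="data/henneberg.csv",
    custom_point_cloud_neighbourhood_size=50,
    output_dir="output/tps",
)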
Example 11
def similar_words(
    weights: np.ndarray,
    word_to_int: Dict[str, int],
    words: np.ndarray,
    ann_instance: ApproxNN = None,
    top_n: int = 10,
    positive_words: Optional[List[str]] = None,
    negative_words: Optional[List[str]] = None,
    vocab_size: int = -1,
    return_similarity_score: bool = True,
) -> List[Union[Tuple, str]]:
    """
    Finds the most similar words of a linear combination of positively and negatively
    contributing words.

    Parameters
    ----------
    weights : np.ndarray
        Numpy matrix (vocabulary size, embedding dim) containing word vectors.
    word_to_int : dict of str and int
        Dictionary mapping from word to its integer representation.
    words : np.ndarray
        Numpy array containing words from the vocabulary.
    ann_instance : ApproxNN, optional
        ApproxNN instance, built on word embeddings (defaults to None).
    top_n : int, optional
        Number of similar words (defaults to 10).
    positive_words : list of str, optional
        List of words contributing positively (defaults to empty list).
    negative_words : list of str, optional
        List of words contributing negatively (defaults to empty list).
    vocab_size : int, optional
        Vocabulary size to use, i.e., only the most common `vocab_size` words are
        taken into account (defaults to -1, meaning all words).
    return_similarity_score : bool, optional
        Whether or not to return the cosine similarity score (`ann_instance`
        must be set to None to have an effect).

    Returns
    -------
    If return_similarity_score is True, then
        pairs : list of tuples of str and float
            List of `top_n` similar words and their cosine similarities.
    else:
        closest_words : list of str
            List of `top_n` similar words.
    """
    # Default values
    if positive_words is None:
        positive_words = []
    if negative_words is None:
        negative_words = []

    # Restrict vocabulary
    if vocab_size > 0:
        weights = weights[:vocab_size]
        words = words[:vocab_size]

    # Create query word vector
    query_word_vec = np.zeros((weights.shape[1], ), dtype=np.float64)
    query_word_vec += np.array([
        get_word_vec(pos_word, word_to_int, weights)
        for pos_word in positive_words
    ]).sum(axis=0)
    query_word_vec -= np.array([
        get_word_vec(neg_word, word_to_int, weights)
        for neg_word in negative_words
    ]).sum(axis=0)

    # Create indices list of query words to exclude from search
    exclude_words_indices = [
        word_to_int[word] for word in positive_words + negative_words
    ]

    # Find closest words
    if ann_instance is None:

        # Use cosine similarity to find similar words
        cos_sims = fastdist.cosine_vector_to_matrix(query_word_vec, weights)
        sorted_indices = cos_sims.argsort()[::-1]
        sorted_indices = [
            idx for idx in sorted_indices if idx not in exclude_words_indices
        ]
    else:
        query_word_vec_norm = query_word_vec / np.linalg.norm(query_word_vec)
        sorted_indices = ann_instance.search(
            query_vector=query_word_vec_norm,
            k_neighbours=top_n,
            excluded_neighbour_indices=exclude_words_indices,
        )

    # Filter top words/similarities
    top_words = words[sorted_indices][:top_n]

    # Create word similarity pairs
    if return_similarity_score and ann_instance is None:
        top_sims = cos_sims[sorted_indices][:top_n]
        result = list(zip(top_words, top_sims))
    else:
        result = top_words

    return result
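A usage sketch of similar_words with the classic analogy query; weights, words and word_to_int are assumed to come from load_model_training_output above, and without an ApproxNN instance the function computes exact cosine similarities and returns (word, similarity) pairs.

# weights, words and word_to_int as returned by load_model_training_output.
analogy_result = similar_words(
    weights=weights,
    word_to_int=word_to_int,
    words=words,
    positive_words=["king", "woman"],
    negative_words=["man"],
    top_n=5,
    vocab_size=30000,  # restrict to the 30000 most common words (illustrative)
)
for word, cosine_similarity in analogy_result:
    print(word, cosine_similarity)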
Example 12
def prepare_num_word_meanings_supervised_data(
    model_dir: str,
    model_name: str,
    dataset_name: str,
    id_estimation_num_neighbours: list,
    semeval_2010_14_word_senses_filepath: str,
    tps_neighbourhood_sizes: list,
    raw_data_dir: str,
    output_dir: str,
) -> None:
    """
    Prepares data for the supervised word meanings prediction task.

    Parameters
    ----------
    model_dir : str
        Directory of the model to load.
    model_name : str
        Name of the trained word2vec model.
    dataset_name : str
        Name of the dataset the model is trained on.
    id_estimation_num_neighbours : list
        Number of neighbours to use when estimating intrinsic dimension for each word
    semeval_2010_14_word_senses_filepath : str
        Filepath of SemEval-2010 task 14 word senses joblib dict.
    tps_neighbourhood_sizes : list
        List of TPS neighbourhood sizes.
    raw_data_dir : str
        Directory where raw data will be saved to.
    output_dir: str
        Output directory.
    """
    # Convert list arguments to int
    tps_neighbourhood_sizes = [
        int(n_size) for n_size in tps_neighbourhood_sizes
    ]
    id_estimation_num_neighbours = [
        int(num_neighbours) for num_neighbours in id_estimation_num_neighbours
    ]

    # Prepare directory constants and create raw data dir for caching data files
    task_id = f"wme_{model_name}_{dataset_name}"  # wme = word meaning estimation
    task_raw_data_dir = join(raw_data_dir, task_id)
    task_raw_data_tps_dir = join(task_raw_data_dir, "tps")
    makedirs(task_raw_data_dir, exist_ok=True)

    # Load word embeddings from model
    print("Loading word embeddings...")
    w2v_training_output = load_model_training_output(
        model_training_output_dir=model_dir,
        model_name=model_name,
        dataset_name=dataset_name,
        return_normalized_embeddings=True,
        return_scann_instance_filepath=True,
    )
    last_embedding_weights_normalized = w2v_training_output[
        "last_embedding_weights_normalized"]
    last_embedding_weights_scann_instance_filepath = w2v_training_output[
        "last_embedding_weights_scann_instance_filepath"]
    words = w2v_training_output["words"]
    word_to_int = w2v_training_output["word_to_int"]
    print("Done!")

    # Prepare SemEval-2010 task 14 data
    semeval_2010_14_word_senses = joblib.load(
        semeval_2010_14_word_senses_filepath)
    semeval_target_words = np.array(
        list(semeval_2010_14_word_senses["all"].keys()))
    semeval_target_words_in_vocab_filter = [
        i for i, word in enumerate(semeval_target_words) if word in word_to_int
    ]
    semeval_target_words_in_vocab = semeval_target_words[
        semeval_target_words_in_vocab_filter]
    semeval_gs_clusters = np.array(
        list(semeval_2010_14_word_senses["all"].values()))
    semeval_gs_clusters_in_vocab = semeval_gs_clusters[
        semeval_target_words_in_vocab_filter]
    semeval_2010_14_word_senses_in_vocab = {
        word: gs_meanings
        for word, gs_meanings in zip(semeval_target_words_in_vocab,
                                     semeval_gs_clusters_in_vocab)
    }

    # (1) -- Find words in Wordnet that are in the word2vec model's vocabulary --
    words_to_num_meanings_filepath = join(task_raw_data_dir,
                                          "words_to_num_meanings.joblib")
    if not isfile(words_to_num_meanings_filepath):
        words_to_num_meanings = semeval_2010_14_word_senses_in_vocab.copy()
        print("Finding words in vocabulary with #Wordnet synsets > 0")
        for word in tqdm(words):
            if word in semeval_target_words_in_vocab:
                continue
            num_synsets = len(wn.synsets(word))
            if num_synsets > 0:
                words_to_num_meanings[word] = num_synsets
        joblib.dump(words_to_num_meanings, words_to_num_meanings_filepath)
    else:
        words_to_num_meanings = joblib.load(words_to_num_meanings_filepath)
        print("Loaded words_to_num_meanings!")
    data_words = np.array(list(words_to_num_meanings.keys()))
    data_words_no_semeval = [
        word for word in data_words
        if word not in semeval_target_words_in_vocab
    ]
    data_word_to_int = {word: i for i, word in enumerate(data_words)}

    # Filter out word embeddings using Wordnet words (data_words)
    data_words_to_full_vocab_ints = np.array(
        [word_to_int[word] for word in data_words])

    # (2) -- Compute TPS_n for train/test words --
    makedirs(task_raw_data_tps_dir, exist_ok=True)
    tps_scores_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_scores.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    tps_pds_filepaths = [
        join(task_raw_data_tps_dir, f"tps_{tps_neighbourhood_size}_pds.npy")
        for tps_neighbourhood_size in tps_neighbourhood_sizes
    ]
    for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
            tps_neighbourhood_sizes, tps_scores_filepaths, tps_pds_filepaths):
        if isfile(tps_scores_filepath) and isfile(tps_pds_filepath):
            continue
        print(
            f"Computing TPS scores using neighbourhood size {tps_neighbourhood_size}..."
        )

        # Load ScaNN instance
        scann_instance = ApproxNN(ann_alg="scann")
        scann_instance.load(
            ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute TPS
        tps_scores_ns, tps_pds_ns = tps_multiple(
            target_words=data_words,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            ann_instance=scann_instance,
            return_persistence_diagram=True,
            n_jobs=-1,
            progressbar_enabled=True,
        )

        # Save result
        print("Saving TPS result...")
        np.save(tps_scores_filepath, tps_scores_ns)
        np.save(tps_pds_filepath, tps_pds_ns)
        print("Done!")

        # Free resources
        del scann_instance

    # (3) -- Compute GAD --
    gad_dir = join(task_raw_data_dir, "gad")
    makedirs(gad_dir, exist_ok=True)
    gad_params = [
        (25, 250),
        (25, 500),
        (25, 750),
        (25, 1000),
        # ----------
        (50, 250),
        (50, 500),
        (50, 750),
        (50, 1000),
        # ----------
        (100, 1000),
        (100, 1250),
        (100, 1500),
        (100, 1750),
        (100, 2000),
        # ----------
        (150, 1000),
        (150, 1250),
        (150, 1500),
        (150, 1750),
        (150, 2000),
        # ----------
        (200, 1000),
        (200, 1250),
        (200, 1500),
        (200, 1750),
        (200, 2000),
    ]
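    # Each (inner, outer) pair above sets the inner and outer k-NN annulus sizes
    # (numbers of neighbours) passed to compute_gad below.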
    gad_categories = {"P_man": 0, "P_int": 1, "P_bnd": 2}
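    # compute_gad assigns each point to one of the three categories above; in the
    # geometric anomaly detection (GAD) literature these typically correspond to
    # manifold (P_man), intersection/singular (P_int) and boundary (P_bnd) points.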
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"

        gad_filepath = join(gad_dir, f"{gad_id}.joblib")
        if isfile(gad_filepath):
            continue
        print(f"-- {gad_id} -- ")

        # Load ScaNN instance
        approx_nn = ApproxNN(ann_alg="scann")
        approx_nn.load(ann_path=last_embedding_weights_scann_instance_filepath)

        # Compute features
        gad_result = compute_gad(
            data_points=last_embedding_weights_normalized,
            data_point_ints=data_words_to_full_vocab_ints,
            manifold_dimension=2,
            data_points_approx_nn=approx_nn,
            use_knn_annulus=True,
            knn_annulus_inner=inner_param,
            knn_annulus_outer=outer_param,
            return_annlus_persistence_diagrams=True,
            progressbar_enabled=True,
            n_jobs=-1,
        )
        print(
            "P_man:",
            len(gad_result["P_man"]),
            "P_int:",
            len(gad_result["P_int"]),
            "P_bnd:",
            len(gad_result["P_bnd"]),
        )
        joblib.dump(gad_result, gad_filepath, protocol=4)

        # Free resources
        del approx_nn

    # (4) -- Estimate the intrinsic dimension (ID) for each word vector --
    words_estimated_ids_dir = join(task_raw_data_dir, "estimated_ids")
    id_estimators: List[Tuple[str, GlobalEstimator, dict]] = [
        ("lpca", est_ids.lPCA, {}),
        ("knn", est_ids.KNN, {}),
        ("twonn", est_ids.TwoNN, {}),
        ("mle", est_ids.MLE, {}),
        ("tle", est_ids.TLE, {}),
    ]
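    # Pointwise intrinsic-dimension estimators (local PCA, kNN, TwoNN, MLE and
    # TLE); each is run below for every neighbourhood size in
    # id_estimation_num_neighbours and the results are cached to disk.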
    makedirs(words_estimated_ids_dir, exist_ok=True)
    for id_estimator_name, id_estimator_cls, id_estimator_params in id_estimators:
        for num_neighbours in id_estimation_num_neighbours:
            estimated_ids_filepath = join(
                words_estimated_ids_dir,
                f"{id_estimator_name}_{num_neighbours}.npy")
            if isfile(estimated_ids_filepath):
                continue

            print(
                f"Estimating IDs using {id_estimator_cls.__name__} with {num_neighbours} neighbours..."
            )
            id_estimator = id_estimator_cls(**id_estimator_params)
            estimated_ids = id_estimator.fit_predict_pw(
                X=last_embedding_weights_normalized[
                    data_words_to_full_vocab_ints],
                n_neighbors=num_neighbours,
                n_jobs=-1,
            )

            print("Done! Saving to file...")
            np.save(estimated_ids_filepath, estimated_ids)

    # (5) -- Create binary features from the GAD results to speed up the later combination of data --
    gad_features_dir = join(task_raw_data_dir, "gad_features")
    makedirs(gad_features_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"

        gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
        if isfile(gad_features_filepath):
            continue
        print(f"Creating GAD features for {gad_id}...")

        # Load GAD result
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Features from GAD (P_man, P_int, P_bnd)
        gad_features = np.zeros((len(data_words_to_full_vocab_ints), 3),
                                dtype=int)
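        # Binary indicator per word and category: 1 if the word's vocabulary
        # index appears in the corresponding GAD point set.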
        for i, word_int in enumerate(tqdm(data_words_to_full_vocab_ints)):
            for gad_category, gad_category_idx in gad_categories.items():
                if word_int in gad_result[gad_category]:
                    gad_features[i, gad_category_idx] = 1

        # Save GAD features
        np.save(gad_features_filepath, gad_features)

    # (6) -- Vectorize the annulus persistence diagrams from the GAD results --
    gad_features_pd_vectorized_dir = join(task_raw_data_dir,
                                          "gad_features_pd_vectorized")
    gad_features_pd_vectorized_size = 5
    gad_features_pd_vectorized_size_flat = gad_features_pd_vectorized_size**2
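    # Each annulus persistence diagram is turned into a 5x5 persistence image
    # and flattened into a 25-dimensional feature vector.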
    makedirs(gad_features_pd_vectorized_dir, exist_ok=True)
    for inner_param, outer_param in gad_params:
        gad_id = f"gad_knn_{inner_param}_{outer_param}"
        gad_features_pd_vecs_filepath = join(gad_features_pd_vectorized_dir,
                                             f"{gad_id}.npy")
        if isfile(gad_features_pd_vecs_filepath):
            continue
        print(f"Vectorizing GAD features for {gad_id}...")

        # Load GAD result
        gad_result_filepath = join(gad_dir, f"{gad_id}.joblib")
        gad_result = joblib.load(gad_result_filepath)

        # Use PersistenceImage to vectorize persistence diagrams
        gad_features_pd_vecs = np.zeros((len(data_words_to_full_vocab_ints),
                                         gad_features_pd_vectorized_size_flat))
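        # One row per data word; rows stay all-zero for words whose annulus
        # persistence diagram is empty.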
        for i, point_index in enumerate(tqdm(data_words_to_full_vocab_ints)):

            # Get the persistence diagram and build a square birth/persistence
            # range so that PersistenceImager produces a square image
            gad_features_pd = gad_result["annulus_pds"][point_index]
            if len(gad_features_pd) == 0:
                gad_features_pd_vecs[i] = np.zeros(
                    gad_features_pd_vectorized_size_flat, dtype=int)
                continue

            births, deaths = gad_features_pd.T
            persistence = deaths - births
            square_min = min(births.min(), persistence.min())
            square_max = max(births.max(), persistence.max())
            square_range = (square_min, square_max)
            pixel_size = (square_max -
                          square_min) / gad_features_pd_vectorized_size
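            # PersistenceImager (presumably from the persim package) works in
            # birth-persistence coordinates, hence the square range over births
            # and persistences above.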

            # Vectorize persistence diagram
            pimgr = PersistenceImager(birth_range=square_range,
                                      pers_range=square_range,
                                      pixel_size=pixel_size)
            pd_vec = pimgr.transform(gad_features_pd)
            gad_features_pd_vecs[i] = pd_vec.flatten()

        # Save persistence image vectors to file
        np.save(gad_features_pd_vecs_filepath, gad_features_pd_vecs)

    # (7) -- Combine features and labels into train/test data for the WME task --
    word_meaning_train_data_filepath = join(output_dir,
                                            "word_meaning_train_data.csv")
    word_meaning_test_data_filepath = join(output_dir,
                                           "word_meaning_test_data.csv")
    word_meaning_semeval_test_data_filepath = join(
        output_dir, "word_meaning_semeval_test_data.csv")
    if (not isfile(word_meaning_train_data_filepath)
            or not isfile(word_meaning_test_data_filepath)
            or not isfile(word_meaning_semeval_test_data_filepath)):
        # -- Load data for creating features --
        # Load estimated IDs from file
        words_estimated_ids = {
            f"{id_estimator_name}_{num_neighbours}": np.load(
                join(words_estimated_ids_dir,
                     f"{id_estimator_name}_{num_neighbours}.npy"))
            for num_neighbours in id_estimation_num_neighbours
            for id_estimator_name, _, _ in id_estimators
        }
        print("Loaded estimated IDs!")

        # Load GAD features
        gad_features_dict = {}
        for inner_param, outer_param in gad_params:
            gad_id = f"gad_knn_{inner_param}_{outer_param}"

            # Load GAD features
            gad_features_filepath = join(gad_features_dir, f"{gad_id}.npy")
            gad_features_dict[gad_id] = np.load(gad_features_filepath)
        print("Loaded GAD features!")

        # Load TPS features
        tps_scores = {}
        tps_pds = {}
        for tps_neighbourhood_size, tps_scores_filepath, tps_pds_filepath in zip(
                tps_neighbourhood_sizes, tps_scores_filepaths,
                tps_pds_filepaths):
            tps_scores[tps_neighbourhood_size] = np.load(tps_scores_filepath)
            tps_pds[tps_neighbourhood_size] = np.load(tps_pds_filepath,
                                                      allow_pickle=True)
        print("Loaded TPS features!")

        data_words_train, data_words_test = train_test_split(
            data_words_no_semeval, test_size=0.05, random_state=rng_seed)
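        # 5% of the non-SemEval words are held out as an internal test set; the
        # SemEval target words below form a separate external test set.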
        if not isfile(word_meaning_train_data_filepath):
            print("Preparing data for training...")
            train_data_df = create_word_meaning_model_data_features(
                target_words=data_words_train,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            train_data_df.to_csv(word_meaning_train_data_filepath, index=False)
        else:
            train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        if not isfile(word_meaning_test_data_filepath):
            print("Preparing data for testing...")
            test_data_df = create_word_meaning_model_data_features(
                target_words=data_words_test,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            test_data_df.to_csv(word_meaning_test_data_filepath, index=False)
        else:
            test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        if not isfile(word_meaning_semeval_test_data_filepath):
            print("Preparing data for external testing (SemEval)...")
            semeval_test_data_df = create_word_meaning_model_data_features(
                target_words=semeval_target_words_in_vocab,
                word_to_int=data_word_to_int,
                tps_scores=tps_scores,
                tps_pds=tps_pds,
                tps_neighbourhood_sizes=tps_neighbourhood_sizes,
                words_estimated_ids=words_estimated_ids,
                words_to_meanings=words_to_num_meanings,
                gad_categories=gad_categories,
                gad_features_dict=gad_features_dict,
            )
            semeval_test_data_df.to_csv(
                word_meaning_semeval_test_data_filepath, index=False)
        else:
            semeval_test_data_df = pd.read_csv(
                word_meaning_semeval_test_data_filepath)
    else:
        train_data_df = pd.read_csv(word_meaning_train_data_filepath)
        test_data_df = pd.read_csv(word_meaning_test_data_filepath)
        semeval_test_data_df = pd.read_csv(
            word_meaning_semeval_test_data_filepath)
    print("Train", train_data_df)
    print("Test", test_data_df)
    print("SemEval test", semeval_test_data_df)