Example 1
import pickle
from pathlib import Path

from farm.data_handler.processor import InferenceProcessor
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.tokenization import Tokenizer
from farm.modeling.wordembedding_utils import fit_s3e_on_corpus
from farm.utils import initialize_device_settings, set_all_seeds


def test_s3e_fit():
    # small test data
    language_model = Path("samples/s3e/tiny_fasttext_model")
    corpus_path = Path("samples/s3e/tiny_corpus.txt")
    save_dir = Path("testsave/fitted_s3e/")
    do_lower_case = False
    batch_size = 2
    use_gpu = False

    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[],
        embeds_dropout_prob=0.1,
        lm_output_types=[],
        device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=3,
                                                    pca_n_components=30,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=use_gpu,
                            batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1,
                            s3e_stats=s3e_stats, num_processes=0)

    # Input
    basic_texts = [
        {"text": "a man is walking on the street."},
        {"text": "a woman is walking on the street."},
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    assert result[0]["context"] == basic_texts[0]["text"]
    assert abs(result[0]["vec"][0] - 0.00527727306941057) < 1e-6
    assert abs(result[0]["vec"][-2] - 0.06285100416478565) < 1e-6
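Since everything needed for inference was just saved, a later run can skip the fitting step entirely. Below is a minimal reload sketch (not part of the original test); it assumes FARM's AdaptiveModel.load and Processor.load_from_dir accept this save format, so verify against your FARM version:

import pickle
from pathlib import Path

from farm.data_handler.processor import Processor
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel

save_dir = Path("testsave/fitted_s3e/")

# Reload the fitted artifacts instead of calling fit_s3e_on_corpus again
with open(save_dir / "s3e_stats.pkl", "rb") as f:
    s3e_stats = pickle.load(f)
model = AdaptiveModel.load(save_dir, device="cpu")  # assumption: load() restores the trimmed embedding LM
processor = Processor.load_from_dir(save_dir)

inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=False,
                        extraction_strategy="s3e", extraction_layer=-1,
                        s3e_stats=s3e_stats, num_processes=0)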
Example 2
import logging
from collections import Counter
from pathlib import Path

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

logger = logging.getLogger(__name__)


def fit_s3e_on_corpus(processor,
                      model,
                      corpus,
                      n_clusters=10,
                      mean_removal=True,
                      pca_removal=True,
                      pca_n_components=300,
                      pca_n_top_components=10,
                      default_token_weight=1,
                      min_token_occurrences=0,
                      svd_postprocessing=False,
                      use_gpu=False,
                      batch_size=50):
    """
    Pooling of word/token embeddings as described by Wang et al in the paper
    "Efficient Sentence Embedding via Semantic Subspace Analysis"
    (https://arxiv.org/abs/2002.09620)
    Implementation adapted from: https://github.com/BinWang28/Sentence-Embedding-S3E

    This method fits the "model" on a custom corpus. This includes deriving token_weights from token
    occurrences in the corpus, creating the semantic clusters via k-means, and a couple of
    pre-/post-processing steps to normalize the embeddings.

    The resulting objects can be saved or directly passed to the Inferencer to get the actual embeddings for your sentences.
    Note: Some operations like `mean_removal` imply changes to the AdaptiveModel or Processor. That's why we return them.

    :param processor: FARM Processor with a Tokenizer used for reading the corpus (e.g. Inference Processor)
    :param model: FARM AdaptiveModel with an embedding layer in the LM (currently only supporting 'WordEmbedding_LM' as a language model)
    :param corpus: Path to a text file or a str containing the corpus
    :param n_clusters: Number of clusters for S3E. The more clusters, the higher the dimensionality of the resulting embeddings.
    :param mean_removal: Bool, whether to remove the mean from the token embeddings (preprocessing) 
    :param pca_removal: Bool, whether to remove pca components from the token embeddings (preprocessing)
    :param pca_n_components: int, how many PCA components to fit if `pca_removal` is enabled 
    :param pca_n_top_components: int, how many top PCA components to remove if `pca_removal` is enabled 
    :param default_token_weight: float, what weight to assign for tokens that are in vocab but not in corpus
    :param min_token_occurrences: int, minimum number of token occurrences in the corpus for keeping it in the vocab.
                                  Helps to shrink the model & speed it up.
    :param svd_postprocessing: Bool, whether to remove the top truncated SVD / LSA components from the sentence embeddings (postprocessing).
                               Note: This requires creating all sentence embeddings once for the corpus, which slows this method down
                                     substantially. It doesn't impact later inference speed though.
    :param use_gpu: bool, whether to use a GPU
    :param batch_size: int, size of batch for the inferencer (only needed when `svd_postprocessing` is enabled)
    :return: model, processor, s3e_stats
    """

    from farm.infer import Inferencer
    from farm.modeling.tokenization import tokenize_with_metadata

    # Get tokens of corpus
    if isinstance(corpus, Path):
        logger.info("Reading corpus for fitting S3E")
        with open(corpus, "r") as f:
            corpus = f.read()
    else:
        assert isinstance(corpus, str), "`corpus` must be of type str or Path"

    tokenized_corpus = tokenize_with_metadata(corpus,
                                              processor.tokenizer)["tokens"]
    token_counts = dict(Counter(tokenized_corpus))
    n_tokens = sum(token_counts.values())

    # Trim vocab & embeddings to most frequent tokens (only to improve speed & ram consumption)
    model.language_model.trim_vocab(token_counts,
                                    processor,
                                    min_threshold=min_token_occurrences)

    # Normalize embeddings
    model.language_model.normalize_embeddings(
        zero_mean=mean_removal,
        pca_removal=pca_removal,
        pca_n_components=pca_n_components,
        pca_n_top_components=pca_n_top_components)
    normalized_word_embs = model.language_model.model.embeddings.cpu().numpy()

    # Get token weights
    token_weights = {}
    eps = 1e-3
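    # SIF-style frequency weighting (Arora et al.): weight = eps / (eps + relative frequency),
    # so frequent tokens get weights near 0 and rare tokens near 1. E.g. with eps=1e-3,
    # a token making up 10% of the corpus gets 1e-3 / (1e-3 + 0.1) ≈ 0.01, while a token
    # with relative frequency 1e-6 gets 1e-3 / (1e-3 + 1e-6) ≈ 0.999.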
    for word, token_id in processor.tokenizer.vocab.items():
        if word in token_counts:
            token_weights[token_id] = eps / (eps + token_counts[word] / n_tokens)
        else:
            # tokens that are in the vocab but not present in the corpus get the default weight
            token_weights[token_id] = default_token_weight

    # Construct clusters
    weight_list = np.array(list(token_weights.values()))
    logger.info("Creating clusters for S3E embeddings")
    kmeans = KMeans(n_clusters=n_clusters,
                    random_state=42).fit(normalized_word_embs,
                                         sample_weight=weight_list)

    # Everything the S3E pooling needs at inference time
    s3e_stats = {
        "token_to_cluster": kmeans.labels_,    # cluster id per embedding row (length = vocab size)
        "centroids": kmeans.cluster_centers_,  # cluster centers, shape (n_clusters, embedding_dim)
        "token_weights": token_weights,        # token id -> frequency-based weight
        "svd_components": None                 # filled below if svd_postprocessing is enabled
    }

    if svd_postprocessing:
        logger.info("Post-processing sentence embeddings using principal component removal")

        # Input: one dict per non-empty line of the corpus
        sentences = [{"text": s} for s in corpus.split("\n") if len(s.strip()) > 0]

        # Get embeddings for the whole corpus once (needed to fit the SVD)
        inferencer = Inferencer(model=model,
                                processor=processor,
                                task_type="embeddings",
                                gpu=use_gpu,
                                batch_size=batch_size,
                                extraction_strategy="s3e",
                                extraction_layer=-1,
                                s3e_stats=s3e_stats)
        try:
            result = inferencer.inference_from_dicts(dicts=sentences)
        finally:
            # Close the pool even if inference fails; constructing the Inferencer
            # outside the try block avoids a NameError masking the original exception.
            inferencer.close_multiprocessing_pool()
        sentence_embeddings = [s["vec"] for s in result]
        sentence_embeddings = np.vstack(sentence_embeddings)

        # Principal Component Removal
        svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
        svd.fit(sentence_embeddings)
        s3e_stats["svd_components"] = svd.components_

    return model, processor, s3e_stats
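The stored svd_components are consumed at inference time for the classic "remove the top principal component" post-processing on sentence embeddings (Arora et al., "A Simple but Tough-to-Beat Baseline for Sentence Embeddings"). The following NumPy sketch shows the technique itself, not FARM's exact internal code, assuming components of shape (n_components, embedding_dim) with orthonormal rows as returned by TruncatedSVD:

import numpy as np

def remove_top_components(sentence_embeddings, svd_components):
    # Subtract each embedding's projection onto the top component(s):
    # v <- v - (v @ C.T) @ C, with C = svd_components of shape (n_components, dim)
    projection = sentence_embeddings @ svd_components.T @ svd_components
    return sentence_embeddings - projection
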
Example 3
# Uses the same imports as Example 1.
def fit(language_model,
        corpus_path,
        save_dir,
        do_lower_case,
        batch_size=4,
        use_gpu=False):
    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model,
                               do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=10,
                                                    pca_n_components=300,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="embeddings",
                            gpu=use_gpu,
                            batch_size=batch_size,
                            extraction_strategy="s3e",
                            extraction_layer=-1,
                            s3e_stats=s3e_stats)

    # Input
    basic_texts = [
        {
            "text": "a man is walking on the street."
        },
        {
            "text": "a woman is walking on the street."
        },
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    print(result)
    inferencer.close_multiprocessing_pool()
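A hypothetical invocation of fit with placeholder paths (the model directory must hold a word-embedding style model such as fastText, since fit_s3e_on_corpus currently only supports 'WordEmbedding_LM'):

if __name__ == "__main__":
    from pathlib import Path

    fit(language_model=Path("path/to/word_embedding_model"),  # placeholder path
        corpus_path=Path("path/to/corpus.txt"),               # placeholder path
        save_dir=Path("saved_models/fitted_s3e/"),
        do_lower_case=False)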
Example 4
# Note: this snippet is truncated; it presupposes a `model`, `processor`, `data_silo`,
# and `device` from preceding (omitted) training code.
inferencer = Inferencer(
    model=model,
    processor=processor,
    batch_size=4,
    gpu=True,
    # TODO: how to mix for multihead?
    task_type="classification")
basic_texts = [
    {
        "text": "Some text you want to classify"
    },
    {
        "text": "A second sample"
    },
]

ret = inferencer.inference_from_dicts(basic_texts)
logger.info(f"Result of inference: {ret}")

logger.info(f"Evaluating on training set...")
evaluator = Evaluator(data_loader=data_silo.get_data_loader("train"),
                      tasks=processor.tasks,
                      device=device)

result = evaluator.eval(inferencer.model, return_preds_and_labels=True)

evaluator.log_results(result,
                      "Test",
                      steps=len(data_silo.get_data_loader("test")))

inferencer.close_multiprocessing_pool()
logger.info("PROCESSING FINISHED")