Example no. 1
def embedding_extraction():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    # load from a local path:
    # lang_model = Path("../saved_models/glove-german-uncased")
    # or through s3
    lang_model = "glove-german-uncased"  # only GloVe, word2vec, or converted fastText (fixed vocab) embeddings are supported
    do_lower_case = True
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(lang_model)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # Create Inferencer for embedding extraction
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="embeddings")

    # Extract vectors
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]

    result = inferencer.extract_vectors(dicts=basic_texts,
                                        extraction_strategy="cls_token",
                                        extraction_layer=-1)
    print(result)
    inferencer.close_multiprocessing_pool()
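
Besides "cls_token", FARM's `extract_vectors` also supports pooling strategies such as "reduce_mean", which averages all token embeddings instead of taking the first token's vector; a minimal variant of the extraction call above (a sketch, not part of the original example):

result = inferencer.extract_vectors(dicts=basic_texts,
                                    extraction_strategy="reduce_mean",
                                    extraction_layer=-1)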
Example no. 2
def fit_s3e_on_corpus(processor,
                      model,
                      corpus,
                      n_clusters=10,
                      mean_removal=True,
                      pca_removal=True,
                      pca_n_components=300,
                      pca_n_top_components=10,
                      default_token_weight=1,
                      min_token_occurrences=0,
                      svd_postprocessing=False,
                      use_gpu=False,
                      batch_size=50):
    """
    Pooling of word/token embeddings, as described by Wang et al. in the paper
    "Efficient Sentence Embedding via Semantic Subspace Analysis"
    (https://arxiv.org/abs/2002.09620).
    Adapted from their implementation: https://github.com/BinWang28/Sentence-Embedding-S3E

    This method fits the "model" on a custom corpus. This includes deriving token weights from
    token occurrences in the corpus, creating the semantic clusters via k-means, and a couple of
    pre-/post-processing steps to normalize the embeddings.

    The resulting objects can be saved or directly passed to the Inferencer to get the actual embeddings for your sentences.
    Note: Some operations like `mean_removal` imply changes on the AdaptiveModel or Processor. That's why we return them.

    :param processor: FARM Processor with a Tokenizer used for reading the corpus (e.g. InferenceProcessor)
    :param model: FARM AdaptiveModel with an embedding layer in the LM (currently only 'WordEmbedding_LM' is supported as a language model)
    :param corpus: Path to a text file or a str containing the corpus text
    :param n_clusters: Number of clusters for S3E. The more clusters, the higher the dimensionality of the resulting embeddings.
    :param mean_removal: bool, whether to remove the mean from the token embeddings (preprocessing)
    :param pca_removal: bool, whether to remove PCA components from the token embeddings (preprocessing)
    :param pca_n_components: int, how many PCA components to fit if `pca_removal` is enabled
    :param pca_n_top_components: int, how many top PCA components to remove if `pca_removal` is enabled
    :param default_token_weight: float, the weight assigned to tokens that are in the vocab but not in the corpus
    :param min_token_occurrences: int, minimum number of token occurrences in the corpus for keeping a token in the vocab.
                                  Helps to shrink the model & speed it up.
    :param svd_postprocessing: bool, whether to remove the top truncated SVD / LSA components from the sentence embeddings (postprocessing).
                               Note: This requires creating all sentence embeddings once for the corpus, which slows this method down substantially,
                                     but does not affect later inference speed.
    :param use_gpu: bool, whether to use a GPU
    :param batch_size: int, size of batches for the Inferencer (only needed when `svd_postprocessing` is enabled)
    :return: model, processor, s3e_stats
    """

    from farm.infer import Inferencer
    from farm.modeling.tokenization import tokenize_with_metadata

    # Get tokens of corpus
    if isinstance(corpus, Path):
        logger.info("Reading corpus for fitting S3E")
        with open(corpus, "r") as f:
            corpus = f.read()
    else:
        assert isinstance(corpus, str), "`corpus` must be of type str or Path"

    tokenized_corpus = tokenize_with_metadata(corpus,
                                              processor.tokenizer)["tokens"]
    token_counts = dict(Counter(tokenized_corpus))
    n_tokens = sum(token_counts.values())

    # Trim vocab & embeddings to most frequent tokens (only to improve speed & ram consumption)
    model.language_model.trim_vocab(token_counts,
                                    processor,
                                    min_threshold=min_token_occurrences)

    # Normalize embeddings
    model.language_model.normalize_embeddings(
        zero_mean=mean_removal,
        pca_removal=pca_removal,
        pca_n_components=pca_n_components,
        pca_n_top_components=pca_n_top_components)
    normalized_word_embs = model.language_model.model.embeddings.cpu().numpy()

    # Get token weights
    token_weights = {}
    eps = 1e-3
    for word, token_id in processor.tokenizer.vocab.items():
        if word in token_counts:
            token_weights[token_id] = eps / (eps + token_counts[word] / n_tokens)
        else:
            # words that are in the vocab but not present in the corpus get the default weight
            token_weights[token_id] = default_token_weight
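
    # Worked example for the weight formula above (hypothetical counts): a token
    # seen 100 times in a 100k-token corpus has p = 1e-3, so its weight is
    # 1e-3 / (1e-3 + 1e-3) = 0.5; a token seen once has p = 1e-5 and gets
    # 1e-3 / (1e-3 + 1e-5) ~ 0.99. Rare tokens therefore dominate the pooling.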

    # Construct Cluster
    weight_list = np.array(list(token_weights.values()))
    logger.info('Creating clusters for S3E embeddings')
    kmeans = KMeans(n_clusters=n_clusters,
                    random_state=42).fit(normalized_word_embs,
                                         sample_weight=weight_list)

    s3e_stats = {
        "token_to_cluster": kmeans.labels_,
        "centroids": kmeans.cluster_centers_,
        "token_weights": token_weights,
        "svd_components": None
    }
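
    # At inference time, `token_to_cluster` maps each vocab id to its k-means
    # cluster, `centroids` are the cluster centers the token embeddings are
    # aggregated against, and `token_weights` up-weights rare tokens during
    # pooling. `svd_components` is only filled if `svd_postprocessing` is enabled.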

    if svd_postprocessing:
        logger.info(
            'Post processing sentence embeddings using principal component removal'
        )

        # Input
        sentences = [{
            "text": s
        } for s in corpus.split("\n") if len(s.strip()) > 0]

        # Get embeddings (create the Inferencer outside the try block so the
        # finally clause cannot hit an unbound name if construction fails)
        inferencer = Inferencer(model=model,
                                processor=processor,
                                task_type="embeddings",
                                gpu=use_gpu,
                                batch_size=batch_size,
                                extraction_strategy="s3e",
                                extraction_layer=-1,
                                s3e_stats=s3e_stats)
        try:
            result = inferencer.inference_from_dicts(dicts=sentences)
        finally:
            inferencer.close_multiprocessing_pool()
        sentence_embeddings = [s["vec"] for s in result]
        sentence_embeddings = np.vstack(sentence_embeddings)

        # Principal Component Removal
        svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
        svd.fit(sentence_embeddings)
        s3e_stats["svd_components"] = svd.components_
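        # These stored components are later removed from each sentence embedding
        # at inference time (common-component removal in the style of SIF/S3E).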

    return model, processor, s3e_stats
Example no. 3

def test_evaluation():
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    test_assertions = True

    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # load model and set it up for evaluation
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )
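
    # doc_stride=128: contexts longer than max_seq_len are split into
    # overlapping windows that advance 128 tokens at a time.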

    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=40 * 4)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)
    model, _ = optimize_model(model=model,
                              device=device,
                              local_rank=-1,
                              optimizer=None,
                              distributed=False,
                              use_amp=None)

    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 77.7478
    gold_f1 = 82.1557
    gold_tnacc = 84.0646  # top 1 recall
    gold_elapsed = 40  # 4x V100
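    # rtol=0.001 tolerates a relative drift of 0.1% (about +/-0.08 points on an
    # EM of ~77.7); the timing check below uses a much looser rtol=0.1.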
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
        np.testing.assert_allclose(
            tnacc,
            gold_tnacc,
            rtol=0.001,
            err_msg=
            f"FARM Eval changed for top 1 accuracy by: {tnacc-gold_tnacc}")
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )

    # 2. Test FARM predictions with outside eval script
    starttime = time()
    model = Inferencer(model=model,
                       processor=processor,
                       task_type="question_answering",
                       batch_size=40 * 4,
                       gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = model.inference_from_file(file=filename,
                                       return_json=False,
                                       multiprocessing_chunksize=80)
    results_squad = [x.to_squad_eval() for x in result]
    model.close_multiprocessing_pool()

    elapsed = time() - starttime

    os.makedirs("testsave", exist_ok=True)
    write_squad_predictions(predictions=results_squad,
                            predictions_filename=filename,
                            out_filename="testsave/predictions.json")
    script_params = {
        "data_file": filename,
        "pred_file": "testsave/predictions.json",
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 78.4890
    gold_f1 = 81.7104
    gold_elapsed = 27  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for EM by: {em_score - gold_EM}"
        )
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for f1 score by: {f1_score - gold_f1}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )
Example no. 4

def fit(language_model,
        corpus_path,
        save_dir,
        do_lower_case,
        batch_size=4,
        use_gpu=False):
    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model,
                               do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=10,
                                                    pca_n_components=300,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="embeddings",
                            gpu=use_gpu,
                            batch_size=batch_size,
                            extraction_strategy="s3e",
                            extraction_layer=-1,
                            s3e_stats=s3e_stats)

    # Input
    basic_texts = [
        {
            "text": "a man is walking on the street."
        },
        {
            "text": "a woman is walking on the street."
        },
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    print(result)
    inferencer.close_multiprocessing_pool()
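
The artifacts saved by `fit` can be reloaded later without refitting. A minimal sketch (the helper name `load_and_extract` is ours; it assumes FARM's `Inferencer.load`, which restores model and processor from a save directory):

import pickle
from farm.infer import Inferencer

def load_and_extract(load_dir, texts, use_gpu=False, batch_size=4):
    # Reload the fitted S3E statistics that `fit` pickled to disk
    with open(load_dir / "s3e_stats.pkl", "rb") as f:
        s3e_stats = pickle.load(f)
    inferencer = Inferencer.load(model_name_or_path=load_dir,
                                 task_type="embeddings",
                                 gpu=use_gpu,
                                 batch_size=batch_size,
                                 extraction_strategy="s3e",
                                 extraction_layer=-1,
                                 s3e_stats=s3e_stats)
    try:
        return inferencer.inference_from_dicts(dicts=texts)
    finally:
        inferencer.close_multiprocessing_pool()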
Example no. 5
inferencer = Inferencer(model=model,  # the opening of this call is missing in the source; model=model is assumed
                        processor=processor,
                        batch_size=4,
                        gpu=True,
                        # TODO: how to mix for multihead?
                        task_type="classification")
basic_texts = [
    {
        "text": "Some text you want to classify"
    },
    {
        "text": "A second sample"
    },
]

ret = inferencer.inference_from_dicts(basic_texts)
logger.info(f"Result of inference: {ret}")

logger.info(f"Evaluating on training set...")
evaluator = Evaluator(data_loader=data_silo.get_data_loader("train"),
                      tasks=processor.tasks,
                      device=device)

result = evaluator.eval(inferencer.model, return_preds_and_labels=True)

evaluator.log_results(result,
                      "Train",
                      steps=len(data_silo.get_data_loader("train")))

inferencer.close_multiprocessing_pool()
logger.info("PROCESSING FINISHED")