# Ejemplo n.º 1
# 0
# Expected nearest-neighbour sentences, shared by all eight ranking asserts.
_EGGS_SENTENCE = "Mr Michael went to the store to buy some eggs."
_ELECTIONS_SENTENCE = ("Take a look, then, at Tuesday's elections in "
                       "New York City, New Jersey and Virginia:")


def _sorted_by_metric(df_by_sentence, metric_colname):
    """Sort the corpus df ascending by metric_colname, reset the index, and
    return it (sorting is in place; the return is for call-site readability)."""
    df_by_sentence.sort_values(metric_colname, ascending=True, inplace=True)
    df_by_sentence.reset_index(inplace=True, drop=True)
    return df_by_sentence


def test_end_to_end_runner():
    """End-to-end test: scrape the fixture documents, split them into a
    sentence-level corpus, embed it with four models, then check that a
    similarity query ranks the expected sentence first for every embedding.

    Assumes the FIXTURES_DIR environment variable points at a directory
    containing the fixture corpus and 'words_to_replace.json'.
    """
    scraper = DocumentScraper(
        os.getenv("FIXTURES_DIR"),
        os.path.join(os.getenv("FIXTURES_DIR"), 'words_to_replace.json'))
    df_by_page = scraper.document_corpus_to_pandas_df()
    generator = CorpusGenerator(df_by_page)
    df_by_sentence = generator.df_by_page_to_df_by_sentence()
    list_of_sentences = df_by_sentence['sentence'].values.tolist()
    assert list_of_sentences == [
        _EGGS_SENTENCE,
        'Joel rolled down the street on his skateboard.',
        'test / this is a first sentence',
        _ELECTIONS_SENTENCE,
    ]

    embedder = Embedder(list_of_sentences)
    # NOTE: the fitted model objects (w2v_tfidf/tfidf_vectorizer, w2v, bert,
    # elmo) deliberately leak out of the loop body and are reused below.
    models_to_be_run = ['Word2Vec_tfidf_weighted', 'Word2Vec', 'BERT', 'ELMo']
    for model in models_to_be_run:
        if model == 'Word2Vec_tfidf_weighted':
            sentence_embeddings, w2v_tfidf, tfidf_vectorizer = embedder.compute_word2vec_embeddings(
                tfidf_weights=True)
            df_by_sentence = embedder.add_embeddings_to_corpus_df(
                df_by_sentence, sentence_embeddings,
                'Word2Vec_with_TfIdf_weights')
        elif model == 'Word2Vec':
            sentence_embeddings, w2v, _ = embedder.compute_word2vec_embeddings(
                tfidf_weights=False)
            df_by_sentence = embedder.add_embeddings_to_corpus_df(
                df_by_sentence, sentence_embeddings, 'Word2Vec')
        elif model == 'BERT':
            bert_model = 'bert-base-nli-stsb-mean-tokens'  # This line is specific to BERT
            sentence_embeddings, bert = embedder.compute_bert_embeddings(
                bert_model)
            df_by_sentence = embedder.add_embeddings_to_corpus_df(
                df_by_sentence, sentence_embeddings, 'BERT')
        elif model == 'ELMo':
            sentence_embeddings, elmo = embedder.compute_elmo_embeddings()
            df_by_sentence = embedder.add_embeddings_to_corpus_df(
                df_by_sentence, sentence_embeddings, 'ELMo_layer_3')
        else:
            raise KeyError(f'The model {model} is not recognized as input.')

    # Query 1: an exact corpus sentence must be its own nearest neighbour.
    w2v_emb, df_by_sentence = query_embeddings(
        list_of_sentences[0],
        df_by_sentence,
        'Word2Vec',
        'Word2Vec',
        w2v,
        metric_colname='w2v_distance_test1')
    w2v_tfidf_emb, df_by_sentence = query_embeddings(
        list_of_sentences[0],
        df_by_sentence,
        'Word2Vec_with_TfIdf_weights',
        'Word2Vec_TfIdf_weighted',
        w2v_tfidf,
        metric_colname='w2v_tfidf_weighted_distance_test1',
        tfidf_vectorizer=tfidf_vectorizer)
    elmo_emb, df_by_sentence = query_embeddings(
        list_of_sentences[0],
        df_by_sentence,
        'ELMo_layer_3',
        'ELMo',
        elmo,
        metric_colname='elmo_distance_test1')
    bert_emb, df_by_sentence = query_embeddings(
        list_of_sentences[0],
        df_by_sentence,
        'BERT',
        'BERT',
        bert,
        metric_colname='bert_distance_test1')

    df_by_sentence = _sorted_by_metric(df_by_sentence, 'w2v_distance_test1')
    assert df_by_sentence['sentence'][0] == _EGGS_SENTENCE
    np.testing.assert_array_equal(w2v_emb, df_by_sentence['Word2Vec'][0])

    df_by_sentence = _sorted_by_metric(df_by_sentence,
                                       'w2v_tfidf_weighted_distance_test1')
    assert df_by_sentence['sentence'][0] == _EGGS_SENTENCE
    np.testing.assert_array_equal(
        w2v_tfidf_emb, df_by_sentence['Word2Vec_with_TfIdf_weights'][0])

    df_by_sentence = _sorted_by_metric(df_by_sentence, 'elmo_distance_test1')
    assert df_by_sentence['sentence'][0] == _EGGS_SENTENCE
    # np.testing.assert_array_almost_equal(elmo_emb, df_by_sentence['ELMo_layer_3'][0])
    # This test does not work, see https://github.com/allenai/allennlp/issues/3995#

    df_by_sentence = _sorted_by_metric(df_by_sentence, 'bert_distance_test1')
    assert df_by_sentence['sentence'][0] == _EGGS_SENTENCE
    np.testing.assert_array_almost_equal(bert_emb, df_by_sentence['BERT'][0])

    # Query 2: a free-text query ("New York") must rank the elections sentence
    # first under every embedding type.
    w2v_emb, df_by_sentence = query_embeddings(
        "New York",
        df_by_sentence,
        'Word2Vec',
        'Word2Vec',
        w2v,
        metric_colname='w2v_distance_test2')
    w2v_tfidf_emb, df_by_sentence = query_embeddings(
        "New York",
        df_by_sentence,
        'Word2Vec_with_TfIdf_weights',
        'Word2Vec_TfIdf_weighted',
        w2v_tfidf,
        metric_colname='w2v_tfidf_weighted_distance_test2',
        tfidf_vectorizer=tfidf_vectorizer)
    elmo_emb, df_by_sentence = query_embeddings(
        "New York",
        df_by_sentence,
        'ELMo_layer_3',
        'ELMo',
        elmo,
        metric_colname='elmo_distance_test2')
    bert_emb, df_by_sentence = query_embeddings(
        "New York",
        df_by_sentence,
        'BERT',
        'BERT',
        bert,
        metric_colname='bert_distance_test2')

    df_by_sentence = _sorted_by_metric(df_by_sentence, 'w2v_distance_test2')
    assert df_by_sentence['sentence'][0] == _ELECTIONS_SENTENCE

    df_by_sentence = _sorted_by_metric(df_by_sentence,
                                       'w2v_tfidf_weighted_distance_test2')
    assert df_by_sentence['sentence'][0] == _ELECTIONS_SENTENCE

    df_by_sentence = _sorted_by_metric(df_by_sentence, 'elmo_distance_test2')
    assert df_by_sentence['sentence'][0] == _ELECTIONS_SENTENCE

    df_by_sentence = _sorted_by_metric(df_by_sentence, 'bert_distance_test2')
    assert df_by_sentence['sentence'][0] == _ELECTIONS_SENTENCE
# Ejemplo n.º 2
# 0
    # Load the sentence-level corpus produced by the preprocessing step.
    # NOTE(review): DATA_DIR, MODELS_DIR, models_to_be_run and file_names are
    # presumably module-level configuration defined outside this fragment —
    # confirm against the enclosing script.
    corpus_filename = "corpus_by_sentence.csv"
    corpus_by_sentence = pd.read_csv(os.path.join(DATA_DIR, "processed", corpus_filename))
    list_of_sentences = corpus_by_sentence['sentence'].values.tolist()
    print("Instantiating Embedder class.")
    embedder = Embedder(list_of_sentences)

    # For each requested model: compute sentence embeddings, persist the
    # embeddings and the fitted model object, then write the corpus with its
    # new embeddings column to a .parquet file.
    for model in models_to_be_run:
        print(f"Calculating {model} embeddings.")
        if model == 'Word2Vec_tfidf_weighted':
            sentence_embeddings, model_obj, tfidf_vectorizer = embedder.compute_word2vec_embeddings(tfidf_weights=True)
            # Also persist the fitted TfIdf vectorizer; only this model has one.
            embedder.save_model(tfidf_vectorizer, MODELS_DIR, file_names[model]['vectorizer_filename'])
            # the line above is specific to Word2Vec with TfIdf vectorizer and cannot be generalized to other models
        elif model == 'Word2Vec':
            sentence_embeddings, model_obj, _ = embedder.compute_word2vec_embeddings(tfidf_weights=False)
        elif model == 'BERT':
            bert_model = 'bert-base-nli-stsb-mean-tokens'  # This line is specific to BERT
            sentence_embeddings, model_obj = embedder.compute_bert_embeddings(bert_model)
        elif model == 'ELMo':
            sentence_embeddings, model_obj = embedder.compute_elmo_embeddings()
        else:
            # Fail loudly on configuration typos rather than skipping silently.
            raise KeyError(f'The model {model} is not recognized as input.')
        print(f"{model} embeddings calculated. Saving model.")
        embedder.save_embeddings(sentence_embeddings, MODELS_DIR, file_names[model]['embeddings_filename'])
        embedder.save_model(model_obj, MODELS_DIR, file_names[model]['model_filename'])
        print(f"{model} model saved. Saving .parquet file.")
        # NOTE(review): a CSV path (not a DataFrame) is passed as the first
        # argument here — presumably add_embeddings_to_corpus_df accepts either
        # a path or a DataFrame; verify against its definition.
        df = embedder.add_embeddings_to_corpus_df(
            os.path.join(DATA_DIR, "processed", corpus_filename), sentence_embeddings, file_names[model]['column_name']
        )
        embedder.df_to_parquet(df, os.path.join(DATA_DIR, "processed", file_names[model]['parquet_filename']))
        print(f"Parquet file saved. All steps done for the {model} model.")
# Ejemplo n.º 3
# 0
 def test_compute_bert_embeddings_is_empty_sentence(self):
     """An empty corpus must produce an empty float64 embeddings array."""
     empty_corpus_embedder = Embedder([])
     embeddings, _ = empty_corpus_embedder.compute_bert_embeddings(
         model='bert-base-nli-stsb-mean-tokens')
     np.testing.assert_array_equal(np.array([], dtype=np.float64), embeddings)
     assert len(embeddings) == 0
# Ejemplo n.º 4
# 0
 def test_compute_bert_embeddings_contains_empty_sentence(self):
     """A corpus containing an empty string still yields one 768-dim row per entry."""
     corpus = ['Sentence one.', '']
     embeddings, _ = Embedder(corpus).compute_bert_embeddings(
         model='bert-base-nli-stsb-mean-tokens')
     assert embeddings.shape == (len(corpus), 768)
# Ejemplo n.º 5
# 0
 def test_compute_bert_embeddings(self, list_of_sentences, expected_bert_embeddings):
     """BERT embeddings: one 768-dim row per input sentence, matching the fixture
     values to 5 decimal places."""
     embeddings, _ = Embedder(list_of_sentences).compute_bert_embeddings(
         model='bert-base-nli-stsb-mean-tokens')
     n_sentences = len(list_of_sentences)
     assert len(embeddings) == n_sentences
     assert embeddings.shape == (n_sentences, 768)
     np.testing.assert_array_almost_equal(
         expected_bert_embeddings, embeddings, decimal=5)