def test_query_embeddings_with_word2vec_when_model_not_given_raises_error():
    """Querying with Word2Vec but without passing a model raises FileNotFoundError."""
    corpus_path = os.path.join(os.getenv('FIXTURES_DIR'),
                               'full_df_with_embeddings.parquet.gzip')
    query = "Michael went to the store to buy some eggs ."
    # Only the call under test lives inside the raises-context.
    with pytest.raises(FileNotFoundError):
        query_embeddings(query, corpus_path, 'Word2Vec', 'Word2Vec')
def test_query_embeddings_raises_error_when_input_is_empty():
    """A whitespace-only query makes query_embeddings raise KeyError."""
    corpus_path = os.path.join(os.getenv('FIXTURES_DIR'),
                               'full_df_with_embeddings.parquet.gzip')
    blank_query = "  "
    with pytest.raises(KeyError):
        query_embeddings(blank_query, corpus_path, 'BERT', 'BERT')
def test_query_embeddings_with_bert_with_exact_query():
    """A query identical to a corpus sentence ranks that sentence first (BERT).

    The returned query embedding must also (almost) equal the embedding
    stored for that sentence in the 'BERT' column.
    """
    query = "Michael went to the store to buy some eggs ."
    corpus_path = os.path.join(os.getenv('FIXTURES_DIR'),
                               'full_df_with_embeddings.parquet.gzip')

    embedding, trained_df = query_embeddings(query, corpus_path, 'BERT', 'BERT')

    # Smallest distance first; a fresh 0..n-1 index makes label 0 the best match.
    ranked = trained_df.sort_values(
        'metric_distance', ascending=True).reset_index(drop=True)

    assert ranked['sentence'][0] == query
    np.testing.assert_array_almost_equal(embedding, ranked['BERT'][0])
def test_query_embeddings_with_bert_with_non_exact_query():
    """A partial query ('New York') ranks the sentence mentioning it first (BERT).

    Because the query is not identical to any corpus sentence, its embedding
    is expected to differ from the stored embedding of the best match.
    """
    expected_best_match = (
        "Take a look, then, at Tuesday's elections in New York City, New Jersey and Virginia :"
    )
    corpus_path = os.path.join(os.getenv('FIXTURES_DIR'),
                               'full_df_with_embeddings.parquet.gzip')

    embedding, trained_df = query_embeddings(
        "New York", corpus_path, 'BERT', 'BERT')

    ranked = trained_df.sort_values(
        'metric_distance', ascending=True).reset_index(drop=True)
    assert ranked['sentence'][0] == expected_best_match

    # The almost-equal check must FAIL here: 'New York' does not exactly
    # match any sentence, so the two embeddings are not the same.
    with pytest.raises(AssertionError):
        np.testing.assert_array_almost_equal(embedding, ranked['BERT'][0])
def test_query_embeddings_with_word2vec_raises_logger_warning_when_some_words_out_of_vocabulary(
        caplog):
    """Out-of-vocabulary query words are reported in a WARNING log record (Word2Vec)."""
    fixtures_dir = os.getenv('FIXTURES_DIR')
    corpus_path = os.path.join(fixtures_dir,
                               'full_df_with_embeddings.parquet.gzip')
    model_path = os.path.join(fixtures_dir, 'word2vec.pickle')
    expected_log_message = (
        "The following words are not in the trained vocabulary and were therefore excluded from the search: "
        "['Hello', 'trial', '!']"
    )

    with caplog.at_level(logging.WARNING):
        # Only the logging side effect is checked; the return values are unused.
        _embedding, _trained_df = query_embeddings(
            "Hello Michael, this is a trial sentence!",
            corpus_path,
            'Word2Vec',
            'Word2Vec',
            model_path)

    assert expected_log_message in caplog.text
def test_query_embeddings_with_word2vec_tfidf_weighted_with_exact_query():
    """A query identical to a corpus sentence ranks it first (TF-IDF weighted Word2Vec).

    The returned query embedding must exactly equal the embedding stored for
    that sentence in the 'Word2Vec_with_TfIdf_weights' column.
    """
    fixtures_dir = os.getenv('FIXTURES_DIR')
    query = "Michael went to the store to buy some eggs ."
    embedding_column = 'Word2Vec_with_TfIdf_weights'

    embedding, trained_df = query_embeddings(
        query,
        os.path.join(fixtures_dir, 'full_df_with_embeddings.parquet.gzip'),
        embedding_column,
        'Word2Vec_TfIdf_weighted',
        os.path.join(fixtures_dir, 'word2vec.pickle'),
        tfidf_vectorizer=os.path.join(fixtures_dir, 'tfidf_vectorizer.pickle'))

    # Smallest distance first; a fresh 0..n-1 index makes label 0 the best match.
    ranked = trained_df.sort_values(
        'metric_distance', ascending=True).reset_index(drop=True)

    assert ranked['sentence'][0] == query
    np.testing.assert_array_equal(embedding, ranked[embedding_column][0])
def test_query_embeddings_with_word2vec_raises_logger_error_when_all_words_out_of_vocabulary(
        caplog):
    """When no query word is in the Word2Vec vocabulary, an ERROR is logged.

    In that case query_embeddings is also expected to return an empty array
    and an empty DataFrame instead of raising.
    """
    fixtures_dir = os.getenv('FIXTURES_DIR')
    expected_log_message = (
        'None of the words inputted are in the Word2Vec vocabulary. Please change your input or try a different '
        'model, such as ELMo or BERT. Returning empty array and DataFrame.'
    )

    with caplog.at_level(logging.ERROR):
        embedding, trained_df = query_embeddings(
            "Hello there how are you?",
            os.path.join(fixtures_dir, 'full_df_with_embeddings.parquet.gzip'),
            'Word2Vec',
            'Word2Vec',
            os.path.join(fixtures_dir, 'word2vec.pickle'))

    assert expected_log_message in caplog.text
    np.testing.assert_array_equal(embedding, np.array([]))
    pdt.assert_frame_equal(trained_df, pd.DataFrame())
# Code example #8
# 0
import json
import logging.config
from pdf2embeddings.process_user_queries import query_embeddings


if __name__ == '__main__':
    # Demo: embed a free-text query with the selected model and print the ten
    # corpus sentences with the smallest distance to it.
    # NOTE(review): `os` and `yaml` are used below but are not among the import
    # lines visible next to this script -- confirm they are imported.
    user_search_input = 'cell phone'
    model_name = 'BERT'  # change as appropriate

    # All locations are taken from the environment.
    DATA_DIR = os.getenv("DATA_DIR")
    CONFIG_DIR = os.getenv('CONFIG_DIR')
    MODELS_DIR = os.getenv("MODELS_DIR")
    LOGGING_CONFIG = os.getenv("LOGGING_CONFIG")

    # Configure logging from the YAML file before anything else runs.
    with open(LOGGING_CONFIG, 'r') as logging_file:
        logging_settings = yaml.safe_load(logging_file)
    logging.config.dictConfig(logging_settings)

    # filenames.json maps each model name to its model/parquet/column names.
    with open(os.path.join(CONFIG_DIR, 'filenames.json'), 'r') as filenames_file:
        file_names = json.load(filenames_file)

    # tfidf_vectorizer is not used (and optional) when model is not
    # 'Word2Vec_TfIdf_weighted'.
    tfidf_vectorizer = os.path.join(MODELS_DIR, "tfidf_vectorizer.pickle")

    selected = file_names[model_name]
    # The model path is optional for ELMo and BERT.
    model = os.path.join(MODELS_DIR, selected["model_filename"])
    trained_df_path = os.path.join(
        DATA_DIR, 'processed', selected["parquet_filename"])

    user_input_embedding, trained_df = query_embeddings(
        user_search_input,
        trained_df_path,
        selected["column_name"],
        model_name,
        model,
        distance_metric='cosine',
        tfidf_vectorizer=tfidf_vectorizer,
    )

    # Both results must be non-empty for there to be anything to show.
    has_results = bool(user_input_embedding.size) and not trained_df.empty
    if has_results:
        ranked = trained_df.sort_values('metric_distance', ascending=True)
        top_matches = ranked[['sentence', 'metric_distance']].reset_index(
            drop=True).head(10)
        print(top_matches)
def test_end_to_end_runner():
    """Exercise the whole pipeline on the fixtures: scrape, split, embed, query.

    The fixture documents are scraped and split into sentences, embeddings are
    computed with TF-IDF weighted Word2Vec, plain Word2Vec, BERT and ELMo, and
    two query rounds are run against every model:

    1. the first corpus sentence verbatim -- it must rank first and (except
       for ELMo) the query embedding must match the stored embedding;
    2. the partial query "New York" -- the sentence mentioning New York must
       rank first.
    """
    fixtures_dir = os.getenv("FIXTURES_DIR")
    scraper = DocumentScraper(
        fixtures_dir, os.path.join(fixtures_dir, 'words_to_replace.json'))
    df_by_page = scraper.document_corpus_to_pandas_df()
    df_by_sentence = CorpusGenerator(df_by_page).df_by_page_to_df_by_sentence()

    list_of_sentences = df_by_sentence['sentence'].values.tolist()
    assert list_of_sentences == [
        'Mr Michael went to the store to buy some eggs.',
        'Joel rolled down the street on his skateboard.',
        'test / this is a first sentence',
        "Take a look, then, at Tuesday's elections in New York City, New Jersey and Virginia:"
    ]

    # --- Compute the embeddings, one model after the other -----------------
    embedder = Embedder(list_of_sentences)

    tfidf_embeddings, w2v_tfidf, tfidf_vectorizer = \
        embedder.compute_word2vec_embeddings(tfidf_weights=True)
    df_by_sentence = embedder.add_embeddings_to_corpus_df(
        df_by_sentence, tfidf_embeddings, 'Word2Vec_with_TfIdf_weights')

    w2v_embeddings, w2v, _ = embedder.compute_word2vec_embeddings(
        tfidf_weights=False)
    df_by_sentence = embedder.add_embeddings_to_corpus_df(
        df_by_sentence, w2v_embeddings, 'Word2Vec')

    bert_model = 'bert-base-nli-stsb-mean-tokens'  # This line is specific to BERT
    bert_embeddings, bert = embedder.compute_bert_embeddings(bert_model)
    df_by_sentence = embedder.add_embeddings_to_corpus_df(
        df_by_sentence, bert_embeddings, 'BERT')

    elmo_embeddings, elmo = embedder.compute_elmo_embeddings()
    df_by_sentence = embedder.add_embeddings_to_corpus_df(
        df_by_sentence, elmo_embeddings, 'ELMo_layer_3')

    # --- Query every model --------------------------------------------------
    # One row per model, in the order they are queried and checked:
    # (embedding column, model name, fitted model, distance-column prefix,
    #  extra query kwargs, embedding comparison used in the exact-match round)
    runs = (
        ('Word2Vec', 'Word2Vec', w2v, 'w2v_distance', {},
         np.testing.assert_array_equal),
        ('Word2Vec_with_TfIdf_weights', 'Word2Vec_TfIdf_weighted', w2v_tfidf,
         'w2v_tfidf_weighted_distance',
         {'tfidf_vectorizer': tfidf_vectorizer},
         np.testing.assert_array_equal),
        # The ELMo embedding comparison is skipped: it does not work, see
        # https://github.com/allenai/allennlp/issues/3995#
        ('ELMo_layer_3', 'ELMo', elmo, 'elmo_distance', {}, None),
        ('BERT', 'BERT', bert, 'bert_distance', {},
         np.testing.assert_array_almost_equal),
    )

    def query_and_check(frame, query, suffix, expected_top, compare_embeddings):
        """Query all models first, then verify the top-ranked sentence per model."""
        embeddings = {}
        for column, model_name, model, prefix, extra, _ in runs:
            embeddings[column], frame = query_embeddings(
                query,
                frame,
                column,
                model_name,
                model,
                metric_colname=prefix + suffix,
                **extra)

        for column, _, _, prefix, _, compare in runs:
            # Smallest distance first; after the index reset, label 0 is the
            # best match for this model.
            frame.sort_values(prefix + suffix, ascending=True, inplace=True)
            frame.reset_index(inplace=True, drop=True)
            assert frame['sentence'][0] == expected_top
            if compare_embeddings and compare is not None:
                compare(embeddings[column], frame[column][0])
        return frame

    # Round 1: the first corpus sentence, verbatim.
    df_by_sentence = query_and_check(
        df_by_sentence,
        list_of_sentences[0],
        '_test1',
        "Mr Michael went to the store to buy some eggs.",
        True)

    # Round 2: a partial query; only the ranking is checked.
    query_and_check(
        df_by_sentence,
        "New York",
        '_test2',
        "Take a look, then, at Tuesday's elections in New York City, New Jersey and Virginia:",
        False)