def test_query_embeddings_with_word2vec_when_model_not_given_raises_error():
    """Check that a Word2Vec query without a model argument raises FileNotFoundError."""
    parquet_path = os.path.join(
        os.getenv('FIXTURES_DIR'), 'full_df_with_embeddings.parquet.gzip')
    query = "Michael went to the store to buy some eggs ."

    # No model is passed after the model name, which is what should trigger the error.
    with pytest.raises(FileNotFoundError):
        query_embeddings(query, parquet_path, 'Word2Vec', 'Word2Vec')
def test_query_embeddings_raises_error_when_input_is_empty():
    """Check that a whitespace-only query makes ``query_embeddings`` raise KeyError."""
    parquet_path = os.path.join(
        os.getenv('FIXTURES_DIR'), 'full_df_with_embeddings.parquet.gzip')
    blank_query = " "

    with pytest.raises(KeyError):
        query_embeddings(blank_query, parquet_path, 'BERT', 'BERT')
def test_query_embeddings_with_bert_with_exact_query():
    """Check that a BERT query equal to a stored sentence ranks that sentence first.

    The returned query embedding is also expected to be almost equal to the
    embedding stored for the top-ranked sentence.
    """
    query = "Michael went to the store to buy some eggs ."
    parquet_path = os.path.join(
        os.getenv('FIXTURES_DIR'), 'full_df_with_embeddings.parquet.gzip')

    embedding, trained_df = query_embeddings(query, parquet_path, 'BERT', 'BERT')

    # Closest sentence first, with a fresh 0-based index.
    ranked = trained_df.sort_values(
        'metric_distance', ascending=True).reset_index(drop=True)

    assert ranked['sentence'].iloc[0] == query
    np.testing.assert_array_almost_equal(embedding, ranked['BERT'].iloc[0])
def test_query_embeddings_with_bert_with_non_exact_query():
    """Check that a partial BERT query ranks the sentence containing it first.

    Because 'New York' is not itself one of the stored sentences, the query
    embedding must NOT be almost equal to the top-ranked sentence's embedding.
    """
    parquet_path = os.path.join(
        os.getenv('FIXTURES_DIR'), 'full_df_with_embeddings.parquet.gzip')
    expected_sentence = \
        "Take a look, then, at Tuesday's elections in New York City, New Jersey and Virginia :"

    embedding, trained_df = query_embeddings("New York", parquet_path, 'BERT', 'BERT')

    # Closest sentence first, with a fresh 0-based index.
    ranked = trained_df.sort_values(
        'metric_distance', ascending=True).reset_index(drop=True)

    assert ranked['sentence'].iloc[0] == expected_sentence
    # checks arrays are no longer almost equal as the query 'New York' doesn't exactly
    # match any sentence
    with pytest.raises(AssertionError):
        np.testing.assert_array_almost_equal(embedding, ranked['BERT'].iloc[0])
def test_query_embeddings_with_word2vec_raises_logger_warning_when_some_words_out_of_vocabulary(
        caplog):
    """Check that a Word2Vec query logs a warning listing the out-of-vocabulary words."""
    fixtures_dir = os.getenv('FIXTURES_DIR')
    parquet_path = os.path.join(fixtures_dir, 'full_df_with_embeddings.parquet.gzip')
    model_path = os.path.join(fixtures_dir, 'word2vec.pickle')

    with caplog.at_level(logging.WARNING):
        # Only the logged output matters here; the returned values are not inspected.
        _embedding, _trained_df = query_embeddings(
            "Hello Michael, this is a trial sentence!",
            parquet_path,
            'Word2Vec',
            'Word2Vec',
            model_path)

    expected_log_message = (
        "The following words are not in the trained vocabulary and were therefore excluded from the search: "
        "['Hello', 'trial', '!']"
    )
    assert expected_log_message in caplog.text
def test_query_embeddings_with_word2vec_tfidf_weighted_with_exact_query():
    """Check that a TF-IDF-weighted Word2Vec query equal to a stored sentence ranks it first.

    The returned query embedding is also expected to equal exactly the embedding
    stored for the top-ranked sentence.
    """
    fixtures_dir = os.getenv('FIXTURES_DIR')
    query = "Michael went to the store to buy some eggs ."
    embedding_column = 'Word2Vec_with_TfIdf_weights'

    embedding, trained_df = query_embeddings(
        query,
        os.path.join(fixtures_dir, 'full_df_with_embeddings.parquet.gzip'),
        embedding_column,
        'Word2Vec_TfIdf_weighted',
        os.path.join(fixtures_dir, 'word2vec.pickle'),
        tfidf_vectorizer=os.path.join(fixtures_dir, 'tfidf_vectorizer.pickle'))

    # Closest sentence first, with a fresh 0-based index.
    ranked = trained_df.sort_values(
        'metric_distance', ascending=True).reset_index(drop=True)

    assert ranked['sentence'].iloc[0] == query
    np.testing.assert_array_equal(embedding, ranked[embedding_column].iloc[0])
def test_query_embeddings_with_word2vec_raises_logger_error_when_all_words_out_of_vocabulary(
        caplog):
    """Check the Word2Vec behaviour when no query word is in the vocabulary.

    ``query_embeddings`` is expected to log an error explaining that none of the
    words are known, and to return an empty array together with an empty
    DataFrame.
    """
    fixtures_dir = os.getenv('FIXTURES_DIR')

    with caplog.at_level(logging.ERROR):
        embedding, trained_df = query_embeddings(
            "Hello there how are you?",
            os.path.join(fixtures_dir, 'full_df_with_embeddings.parquet.gzip'),
            'Word2Vec',
            'Word2Vec',
            os.path.join(fixtures_dir, 'word2vec.pickle'))

    expected_log_message = \
        'None of the words inputted are in the Word2Vec vocabulary. Please change your input or try a different ' \
        'model, such as ELMo or BERT. Returning empty array and DataFrame.'

    # The debugging print() calls were removed: pytest already reports captured
    # log output for failing tests.
    assert expected_log_message in caplog.text
    np.testing.assert_array_equal(embedding, np.array([]))
    pdt.assert_frame_equal(trained_df, pd.DataFrame())
import json
import logging.config
from pdf2embeddings.process_user_queries import query_embeddings

# NOTE(review): `os` and `yaml` are used below, but their imports are not visible
# in this excerpt - confirm they are imported at the top of the file.

# Example runner: embeds a free-text user query with the chosen model and prints
# the ten stored sentences with the smallest distance to it.
if __name__ == '__main__':
    # User-editable settings.
    user_search_input = 'cell phone'
    model_name = 'BERT'  # change as appropriate

    # All locations are taken from environment variables.
    DATA_DIR = os.getenv("DATA_DIR")
    CONFIG_DIR = os.getenv('CONFIG_DIR')
    MODELS_DIR = os.getenv("MODELS_DIR")
    LOGGING_CONFIG = os.getenv("LOGGING_CONFIG")

    with open(LOGGING_CONFIG, 'r') as logging_file:
        config = yaml.safe_load(logging_file)
    logging.config.dictConfig(config)

    filenames_path = os.path.join(CONFIG_DIR, 'filenames.json')
    with open(filenames_path, 'r') as filenames_file:
        file_names = json.load(filenames_file)

    tfidf_vectorizer = os.path.join(MODELS_DIR, "tfidf_vectorizer.pickle")
    # this is optional for ELMo and BERT.
    model = os.path.join(MODELS_DIR, file_names[model_name]["model_filename"])
    trained_df_path = os.path.join(
        DATA_DIR, 'processed', file_names[model_name]["parquet_filename"])
    embedding_column = file_names[model_name]["column_name"]

    # tfidf_vectorizer is not used (and optional) when model is not 'Word2Vec_TfIdf_weighted'
    user_input_embedding, trained_df = query_embeddings(
        user_search_input,
        trained_df_path,
        embedding_column,
        model_name,
        model,
        distance_metric='cosine',
        tfidf_vectorizer=tfidf_vectorizer,
    )

    # they must not be empty
    if user_input_embedding.size and not trained_df.empty:
        ranked = trained_df.sort_values('metric_distance', ascending=True)
        top_matches = ranked[['sentence', 'metric_distance']].reset_index(drop=True).head(10)
        print(top_matches)
def _sort_by_distance_in_place(df, distance_column):
    """Sort ``df`` by ascending ``distance_column`` and renumber its index from 0, in place."""
    df.sort_values(distance_column, ascending=True, inplace=True)
    df.reset_index(inplace=True, drop=True)


def test_end_to_end_runner():
    """Run the whole pipeline on the fixtures: scrape, split, embed, then query.

    Steps:
      1. Scrape the fixture documents and split them into sentences.
      2. Compute sentence embeddings with TF-IDF-weighted Word2Vec, plain
         Word2Vec, BERT and ELMo, adding one column per model to the corpus
         DataFrame.
      3. Query every model with a sentence taken from the corpus and check that
         this sentence is ranked first (and, except for ELMo, that the query
         embedding matches the stored one).
      4. Query every model with "New York" and check that the sentence
         mentioning New York is ranked first.
    """
    fixtures_dir = os.getenv("FIXTURES_DIR")
    scraper = DocumentScraper(
        fixtures_dir, os.path.join(fixtures_dir, 'words_to_replace.json'))
    df_by_page = scraper.document_corpus_to_pandas_df()
    generator = CorpusGenerator(df_by_page)
    df_by_sentence = generator.df_by_page_to_df_by_sentence()
    list_of_sentences = df_by_sentence['sentence'].values.tolist()
    assert list_of_sentences == [
        'Mr Michael went to the store to buy some eggs.',
        'Joel rolled down the street on his skateboard.',
        'test / this is a first sentence',
        "Take a look, then, at Tuesday's elections in New York City, New Jersey and Virginia:"
    ]

    # Compute the embeddings model by model. This was previously a loop over a
    # fixed list of model names with an if/elif dispatch; the order is unchanged.
    embedder = Embedder(list_of_sentences)

    sentence_embeddings, w2v_tfidf, tfidf_vectorizer = embedder.compute_word2vec_embeddings(
        tfidf_weights=True)
    df_by_sentence = embedder.add_embeddings_to_corpus_df(
        df_by_sentence, sentence_embeddings, 'Word2Vec_with_TfIdf_weights')

    sentence_embeddings, w2v, _ = embedder.compute_word2vec_embeddings(
        tfidf_weights=False)
    df_by_sentence = embedder.add_embeddings_to_corpus_df(
        df_by_sentence, sentence_embeddings, 'Word2Vec')

    bert_model = 'bert-base-nli-stsb-mean-tokens'  # This line is specific to BERT
    sentence_embeddings, bert = embedder.compute_bert_embeddings(bert_model)
    df_by_sentence = embedder.add_embeddings_to_corpus_df(
        df_by_sentence, sentence_embeddings, 'BERT')

    sentence_embeddings, elmo = embedder.compute_elmo_embeddings()
    df_by_sentence = embedder.add_embeddings_to_corpus_df(
        df_by_sentence, sentence_embeddings, 'ELMo_layer_3')

    # ---- Query 1: a sentence that is in the corpus -------------------------
    exact_query = list_of_sentences[0]
    w2v_emb, df_by_sentence = query_embeddings(
        exact_query, df_by_sentence, 'Word2Vec', 'Word2Vec', w2v,
        metric_colname='w2v_distance_test1')
    w2v_tfidf_emb, df_by_sentence = query_embeddings(
        exact_query, df_by_sentence, 'Word2Vec_with_TfIdf_weights',
        'Word2Vec_TfIdf_weighted', w2v_tfidf,
        metric_colname='w2v_tfidf_weighted_distance_test1',
        tfidf_vectorizer=tfidf_vectorizer)
    # The ELMo query embedding is discarded; see the note on the ELMo check below.
    _, df_by_sentence = query_embeddings(
        exact_query, df_by_sentence, 'ELMo_layer_3', 'ELMo', elmo,
        metric_colname='elmo_distance_test1')
    bert_emb, df_by_sentence = query_embeddings(
        exact_query, df_by_sentence, 'BERT', 'BERT', bert,
        metric_colname='bert_distance_test1')

    expected_exact_match = "Mr Michael went to the store to buy some eggs."

    _sort_by_distance_in_place(df_by_sentence, 'w2v_distance_test1')
    assert df_by_sentence['sentence'][0] == expected_exact_match
    np.testing.assert_array_equal(w2v_emb, df_by_sentence['Word2Vec'][0])

    _sort_by_distance_in_place(df_by_sentence, 'w2v_tfidf_weighted_distance_test1')
    assert df_by_sentence['sentence'][0] == expected_exact_match
    np.testing.assert_array_equal(
        w2v_tfidf_emb, df_by_sentence['Word2Vec_with_TfIdf_weights'][0])

    _sort_by_distance_in_place(df_by_sentence, 'elmo_distance_test1')
    assert df_by_sentence['sentence'][0] == expected_exact_match
    # NOTE: the equivalent array comparison for ELMo
    # (assert_array_almost_equal against df_by_sentence['ELMo_layer_3'][0]) is
    # intentionally omitted: it does not work, see
    # https://github.com/allenai/allennlp/issues/3995

    _sort_by_distance_in_place(df_by_sentence, 'bert_distance_test1')
    assert df_by_sentence['sentence'][0] == expected_exact_match
    np.testing.assert_array_almost_equal(bert_emb, df_by_sentence['BERT'][0])

    # ---- Query 2: a fragment that only partially matches one sentence ------
    # Only the rankings are checked, so the returned query embeddings are discarded.
    _, df_by_sentence = query_embeddings(
        "New York", df_by_sentence, 'Word2Vec', 'Word2Vec', w2v,
        metric_colname='w2v_distance_test2')
    _, df_by_sentence = query_embeddings(
        "New York", df_by_sentence, 'Word2Vec_with_TfIdf_weights',
        'Word2Vec_TfIdf_weighted', w2v_tfidf,
        metric_colname='w2v_tfidf_weighted_distance_test2',
        tfidf_vectorizer=tfidf_vectorizer)
    _, df_by_sentence = query_embeddings(
        "New York", df_by_sentence, 'ELMo_layer_3', 'ELMo', elmo,
        metric_colname='elmo_distance_test2')
    _, df_by_sentence = query_embeddings(
        "New York", df_by_sentence, 'BERT', 'BERT', bert,
        metric_colname='bert_distance_test2')

    expected_partial_match = \
        "Take a look, then, at Tuesday's elections in New York City, New Jersey and Virginia:"
    distance_columns = (
        'w2v_distance_test2',
        'w2v_tfidf_weighted_distance_test2',
        'elmo_distance_test2',
        'bert_distance_test2',
    )
    for distance_column in distance_columns:
        _sort_by_distance_in_place(df_by_sentence, distance_column)
        # The column name is the assertion message so a failure names the model.
        assert df_by_sentence['sentence'][0] == expected_partial_match, distance_column