def test_compute_elmo_embeddings_contains_empty_sentence(self):
    """ELMo output has shape (2, 1024) when one of two sentences is empty."""
    sentences_with_blank = ['Sentence one.', '']
    elmo_embedder = Embedder(sentences_with_blank)
    # Averaging an empty vector triggers an expected RuntimeWarning; silence
    # it so it is not printed during the test run.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        vectors, _model = elmo_embedder.compute_elmo_embeddings()
    expected_shape = (2, 1024)
    assert vectors.shape == expected_shape
def test_add_embeddings_to_corpus_df_from_csv(self, list_of_sentences, expected_df_with_embeddings):
    """Corpus passed as a CSV fixture path, embeddings passed as an array.

    The returned frame must equal ``expected_df_with_embeddings``.
    """
    corpus_embedder = Embedder(list_of_sentences)
    csv_path = os.path.join(os.getenv("FIXTURES_DIR"), 'dummy_sentences.csv')
    dummy_vectors = np.array(((1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0)))
    result_df = corpus_embedder.add_embeddings_to_corpus_df(csv_path, dummy_vectors, 'dummy_embeddings')
    # Columns are sorted on both sides so that column order cannot fail the comparison.
    expected_sorted = expected_df_with_embeddings.sort_index(axis=1)
    actual_sorted = result_df.sort_index(axis=1)
    pdt.assert_frame_equal(expected_sorted, actual_sorted)
def test_add_embeddings_to_corpus_df_with_emb_from_list_raises_error(self, list_of_sentences):
    """Passing the embeddings as a plain list must raise ``TypeError``.

    The fixture path is resolved *before* entering ``pytest.raises``: with
    ``FIXTURES_DIR`` unset, ``os.path.join(None, ...)`` raises ``TypeError``
    on its own, which would satisfy the expectation without
    ``add_embeddings_to_corpus_df`` ever being called.
    """
    fixtures_dir = os.getenv("FIXTURES_DIR")
    assert fixtures_dir is not None, \
        'Please set the FIXTURES_DIR environment variable to run this test.'
    csv_path = os.path.join(fixtures_dir, 'dummy_sentences.csv')
    # A list wrapping a tuple of arrays: deliberately not an ndarray or a path.
    embeddings_as_list = [(np.array((1.0, 2.0, 3.0)), np.array((4.0, 5.0, 6.0)), np.array((7.0, 8.0, 9.0)))]
    embedder = Embedder(list_of_sentences)
    # Only the call under test sits inside the context manager, so nothing
    # else can produce the expected exception.
    with pytest.raises(TypeError):
        embedder.add_embeddings_to_corpus_df(csv_path, embeddings_as_list, 'dummy_embeddings')
def test_add_embeddings_to_corpus_df_with_emb_from_npy(self, list_of_sentences, expected_df_with_embeddings):
    """Corpus passed as a DataFrame, embeddings passed as a ``.npy`` fixture path.

    The returned frame must equal ``expected_df_with_embeddings``.
    """
    corpus_embedder = Embedder(list_of_sentences)
    corpus_df = pd.DataFrame({'dummy_sentences': ['First sentence.', 'Second sentence.', 'Third sentence.']})
    npy_path = os.path.join(os.getenv("FIXTURES_DIR"), 'dummy_embeddings.npy')
    result_df = corpus_embedder.add_embeddings_to_corpus_df(corpus_df, npy_path, 'dummy_embeddings')
    # Columns are sorted on both sides so that column order cannot fail the comparison.
    expected_sorted = expected_df_with_embeddings.sort_index(axis=1)
    actual_sorted = result_df.sort_index(axis=1)
    pdt.assert_frame_equal(expected_sorted, actual_sorted)
def test_add_embeddings_to_corpus_df_from_df(self, list_of_sentences, expected_df_with_embeddings):
    """Corpus passed as a DataFrame, embeddings passed as an in-memory array.

    The returned frame must equal ``expected_df_with_embeddings``.
    """
    corpus_embedder = Embedder(list_of_sentences)
    corpus_df = pd.DataFrame({'dummy_sentences': ['First sentence.', 'Second sentence.', 'Third sentence.']})
    dummy_vectors = np.array(((1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0)))
    result_df = corpus_embedder.add_embeddings_to_corpus_df(corpus_df, dummy_vectors, 'dummy_embeddings')
    # Columns are sorted on both sides so that column order cannot fail the comparison.
    expected_sorted = expected_df_with_embeddings.sort_index(axis=1)
    actual_sorted = result_df.sort_index(axis=1)
    pdt.assert_frame_equal(expected_sorted, actual_sorted)
def test_compute_word2vec_embeddings_when_tfidf_weights_is_false(
        self, list_of_sentences, expected_w2v_embeddings_tfidf_false
):
    """Unweighted Word2Vec embeddings match the stored expectation exactly.

    The test insists on ``PYTHONHASHSEED=123`` and passes ``workers=1``,
    ``seed=42`` and ``hashfxn=hash`` so the run is reproducible.
    """
    hash_seed = os.getenv("PYTHONHASHSEED")
    assert hash_seed == "123", (
        'Please set PYTHONHASHSEED environment variable to 123, or else the test will not be deterministically '
        'reproducible.'
    )

    w2v_embedder = Embedder(list_of_sentences)
    vectors, _, _ = w2v_embedder.compute_word2vec_embeddings(
        tfidf_weights=False, workers=1, seed=42, hashfxn=hash)

    sentence_count = len(list_of_sentences)
    assert sentence_count == len(vectors)
    assert vectors.shape == (sentence_count, 300)
    np.testing.assert_array_equal(expected_w2v_embeddings_tfidf_false, vectors)
def test_end_to_end_runner():
    """Exercise scraping, sentence splitting, embedding and querying together.

    The fixture corpus is scraped, split into sentences and embedded with
    four models. Two queries are then run against every model, and after an
    ascending sort on each metric column the first row must hold the
    expected sentence.
    """
    def _sort_by_metric(frame, metric_colname):
        # Ascending in-place sort followed by an index reset, so that label 0
        # refers to the row with the smallest metric value.
        frame.sort_values(metric_colname, ascending=True, inplace=True)
        frame.reset_index(inplace=True, drop=True)

    eggs_sentence = 'Mr Michael went to the store to buy some eggs.'
    elections_sentence = "Take a look, then, at Tuesday's elections in New York City, New Jersey and Virginia:"

    # --- Build the sentence-level corpus from the fixtures -----------------
    fixtures_dir = os.getenv("FIXTURES_DIR")
    scraper = DocumentScraper(fixtures_dir, os.path.join(fixtures_dir, 'words_to_replace.json'))
    pages_df = scraper.document_corpus_to_pandas_df()
    df_by_sentence = CorpusGenerator(pages_df).df_by_page_to_df_by_sentence()
    list_of_sentences = df_by_sentence['sentence'].values.tolist()
    assert list_of_sentences == [
        eggs_sentence,
        'Joel rolled down the street on his skateboard.',
        'test / this is a first sentence',
        elections_sentence,
    ]

    # --- Compute embeddings: weighted Word2Vec, Word2Vec, BERT, ELMo -------
    embedder = Embedder(list_of_sentences)

    vectors, w2v_tfidf, tfidf_vectorizer = embedder.compute_word2vec_embeddings(tfidf_weights=True)
    df_by_sentence = embedder.add_embeddings_to_corpus_df(
        df_by_sentence, vectors, 'Word2Vec_with_TfIdf_weights')

    vectors, w2v, _ = embedder.compute_word2vec_embeddings(tfidf_weights=False)
    df_by_sentence = embedder.add_embeddings_to_corpus_df(df_by_sentence, vectors, 'Word2Vec')

    vectors, bert = embedder.compute_bert_embeddings('bert-base-nli-stsb-mean-tokens')
    df_by_sentence = embedder.add_embeddings_to_corpus_df(df_by_sentence, vectors, 'BERT')

    vectors, elmo = embedder.compute_elmo_embeddings()
    df_by_sentence = embedder.add_embeddings_to_corpus_df(df_by_sentence, vectors, 'ELMo_layer_3')

    # --- Query 1: the first corpus sentence itself --------------------------
    first_query = list_of_sentences[0]
    w2v_emb, df_by_sentence = query_embeddings(
        first_query, df_by_sentence, 'Word2Vec', 'Word2Vec', w2v,
        metric_colname='w2v_distance_test1')
    w2v_tfidf_emb, df_by_sentence = query_embeddings(
        first_query, df_by_sentence, 'Word2Vec_with_TfIdf_weights', 'Word2Vec_TfIdf_weighted', w2v_tfidf,
        metric_colname='w2v_tfidf_weighted_distance_test1', tfidf_vectorizer=tfidf_vectorizer)
    elmo_emb, df_by_sentence = query_embeddings(
        first_query, df_by_sentence, 'ELMo_layer_3', 'ELMo', elmo,
        metric_colname='elmo_distance_test1')
    bert_emb, df_by_sentence = query_embeddings(
        first_query, df_by_sentence, 'BERT', 'BERT', bert,
        metric_colname='bert_distance_test1')

    _sort_by_metric(df_by_sentence, 'w2v_distance_test1')
    assert df_by_sentence['sentence'][0] == eggs_sentence
    np.testing.assert_array_equal(w2v_emb, df_by_sentence['Word2Vec'][0])

    _sort_by_metric(df_by_sentence, 'w2v_tfidf_weighted_distance_test1')
    assert df_by_sentence['sentence'][0] == eggs_sentence
    np.testing.assert_array_equal(w2v_tfidf_emb, df_by_sentence['Word2Vec_with_TfIdf_weights'][0])

    _sort_by_metric(df_by_sentence, 'elmo_distance_test1')
    assert df_by_sentence['sentence'][0] == eggs_sentence
    # The ELMo query embedding is intentionally not compared with the stored
    # one: that check does not work, see
    # https://github.com/allenai/allennlp/issues/3995

    _sort_by_metric(df_by_sentence, 'bert_distance_test1')
    assert df_by_sentence['sentence'][0] == eggs_sentence
    np.testing.assert_array_almost_equal(bert_emb, df_by_sentence['BERT'][0])

    # --- Query 2: a phrase that occurs in only one sentence ------------------
    second_query = "New York"
    _, df_by_sentence = query_embeddings(
        second_query, df_by_sentence, 'Word2Vec', 'Word2Vec', w2v,
        metric_colname='w2v_distance_test2')
    _, df_by_sentence = query_embeddings(
        second_query, df_by_sentence, 'Word2Vec_with_TfIdf_weights', 'Word2Vec_TfIdf_weighted', w2v_tfidf,
        metric_colname='w2v_tfidf_weighted_distance_test2', tfidf_vectorizer=tfidf_vectorizer)
    _, df_by_sentence = query_embeddings(
        second_query, df_by_sentence, 'ELMo_layer_3', 'ELMo', elmo,
        metric_colname='elmo_distance_test2')
    _, df_by_sentence = query_embeddings(
        second_query, df_by_sentence, 'BERT', 'BERT', bert,
        metric_colname='bert_distance_test2')

    second_round_metrics = (
        'w2v_distance_test2',
        'w2v_tfidf_weighted_distance_test2',
        'elmo_distance_test2',
        'bert_distance_test2',
    )
    for metric_colname in second_round_metrics:
        _sort_by_metric(df_by_sentence, metric_colname)
        assert df_by_sentence['sentence'][0] == elections_sentence
MODELS_DIR = os.getenv('MODELS_DIR') CONFIG_DIR = os.getenv('CONFIG_DIR') LOGGING_CONFIG = os.getenv('LOGGING_CONFIG') with open(LOGGING_CONFIG, 'r') as f: config = yaml.safe_load(f) logging.config.dictConfig(config) with open(os.path.join(CONFIG_DIR, 'filenames.json'), 'r') as f: file_names = json.load(f) corpus_filename = "corpus_by_sentence.csv" corpus_by_sentence = pd.read_csv(os.path.join(DATA_DIR, "processed", corpus_filename)) list_of_sentences = corpus_by_sentence['sentence'].values.tolist() print("Instantiating Embedder class.") embedder = Embedder(list_of_sentences) for model in models_to_be_run: print(f"Calculating {model} embeddings.") if model == 'Word2Vec_tfidf_weighted': sentence_embeddings, model_obj, tfidf_vectorizer = embedder.compute_word2vec_embeddings(tfidf_weights=True) embedder.save_model(tfidf_vectorizer, MODELS_DIR, file_names[model]['vectorizer_filename']) # the line above is specific to Word2Vec with TfIdf vectorizer and cannot be generalized to other models elif model == 'Word2Vec': sentence_embeddings, model_obj, _ = embedder.compute_word2vec_embeddings(tfidf_weights=False) elif model == 'BERT': bert_model = 'bert-base-nli-stsb-mean-tokens' # This line is specific to BERT sentence_embeddings, model_obj = embedder.compute_bert_embeddings(bert_model) elif model == 'ELMo': sentence_embeddings, model_obj = embedder.compute_elmo_embeddings() else:
def test_compute_bert_embeddings_contains_empty_sentence(self):
    """BERT output has shape (2, 768) when one of two sentences is empty."""
    sentences_with_blank = ['Sentence one.', '']
    bert_embedder = Embedder(sentences_with_blank)
    vectors, _model = bert_embedder.compute_bert_embeddings(model='bert-base-nli-stsb-mean-tokens')
    expected_shape = (2, 768)
    assert vectors.shape == expected_shape
def test_compute_bert_embeddings_is_empty_sentence(self):
    """An empty sentence list yields an empty BERT embedding array."""
    bert_embedder = Embedder([])
    vectors, _model = bert_embedder.compute_bert_embeddings(model='bert-base-nli-stsb-mean-tokens')
    assert len(vectors) == 0
    empty_reference = np.array([], dtype=np.float64)
    np.testing.assert_array_equal(empty_reference, vectors)
def test_compute_bert_embeddings(self, list_of_sentences, expected_bert_embeddings):
    """BERT embeddings have one 768-wide row per sentence and match the fixture.

    Values are compared to five decimal places.
    """
    bert_embedder = Embedder(list_of_sentences)
    vectors, _model = bert_embedder.compute_bert_embeddings(model='bert-base-nli-stsb-mean-tokens')
    sentence_count = len(list_of_sentences)
    assert sentence_count == len(vectors)
    assert vectors.shape == (sentence_count, 768)
    np.testing.assert_array_almost_equal(expected_bert_embeddings, vectors, decimal=5)
def test_compute_elmo_embeddings_is_empty_sentence(self):
    """An empty sentence list yields an empty ELMo embedding array."""
    elmo_embedder = Embedder([])
    vectors, _model = elmo_embedder.compute_elmo_embeddings()
    assert len(vectors) == 0
    empty_reference = np.array([], dtype=np.float64)
    np.testing.assert_array_equal(empty_reference, vectors)
def test_compute_elmo_embeddings(self, list_of_sentences, expected_elmo_embeddings):
    """ELMo embeddings have one 1024-wide row per sentence and match the fixture.

    Values are compared with ``assert_array_almost_equal`` at its default
    precision.
    """
    elmo_embedder = Embedder(list_of_sentences)
    vectors, _model = elmo_embedder.compute_elmo_embeddings()
    sentence_count = len(list_of_sentences)
    assert sentence_count == len(vectors)
    assert vectors.shape == (sentence_count, 1024)
    np.testing.assert_array_almost_equal(expected_elmo_embeddings, vectors)
def test_compute_word2vec_embeddings_is_empty_sentence_raises_error(self):
    """Word2Vec embedding of an empty sentence list must raise ``RuntimeError``."""
    no_sentences = []
    empty_embedder = Embedder(no_sentences)
    with pytest.raises(RuntimeError):
        empty_embedder.compute_word2vec_embeddings()
def test_class_instantiation(self, list_of_sentences):
    """The ``list_of_sentences`` attribute equals the list given to the constructor."""
    instance = Embedder(list_of_sentences)
    stored_sentences = instance.list_of_sentences
    assert stored_sentences == list_of_sentences