class Word_Embeddings_Differential_Feature_Generator_Unittests( unittest.TestCase): def setUp(self): self._config_parser = getConfig() self._db = DB() self._db.setUp() self._model = Word_Embedding_Differential_Feature_Generator(self._db) self._posts = [] self._author = None self._set_author(u'test_user') def tearDown(self): self._db.session.close() self._db.deleteDB() self._db.session.close() def test_simple_case(self): self._add_post(u'of to a for', u'of is') self._add_target_article(u'0', u'of was ', u'am that was') self._setup_test() is_vec1 = self._get_word_dimension(u'is', 0) was_vec_d1 = self._get_word_dimension(u'was', 0) expected_val = was_vec_d1 - is_vec1 db_val = self._db.get_author_feature( self._author.author_guid, u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d0" ).attribute_value self.assertAlmostEqual(float(db_val), expected_val, places=4) expected_val = was_vec_d1 - is_vec1 db_val = self._db.get_author_feature( self._author.author_guid, u"word_embeddings_differential_np.mean_target_articles_title_to_posts_content_d0" ).attribute_value self.assertAlmostEqual(float(db_val), expected_val, places=4) is_vec = self._words[u'is'] was_vec = self._words['was'] expected_val = commons.euclidean_distance(is_vec, was_vec) db_val = self._db.get_author_feature( self._author.author_guid, u"word_embeddings_differential_distance_function_euclidean_distance_target_articles_title_np.mean_TO_posts_content_np.mean" ).attribute_value self.assertAlmostEqual(float(db_val), expected_val, places=4) def test_few_words(self): self._add_post(u'of to a for', u'of is on') self._add_target_article(u'0', u'of was that', u'am that was') self._setup_test() dimension = 0 is_vec1 = self._get_word_dimension(u'is', dimension) on_vec1 = self._get_word_dimension(u'on', dimension) tot1 = is_vec1 + on_vec1 was_vec_d1 = self._get_word_dimension(u'was', dimension) that_vec_d1 = self._get_word_dimension(u'that', dimension) tot2 = was_vec_d1 + that_vec_d1 expected_val = tot2 - tot1 db_val = self._db.get_author_feature( self._author.author_guid, u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d" + str(dimension)).attribute_value self.assertAlmostEqual(float(db_val), expected_val, places=4) dimension = 140 is_vec1 = self._get_word_dimension(u'is', dimension) on_vec1 = self._get_word_dimension(u'on', dimension) tot1 = is_vec1 + on_vec1 was_vec_d1 = self._get_word_dimension(u'was', dimension) that_vec_d1 = self._get_word_dimension(u'that', dimension) tot2 = was_vec_d1 + that_vec_d1 expected_val = tot2 - tot1 db_val = self._db.get_author_feature( self._author.author_guid, u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d" + str(dimension)).attribute_value self.assertAlmostEqual(float(db_val), expected_val, places=4) def test_opposite(self): self._add_post(u'am that was', u'of was that') self._add_target_article(u'0', u'of is on', u'of to a for') self._setup_test() dimension = 0 is_vec1 = self._get_word_dimension(u'is', dimension) on_vec1 = self._get_word_dimension(u'on', dimension) tot1 = is_vec1 + on_vec1 was_vec_d1 = self._get_word_dimension(u'was', dimension) that_vec_d1 = self._get_word_dimension(u'that', dimension) tot2 = was_vec_d1 + that_vec_d1 expected_val = tot1 - tot2 db_val = self._db.get_author_feature( self._author.author_guid, u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d" + str(dimension)).attribute_value self.assertAlmostEqual(float(db_val), expected_val, places=4) def test_empty_word(self): self._add_post(u'of to a for', u'') self._add_target_article(u'0', u'of was that', u'am that was') self._setup_test() self.assertTrue(True) def _add_post(self, title, content): post = Post() post.author = self._author.author_full_name post.author_guid = self._author.author_guid post.content = content post.title = title post.domain = u'test' post.post_id = len(self._posts) post.guid = post.post_id self._db.addPost(post) self._posts.append(post) def _set_author(self, author_guid): author = Author() author.author_guid = author_guid author.author_full_name = u'name' + author_guid author.name = u'name' + author_guid author.domain = u'test' self._db.add_author(author) self._author = author def _setup_test(self): self._db.session.commit() self._word_embedding_model_creator = GloveWordEmbeddingModelCreator( self._db) self._word_embedding_model_creator.execute(None) params = {'authors': [self._author], 'posts': self._posts} self._model = Word_Embedding_Differential_Feature_Generator( self._db, **params) self._model.execute() self._words = self._db.get_word_embedding_dictionary() def _get_word_dimension(self, word, dimension): word_vec = self._words[word] return word_vec[dimension] def _add_target_article(self, post_id, title, description): target_article = Target_Article() target_article.author_guid = u'test_user' target_article.post_id = post_id target_article.title = title target_article.description = description target_article.keywords = 'temp, lala, fafa' self._db.add_target_articles([target_article]) def _add_target_article_item(self, post_id, type, content, author_guid): article_item = Target_Article_Item() article_item.post_id = post_id article_item.type = type article_item.item_number = 3 article_item.content = content article_item.author_guid = author_guid self._db.addPosts([article_item])
class TestGensimWordEmbeddingsModelTrainer(TestCase): def setUp(self): self._config_parser = getConfig() self._db = DB() self._db.setUp() # self._Word_Embedding_Model_Creator.execute(None) self._is_load_wikipedia_300d_glove_model = True self._wikipedia_model_file_path = "data/input/glove/test_glove.6B.300d_small.txt" self._table_name = "wikipedia_model_300d" self._word_vector_dict_full_path = "data/output/word_embedding/" self._word_vector_dict = {} self._author = None self._set_author(u'test_user') self._counter = 0 self._posts = [] def tearDown(self): self._db.session.close() def test_add_additional_fields_to_existing_table(self): self._add_post(u'was', u'is') self._add_post(u'is', u'was') self._db.session.commit() self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer( self._db) self._word_embedding_model_creator.execute(None) self._word_embedding_model_creator._aggregation_functions_names = [ 'sum' ] self._word_embedding_model_creator.execute(None) file_output_path = self._word_embedding_model_creator._saved_models_path + self._word_embedding_model_creator._table_name + ".csv" data = pd.DataFrame.from_csv(file_output_path) word_embedding_results = data.loc[(data['author_id'] == 'test_user') & (data['table_name'] == u'posts') & (data['targeted_field_name'] == u'content')] sum_value_df = word_embedding_results.loc[ word_embedding_results[u'word_embedding_type'] == u'sum'] mean_value_df = word_embedding_results.loc[ word_embedding_results[u'word_embedding_type'] == u'np.mean'] try: if len(sum_value_df.values.tolist()) > 0 and len( mean_value_df.values.tolist()) > 0: self.assertTrue(True) else: self.fail() except: self.fail() def test_case_post_represent_by_posts(self): self._add_post(u'post1', u'the claim', u'Claim') self._add_post(u'post2', u'dog cat pig man') # 2 self._add_post(u'post3', u'TV is the best guys') # 1 self._add_claim_tweet_connection(u'post1', u'post2') self._add_claim_tweet_connection(u'post1', u'post3') self._db.session.commit() self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer( self._db) self._word_embedding_model_creator._targeted_fields_for_embedding = [{ 'source': { 'table_name': 'posts', 'id': 'post_id' }, 'connection': { 'table_name': 'claim_tweet_connection', 'source_id': 'claim_id', 'target_id': 'post_id' }, 'destination': { 'table_name': 'posts', 'id': 'post_id', 'target_field': 'content', "where_clauses": [] } }] self._word_embedding_model_creator.execute(None) model_name_path = self._word_embedding_model_creator._prepare_model_name_path( ) model = Word2Vec.load(model_name_path) word_vector_dict = self._word_embedding_model_creator._get_word_embedding_dict( model) self._words = word_vector_dict self._words_vectors = self._get_posts_val() expected_val = self._calc_results() self._generic_test(expected_val, u'post1') def _setup_test(self): self._db.session.commit() self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer( self._db) self._word_embedding_model_creator.execute(None) self._words = self._db.get_word_embedding_dictionary() self._words_vectors = self._get_posts_val() def _generic_test(self, expected_value, source_id=u""): if source_id == u"": source_id = self._author.author_guid file_output_path = self._word_embedding_model_creator._saved_models_path + self._word_embedding_model_creator._table_name + ".csv" data = pd.DataFrame.from_csv(file_output_path) word_embedding_results = data.loc[(data['author_id'] == source_id) & (data['table_name'] == u'posts') & (data['targeted_field_name'] == u'content')] self.assert_word_embedding(word_embedding_results, expected_value, u'min') self.assert_word_embedding(word_embedding_results, expected_value, u'max') self.assert_word_embedding(word_embedding_results, expected_value, u'np.mean') def assert_word_embedding(self, db_results, expected_value, type): result_value = db_results.loc[db_results[u'word_embedding_type'] == type, '0':].values.tolist()[0] self.assertEquals(list(expected_value[type]), result_value) def _generic_non_equal_test(self, expected_value): db_results = self._db.get_author_word_embedding( self._author.author_guid, u'posts', u'content') self.assertNotEqual(expected_value[u'min'], db_results[u'min']) self.assertNotEqual(expected_value[u'max'], db_results[u'max']) self.assertNotEqual(expected_value[u'np.mean'], db_results[u'np.mean']) def _set_author(self, author_guid): author = Author() author.author_guid = author_guid author.author_full_name = u'name' + author_guid author.name = u'name' + author_guid author.domain = u'test' self._db.add_author(author) self._author = author def _add_post(self, title, content, _domain=u'Microblog'): post = Post() post.author = self._author.author_full_name post.author_guid = self._author.author_guid post.content = content post.title = title post.domain = _domain post.post_id = title post.guid = title self._db.addPost(post) self._posts.append(post) def _get_posts_val( self): # return the vectors for all the words in the added posts vals = {} for post in self._posts: for word in post.content.split(): if word in self._words.keys(): vals[word] = self._words[word] return vals.values() def _calc_mean(self, vectors): vectors = self._get_posts_val() if len(vectors) == 0: return (0, ) * 300 ziped_vec = zip(*vectors) result = map(eval('np.mean'), ziped_vec) return tuple(result) def _calc_min(self, vectors): vectors = self._get_posts_val() if len(vectors) == 0: return (0, ) * 300 ziped_vec = zip(*vectors) result = map(eval('min'), ziped_vec) return tuple(result) def _calc_max(self, vectors): vectors = self._get_posts_val() if len(vectors) == 0: return (0, ) * 300 ziped_vec = zip(*vectors) result = map(eval('max'), ziped_vec) return tuple(result) def _calc_results(self): vectors = self._words_vectors results = {} results[u'min'] = self._calc_min(vectors) results[u'max'] = self._calc_max(vectors) results[u'np.mean'] = self._calc_mean(vectors) return results def _add_target_article(self, post_id, title, description, author_guid): target_article = Target_Article() target_article.author_guid = author_guid target_article.post_id = post_id target_article.title = title target_article.description = description target_article.keywords = 'temp, lala, fafa' self._db.add_target_articles([target_article]) def _add_target_article_item(self, post_id, type, content, author_guid): article_item = Target_Article_Item() article_item.post_id = post_id article_item.type = type article_item.item_number = 3 article_item.content = content article_item.author_guid = author_guid self._db.addPosts([article_item]) def _add_claim_tweet_connection(self, claim_id, post_id): connection = Claim_Tweet_Connection() connection.claim_id = claim_id connection.post_id = post_id self._db.add_claim_connections([connection]) pass