class TestClaimToTopicConverter(TestCase):
    """Tests for ClaimToTopicConverter.

    Covers generation of the topics/terms tables from claims, the
    post->topic mapping, the author->topic mapping, and the full
    ``execute()`` pipeline, against a fresh test DB per test.
    """

    def setUp(self):
        # Fresh DB per test; one default author that owns most test posts.
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._claim_dictionary = {}
        self._authors = []
        self._add_author(u'test author')
        self._preprocess_visualization = ClaimToTopicConverter(self._db)

    def tearDown(self):
        self._db.session.close_all()
        self._db.deleteDB()
        self._db.session.close()

    def test_generate_topics_no_topics(self):
        # No claims in the DB -> no topics should be generated.
        self._preprocess_visualization.generate_topics_tables()
        topics = self._db.get_topics()
        self.assertEqual(topics, [])

    def test_generate_topics_from_1_claim(self):
        self._add_claim(u'claim1', u'claim1 content')
        self._db.session.commit()
        self._preprocess_visualization.generate_topics_tables()
        self.assertTopicInserted(u'claim1', [u'claim1', u'content'])

    def test_generate_topics_from_5_claims(self):
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim2 content')
        self._add_claim(u'claim3', u'claim3 content move')
        self._add_claim(u'claim4', u'claim4 dif data')
        self._add_claim(u'claim5', u'claim5 some boring text')
        self._db.session.commit()
        self._preprocess_visualization.generate_topics_tables()
        self.assertTopicInserted(u'claim1', [u'claim1', u'content'])
        self.assertTopicInserted(u'claim2', [u'claim2', u'content'])
        self.assertTopicInserted(u'claim3', [u'claim3', u'content', u'move'])
        self.assertTopicInserted(u'claim4', [u'claim4', u'dif', u'data'])
        self.assertTopicInserted(u'claim5', [u'claim5', u'some', u'boring', u'text'])

    def test_generate_post_topic_mapping_no_claim(self):
        self._preprocess_visualization.generate_post_topic_mapping()
        mappings = self._db.get_post_topic_mapping()
        self.assertEqual(0, len(mappings))

    def test_generate_post_topic_mapping_1_claim(self):
        self._add_claim(u'claim1', u'claim1 content')
        self._add_post(u"test author", u'post1', u'post1 content of data', u'Microblog')
        self._add_post(u"test author", u'post2', u'post2 bla bla', u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting new', u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post3')
        self._db.session.commit()
        self._preprocess_visualization.generate_topics_tables()
        self._preprocess_visualization.generate_post_topic_mapping()
        mappings = self._db.get_post_topic_mapping()
        mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist) for tm in mappings]
        topic_id = self._preprocess_visualization.get_claim_id_topic_dictionary()[u'claim1']
        self.assertEqual(3, len(mappings))
        # All three posts belong to the single claim's topic with full weight.
        self.assertSetEqual(
            {('post1', topic_id, 1.0), ('post2', topic_id, 1.0), ('post3', topic_id, 1.0)},
            set(mappings))

    def test_generate_post_topic_mapping_2_claim(self):
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim1 content')
        self._add_post(u"test author", u'post1', u'post1 content of data', u'Microblog')
        self._add_post(u"test author", u'post2', u'post2 bla bla', u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting new', u'Microblog')
        self._add_post(u"test author", u'post4', u'post4 bla bla', u'Microblog')
        self._add_post(u"test author", u'post5', u'post5 noting new', u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post3')
        self._add_claim_tweet_connection(u'claim2', u'post4')
        self._add_claim_tweet_connection(u'claim2', u'post5')
        self._db.session.commit()
        self._preprocess_visualization.generate_topics_tables()
        self._preprocess_visualization.generate_post_topic_mapping()
        mappings = self._db.get_post_topic_mapping()
        mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist) for tm in mappings]
        topic_id1 = self._preprocess_visualization.get_claim_id_topic_dictionary()[u'claim1']
        topic_id2 = self._preprocess_visualization.get_claim_id_topic_dictionary()[u'claim2']
        self.assertEqual(5, len(mappings))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0), ('post3', topic_id1, 1.0),
             ('post4', topic_id2, 1.0), ('post5', topic_id2, 1.0)},
            set(mappings))

    def test__generate_author_topic_mapping_2_claim(self):
        self._add_author(u'test author2')
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim1 content')
        self._add_post(u"test author", u'post1', u'post1 content of data', u'Microblog')
        self._add_post(u"test author", u'post2', u'post2 bla bla', u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting new', u'Microblog')
        self._add_post(u"test author", u'post4', u'post4 bla bla', u'Microblog')
        self._add_post(u"test author", u'post5', u'post5 noting new', u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post3')
        self._add_claim_tweet_connection(u'claim2', u'post4')
        self._add_claim_tweet_connection(u'claim2', u'post5')
        self._db.session.commit()
        self._preprocess_visualization._domain = u"Microblog"
        self._preprocess_visualization.generate_topics_tables()
        self._preprocess_visualization.generate_post_topic_mapping()
        self._preprocess_visualization.generate_author_topic_mapping()
        mapping = self._db.get_author_topic_mapping()
        self.assertEqual(2, len(mapping))
        # 'test author' has 3/5 posts on topic1 and 2/5 on topic2;
        # 'test author2' has no posts at all.
        self.assertSetEqual(
            {(u'test author', 0.6, 0.4), (u'test author2', 0, 0)},
            set(mapping))

    def test_visualization(self):
        # End-to-end run of execute() with two authors and two claims.
        self._add_author(u'test author2', u"bad_actor")
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim2 content')
        self._add_post(u"test author", u'post1', u'post1 content of data', u'Microblog')
        self._add_post(u"test author", u'post2', u'post2 bla bla', u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting new', u'Microblog')
        self._add_post(u"test author2", u'post4', u'post4 bla bla', u'Microblog')
        self._add_post(u"test author2", u'post5', u'post5 noting new', u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post4')
        self._add_claim_tweet_connection(u'claim2', u'post3')
        self._add_claim_tweet_connection(u'claim2', u'post5')
        self._db.session.commit()
        self._preprocess_visualization._domain = u"Microblog"
        self._preprocess_visualization.execute()
        author_topic_mapping = self._db.get_author_topic_mapping()
        post_topic_mappings = self._db.get_post_topic_mapping()
        post_topic_mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                               for tm in post_topic_mappings]
        topic_id1 = self._preprocess_visualization.get_claim_id_topic_dictionary()[u'claim1']
        topic_id2 = self._preprocess_visualization.get_claim_id_topic_dictionary()[u'claim2']
        self.assertEqual(2, len(author_topic_mapping))
        self.assertSetEqual(
            {(u'test author', 0.666666666667, 0.333333333333),
             (u'test author2', 0.5, 0.5)},
            set(author_topic_mapping))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0), ('post3', topic_id2, 1.0),
             ('post4', topic_id1, 1.0), ('post5', topic_id2, 1.0)},
            set(post_topic_mappings))

    def assertTopicInserted(self, claim_id, expected_terms):
        """Assert that the topic generated for *claim_id* consists exactly of
        *expected_terms* (and of the cleaned claim-description words)."""
        topics = self._db.get_topics()
        terms = self._db.get_terms()
        topic_dict = defaultdict(set)
        term_dict = {term.term_id: term.description for term in terms}
        for topic_id, term_id, prob in topics:
            topic_dict[topic_id].add(term_dict[term_id])
        topic_id = self._preprocess_visualization.get_claim_id_topic_dictionary()[claim_id]
        claim = self._claim_dictionary[claim_id]
        expected = set(clean_tweet(claim.description).split(' '))
        self.assertIn(topic_id, topic_dict)
        self.assertSetEqual(expected, topic_dict[topic_id])
        self.assertSetEqual(set(expected_terms), topic_dict[topic_id])

    def _add_author(self, author_guid, type=u"good_actor"):
        """Insert an Author into the DB (``type`` kept despite shadowing the
        builtin, to preserve the helper's existing keyword interface)."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = author_guid
        author.author_screen_name = author_guid
        author.name = author_guid
        author.domain = u'Microblog'
        author.author_type = type
        self._db.add_author(author)
        self._authors.append(author)

    def _add_post(self, author_guid, title, content, domain=u'Microblog'):
        """Insert a Post whose id/guid are its title, owned by *author_guid*."""
        post = Post()
        post.author = author_guid
        post.author_guid = author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.is_detailed = True
        post.is_LB = False
        self._db.addPost(post)
        self._posts.append(post)

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a post via a Claim_Tweet_Connection row."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])

    def _add_claim(self, claim_id, content, date_str=u"2017-06-14 05:00:00",
                   keywords=u"", post_type=None):
        """Insert a Claim and remember it in ``self._claim_dictionary``."""
        claim = Claim()
        claim.claim_id = claim_id
        claim.verdict = post_type
        claim.title = claim_id
        claim.description = content
        claim.verdict_date = convert_str_to_unicode_datetime(date_str)
        claim.keywords = keywords
        claim.url = u"claim url"
        self._db.addPost(claim)
        self._claim_dictionary[claim.claim_id] = claim
class TestBehaviorFeatureGenerator(TestCase):
    """Tests for BehaviorFeatureGenerator (posting-rate and retweet features)
    and for ArgumentParser's source/connection/destination where-clauses."""

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._author = None

    def tearDown(self):
        self._db.session.close()

    # ---------------- Average minutes between posts ----------------

    def test_average_minutes_between_posts_no_post_expected_0(self):
        self._add_author(u"author_guid")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        self._behavior_feature_generator._features = ['average_minutes_between_posts']
        self._behavior_feature_generator._targeted_fields = [{
            'source': {"table_name": "posts", "id": "author_guid",
                       "target_field": "content",
                       "where_clauses": [{"field_name": 1, "value": 1}]},
            "connection": {},
            "destination": {}
        }]
        result = self._behavior_feature_generator.average_minutes_between_posts(
            **{'posts': self._posts})
        self.assertEqual(0, result)

    def test_average_minutes_between_posts_one_post_expected_0(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {"table_name": "posts", "id": "author_guid",
                       "target_field": "content",
                       "where_clauses": [{"field_name": 1, "value": 1}]},
            "connection": {},
            "destination": {}
        }]
        self._behavior_feature_generator.execute()
        result_feature = self._db.get_author_feature(
            u"author_guid",
            u"BehaviorFeatureGenerator_average_minutes_between_posts")
        feature_value = getattr(result_feature, u'attribute_value')
        self.assertEqual('0', feature_value)

    def test_average_minutes_between_posts_3_post_expected_105(self):
        # Gaps of 60 and 150 minutes -> average of 105.
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-12 06:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-12 08:30:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        result = self._behavior_feature_generator.average_minutes_between_posts(
            **{'posts': self._posts})
        self.assertEqual(105, result)

    # ---------------- Average posts per day ----------------

    def test_average_posts_per_day_active_days_no_posts_expect_0(self):
        self._add_author(u"author_guid")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        result = self._behavior_feature_generator.average_posts_per_day_active_days(
            **{'posts': self._posts})
        self.assertEqual(0, result)

    def test_average_posts_per_day_1_active_days_1_post_each_expect_1(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        result = self._behavior_feature_generator.average_posts_per_day_active_days(
            **{'posts': self._posts})
        # Fixed: the third positional arg of assertAlmostEqual is `places`
        # (an int); a float there raises TypeError on mismatch. Use delta=.
        self.assertAlmostEqual(1.0, result, delta=0.0000001)

    def test_average_posts_per_day_3_active_days_1_post_each_expect_1(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-16 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        result = self._behavior_feature_generator.average_posts_per_day_active_days(
            **{'posts': self._posts})
        # Fixed: was a float passed positionally as `places` (see above).
        self.assertAlmostEqual(1.0, result, delta=0.0000001)

    def test_average_posts_per_day_3_active_days_1_first_2_secound_3_third_expect_2(self):
        # 1 + 2 + 3 posts over 3 active days -> average of 2 per active day.
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        result = self._behavior_feature_generator.average_posts_per_day_active_days(
            **{'posts': self._posts})
        self.assertEqual(2.0, result)

    def test_average_posts_per_day_3_active_days_1_first_2_secound_3_third_expect_2_represent_by_post(self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-10 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00", u"Not Microblog")
        self._add_claim_tweet_connection(u"post0", u"post1")
        self._add_claim_tweet_connection(u"post0", u"post2")
        self._add_claim_tweet_connection(u"post0", u"post3")
        self._add_claim_tweet_connection(u"post0", u"post4")
        self._add_claim_tweet_connection(u"post0", u"post5")
        self._add_claim_tweet_connection(u"post0", u"post6")
        self._add_claim_tweet_connection(u"post0", u"post7")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {'table_name': 'posts', 'id': 'post_id',
                       "where_clauses": [{"field_name": "domain", "value": "Claim"}]},
            'connection': {'table_name': 'claim_tweet_connection',
                           'source_id': 'claim_id', 'target_id': 'post_id',
                           # Only connected posts dated at/after the claim.
                           "where_clauses": [{"val1": "source.date",
                                              "val2": "dest.date", "op": "<="}]},
            'destination': {'table_name': 'posts', 'id': 'post_id',
                            'target_field': 'content',
                            "where_clauses": [{"field_name": "domain",
                                               "value": "Microblog"}]}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_average_posts_per_day_active_days")
        self.assertEqual(u'2.0', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_average_posts_per_day_total")
        self.assertGreater(float(author_feature.attribute_value), 0)

    # ---------------- Retweet counting ----------------

    def test_retweet_count_0_posts(self):
        self._add_author(u"author_guid")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {'table_name': 'authors', 'id': 'author_guid',
                       "target_field": "author_guid",
                       "where_clauses": [{"field_name": "1", "value": "1"}]},
            'connection': {},
            'destination': {}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'0', author_feature.attribute_value)

    def test_retweet_count_1_retweet(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"RT @content 1", "2017-06-12 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {'table_name': 'authors', 'id': 'author_guid',
                       "target_field": "author_guid",
                       "where_clauses": [{"field_name": "1", "value": "1"}]},
            'connection': {},
            'destination': {}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'1', author_feature.attribute_value)

    def test_retweet_count_3_retweet(self):
        # 6 Microblog posts connected to the claim, 3 of them contain "RT @".
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-10 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"RT @content 3 RT @hi", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5 bla RT @bla", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"RT @content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7 RT @wow", "2017-06-16 06:00:00", u"Not Microblog")
        self._add_claim_tweet_connection(u"post0", u"post1")
        self._add_claim_tweet_connection(u"post0", u"post2")
        self._add_claim_tweet_connection(u"post0", u"post3")
        self._add_claim_tweet_connection(u"post0", u"post4")
        self._add_claim_tweet_connection(u"post0", u"post5")
        self._add_claim_tweet_connection(u"post0", u"post6")
        self._add_claim_tweet_connection(u"post0", u"post7")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {'table_name': 'posts', 'id': 'post_id',
                       "where_clauses": [{"field_name": "domain", "value": "Claim"}]},
            'connection': {'table_name': 'claim_tweet_connection',
                           'source_id': 'claim_id', 'target_id': 'post_id'},
            'destination': {'table_name': 'posts', 'id': 'post_id',
                            'target_field': 'content',
                            "where_clauses": [{"field_name": "domain",
                                               "value": "Microblog"}]}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'3', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_average_retweets")
        self.assertEqual(u'0.5', author_feature.attribute_value)

    # NOTE(review): the three "received_retweets" tests below assert on the
    # BehaviorFeatureGenerator_retweet_count feature, not a dedicated
    # received-retweets feature — confirm against the generator's feature list.

    def test_received_retweets_count_0_retweets(self):
        self._add_author(u"author_guid")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {'table_name': 'authors', 'id': 'author_guid',
                       "target_field": "author_guid",
                       "where_clauses": [{"field_name": "1", "value": "1"}]},
            'connection': {},
            'destination': {}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'0', author_feature.attribute_value)

    def test_received_retweets_count_1_retweets(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"RT @author_guid content 1", "2017-06-12 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {'table_name': 'authors', 'id': 'author_guid',
                       "target_field": "author_guid",
                       "where_clauses": [{"field_name": "1", "value": "1"}]},
            'connection': {},
            'destination': {}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'1', author_feature.attribute_value)

    def test_received_retweets_count_3_retweets_only_from_microblog_tweets(self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-10 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"RT @author_guid content 3 RT @hi", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5 bla RT @author_guid", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"RT @content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7 RT @wow", "2017-06-16 06:00:00", u"Not Microblog")
        self._add_claim_tweet_connection(u"post0", u"post1")
        self._add_claim_tweet_connection(u"post0", u"post2")
        self._add_claim_tweet_connection(u"post0", u"post3")
        self._add_claim_tweet_connection(u"post0", u"post4")
        self._add_claim_tweet_connection(u"post0", u"post5")
        self._add_claim_tweet_connection(u"post0", u"post6")
        self._add_claim_tweet_connection(u"post0", u"post7")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {'table_name': 'posts', 'id': 'post_id',
                       "where_clauses": [{"field_name": "domain", "value": "Claim"}]},
            'connection': {'table_name': 'claim_tweet_connection',
                           'source_id': 'claim_id', 'target_id': 'post_id'},
            'destination': {'table_name': 'posts', 'id': 'post_id',
                            'target_field': 'content',
                            "where_clauses": [{"field_name": "domain",
                                               "value": "Microblog"}]}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'3', author_feature.attribute_value)

    # ---------------- ArgumentParser where-clause tests ----------------

    def test_argument_parser_connection_conditions(self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00", u"Not Microblog")
        self._add_claim_tweet_connection(u"post0", u"post1")
        self._add_claim_tweet_connection(u"post0", u"post2")
        self._add_claim_tweet_connection(u"post0", u"post3")
        self._add_claim_tweet_connection(u"post0", u"post4")
        self._add_claim_tweet_connection(u"post0", u"post5")
        self._add_claim_tweet_connection(u"post0", u"post6")
        self._add_claim_tweet_connection(u"post0", u"post7")
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        args = {
            'source': {'table_name': 'posts', 'id': 'post_id',
                       "where_clauses": [{"field_name": "domain", "value": "Claim"}]},
            'connection': {'table_name': 'claim_tweet_connection',
                           'source_id': 'claim_id', 'target_id': 'post_id',
                           "where_clauses": [{"val1": "source.date",
                                              "val2": "dest.date", "op": "<="}]},
            'destination': {'table_name': 'posts', 'id': 'post_id',
                            'target_field': 'content',
                            "where_clauses": [{"field_name": "domain",
                                               "value": "Microblog"}]}
        }
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(args)
        actual = set([element.post_id
                      for element in source_id_target_elements_dict["post0"]])
        # Only Microblog posts dated after the claim (2017-06-14 05:00).
        expected = {'post4', 'post5', 'post6'}
        self.assertSetEqual(actual, expected)

    def test_argument_parser_connection_conditions_with_timedelta(self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 06:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00", u"Not Microblog")
        self._add_claim_tweet_connection(u"post0", u"post1")
        self._add_claim_tweet_connection(u"post0", u"post2")
        self._add_claim_tweet_connection(u"post0", u"post3")
        self._add_claim_tweet_connection(u"post0", u"post4")
        self._add_claim_tweet_connection(u"post0", u"post5")
        self._add_claim_tweet_connection(u"post0", u"post6")
        self._add_claim_tweet_connection(u"post0", u"post7")
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        args = {
            'source': {'table_name': 'posts', 'id': 'post_id',
                       "where_clauses": [{"field_name": "domain", "value": "Claim"}]},
            'connection': {'table_name': 'claim_tweet_connection',
                           'source_id': 'claim_id', 'target_id': 'post_id',
                           "where_clauses": [{"val1": "source.date",
                                              "val2": "dest.date",
                                              "op": "timeinterval", "delta": 1}]},
            'destination': {'table_name': 'posts', 'id': 'post_id',
                            'target_field': 'content',
                            "where_clauses": [{"field_name": "domain",
                                               "value": "Microblog"}]}
        }
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(args)
        actual = set([element.post_id
                      for element in source_id_target_elements_dict["post0"]])
        # Posts within +/- 1 day of the claim date.
        expected = {'post2', 'post3'}
        self.assertSetEqual(actual, expected)

    def test_argument_parser_connection_conditions_with_before_timedelta(self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 06:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00", u"Not Microblog")
        self._add_claim_tweet_connection(u"post0", u"post1")
        self._add_claim_tweet_connection(u"post0", u"post2")
        self._add_claim_tweet_connection(u"post0", u"post3")
        self._add_claim_tweet_connection(u"post0", u"post4")
        self._add_claim_tweet_connection(u"post0", u"post5")
        self._add_claim_tweet_connection(u"post0", u"post6")
        self._add_claim_tweet_connection(u"post0", u"post7")
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        args = {
            'source': {'table_name': 'posts', 'id': 'post_id',
                       "where_clauses": [{"field_name": "domain", "value": "Claim"}]},
            'connection': {'table_name': 'claim_tweet_connection',
                           'source_id': 'claim_id', 'target_id': 'post_id',
                           "where_clauses": [{"val1": "source.date",
                                              "val2": "dest.date",
                                              "op": "before", "delta": 1}]},
            'destination': {'table_name': 'posts', 'id': 'post_id',
                            'target_field': 'content',
                            "where_clauses": [{"field_name": "domain",
                                               "value": "Microblog"}]}
        }
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(args)
        actual = set([element.post_id
                      for element in source_id_target_elements_dict["post0"]])
        expected = {'post2', u'post3'}
        self.assertSetEqual(actual, expected)

    def test_argument_parser_connection_conditions_with_after_timedelta(self):
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 06:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-15 05:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00", u"Not Microblog")
        self._add_claim_tweet_connection(u"post0", u"post1")
        self._add_claim_tweet_connection(u"post0", u"post2")
        self._add_claim_tweet_connection(u"post0", u"post3")
        self._add_claim_tweet_connection(u"post0", u"post4")
        self._add_claim_tweet_connection(u"post0", u"post5")
        self._add_claim_tweet_connection(u"post0", u"post6")
        self._add_claim_tweet_connection(u"post0", u"post7")
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        args = {
            'source': {'table_name': 'posts', 'id': 'post_id',
                       "where_clauses": [{"field_name": "domain", "value": "Claim"}]},
            'connection': {'table_name': 'claim_tweet_connection',
                           'source_id': 'claim_id', 'target_id': 'post_id',
                           "where_clauses": [{"val1": "source.date",
                                              "val2": "dest.date",
                                              "op": "after", "delta": 1}]},
            'destination': {'table_name': 'posts', 'id': 'post_id',
                            'target_field': 'content',
                            "where_clauses": [{"field_name": "domain",
                                               "value": "Microblog"}]}
        }
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(args)
        actual = set([element.post_id
                      for element in source_id_target_elements_dict["post0"]])
        expected = {'post3'}
        self.assertSetEqual(actual, expected)

    # ---------------- helpers ----------------

    def _add_author(self, author_guid):
        """Build an Author and keep it on self._author; NOT added to the DB
        here — each test adds it after posts are attached, so that
        statuses_count is final when persisted."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'test author'
        author.author_screen_name = author_guid
        author.name = u'test'
        author.domain = u'tests'
        author.statuses_count = 0
        author.created_at = u"2017-06-14 05:00:00"
        self._author = author

    def _add_post(self, title, content, date_str, domain=u'Microblog'):
        """Insert a Post (id/guid = title) by the current author and bump
        the author's statuses_count."""
        post = Post()
        post.author = self._author.author_guid
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.date = convert_str_to_unicode_datetime(date_str)
        post.created_at = post.date
        self._db.addPost(post)
        self._posts.append(post)
        self._author.statuses_count += 1

    def _get_params(self):
        """Constructor kwargs for BehaviorFeatureGenerator: the current
        author plus an author_guid -> posts mapping."""
        posts = {self._author.author_guid: self._posts}
        params = {'authors': [self._author], 'posts': posts}
        return params

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a post via a Claim_Tweet_Connection row."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])
# NOTE(review): mangled chunk (statements joined per physical line) — code is
# kept byte-identical, only comment lines added.
# TestGensimWordEmbeddingsModelTrainer: trains a gensim Word2Vec-based word
# embedding over test posts and checks the per-author aggregated vectors that
# the trainer writes to <_saved_models_path>/<_table_name>.csv.
# Review observations (all grounded in the visible code):
# - `_calc_mean`/`_calc_min`/`_calc_max` immediately rebind their `vectors`
#   parameter via `self._get_posts_val()`, so the argument passed in by
#   `_calc_results` is ignored.
# - `eval('np.mean')`/`eval('min')`/`eval('max')` evaluate constant strings;
#   the callables could be referenced directly — `eval` here is needless.
# - `pd.DataFrame.from_csv` and `self.assertEquals` are deprecated (removed in
#   modern pandas / aliased in unittest); `u''` literals mark Python-2-era code.
# - `test_add_additional_fields_to_existing_table` wraps its final check in a
#   bare `except:` that converts any error into `self.fail()`, hiding the cause.
# - `_add_target_article_item` persists a Target_Article_Item through
#   `self._db.addPosts([...])` — presumably that helper accepts any mapped
#   entity; verify against the DB API.
class TestGensimWordEmbeddingsModelTrainer(TestCase): def setUp(self): self._config_parser = getConfig() self._db = DB() self._db.setUp() # self._Word_Embedding_Model_Creator.execute(None) self._is_load_wikipedia_300d_glove_model = True self._wikipedia_model_file_path = "data/input/glove/test_glove.6B.300d_small.txt" self._table_name = "wikipedia_model_300d" self._word_vector_dict_full_path = "data/output/word_embedding/" self._word_vector_dict = {} self._author = None self._set_author(u'test_user') self._counter = 0 self._posts = [] def tearDown(self): self._db.session.close() def test_add_additional_fields_to_existing_table(self): self._add_post(u'was', u'is') self._add_post(u'is', u'was') self._db.session.commit() self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer( self._db) self._word_embedding_model_creator.execute(None) self._word_embedding_model_creator._aggregation_functions_names = [ 'sum' ] self._word_embedding_model_creator.execute(None) file_output_path = self._word_embedding_model_creator._saved_models_path + self._word_embedding_model_creator._table_name + ".csv" data = pd.DataFrame.from_csv(file_output_path) word_embedding_results = data.loc[(data['author_id'] == 'test_user') & (data['table_name'] == u'posts') & (data['targeted_field_name'] == u'content')] sum_value_df = word_embedding_results.loc[ word_embedding_results[u'word_embedding_type'] == u'sum'] mean_value_df = word_embedding_results.loc[ word_embedding_results[u'word_embedding_type'] == u'np.mean'] try: if len(sum_value_df.values.tolist()) > 0 and len( mean_value_df.values.tolist()) > 0: self.assertTrue(True) else: self.fail() except: self.fail() def test_case_post_represent_by_posts(self): self._add_post(u'post1', u'the claim', u'Claim') self._add_post(u'post2', u'dog cat pig man') # 2 self._add_post(u'post3', u'TV is the best guys') # 1 self._add_claim_tweet_connection(u'post1', u'post2') self._add_claim_tweet_connection(u'post1', u'post3') self._db.session.commit() 
self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer( self._db) self._word_embedding_model_creator._targeted_fields_for_embedding = [{ 'source': { 'table_name': 'posts', 'id': 'post_id' }, 'connection': { 'table_name': 'claim_tweet_connection', 'source_id': 'claim_id', 'target_id': 'post_id' }, 'destination': { 'table_name': 'posts', 'id': 'post_id', 'target_field': 'content', "where_clauses": [] } }] self._word_embedding_model_creator.execute(None) model_name_path = self._word_embedding_model_creator._prepare_model_name_path( ) model = Word2Vec.load(model_name_path) word_vector_dict = self._word_embedding_model_creator._get_word_embedding_dict( model) self._words = word_vector_dict self._words_vectors = self._get_posts_val() expected_val = self._calc_results() self._generic_test(expected_val, u'post1') def _setup_test(self): self._db.session.commit() self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer( self._db) self._word_embedding_model_creator.execute(None) self._words = self._db.get_word_embedding_dictionary() self._words_vectors = self._get_posts_val() def _generic_test(self, expected_value, source_id=u""): if source_id == u"": source_id = self._author.author_guid file_output_path = self._word_embedding_model_creator._saved_models_path + self._word_embedding_model_creator._table_name + ".csv" data = pd.DataFrame.from_csv(file_output_path) word_embedding_results = data.loc[(data['author_id'] == source_id) & (data['table_name'] == u'posts') & (data['targeted_field_name'] == u'content')] self.assert_word_embedding(word_embedding_results, expected_value, u'min') self.assert_word_embedding(word_embedding_results, expected_value, u'max') self.assert_word_embedding(word_embedding_results, expected_value, u'np.mean') def assert_word_embedding(self, db_results, expected_value, type): result_value = db_results.loc[db_results[u'word_embedding_type'] == type, '0':].values.tolist()[0] self.assertEquals(list(expected_value[type]), 
result_value) def _generic_non_equal_test(self, expected_value): db_results = self._db.get_author_word_embedding( self._author.author_guid, u'posts', u'content') self.assertNotEqual(expected_value[u'min'], db_results[u'min']) self.assertNotEqual(expected_value[u'max'], db_results[u'max']) self.assertNotEqual(expected_value[u'np.mean'], db_results[u'np.mean']) def _set_author(self, author_guid): author = Author() author.author_guid = author_guid author.author_full_name = u'name' + author_guid author.name = u'name' + author_guid author.domain = u'test' self._db.add_author(author) self._author = author def _add_post(self, title, content, _domain=u'Microblog'): post = Post() post.author = self._author.author_full_name post.author_guid = self._author.author_guid post.content = content post.title = title post.domain = _domain post.post_id = title post.guid = title self._db.addPost(post) self._posts.append(post) def _get_posts_val( self): # return the vectors for all the words in the added posts vals = {} for post in self._posts: for word in post.content.split(): if word in self._words.keys(): vals[word] = self._words[word] return vals.values() def _calc_mean(self, vectors): vectors = self._get_posts_val() if len(vectors) == 0: return (0, ) * 300 ziped_vec = zip(*vectors) result = map(eval('np.mean'), ziped_vec) return tuple(result) def _calc_min(self, vectors): vectors = self._get_posts_val() if len(vectors) == 0: return (0, ) * 300 ziped_vec = zip(*vectors) result = map(eval('min'), ziped_vec) return tuple(result) def _calc_max(self, vectors): vectors = self._get_posts_val() if len(vectors) == 0: return (0, ) * 300 ziped_vec = zip(*vectors) result = map(eval('max'), ziped_vec) return tuple(result) def _calc_results(self): vectors = self._words_vectors results = {} results[u'min'] = self._calc_min(vectors) results[u'max'] = self._calc_max(vectors) results[u'np.mean'] = self._calc_mean(vectors) return results def _add_target_article(self, post_id, title, description, 
author_guid): target_article = Target_Article() target_article.author_guid = author_guid target_article.post_id = post_id target_article.title = title target_article.description = description target_article.keywords = 'temp, lala, fafa' self._db.add_target_articles([target_article]) def _add_target_article_item(self, post_id, type, content, author_guid): article_item = Target_Article_Item() article_item.post_id = post_id article_item.type = type article_item.item_number = 3 article_item.content = content article_item.author_guid = author_guid self._db.addPosts([article_item]) def _add_claim_tweet_connection(self, claim_id, post_id): connection = Claim_Tweet_Connection() connection.claim_id = claim_id connection.post_id = post_id self._db.add_claim_connections([connection]) pass
# NOTE(review): mangled chunk (statements joined per physical line) — code is
# kept byte-identical, only comment lines added.
# RedditFeatureGeneratorTest: builds reddit author/post/claim fixtures and
# asserts claim-level aggregated features (min/max/mean/median/skew/kurtosis
# over karma, upvotes, downvotes, and count/ratio of gold/moderator/employee
# flags) produced by RedditPostByClaimFeatureGenerator and
# RedditAuthorByClaimFeatureGenerator. Numeric comparison is via
# assertAlmostEqual to 2 decimal places (see assert_author_feature_number).
# Review observations:
# - The `reddit_post.score = score` assignment in `_add_post` is split across
#   two physical lines by the extraction artifact (mid-assignment line break).
# - `_add_post` back-computes ups/downs from upvote_ratio and score, dividing
#   by (2*ratio - 1) with an explicit special case at ratio == 0.5 to avoid
#   division by zero; comment-only posts get ups/downs/ratio of -1.
# - `_add_claim` persists a Claim via `self._db.addPosts([claim])` —
#   presumably the DB helper accepts any mapped entity; verify against DB API.
# - `assert_author_feature_test_case` passes a claim_id where
#   `assert_author_feature_number` names the parameter `author_guid`; the
#   feature rows are apparently keyed by claim id here — confirm with
#   `DB.get_author_feature`.
# - The large commented-out `test_cases` template at the end of `_get_params`
#   is dead scaffolding kept by the original author.
class RedditFeatureGeneratorTest(TestCase): def setUp(self): self._db = DB() self._db.setUp() self._posts = [] self._author = None self._init_authors() self._init_posts() self._init_claims() self._reddit_post_by_claim_feature_generator = RedditPostByClaimFeatureGenerator( self._db, **self._get_params()) self._reddit_author_by_claim_feature_generator = RedditAuthorByClaimFeatureGenerator( self._db, **self._get_params()) def tearDown(self): self._db.session.close() pass def test_karma_by_submission_and_comment(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_karma_by_submission_and_comment', 'expected': -13 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_karma_by_submission_and_comment', 'expected': -321 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_karma_by_submission_and_comment', 'expected': 1 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_karma_by_submission_and_comment', 'expected': 52312 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_karma_by_submission_and_comment', 'expected': 102 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_karma_by_submission_and_comment', 'expected': 234 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_karma_by_submission_and_comment', 'expected': 5904.222222 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_karma_by_submission_and_comment', 'expected': -19.55555556 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_karma_by_submission_and_comment', 'expected': 38.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_karma_by_submission_and_comment', 'expected': 27 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_karma_by_submission_and_comment', 'expected': 7 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 
'test_name': 'median_karma_by_submission_and_comment', 'expected': 5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'skew_karma_by_submission_and_comment', 'expected': 2.998904337 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'skew_karma_by_submission_and_comment', 'expected': -2.525365088 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'skew_karma_by_submission_and_comment', 'expected': 2.234762661 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'kurtosis_karma_by_submission_and_comment', 'expected': 8.995080203 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'kurtosis_karma_by_submission_and_comment', 'expected': 7.357797068 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'kurtosis_karma_by_submission_and_comment', 'expected': 4.503581242 }] self._reddit_post_by_claim_feature_generator._measure_names = [ 'karma_by_submission_and_comment' ] self._reddit_post_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_post_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_post_by_claim_feature_generator. 
__class__.__name__) def test_karma_by_submission(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_karma_by_submission', 'expected': 738 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_karma_by_submission', 'expected': -321 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_karma_by_submission', 'expected': 123 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_karma_by_submission', 'expected': 52312 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_karma_by_submission', 'expected': 102 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_karma_by_submission', 'expected': 234 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_karma_by_submission', 'expected': 26525 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_karma_by_submission', 'expected': -109.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_karma_by_submission', 'expected': 178.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_karma_by_submission', 'expected': 26525 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_karma_by_submission', 'expected': -109.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_karma_by_submission', 'expected': 178.5 }] self._reddit_post_by_claim_feature_generator._measure_names = [ 'karma_by_submission' ] self._reddit_post_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_post_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_post_by_claim_feature_generator. 
__class__.__name__) def test_upvotes_by_submission(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_upvotes_by_submission', 'expected': 762 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_upvotes_by_submission', 'expected': 112 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_upvotes_by_submission', 'expected': 369 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_upvotes_by_submission', 'expected': 74593 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_upvotes_by_submission', 'expected': 241 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_upvotes_by_submission', 'expected': 2067 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_upvotes_by_submission', 'expected': 37677.5 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_upvotes_by_submission', 'expected': 176.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_upvotes_by_submission', 'expected': 1218 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_upvotes_by_submission', 'expected': 37677.5 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_upvotes_by_submission', 'expected': 176.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_upvotes_by_submission', 'expected': 1218 }] self._reddit_post_by_claim_feature_generator._measure_names = [ 'upvotes_by_submission' ] self._reddit_post_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_post_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_post_by_claim_feature_generator. 
__class__.__name__) def test_downvotes_by_submission(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_downvotes_by_submission', 'expected': 24 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_downvotes_by_submission', 'expected': 10 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_downvotes_by_submission', 'expected': 246 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_downvotes_by_submission', 'expected': 22281 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_downvotes_by_submission', 'expected': 562 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_downvotes_by_submission', 'expected': 1833 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_downvotes_by_submission', 'expected': 11152.5 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_downvotes_by_submission', 'expected': 286 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_downvotes_by_submission', 'expected': 1039.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_downvotes_by_submission', 'expected': 11152.5 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_downvotes_by_submission', 'expected': 286 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_downvotes_by_submission', 'expected': 1039.5 }] self._reddit_post_by_claim_feature_generator._measure_names = [ 'downvotes_by_submission' ] self._reddit_post_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_post_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_post_by_claim_feature_generator. 
__class__.__name__) def test_author_comment_karma(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_comment_karma', 'expected': 2261 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_comment_karma', 'expected': 2842 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_comment_karma', 'expected': 2842 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_comment_karma', 'expected': 37027 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_comment_karma', 'expected': 35111 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_comment_karma', 'expected': 30880 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_comment_karma', 'expected': 19096.66667 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_comment_karma', 'expected': 18031 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_comment_karma', 'expected': 11833.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_comment_karma', 'expected': 22588 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_comment_karma', 'expected': 16555 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_comment_karma', 'expected': 6806 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'skew_comment_karma', 'expected': -0.018614054 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'skew_comment_karma', 'expected': 0.128211429 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'skew_comment_karma', 'expected': 1.862860226 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'kurtosis_comment_karma', 'expected': -1.992620739 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'kurtosis_comment_karma', 'expected': -2.723581645 }, { 'claim_id': 
'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'kurtosis_comment_karma', 'expected': 3.595027437 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'comment_karma' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_link_karma(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_link_karma', 'expected': 1 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_link_karma', 'expected': 1 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_link_karma', 'expected': 90 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_link_karma', 'expected': 171576 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_link_karma', 'expected': 171576 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_link_karma', 'expected': 5897 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_link_karma', 'expected': 20565.77778 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_link_karma', 'expected': 29840.16667 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_link_karma', 'expected': 1866 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_link_karma', 'expected': 1341 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_link_karma', 'expected': 738.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_link_karma', 'expected': 738.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'skew_link_karma', 'expected': 2.991811692 }, { 'claim_id': 
'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'skew_link_karma', 'expected': 2.443747273 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'skew_link_karma', 'expected': 1.751305522 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'kurtosis_link_karma', 'expected': 8.963145712 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'kurtosis_link_karma', 'expected': 5.977609271 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'kurtosis_link_karma', 'expected': 3.018013716 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'link_karma' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_total_karma(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_total_karma', 'expected': 2435 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_total_karma', 'expected': 6379 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_total_karma', 'expected': 6379 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_total_karma', 'expected': 206687 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_total_karma', 'expected': 206687 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_total_karma', 'expected': 32221 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_total_karma', 'expected': 39662.44444 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_total_karma', 'expected': 47871.16667 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_total_karma', 'expected': 13699.5 
}, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_total_karma', 'expected': 22589 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_total_karma', 'expected': 17240.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_total_karma', 'expected': 8099 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'skew_total_karma', 'expected': 2.767953592 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'skew_total_karma', 'expected': 2.349097328 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'skew_total_karma', 'expected': 1.963784833 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'kurtosis_total_karma', 'expected': 7.954685555 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'kurtosis_total_karma', 'expected': 5.605190323 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'kurtosis_total_karma', 'expected': 3.878351431 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'total_karma' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. 
__class__.__name__) def test_author_count_is_gold(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_count_is_gold', 'expected': 3 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_count_is_gold', 'expected': 3 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_count_is_gold', 'expected': 3 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'count_is_gold' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_count_is_moderator(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_count_is_moderator', 'expected': 2 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_count_is_moderator', 'expected': 1 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_count_is_moderator', 'expected': 0 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'count_is_moderator' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. 
__class__.__name__) def test_author_count_is_employee(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_count_is_employee', 'expected': 3 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_count_is_employee', 'expected': 1 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_count_is_employee', 'expected': 1 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'count_is_employee' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_ratio_is_gold(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_ratio_is_gold', 'expected': 0.333333333 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_ratio_is_gold', 'expected': 0.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_ratio_is_gold', 'expected': 0.75 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'ratio_is_gold' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. 
__class__.__name__) def test_author_ratio_is_moderator(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_ratio_is_moderator', 'expected': 0.222222222 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_ratio_is_moderator', 'expected': 0.166666667 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_ratio_is_moderator', 'expected': 0 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'ratio_is_moderator' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_ratio_is_employee(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_ratio_is_employee', 'expected': 0.333333333 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_ratio_is_employee', 'expected': 0.166666667 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_ratio_is_employee', 'expected': 0.25 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'ratio_is_employee' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. 
__class__.__name__) def assert_author_feature_test_case(self, test_case, class_name): self.assert_author_feature_number( test_case['claim_id'], "{}_{}".format(class_name, test_case['test_name']), test_case['expected']) def assert_author_feature_number(self, author_guid, attribute_name, expected): result_feature = self._db.get_author_feature(author_guid, attribute_name) feature_value = getattr(result_feature, 'attribute_value') self.assertAlmostEqual(float(expected), float(feature_value), places=2) def _add_author(self, name=None, link_karma=None, comment_karma=None, is_employee=0, is_mod=0, is_gold=0, author_osn_id=None): author = Author() reddit_author = RedditAuthor() author.name = name author.author_screen_name = author.name author.author_guid = compute_author_guid_by_author_name(author.name) author.domain = 'reddit' author.author_osn_id = author_osn_id author.author_full_name = name author.url = 'https://www.reddit.com/user/' + name reddit_author.name = author.name reddit_author.author_guid = author.author_guid reddit_author.comments_count = None reddit_author.comment_karma = comment_karma reddit_author.link_karma = link_karma reddit_author.is_gold = is_gold reddit_author.is_moderator = is_mod reddit_author.is_employee = is_employee self._db.add_authors([author]) self._db.add_reddit_authors([reddit_author]) # self._author = author def _add_post(self, author, date, post_osn_id, score=0, upvote_ratio=-1): post = Post() post.post_osn_id = post_osn_id post.author = str(author) post.author_guid = compute_author_guid_by_author_name(post.author) post.created_at = str_to_date(date, formate="%d/%m/%Y %H:%M") post.url = 'https://www.reddit.com{}'.format( post.author) # just for test post.guid = compute_post_guid(post.url, post.post_osn_id, date_to_str(post.created_at)) post.domain = 'reddit_comment' post.post_type = 'reddit_comment' post.post_id = post.guid reddit_post = RedditPost() reddit_post.post_id = post.post_id reddit_post.guid = post.guid reddit_post.score = 
score if upvote_ratio != -1: post.domain = 'reddit_post' post.post_type = 'reddit_post' reddit_post.upvote_ratio = upvote_ratio reddit_post.ups = int( round((reddit_post.upvote_ratio * reddit_post.score) / (2 * reddit_post.upvote_ratio - 1)) if reddit_post.upvote_ratio != 0.5 else round(reddit_post.score / 2)) reddit_post.downs = reddit_post.ups - reddit_post.score else: reddit_post.ups = -1 reddit_post.downs = -1 reddit_post.upvote_ratio = -1 self._db.addPosts([post, reddit_post]) return post, reddit_post def _add_claim_tweet_connection(self, claim_id, post_id): connection = Claim_Tweet_Connection() connection.claim_id = claim_id connection.post_id = post_id self._db.add_claim_connections([connection]) def _add_claim(self, claim_id): claim = Claim() claim.claim_id = claim_id self._db.addPosts([claim]) def _init_authors(self): self._add_author('Smile_lifeisgood', comment_karma=30880, link_karma=1341, is_gold=1, is_mod=0, is_employee=0) self._add_author('Cunty_Balls', comment_karma=7369, link_karma=90, is_gold=1, is_mod=0, is_employee=0) self._add_author('I_kick_fuck_nuns', comment_karma=2842, link_karma=5897, is_gold=1, is_mod=0, is_employee=0) self._add_author('TheRiseofMindhawk', comment_karma=2261, link_karma=174, is_gold=1, is_mod=1, is_employee=0) self._add_author('dialog2011', comment_karma=37027, link_karma=4582, is_gold=0, is_mod=0, is_employee=1) self._add_author('chrmanyaki', comment_karma=22588, link_karma=1, is_gold=0, is_mod=0, is_employee=1) self._add_author('Undertakerjoe', comment_karma=9177, link_karma=1384, is_gold=0, is_mod=0, is_employee=0) self._add_author('Lmb2298', comment_karma=25741, link_karma=1, is_gold=0, is_mod=0, is_employee=0) self._add_author('azzazaz', comment_karma=35111, link_karma=171576, is_gold=0, is_mod=1, is_employee=0) self._add_author('juanwonone1', comment_karma=6243, link_karma=136, is_gold=0, is_mod=0, is_employee=1) def _init_posts(self): self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', 
self._add_post('juanwonone1', '15/10/2017 21:44', '76ksr4', 738, 0.97)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('Lmb2298', '01/10/2017 22:24', 'dferfgh', 52312, 0.77)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('dialog2011', '12/06/2017 23:45', '6gv0vk', 27)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('chrmanyaki', '15/10/2017 21:58', 'doeq8ke', 27)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('azzazaz', '12/06/2018 10:50', 'e0j4zkz', 32)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('Smile_lifeisgood', '12/06/2018 20:08', 'e0in2zm', 11)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('Undertakerjoe', '15/10/2017 22:17', 'doerbqu', -13)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('I_kick_fuck_nuns', '18/06/2017 3:39', 'dj1qid5', 2)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('TheRiseofMindhawk', '13/06/2017 8:17', 'ditymrc', 2)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('I_kick_fuck_nuns', '11/06/2018 18:49', '8qal3m', 102, 0.92)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('juanwonone1', '16/10/2017 2:23', 'dof4fen', -321, 0.3)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('Smile_lifeisgood', '13/06/2017 0:29', 'dditbt8r', 11)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('Lmb2298', '15/10/2017 22:38', 'doeslie', 11)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('azzazaz', '16/10/2017 
0:30', 'doeyvtb', 9)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('juanwonone1', '15/10/2017 22:50', 'doetc6j', 7)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('Cunty_Balls', '16/10/2017 1:52', 'dof2x1x', 2)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('Cunty_Balls', '16/10/2017 2:43', 'dof5cpo', 2)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('juanwonone1', '16/10/2017 3:45', 'dof84f8', 1)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('Cunty_Balls', '15/10/2017 22:24', 'doerqsj', 234, 0.53)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('I_kick_fuck_nuns', '16/10/2017 21:44', '76ksr2', 123, 0.6)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('Smile_lifeisgood', '13/06/2017 7:04', 'ditvpox', 7)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('Smile_lifeisgood', '13/06/2017 0:51', 'ditcy28', 5)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('juanwonone1', '15/10/2017 23:36', 'doevzsq', 5)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('juanwonone1', '16/10/2017 0:26', 'doeynrr', 5)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('I_kick_fuck_nuns', '11/06/2018 21:55', 'e0hy5he', 1)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('I_kick_fuck_nuns', '11/06/2018 22:04', 'e0hyrhi', 1)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('I_kick_fuck_nuns', '12/06/2018 1:31', 'e0icveq', 1)[0].guid) 
self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('Cunty_Balls', '13/06/2017 7:55', 'ditxua6', 3)[0].guid) def _init_claims(self): self._add_claim('cd2e1978-4dfa-3a40-b62f-71153001629c') self._add_claim('a4beae51-463f-33fc-bbf6-20eca5104afe') self._add_claim('9e875999-9a3e-3357-bfa6-ede4fe67c1c9') def _get_params(self): return {'authors': [], 'posts': []} # test_cases = [ # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'min_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'min_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'min_', # 'expected': # }, # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'max_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'max_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'max_', # 'expected': # }, # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'mean_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'mean_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'mean_', # 'expected': # }, # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'median_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'median_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'median_', # 'expected': # }, # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'skew_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'skew_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'skew_', # 'expected': # }, # { # 'claim_id': 
u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'kurtosis_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'kurtosis_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'kurtosis_', # 'expected': # } # ]
class TestFakeNewsFeatureGenerator(TestCase):
    """Tests for FakeNewsFeatureGenerator.

    The generator is expected to write, per claim, author features for
    fake-news dictionary word counts/fractions
    ('FakeNewsFeatureGenerator_<word>_count' / '_fraction'), their sums,
    and a normalized claim verdict ('FakeNewsFeatureGenerator_claim_verdict').
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._author = None

    def tearDown(self):
        # NOTE(review): unlike other suites in this file, the test database
        # is not removed here (no self._db.deleteDB()) -- confirm intended.
        self._db.session.close()

    def test_get_word_count_1_claim_1_comments_no_words(self):
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._db.session.commit()
        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()
        # No dictionary word appears in the comment -> all defaults.
        self.assert_word_dictionary_count('post0', {})
        self.assert_word_dictionary_fraction('post0', {})
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_count_sum')
        self.assertEqual('0', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_fraction_sum')
        self.assertEqual('0.0', author_feature.attribute_value)

    def test_get_word_count_1_claim_1_comments_1_words(self):
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 bad word liar", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._db.session.commit()
        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()
        self.assert_word_dictionary_count('post0', {'liar': '1'})
        self.assert_word_dictionary_fraction('post0', {'liar': '1.0'})
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_count_sum')
        self.assertEqual('1', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_fraction_sum')
        self.assertEqual('1.0', author_feature.attribute_value)

    def test_get_word_count_1_claim_4_comments_1_words(self):
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 bad word liar", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post3", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post4", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._db.session.commit()
        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()
        # 1 occurrence across 4 comments -> fraction 0.25.
        self.assert_word_dictionary_count('post0', {'liar': '1'})
        self.assert_word_dictionary_fraction('post0', {'liar': '0.25'})
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_count_sum')
        self.assertEqual('1', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_fraction_sum')
        self.assertEqual('0.25', author_feature.attribute_value)

    def test_get_word_count_1_claim_4_comments_8_words(self):
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 liar bad word liar", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words liar at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no liar bad words at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no liar bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._db.session.commit()
        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()
        # 8 occurrences of 'liar' across 4 comments -> fraction 2.0.
        self.assert_word_dictionary_count('post0', {'liar': '8'})
        self.assert_word_dictionary_fraction('post0', {'liar': '2.0'})
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_count_sum')
        self.assertEqual('8', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_fraction_sum')
        self.assertEqual('2.0', author_feature.attribute_value)

    def test_get_word_count_1_claim_4_comments_8_different_words(self):
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._db.session.commit()
        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()
        self.assert_word_dictionary_count(
            'post0', {
                'liar': '3',
                'joke': '2',
                'didnt actually': '1',
                'untrue': '1',
                'laugh': '1'
            })
        self.assert_word_dictionary_fraction(
            'post0', {
                'liar': '0.75',
                'joke': '0.5',
                'didnt actually': '0.25',
                'untrue': '0.25',
                'laugh': '0.25'
            })
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_count_sum')
        self.assertEqual('3', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_fraction_sum')
        self.assertEqual('0.75', author_feature.attribute_value)

    def test_get_claim_type_4_claim(self):
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_claim('post1', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_claim('post2', 'the claim', "2017-06-10 05:00:00",
                        'pants-fire')
        self._add_claim('post3', 'the claim', "2017-06-10 05:00:00",
                        'mostly-false')
        self._add_claim('post4', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00",
                        'mostly-true')
        self._add_claim('post6', 'the claim', "2017-06-10 05:00:00",
                        'half_true')
        self._add_claim('post7', 'the claim', "2017-06-10 05:00:00",
                        'unproven')
        self._db.session.commit()
        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator._domain = 'Claim'
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()
        # Missing / half_true / unproven verdicts produce no feature at all;
        # the rest are normalized to 'False' or 'True'.
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual(None, author_feature)
        author_feature = self._db.get_author_feature(
            'post1', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual('False', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post2', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual('False', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post3', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual('False', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post4', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual('True', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post5', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual('True', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post6', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual(None, author_feature)
        author_feature = self._db.get_author_feature(
            'post7', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual(None, author_feature)

    def assert_word_dictionary_count(self, author_guid, values):
        self.assert_dictionary_words(author_guid,
                                     'FakeNewsFeatureGenerator_{0}_count',
                                     '0', values)

    def assert_word_dictionary_fraction(self, author_guid, values):
        self.assert_dictionary_words(author_guid,
                                     'FakeNewsFeatureGenerator_{0}_fraction',
                                     '0.0', values)

    def assert_dictionary_words(self, author_guid, count_template,
                                default_value, values):
        """Assert every dictionary word's feature equals the expected value
        from *values*, falling back to *default_value*.

        Feature attribute names use '-' in place of spaces. *values* may be
        keyed by either the raw word ('didnt actually') or the hyphenated
        form ('didnt-actually'); the previous implementation hyphenated the
        word before the lookup, so raw multi-word keys could never match.
        """
        fake_news_dictionary_words = \
            self.fake_news_feature_generator._fake_news_dictionary
        for raw_word in fake_news_dictionary_words:
            word = raw_word.strip()
            feature_word = word.replace(' ', '-')
            author_feature = self._db.get_author_feature(
                author_guid, count_template.format(feature_word))
            expected = values.get(word, values.get(feature_word,
                                                   default_value))
            self.assertEqual(expected, author_feature.attribute_value)

    def _add_author(self, author_guid):
        """Insert a minimal Author row and remember it as the current author."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = 'test author'
        author.author_screen_name = author_guid
        author.name = 'test'
        author.domain = 'tests'
        author.statuses_count = 0
        author.created_at = "2017-06-14 05:00:00"
        self._db.add_author(author)
        self._author = author

    def _add_post(self, title, content, date_str, domain='Microblog',
                  post_type=None):
        """Insert a Post by the current author; title doubles as post_id/guid."""
        post = Post()
        post.author = self._author.author_guid
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.date = convert_str_to_unicode_datetime(date_str)
        post.created_at = post.date
        post.post_type = post_type
        self._db.addPost(post)
        self._posts.append(post)

    def _get_params(self):
        """Return the {authors, posts} parameter bundle for feature generators."""
        posts = {self._author.author_guid: self._posts}
        # Was: 'params = params = {...}' -- redundant double assignment.
        params = {'authors': [self._author], 'posts': posts}
        return params

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a post via a Claim_Tweet_Connection row."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])

    def _add_claim(self, claim_id, content, date_str, verdict=None):
        """Insert a Claim row; claim_id doubles as the title."""
        claim = Claim()
        claim.claim_id = claim_id
        claim.verdict = verdict
        claim.title = claim_id
        claim.description = content
        claim.verdict_date = convert_str_to_unicode_datetime(date_str)
        claim.url = "claim url"
        self._db.addPost(claim)
class TestFakeNewsClassifier(TestCase):
    """Tests for FakeNewsClassifier.

    Each test builds claims with TRUE/FALSE verdicts and attaches comments
    with or without fake-news dictionary words, runs the classifier, and
    checks the FN/FP counts, accuracy and AUC it writes to
    'fake_news_classifier_results.csv'.
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._author = None

    def tearDown(self):
        # NOTE(review): the test database is not removed here (no
        # self._db.deleteDB()) -- confirm intended.
        self._db.session.close()

    def test_classify_by_dictionary_1_FN_1_FP(self):
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._add_author('author_guid')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post5", "post6")
        self._add_claim_tweet_connection("post5", "post7")
        self._add_claim_tweet_connection("post5", "post8")
        self._add_claim_tweet_connection("post5", "post9")
        self._db.session.commit()
        self.fake_news_feature_classifier = FakeNewsClassifier(self._db)
        self.fake_news_feature_classifier.setUp()
        self.fake_news_feature_classifier.execute()
        output_data = self._read_classifier_results()
        self.assertAlmostEqual(float(output_data['FN (think good but bad)']),
                               1)
        self.assertAlmostEqual(float(output_data['FP (think bad but good)']),
                               1)
        self.assertAlmostEqual(float(output_data['accuracy']), 0.0)
        self.assertAlmostEqual(float(output_data['AUC']), 0.0)

    def test_classify_by_dictionary_1_FN_1_FP_and_ignore_1(self):
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._add_author('author_guid')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post5", "post6")
        self._add_claim_tweet_connection("post5", "post7")
        self._add_claim_tweet_connection("post5", "post8")
        self._add_claim_tweet_connection("post5", "post9")
        # Claim with an 'unknown' verdict must be ignored by the classifier.
        self._add_claim('post10', 'the claim', "2017-06-10 05:00:00",
                        'unknown')
        self._add_post("post11", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post12", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post13", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post14", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post10", "post11")
        self._add_claim_tweet_connection("post10", "post12")
        self._add_claim_tweet_connection("post10", "post13")
        self._add_claim_tweet_connection("post10", "post14")
        self._db.session.commit()
        self.fake_news_feature_classifier = FakeNewsClassifier(self._db)
        self.fake_news_feature_classifier.setUp()
        self.fake_news_feature_classifier.execute()
        output_data = self._read_classifier_results()
        self.assertAlmostEqual(float(output_data['FN (think good but bad)']),
                               1)
        self.assertAlmostEqual(float(output_data['FP (think bad but good)']),
                               1)
        self.assertAlmostEqual(float(output_data['accuracy']), 0.0)
        self.assertAlmostEqual(float(output_data['AUC']), 0.0)

    def test_classify_by_dictionary_0_FN_0_FP(self):
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._add_author('author_guid')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post5", "post6")
        self._add_claim_tweet_connection("post5", "post7")
        self._add_claim_tweet_connection("post5", "post8")
        self._add_claim_tweet_connection("post5", "post9")
        self._db.session.commit()
        self.fake_news_feature_classifier = FakeNewsClassifier(self._db)
        self.fake_news_feature_classifier.setUp()
        self.fake_news_feature_classifier.execute()
        output_data = self._read_classifier_results()
        self.assertAlmostEqual(float(output_data['FN (think good but bad)']),
                               0)
        self.assertAlmostEqual(float(output_data['FP (think bad but good)']),
                               0)
        self.assertAlmostEqual(float(output_data['accuracy']), 1.0)
        self.assertAlmostEqual(float(output_data['AUC']), 1.0)

    def test_classify_by_dictionary_1_FN_0_FP_3_claims(self):
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._add_author('author_guid')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post5", "post6")
        self._add_claim_tweet_connection("post5", "post7")
        self._add_claim_tweet_connection("post5", "post8")
        self._add_claim_tweet_connection("post5", "post9")
        self._add_author('author_guid')
        self._add_claim('post10', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post11", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post12", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post13", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post14", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post10", "post11")
        self._add_claim_tweet_connection("post10", "post12")
        self._add_claim_tweet_connection("post10", "post13")
        self._add_claim_tweet_connection("post10", "post14")
        self._db.session.commit()
        self.fake_news_feature_classifier = FakeNewsClassifier(self._db)
        self.fake_news_feature_classifier.setUp()
        self.fake_news_feature_classifier.execute()
        output_data = self._read_classifier_results()
        self.assertAlmostEqual(float(output_data['FN (think good but bad)']),
                               1)
        self.assertAlmostEqual(float(output_data['FP (think bad but good)']),
                               0)
        self.assertAlmostEqual(float(output_data['accuracy']),
                               0.666666,
                               places=4)
        self.assertAlmostEqual(float(output_data['AUC']), 0.75)

    def test_classify_by_dictionary_0_FN_1_FP_3_claims(self):
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post5", "post6")
        self._add_claim_tweet_connection("post5", "post7")
        self._add_claim_tweet_connection("post5", "post8")
        self._add_claim_tweet_connection("post5", "post9")
        self._add_claim('post10', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post11", "1 liar bad word joke",
                       "2017-06-12 05:00:00")
        self._add_post("post12", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post13", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post14", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post10", "post11")
        self._add_claim_tweet_connection("post10", "post12")
        self._add_claim_tweet_connection("post10", "post13")
        self._add_claim_tweet_connection("post10", "post14")
        self._db.session.commit()
        self.fake_news_feature_classifier = FakeNewsClassifier(self._db)
        self.fake_news_feature_classifier.setUp()
        self.fake_news_feature_classifier.execute()
        output_data = self._read_classifier_results()
        self.assertAlmostEqual(float(output_data['FN (think good but bad)']),
                               0)
        self.assertAlmostEqual(float(output_data['FP (think bad but good)']),
                               1)
        self.assertAlmostEqual(float(output_data['accuracy']),
                               0.666666,
                               places=4)
        self.assertAlmostEqual(float(output_data['AUC']), 0.75)

    def _read_classifier_results(self):
        """Return the first row of the classifier's results CSV.

        Extracted from the five tests above, which each opened the file
        without ever closing it; the 'with' block fixes the handle leak.
        """
        output_file_path = (self.fake_news_feature_classifier._output_path +
                            '/fake_news_classifier_results.csv')
        with open(output_file_path, 'r') as output_file:
            return next(csv.DictReader(output_file))

    def _add_author(self, author_guid):
        """Insert a minimal Author row and remember it as the current author."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = 'test author'
        author.author_screen_name = author_guid
        author.name = 'test'
        author.domain = 'tests'
        author.statuses_count = 0
        author.created_at = "2017-06-14 05:00:00"
        self._db.add_author(author)
        self._author = author

    def _add_post(self, title, content, date_str, domain='Microblog',
                  post_type=None):
        """Insert a Post by the current author; title doubles as post_id/guid."""
        post = Post()
        post.author = self._author.author_guid
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.date = convert_str_to_unicode_datetime(date_str)
        post.created_at = post.date
        post.post_type = post_type
        self._db.addPost(post)
        self._posts.append(post)

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a post via a Claim_Tweet_Connection row."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])

    def _add_claim(self, claim_id, content, date_str, post_type=None):
        """Insert a Claim row; *post_type* is stored as the claim's verdict."""
        claim = Claim()
        claim.claim_id = claim_id
        claim.verdict = post_type
        claim.title = claim_id
        claim.description = content
        claim.verdict_date = convert_str_to_unicode_datetime(date_str)
        claim.url = "claim url"
        self._db.addPost(claim)
class TestEntityToTopicConverter(TestCase):
    """Tests for EntityToTopicConverter.

    Each source entity (a post in the 'Claim' domain) is turned into one
    topic; destination 'Microblog' posts and their authors are then mapped
    onto those topics.
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._post_dictionary = {}
        self._authors = []
        self._add_author('test author')
        self._preprocess_visualization = EntityToTopicConverter(self._db)

    def tearDown(self):
        self._db.session.close_all()
        self._db.deleteDB()
        self._db.session.close()

    def _mapping_args(self, with_connection=False):
        """Build the argument dict the converter consumes.

        The 'source' section always selects 'Claim'-domain posts.  When
        `with_connection` is True, the claim_tweet_connection table and the
        'Microblog'-domain destination posts are filled in as well; otherwise
        those sections are left empty, matching the topic-only tests.
        """
        arg = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{
                    "field_name": "domain",
                    "value": "Claim"
                }]
            },
            'connection': {},
            'destination': {}
        }
        if with_connection:
            arg['connection'] = {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
            }
            arg['destination'] = {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{
                    "field_name": "domain",
                    "value": "Microblog"
                }]
            }
        return arg

    def test_generate_topics_no_topics(self):
        arg = self._mapping_args()
        source_id_target_elements_dict = \
            self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(
            source_id_target_elements_dict, arg)
        topics = self._db.get_topics()
        self.assertEqual(topics, [])

    def test_generate_topics_from_1_claim(self):
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._db.session.commit()
        arg = self._mapping_args()
        source_id_target_elements_dict = \
            self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(
            source_id_target_elements_dict, arg)
        self._preprocess_visualization.save_topic_entities()
        self.assertTopicInserted('claim1')

    def test_generate_topics_from_1_claim_and_remove_stop_words(self):
        self._add_post("test author", 'claim1', 'claim1 go to the house', 'Claim')
        arg = self._mapping_args()
        self._db.session.commit()
        # Stop words ('go', 'to', 'the') must be dropped from the topic terms.
        self._preprocess_visualization._remove_stop_words = True
        source_id_target_elements_dict = \
            self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(
            source_id_target_elements_dict, arg)
        self._preprocess_visualization.save_topic_entities()
        self.assertTopicInserted('claim1')

    def test_generate_topics_from_5_claims(self):
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author", 'claim2', 'claim2 content', 'Claim')
        self._add_post("test author", 'claim3', 'claim3 content move', 'Claim')
        self._add_post("test author", 'claim4', 'claim4 dif data', 'Claim')
        self._add_post("test author", 'claim5', 'claim5 some boring text', 'Claim')
        self._db.session.commit()
        arg = self._mapping_args()
        source_id_target_elements_dict = \
            self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(
            source_id_target_elements_dict, arg)
        self._preprocess_visualization.save_topic_entities()
        for claim_id in ['claim1', 'claim2', 'claim3', 'claim4', 'claim5']:
            self.assertTopicInserted(claim_id)

    def test_generate_post_topic_mapping_no_claim(self):
        arg = self._mapping_args()
        source_id_target_elements_dict = \
            self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_post_topic_mapping(
            source_id_target_elements_dict, arg)
        mappings = self._db.get_post_topic_mapping()
        self.assertEqual(0, len(mappings))

    def test_generate_post_topic_mapping_1_claim(self):
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2 bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting new', 'Microblog')
        self._add_claim_tweet_connection('claim1', 'post1')
        self._add_claim_tweet_connection('claim1', 'post2')
        self._add_claim_tweet_connection('claim1', 'post3')
        self._db.session.commit()
        arg = self._mapping_args(with_connection=True)
        source_id_target_elements_dict = \
            self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(
            source_id_target_elements_dict, arg)
        self._preprocess_visualization.generate_post_topic_mapping(
            source_id_target_elements_dict, arg)
        self._preprocess_visualization.save_topic_entities()
        mappings = self._db.get_post_topic_mapping()
        mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                    for tm in mappings]
        topic_id = self._preprocess_visualization.get_source_id_topic_dictionary()['claim1']
        self.assertEqual(3, len(mappings))
        # All three connected posts map to claim1's topic with full weight.
        self.assertSetEqual(
            {('post1', topic_id, 1.0), ('post2', topic_id, 1.0),
             ('post3', topic_id, 1.0)}, set(mappings))

    def test_generate_post_topic_mapping_2_claim(self):
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author", 'claim2', 'claim1 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2 bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting new', 'Microblog')
        self._add_post("test author", 'post4', 'post4 bla bla', 'Microblog')
        self._add_post("test author", 'post5', 'post5 noting new', 'Microblog')
        self._add_claim_tweet_connection('claim1', 'post1')
        self._add_claim_tweet_connection('claim1', 'post2')
        self._add_claim_tweet_connection('claim1', 'post3')
        self._add_claim_tweet_connection('claim2', 'post4')
        self._add_claim_tweet_connection('claim2', 'post5')
        self._db.session.commit()
        arg = self._mapping_args(with_connection=True)
        source_id_target_elements_dict = \
            self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(
            source_id_target_elements_dict, arg)
        self._preprocess_visualization.generate_post_topic_mapping(
            source_id_target_elements_dict, arg)
        self._preprocess_visualization.save_topic_entities()
        mappings = self._db.get_post_topic_mapping()
        mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                    for tm in mappings]
        topic_id1 = self._preprocess_visualization.get_source_id_topic_dictionary()['claim1']
        topic_id2 = self._preprocess_visualization.get_source_id_topic_dictionary()['claim2']
        self.assertEqual(5, len(mappings))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id1, 1.0), ('post4', topic_id2, 1.0),
             ('post5', topic_id2, 1.0)}, set(mappings))

    def test__generate_author_topic_mapping_2_claim(self):
        self._add_author('test author2')
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author2", 'claim2', 'claim1 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2 bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting new', 'Microblog')
        self._add_post("test author", 'post4', 'post4 bla bla', 'Microblog')
        self._add_post("test author", 'post5', 'post5 noting new', 'Microblog')
        self._add_claim_tweet_connection('claim1', 'post1')
        self._add_claim_tweet_connection('claim1', 'post2')
        self._add_claim_tweet_connection('claim1', 'post3')
        self._add_claim_tweet_connection('claim2', 'post4')
        self._add_claim_tweet_connection('claim2', 'post5')
        self._db.session.commit()
        arg = self._mapping_args(with_connection=True)
        self._preprocess_visualization._domain = "Microblog"
        source_id_target_elements_dict = \
            self._preprocess_visualization._get_source_id_target_elements(arg)
        self._preprocess_visualization.generate_topics_tables(
            source_id_target_elements_dict, arg)
        self._preprocess_visualization.generate_post_topic_mapping(
            source_id_target_elements_dict, arg)
        self._preprocess_visualization.generate_author_topic_mapping()
        self._preprocess_visualization.save_topic_entities()
        mapping = self._db.get_author_topic_mapping()
        self.assertEqual(2, len(mapping))
        # 'test author' wrote 3 posts on claim1's topic and 2 on claim2's
        # (3/5, 2/5); 'test author2' wrote no Microblog posts at all.
        self.assertSetEqual({('test author', 0.6, 0.4), ('test author2', 0, 0)},
                            set(mapping))

    def _build_visualization_fixture(self):
        """Populate two claims (one per author) plus five connected posts."""
        self._add_author('test author2', "bad_actor")
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author2", 'claim2', 'claim2 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2 bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting new', 'Microblog')
        self._add_post("test author2", 'post4', 'post4 bla bla', 'Microblog')
        self._add_post("test author2", 'post5', 'post5 noting new', 'Microblog')
        self._add_claim_tweet_connection('claim1', 'post1')
        self._add_claim_tweet_connection('claim1', 'post2')
        self._add_claim_tweet_connection('claim1', 'post4')
        self._add_claim_tweet_connection('claim2', 'post3')
        self._add_claim_tweet_connection('claim2', 'post5')
        self._db.session.commit()

    def _assert_visualization_results(self):
        """Verify author and post topic mappings produced by execute()."""
        author_topic_mapping = self._db.get_author_topic_mapping()
        post_topic_mappings = self._db.get_post_topic_mapping()
        post_topic_mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                               for tm in post_topic_mappings]
        topic_id1 = self._preprocess_visualization.get_source_id_topic_dictionary()['claim1']
        topic_id2 = self._preprocess_visualization.get_source_id_topic_dictionary()['claim2']
        self.assertEqual(2, len(author_topic_mapping))
        self.assertSetEqual(
            {('test author', 0.666666666667, 0.333333333333),
             ('test author2', 0.5, 0.5)}, set(author_topic_mapping))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id2, 1.0), ('post4', topic_id1, 1.0),
             ('post5', topic_id2, 1.0)}, set(post_topic_mappings))

    def test_visualization(self):
        self._build_visualization_fixture()
        self._preprocess_visualization._domain = "Microblog"
        self._preprocess_visualization.execute()
        self._assert_visualization_results()

    def test_double_execution_visualization(self):
        """execute() must be idempotent: running twice yields the same tables."""
        self._build_visualization_fixture()
        self._preprocess_visualization._domain = "Microblog"
        self._preprocess_visualization.execute()
        self._preprocess_visualization.execute()
        self._assert_visualization_results()

    def assertTopicInserted(self, claim_id):
        """Assert a topic exists for `claim_id` whose term set equals the
        cleaned words of the source post's content."""
        topics = self._db.get_topics()
        terms = self._db.get_terms()
        topic_dict = defaultdict(set)
        term_dict = {term.term_id: term.description for term in terms}
        # get_topics() rows unpack as (topic_id, term_id, probability).
        for topic_id, term_id, prob in topics:
            topic_dict[topic_id].add(term_dict[term_id])
        topic_id = self._preprocess_visualization.get_source_id_topic_dictionary()[claim_id]
        self.assertIn(topic_id, topic_dict)
        content = self._post_dictionary[claim_id].content
        if self._preprocess_visualization._remove_stop_words:
            expected = set(clean_content_by_nltk_stopwords(content).split(' '))
        else:
            expected = set(clean_tweet(content).split(' '))
        self.assertSetEqual(expected, topic_dict[topic_id])

    def _add_author(self, author_guid, type="good_actor"):
        """Insert an Author row; `type` sets author_type (name kept for
        interface compatibility even though it shadows the builtin)."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = author_guid
        author.author_screen_name = author_guid
        author.name = author_guid
        author.domain = 'Microblog'
        author.author_type = type
        self._db.add_author(author)
        self._authors.append(author)

    def _add_post(self, author_guid, title, content, domain='Microblog'):
        """Insert a Post (post_id == guid == title) and index it by post_id."""
        post = Post()
        post.author = author_guid
        post.author_guid = author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.is_detailed = True
        post.is_LB = False
        self._db.addPost(post)
        self._posts.append(post)
        self._post_dictionary[post.post_id] = post

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a post via the claim_tweet_connection table."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])