class N_Grams_Feature_Generator_Unittests(unittest.TestCase):
    """Unit tests for N_Grams_Feature_Generator over posts in the 'test' domain."""

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._domain = 'test'
        self._posts = []    # Post objects created via _add_post, in insertion order
        self._authors = []  # Author objects created via _add_author

    def tearDown(self):
        # Close before deleting the DB; deleteDB() may re-open a session, so we
        # close again afterwards to leave no open handle behind.
        # NOTE(review): confirm deleteDB() actually reopens the session — if it
        # does not, the second close() is redundant.
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()

    def execute_module(self):
        """Run the n-gram feature generator (stemming disabled) over stored posts."""
        posts = self._db.get_posts_by_domain(self._domain)
        parameters = {"authors": self._authors, "posts": posts, "graphs": []}
        n_gram_module = N_Grams_Feature_Generator(self._db, **parameters)
        n_gram_module._stemming = False
        n_gram_module.execute(window_start=None)

    def test_simple_case(self):
        """Author '1' writes the bigram 'insufficient inline' twice -> value '2'."""
        self._add_author('1')
        self._add_post('do that',
                       'This article includes a list of references, but its sources remain unclear because it has insufficient inline citations. Please help to improve this article by introducing more precise citations.',
                       '1')
        self._add_post('to do ', 'article citations insufficient inline because the damn thing will not do that', '1')
        self._add_author('2')
        self._add_post('this was a triumph', 'im making a note here insufficient inline', '2')
        self.execute_module()
        # Feature values are persisted as strings in author_features.
        db_val = self._db.get_author_feature('1', "2_gram_insufficient_inline").attribute_value
        self.assertEqual(db_val, '2')

    def _add_author(self, author_guid):
        """Create, persist (committed), and track an Author with the given guid."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = 'test author'
        author.name = 'test'
        author.domain = 'test'
        self._db.add_author(author)
        self._db.session.commit()
        self._authors.append(author)

    def _add_post(self, title, content, author_guid):
        """Create, persist (committed), and track a Post; id/guid is the insertion index."""
        post = Post()
        post.author = author_guid
        post.author_guid = author_guid
        post.content = content
        post.title = title
        post.domain = 'test'
        post.post_id = len(self._posts)
        post.guid = post.post_id
        post.date = date('2020-01-01 23:59:59')
        self._db.addPost(post)
        self._db.session.commit()
        self._posts.append(post)
class TestBehaviorFeatureGenerator(TestCase):
    def setUp(self):
        """Create a fresh DB and reset the per-test post list and author."""
        self._db = DB()
        self._db.setUp()
        self._posts = []     # Post objects created via _add_post, in insertion order
        self._author = None  # the single Author under test, created by _add_author

    def tearDown(self):
        """Close the DB session opened in setUp.

        NOTE(review): unlike the n-gram suite, deleteDB() is not called here —
        confirm the test DB is meant to be left on disk between runs.
        """
        self._db.session.close()

    ######################## Average minute between posts tests ######################

    def test_average_minutes_between_posts_no_post_expected_0(self):
        """With no posts at all, average_minutes_between_posts returns 0."""
        self._add_author(u"author_guid")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        # Narrow the generator to the single feature under test.
        self._behavior_feature_generator._features = [
            'average_minutes_between_posts'
        ]
        self._behavior_feature_generator._targeted_fields = [{
            'source': {
                "table_name": "posts",
                "id": "author_guid",
                "target_field": "content",
                "where_clauses": [{
                    "field_name": 1,
                    "value": 1
                }]
            },
            "connection": {},
            "destination": {}
        }]
        # Call the feature computation directly rather than through execute().
        result = self._behavior_feature_generator.average_minutes_between_posts(
            **{'posts': self._posts})
        self.assertEqual(0, result)

    def test_average_minutes_between_posts_one_post_expected_0(self):
        """A single post has no gaps, so the persisted feature value is '0'."""
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {
                "table_name": "posts",
                "id": "author_guid",
                "target_field": "content",
                "where_clauses": [{
                    "field_name": 1,
                    "value": 1
                }]
            },
            "connection": {},
            "destination": {}
        }]
        self._behavior_feature_generator.execute()
        # Feature values are persisted as strings in author_features.
        result_feature = self._db.get_author_feature(
            u"author_guid",
            u"BehaviorFeatureGenerator_average_minutes_between_posts")
        feature_value = getattr(result_feature, u'attribute_value')
        self.assertEqual('0', feature_value)

    def test_average_minutes_between_posts_3_post_expected_105(self):
        """Gaps of 60 and 150 minutes between consecutive posts average to 105."""
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-12 06:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-12 08:30:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        result = self._behavior_feature_generator.average_minutes_between_posts(
            **{'posts': self._posts})
        self.assertEqual(105, result)

    ######################## Average posts per day tests ######################

    def test_average_posts_per_day_active_days_no_posts_expect_0(self):
        """With no posts there are no active days, so the average is 0."""
        self._add_author(u"author_guid")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        result = self._behavior_feature_generator.average_posts_per_day_active_days(
            **{'posts': self._posts})
        self.assertEqual(0, result)

    def test_average_posts_per_day_1_active_days_1_post_each_expect_1(self):
        """One active day with one post -> average of 1 post per active day."""
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        result = self._behavior_feature_generator.average_posts_per_day_active_days(
            **{'posts': self._posts})
        # BUGFIX: assertAlmostEqual's third positional argument is `places`
        # (an int); passing the tolerance there raises TypeError on Python 3.
        # The tolerance must go through the `delta` keyword.
        self.assertAlmostEqual(1.0, result, delta=0.0000001)

    def test_average_posts_per_day_3_active_days_1_post_each_expect_1(self):
        """Three active days with one post each -> average of 1 per active day."""
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-16 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        result = self._behavior_feature_generator.average_posts_per_day_active_days(
            **{'posts': self._posts})
        # BUGFIX: the tolerance was passed as the `places` positional argument
        # (which must be an int); use the `delta` keyword instead.
        self.assertAlmostEqual(1.0, result, delta=0.0000001)

    def test_average_posts_per_day_3_active_days_1_first_2_secound_3_third_expect_2(
            self):
        """Six posts over three active days (1 + 2 + 3) average to 2 per active day."""
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        result = self._behavior_feature_generator.average_posts_per_day_active_days(
            **{'posts': self._posts})
        self.assertEqual(2.0, result)

    def test_average_posts_per_day_3_active_days_1_first_2_secound_3_third_expect_2_represent_by_post(
            self):
        """Same 1+2+3 day distribution, but driven through execute() with a claim
        post as the source element, so features are stored under the claim's id."""
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-10 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"content 3", "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5", "2017-06-16 04:00:00")
        self._add_post(u"post6", u"content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7", "2017-06-16 06:00:00",
                       u"Not Microblog")
        self._add_claim_tweet_connection(u"post0", u"post1")
        self._add_claim_tweet_connection(u"post0", u"post2")
        self._add_claim_tweet_connection(u"post0", u"post3")
        self._add_claim_tweet_connection(u"post0", u"post4")
        self._add_claim_tweet_connection(u"post0", u"post5")
        self._add_claim_tweet_connection(u"post0", u"post6")
        self._add_claim_tweet_connection(u"post0", u"post7")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        # Source = claim posts; destination = connected Microblog tweets dated at
        # or after the claim (post7 is excluded by its non-Microblog domain).
        self._behavior_feature_generator._targeted_fields = [{
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                "where_clauses": [{
                    "field_name": "domain",
                    "value": "Claim"
                }]
            },
            'connection': {
                'table_name':
                'claim_tweet_connection',
                'source_id':
                'claim_id',
                'target_id':
                'post_id',
                "where_clauses": [{
                    "val1": "source.date",
                    "val2": "dest.date",
                    "op": "<="
                }]
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{
                    "field_name": "domain",
                    "value": "Microblog"
                }]
            }
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'post0',
            u"BehaviorFeatureGenerator_average_posts_per_day_active_days")
        self.assertEqual(u'2.0', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_average_posts_per_day_total")
        self.assertGreater(float(author_feature.attribute_value), 0)

    def test_retweet_count_0_posts(self):
        """An author with no posts gets a stored retweet_count of '0'."""
        self._add_author(u"author_guid")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {
                'table_name': 'authors',
                'id': 'author_guid',
                "target_field": "author_guid",
                "where_clauses": [{
                    "field_name": "1",
                    "value": "1"
                }]
            },
            'connection': {},
            'destination': {}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'0', author_feature.attribute_value)

    def test_retweet_count_1_retweet(self):
        """A single post whose content contains 'RT @' counts as one retweet."""
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"RT @content 1", "2017-06-12 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {
                'table_name': 'authors',
                'id': 'author_guid',
                "target_field": "author_guid",
                "where_clauses": [{
                    "field_name": "1",
                    "value": "1"
                }]
            },
            'connection': {},
            'destination': {}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'1', author_feature.attribute_value)

    def test_retweet_count_3_retweet(self):
        """Of the six connected Microblog tweets, three contain 'RT @' -> count 3.

        post7 also contains 'RT @' but is excluded by its non-Microblog domain.
        """
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-10 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"RT @content 3 RT @hi",
                       "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5 bla RT @bla",
                       "2017-06-16 04:00:00")
        self._add_post(u"post6", u"RT @content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7 RT @wow", "2017-06-16 06:00:00",
                       u"Not Microblog")
        self._add_claim_tweet_connection(u"post0", u"post1")
        self._add_claim_tweet_connection(u"post0", u"post2")
        self._add_claim_tweet_connection(u"post0", u"post3")
        self._add_claim_tweet_connection(u"post0", u"post4")
        self._add_claim_tweet_connection(u"post0", u"post5")
        self._add_claim_tweet_connection(u"post0", u"post6")
        self._add_claim_tweet_connection(u"post0", u"post7")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                "where_clauses": [{
                    "field_name": "domain",
                    "value": "Claim"
                }]
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{
                    "field_name": "domain",
                    "value": "Microblog"
                }]
            }
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'3', author_feature.attribute_value)

        # 3 retweets across 6 connected Microblog tweets -> 0.5 on average.
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_average_retweets")
        self.assertEqual(u'0.5', author_feature.attribute_value)

    def test_received_retweets_count_0_retweets(self):
        """With no posts the retweet_count feature is '0'.

        NOTE(review): despite the "received_retweets" name, this asserts the
        BehaviorFeatureGenerator_retweet_count feature — confirm whether a
        dedicated received-retweets feature was intended instead.
        """
        self._add_author(u"author_guid")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {
                'table_name': 'authors',
                'id': 'author_guid',
                "target_field": "author_guid",
                "where_clauses": [{
                    "field_name": "1",
                    "value": "1"
                }]
            },
            'connection': {},
            'destination': {}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'0', author_feature.attribute_value)

    def test_received_retweets_count_1_retweets(self):
        """One post containing 'RT @author_guid ...' -> feature value '1'.

        NOTE(review): like the 0-retweets case above in spirit, this asserts
        BehaviorFeatureGenerator_retweet_count rather than a received-retweets
        feature — confirm the intended feature name.
        """
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"RT @author_guid content 1",
                       "2017-06-12 05:00:00")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {
                'table_name': 'authors',
                'id': 'author_guid',
                "target_field": "author_guid",
                "where_clauses": [{
                    "field_name": "1",
                    "value": "1"
                }]
            },
            'connection': {},
            'destination': {}
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'author_guid', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'1', author_feature.attribute_value)

    def test_received_retweets_count_3_retweets_only_from_microblog_tweets(
            self):
        """Three of the connected Microblog tweets contain 'RT @'; the
        non-Microblog post7 is excluded by the destination domain filter."""
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-10 05:00:00", u"Claim")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 2", "2017-06-13 05:00:00")
        self._add_post(u"post3", u"RT @author_guid content 3 RT @hi",
                       "2017-06-13 06:00:00")
        self._add_post(u"post4", u"content 4", "2017-06-16 03:00:00")
        self._add_post(u"post5", u"content 5 bla RT @author_guid",
                       "2017-06-16 04:00:00")
        self._add_post(u"post6", u"RT @content 6", "2017-06-16 05:00:00")
        self._add_post(u"post7", u"content 7 RT @wow", "2017-06-16 06:00:00",
                       u"Not Microblog")
        self._add_claim_tweet_connection(u"post0", u"post1")
        self._add_claim_tweet_connection(u"post0", u"post2")
        self._add_claim_tweet_connection(u"post0", u"post3")
        self._add_claim_tweet_connection(u"post0", u"post4")
        self._add_claim_tweet_connection(u"post0", u"post5")
        self._add_claim_tweet_connection(u"post0", u"post6")
        self._add_claim_tweet_connection(u"post0", u"post7")
        self._db.add_author(self._author)
        self._db.session.commit()
        params = self._get_params()
        self._behavior_feature_generator = BehaviorFeatureGenerator(
            self._db, **params)
        self._behavior_feature_generator._targeted_fields = [{
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                "where_clauses": [{
                    "field_name": "domain",
                    "value": "Claim"
                }]
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{
                    "field_name": "domain",
                    "value": "Microblog"
                }]
            }
        }]
        self._behavior_feature_generator.execute()
        author_feature = self._db.get_author_feature(
            u'post0', u"BehaviorFeatureGenerator_retweet_count")
        self.assertEqual(u'3', author_feature.attribute_value)

    ######################## argument_parser tests ######################

    def test_argument_parser_connection_conditions(self):
        """Only Microblog tweets dated at/after the claim pass the '<=' condition."""
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        tweet_specs = [
            (u"post1", u"content 1", "2017-06-12 05:00:00", u"Microblog"),
            (u"post2", u"content 2", "2017-06-13 05:00:00", u"Microblog"),
            (u"post3", u"content 3", "2017-06-13 06:00:00", u"Microblog"),
            (u"post4", u"content 4", "2017-06-16 03:00:00", u"Microblog"),
            (u"post5", u"content 5", "2017-06-16 04:00:00", u"Microblog"),
            (u"post6", u"content 6", "2017-06-16 05:00:00", u"Microblog"),
            (u"post7", u"content 7", "2017-06-16 06:00:00", u"Not Microblog"),
        ]
        for title, content, date_str, domain in tweet_specs:
            self._add_post(title, content, date_str, domain)
        # Connect every tweet to the claim; the where_clauses decide which survive.
        for title, _, _, _ in tweet_specs:
            self._add_claim_tweet_connection(u"post0", title)
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}]
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
                # keep only tweets posted at or after the claim's date
                "where_clauses": [{"val1": "source.date",
                                   "val2": "dest.date",
                                   "op": "<="}]
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Microblog"}]
            }
        }
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(
            args)
        actual = {element.post_id
                  for element in source_id_target_elements_dict["post0"]}
        self.assertSetEqual(actual, {'post4', 'post5', 'post6'})

    def test_argument_parser_connection_conditions_with_timedelta(self):
        """'timeinterval' with delta=1 keeps tweets within one day of the claim."""
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        tweet_specs = [
            (u"post1", u"content 1", "2017-06-12 06:00:00", u"Microblog"),
            (u"post2", u"content 2", "2017-06-13 05:00:00", u"Microblog"),
            (u"post3", u"content 3", "2017-06-13 06:00:00", u"Microblog"),
            (u"post4", u"content 4", "2017-06-16 03:00:00", u"Microblog"),
            (u"post5", u"content 5", "2017-06-16 04:00:00", u"Microblog"),
            (u"post6", u"content 6", "2017-06-16 05:00:00", u"Microblog"),
            (u"post7", u"content 7", "2017-06-16 06:00:00", u"Not Microblog"),
        ]
        for title, content, date_str, domain in tweet_specs:
            self._add_post(title, content, date_str, domain)
        for title, _, _, _ in tweet_specs:
            self._add_claim_tweet_connection(u"post0", title)
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}]
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
                # keep only tweets within a 1-day window of the claim's date
                "where_clauses": [{"val1": "source.date",
                                   "val2": "dest.date",
                                   "op": "timeinterval",
                                   "delta": 1}]
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Microblog"}]
            }
        }
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(
            args)
        actual = {element.post_id
                  for element in source_id_target_elements_dict["post0"]}
        self.assertSetEqual(actual, {'post2', 'post3'})

    def test_argument_parser_connection_conditions_with_before_timedelta(self):
        """'before' with delta=1 keeps tweets posted up to one day before the claim."""
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        tweet_specs = [
            (u"post1", u"content 1", "2017-06-12 06:00:00", u"Microblog"),
            (u"post2", u"content 2", "2017-06-13 05:00:00", u"Microblog"),
            (u"post3", u"content 3", "2017-06-13 06:00:00", u"Microblog"),
            (u"post4", u"content 4", "2017-06-16 03:00:00", u"Microblog"),
            (u"post5", u"content 5", "2017-06-16 04:00:00", u"Microblog"),
            (u"post6", u"content 6", "2017-06-16 05:00:00", u"Microblog"),
            (u"post7", u"content 7", "2017-06-16 06:00:00", u"Not Microblog"),
        ]
        for title, content, date_str, domain in tweet_specs:
            self._add_post(title, content, date_str, domain)
        for title, _, _, _ in tweet_specs:
            self._add_claim_tweet_connection(u"post0", title)
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}]
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
                # keep only tweets posted at most one day before the claim
                "where_clauses": [{"val1": "source.date",
                                   "val2": "dest.date",
                                   "op": "before",
                                   "delta": 1}]
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Microblog"}]
            }
        }
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(
            args)
        actual = {element.post_id
                  for element in source_id_target_elements_dict["post0"]}
        self.assertSetEqual(actual, {'post2', 'post3'})

    def test_argument_parser_connection_conditions_with_after_timedelta(self):
        """'after' with delta=1 keeps tweets posted up to one day after the claim."""
        self._add_author(u"author_guid")
        self._add_post(u"post0", u"the claim", "2017-06-14 05:00:00", u"Claim")
        tweet_specs = [
            (u"post1", u"content 1", "2017-06-12 06:00:00", u"Microblog"),
            (u"post2", u"content 2", "2017-06-13 05:00:00", u"Microblog"),
            (u"post3", u"content 3", "2017-06-15 05:00:00", u"Microblog"),
            (u"post4", u"content 4", "2017-06-16 03:00:00", u"Microblog"),
            (u"post5", u"content 5", "2017-06-16 04:00:00", u"Microblog"),
            (u"post6", u"content 6", "2017-06-16 05:00:00", u"Microblog"),
            (u"post7", u"content 7", "2017-06-16 06:00:00", u"Not Microblog"),
        ]
        for title, content, date_str, domain in tweet_specs:
            self._add_post(title, content, date_str, domain)
        for title, _, _, _ in tweet_specs:
            self._add_claim_tweet_connection(u"post0", title)
        self._db.add_author(self._author)
        self._db.session.commit()
        arg_parser = ArgumentParser(self._db)
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}]
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
                # keep only tweets posted at most one day after the claim
                "where_clauses": [{"val1": "source.date",
                                   "val2": "dest.date",
                                   "op": "after",
                                   "delta": 1}]
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Microblog"}]
            }
        }
        source_id_target_elements_dict = arg_parser._get_source_id_target_elements(
            args)
        actual = {element.post_id
                  for element in source_id_target_elements_dict["post0"]}
        self.assertSetEqual(actual, {'post3'})

    def _add_author(self, author_guid):
        """Build the Author under test and stash it on self (not persisted here).

        The screen name mirrors the guid so retweet tests can match "RT @<guid>".
        """
        new_author = Author()
        new_author.author_guid = author_guid
        new_author.author_screen_name = author_guid
        new_author.author_full_name = u'test author'
        new_author.name = u'test'
        new_author.domain = u'tests'
        new_author.statuses_count = 0
        new_author.created_at = u"2017-06-14 05:00:00"
        self._author = new_author

    def _add_post(self, title, content, date_str, domain=u'Microblog'):
        """Persist a Post by self._author; the title doubles as post_id and guid."""
        new_post = Post()
        new_post.author = self._author.author_guid
        new_post.author_guid = self._author.author_guid
        new_post.title = title
        new_post.content = content
        new_post.domain = domain
        new_post.post_id = title
        new_post.guid = title
        new_post.date = convert_str_to_unicode_datetime(date_str)
        new_post.created_at = new_post.date
        self._db.addPost(new_post)
        self._posts.append(new_post)
        # keep the author's status counter in sync with the posts we create
        self._author.statuses_count += 1

    def _get_params(self):
        posts = {self._author.author_guid: self._posts}
        params = {'authors': [self._author], 'posts': posts}
        return params

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim post to a tweet post via the claim_tweet_connection table."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])
class TestAccountPropertiesFeatureGenerator(TestCase):
    """Tests for AccountPropertiesFeatureGenerator over one seeded author.

    setUp persists a single author and post, runs the generator once, and each
    test asserts one stored author-feature value.
    """

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self.author_guid = u"author_guid"

        # Seed one author with known property values for the assertions below.
        author = Author()
        for field, value in [
            ('author_guid', self.author_guid),
            ('author_full_name', u'author'),
            ('name', u'author_name'),
            ('author_screen_name', u'author_screen_name'),
            ('domain', u'Microblog'),
            ('statuses_count', 10),
            ('friends_count', 5),
            ('followers_count', 6),
            ('favourites_count', 8),
            ('author_sub_type', u"bot"),
            ('author_type', u"bad"),
            ('created_at', u"2017-06-17 05:00:00"),
            ('default_profile', True),
            ('default_profile_image', True),
            ('verified', True),
        ]:
            setattr(author, field, value)
        self._db.add_author(author)

        # One crawled post, so number_of_crawled_posts == 1.
        post = Post()
        for field, value in [
            ('author', self.author_guid),
            ('author_guid', self.author_guid),
            ('content', u"content"),
            ('title', u"title"),
            ('domain', u"domain"),
            ('post_id', u"post_id"),
        ]:
            setattr(post, field, value)
        post.guid = post.post_id
        post.date = convert_str_to_unicode_datetime("2017-06-14 05:00:00")
        post.created_at = post.date
        self._db.addPost(post)

        self._db.session.commit()
        self.feature_prefix = u"AccountPropertiesFeatureGenerator_"
        generator_params = {
            'authors': [author],
            'posts': {self.author_guid: [post]}
        }
        self.account_properties_feature_generator = AccountPropertiesFeatureGenerator(
            self._db, **generator_params)
        self.account_properties_feature_generator.execute()

    def tearDown(self):
        self._db.session.close()

    def _feature_value(self, feature_suffix):
        """Fetch the stored attribute_value of one generated author feature."""
        feature = self._db.get_author_feature(
            self.author_guid, self.feature_prefix + feature_suffix)
        return feature.attribute_value

    def test_account_age(self):
        creation_date = parser.parse(u"2017-06-17 05:00:00").date()
        age_in_days = (datetime.date.today() - creation_date).days
        self.assertEqual(age_in_days, int(self._feature_value(u"account_age")))

    def test_number_followers(self):
        self.assertEqual(6, int(self._feature_value(u"number_followers")))

    def test_number_friends(self):
        self.assertEqual(5, int(self._feature_value(u"number_friends")))

    def test_friends_followers_ratio(self):
        self.assertAlmostEqual(
            5.0 / 6,
            float(self._feature_value(u"friends_followers_ratio")),
            places=5)

    def test_number_of_crawled_posts(self):
        self.assertEqual("1", self._feature_value(u"number_of_crawled_posts"))

    def test_number_of_posts(self):
        self.assertEqual(10, int(self._feature_value(u"number_of_posts")))

    def test_default_profile(self):
        self.assertEqual(u"1", self._feature_value(u"default_profile"))

    def test_default_profile_image(self):
        self.assertEqual(u"1", self._feature_value(u"default_profile_image"))

    def test_verified(self):
        self.assertEqual(u'1', self._feature_value(u"verified"))

    def test_screen_name_length(self):
        self.assertEqual(18, int(self._feature_value(u"screen_name_length")))

    def test_author_screen_name(self):
        self.assertEqual(u"author_screen_name",
                         self._feature_value(u"author_screen_name"))

    def test_author_type(self):
        self.assertEqual(u"bad", self._feature_value(u"author_type"))

    def test_author_sub_type(self):
        self.assertEqual(u"bot", self._feature_value(u"author_sub_type"))
class TestClaimToTopicConverter(TestCase):
    """Integration tests for ClaimToTopicConverter.

    The converter turns each claim into a topic whose terms are the cleaned
    words of the claim description, then maps posts (via claim-tweet
    connections) and authors onto those topics.
    """

    def setUp(self):
        # Fresh DB plus one default author shared by most tests.
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._claim_dictionary = {}
        self._authors = []
        self._add_author(u'test author')
        self._preprocess_visualization = ClaimToTopicConverter(self._db)

    def tearDown(self):
        # NOTE(review): Session.close_all() is deprecated in newer SQLAlchemy
        # (close_all_sessions() is the replacement) — confirm the pinned version.
        self._db.session.close_all()
        self._db.deleteDB()
        self._db.session.close()

    def test_generate_topics_no_topics(self):
        """With no claims in the DB, no topics should be generated."""
        # NOTE(review): this local is unused — presumably a leftover.
        claim_id_posts_dict = self._db.get_claim_id_posts_dict()
        self._preprocess_visualization.generate_topics_tables()
        topics = self._db.get_topics()
        self.assertEqual(topics, [])

    def test_generate_topics_from_1_claim(self):
        """A single claim yields a single topic built from its description words."""
        self._add_claim(u'claim1', u'claim1 content')
        self._db.session.commit()
        # NOTE(review): this local is unused — presumably a leftover.
        claim_id_posts_dict = self._db.get_claim_id_posts_dict()
        self._preprocess_visualization.generate_topics_tables()
        self.assertTopicInserted(u'claim1', [u'claim1', u'content'])

    def test_generate_topics_from_5_claims(self):
        """Each of five claims yields its own topic with its own term set."""
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim2 content')
        self._add_claim(u'claim3', u'claim3 content move')
        self._add_claim(u'claim4', u'claim4 dif data')
        self._add_claim(u'claim5', u'claim5 some boring text')
        self._db.session.commit()
        self._preprocess_visualization.generate_topics_tables()
        self.assertTopicInserted(u'claim1', [u'claim1', u'content'])
        self.assertTopicInserted(u'claim2', [u'claim2', u'content'])
        self.assertTopicInserted(u'claim3', [u'claim3', u'content', u'move'])
        self.assertTopicInserted(u'claim4', [u'claim4', u'dif', u'data'])
        self.assertTopicInserted(u'claim5',
                                 [u'claim5', u'some', u'boring', u'text'])

    def test_generate_post_topic_mapping_no_claim(self):
        """No claims means no post-topic mappings."""
        self._preprocess_visualization.generate_post_topic_mapping()
        mappings = self._db.get_post_topic_mapping()
        self.assertEqual(0, len(mappings))

    def test_generate_post_topic_mapping_1_claim(self):
        """All posts connected to one claim map to that claim's topic with dist 1.0."""
        self._add_claim(u'claim1', u'claim1 content')
        self._add_post(u"test author", u'post1', u'post1 content of data',
                       u'Microblog')
        self._add_post(u"test author", u'post2', u'post2  bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting  new',
                       u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post3')
        self._db.session.commit()

        self._preprocess_visualization.generate_topics_tables()
        self._preprocess_visualization.generate_post_topic_mapping()

        mappings = self._db.get_post_topic_mapping()
        mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                    for tm in mappings]
        topic_id = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[u'claim1']
        self.assertEqual(3, len(mappings))
        self.assertSetEqual(
            {('post1', topic_id, 1.0), ('post2', topic_id, 1.0),
             ('post3', topic_id, 1.0)}, set(mappings))

    def test_generate_post_topic_mapping_2_claim(self):
        """Posts split across two claims map to the corresponding two topics."""
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim1 content')
        self._add_post(u"test author", u'post1', u'post1 content of data',
                       u'Microblog')
        self._add_post(u"test author", u'post2', u'post2  bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting  new',
                       u'Microblog')
        self._add_post(u"test author", u'post4', u'post4  bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post5', u'post5 noting  new',
                       u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post3')
        self._add_claim_tweet_connection(u'claim2', u'post4')
        self._add_claim_tweet_connection(u'claim2', u'post5')
        self._db.session.commit()
        self._preprocess_visualization.generate_topics_tables()
        self._preprocess_visualization.generate_post_topic_mapping()

        mappings = self._db.get_post_topic_mapping()
        mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                    for tm in mappings]
        topic_id1 = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[u'claim1']
        topic_id2 = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[u'claim2']
        self.assertEqual(5, len(mappings))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id1, 1.0), ('post4', topic_id2, 1.0),
             ('post5', topic_id2, 1.0)}, set(mappings))

    def test__generate_author_topic_mapping_2_claim(self):
        """Author-topic shares: 3/5 vs 2/5 posts for the posting author, 0/0 otherwise."""
        self._add_author(u'test author2')
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim1 content')
        self._add_post(u"test author", u'post1', u'post1 content of data',
                       u'Microblog')
        self._add_post(u"test author", u'post2', u'post2  bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting  new',
                       u'Microblog')
        self._add_post(u"test author", u'post4', u'post4  bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post5', u'post5 noting  new',
                       u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post3')
        self._add_claim_tweet_connection(u'claim2', u'post4')
        self._add_claim_tweet_connection(u'claim2', u'post5')
        self._db.session.commit()
        self._preprocess_visualization._domain = u"Microblog"
        self._preprocess_visualization.generate_topics_tables()
        self._preprocess_visualization.generate_post_topic_mapping()
        self._preprocess_visualization.generate_author_topic_mapping()

        mapping = self._db.get_author_topic_mapping()
        self.assertEqual(2, len(mapping))
        self.assertSetEqual(
            {(u'test author', 0.6, 0.4), (u'test author2', 0, 0)},
            set(mapping))

    def test_visualization(self):
        """End-to-end execute(): both post-topic and author-topic mappings produced."""
        self._add_author(u'test author2', u"bad_actor")
        self._add_claim(u'claim1', u'claim1 content')
        self._add_claim(u'claim2', u'claim2 content')
        self._add_post(u"test author", u'post1', u'post1 content of data',
                       u'Microblog')
        self._add_post(u"test author", u'post2', u'post2  bla bla',
                       u'Microblog')
        self._add_post(u"test author", u'post3', u'post3 noting  new',
                       u'Microblog')
        self._add_post(u"test author2", u'post4', u'post4  bla bla',
                       u'Microblog')
        self._add_post(u"test author2", u'post5', u'post5 noting  new',
                       u'Microblog')
        self._add_claim_tweet_connection(u'claim1', u'post1')
        self._add_claim_tweet_connection(u'claim1', u'post2')
        self._add_claim_tweet_connection(u'claim1', u'post4')
        self._add_claim_tweet_connection(u'claim2', u'post3')
        self._add_claim_tweet_connection(u'claim2', u'post5')
        self._db.session.commit()
        self._preprocess_visualization._domain = u"Microblog"
        self._preprocess_visualization.execute()

        author_topic_mapping = self._db.get_author_topic_mapping()
        post_topic_mappings = self._db.get_post_topic_mapping()
        post_topic_mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                               for tm in post_topic_mappings]
        topic_id1 = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[u'claim1']
        topic_id2 = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[u'claim2']
        self.assertEqual(2, len(author_topic_mapping))
        # 2/3 vs 1/3 for author1 (posts 1,2 on claim1; post 3 on claim2),
        # 1/2 vs 1/2 for author2 (post 4 on claim1; post 5 on claim2).
        self.assertSetEqual(
            {(u'test author', 0.666666666667, 0.333333333333),
             (u'test author2', 0.5, 0.5)}, set(author_topic_mapping))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id2, 1.0), ('post4', topic_id1, 1.0),
             ('post5', topic_id2, 1.0)}, set(post_topic_mappings))

    # NOTE(review): disabled test kept for reference — presumably execute()
    # was not idempotent when it was written; confirm before re-enabling.
    # def test_double_execution_visualization(self):
    #     self._add_author(u'test author2', u"bad_actor")
    #     self._add_claim(u'claim1', u'claim1 content')
    #     self._add_claim(u'claim2', u'claim2 content')
    #     self._add_post(u"test author", u'post1', u'post1 content of data', u'Microblog')
    #     self._add_post(u"test author", u'post2', u'post2  bla bla', u'Microblog')
    #     self._add_post(u"test author", u'post3', u'post3 noting  new', u'Microblog')
    #     self._add_post(u"test author2", u'post4', u'post4  bla bla', u'Microblog')
    #     self._add_post(u"test author2", u'post5', u'post5 noting  new', u'Microblog')
    #     self._add_claim_tweet_connection(u'claim1', u'post1')
    #     self._add_claim_tweet_connection(u'claim1', u'post2')
    #     self._add_claim_tweet_connection(u'claim1', u'post4')
    #     self._add_claim_tweet_connection(u'claim2', u'post3')
    #     self._add_claim_tweet_connection(u'claim2', u'post5')
    #     self._db.session.commit()
    #     self._preprocess_visualization._domain = u"Microblog"
    #     self._preprocess_visualization.execute()
    #     self._preprocess_visualization.execute()
    #
    #     author_topic_mapping = self._db.get_author_topic_mapping()
    #     post_topic_mappings = self._db.get_post_topic_mapping()
    #     post_topic_mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist) for tm in post_topic_mappings]
    #     topic_id1 = self._preprocess_visualization.get_claim_id_topic_dictionary()[u'claim1']
    #     topic_id2 = self._preprocess_visualization.get_claim_id_topic_dictionary()[u'claim2']
    #     self.assertEqual(2, len(author_topic_mapping))
    #     self.assertSetEqual({(u'test author2', 0.5, 0.5), (u'test author', 0.666666666667, 0.333333333333)},
    #                         set(author_topic_mapping))
    #     self.assertSetEqual(
    #         {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0), ('post3', topic_id2, 1.0), ('post4', topic_id1, 1.0),
    #          ('post5', topic_id2, 1.0)}, set(post_topic_mappings))

    def assertTopicInserted(self, claim_id, expected_terms):
        """Assert the claim's topic exists and its term set matches both the
        cleaned claim description and the explicitly expected terms."""
        topics = self._db.get_topics()
        terms = self._db.get_terms()
        topic_dict = defaultdict(set)
        term_dict = {term.term_id: term.description for term in terms}
        for topic_id, term_id, prob in topics:
            topic_dict[topic_id].add(term_dict[term_id])
        topic_id = self._preprocess_visualization.get_claim_id_topic_dictionary(
        )[claim_id]
        claim = self._claim_dictionary[claim_id]
        expected = set(clean_tweet(claim.description).split(' '))
        self.assertIn(topic_id, topic_dict)
        # Double check: terms derived from the description and the caller's
        # explicit expectation must both equal the stored term set.
        self.assertSetEqual(expected, topic_dict[topic_id])
        self.assertSetEqual(set(expected_terms), topic_dict[topic_id])

    def _add_author(self, author_guid, type=u"good_actor"):
        """Persist an author fixture; the guid doubles as its display names."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = author_guid
        author.author_screen_name = author_guid
        author.name = author_guid
        author.domain = u'Microblog'
        author.author_type = type
        self._db.add_author(author)
        self._authors.append(author)

    def _add_post(self, author_guid, title, content, domain=u'Microblog'):
        """Persist a post fixture; the title doubles as post_id and guid."""
        post = Post()
        post.author = author_guid
        post.author_guid = author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.is_detailed = True
        post.is_LB = False
        self._db.addPost(post)
        self._posts.append(post)

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a post via a Claim_Tweet_Connection row."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])
        pass

    def _add_claim(self,
                   claim_id,
                   content,
                   date_str=u"2017-06-14 05:00:00",
                   keywords=u"",
                   post_type=None):
        """Persist a Claim fixture and index it in self._claim_dictionary."""
        claim = Claim()
        claim.claim_id = claim_id
        claim.verdict = post_type
        claim.title = claim_id
        claim.description = content
        claim.verdict_date = convert_str_to_unicode_datetime(date_str)
        claim.keywords = keywords
        claim.url = u"claim url"
        # NOTE(review): addPost appears to accept arbitrary ORM entities,
        # not only Post rows — confirm against the DB wrapper.
        self._db.addPost(claim)
        self._claim_dictionary[claim.claim_id] = claim
class Word_Embeddings_Differential_Feature_Generator_Unittests(
        unittest.TestCase):
    """Tests for Word_Embedding_Differential_Feature_Generator.

    Each test seeds posts and target articles, trains a GloVe word-embedding
    model over them, runs the differential generator, and checks the stored
    per-dimension differences between the target-article title vectors and
    the post content vectors.
    """

    def setUp(self):
        self._config_parser = getConfig()
        self._db = DB()
        self._db.setUp()

        self._model = Word_Embedding_Differential_Feature_Generator(self._db)

        self._posts = []
        self._author = None
        self._set_author(u'test_user')

    def tearDown(self):
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()

    def test_simple_case(self):
        """One differing word on each side: 'is' in posts vs 'was' in the article title."""
        self._add_post(u'of to a for', u'of is')
        self._add_target_article(u'0', u'of was ', u'am that was')
        self._setup_test()

        # Dimension 0 of each side's only contributing word.
        is_vec1 = self._get_word_dimension(u'is', 0)
        was_vec_d1 = self._get_word_dimension(u'was', 0)
        expected_val = was_vec_d1 - is_vec1
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d0"
        ).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)
        # With a single word per side, sum and mean aggregations coincide.
        expected_val = was_vec_d1 - is_vec1
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_np.mean_target_articles_title_to_posts_content_d0"
        ).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)
        is_vec = self._words[u'is']
        was_vec = self._words['was']
        expected_val = commons.euclidean_distance(is_vec, was_vec)
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_distance_function_euclidean_distance_target_articles_title_np.mean_TO_posts_content_np.mean"
        ).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)

    def test_few_words(self):
        """Two differing words per side: the sum feature is the difference of sums."""
        self._add_post(u'of to a for', u'of is on')
        self._add_target_article(u'0', u'of was that', u'am that was')
        self._setup_test()

        dimension = 0
        is_vec1 = self._get_word_dimension(u'is', dimension)
        on_vec1 = self._get_word_dimension(u'on', dimension)
        tot1 = is_vec1 + on_vec1
        was_vec_d1 = self._get_word_dimension(u'was', dimension)
        that_vec_d1 = self._get_word_dimension(u'that', dimension)
        tot2 = was_vec_d1 + that_vec_d1
        expected_val = tot2 - tot1
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d"
            + str(dimension)).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)
        # Repeat the same check on a later dimension of the embedding.
        dimension = 140
        is_vec1 = self._get_word_dimension(u'is', dimension)
        on_vec1 = self._get_word_dimension(u'on', dimension)
        tot1 = is_vec1 + on_vec1
        was_vec_d1 = self._get_word_dimension(u'was', dimension)
        that_vec_d1 = self._get_word_dimension(u'that', dimension)
        tot2 = was_vec_d1 + that_vec_d1
        expected_val = tot2 - tot1
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d"
            + str(dimension)).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)

    def test_opposite(self):
        """Swapping posts and article content flips the sign of the difference."""
        self._add_post(u'am that was', u'of was that')
        self._add_target_article(u'0', u'of is on', u'of to a for')
        self._setup_test()

        dimension = 0
        is_vec1 = self._get_word_dimension(u'is', dimension)
        on_vec1 = self._get_word_dimension(u'on', dimension)
        tot1 = is_vec1 + on_vec1
        was_vec_d1 = self._get_word_dimension(u'was', dimension)
        that_vec_d1 = self._get_word_dimension(u'that', dimension)
        tot2 = was_vec_d1 + that_vec_d1
        expected_val = tot1 - tot2
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d"
            + str(dimension)).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)

    def test_empty_word(self):
        """Smoke test: an empty post content must not crash the pipeline."""
        self._add_post(u'of to a for', u'')
        self._add_target_article(u'0', u'of was that', u'am that was')
        self._setup_test()
        self.assertTrue(True)

    def _add_post(self, title, content):
        """Persist a post by the fixture author; post_id is the running index."""
        post = Post()
        post.author = self._author.author_full_name
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = u'test'
        post.post_id = len(self._posts)
        post.guid = post.post_id
        self._db.addPost(post)
        self._posts.append(post)

    def _set_author(self, author_guid):
        """Create and persist the author fixture used by all tests."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'name' + author_guid
        author.name = u'name' + author_guid
        author.domain = u'test'
        self._db.add_author(author)
        self._author = author

    def _setup_test(self):
        """Commit fixtures, train embeddings, run the generator, cache vectors."""
        self._db.session.commit()
        self._word_embedding_model_creator = GloveWordEmbeddingModelCreator(
            self._db)
        self._word_embedding_model_creator.execute(None)

        params = {'authors': [self._author], 'posts': self._posts}
        self._model = Word_Embedding_Differential_Feature_Generator(
            self._db, **params)
        self._model.execute()

        self._words = self._db.get_word_embedding_dictionary()

    def _get_word_dimension(self, word, dimension):
        """Return one component of a word's embedding vector."""
        word_vec = self._words[word]
        return word_vec[dimension]

    def _add_target_article(self, post_id, title, description):
        """Persist a Target_Article fixture attributed to the test user."""
        target_article = Target_Article()
        target_article.author_guid = u'test_user'
        target_article.post_id = post_id
        target_article.title = title
        target_article.description = description
        target_article.keywords = 'temp, lala, fafa'
        self._db.add_target_articles([target_article])

    def _add_target_article_item(self, post_id, type, content, author_guid):
        """Persist a Target_Article_Item fixture (stored via addPosts)."""
        article_item = Target_Article_Item()
        article_item.post_id = post_id
        article_item.type = type
        article_item.item_number = 3
        article_item.content = content
        article_item.author_guid = author_guid
        self._db.addPosts([article_item])
class TestGensimWordEmbeddingsModelTrainer(TestCase):
    def setUp(self):
        self._config_parser = getConfig()
        self._db = DB()
        self._db.setUp()

        # self._Word_Embedding_Model_Creator.execute(None)
        self._is_load_wikipedia_300d_glove_model = True
        self._wikipedia_model_file_path = "data/input/glove/test_glove.6B.300d_small.txt"
        self._table_name = "wikipedia_model_300d"
        self._word_vector_dict_full_path = "data/output/word_embedding/"
        self._word_vector_dict = {}

        self._author = None
        self._set_author(u'test_user')
        self._counter = 0
        self._posts = []

    def tearDown(self):
        self._db.session.close()

    def test_add_additional_fields_to_existing_table(self):
        self._add_post(u'was', u'is')
        self._add_post(u'is', u'was')
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)

        self._word_embedding_model_creator.execute(None)
        self._word_embedding_model_creator._aggregation_functions_names = [
            'sum'
        ]
        self._word_embedding_model_creator.execute(None)

        file_output_path = self._word_embedding_model_creator._saved_models_path + self._word_embedding_model_creator._table_name + ".csv"
        data = pd.DataFrame.from_csv(file_output_path)

        word_embedding_results = data.loc[(data['author_id'] == 'test_user')
                                          & (data['table_name'] == u'posts') &
                                          (data['targeted_field_name']
                                           == u'content')]
        sum_value_df = word_embedding_results.loc[
            word_embedding_results[u'word_embedding_type'] == u'sum']
        mean_value_df = word_embedding_results.loc[
            word_embedding_results[u'word_embedding_type'] == u'np.mean']

        try:
            if len(sum_value_df.values.tolist()) > 0 and len(
                    mean_value_df.values.tolist()) > 0:
                self.assertTrue(True)
            else:
                self.fail()
        except:
            self.fail()

    def test_case_post_represent_by_posts(self):
        self._add_post(u'post1', u'the claim', u'Claim')
        self._add_post(u'post2', u'dog cat pig man')  # 2
        self._add_post(u'post3', u'TV is the best guys')  # 1
        self._add_claim_tweet_connection(u'post1', u'post2')
        self._add_claim_tweet_connection(u'post1', u'post3')
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)
        self._word_embedding_model_creator._targeted_fields_for_embedding = [{
            'source': {
                'table_name': 'posts',
                'id': 'post_id'
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id'
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": []
            }
        }]

        self._word_embedding_model_creator.execute(None)
        model_name_path = self._word_embedding_model_creator._prepare_model_name_path(
        )
        model = Word2Vec.load(model_name_path)
        word_vector_dict = self._word_embedding_model_creator._get_word_embedding_dict(
            model)
        self._words = word_vector_dict
        self._words_vectors = self._get_posts_val()
        expected_val = self._calc_results()
        self._generic_test(expected_val, u'post1')

    def _setup_test(self):
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)
        self._word_embedding_model_creator.execute(None)

        self._words = self._db.get_word_embedding_dictionary()
        self._words_vectors = self._get_posts_val()

    def _generic_test(self, expected_value, source_id=u""):
        if source_id == u"":
            source_id = self._author.author_guid

        file_output_path = self._word_embedding_model_creator._saved_models_path + self._word_embedding_model_creator._table_name + ".csv"
        data = pd.DataFrame.from_csv(file_output_path)

        word_embedding_results = data.loc[(data['author_id'] == source_id)
                                          & (data['table_name'] == u'posts') &
                                          (data['targeted_field_name']
                                           == u'content')]

        self.assert_word_embedding(word_embedding_results, expected_value,
                                   u'min')
        self.assert_word_embedding(word_embedding_results, expected_value,
                                   u'max')
        self.assert_word_embedding(word_embedding_results, expected_value,
                                   u'np.mean')

    def assert_word_embedding(self, db_results, expected_value, type):
        result_value = db_results.loc[db_results[u'word_embedding_type'] ==
                                      type, '0':].values.tolist()[0]
        self.assertEquals(list(expected_value[type]), result_value)

    def _generic_non_equal_test(self, expected_value):
        db_results = self._db.get_author_word_embedding(
            self._author.author_guid, u'posts', u'content')
        self.assertNotEqual(expected_value[u'min'], db_results[u'min'])
        self.assertNotEqual(expected_value[u'max'], db_results[u'max'])
        self.assertNotEqual(expected_value[u'np.mean'], db_results[u'np.mean'])

    def _set_author(self, author_guid):
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'name' + author_guid
        author.name = u'name' + author_guid
        author.domain = u'test'
        self._db.add_author(author)
        self._author = author

    def _add_post(self, title, content, _domain=u'Microblog'):
        post = Post()
        post.author = self._author.author_full_name
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = _domain
        post.post_id = title
        post.guid = title
        self._db.addPost(post)
        self._posts.append(post)

    def _get_posts_val(
            self):  # return the vectors for all the words in the added posts
        vals = {}
        for post in self._posts:
            for word in post.content.split():
                if word in self._words.keys():
                    vals[word] = self._words[word]
        return vals.values()

    def _calc_mean(self, vectors):
        vectors = self._get_posts_val()
        if len(vectors) == 0:
            return (0, ) * 300
        ziped_vec = zip(*vectors)
        result = map(eval('np.mean'), ziped_vec)
        return tuple(result)

    def _calc_min(self, vectors):
        vectors = self._get_posts_val()
        if len(vectors) == 0:
            return (0, ) * 300
        ziped_vec = zip(*vectors)
        result = map(eval('min'), ziped_vec)
        return tuple(result)

    def _calc_max(self, vectors):
        vectors = self._get_posts_val()
        if len(vectors) == 0:
            return (0, ) * 300
        ziped_vec = zip(*vectors)
        result = map(eval('max'), ziped_vec)
        return tuple(result)

    def _calc_results(self):
        vectors = self._words_vectors
        results = {}
        results[u'min'] = self._calc_min(vectors)
        results[u'max'] = self._calc_max(vectors)
        results[u'np.mean'] = self._calc_mean(vectors)
        return results

    def _add_target_article(self, post_id, title, description, author_guid):
        target_article = Target_Article()
        target_article.author_guid = author_guid
        target_article.post_id = post_id
        target_article.title = title
        target_article.description = description
        target_article.keywords = 'temp, lala, fafa'
        self._db.add_target_articles([target_article])

    def _add_target_article_item(self, post_id, type, content, author_guid):
        article_item = Target_Article_Item()
        article_item.post_id = post_id
        article_item.type = type
        article_item.item_number = 3
        article_item.content = content
        article_item.author_guid = author_guid
        self._db.addPosts([article_item])

    def _add_claim_tweet_connection(self, claim_id, post_id):
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])
        pass
class Data_Handler_Unittests(TestCase):
    """Unit tests for Data_Handler: building a labeled author-feature
    dataframe (prefix-based select/remove, NaN fill strategies) and
    splitting it into k-fold train/test fragments."""

    def setUp(self):
        # Fresh DB per test; the handler reads labels from the 'author_type'
        # column of the authors table.
        self._db = DB()
        self._db.setUp()
        self._data_handler = Data_Handler(self._db, 'author_type')
        # Maps author_guid -> list of AuthorFeatures created for that author;
        # its length also serves as the auto-generated feature-name counter.
        self._authors_to_author_features_dict = {}

        self._fill_empty= True
        self._remove_features = []
        self._select_features = []
        self._label_text_to_value = {'good':0,'bad':1}

    def tearDown(self):
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()

    def test_basic_case(self):
        """Smoke test: dataframe extraction runs on three labeled authors."""
        self._create_author_with_features('1','good',(10,11,12,13,14,15,16))
        self._create_author_with_features('2','bad', (20,21,22,23,24,25,26))
        self._create_author_with_features('3','good', (30,31,32,33,34,35,36))
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(self._remove_features, self._select_features, self._label_text_to_value)

        # NOTE(review): trivially true -- this test only verifies that the
        # call above does not raise.
        self.assertEqual(1,1)

    def test_remove_by_prefix(self):
        """Removing prefixes 'feature_test' and 'bla' leaves only 'dada'."""
        self._create_author('123','bad')
        self._create_author_feature_with_name('123',3,'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'dada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._remove_features_by_prefix = ['feature_test','bla']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(self._remove_features, self._select_features, self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(feature_num, 1)


    def test_remove_by_prefix_2(self):
        """Removing only 'feature_test' leaves bla_bla, dada and bla_bli."""
        self._create_author('123','bad')
        self._create_author_feature_with_name('123',3,'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'dada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._remove_features_by_prefix = ['feature_test']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(3, feature_num)

    def test_select_by_prefix(self):
        """Selecting the 'feature_test' prefix keeps exactly two features."""
        self._create_author('123','bad')
        self._create_author_feature_with_name('123',3,'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'dada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._select_features_by_prefix = ['feature_test']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(2, feature_num)

    def test_select_by_prefix2(self):
        """Selecting prefix 'bla' keeps bla_bla, blada and bla_bli but not
        bloom_bla."""
        self._create_author('123','bad')
        self._create_author_feature_with_name('123',3,'feature_test')
        self._create_author_feature_with_name('123', 5, 'feature_test2')
        self._create_author_feature_with_name('123', 5, 'bla_bla')
        self._create_author_feature_with_name('123', 5, 'bloom_bla')
        self._create_author_feature_with_name('123', 5, 'blada')
        self._create_author_feature_with_name('123', 6, 'bla_bli')
        self._data_handler._select_features_by_prefix = ['bla']
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        feature_num = len(authors_features_dataframe.columns)
        self.assertEqual(3, feature_num)

    def test_fill_and_drop_nan(self):
        """'zero' fill replaces NaNs with 0 and all-NaN columns are dropped;
        'mean' fill replaces NaNs with the column mean."""
        self._create_author_with_features('1','good',(10,None,12,None))
        self._create_author_with_features('2', 'bad', (20, 24, 22,None))
        self._create_author_with_features('3', 'bad', (30, 34, 32,None))
        self._data_handler._fill_empty = 'zero'
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(self._remove_features, self._select_features, self._label_text_to_value)
        null_val = authors_features_dataframe.iloc[0][u'1']
        self.assertEqual(null_val,0)
        # Column u'3' was None for every author, so it must be removed.
        did_remove_empty_column = u'3' not in authors_features_dataframe.columns
        self.assertTrue(did_remove_empty_column)
        self._data_handler._fill_empty= 'mean'
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(self._remove_features, self._select_features, self._label_text_to_value)
        null_val = authors_features_dataframe.iloc[0][u'1']
        # (24+34)/2 relies on integer division semantics; 58/2 == 29 exactly.
        self.assertEqual(null_val,(24+34)/2)

    def test_get_split(self):
        """With 4 authors and k=2, fragment 0 holds authors 1-2 and
        fragment 1 holds authors 3-4 (feature u'0' is author*10+1)."""
        self._auto_create_authors(4,7)
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(authors_features_dataframe,
                                                                                                            authors_labels,
                                                                    0, 2)
        self.assertEqual(test_set.iloc[0][u'0'],11)
        self.assertEqual(test_set.iloc[1][u'0'],21)
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(
            authors_features_dataframe,
            authors_labels,
            1, 2)
        self.assertEqual(test_set.iloc[0][u'0'],31)
        self.assertEqual(test_set.iloc[1][u'0'],41)

    def test_train_and_test_differ(self):
        """For k=7 fragments, no author's feature value may appear in both
        the test and train sets of any fragment (checked for 0 and 6)."""
        author_number = 7
        self._auto_create_authors(author_number,9)
        authors_features_dataframe, authors_labels = self._data_handler.get_labeled_authors_feature_dataframe_for_classification(
            self._remove_features, self._select_features, self._label_text_to_value)
        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(authors_features_dataframe,
                                                                                                            authors_labels,
                                                                                                            0, 7)
        for num in range(author_number):
            # First feature value of author num+1 (see _auto_create_authors).
            author_guid =(num+1)*10+1
            is_in_both = self._is_val_in_datatframe(test_set,author_guid)==self._is_val_in_datatframe(train_set,author_guid)
            if is_in_both:
                logging.info("in both " + str(author_guid))
            self.assertFalse(is_in_both)

        test_set, train_set, test_labels, train_labels = self._data_handler.get_the_k_fragment_from_dataset(authors_features_dataframe,
                                                                                                            authors_labels,
                                                                                                            6, 7)
        for num in range(author_number):
            author_guid =(num+1)*10+1
            is_in_both = self._is_val_in_datatframe(test_set,author_guid)==self._is_val_in_datatframe(train_set,author_guid)
            if is_in_both:
                logging.info("in both "+str(author_guid))
            self.assertFalse(is_in_both)

    def _auto_create_authors(self, author_num, num_of_features):
        """Create authors '1'..str(author_num); author a gets feature values
        str(a*10+f+1) for f in range(num_of_features) and label a*1000+a."""
        for num in range(author_num):
            author_name = num+1
            feature = []
            for feature_name in range(num_of_features):
                feature.append(str(author_name*10+feature_name+1))
            author_type = str(author_name*1000+author_name)
            self._create_author_with_features(str(author_name),author_type, feature)

    def _compare_authors_features_to_author(self):
        # TODO(review): unimplemented placeholder.
        pass
    
    def _create_author_with_features(self, author_guid, author_type, feature_values):
        """Create an author and one auto-named feature per value, then commit."""
        self._create_author(author_guid,author_type)
        for feature_value in feature_values:
            self._create_author_feature(author_guid,feature_value)
        self._db.session.commit()

    def _create_author(self, guid, author_type):
        """Insert an author row and register it in the feature dict."""
        author = Author()
        author.name = unicode(guid)
        # NOTE(review): domain is assigned twice -- u'Restaurant' below
        # overwrites u'Microblog'; confirm which value is intended.
        author.domain = u'Microblog'
        author.author_guid = unicode(guid)
        author.author_screen_name = u'TestUser1'
        author.author_type = author_type
        author.domain = u'Restaurant'
        author.author_osn_id = 1

        self._authors_to_author_features_dict[author.author_guid]=[]
        self._db.add_author(author)

    def _create_author_feature(self, author_guid, value):
        # The feature name is the count of features created so far ('0','1',...).
        feature_name = str(len(self._authors_to_author_features_dict[author_guid]))
        self._create_author_feature_with_name(author_guid, value, feature_name)

    def _create_author_feature_with_name(self, author_guid, value, feature_name):
        """Insert an AuthorFeatures row (fixed 2010-2020 window) and commit."""
        author_feature = AuthorFeatures()
        author_feature.author_guid = author_guid
        author_feature.window_start = date('2010-01-01 00:00:00')
        author_feature.window_end = date('2020-01-01 23:59:59')
        author_feature.attribute_name = feature_name
        author_feature.attribute_value=value
        self._authors_to_author_features_dict[author_guid].append(author_feature)
        # NOTE(review): (author_feature) is just the object, not a tuple --
        # confirm update_author_features accepts a single feature.
        self._db.update_author_features((author_feature))
        self._db.session.commit()

    def _is_val_in_datatframe(self, df, value):
        """Return True if any cell of the dataframe equals *value*."""
        for row in range(df.shape[0]):  # df is the DataFrame
            for col in range(df.shape[1]):
                if df.iloc[row][col] == value:
                    return True
        return False
    def _get_random_guid(self):
        """Return a fresh random UUID4 as a unicode string."""
        return unicode(uuid.uuid4())
# Esempio n. 8
# 0
class TestTimelineOverlapVisualizationGenerator(TestCase):
    """Tests for TimelineOverlapVisualizationGenerator.

    setUp seeds the DB with one 'acquired' bad actor and one regular user,
    each owning ten posts with identical content patterns.
    """

    def setUp(self):
        self.config = getConfig()
        self._db = DB()
        self._db.setUp()
        self.timeline_overlap = TimelineOverlapVisualizationGenerator()

        # Bad actor of sub-type 'acquired' with ten posts.
        self._add_author('acquired_user', 1, author_type='bad_actor',
                         author_sub_type='acquired')
        self._add_posts('bad_post', 'acquired_user')

        # Regular user (no author_type set) with ten posts.
        self._add_author('TestUser1', 2)
        self._add_posts('TestPost', 'TestUser1')

        self._db.commit()

    def _add_author(self, guid, osn_id, author_type=None, author_sub_type=None):
        """Insert an author whose name, guid, screen name and full name all
        equal *guid*; type fields are only assigned when provided."""
        author = Author()
        author.name = guid
        author.domain = 'Microblog'
        author.author_guid = guid
        author.author_screen_name = guid
        author.author_full_name = guid
        author.author_osn_id = osn_id
        author.created_at = datetime.datetime.now()
        author.missing_data_complementor_insertion_date = datetime.datetime.now()
        author.xml_importer_insertion_date = datetime.datetime.now()
        if author_type is not None:
            author.author_type = author_type
        if author_sub_type is not None:
            author.author_sub_type = author_sub_type
        self._db.add_author(author)

    def _add_posts(self, id_prefix, author_guid):
        """Insert posts '<prefix>1'..'<prefix>10' owned by *author_guid*."""
        for i in range(1, 11):
            post = Post()
            post.post_id = id_prefix + str(i)
            post.author = author_guid
            post.guid = id_prefix + str(i)
            post.date = datetime.datetime.now()
            post.domain = 'Microblog'
            post.author_guid = author_guid
            post.content = 'InternetTV love it' + str(i)
            post.xml_importer_insertion_date = datetime.datetime.now()
            self._db.addPost(post)

    def test_generate_timeline_overlap_csv(self):
        """Generating the CSV must leave the bad actor's labels untouched."""
        self.timeline_overlap.setUp()
        self.timeline_overlap.generate_timeline_overlap_csv()
        author = self._db.get_author_by_author_guid('acquired_user')
        self.assertEqual(author.author_type, 'bad_actor')
        self.assertEqual(author.author_sub_type, 'acquired')

    def tearDown(self):
        self._db.session.close_all()
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()
class TF_IDF_Feature_Generator_Unittests(unittest.TestCase):
    """Tests for TF_IDF_Feature_Generator: tf and tf-idf scores computed
    over small hand-built corpora persisted in the test DB."""

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._domain = u'test'
        self._posts = []
        self._authors = []
        self._texts = []

    def tearDown(self):
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()

    def execute_module(self):
        """Run the generator (stemming disabled) over all 'test'-domain posts."""
        domain_posts = self._db.get_posts_by_domain('test')
        arguments = {"authors": self._authors, "posts": domain_posts, "graphs": []}
        self._module = TF_IDF_Feature_Generator(self._db, **arguments)
        self._module._stemming = False
        self._module.execute(window_start=None)

    def test_tf_idf(self):
        """tf-idf of 'example' is positive only in the document containing
        it; tf of 'this' is 1/5 and 1/7 in the two texts."""
        self._add_author(u'1')
        text1 = 'this is a a sample'
        text2 = 'this is another another example example example'
        self._add_post('ta da', text1, '1')
        self._add_author(u'2')
        self._add_post('ta dddda', text2, '2')
        self.execute_module()

        corpus = [text1, text2]
        self._module.clear_memo_dicts()
        score = self._module.tfidf('example', text2, corpus, {})
        self.assertAlmostEqual(score, 0.129, places=3)
        self._module.clear_memo_dicts()
        score = self._module.tfidf('example', text1, corpus, {})
        self.assertAlmostEqual(score, 0.0, places=2)
        self._module.clear_memo_dicts()
        self.assertAlmostEqual(self._module.tf('this', text1), 0.2)
        self._module.clear_memo_dicts()
        self.assertAlmostEqual(self._module.tf('this', text2), 0.1428,
                               places=3)

    def test_tf_idf_complicated(self):
        """Six-document corpus: tf-idf of 'example' in text2 equals
        (3/7) * |log10(1/6)| and is zero in text1."""
        self._add_author(u'1')
        text1 = 'this is a a sample'
        text2 = 'this is another another example example example'
        text3 = 'hello world'
        text4 = 'hello big world'
        text5 = 'hello Israel'
        text6 = 'i live in israel'
        # Author 1 owns texts 1, 3 and 4; author 2 owns texts 2, 5 and 6.
        for text in (text1, text3, text4):
            self._add_post(text, text, u'1')
        self._add_author(u'2')
        for text in (text2, text5, text6):
            self._add_post(text, text, u'2')
        self.execute_module()

        self._module.clear_memo_dicts()
        expected = (3.0 / 7) * abs(math.log((1.0 / 6), 10))
        score = self._module.tfidf('example', text2, self._texts, {})
        self.assertAlmostEqual(score, expected, places=4)
        self._module.clear_memo_dicts()
        score = self._module.tfidf('example', text1, self._texts, {})
        self.assertAlmostEqual(score, 0.0, places=2)
        self._module.clear_memo_dicts()

    def _add_author(self, author_guid):
        """Persist a minimal Author in the 'test' domain and remember it."""
        new_author = Author()
        new_author.author_guid = author_guid
        new_author.author_full_name = u'test author'
        new_author.name = u'test'
        new_author.domain = u'test'
        self._db.add_author(new_author)
        self._db.session.commit()
        self._authors.append(new_author)

    def _add_post(self, title, content, author_guid):
        """Persist a post whose id/guid is the running post count; the
        content is also collected into self._texts as the corpus."""
        new_post = Post()
        new_post.author = author_guid
        new_post.author_guid = author_guid
        new_post.content = content
        new_post.title = title
        new_post.domain = u'test'
        new_post.post_id = len(self._posts)
        new_post.guid = new_post.post_id
        new_post.date = date('2020-01-01 23:59:59')
        self._db.addPost(new_post)
        self._db.session.commit()
        self._posts.append(new_post)
        self._texts.append(content)
# Esempio n. 10
# 0
class TestOldTweetsCrawler(TestCase):
    """Integration tests for OldTweetsCrawler.

    NOTE(review): these tests exercise the live old-tweets retrieval path,
    so returned tweet counts depend on an external service and may drift.
    """
    # I checked the test at 21/08/2018 there is a chance that the return tweet count will change (I hope not)
    def setUp(self):
        # Fresh DB, a crawler scoped to the u'Claim' domain, one author to
        # own inserted posts, and a registry of claims keyed by claim_id.
        self._db = DB()
        self._db.setUp()
        self.tweets_crawler = OldTweetsCrawler(self._db)
        self.tweets_crawler._domain = u'Claim'
        self._add_author(u"author_guid")
        self._claims = {}

    def tearDown(self):
        # NOTE(review): only closes the session -- unlike the other suites it
        # does not call deleteDB(); confirm whether DB cleanup is intended.
        self._db.session.close()

    def test_retrieve_tweets_by_content_between_dates_after(self):
        """Tweets retrieved from claim date to +6 months must all fall in
        [claim_date, until_date); at most 100 are returned."""
        self._add_claim(u"post0", u"The Rock Running for President",
                        u"2017-02-03 00:00:00")
        self._db.commit()
        date_interval_dict = defaultdict(set)  # NOTE(review): unused here
        claim_date = self._claims[u"post0"].verdict_date
        until_date = str_to_date(u"2017-08-03 00:00:00")
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = True
        tweets = self.tweets_crawler._retrieve_tweets_between_dates(
            self._claims[u"post0"], u"The Rock Running for President",
            date_to_str(claim_date, "%Y-%m-%d"),
            date_to_str(until_date, "%Y-%m-%d"))
        tweets_date = map(lambda tweet: tweet.date, tweets)
        self.assertTrue(
            all([claim_date <= date < until_date for date in tweets_date]))
        self.assertGreaterEqual(100, len(tweets))

    def test_retrieve_tweets_by_content_between_dates_before(self):
        """Tweets retrieved from -6 months to the claim date must all fall in
        [since_date, claim_date); at most 100 are returned."""
        self._add_claim(u"post0", u"The Rock Running for President",
                        u"2017-02-03 00:00:00")
        self._db.commit()
        date_interval_dict = defaultdict(set)  # NOTE(review): unused here
        claim_date = self._claims[u"post0"].verdict_date
        since_date = str_to_date(u"2016-08-03 00:00:00")
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = True
        tweets = self.tweets_crawler._retrieve_tweets_between_dates(
            self._claims[u"post0"], u"The Rock Running for President",
            date_to_str(since_date, "%Y-%m-%d"),
            date_to_str(claim_date, "%Y-%m-%d"))
        tweets_date = map(lambda tweet: tweet.date, tweets)
        self.assertTrue(
            all([since_date <= date < claim_date for date in tweets_date]))
        self.assertGreaterEqual(100, len(tweets))

    def test_retrieve_tweets_by_content_between_dates_1_month_interval(self):
        """With a 1-month interval and both limits on, all tweets stay inside
        one month around the claim date; at most 133 are returned."""
        self._add_claim(u"post0", u"The Rock Running for President",
                        u"2017-02-03 00:00:00")
        self._db.commit()
        # Expected bounds: one month either side of the 2017-02-03 claim.
        since_date = str_to_date(u"2017-01-03 00:00:00")
        until_date = str_to_date(u"2017-03-03 00:00:00")
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = True
        self.tweets_crawler._max_num_tweets = 133
        self.tweets_crawler._month_interval = 1
        tweets = self.tweets_crawler._retrieve_old_tweets(
            self._claims[u"post0"], u"The Rock Running for President")

        tweets_date = map(lambda tweet: tweet.date, tweets)
        self.assertTrue(
            all([since_date <= date < until_date for date in tweets_date]))
        self.assertGreaterEqual(133, len(tweets))

    def test_retrieve_tweets_by_content_between_dates_no_limit_after(self):
        """With the end-date limit off, only the lower bound is enforced."""
        self._add_claim(u"post0", u"The Rock Running for President",
                        u"2017-02-03 00:00:00")
        self._db.commit()
        since_date = str_to_date(u"2017-01-03 00:00:00")
        until_date = str_to_date(u"2017-03-03 00:00:00")  # NOTE(review): unused
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = False
        self.tweets_crawler._max_num_tweets = 250
        self.tweets_crawler._month_interval = 1
        tweets = self.tweets_crawler._retrieve_old_tweets(
            self._claims[u"post0"], u"The Rock Running for President")

        tweets_date = map(lambda tweet: tweet.date, tweets)
        self.assertTrue(all([since_date <= date for date in tweets_date]))
        self.assertGreaterEqual(250, len(tweets))

    def test_retrieve_tweets_by_content_between_dates_no_limit_before(self):
        """With the start-date limit off, only the upper bound is enforced."""
        self._add_claim(u"post0", u"The Rock Running for President",
                        u"2017-02-03 00:00:00")
        self._db.commit()
        since_date = str_to_date(u"2017-01-03 00:00:00")  # NOTE(review): unused
        until_date = str_to_date(u"2017-03-03 00:00:00")
        self.tweets_crawler._limit_start_date = False
        self.tweets_crawler._limit_end_date = True
        self.tweets_crawler._max_num_tweets = 250
        self.tweets_crawler._month_interval = 1
        tweets = self.tweets_crawler._retrieve_old_tweets(
            self._claims[u"post0"], u"The Rock Running for President")

        tweets_date = map(lambda tweet: tweet.date, tweets)
        self.assertTrue(all([date < until_date for date in tweets_date]))
        self.assertGreaterEqual(250, len(tweets))

    def test_execute_retrieve_tweets_by_full_content_1_month_interval(self):
        """Full execute() over claim content: before/after tweet-id sets are
        disjoint, every tweet gets a post row and a claim connection."""
        self._add_claim(
            u"post0", u"The Rock Running for President",
            u"2017-02-03 00:00:00",
            u"The Rock Running for President, Dwayne Running for President")
        self._db.commit()
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = True
        self.tweets_crawler._max_num_tweets = 133
        self.tweets_crawler._month_interval = 1
        self.tweets_crawler._actions = ['get_old_tweets_by_claims_content']
        self.tweets_crawler.execute()

        tweets_before = self.tweets_crawler._claim_id_tweets_id_before_dict[
            u"post0"]
        tweets_after = self.tweets_crawler._claim_id_tweets_id_after_dict[
            u"post0"]
        # No tweet may be counted both before and after the claim date.
        self.assertEqual(0, len(tweets_before & tweets_after))
        tweets_retrieved = len(tweets_before) + len(tweets_after)
        self.assertGreaterEqual(133, tweets_retrieved)
        self.assertEqual(tweets_retrieved, len(self._db.get_posts()))
        self.assertEqual(tweets_retrieved,
                         len(self._db.get_claim_tweet_connections()))
        self.assertLess(0, tweets_retrieved)

    def test_execute_retrieve_tweets_by_key_words_1_month_interval(self):
        """Full execute() over claim keywords: same invariants as the
        content-based test, with a per-keyword cap of 141 tweets."""
        self._add_claim(
            u"post0", u"The Rock Running for President",
            u"2017-02-03 00:00:00",
            u"The Rock Running for President,Dwayne Running for President")
        self._db.commit()
        self.tweets_crawler._limit_start_date = True
        self.tweets_crawler._limit_end_date = True
        self.tweets_crawler._max_num_tweets = 141
        self.tweets_crawler._month_interval = 1
        self.tweets_crawler._actions = ['get_old_tweets_by_claims_keywords']
        self.tweets_crawler.execute()

        tweets_before = self.tweets_crawler._claim_id_tweets_id_before_dict[
            u"post0"]
        tweets_after = self.tweets_crawler._claim_id_tweets_id_after_dict[
            u"post0"]
        self.assertEqual(0, len(tweets_before & tweets_after))
        tweets_retrieved = len(tweets_before) + len(tweets_after)
        # NOTE(review): the * 3 bound presumably allows up to the cap for
        # each keyword phrase -- confirm against the crawler implementation.
        self.assertGreaterEqual(141 * 3, tweets_retrieved)
        self.assertEqual(tweets_retrieved, len(self._db.get_posts()))
        self.assertEqual(tweets_retrieved,
                         len(self._db.get_claim_tweet_connections()))
        self.assertLess(0, tweets_retrieved)

    def _add_author(self, author_guid):
        """Insert a minimal author and keep it as self._author."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'test author'
        author.author_screen_name = author_guid
        author.name = u'test'
        author.domain = u'tests'
        author.statuses_count = 0
        author.created_at = u"2017-06-14 05:00:00"
        self._db.add_author(author)
        self._author = author

    def _add_post(self, post_id, content, tags, date_str, domain=u'Microblog'):
        """Insert a post owned by self._author and bump its status count."""
        post = Post()
        post.author = self._author.author_guid
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = post_id
        post.domain = domain
        post.post_id = post_id
        post.guid = post.post_id
        post.date = convert_str_to_unicode_datetime(date_str)
        post.created_at = post.date
        post.tags = tags
        self._db.addPost(post)
        self._author.statuses_count += 1

    def _add_claim(self,
                   claim_id,
                   content,
                   date_str,
                   keywords=u"",
                   post_type=None):
        """Insert a Claim row and register it in self._claims.

        NOTE(review): the claim is persisted via addPost -- presumably the
        generic insert also accepts Claim objects; confirm against the DB API.
        """
        claim = Claim()
        claim.claim_id = claim_id
        claim.verdict = post_type
        claim.title = claim_id
        claim.description = content
        claim.verdict_date = convert_str_to_unicode_datetime(date_str)
        claim.keywords = keywords
        claim.url = u"claim url"
        self._db.addPost(claim)
        self._claims[claim.claim_id] = claim
# Esempio n. 11
# 0
class TestFakeNewsFeatureGenerator(TestCase):
    """Tests for FakeNewsFeatureGenerator.

    Covers per-claim counts and fractions of fake-news-dictionary words found
    in the tweets connected to a claim, and the mapping of claim verdicts to
    'True'/'False' feature values.

    Fix applied: `_get_params` previously read ``params = params = {...}``
    (duplicated assignment target); the redundant target was removed.
    """

    def setUp(self):
        self._db = DB()

        self._db.setUp()
        self._posts = []
        self._author = None

    def tearDown(self):
        # NOTE(review): sibling suites also call deleteDB() here; this suite
        # only closes the session -- confirm DB file cleanup is intentional.
        self._db.session.close()

    def test_get_word_count_1_claim_1_comments_no_words(self):
        """A comment with no dictionary words yields zero count/fraction sums."""
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._db.session.commit()

        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()

        self.assert_word_dictionary_count('post0', {})

        self.assert_word_dictionary_fraction('post0', {})

        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_count_sum')
        self.assertEqual('0', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_fraction_sum')
        self.assertEqual('0.0', author_feature.attribute_value)

    def test_get_word_count_1_claim_1_comments_1_words(self):
        """One dictionary word in one comment gives count 1 and fraction 1.0."""
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 bad word liar", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._db.session.commit()

        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()

        self.assert_word_dictionary_count('post0', {'liar': '1'})

        self.assert_word_dictionary_fraction('post0', {'liar': '1.0'})

        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_count_sum')
        self.assertEqual('1', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_fraction_sum')
        self.assertEqual('1.0', author_feature.attribute_value)

    def test_get_word_count_1_claim_4_comments_1_words(self):
        """One dictionary word across four comments gives fraction 1/4."""
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 bad word liar", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post3", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post4", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._db.session.commit()

        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()

        self.assert_word_dictionary_count('post0', {'liar': '1'})

        self.assert_word_dictionary_fraction('post0', {'liar': '0.25'})

        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_count_sum')
        self.assertEqual('1', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_fraction_sum')
        self.assertEqual('0.25', author_feature.attribute_value)

    def test_get_word_count_1_claim_4_comments_8_words(self):
        """Eight occurrences of one word over four comments: fraction 8/4."""
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 liar bad word liar", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words liar at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no liar bad words at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no liar bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._db.session.commit()

        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()

        self.assert_word_dictionary_count('post0', {'liar': '8'})

        self.assert_word_dictionary_fraction('post0', {'liar': '2.0'})

        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_count_sum')
        self.assertEqual('8', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_fraction_sum')
        self.assertEqual('2.0', author_feature.attribute_value)

    def test_get_word_count_1_claim_4_comments_8_different_words(self):
        """Several distinct dictionary words are counted independently."""
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")
        self._db.session.commit()

        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()

        self.assert_word_dictionary_count(
            'post0', {
                'liar': '3',
                'joke': '2',
                'didnt actually': '1',
                'untrue': '1',
                'laugh': '1'
            })

        self.assert_word_dictionary_fraction(
            'post0', {
                'liar': '0.75',
                'joke': '0.5',
                'didnt actually': '0.25',
                'untrue': '0.25',
                'laugh': '0.25'
            })

        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_count_sum')
        self.assertEqual('3', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_words_fraction_sum')
        self.assertEqual('0.75', author_feature.attribute_value)

    def test_get_claim_type_4_claim(self):
        """Verdict strings map to 'True'/'False'; unknown verdicts yield no feature."""
        self._add_author('author_guid')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00")
        self._add_claim('post1', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_claim('post2', 'the claim', "2017-06-10 05:00:00",
                        'pants-fire')
        self._add_claim('post3', 'the claim', "2017-06-10 05:00:00",
                        'mostly-false')
        self._add_claim('post4', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00",
                        'mostly-true')
        self._add_claim('post6', 'the claim', "2017-06-10 05:00:00",
                        'half_true')
        self._add_claim('post7', 'the claim', "2017-06-10 05:00:00",
                        'unproven')
        self._db.session.commit()

        self.fake_news_feature_generator = FakeNewsFeatureGenerator(self._db)
        self.fake_news_feature_generator._domain = 'Claim'
        self.fake_news_feature_generator.setUp()
        self.fake_news_feature_generator.execute()

        author_feature = self._db.get_author_feature(
            'post0', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual(None, author_feature)
        author_feature = self._db.get_author_feature(
            'post1', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual('False', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post2', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual('False', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post3', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual('False', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post4', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual('True', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post5', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual('True', author_feature.attribute_value)
        author_feature = self._db.get_author_feature(
            'post6', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual(None, author_feature)
        author_feature = self._db.get_author_feature(
            'post7', 'FakeNewsFeatureGenerator_claim_verdict')
        self.assertEqual(None, author_feature)

    def assert_word_dictionary_count(self, author_guid, values):
        """Assert per-word *_count features equal `values` (default '0')."""
        self.assert_dictionary_words(author_guid,
                                     'FakeNewsFeatureGenerator_{0}_count', '0',
                                     values)

    def assert_word_dictionary_fraction(self, author_guid, values):
        """Assert per-word *_fraction features equal `values` (default '0.0')."""
        self.assert_dictionary_words(author_guid,
                                     'FakeNewsFeatureGenerator_{0}_fraction',
                                     '0.0', values)

    def assert_dictionary_words(self, author_guid, count_template,
                                default_value, values):
        """Check every dictionary word's feature against expected `values`.

        Multi-word dictionary entries are stored with spaces replaced by
        hyphens in the feature name, mirroring the generator's naming.
        """
        fake_news_dictionary_words = self.fake_news_feature_generator._fake_news_dictionary
        for word in fake_news_dictionary_words:
            word = word.strip().replace(' ', '-')

            author_feature = self._db.get_author_feature(
                author_guid, count_template.format(word))
            if word in values:
                self.assertEqual(values[word], author_feature.attribute_value)
            else:
                self.assertEqual(default_value, author_feature.attribute_value)

    def _add_author(self, author_guid):
        """Persist a minimal Author row and remember it as the current author."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = 'test author'
        author.author_screen_name = author_guid
        author.name = 'test'
        author.domain = 'tests'
        author.statuses_count = 0
        author.created_at = "2017-06-14 05:00:00"
        self._db.add_author(author)
        self._author = author

    def _add_post(self,
                  title,
                  content,
                  date_str,
                  domain='Microblog',
                  post_type=None):
        """Persist a Post by the current author; `title` doubles as post_id/guid."""
        post = Post()
        post.author = self._author.author_guid
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.date = convert_str_to_unicode_datetime(date_str)
        post.created_at = post.date
        post.post_type = post_type
        self._db.addPost(post)
        self._posts.append(post)

    def _get_params(self):
        """Return the authors/posts parameter dict used by feature generators."""
        posts = {self._author.author_guid: self._posts}
        # Fixed: was ``params = params = {...}`` (redundant duplicate target).
        params = {'authors': [self._author], 'posts': posts}
        return params

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a tweet via a Claim_Tweet_Connection row."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])

    def _add_claim(self, claim_id, content, date_str, post_type=None):
        """Persist a Claim row; `post_type` becomes the claim verdict."""
        claim = Claim()
        claim.claim_id = claim_id
        claim.verdict = post_type
        claim.title = claim_id
        claim.description = content
        claim.verdict_date = convert_str_to_unicode_datetime(date_str)
        claim.url = "claim url"
        self._db.addPost(claim)
class TestSyntaxFeatureGenerator(TestCase):
    """Tests for SyntaxFeatureGenerator's per-author average syntax features."""

    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._author = None
        self.syntax_feature_generator = SyntaxFeatureGenerator(self._db, **{})

    def tearDown(self):
        self._db.session.close()

    def test_average_hashtags(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"#content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post3", u"content 1 #tag #yes", "2017-06-12 05:00:00")
        self._add_post(u"post4", u"content #1 #test #dont #fail #please",
                       "2017-06-12 05:00:00")

        self.syntax_feature_generator.execute()

        # 8 hashtags over 4 posts.
        self._assert_feature_average(
            u"SyntaxFeatureGenerator_average_hashtags", 8.0 / 4)

    def test_average_links(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"#content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"https://www.google.co.il 1",
                       "2017-06-12 05:00:00")
        self._add_post(u"post3", u"content 1 #tag http://www.google.co.il",
                       "2017-06-12 05:00:00")
        self._add_post(
            u"post4",
            u"http://www.bank.co.il #1 #test #dont http://www.ynet.co.il https://www.msn.co.il",
            "2017-06-12 05:00:00")

        self.syntax_feature_generator.execute()

        # 5 URLs over 4 posts.
        self._assert_feature_average(
            u"SyntaxFeatureGenerator_average_links", 5.0 / 4)

    def test_average_user_mentions(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"@content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post3", u"content 1 @tag #@es", "2017-06-12 05:00:00")
        self._add_post(u"post4", u"content #1 @test @dont @fail #please",
                       "2017-06-12 05:00:00")

        self.syntax_feature_generator.execute()

        # 6 mentions over 4 posts.
        self._assert_feature_average(
            u"SyntaxFeatureGenerator_average_user_mentions", 6.0 / 4)

    def test_average_post_lenth(self):
        self._add_author(u"author_guid")
        self._add_post(u"post1", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post2", u"content 1", "2017-06-12 05:00:00")
        self._add_post(u"post3", u"content 1 @tag #@es", "2017-06-12 05:00:00")
        self._add_post(u"post4", u"content #1 @test @dont @fail #please",
                       "2017-06-12 05:00:00")

        self.syntax_feature_generator.execute()

        # 14 tokens over 4 posts.
        self._assert_feature_average(
            u"SyntaxFeatureGenerator_average_post_lenth", 14.0 / 4)

    def _assert_feature_average(self, feature_name, expected):
        """Fetch `feature_name` for the test author and compare to `expected`."""
        feature = self._db.get_author_feature(u"author_guid", feature_name)
        self.assertAlmostEqual(float(feature.attribute_value),
                               expected,
                               places=4)

    def _add_author(self, author_guid):
        """Persist a minimal Author row and remember it as the current author."""
        author = Author()
        for field, value in [('author_guid', author_guid),
                             ('author_full_name', u'test author'),
                             ('name', u'test'),
                             ('domain', u'tests'),
                             ('statuses_count', 0)]:
            setattr(author, field, value)
        self._db.add_author(author)
        self._author = author

    def _add_post(self, title, content, date_str, domain=u'Microblog'):
        """Persist a Post by the current author and bump their status count."""
        guid = self._author.author_guid
        timestamp = convert_str_to_unicode_datetime(date_str)
        post = Post()
        post.author = guid
        post.author_guid = guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = title
        post.date = timestamp
        post.created_at = timestamp
        self._db.addPost(post)
        self._posts.append(post)

        self._author.statuses_count += 1
# Esempio n. 13
# 0
class TestFakeNewsClassifier(TestCase):
    """Tests for FakeNewsClassifier's dictionary-based claim classification.

    Fix applied: each test previously opened the classifier's results CSV and
    never closed it (file-handle leak). Reading is now centralized in
    `_read_classifier_results`, which closes the file via a `with` block.
    """

    def setUp(self):
        self._db = DB()

        self._db.setUp()
        self._posts = []
        self._author = None

    def tearDown(self):
        # NOTE(review): sibling suites also call deleteDB() here; this suite
        # only closes the session -- confirm DB file cleanup is intentional.
        self._db.session.close()

    def test_classify_by_dictionary_1_FN_1_FP(self):
        """Both claims misclassified: accuracy and AUC are 0."""
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")

        self._add_author('author_guid')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post5", "post6")
        self._add_claim_tweet_connection("post5", "post7")
        self._add_claim_tweet_connection("post5", "post8")
        self._add_claim_tweet_connection("post5", "post9")
        self._db.session.commit()

        self.fake_news_feature_classifier = FakeNewsClassifier(self._db)
        self.fake_news_feature_classifier.setUp()
        self.fake_news_feature_classifier.execute()
        output_data = self._read_classifier_results()
        self.assertAlmostEqual(float(output_data['FN (think good but bad)']),
                               1)
        self.assertAlmostEqual(float(output_data['FP (think bad but good)']),
                               1)
        self.assertAlmostEqual(float(output_data['accuracy']), 0.0)
        self.assertAlmostEqual(float(output_data['AUC']), 0.0)

    def test_classify_by_dictionary_1_FN_1_FP_and_ignore_1(self):
        """A claim with an 'unknown' verdict is excluded from the metrics."""
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")

        self._add_author('author_guid')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post5", "post6")
        self._add_claim_tweet_connection("post5", "post7")
        self._add_claim_tweet_connection("post5", "post8")
        self._add_claim_tweet_connection("post5", "post9")

        self._add_claim('post10', 'the claim', "2017-06-10 05:00:00",
                        'unknown')
        self._add_post("post11", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post12", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post13", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post14", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post10", "post11")
        self._add_claim_tweet_connection("post10", "post12")
        self._add_claim_tweet_connection("post10", "post13")
        self._add_claim_tweet_connection("post10", "post14")

        self._db.session.commit()

        self.fake_news_feature_classifier = FakeNewsClassifier(self._db)
        self.fake_news_feature_classifier.setUp()
        self.fake_news_feature_classifier.execute()
        output_data = self._read_classifier_results()
        self.assertAlmostEqual(float(output_data['FN (think good but bad)']),
                               1)
        self.assertAlmostEqual(float(output_data['FP (think bad but good)']),
                               1)
        self.assertAlmostEqual(float(output_data['accuracy']), 0.0)
        self.assertAlmostEqual(float(output_data['AUC']), 0.0)

    def test_classify_by_dictionary_0_FN_0_FP(self):
        """Both claims classified correctly: accuracy and AUC are 1."""
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")

        self._add_author('author_guid')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post5", "post6")
        self._add_claim_tweet_connection("post5", "post7")
        self._add_claim_tweet_connection("post5", "post8")
        self._add_claim_tweet_connection("post5", "post9")
        self._db.session.commit()

        self.fake_news_feature_classifier = FakeNewsClassifier(self._db)
        self.fake_news_feature_classifier.setUp()
        self.fake_news_feature_classifier.execute()
        output_data = self._read_classifier_results()
        self.assertAlmostEqual(float(output_data['FN (think good but bad)']),
                               0)
        self.assertAlmostEqual(float(output_data['FP (think bad but good)']),
                               0)
        self.assertAlmostEqual(float(output_data['accuracy']), 1.0)
        self.assertAlmostEqual(float(output_data['AUC']), 1.0)

    def test_classify_by_dictionary_1_FN_0_FP_3_claims(self):
        """One of three claims misclassified as good: accuracy 2/3, AUC 0.75."""
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")

        self._add_author('author_guid')
        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post5", "post6")
        self._add_claim_tweet_connection("post5", "post7")
        self._add_claim_tweet_connection("post5", "post8")
        self._add_claim_tweet_connection("post5", "post9")

        self._add_author('author_guid')
        self._add_claim('post10', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post11", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post12", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post13", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post14", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post10", "post11")
        self._add_claim_tweet_connection("post10", "post12")
        self._add_claim_tweet_connection("post10", "post13")
        self._add_claim_tweet_connection("post10", "post14")
        self._db.session.commit()

        self.fake_news_feature_classifier = FakeNewsClassifier(self._db)
        self.fake_news_feature_classifier.setUp()
        self.fake_news_feature_classifier.execute()
        output_data = self._read_classifier_results()
        self.assertAlmostEqual(float(output_data['FN (think good but bad)']),
                               1)
        self.assertAlmostEqual(float(output_data['FP (think bad but good)']),
                               0)
        self.assertAlmostEqual(float(output_data['accuracy']),
                               0.666666,
                               places=4)
        self.assertAlmostEqual(float(output_data['AUC']), 0.75)

    def test_classify_by_dictionary_0_FN_1_FP_3_claims(self):
        """One of three claims misclassified as bad: accuracy 2/3, AUC 0.75."""
        self._add_author('author')
        self._add_claim('post0', 'the claim', "2017-06-10 05:00:00", 'FALSE')
        self._add_post("post1", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post2", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post3", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post4", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post0", "post1")
        self._add_claim_tweet_connection("post0", "post2")
        self._add_claim_tweet_connection("post0", "post3")
        self._add_claim_tweet_connection("post0", "post4")

        self._add_claim('post5', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post6", "1 bad word at all", "2017-06-12 05:00:00")
        self._add_post("post7", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post8", "no bad words at all", "2017-06-12 05:00:00")
        self._add_post("post9", "no bad words at all", "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post5", "post6")
        self._add_claim_tweet_connection("post5", "post7")
        self._add_claim_tweet_connection("post5", "post8")
        self._add_claim_tweet_connection("post5", "post9")

        self._add_claim('post10', 'the claim', "2017-06-10 05:00:00", 'TRUE')
        self._add_post("post11", "1 liar bad word joke", "2017-06-12 05:00:00")
        self._add_post("post12", "no bad words untrue at all liar",
                       "2017-06-12 05:00:00")
        self._add_post("post13", "no joke bad words at all laugh",
                       "2017-06-12 05:00:00")
        self._add_post("post14", " liar no didnt actually bad words at all",
                       "2017-06-12 05:00:00")
        self._add_claim_tweet_connection("post10", "post11")
        self._add_claim_tweet_connection("post10", "post12")
        self._add_claim_tweet_connection("post10", "post13")
        self._add_claim_tweet_connection("post10", "post14")
        self._db.session.commit()

        self.fake_news_feature_classifier = FakeNewsClassifier(self._db)
        self.fake_news_feature_classifier.setUp()
        self.fake_news_feature_classifier.execute()
        output_data = self._read_classifier_results()
        self.assertAlmostEqual(float(output_data['FN (think good but bad)']),
                               0)
        self.assertAlmostEqual(float(output_data['FP (think bad but good)']),
                               1)
        self.assertAlmostEqual(float(output_data['accuracy']),
                               0.666666,
                               places=4)
        self.assertAlmostEqual(float(output_data['AUC']), 0.75)

    def _read_classifier_results(self):
        """Return the first row of the classifier's results CSV.

        Fixed: the file is now closed via a `with` block instead of leaking.
        """
        output_file_path = self.fake_news_feature_classifier._output_path + '/fake_news_classifier_results.csv'
        with open(output_file_path, 'r') as output_file:
            return next(csv.DictReader(output_file))

    def _add_author(self, author_guid):
        """Persist a minimal Author row and remember it as the current author."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = 'test author'
        author.author_screen_name = author_guid
        author.name = 'test'
        author.domain = 'tests'
        author.statuses_count = 0
        author.created_at = "2017-06-14 05:00:00"
        self._db.add_author(author)
        self._author = author

    def _add_post(self,
                  title,
                  content,
                  date_str,
                  domain='Microblog',
                  post_type=None):
        """Persist a Post by the current author; `title` doubles as post_id/guid."""
        post = Post()
        post.author = self._author.author_guid
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = domain
        post.post_id = title
        post.guid = post.post_id
        post.date = convert_str_to_unicode_datetime(date_str)
        post.created_at = post.date
        post.post_type = post_type
        self._db.addPost(post)
        self._posts.append(post)

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a tweet via a Claim_Tweet_Connection row."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])

    def _add_claim(self, claim_id, content, date_str, post_type=None):
        """Persist a Claim row; `post_type` becomes the claim verdict."""
        claim = Claim()
        claim.claim_id = claim_id
        claim.verdict = post_type
        claim.title = claim_id
        claim.description = content
        claim.verdict_date = convert_str_to_unicode_datetime(date_str)
        claim.url = "claim url"
        self._db.addPost(claim)
# Esempio n. 14
# 0
class TestEntityToTopicConverter(TestCase):
    def setUp(self):
        """Create a fresh DB, seed the default test author, and build the converter under test."""
        self._db = DB()
        self._db.setUp()
        # Bookkeeping containers used by the _add_* helpers below.
        self._authors = []
        self._posts = []
        self._post_dictionary = {}
        self._add_author('test author')
        self._preprocess_visualization = EntityToTopicConverter(self._db)

    def tearDown(self):
        """Close open sessions and drop the test database."""
        self._db.session.close_all()
        self._db.deleteDB()
        # NOTE(review): a second close() after close_all()/deleteDB() looks
        # redundant — presumably defensive; confirm against DB's session
        # lifecycle before removing.
        self._db.session.close()

    def test_generate_topics_no_topics(self):
        """With no claim posts in the DB, topic generation leaves the topics table empty."""
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}],
            },
            'connection': {},
            'destination': {},
        }
        converter = self._preprocess_visualization
        id_to_elements = converter._get_source_id_target_elements(args)
        converter.generate_topics_tables(id_to_elements, args)
        topics = self._db.get_topics()
        self.assertEqual(topics, [])

    def test_generate_topics_from_1_claim(self):
        """A single claim post yields a topic whose terms match the claim content."""
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._db.session.commit()
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}],
            },
            'connection': {},
            'destination': {},
        }
        converter = self._preprocess_visualization
        id_to_elements = converter._get_source_id_target_elements(args)
        converter.generate_topics_tables(id_to_elements, args)
        converter.save_topic_entities()

        self.assertTopicInserted('claim1')

    def test_generate_topics_from_1_claim_and_remove_stop_words(self):
        """With stop-word removal enabled, the topic terms exclude NLTK stop words."""
        self._add_post("test author", 'claim1', 'claim1 go to the house', 'Claim')
        self._db.session.commit()
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}],
            },
            'connection': {},
            'destination': {},
        }
        converter = self._preprocess_visualization
        converter._remove_stop_words = True
        id_to_elements = converter._get_source_id_target_elements(args)
        converter.generate_topics_tables(id_to_elements, args)
        converter.save_topic_entities()

        self.assertTopicInserted('claim1')

    def test_generate_topics_from_5_claims(self):
        """Five claim posts each produce their own topic."""
        claims = [
            ('claim1', 'claim1 content'),
            ('claim2', 'claim2 content'),
            ('claim3', 'claim3 content move'),
            ('claim4', 'claim4 dif data'),
            ('claim5', 'claim5 some boring text'),
        ]
        for claim_id, text in claims:
            self._add_post("test author", claim_id, text, 'Claim')
        self._db.session.commit()
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}],
            },
            'connection': {},
            'destination': {},
        }
        converter = self._preprocess_visualization
        id_to_elements = converter._get_source_id_target_elements(args)
        converter.generate_topics_tables(id_to_elements, args)
        converter.save_topic_entities()

        for claim_id, _ in claims:
            self.assertTopicInserted(claim_id)

    def test_generate_post_topic_mapping_no_claim(self):
        """With no claims, no post-topic mapping rows are produced."""
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}],
            },
            'connection': {},
            'destination': {},
        }
        converter = self._preprocess_visualization
        id_to_elements = converter._get_source_id_target_elements(args)
        converter.generate_post_topic_mapping(id_to_elements, args)
        mappings = self._db.get_post_topic_mapping()
        self.assertEqual(0, len(mappings))

    def test_generate_post_topic_mapping_1_claim(self):
        """Three posts connected to one claim all map to that claim's topic with dist 1.0."""
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2  bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting  new', 'Microblog')
        for post_id in ('post1', 'post2', 'post3'):
            self._add_claim_tweet_connection('claim1', post_id)
        self._db.session.commit()
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}],
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Microblog"}],
            },
        }
        converter = self._preprocess_visualization
        id_to_elements = converter._get_source_id_target_elements(args)
        converter.generate_topics_tables(id_to_elements, args)
        converter.generate_post_topic_mapping(id_to_elements, args)
        converter.save_topic_entities()

        mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                    for tm in self._db.get_post_topic_mapping()]
        topic_id = converter.get_source_id_topic_dictionary()['claim1']
        self.assertEqual(3, len(mappings))
        self.assertSetEqual(
            {('post1', topic_id, 1.0), ('post2', topic_id, 1.0),
             ('post3', topic_id, 1.0)}, set(mappings))

    def test_generate_post_topic_mapping_2_claim(self):
        """Posts connected to two different claims map to their respective topics."""
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author", 'claim2', 'claim1 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2  bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting  new', 'Microblog')
        self._add_post("test author", 'post4', 'post4  bla bla', 'Microblog')
        self._add_post("test author", 'post5', 'post5 noting  new', 'Microblog')
        for claim_id, post_id in [('claim1', 'post1'), ('claim1', 'post2'),
                                  ('claim1', 'post3'), ('claim2', 'post4'),
                                  ('claim2', 'post5')]:
            self._add_claim_tweet_connection(claim_id, post_id)
        self._db.session.commit()
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}],
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Microblog"}],
            },
        }
        converter = self._preprocess_visualization
        id_to_elements = converter._get_source_id_target_elements(args)
        converter.generate_topics_tables(id_to_elements, args)
        converter.generate_post_topic_mapping(id_to_elements, args)
        converter.save_topic_entities()

        mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                    for tm in self._db.get_post_topic_mapping()]
        topic_id1 = converter.get_source_id_topic_dictionary()['claim1']
        topic_id2 = converter.get_source_id_topic_dictionary()['claim2']
        self.assertEqual(5, len(mappings))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id1, 1.0), ('post4', topic_id2, 1.0),
             ('post5', topic_id2, 1.0)}, set(mappings))

    def test__generate_author_topic_mapping_2_claim(self):
        """Author-topic mapping reflects each author's share of posts per topic."""
        self._add_author('test author2')
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author2", 'claim2', 'claim1 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2  bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting  new', 'Microblog')
        self._add_post("test author", 'post4', 'post4  bla bla', 'Microblog')
        self._add_post("test author", 'post5', 'post5 noting  new', 'Microblog')
        for claim_id, post_id in [('claim1', 'post1'), ('claim1', 'post2'),
                                  ('claim1', 'post3'), ('claim2', 'post4'),
                                  ('claim2', 'post5')]:
            self._add_claim_tweet_connection(claim_id, post_id)
        self._db.session.commit()
        args = {
            'source': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Claim"}],
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id',
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": [{"field_name": "domain", "value": "Microblog"}],
            },
        }
        converter = self._preprocess_visualization
        converter._domain = "Microblog"
        id_to_elements = converter._get_source_id_target_elements(args)
        converter.generate_topics_tables(id_to_elements, args)
        converter.generate_post_topic_mapping(id_to_elements, args)
        converter.generate_author_topic_mapping()
        converter.save_topic_entities()
        mapping = self._db.get_author_topic_mapping()
        self.assertEqual(2, len(mapping))
        self.assertSetEqual({('test author', 0.6, 0.4),
                             ('test author2', 0, 0)}, set(mapping))

    def test_visualization(self):
        """End-to-end execute(): both author-topic and post-topic mappings are produced."""
        self._add_author('test author2', "bad_actor")
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author2", 'claim2', 'claim2 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2  bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting  new', 'Microblog')
        self._add_post("test author2", 'post4', 'post4  bla bla', 'Microblog')
        self._add_post("test author2", 'post5', 'post5 noting  new', 'Microblog')
        for claim_id, post_id in [('claim1', 'post1'), ('claim1', 'post2'),
                                  ('claim1', 'post4'), ('claim2', 'post3'),
                                  ('claim2', 'post5')]:
            self._add_claim_tweet_connection(claim_id, post_id)
        self._db.session.commit()
        converter = self._preprocess_visualization
        converter._domain = "Microblog"
        converter.execute()

        author_topic_mapping = self._db.get_author_topic_mapping()
        post_topic_mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                               for tm in self._db.get_post_topic_mapping()]
        topic_id1 = converter.get_source_id_topic_dictionary()['claim1']
        topic_id2 = converter.get_source_id_topic_dictionary()['claim2']
        self.assertEqual(2, len(author_topic_mapping))
        self.assertSetEqual(
            {('test author', 0.666666666667, 0.333333333333),
             ('test author2', 0.5, 0.5)}, set(author_topic_mapping))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id2, 1.0), ('post4', topic_id1, 1.0),
             ('post5', topic_id2, 1.0)}, set(post_topic_mappings))

    def test_double_execution_visualization(self):
        """Running execute() twice must be idempotent — same mappings as a single run."""
        self._add_author('test author2', "bad_actor")
        self._add_post("test author", 'claim1', 'claim1 content', 'Claim')
        self._add_post("test author2", 'claim2', 'claim2 content', 'Claim')
        self._add_post("test author", 'post1', 'post1 content of data', 'Microblog')
        self._add_post("test author", 'post2', 'post2  bla bla', 'Microblog')
        self._add_post("test author", 'post3', 'post3 noting  new', 'Microblog')
        self._add_post("test author2", 'post4', 'post4  bla bla', 'Microblog')
        self._add_post("test author2", 'post5', 'post5 noting  new', 'Microblog')
        for claim_id, post_id in [('claim1', 'post1'), ('claim1', 'post2'),
                                  ('claim1', 'post4'), ('claim2', 'post3'),
                                  ('claim2', 'post5')]:
            self._add_claim_tweet_connection(claim_id, post_id)
        self._db.session.commit()
        converter = self._preprocess_visualization
        converter._domain = "Microblog"
        converter.execute()
        converter.execute()

        author_topic_mapping = self._db.get_author_topic_mapping()
        post_topic_mappings = [(tm.post_id, tm.max_topic_id, tm.max_topic_dist)
                               for tm in self._db.get_post_topic_mapping()]
        topic_id1 = converter.get_source_id_topic_dictionary()['claim1']
        topic_id2 = converter.get_source_id_topic_dictionary()['claim2']
        self.assertEqual(2, len(author_topic_mapping))
        self.assertSetEqual(
            {('test author', 0.666666666667, 0.333333333333),
             ('test author2', 0.5, 0.5)}, set(author_topic_mapping))
        self.assertSetEqual(
            {('post1', topic_id1, 1.0), ('post2', topic_id1, 1.0),
             ('post3', topic_id2, 1.0), ('post4', topic_id1, 1.0),
             ('post5', topic_id2, 1.0)}, set(post_topic_mappings))

    def assertTopicInserted(self, claim_id):
        """Assert a topic exists for claim_id and its term set matches the cleaned claim content."""
        term_descriptions = {term.term_id: term.description
                             for term in self._db.get_terms()}
        # Group the term descriptions belonging to each topic.
        topic_terms = defaultdict(set)
        for topic_id, term_id, prob in self._db.get_topics():
            topic_terms[topic_id].add(term_descriptions[term_id])
        expected_topic_id = self._preprocess_visualization.get_source_id_topic_dictionary()[claim_id]
        self.assertIn(expected_topic_id, topic_terms)
        content = self._post_dictionary[claim_id].content
        # The converter cleans content differently when stop-word removal is on.
        if self._preprocess_visualization._remove_stop_words:
            expected = set(clean_content_by_nltk_stopwords(content).split(' '))
        else:
            expected = set(clean_tweet(content).split(' '))
        self.assertSetEqual(expected, topic_terms[expected_topic_id])

    def _add_author(self, author_guid, type="good_actor"):
        """Insert an Author whose guid, full name, screen name and name all equal author_guid."""
        author = Author()
        for attr in ('author_guid', 'author_full_name',
                     'author_screen_name', 'name'):
            setattr(author, attr, author_guid)
        author.domain = 'Microblog'
        author.author_type = type
        self._db.add_author(author)
        self._authors.append(author)

    def _add_post(self, author_guid, title, content, domain='Microblog'):
        """Persist a Post whose title serves as both post_id and guid, and index it by id."""
        post = Post()
        post.author_guid = author_guid
        post.author = author_guid
        post.title = title
        post.post_id = title
        post.guid = title
        post.content = content
        post.domain = domain
        post.is_detailed = True
        post.is_LB = False
        self._db.addPost(post)
        self._posts.append(post)
        self._post_dictionary[title] = post

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Link a claim to a post via a Claim_Tweet_Connection row.

        :param claim_id: id of the claim side of the connection
        :param post_id: id of the post side of the connection
        """
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])