class Word_Embeddings_Differential_Feature_Generator_Unittests(
        unittest.TestCase):
    def setUp(self):
        self._config_parser = getConfig()
        self._db = DB()
        self._db.setUp()

        self._model = Word_Embedding_Differential_Feature_Generator(self._db)

        self._posts = []
        self._author = None
        self._set_author(u'test_user')

    def tearDown(self):
        self._db.session.close()
        self._db.deleteDB()

    def test_simple_case(self):
        self._add_post(u'of to a for', u'of is')
        self._add_target_article(u'0', u'of was ', u'am that was')
        self._setup_test()

        is_vec1 = self._get_word_dimension(u'is', 0)
        was_vec_d1 = self._get_word_dimension(u'was', 0)
        expected_val = was_vec_d1 - is_vec1
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d0"
        ).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)
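        # With only one word contributing on each side, the sum and np.mean
        # aggregations coincide, so the expected value below is unchanged.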
        expected_val = was_vec_d1 - is_vec1
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_np.mean_target_articles_title_to_posts_content_d0"
        ).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)
        is_vec = self._words[u'is']
        was_vec = self._words[u'was']
        expected_val = commons.euclidean_distance(is_vec, was_vec)
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_distance_function_euclidean_distance_target_articles_title_np.mean_TO_posts_content_np.mean"
        ).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)
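
    # Feature names queried in the test above follow the generator's naming
    # convention (as read off the strings themselves):
    #     word_embeddings_differential_<aggregation>_<source>_to_<target>_d<dim>
    # so "..._sum_target_articles_title_to_posts_content_d0" is the sum
    # aggregation over dimension 0 of the title-to-content differential.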

    def test_few_words(self):
        self._add_post(u'of to a for', u'of is on')
        self._add_target_article(u'0', u'of was that', u'am that was')
        self._setup_test()

        for dimension in [0, 140]:
            is_vec1 = self._get_word_dimension(u'is', dimension)
            on_vec1 = self._get_word_dimension(u'on', dimension)
            tot1 = is_vec1 + on_vec1
            was_vec_d1 = self._get_word_dimension(u'was', dimension)
            that_vec_d1 = self._get_word_dimension(u'that', dimension)
            tot2 = was_vec_d1 + that_vec_d1
            expected_val = tot2 - tot1
            db_val = self._db.get_author_feature(
                self._author.author_guid,
                u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d"
                + str(dimension)).attribute_value
            self.assertAlmostEqual(float(db_val), expected_val, places=4)

    def test_opposite(self):
        self._add_post(u'am that was', u'of was that')
        self._add_target_article(u'0', u'of is on', u'of to a for')
        self._setup_test()

        dimension = 0
        is_vec1 = self._get_word_dimension(u'is', dimension)
        on_vec1 = self._get_word_dimension(u'on', dimension)
        tot1 = is_vec1 + on_vec1
        was_vec_d1 = self._get_word_dimension(u'was', dimension)
        that_vec_d1 = self._get_word_dimension(u'that', dimension)
        tot2 = was_vec_d1 + that_vec_d1
        expected_val = tot1 - tot2
        db_val = self._db.get_author_feature(
            self._author.author_guid,
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d"
            + str(dimension)).attribute_value
        self.assertAlmostEqual(float(db_val), expected_val, places=4)

    def test_empty_word(self):
        self._add_post(u'of to a for', u'')
        self._add_target_article(u'0', u'of was that', u'am that was')
        self._setup_test()
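        # No explicit expectation here: the test passes as long as the
        # generator handles an empty post content without raising.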
        self.assertTrue(True)

    def _add_post(self, title, content):
        post = Post()
        post.author = self._author.author_full_name
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = u'test'
        post.post_id = len(self._posts)
        post.guid = post.post_id
        self._db.addPost(post)
        self._posts.append(post)

    def _set_author(self, author_guid):
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'name' + author_guid
        author.name = u'name' + author_guid
        author.domain = u'test'
        self._db.add_author(author)
        self._author = author

    def _setup_test(self):
        self._db.session.commit()
        self._word_embedding_model_creator = GloveWordEmbeddingModelCreator(
            self._db)
        self._word_embedding_model_creator.execute(None)

        params = {'authors': [self._author], 'posts': self._posts}
        self._model = Word_Embedding_Differential_Feature_Generator(
            self._db, **params)
        self._model.execute()

        self._words = self._db.get_word_embedding_dictionary()

    def _get_word_dimension(self, word, dimension):
        word_vec = self._words[word]
        return word_vec[dimension]

    def _add_target_article(self, post_id, title, description):
        target_article = Target_Article()
        target_article.author_guid = u'test_user'
        target_article.post_id = post_id
        target_article.title = title
        target_article.description = description
        target_article.keywords = 'temp, lala, fafa'
        self._db.add_target_articles([target_article])

    def _add_target_article_item(self, post_id, item_type, content,
                                 author_guid):
        article_item = Target_Article_Item()
        article_item.post_id = post_id
        article_item.type = item_type
        article_item.item_number = 3
        article_item.content = content
        article_item.author_guid = author_guid
        self._db.addPosts([article_item])
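
# The tests above rely on commons.euclidean_distance, which is imported from
# the project's commons module and is not shown in this excerpt. A minimal
# sketch of such a helper (an assumption for illustration, not the project's
# actual implementation), for two equal-length numeric vectors:
def _sketch_euclidean_distance(vec_a, vec_b):
    import math
    # Sum of element-wise squared differences, square-rooted (L2 distance).
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(vec_a, vec_b)))
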
class TestGensimWordEmbeddingsModelTrainer(TestCase):
    def setUp(self):
        self._config_parser = getConfig()
        self._db = DB()
        self._db.setUp()

        # self._Word_Embedding_Model_Creator.execute(None)
        self._is_load_wikipedia_300d_glove_model = True
        self._wikipedia_model_file_path = "data/input/glove/test_glove.6B.300d_small.txt"
        self._table_name = "wikipedia_model_300d"
        self._word_vector_dict_full_path = "data/output/word_embedding/"
        self._word_vector_dict = {}

        self._author = None
        self._set_author(u'test_user')
        self._counter = 0
        self._posts = []

    def tearDown(self):
        self._db.session.close()

    def test_add_additional_fields_to_existing_table(self):
        self._add_post(u'was', u'is')
        self._add_post(u'is', u'was')
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)

        self._word_embedding_model_creator.execute(None)
        self._word_embedding_model_creator._aggregation_functions_names = [
            'sum'
        ]
        self._word_embedding_model_creator.execute(None)

        file_output_path = self._word_embedding_model_creator._saved_models_path + self._word_embedding_model_creator._table_name + ".csv"
        # DataFrame.from_csv was deprecated and later removed; read_csv with
        # index_col=0 is the equivalent call.
        data = pd.read_csv(file_output_path, index_col=0)

        word_embedding_results = data.loc[(data['author_id'] == 'test_user')
                                          & (data['table_name'] == u'posts') &
                                          (data['targeted_field_name']
                                           == u'content')]
        sum_value_df = word_embedding_results.loc[
            word_embedding_results[u'word_embedding_type'] == u'sum']
        mean_value_df = word_embedding_results.loc[
            word_embedding_results[u'word_embedding_type'] == u'np.mean']

        self.assertGreater(len(sum_value_df), 0)
        self.assertGreater(len(mean_value_df), 0)

    def test_case_post_represent_by_posts(self):
        self._add_post(u'post1', u'the claim', u'Claim')
        self._add_post(u'post2', u'dog cat pig man')  # 2
        self._add_post(u'post3', u'TV is the best guys')  # 1
        self._add_claim_tweet_connection(u'post1', u'post2')
        self._add_claim_tweet_connection(u'post1', u'post3')
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)
        self._word_embedding_model_creator._targeted_fields_for_embedding = [{
            'source': {
                'table_name': 'posts',
                'id': 'post_id'
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id'
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": []
            }
        }]

        self._word_embedding_model_creator.execute(None)
        model_name_path = self._word_embedding_model_creator._prepare_model_name_path(
        )
        model = Word2Vec.load(model_name_path)
        word_vector_dict = self._word_embedding_model_creator._get_word_embedding_dict(
            model)
        self._words = word_vector_dict
        self._words_vectors = self._get_posts_val()
        expected_val = self._calc_results()
        self._generic_test(expected_val, u'post1')
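
    # _get_word_embedding_dict is provided by the trainer under test. With
    # gensim's pre-4.0 API, an equivalent word -> vector mapping could be
    # built roughly as follows (a sketch, not the trainer's actual code):
    #     model = Word2Vec.load(model_name_path)
    #     word_vector_dict = {word: model.wv[word] for word in model.wv.vocab}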

    def _setup_test(self):
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)
        self._word_embedding_model_creator.execute(None)

        self._words = self._db.get_word_embedding_dictionary()
        self._words_vectors = self._get_posts_val()

    def _generic_test(self, expected_value, source_id=u""):
        if source_id == u"":
            source_id = self._author.author_guid

        file_output_path = self._word_embedding_model_creator._saved_models_path + self._word_embedding_model_creator._table_name + ".csv"
        data = pd.read_csv(file_output_path, index_col=0)

        word_embedding_results = data.loc[(data['author_id'] == source_id)
                                          & (data['table_name'] == u'posts') &
                                          (data['targeted_field_name']
                                           == u'content')]

        self.assert_word_embedding(word_embedding_results, expected_value,
                                   u'min')
        self.assert_word_embedding(word_embedding_results, expected_value,
                                   u'max')
        self.assert_word_embedding(word_embedding_results, expected_value,
                                   u'np.mean')

    def assert_word_embedding(self, db_results, expected_value,
                              embedding_type):
        result_value = db_results.loc[db_results[u'word_embedding_type'] ==
                                      embedding_type, '0':].values.tolist()[0]
        self.assertEqual(list(expected_value[embedding_type]), result_value)

    def _generic_non_equal_test(self, expected_value):
        db_results = self._db.get_author_word_embedding(
            self._author.author_guid, u'posts', u'content')
        self.assertNotEqual(expected_value[u'min'], db_results[u'min'])
        self.assertNotEqual(expected_value[u'max'], db_results[u'max'])
        self.assertNotEqual(expected_value[u'np.mean'], db_results[u'np.mean'])

    def _set_author(self, author_guid):
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'name' + author_guid
        author.name = u'name' + author_guid
        author.domain = u'test'
        self._db.add_author(author)
        self._author = author

    def _add_post(self, title, content, _domain=u'Microblog'):
        post = Post()
        post.author = self._author.author_full_name
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = _domain
        post.post_id = title
        post.guid = title
        self._db.addPost(post)
        self._posts.append(post)

    def _get_posts_val(self):
        # Return the vectors for all the words in the added posts.
        vals = {}
        for post in self._posts:
            for word in post.content.split():
                if word in self._words:
                    vals[word] = self._words[word]
        return vals.values()

    def _calc_mean(self, vectors):
        # Use the vectors passed in by the caller (the original re-fetched
        # self._get_posts_val(), leaving the parameter dead) and aggregate
        # directly instead of going through eval().
        if len(vectors) == 0:
            return (0, ) * 300
        zipped_vec = zip(*vectors)  # one tuple per embedding dimension
        return tuple(map(np.mean, zipped_vec))

    def _calc_min(self, vectors):
        if len(vectors) == 0:
            return (0, ) * 300
        zipped_vec = zip(*vectors)
        return tuple(map(min, zipped_vec))

    def _calc_max(self, vectors):
        if len(vectors) == 0:
            return (0, ) * 300
        zipped_vec = zip(*vectors)
        return tuple(map(max, zipped_vec))

    def _calc_results(self):
        vectors = self._words_vectors
        results = {}
        results[u'min'] = self._calc_min(vectors)
        results[u'max'] = self._calc_max(vectors)
        results[u'np.mean'] = self._calc_mean(vectors)
        return results
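
    # The _calc_* helpers transpose the word vectors with zip(*vectors) so
    # that each tuple holds one dimension across all words, then aggregate
    # per dimension. Illustrative values only:
    #     vectors = [(1, 4), (3, 2)]
    #     zip(*vectors) -> [(1, 3), (4, 2)]
    #     per-dimension min -> (1, 2), max -> (3, 4), np.mean -> (2.0, 3.0)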

    def _add_target_article(self, post_id, title, description, author_guid):
        target_article = Target_Article()
        target_article.author_guid = author_guid
        target_article.post_id = post_id
        target_article.title = title
        target_article.description = description
        target_article.keywords = 'temp, lala, fafa'
        self._db.add_target_articles([target_article])

    def _add_target_article_item(self, post_id, item_type, content,
                                 author_guid):
        article_item = Target_Article_Item()
        article_item.post_id = post_id
        article_item.type = item_type
        article_item.item_number = 3
        article_item.content = content
        article_item.author_guid = author_guid
        self._db.addPosts([article_item])

    def _add_claim_tweet_connection(self, claim_id, post_id):
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])

# ---- Code example #3 ----

class Twitter_Rest_Api(AbstractController):
    def __init__(self, db):
        AbstractController.__init__(self, db)

        self._working_app_number = self._config_parser.eval(
            self.__class__.__name__, "working_app_number")

        self._maximal_get_friend_ids_requests_in_window = self._config_parser.eval(
            self.__class__.__name__,
            "maximal_get_friend_ids_requests_in_window")

        self._maximal_get_follower_ids_requests_in_window = self._config_parser.eval(
            self.__class__.__name__,
            "maximal_get_follower_ids_requests_in_window")

        self._maximal_get_user_requests_in_window = self._config_parser.eval(
            self.__class__.__name__, "maximal_get_user_requests_in_window")

        self._maximal_user_ids_allowed_in_single_get_user_request = self._config_parser.eval(
            self.__class__.__name__,
            "maximal_user_ids_allowed_in_single_get_user_request")

        self._num_of_twitter_status_id_requests_without_checking = self._config_parser.eval(
            self.__class__.__name__,
            "num_of_twitter_status_id_requests_without_checking")
        self._num_of_twitter_timeline_requests_without_checking = self._config_parser.eval(
            self.__class__.__name__,
            "num_of_twitter_timeline_requests_without_checking")

        self._max_tweet_ids_allowed_in_single_get_tweets_by_tweet_ids_request = self._config_parser.eval(
            self.__class__.__name__,
            "max_tweet_ids_allowed_in_single_get_tweets_by_tweet_ids_request")

        self._max_num_of_tweet_ids_requests_without_checking = self._config_parser.eval(
            self.__class__.__name__,
            "max_num_of_tweet_ids_requests_without_checking")

        self._num_of_get_friend_ids_requests = 0
        self._num_of_get_follower_ids_requests = 0
        self._num_of_get_timeline_statuses = 0
        self._num_of_twitter_status_id_requests = 0
        self._num_of_twitter_timeline_requests = 0
        self._num_of_get_tweet_ids_requests = 0
        self._total_author_connections = []

        print("Creating TwitterApiRequester")
        self._twitter_api_requester = TwitterApiRequester()

        # self._find_source_twitter_id()

        logging.info("Setup DB...")
        print("Setup DB...")
        self._db = DB()
        self._db.setUp()

    def get_timeline_by_user_id(self, user_id):
        try:
            if self._num_of_get_timeline_statuses > self._num_of_twitter_timeline_requests_without_checking:
                seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_timeline(
                )
                if seconds_to_wait != 0:
                    self.count_down_time(seconds_to_wait)
                    self._num_of_get_timeline_statuses = 0
            # Fetch the timeline unconditionally; in the original code this
            # sat inside the if-block above, leaving timeline unbound
            # whenever the rate-limit check was skipped.
            timeline = self._twitter_api_requester.get_timeline_by_user_id(
                user_id)
            self._num_of_get_timeline_statuses += 1
            print("Number of get timeline requests is: " +
                  str(self._num_of_get_timeline_statuses))

            return timeline

        except TwitterError as e:
            logging.info(e.message)
            if e.message == "Not authorized.":
                logging.info("Not authorized for user id: " + str(user_id))
                return None
            sec = self._twitter_api_requester.get_sleep_time_for_timeline()
            logging.info("Seconds to wait after caught exception is: " +
                         str(sec))
            self.count_down_time(sec)
            self._num_of_get_timeline_statuses = 0
            timeline = self._twitter_api_requester.get_timeline_by_user_id(
                user_id)
            return timeline

    def handle_get_follower_ids_request(self, source_id):
        print("--- handle_get_follower_ids_request ---")
        logging.info("--- handle_get_follower_ids_request ---")
        follower_ids = self._twitter_api_requester.get_follower_ids_by_user_id(
            source_id)
        follower_connection_type = unicode(Author_Connection_Type.FOLLOWER)
        temp_author_connections = self._db.create_temp_author_connections(
            source_id, follower_ids, follower_connection_type)
        self._total_author_connections = self._total_author_connections + temp_author_connections
        return follower_ids

    def handle_get_user_ids_request(self, source_id, author_type):
        print("--- handle_get_user_ids_request ---")
        if author_type == Author_Connection_Type.FOLLOWER:
            user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(
                source_id)
        elif author_type == Author_Connection_Type.FRIEND:
            user_ids = self._twitter_api_requester.get_friend_ids_by_user_id(
                source_id)
        else:
            # Fail fast instead of leaving user_ids unbound below.
            raise ValueError("Unsupported author type: " + str(author_type))

        author_connections = self.create_author_connections(
            source_id, user_ids, author_type)
        self._total_author_connections = self._total_author_connections + author_connections
        return user_ids

    def handle_get_friend_ids_request(self, source_id):
        friend_ids = self._twitter_api_requester.get_friend_ids_by_user_id(
            source_id)
        friend_connection_type = unicode(Author_Connection_Type.FRIEND)
        author_connections = self.create_author_connections(
            source_id, friend_ids, friend_connection_type)
        self._total_author_connections = self._total_author_connections + author_connections
        return friend_ids

    def crawl_users_by_author_ids(self, author_ids, connection_type,
                                  author_type, are_user_ids, insertion_type):
        self._total_author_connections = []

        total_user_ids = self.crawl_users(author_ids, connection_type)

        self._db.save_author_connections(self._total_author_connections)

        total_user_ids_to_crawl = self.remove_already_crawled_authors(
            total_user_ids)

        users = self.handle_get_users_request(total_user_ids_to_crawl,
                                              are_user_ids, author_type,
                                              insertion_type)
        self.convert_twitter_users_to_authors_and_save(users, author_type,
                                                       insertion_type)

    def crawl_author_connections_by_author_ids(self, author_ids,
                                               connection_type, author_type,
                                               are_user_ids, insertion_type):
        self._total_author_connections = []

        total_user_ids = self.crawl_users_restricted(author_ids,
                                                     connection_type,
                                                     restriction=0)

        # TODO: self.remove_already_crawled_authors(total_user_ids)
        self._db.save_author_connections(self._total_author_connections)

    def crawl_users(self, author_ids, author_type):
        print("--- crawl_users ---")
        total_user_ids = []
        for author_id in author_ids:
            try:
                print("--- crawl_user_ids for author id : " + str(author_id))

                get_sleep_function_name = "get_sleep_time_for_get_" + author_type + "_ids_request"
                seconds_to_wait = getattr(self._twitter_api_requester,
                                          get_sleep_function_name)()
                if seconds_to_wait != 0:
                    self.save_connections_and_wait(seconds_to_wait)
                    init_num_of_get_user_ids_requests_func_name = "init_num_of_get_" + author_type + "_ids_requests"
                    getattr(self._twitter_api_requester,
                            init_num_of_get_user_ids_requests_func_name)()

                get_user_ids_by_given_user_id_function_name = "get_" + author_type + "_ids_by_user_id"
                user_ids = getattr(
                    self._twitter_api_requester,
                    get_user_ids_by_given_user_id_function_name)(author_id)

                temp_author_connections = self._db.create_temp_author_connections(
                    author_id, user_ids, author_type, self._window_start)
                self._total_author_connections = self._total_author_connections + temp_author_connections

                total_user_ids = list(set(total_user_ids + user_ids))
            except Exception as e:
                logging.exception(
                    "Failed getting followers or friends for user : {0}".
                    format(author_id))

        return total_user_ids

    def crawl_users_restricted(self, author_ids, author_type, restriction):
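        # NOTE: the restriction parameter is accepted (callers pass
        # restriction=0) but is not used anywhere in the body below.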
        print("--- crawl_users restricted---")
        total_user_ids = []
        for author_id in author_ids:
            try:
                print("--- crawl_user_ids for author id : " + str(author_id))

                get_sleep_function_name = "get_sleep_time_for_get_" + author_type + "_ids_request"
                seconds_to_wait = getattr(self._twitter_api_requester,
                                          get_sleep_function_name)()
                if seconds_to_wait != 0:
                    self.save_connections_and_wait(seconds_to_wait)
                    init_num_of_get_user_ids_requests_func_name = "init_num_of_get_" + author_type + "_ids_requests"
                    getattr(self._twitter_api_requester,
                            init_num_of_get_user_ids_requests_func_name)()

                get_user_ids_by_given_user_id_function_name = "get_" + author_type + "_ids_by_user_id"
                user_ids = getattr(
                    self._twitter_api_requester,
                    get_user_ids_by_given_user_id_function_name)(author_id)

                temp_author_connections = self._db.create_temp_author_connections(
                    author_id, user_ids, author_type, self._window_start)
                self._total_author_connections = self._total_author_connections + temp_author_connections

                total_user_ids = list(set(total_user_ids + user_ids))
            except Exception as e:
                logging.exception(
                    "Failed getting followers or friends for user : {0}".
                    format(author_id))

        return total_user_ids

    def check_already_crawled_author_guids(self, author_guids):
        print("--- check_already_crawled_author_ids ----")
        author_ids_to_crawl = []
        for author_guid in author_guids:
            authors_connections = self._db.get_author_connections_by_author_guid(
                author_guid)
            num_of_authors_connections = len(authors_connections)
            if num_of_authors_connections == 0:
                author_ids_to_crawl.append(author_guid)

        print("Number of authors ids to crawl is: " +
              str(len(author_ids_to_crawl)))
        logging.info("Number of authors ids to crawl is: " +
                     str(len(author_ids_to_crawl)))
        print(author_ids_to_crawl)
        logging.info(author_ids_to_crawl)
        return author_ids_to_crawl

    def check_already_crawled_post_id(self, post_id):
        post_retweeter_connections = self._db.get_post_retweeter_connections_by_post_id(
            post_id)
        return len(post_retweeter_connections) > 0

    def crawl_retweeters_by_post_id(self, post_ids, are_user_ids, author_type,
                                    bad_actors_collector_insertion_type):
        self._total_author_connections = []
        total_retweeter_ids = []
        for post_id in post_ids:
            retweeter_ids = self._twitter_api_requester.get_retweeter_ids_by_status_id(
                post_id)
            total_retweeter_ids = list(set(total_retweeter_ids +
                                           retweeter_ids))

            post_retweeter_connections = self._db.create_post_retweeter_connections(
                post_id, retweeter_ids)
            self._total_author_connections = self._total_author_connections + post_retweeter_connections

        self._db.save_author_connections(self._total_author_connections)
        self._total_author_connections = []

        users = self.handle_get_users_request(
            total_retweeter_ids, are_user_ids, author_type,
            bad_actors_collector_insertion_type)
        self.convert_twitter_users_to_authors_and_save(
            users, author_type, bad_actors_collector_insertion_type)

    def get_retweets_by_post_id(self, post_id):
        retweets = self._twitter_api_requester.get_retweets_by_status_id(
            post_id)
        print(retweets)

    def create_author_connections(self, source_author_id,
                                  destination_author_ids,
                                  author_connection_type):
        print("---create_author_connections---")
        logging.info("---create_author_connections---")
        author_connections = []
        for destination_author_id in destination_author_ids:
            author_connection = self.create_author_connection(
                source_author_id, destination_author_id,
                author_connection_type)
            author_connections.append(author_connection)

        return author_connections

    def create_author_connection(self, source_author_id, destination_author_id,
                                 connection_type):
        print("---create_author_connection---")
        author_connection = AuthorConnection()
        print("Author connection: source -> " + str(source_author_id) +
              ", dest -> " + str(destination_author_id) +
              ", connection type = " + connection_type)
        author_connection.source_author_osn_id = source_author_id
        author_connection.destination_author_osn_id = destination_author_id
        author_connection.connection_type = unicode(connection_type)
        author_connection.insertion_date = self._window_start

        return author_connection

    def count_down_time(self, seconds_to_wait):
        # The original branches compared with "is not 0" (identity, not
        # equality), and the elif was unreachable; the log messages indicate
        # the intended thresholds of 300 and 400 seconds.
        if seconds_to_wait != 0 and seconds_to_wait < 300:
            print("Seconds to wait is lower than 300: " + str(seconds_to_wait))
            logging.info("Seconds to wait is lower than 300: " +
                         str(seconds_to_wait))
            seconds_to_wait += 100
            print("Seconds to wait were increased to: " + str(seconds_to_wait))
            logging.info("Seconds to wait were increased to: " +
                         str(seconds_to_wait))
        elif seconds_to_wait != 0 and seconds_to_wait < 400:
            print("Seconds to wait is lower than 400: " + str(seconds_to_wait))
            logging.info("Seconds to wait is lower than 400: " +
                         str(seconds_to_wait))
            seconds_to_wait += 90
            print("Seconds to wait were increased to: " + str(seconds_to_wait))
            logging.info("Seconds to wait were increased to: " +
                         str(seconds_to_wait))
        for i in xrange(seconds_to_wait, 0, -1):
            time.sleep(1)
            msg = "\r Count down: [{}]".format(i)
            print(msg, end="")
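
    # Elsewhere in this class (see get_tweets_by_ids) the wait is derived
    # from a rate-limit object whose .reset field is an epoch timestamp:
    #     seconds_to_wait = int(rate_limit.reset - time.time() + 5)
    # The +5 is a small safety margin so the rate-limit window has reset.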

    def convert_twitter_users_to_authors_and_save(self, total_twitter_users,
                                                  author_type,
                                                  insertion_type):
        authors = self.convert_twitter_users_to_authors(
            total_twitter_users, author_type, insertion_type)
        print("Total converted Twitter users into authors is: " +
              str(len(authors)))
        self.save_authors(authors)
        self._db.save_author_connections(self._total_author_connections)
        self._total_author_connections = []

    def convert_twitter_users_to_authors(self, total_twitter_users,
                                         author_type, insertion_type):
        print("---Converting Twitter users to authors---")
        convert_twitter_users_to_authors_start_time = time.time()
        authors = self._db.convert_twitter_users_to_authors(
            total_twitter_users, self._domain, author_type, insertion_type)
        convert_twitter_users_to_authors_end_time = time.time()
        convert_twitter_users_to_authors_time = convert_twitter_users_to_authors_end_time - convert_twitter_users_to_authors_start_time
        print("Convert Twitter users to authors took in seconds: " +
              str(convert_twitter_users_to_authors_time))

        return authors

    def save_authors(self, authors):
        print("---Saving authors in DB---")
        print("Number of authors to save is: " + str(len(authors)))
        save_authors_start_time = time.time()
        self._db.add_authors(authors)
        save_authors_end_time = time.time()
        save_authors_time = save_authors_end_time - save_authors_start_time
        print("Saving authors in DB took in seconds: " +
              str(save_authors_time))

    def save_author_connections(self):
        print("---Saving author connections in DB---")
        save_author_connections_start_time = time.time()
        self._db.add_author_connections(self._total_author_connections)
        save_author_connections_end_time = time.time()
        save_author_connections_time = save_author_connections_end_time - save_author_connections_start_time
        print("Saving author connections in DB took in seconds: " +
              str(save_author_connections_time))
        self._total_author_connections = []

    def handle_get_users_request(self, ids, are_user_ids, author_type,
                                 insertion_type):
        total_users = []
        users = []
        # split_into_equal_chunks appears to return a generator (the original
        # recreated it after list() exhausted it); materialize the chunks once
        # and reuse the list for both the count and the iteration.
        ids_in_chunks = list(
            split_into_equal_chunks(
                ids,
                self._maximal_user_ids_allowed_in_single_get_user_request))
        total_chunks = ids_in_chunks
        print("Total authors ids in chunk from twitter API: " +
              str(len(total_chunks)))
        i = 0
        for ids_in_chunk in ids_in_chunks:
            i += 1
            print("Chunk of authors ids: " + str(i) + "/" +
                  str(len(total_chunks)))

            try:
                users = self.send_get_users_request_and_add_users(
                    ids_in_chunk, are_user_ids, users)
                total_users = total_users + users

            except TwitterError as e:

                print(e)
                error_messages = e.message
                error_message_dict = error_messages[0]
                error_code = error_message_dict['code']
                if error_code == 88:  # Rate limit exceeded
                    self.convert_twitter_users_to_authors_and_save(
                        total_users, author_type, insertion_type)
                    total_users = []

                    seconds_to_wait_object = self._twitter_api_requester.get_sleep_time_for_get_users_request(
                    )
                    if seconds_to_wait_object > 0:
                        self.count_down_time(seconds_to_wait_object)
                    #epoch_timestamp = seconds_to_wait_object.reset
                    #current_timestamp = time.time()
                    #seconds_to_wait = int(epoch_timestamp - current_timestamp + 5)
                    #count_down_time(seconds_to_wait)

                    users = self.send_get_users_request_and_add_users(
                        ids_in_chunk, are_user_ids, users)
                    total_users = total_users + users

        print("--- Finishing handle_get_users_request --- ")
        logging.info("--- Finishing handle_get_users_request --- ")
        # self.save_authors_and_connections(users, author_type, insertion_type)
        return total_users

    def save_authors_and_connections_and_wait(self, total_twitter_users,
                                              author_type, insertion_type):
        self.save_authors_and_connections(total_twitter_users, author_type,
                                          insertion_type)

        seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_users_request(
        )
        self.count_down_time(seconds_to_wait)

    def save_authors_and_connections(self, total_twitter_users, author_type,
                                     insertion_type):
        self.convert_twitter_users_to_authors_and_save(total_twitter_users,
                                                       author_type,
                                                       insertion_type)

    def send_get_users_request_and_add_users(self, ids_in_chunk, are_user_ids,
                                             total_twitter_users):
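        # NOTE: total_twitter_users is accepted but unused here; only the
        # current chunk's users are fetched and returned.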
        twitter_users = self.send_get_users_request(ids_in_chunk, are_user_ids)
        return twitter_users

    def save_connections_and_wait(self, seconds_to_wait):
        self.save_author_connections()
        self.count_down_time(seconds_to_wait)

    def send_get_users_request(self, ids_in_chunk, are_user_ids):
        if are_user_ids is True:
            twitter_users = self._twitter_api_requester.get_users_by_ids(
                ids_in_chunk)
        else:
            twitter_users = self._twitter_api_requester.get_users_by_screen_names(
                ids_in_chunk)

        return twitter_users

    def handle_retweeters_request(self, retweeter_ids, author_type,
                                  bad_actors_collector_insertion_type):
        total_retweeters = []
        retweeter_ids_in_chunks = split_into_equal_chunks(
            retweeter_ids,
            self._maximal_user_ids_allowed_in_single_get_user_request)
        for retweeter_ids_in_chunk in retweeter_ids_in_chunks:
            retweeters = self._twitter_api_requester.get_users_by_ids(
                retweeter_ids_in_chunk)
            total_retweeters = total_retweeters + retweeters

        self.convert_twitter_users_to_authors_and_save(
            total_retweeters, author_type,
            bad_actors_collector_insertion_type)

    def remove_already_crawled_authors(self, total_user_ids):
        print("remove_already_crawled_authors")
        number_of_extracted_users = len(total_user_ids)
        print("Total number of extracted users is: " +
              str(number_of_extracted_users))
        total_follower_ids_set = set(total_user_ids)

        already_crawled_author_ids = self._db.get_already_crawled_author_ids()
        number_of_already_crawled_authors = len(already_crawled_author_ids)
        print("Total number of already crawled users is: " +
              str(number_of_already_crawled_authors))
        already_crawled_author_ids_set = set(already_crawled_author_ids)

        authors_ids_to_crawl_set = total_follower_ids_set - already_crawled_author_ids_set
        number_of_remaining_authors_ids_to_crawl = len(
            authors_ids_to_crawl_set)
        print("Total number of remaining users to crawl is: " +
              str(number_of_remaining_authors_ids_to_crawl))

        authors_ids_to_crawl = list(authors_ids_to_crawl_set)

        return authors_ids_to_crawl

    def get_timeline_by_author_id(self, author_id):
        author_timeline = self._twitter_api_requester.get_timeline_by_user_id(
            author_id)
        return author_timeline

    def get_status_by_twitter_status_id(self, status_id):
        # try:
        if self._num_of_twitter_status_id_requests >= self._num_of_twitter_status_id_requests_without_checking:
            seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_twitter_status_id(
            )
            if seconds_to_wait > 0:
                self.count_down_time(seconds_to_wait)
                self._num_of_twitter_status_id_requests = 0
        self._num_of_twitter_status_id_requests = self._num_of_twitter_status_id_requests + 1
        return self._twitter_api_requester.get_status(status_id)
        # except TwitterError as e:
        #     exception_response = e[0][0]
        #     logging.info("e.massage =" + exception_response["message"])
        #     code = exception_response["code"]
        #     logging.info("e.code =" + str(exception_response["code"]))
        #
        #     if code == 88:
        #         sec = self._twitter_api_requester.get_sleep_time_for_twitter_status_id()
        #         logging.info("Seconds to wait from catched crush is: " + str(sec))
        #         if sec != 0:
        #             count_down_time(sec)
        #             self._num_of_twitter_status_id_requests = 0
        #         return self._twitter_api_requester.get_status(id)

    def get_timeline_by_author_name(self, author_name,
                                    maximal_tweets_count_in_timeline):
        try:
            print("Number of timeline requests is: " +
                  str(self._num_of_twitter_timeline_requests))
            if self._num_of_twitter_timeline_requests >= self._num_of_twitter_timeline_requests_without_checking:
                seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_twitter_timeline_request(
                )
                if seconds_to_wait > 0:
                    self.count_down_time(seconds_to_wait)
                    self._num_of_twitter_timeline_requests = 0
            self._num_of_twitter_timeline_requests = self._num_of_twitter_timeline_requests + 1
            return self._twitter_api_requester.get_timeline(
                author_name, maximal_tweets_count_in_timeline)

        except TwitterError as e:
            if e.message == "Not authorized.":
                logging.info("Not authorized for user id: " + str(author_name))
                return None

            exception_response = e[0][0]
            logging.info("e.message =" + exception_response["message"])
            code = exception_response["code"]
            logging.info("e.code =" + str(exception_response["code"]))

            if code == 34:
                return None

            sec = self._twitter_api_requester.get_sleep_time_for_twitter_timeline_request(
            )
            logging.info("Seconds to wait from catched crush is: " + str(sec))
            count_down_time(sec)
            if sec != 0:
                self._num_of_twitter_timeline_requests = 0
            timeline = self._twitter_api_requester.get_timeline(
                author_name, maximal_tweets_count_in_timeline)
            return timeline

    def get_active_users_names_by_screen_names(self, chunk_of_names):
        try:
            users = self._twitter_api_requester.get_users_by_screen_names(
                chunk_of_names)
        except TwitterError as e:
            logging.info(e.message)
            sec = self._twitter_api_requester.get_sleep_time_for_get_users_request(
            )
            logging.info("Seconds to wait from catched crush is: " + str(sec))
            count_down_time(sec)
            users = self._twitter_api_requester.get_users_by_screen_names(
                chunk_of_names)
        return [user.screen_name for user in users]

    def get_sleep_time_for_twitter_status_id(self):
        return self._twitter_api_requester.get_sleep_time_for_twitter_status_id(
        )

    def get_status(self, status_id):
        return self._twitter_api_requester.get_status(status_id)

    def get_posts_by_terms(self, terms):
        posts = {
            term:
            self._twitter_api_requester.get_tweets_by_term(term, 'recent')
            for term in terms
        }
        return posts

    def get_post_by_post_id(self, post_id):
        return self._twitter_api_requester.get_tweet_by_post_id(post_id)

    def get_tweets_by_tweet_ids_and_add_to_db(self, tweet_ids):
        total_tweets = self.get_tweets_by_ids(tweet_ids)

        posts, authors = self._db.convert_tweets_to_posts_and_authors(
            total_tweets, self._domain)
        self._db.addPosts(posts)
        self._db.add_authors(authors)

        return total_tweets

    # TODO: move to schema definition

    def get_tweets_by_ids(self, tweet_ids, author_type=""):
        total_tweets = []
        # Materialize the chunks once instead of recreating the generator
        # after list() exhausts it.
        ids_in_chunks = list(
            split_into_equal_chunks(
                tweet_ids, self.
                _max_tweet_ids_allowed_in_single_get_tweets_by_tweet_ids_request
            ))
        # seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_tweets_by_tweet_ids_request()
        total_chunks = ids_in_chunks
        i = 0
        for ids_in_chunk in ids_in_chunks:
            i += 1
            print("Chunk of tweet ids: " + str(i) + "/" +
                  str(len(total_chunks)))
            try:
                tweets = self._twitter_api_requester.get_tweets_by_post_ids(
                    ids_in_chunk)
                total_tweets = list(set(total_tweets + tweets))
                num_of_tweets = len(total_tweets)
                if num_of_tweets > 10000:
                    self._save_posts_and_authors(total_tweets, author_type)
                    total_tweets = []

            except TwitterError as e:
                print(e)
                error_messages = e.message
                error_message_dict = error_messages[0]
                error_code = error_message_dict['code']
                if error_code == 88:  # Rate limit exceeded
                    self._save_posts_and_authors(total_tweets, author_type)
                    total_tweets = []

                    seconds_to_wait_object = self._twitter_api_requester.get_sleep_time_for_get_tweets_by_tweet_ids_request(
                    )
                    epoch_timestamp = seconds_to_wait_object.reset
                    current_timestamp = time.time()
                    seconds_to_wait = int(epoch_timestamp - current_timestamp +
                                          5)
                    self.count_down_time(seconds_to_wait)
                    tweets = self._twitter_api_requester.get_tweets_by_post_ids(
                        ids_in_chunk)
                    total_tweets = list(set(total_tweets + tweets))
        return total_tweets

    # def create_post_from_tweet_data(self, tweet_data):
    #     author_name = tweet_data.user.screen_name
    #     tweet_author_guid = compute_author_guid_by_author_name(author_name)
    #     tweet_author_guid = cleanForAuthor(tweet_author_guid)
    #     tweet_post_twitter_id = str(tweet_data.id)
    #     tweet_url = generate_tweet_url(tweet_post_twitter_id, author_name)
    #     tweet_creation_time = tweet_data.created_at
    #     tweet_str_publication_date = extract_tweet_publiction_date(tweet_creation_time)
    #     tweet_guid = compute_post_guid(post_url=tweet_url, author_name=author_name,
    #                                    str_publication_date=tweet_str_publication_date)
    #
    #     post = Post(guid=tweet_guid, post_id=tweet_guid, url=unicode(tweet_url),
    #                       date=str_to_date(tweet_str_publication_date),
    #                       title=unicode(tweet_data.text), content=unicode(tweet_data.text),
    #                       post_osn_id=tweet_post_twitter_id,
    #                       author=unicode(author_name), author_guid=unicode(tweet_author_guid),
    #                       domain=unicode(self._domain),
    #                       retweet_count=unicode(tweet_data.retweet_count),
    #                       favorite_count=unicode(tweet_data.favorite_count),
    #                       timeline_importer_insertion_date=unicode(get_current_time_as_string()))
    #     return post

    def _save_posts_and_authors(self, total_tweets, author_type=None):
        posts, authors = self._db.convert_tweets_to_posts_and_authors(
            total_tweets, self._domain)
        for author in authors:
            author.author_type = author_type
        self._db.addPosts(posts)
        self._db.add_authors(authors)
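
# split_into_equal_chunks is used throughout the class above but is defined
# elsewhere in the project. A minimal generator sketch consistent with how it
# is called here (an assumption, not the actual implementation):
def _sketch_split_into_equal_chunks(items, chunk_size):
    # Yield consecutive slices of at most chunk_size items.
    for start in xrange(0, len(items), chunk_size):
        yield items[start:start + chunk_size]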

# ---- Code example #4 ----

class RedditFeatureGeneratorTest(TestCase):
    def setUp(self):
        self._db = DB()
        self._db.setUp()
        self._posts = []
        self._author = None
        self._init_authors()
        self._init_posts()
        self._init_claims()
        self._reddit_post_by_claim_feature_generator = RedditPostByClaimFeatureGenerator(
            self._db, **self._get_params())
        self._reddit_author_by_claim_feature_generator = RedditAuthorByClaimFeatureGenerator(
            self._db, **self._get_params())

    def tearDown(self):
        self._db.session.close()
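
    # assert_author_feature_test_case is called by the tests below but is not
    # shown in this excerpt. A minimal sketch, assuming the DB exposes
    # get_author_feature(author_guid, attribute_name) as in the classes above
    # (hypothetical, for illustration only):
    #     def assert_author_feature_test_case(self, test_case, prefix):
    #         attribute_name = prefix + '_' + test_case['test_name']
    #         feature = self._db.get_author_feature(test_case['claim_id'],
    #                                               attribute_name)
    #         self.assertAlmostEqual(float(feature.attribute_value),
    #                                test_case['expected'], places=4)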

    def test_karma_by_submission_and_comment(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'min_karma_by_submission_and_comment',
            'expected': -13
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'min_karma_by_submission_and_comment',
            'expected': -321
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'min_karma_by_submission_and_comment',
            'expected': 1
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'max_karma_by_submission_and_comment',
            'expected': 52312
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'max_karma_by_submission_and_comment',
            'expected': 102
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'max_karma_by_submission_and_comment',
            'expected': 234
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'mean_karma_by_submission_and_comment',
            'expected': 5904.222222
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'mean_karma_by_submission_and_comment',
            'expected': -19.55555556
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'mean_karma_by_submission_and_comment',
            'expected': 38.5
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'median_karma_by_submission_and_comment',
            'expected': 27
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'median_karma_by_submission_and_comment',
            'expected': 7
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'median_karma_by_submission_and_comment',
            'expected': 5
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'skew_karma_by_submission_and_comment',
            'expected': 2.998904337
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'skew_karma_by_submission_and_comment',
            'expected': -2.525365088
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'skew_karma_by_submission_and_comment',
            'expected': 2.234762661
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'kurtosis_karma_by_submission_and_comment',
            'expected': 8.995080203
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'kurtosis_karma_by_submission_and_comment',
            'expected': 7.357797068
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'kurtosis_karma_by_submission_and_comment',
            'expected': 4.503581242
        }]
        self._reddit_post_by_claim_feature_generator._measure_names = [
            'karma_by_submission_and_comment'
        ]
        self._reddit_post_by_claim_feature_generator._aggregation_functions = [
            'min', 'max', 'mean', 'median', 'skew', 'kurtosis'
        ]
        self._reddit_post_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_post_by_claim_feature_generator.
                __class__.__name__)
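
    # The aggregation names above map onto pandas Series reductions; the
    # expected values can be reproduced offline with something like
    # (illustrative, assuming the karma values live in a pandas Series):
    #     karmas = pd.Series(karma_values_for_claim)
    #     karmas.min(), karmas.max(), karmas.mean(), karmas.median(),
    #     karmas.skew(), karmas.kurtosis()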

    def test_karma_by_submission(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'min_karma_by_submission',
            'expected': 738
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'min_karma_by_submission',
            'expected': -321
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'min_karma_by_submission',
            'expected': 123
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'max_karma_by_submission',
            'expected': 52312
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'max_karma_by_submission',
            'expected': 102
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'max_karma_by_submission',
            'expected': 234
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'mean_karma_by_submission',
            'expected': 26525
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'mean_karma_by_submission',
            'expected': -109.5
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'mean_karma_by_submission',
            'expected': 178.5
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'median_karma_by_submission',
            'expected': 26525
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'median_karma_by_submission',
            'expected': -109.5
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'median_karma_by_submission',
            'expected': 178.5
        }]
        self._reddit_post_by_claim_feature_generator._measure_names = [
            'karma_by_submission'
        ]
        self._reddit_post_by_claim_feature_generator._aggregation_functions = [
            'min', 'max', 'mean', 'median', 'skew', 'kurtosis'
        ]
        self._reddit_post_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_post_by_claim_feature_generator.
                __class__.__name__)

    def test_upvotes_by_submission(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'min_upvotes_by_submission',
            'expected': 762
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'min_upvotes_by_submission',
            'expected': 112
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'min_upvotes_by_submission',
            'expected': 369
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'max_upvotes_by_submission',
            'expected': 74593
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'max_upvotes_by_submission',
            'expected': 241
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'max_upvotes_by_submission',
            'expected': 2067
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'mean_upvotes_by_submission',
            'expected': 37677.5
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'mean_upvotes_by_submission',
            'expected': 176.5
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'mean_upvotes_by_submission',
            'expected': 1218
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'median_upvotes_by_submission',
            'expected': 37677.5
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'median_upvotes_by_submission',
            'expected': 176.5
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'median_upvotes_by_submission',
            'expected': 1218
        }]
        self._reddit_post_by_claim_feature_generator._measure_names = [
            'upvotes_by_submission'
        ]
        self._reddit_post_by_claim_feature_generator._aggregation_functions = [
            'min', 'max', 'mean', 'median', 'skew', 'kurtosis'
        ]
        self._reddit_post_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_post_by_claim_feature_generator.
                __class__.__name__)

    def test_downvotes_by_submission(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'min_downvotes_by_submission',
            'expected': 24
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'min_downvotes_by_submission',
            'expected': 10
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'min_downvotes_by_submission',
            'expected': 246
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'max_downvotes_by_submission',
            'expected': 22281
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'max_downvotes_by_submission',
            'expected': 562
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'max_downvotes_by_submission',
            'expected': 1833
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'mean_downvotes_by_submission',
            'expected': 11152.5
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'mean_downvotes_by_submission',
            'expected': 286
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'mean_downvotes_by_submission',
            'expected': 1039.5
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'median_downvotes_by_submission',
            'expected': 11152.5
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'median_downvotes_by_submission',
            'expected': 286
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'median_downvotes_by_submission',
            'expected': 1039.5
        }]
        self._reddit_post_by_claim_feature_generator._measure_names = [
            'downvotes_by_submission'
        ]
        self._reddit_post_by_claim_feature_generator._aggregation_functions = [
            'min', 'max', 'mean', 'median', 'skew', 'kurtosis'
        ]
        self._reddit_post_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_post_by_claim_feature_generator.
                __class__.__name__)

    def test_author_comment_karma(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'min_comment_karma',
            'expected': 2261
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'min_comment_karma',
            'expected': 2842
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'min_comment_karma',
            'expected': 2842
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'max_comment_karma',
            'expected': 37027
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'max_comment_karma',
            'expected': 35111
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'max_comment_karma',
            'expected': 30880
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'mean_comment_karma',
            'expected': 19096.66667
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'mean_comment_karma',
            'expected': 18031
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'mean_comment_karma',
            'expected': 11833.5
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'median_comment_karma',
            'expected': 22588
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'median_comment_karma',
            'expected': 16555
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'median_comment_karma',
            'expected': 6806
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'skew_comment_karma',
            'expected': -0.018614054
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'skew_comment_karma',
            'expected': 0.128211429
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'skew_comment_karma',
            'expected': 1.862860226
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'kurtosis_comment_karma',
            'expected': -1.992620739
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'kurtosis_comment_karma',
            'expected': -2.723581645
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'kurtosis_comment_karma',
            'expected': 3.595027437
        }]
        self._reddit_author_by_claim_feature_generator._measure_names = [
            'comment_karma'
        ]
        self._reddit_author_by_claim_feature_generator._aggregation_functions = [
            'min', 'max', 'mean', 'median', 'skew', 'kurtosis'
        ]
        self._reddit_author_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_author_by_claim_feature_generator.
                __class__.__name__)

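    # A minimal sketch (assuming the generator aggregates with pandas, which
    # the 'skew'/'kurtosis' measure names suggest) of how the expected values
    # above could be reproduced, e.g. for claim cd2e1978-4dfa-3a40-b62f-71153001629c:
    #
    #   import pandas as pd
    #   karma = pd.Series([6243, 25741, 37027, 22588, 35111,
    #                      30880, 9177, 2842, 2261])
    #   karma.mean()  # 19096.66667, cf. mean_comment_karma above
    #   karma.skew()  # bias-corrected sample skewness, cf. skew_comment_karma
    #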
    def test_author_link_karma(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'min_link_karma',
            'expected': 1
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'min_link_karma',
            'expected': 1
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'min_link_karma',
            'expected': 90
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'max_link_karma',
            'expected': 171576
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'max_link_karma',
            'expected': 171576
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'max_link_karma',
            'expected': 5897
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'mean_link_karma',
            'expected': 20565.77778
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'mean_link_karma',
            'expected': 29840.16667
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'mean_link_karma',
            'expected': 1866
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'median_link_karma',
            'expected': 1341
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'median_link_karma',
            'expected': 738.5
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'median_link_karma',
            'expected': 738.5
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'skew_link_karma',
            'expected': 2.991811692
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'skew_link_karma',
            'expected': 2.443747273
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'skew_link_karma',
            'expected': 1.751305522
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'kurtosis_link_karma',
            'expected': 8.963145712
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'kurtosis_link_karma',
            'expected': 5.977609271
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'kurtosis_link_karma',
            'expected': 3.018013716
        }]
        self._reddit_author_by_claim_feature_generator._measure_names = [
            'link_karma'
        ]
        self._reddit_author_by_claim_feature_generator._aggregation_functions = [
            'min', 'max', 'mean', 'median', 'skew', 'kurtosis'
        ]
        self._reddit_author_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_author_by_claim_feature_generator.
                __class__.__name__)

    def test_author_total_karma(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'min_total_karma',
            'expected': 2435
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'min_total_karma',
            'expected': 6379
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'min_total_karma',
            'expected': 6379
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'max_total_karma',
            'expected': 206687
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'max_total_karma',
            'expected': 206687
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'max_total_karma',
            'expected': 32221
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'mean_total_karma',
            'expected': 39662.44444
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'mean_total_karma',
            'expected': 47871.16667
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'mean_total_karma',
            'expected': 13699.5
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'median_total_karma',
            'expected': 22589
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'median_total_karma',
            'expected': 17240.5
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'median_total_karma',
            'expected': 8099
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'skew_total_karma',
            'expected': 2.767953592
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'skew_total_karma',
            'expected': 2.349097328
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'skew_total_karma',
            'expected': 1.963784833
        }, {
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': 'kurtosis_total_karma',
            'expected': 7.954685555
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': 'kurtosis_total_karma',
            'expected': 5.605190323
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': 'kurtosis_total_karma',
            'expected': 3.878351431
        }]
        self._reddit_author_by_claim_feature_generator._measure_names = [
            'total_karma'
        ]
        self._reddit_author_by_claim_feature_generator._aggregation_functions = [
            'min', 'max', 'mean', 'median', 'skew', 'kurtosis'
        ]
        self._reddit_author_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_author_by_claim_feature_generator.
                __class__.__name__)

    def test_author_count_is_gold(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': '_count_is_gold',
            'expected': 3
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': '_count_is_gold',
            'expected': 3
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': '_count_is_gold',
            'expected': 3
        }]
        self._reddit_author_by_claim_feature_generator._measure_names = [
            'count_is_gold'
        ]
        self._reddit_author_by_claim_feature_generator._aggregation_functions = []
        self._reddit_author_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_author_by_claim_feature_generator.
                __class__.__name__)

    def test_author_count_is_moderator(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': '_count_is_moderator',
            'expected': 2
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': '_count_is_moderator',
            'expected': 1
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': '_count_is_moderator',
            'expected': 0
        }]
        self._reddit_author_by_claim_feature_generator._measure_names = [
            'count_is_moderator'
        ]
        self._reddit_author_by_claim_feature_generator._aggregation_functions = []
        self._reddit_author_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_author_by_claim_feature_generator.
                __class__.__name__)

    def test_author_count_is_employee(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': '_count_is_employee',
            'expected': 3
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': '_count_is_employee',
            'expected': 1
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': '_count_is_employee',
            'expected': 1
        }]
        self._reddit_author_by_claim_feature_generator._measure_names = [
            'count_is_employee'
        ]
        self._reddit_author_by_claim_feature_generator._aggregation_functions = []
        self._reddit_author_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_author_by_claim_feature_generator.
                __class__.__name__)

    def test_author_ratio_is_gold(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': '_ratio_is_gold',
            'expected': 0.333333333
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': '_ratio_is_gold',
            'expected': 0.5
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': '_ratio_is_gold',
            'expected': 0.75
        }]
        self._reddit_author_by_claim_feature_generator._measure_names = [
            'ratio_is_gold'
        ]
        self._reddit_author_by_claim_feature_generator._aggregation_functions = []
        self._reddit_author_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_author_by_claim_feature_generator.
                __class__.__name__)

    def test_author_ratio_is_moderator(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': '_ratio_is_moderator',
            'expected': 0.222222222
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': '_ratio_is_moderator',
            'expected': 0.166666667
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': '_ratio_is_moderator',
            'expected': 0
        }]
        self._reddit_author_by_claim_feature_generator._measure_names = [
            'ratio_is_moderator'
        ]
        self._reddit_author_by_claim_feature_generator._aggregation_functions = []
        self._reddit_author_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_author_by_claim_feature_generator.
                __class__.__name__)

    def test_author_ratio_is_employee(self):
        test_cases = [{
            'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c',
            'test_name': '_ratio_is_employee',
            'expected': 0.333333333
        }, {
            'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe',
            'test_name': '_ratio_is_employee',
            'expected': 0.166666667
        }, {
            'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            'test_name': '_ratio_is_employee',
            'expected': 0.25
        }]
        self._reddit_author_by_claim_feature_generator._measure_names = [
            'ratio_is_employee'
        ]
        self._reddit_author_by_claim_feature_generator._aggregation_functions = []
        self._reddit_author_by_claim_feature_generator.execute()

        for test_case in test_cases:
            self.assert_author_feature_test_case(
                test_case, self._reddit_author_by_claim_feature_generator.
                __class__.__name__)

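    # Feature names persisted by the generators follow the pattern
    # "<GeneratorClassName>_<test_name>" (see the helper below), where
    # test_name already carries the aggregation prefix (e.g.
    # "min_upvotes_by_submission"); count/ratio measures apparently use an
    # empty prefix, hence names such as "_count_is_gold" above.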
    def assert_author_feature_test_case(self, test_case, class_name):
        self.assert_author_feature_number(
            test_case['claim_id'], "{}_{}".format(class_name,
                                                  test_case['test_name']),
            test_case['expected'])

    def assert_author_feature_number(self, author_guid, attribute_name,
                                     expected):
        result_feature = self._db.get_author_feature(author_guid,
                                                     attribute_name)
        feature_value = result_feature.attribute_value
        self.assertAlmostEqual(float(expected), float(feature_value), places=2)

    def _add_author(self,
                    name=None,
                    link_karma=None,
                    comment_karma=None,
                    is_employee=0,
                    is_mod=0,
                    is_gold=0,
                    author_osn_id=None):
        author = Author()
        reddit_author = RedditAuthor()
        author.name = name
        author.author_screen_name = author.name
        author.author_guid = compute_author_guid_by_author_name(author.name)
        author.domain = 'reddit'
        author.author_osn_id = author_osn_id
        author.author_full_name = name
        author.url = 'https://www.reddit.com/user/' + name

        reddit_author.name = author.name
        reddit_author.author_guid = author.author_guid

        reddit_author.comments_count = None
        reddit_author.comment_karma = comment_karma
        reddit_author.link_karma = link_karma
        reddit_author.is_gold = is_gold
        reddit_author.is_moderator = is_mod
        reddit_author.is_employee = is_employee

        self._db.add_authors([author])
        self._db.add_reddit_authors([reddit_author])
        # self._author = author

    def _add_post(self, author, date, post_osn_id, score=0, upvote_ratio=-1):
        post = Post()
        post.post_osn_id = post_osn_id
        post.author = str(author)
        post.author_guid = compute_author_guid_by_author_name(post.author)
        post.created_at = str_to_date(date, formate="%d/%m/%Y %H:%M")
        post.url = 'https://www.reddit.com{}'.format(
            post.author)  # just for test
        post.guid = compute_post_guid(post.url, post.post_osn_id,
                                      date_to_str(post.created_at))
        post.domain = 'reddit_comment'
        post.post_type = 'reddit_comment'
        post.post_id = post.guid

        reddit_post = RedditPost()
        reddit_post.post_id = post.post_id
        reddit_post.guid = post.guid
        reddit_post.score = score
        if upvote_ratio != -1:
            post.domain = 'reddit_post'
            post.post_type = 'reddit_post'
            reddit_post.upvote_ratio = upvote_ratio
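            # Recover ups/downs from score and upvote_ratio via Reddit's
            # definitions: score = ups - downs and
            # upvote_ratio = ups / (ups + downs), so
            # ups = upvote_ratio * score / (2 * upvote_ratio - 1).
            # The formula is undefined at upvote_ratio == 0.5 (equal ups and
            # downs), so that case falls back to score / 2.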
            reddit_post.ups = int(
                round((reddit_post.upvote_ratio * reddit_post.score) /
                      (2 * reddit_post.upvote_ratio - 1)) if
                reddit_post.upvote_ratio != 0.5 else round(reddit_post.score /
                                                           2))
            reddit_post.downs = reddit_post.ups - reddit_post.score
        else:
            reddit_post.ups = -1
            reddit_post.downs = -1
            reddit_post.upvote_ratio = -1

        self._db.addPosts([post, reddit_post])
        return post, reddit_post

    def _add_claim_tweet_connection(self, claim_id, post_id):
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])

    def _add_claim(self, claim_id):
        claim = Claim()
        claim.claim_id = claim_id
        self._db.addPosts([claim])

    def _init_authors(self):
        self._add_author('Smile_lifeisgood',
                         comment_karma=30880,
                         link_karma=1341,
                         is_gold=1,
                         is_mod=0,
                         is_employee=0)
        self._add_author('Cunty_Balls',
                         comment_karma=7369,
                         link_karma=90,
                         is_gold=1,
                         is_mod=0,
                         is_employee=0)
        self._add_author('I_kick_fuck_nuns',
                         comment_karma=2842,
                         link_karma=5897,
                         is_gold=1,
                         is_mod=0,
                         is_employee=0)
        self._add_author('TheRiseofMindhawk',
                         comment_karma=2261,
                         link_karma=174,
                         is_gold=1,
                         is_mod=1,
                         is_employee=0)
        self._add_author('dialog2011',
                         comment_karma=37027,
                         link_karma=4582,
                         is_gold=0,
                         is_mod=0,
                         is_employee=1)
        self._add_author('chrmanyaki',
                         comment_karma=22588,
                         link_karma=1,
                         is_gold=0,
                         is_mod=0,
                         is_employee=1)
        self._add_author('Undertakerjoe',
                         comment_karma=9177,
                         link_karma=1384,
                         is_gold=0,
                         is_mod=0,
                         is_employee=0)
        self._add_author('Lmb2298',
                         comment_karma=25741,
                         link_karma=1,
                         is_gold=0,
                         is_mod=0,
                         is_employee=0)
        self._add_author('azzazaz',
                         comment_karma=35111,
                         link_karma=171576,
                         is_gold=0,
                         is_mod=1,
                         is_employee=0)
        self._add_author('juanwonone1',
                         comment_karma=6243,
                         link_karma=136,
                         is_gold=0,
                         is_mod=0,
                         is_employee=1)

    def _init_posts(self):
        self._add_claim_tweet_connection(
            'cd2e1978-4dfa-3a40-b62f-71153001629c',
            self._add_post('juanwonone1', '15/10/2017 21:44', '76ksr4', 738,
                           0.97)[0].guid)
        self._add_claim_tweet_connection(
            'cd2e1978-4dfa-3a40-b62f-71153001629c',
            self._add_post('Lmb2298', '01/10/2017 22:24', 'dferfgh', 52312,
                           0.77)[0].guid)

        self._add_claim_tweet_connection(
            'cd2e1978-4dfa-3a40-b62f-71153001629c',
            self._add_post('dialog2011', '12/06/2017 23:45', '6gv0vk',
                           27)[0].guid)
        self._add_claim_tweet_connection(
            'cd2e1978-4dfa-3a40-b62f-71153001629c',
            self._add_post('chrmanyaki', '15/10/2017 21:58', 'doeq8ke',
                           27)[0].guid)
        self._add_claim_tweet_connection(
            'cd2e1978-4dfa-3a40-b62f-71153001629c',
            self._add_post('azzazaz', '12/06/2018 10:50', 'e0j4zkz',
                           32)[0].guid)
        self._add_claim_tweet_connection(
            'cd2e1978-4dfa-3a40-b62f-71153001629c',
            self._add_post('Smile_lifeisgood', '12/06/2018 20:08', 'e0in2zm',
                           11)[0].guid)
        self._add_claim_tweet_connection(
            'cd2e1978-4dfa-3a40-b62f-71153001629c',
            self._add_post('Undertakerjoe', '15/10/2017 22:17', 'doerbqu',
                           -13)[0].guid)
        self._add_claim_tweet_connection(
            'cd2e1978-4dfa-3a40-b62f-71153001629c',
            self._add_post('I_kick_fuck_nuns', '18/06/2017 3:39', 'dj1qid5',
                           2)[0].guid)
        self._add_claim_tweet_connection(
            'cd2e1978-4dfa-3a40-b62f-71153001629c',
            self._add_post('TheRiseofMindhawk', '13/06/2017 8:17', 'ditymrc',
                           2)[0].guid)

        self._add_claim_tweet_connection(
            'a4beae51-463f-33fc-bbf6-20eca5104afe',
            self._add_post('I_kick_fuck_nuns', '11/06/2018 18:49', '8qal3m',
                           102, 0.92)[0].guid)
        self._add_claim_tweet_connection(
            'a4beae51-463f-33fc-bbf6-20eca5104afe',
            self._add_post('juanwonone1', '16/10/2017 2:23', 'dof4fen', -321,
                           0.3)[0].guid)

        self._add_claim_tweet_connection(
            'a4beae51-463f-33fc-bbf6-20eca5104afe',
            self._add_post('Smile_lifeisgood', '13/06/2017 0:29', 'dditbt8r',
                           11)[0].guid)
        self._add_claim_tweet_connection(
            'a4beae51-463f-33fc-bbf6-20eca5104afe',
            self._add_post('Lmb2298', '15/10/2017 22:38', 'doeslie',
                           11)[0].guid)
        self._add_claim_tweet_connection(
            'a4beae51-463f-33fc-bbf6-20eca5104afe',
            self._add_post('azzazaz', '16/10/2017 0:30', 'doeyvtb', 9)[0].guid)
        self._add_claim_tweet_connection(
            'a4beae51-463f-33fc-bbf6-20eca5104afe',
            self._add_post('juanwonone1', '15/10/2017 22:50', 'doetc6j',
                           7)[0].guid)
        self._add_claim_tweet_connection(
            'a4beae51-463f-33fc-bbf6-20eca5104afe',
            self._add_post('Cunty_Balls', '16/10/2017 1:52', 'dof2x1x',
                           2)[0].guid)
        self._add_claim_tweet_connection(
            'a4beae51-463f-33fc-bbf6-20eca5104afe',
            self._add_post('Cunty_Balls', '16/10/2017 2:43', 'dof5cpo',
                           2)[0].guid)
        self._add_claim_tweet_connection(
            'a4beae51-463f-33fc-bbf6-20eca5104afe',
            self._add_post('juanwonone1', '16/10/2017 3:45', 'dof84f8',
                           1)[0].guid)

        self._add_claim_tweet_connection(
            '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            self._add_post('Cunty_Balls', '15/10/2017 22:24', 'doerqsj', 234,
                           0.53)[0].guid)
        self._add_claim_tweet_connection(
            '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            self._add_post('I_kick_fuck_nuns', '16/10/2017 21:44', '76ksr2',
                           123, 0.6)[0].guid)

        self._add_claim_tweet_connection(
            '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            self._add_post('Smile_lifeisgood', '13/06/2017 7:04', 'ditvpox',
                           7)[0].guid)
        self._add_claim_tweet_connection(
            '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            self._add_post('Smile_lifeisgood', '13/06/2017 0:51', 'ditcy28',
                           5)[0].guid)
        self._add_claim_tweet_connection(
            '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            self._add_post('juanwonone1', '15/10/2017 23:36', 'doevzsq',
                           5)[0].guid)
        self._add_claim_tweet_connection(
            '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            self._add_post('juanwonone1', '16/10/2017 0:26', 'doeynrr',
                           5)[0].guid)
        self._add_claim_tweet_connection(
            '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            self._add_post('I_kick_fuck_nuns', '11/06/2018 21:55', 'e0hy5he',
                           1)[0].guid)
        self._add_claim_tweet_connection(
            '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            self._add_post('I_kick_fuck_nuns', '11/06/2018 22:04', 'e0hyrhi',
                           1)[0].guid)
        self._add_claim_tweet_connection(
            '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            self._add_post('I_kick_fuck_nuns', '12/06/2018 1:31', 'e0icveq',
                           1)[0].guid)
        self._add_claim_tweet_connection(
            '9e875999-9a3e-3357-bfa6-ede4fe67c1c9',
            self._add_post('Cunty_Balls', '13/06/2017 7:55', 'ditxua6',
                           3)[0].guid)

    def _init_claims(self):
        self._add_claim('cd2e1978-4dfa-3a40-b62f-71153001629c')
        self._add_claim('a4beae51-463f-33fc-bbf6-20eca5104afe')
        self._add_claim('9e875999-9a3e-3357-bfa6-ede4fe67c1c9')

    def _get_params(self):
        return {'authors': [], 'posts': []}


# Template for new aggregation test cases: one dict per claim/aggregation
# pair with 'claim_id', 'test_name' (aggregation prefix + measure), and
# 'expected'.
Code example #5
class SocialNetworkCrawler(AbstractController):
    def __init__(self, db):
        AbstractController.__init__(self, db)

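        # The app number, request budgets, and batch sizes below are read from
        # this class's section of the config file via self._config_parser.eval
        # (presumably initialized by AbstractController).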
        self._working_app_number = self._config_parser.eval(self.__class__.__name__, "working_app_number")

        self._maximal_get_friend_ids_requests_in_window = self._config_parser.eval(self.__class__.__name__,
                                                                                   "maximal_get_friend_ids_requests_in_window")

        self._maximal_get_follower_ids_requests_in_window = self._config_parser.eval(self.__class__.__name__,
                                                                                     "maximal_get_follower_ids_requests_in_window")

        self._maximal_get_user_requests_in_window = self._config_parser.eval(self.__class__.__name__,
                                                                             "maximal_get_user_requests_in_window")

        self._maximal_user_ids_allowed_in_single_get_user_request = self._config_parser.eval(self.__class__.__name__,
                                                                                             "maximal_user_ids_allowed_in_single_get_user_request")

        self._num_of_twitter_status_id_requests_without_checking = self._config_parser.eval(self.__class__.__name__,
                                                                                            "num_of_twitter_status_id_requests_without_checking")
        self._num_of_twitter_timeline_requests_without_checking = self._config_parser.eval(self.__class__.__name__,
                                                                                           "num_of_twitter_timeline_requests_without_checking")

        self._num_of_get_friend_ids_requests = 0
        self._num_of_get_follower_ids_requests = 0
        self._num_of_get_timeline_statuses = 0
        self._num_of_twitter_status_id_requests = 0
        self._num_of_twitter_timeline_requests = 0
        self._num_of_get_twitter_users_requests = 0
        self._total_author_connections = []
        self._total_follower_ids = []

        print("Creating TwitterApiRequester")
        self._twitter_api_requester = TwitterApiRequester(self._working_app_number)

        # self._find_source_twitter_id()

        logging.info("Setup DB...")
        print("Setup DB...")
        self._db = DB()
        self._db.setUp()

    def fill_followers_ids_only(self, author_ids):
        for i, author_id in enumerate(author_ids):
            print("author_id: {0} {1}/{2}".format(author_id, i, len(author_ids)))
            follower_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id)
            temp_author_connections = self._db.create_temp_author_connections(author_id, follower_ids, "follower",
                                                                              self._window_start)
            self._total_author_connections = self._total_author_connections + temp_author_connections

            if len(self._total_author_connections) > 1000000:
                self._db.addPosts(self._total_author_connections)
                self._total_author_connections = []

        self._db.addPosts(self._total_author_connections)

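    # Alternates between two phases: collect follower ids while the follower
    # request budget for the current window lasts, then hydrate the collected
    # ids into full user objects, persist them, and wait out the rest of the
    # 15-minute window before resuming.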
    def fill_followers_and_their_data_simultaneously(self, author_ids):

        for i, author_id in enumerate(author_ids):
            if self._num_of_get_follower_ids_requests < self._maximal_get_follower_ids_requests_in_window:

                self._send_get_follower_ids_for_author_id(author_id, i, author_ids)

            else:
                author_type = None
                are_user_ids = True
                insertion_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR
                users = self.handle_get_users_request(self._total_follower_ids, are_user_ids, author_type,
                                                      insertion_type)
                self.convert_twitter_users_to_authors_and_save(users, "follower", insertion_type)
                #self._db.addPosts(self._total_author_connections)
                #self._db.save_author_connections(self._total_author_connections)

                self._num_of_get_twitter_users_requests = 0
                self._total_follower_ids = []
                #self._total_author_connections = []

                elapsed_minutes = (time.time() - self._last_follower_request_time) / 60
                # Wait out the remainder of the 15-minute rate-limit window
                # (the original counted down the elapsed time rather than the
                # time remaining).
                if elapsed_minutes < 15:
                    count_down_time(int((15 - elapsed_minutes) * 60))
                self._send_get_follower_ids_for_author_id(author_id, i, author_ids)


    def _send_get_follower_ids_for_author_id(self, author_id, i, author_ids):
        follower_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id)

        self._last_follower_request_time = time.time()

        self._num_of_get_follower_ids_requests += 1

        print("Bring followers {0}:{1}/{2}".format(author_id, i, len(author_ids)))

        self._total_follower_ids = self._total_follower_ids + follower_ids

        temp_author_connections = self._db.create_temp_author_connections(author_id, follower_ids, "follower",
                                                                          self._window_start)
        self._total_author_connections = self._total_author_connections + temp_author_connections



    def get_timeline_by_user_id(self, user_id):
        try:
            if self._num_of_get_timeline_statuses > self._num_of_twitter_timeline_requests_without_checking:
                seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_timeline()
                if seconds_to_wait != 0:
                    self.count_down_time(seconds_to_wait)
                    self._num_of_get_timeline_statuses = 0
            # Fetch unconditionally: the original fetched only inside the
            # rate-limit branch above, which left `timeline` unbound whenever
            # the request budget had not yet been exhausted.
            timeline = self._twitter_api_requester.get_timeline_by_user_id(user_id)
            self._num_of_get_timeline_statuses += 1
            print("Number of get timeline requests is: " + str(self._num_of_get_timeline_statuses))

            return timeline

        except TwitterError as e:
            logging.info(e.message)
            if e.message == "Not authorized.":
                logging.info("Not authorized for user id: " + str(user_id))
                return None
            sec = self._twitter_api_requester.get_sleep_time_for_timeline()
            logging.info("Seconds to wait from catched crush is: " + str(sec))
            count_down_time(sec)
            self._num_of_get_timeline_statuses = 0
            timeline = self._twitter_api_requester.get_timeline_by_user_id(user_id)
            return timeline

    def handle_get_follower_ids_request(self, source_id):
        print("--- handle_get_follower_ids_request ---")
        logging.info("--- handle_get_follower_ids_request ---")
        follower_ids = self._twitter_api_requester.get_follower_ids_by_user_id(source_id)
        follower_connection_type = str(Author_Connection_Type.FOLLOWER)
        temp_author_connections = self._db.create_temp_author_connections(source_id, follower_ids,
                                                                          follower_connection_type)
        self._total_author_connections = self._total_author_connections + temp_author_connections
        return follower_ids

    def handle_get_user_ids_request(self, source_id, author_type):
        print("--- handle_get_user_ids_request ---")
        if author_type == Author_Connection_Type.FOLLOWER:
            user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(source_id)
        elif author_type == Author_Connection_Type.FRIEND:
            user_ids = self._twitter_api_requester.get_friend_ids_by_user_id(source_id)
        else:
            # Guard against unexpected types: the original left user_ids
            # unbound here and crashed on the following line.
            raise ValueError("Unsupported author_type: {0}".format(author_type))

        author_connections = self.create_author_connections(source_id, user_ids, author_type)
        self._total_author_connections = self._total_author_connections + author_connections
        return user_ids

    def handle_get_friend_ids_request(self, source_id):
        friend_ids = self._twitter_api_requester.get_friend_ids_by_user_id(source_id)
        friend_connection_type = str(Author_Connection_Type.FRIEND)
        author_connections = self.create_author_connections(source_id, friend_ids, friend_connection_type)
        self._total_author_connections = self._total_author_connections + author_connections
        return friend_ids

    def crawl_users_by_author_ids(self, author_ids, connection_type, author_type, are_user_ids, insertion_type):
        self._total_author_connections = []

        total_follower_ids, already_checked_author_ids = self.get_followers_until_exception(author_ids, connection_type)

        self._db.save_author_connections(self._total_author_connections)

        total_user_ids_to_crawl = self.remove_already_crawled_authors(total_follower_ids)

        users = self.handle_get_users_request(total_user_ids_to_crawl, are_user_ids, author_type, insertion_type)
        self.convert_twitter_users_to_authors_and_save(users, author_type, insertion_type)

        return total_follower_ids, already_checked_author_ids


    def get_follower_ids(self, author_id):
        user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id)
        return user_ids

    def get_sleep_time_for_follower_ids(self):
        seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_follower_ids()
        return seconds_to_wait

    # def crawl_followers_by_twitter_author_ids(self, author_ids, author_type, are_user_ids, inseration_type):
    #     print("--- crawl_followers_by_twitter_author_ids ---")
    #
    #     #authors_ids_to_crawl = self.check_already_crawled_author_ids(author_ids)
    #     total_follower_ids = self.crawl_followers_ids(author_ids)
    #
    #     self.save_author_connections()
    #
    #     total_follower_ids_to_crawl = self.remove_already_crawled_authors(total_follower_ids)
    #
    #     self.handle_get_users_request(total_follower_ids_to_crawl, are_user_ids, author_type, inseration_type)
    #     #self.convert_twitter_users_to_authors_and_save(followers, author_type, inseration_type)

    # def crawl_friends_by_twitter_author_ids(self, author_ids, author_type, are_user_ids, inseration_type):
    #     # authors_ids_to_crawl = self.check_already_crawled_author_ids(author_ids)
    #     total_friends_ids = self.crawl_friends_ids(author_ids)
    #
    #     self.save_author_connections()
    #
    #     total_friends_ids_to_crawl = self.remove_already_crawled_authors(total_friends_ids)
    #
    #     friends = self.handle_get_users_request(total_friends_ids_to_crawl, are_user_ids, author_type, inseration_type)
    #     self.convert_twitter_users_to_authors_and_save(friends, author_type, inseration_type)

    # def crawl_retweeters_by_twitter_post_ids(self, post_ids, author_type, inseration_type):
    #     #authors_ids_to_crawl = self.check_already_crawled_author_ids(post_ids)
    #     total_follower_ids = self.crawl_retweeters_ids(post_ids)
    #
    #     self.save_author_connections()
    #     are_user_ids = True
    #     followers = self.handle_get_users_request(total_follower_ids, are_user_ids, author_type, inseration_type)
    #     self.convert_twitter_users_to_authors_and_save(followers, author_type, inseration_type)

    # def crawl_retweeters_ids(self, posts_ids):
    #     total_retweeter_ids = []
    #     for posts_id in posts_ids:
    #         seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_retweeter_ids_request()
    #         if seconds_to_wait == 0:
    #             retweeter_ids = self.handle_get_retweeter_ids_request(posts_id)
    #             total_retweeter_ids = list(set(total_retweeter_ids + retweeter_ids))
    #         else:
    #             self.save_connections_and_wait(seconds_to_wait)
    #             self._twitter_api_requester.init_num_of_get_follower_ids_requests()
    #             retweeter_ids = self.handle_get_retweeter_ids_request(posts_id)
    #             total_retweeter_ids = list(set(total_retweeter_ids + retweeter_ids))
    #     return total_retweeter_ids

    # def crawl_users(self, author_ids, author_type):
    #     print("--- crawl_users ---")
    #     total_user_ids = []
    #     for author_id in author_ids:
    #         print("--- crawl_user_ids for author id : " + str(author_id))
    #
    #         get_sleep_function_name = "get_sleep_time_for_get_" + author_type + "_ids_request"
    #         seconds_to_wait = getattr(self._twitter_api_requester, get_sleep_function_name)()
    #         if seconds_to_wait != 0:
    #             self.save_connections_and_wait(seconds_to_wait)
    #             init_num_of_get_user_ids_requests_func_name = "init_num_of_get_" + author_type + "_ids_requests"
    #             getattr(self._twitter_api_requester, init_num_of_get_user_ids_requests_func_name)()
    #
    #         get_user_ids_by_given_user_id_function_name = "get_" + author_type + "_ids_by_user_id"
    #         user_ids = getattr(self._twitter_api_requester, get_user_ids_by_given_user_id_function_name)(author_id)
    #
    #         temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type,
    #                                                                           self._window_start)
    #         self._total_author_connections = self._total_author_connections + temp_author_connections
    #
    #         total_user_ids = list(set(total_user_ids + user_ids))
    #
    #     return total_user_ids

    def crawl_users(self, author_ids, author_type):
        total_user_ids = []
        for i, author_id in enumerate(author_ids):
            msg = "\r Bring followers for authors: {0}/{1}".format(i, len(author_ids))
            print(msg, end="")

            try:
                user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id)

                temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type,
                                                                                  self._window_start)
                self._total_author_connections = self._total_author_connections + temp_author_connections

                total_user_ids = list(set(total_user_ids + user_ids))

            except TwitterError as e:
                # python-twitter exposes the API error payload via e.message
                # (a list of {"code": ..., "message": ...} dicts); indexing the
                # exception itself (e[0][0]) only worked on Python 2.
                exception_response = e.message[0]
                logging.info("e.message =" + exception_response["message"])
                code = exception_response["code"]
                logging.info("e.code =" + str(exception_response["code"]))

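                # Twitter API error code 88: "Rate limit exceeded" -- wait out
                # the window, then retry this author once.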
                if code == 88:
                    sec = self._twitter_api_requester.get_sleep_time_for_get_follower_ids_request()
                    sec = sec + 100
                    logging.info("Seconds to wait from catched crush is: " + str(sec))
                    if sec != 0:
                        count_down_time(sec)

                    user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id)

                    temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type,
                                                                                      self._window_start)
                    self._total_author_connections = self._total_author_connections + temp_author_connections

                    total_user_ids = list(set(total_user_ids + user_ids))

        return total_user_ids



    def get_followers_until_exception(self, author_ids, author_type):
        total_follower_ids = []
        already_checked_author_ids = []
        for i, author_id in enumerate(author_ids):
            msg = "\r Bring followers for authors: {0}/{1}".format(i, len(author_ids))
            print(msg, end="")

            try:
                user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id)
                already_checked_author_ids.append(author_id)

                if len(user_ids) > 0:
                    temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type,
                                                                                      self._window_start)
                    self._total_author_connections = self._total_author_connections + temp_author_connections

                    total_follower_ids = list(set(total_follower_ids + user_ids))
            except TwitterError as e:
                if e.message == "Not authorized.":
                    logging.info("Not authorized for user id: {0}".format(author_id))
                    return total_follower_ids, already_checked_author_ids

                # As above, the API error payload lives in e.message.
                exception_response = e.message[0]
                logging.info("e.message =" + exception_response["message"])
                code = exception_response["code"]
                logging.info("e.code =" + str(exception_response["code"]))

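                # Twitter API error code 34: the requested resource does not
                # exist; return what has been collected so far.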
                if code == 34:
                    return total_follower_ids, already_checked_author_ids

                if code == 88:
                    sec = self._twitter_api_requester.get_sleep_time_for_get_follower_ids_request()
                    sec = sec + 10
                    # logging.info("Seconds to wait from catched crush is: " + str(sec))
                    # if sec != 0:
                    print("Number of seconds to wait: {0}".format(sec))
                    count_down_time(sec)

                    try:
                        user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id)
                        already_checked_author_ids.append(author_id)

                        temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids,
                                                                                          author_type,
                                                                                          self._window_start)
                        self._total_author_connections = self._total_author_connections + temp_author_connections

                        total_follower_ids = list(set(total_follower_ids + user_ids))

                    except TwitterError as e:
                        if e.message == "Not authorized.":
                            logging.info("Not authorized for user id: {0}".format(author_id))
                            return total_follower_ids, already_checked_author_ids

            # except TwitterError as e:
            #     exception_response = e[0][0]
            #     logging.info("e.massage =" + exception_response["message"])
            #     code = exception_response["code"]
            #     logging.info("e.code =" + str(exception_response["code"]))
            #
            #     if code == 88 and len(already_checked_author_ids) != 0:
            #         return total_follower_ids, already_checked_author_ids
            #     elif code == 88 and len(already_checked_author_ids) == 0:
            #         sec = self._twitter_api_requester.get_sleep_time_for_get_follower_ids_request()
            #         sec = sec + 3100
            #         # logging.info("Seconds to wait from catched crush is: " + str(sec))
            #         # if sec != 0:
            #         print("Number of seconds to wait: {0}".format(sec))
            #         count_down_time(sec)
            #         try:
            #             user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id)
            #             already_checked_author_ids.append(author_id)
            #
            #             temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type,
            #                                                                               self._window_start)
            #             self._total_author_connections = self._total_author_connections + temp_author_connections
            #
            #             total_follower_ids = list(set(total_follower_ids + user_ids))
            #             return total_follower_ids, already_checked_author_ids
            #         except TwitterError as e:
            #             if e.message == "Not authorized.":
            #                 logging.info("Not authorized for user id: {0}".format(author_id))
            #                 return total_follower_ids, already_checked_author_ids

        return total_follower_ids, already_checked_author_ids




    def check_already_crawled_author_guids(self, author_guids):
        print("--- check_already_crawled_author_guids ----")
        author_ids_to_crawl = []
        for author_guid in author_guids:
            authors_connections = self._db.get_author_connections_by_author_guid(author_guid)
            num_of_authors_connections = len(authors_connections)
            if num_of_authors_connections == 0:
                author_ids_to_crawl.append(author_guid)

        print("Number of authors ids to crawl is: " + str(len(author_ids_to_crawl)))
        logging.info("Number of authors ids to crawl is: " + str(len(author_ids_to_crawl)))
        print(author_ids_to_crawl)
        logging.info(author_ids_to_crawl)
        return author_ids_to_crawl

    def check_already_crawled_post_id(self, post_id):
        post_retweeter_connections = self._db.get_post_retweeter_connections_by_post_id(post_id)
        num_of_post_retweeter_connections = len(post_retweeter_connections)
        if num_of_post_retweeter_connections == 0:
            return False
        return True

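    # For each post: fetch its retweeter ids, record post-retweeter
    # connections, then hydrate the union of retweeter ids into full user
    # objects and save them as authors.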
    def crawl_retweeters_by_post_id(self, post_ids, are_user_ids, author_type, bad_actors_collector_insertion_type):
        self._total_author_connections = []
        total_retweeter_ids = []
        for post_id in post_ids:
            retweeter_ids = self._twitter_api_requester.get_retweeter_ids_by_status_id(post_id)
            total_retweeter_ids = list(set(total_retweeter_ids + retweeter_ids))

            post_retweeter_connections = self._db.create_post_retweeter_connections(post_id, retweeter_ids)
            self._total_author_connections = self._total_author_connections + post_retweeter_connections

        self._db.save_author_connections(self._total_author_connections)
        self._total_author_connections = []

        users = self.handle_get_users_request(total_retweeter_ids, are_user_ids, author_type,
                                              bad_actors_collector_insertion_type)
        self.convert_twitter_users_to_authors_and_save(users, author_type, bad_actors_collector_insertion_type)

    def get_retweets_by_post_id(self, post_id):
        retweets = self._twitter_api_requester.get_retweets_by_status_id(post_id)
        print(retweets)

    def create_author_connections(self, source_author_id, destination_author_ids, author_connection_type):
        # Re-enabled: handle_get_user_ids_request and
        # handle_get_friend_ids_request above call this method, which was
        # commented out in the original.
        print("---create_author_connections---")
        logging.info("---create_author_connections---")
        author_connections = []
        for destination_author_id in destination_author_ids:
            author_connection = self.create_author_connection(source_author_id, destination_author_id, author_connection_type)
            author_connections.append(author_connection)

        return author_connections

    def create_author_connection(self, source_author_id, destination_author_id, connection_type):
        author_connection = AuthorConnection()
        author_connection.source_author_osn_id = source_author_id
        author_connection.destination_author_osn_id = destination_author_id
        author_connection.connection_type = str(connection_type)
        author_connection.insertion_date = self._window_start

        return author_connection

    def count_down_time(self, seconds_to_wait):
        """Sleep for the given number of seconds while printing a countdown.
        Short non-zero waits are padded with an extra safety margin."""
        if seconds_to_wait != 0 and seconds_to_wait < 300:
            print("Seconds to wait is lower than 300: " + str(seconds_to_wait))
            logging.info("Seconds to wait is lower than 300: " + str(seconds_to_wait))
            seconds_to_wait += 100
            print("Seconds to wait were increased to: " + str(seconds_to_wait))
            logging.info("Seconds to wait were increased to: " + str(seconds_to_wait))
        elif seconds_to_wait != 0 and seconds_to_wait < 400:
            print("Seconds to wait is lower than 400: " + str(seconds_to_wait))
            logging.info("Seconds to wait is lower than 400: " + str(seconds_to_wait))
            seconds_to_wait += 90
            print("Seconds to wait were increased to: " + str(seconds_to_wait))
            logging.info("Seconds to wait were increased to: " + str(seconds_to_wait))
        for i in range(seconds_to_wait, 0, -1):
            time.sleep(1)
            msg = "\r Count down: [{}]".format(i)
            print(msg, end="")

    def convert_twitter_users_to_authors_and_save(self, total_twitter_users, author_type, insertion_type):
        authors = self.convert_twitter_users_to_authors(total_twitter_users, author_type, insertion_type)
        print("Total number of Twitter users converted into authors is: " + str(len(authors)))
        self.save_authors(authors)
        self._db.save_author_connections(self._total_author_connections)
        self._total_author_connections = []

    def convert_twitter_users_to_authors(self, total_twitter_users, author_type, insertion_type):
        print("---Converting Twitter users to authors---")
        convert_twitter_users_to_authors_start_time = time.time()
        authors = self._db.convert_twitter_users_to_authors(total_twitter_users, self._domain, author_type,
                                                            insertion_type)
        convert_twitter_users_to_authors_end_time = time.time()
        convert_twitter_users_to_authors_time = convert_twitter_users_to_authors_end_time - convert_twitter_users_to_authors_start_time
        print("Converting Twitter users to authors took " + str(convert_twitter_users_to_authors_time) + " seconds")

        return authors

    def save_authors(self, authors):
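        """Persist the given authors to the DB and report how long it took."""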
        print("---Saving authors in DB---")
        print("Number of authors to save is: " + str(len(authors)))
        save_authors_start_time = time.time()
        self._db.add_authors(authors)
        save_authors_end_time = time.time()
        save_authors_time = save_authors_end_time - save_authors_start_time
        print("Saving authors in DB took in seconds: " + str(save_authors_time))

    def save_author_connections(self):
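        """Persist the accumulated author connections to the DB and reset
        the in-memory buffer."""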
        print("---Saving author connections in DB---")
        save_author_connections_start_time = time.time()
        self._db.add_author_connections(self._total_author_connections)
        save_author_connections_end_time = time.time()
        save_author_connections_time = save_author_connections_end_time - save_author_connections_start_time
        print("Saving author connections in DB took in seconds: " + str(save_author_connections_time))
        self._total_author_connections = []

    def handle_get_users_request(self, ids, are_user_ids, author_type, insertion_type):
        """Fetch Twitter users for the given ids (or screen names) in chunks,
        waiting out the rate-limit window between batches of requests."""
        total_users = []
        users = []
        total_chunks = list(split_into_equal_chunks(ids,
                                                    self._maximal_user_ids_allowed_in_single_get_user_request))
        print("Total number of author id chunks to request from the Twitter API: " + str(len(total_chunks)))
        i = 1
        for ids_in_chunk in total_chunks:
            print("Chunk of authors ids: " + str(i) + "/" + str(len(total_chunks)))
            i += 1
            try:
                # When the request budget for the current window is exhausted,
                # wait until the window resets before sending the next chunk.
                if self._num_of_get_twitter_users_requests >= self._maximal_get_user_requests_in_window:
                    seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_users_request()
                    self.count_down_time(seconds_to_wait)
                    self._num_of_get_twitter_users_requests = 0

                users = self.send_get_users_request_and_add_users(ids_in_chunk, are_user_ids, users)
                total_users = list(set(total_users + users))
                self._num_of_get_twitter_users_requests += 1

            except Exception as e:
                logging.info(e.message)
                sec = self._twitter_api_requester.get_sleep_time_for_get_users_request()
                logging.info("Seconds to wait after caught exception: " + str(sec))
                self.count_down_time(sec)
                users = self.send_get_users_request_and_add_users(ids_in_chunk, are_user_ids, users)
                total_users = list(set(total_users + users))

        print("--- Finishing handle_get_users_request --- ")
        logging.info("--- Finishing handle_get_users_request --- ")
        return total_users

    def save_authors_and_connections_and_wait(self, total_twitter_users, author_type, insertion_type):
        self.save_authors_and_connections(total_twitter_users, author_type, insertion_type)

        seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_users_request()
        self.count_down_time(seconds_to_wait)

    def save_authors_and_connections(self, total_twitter_users, author_type, insertion_type):
        self.convert_twitter_users_to_authors_and_save(total_twitter_users, author_type, insertion_type)

    def send_get_users_request_and_add_users(self, ids_in_chunk, are_user_ids, total_twitter_users):
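        """Send a single get-users request for the chunk and append the
        returned Twitter users to the accumulated list."""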
        twitter_users = self.send_get_users_request(ids_in_chunk, are_user_ids)
        total_twitter_users = total_twitter_users + twitter_users
        return total_twitter_users

    def save_connections_and_wait(self, seconds_to_wait):
        self.save_author_connections()
        self.count_down_time(seconds_to_wait)

    def send_get_users_request(self, ids_in_chunk, are_user_ids):
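        """Request Twitter users either by user ids or by screen names,
        depending on the are_user_ids flag."""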
        if are_user_ids:
            twitter_users = self._twitter_api_requester.get_users_by_ids(ids_in_chunk)
        else:
            twitter_users = self._twitter_api_requester.get_users_by_screen_names(ids_in_chunk)

        return twitter_users

    def handle_retweeters_request(self, retweeter_ids, author_type, bad_actors_collector_insertion_type):
        total_retweeters = []
        retweeter_ids_in_chunks = split_into_equal_chunks(retweeter_ids,
                                                          self._maximal_user_ids_allowed_in_single_get_user_request)
        for retweeter_ids_in_chunk in retweeter_ids_in_chunks:
            retweeters = self._twitter_api_requester.get_users_by_ids(retweeter_ids_in_chunk)
            total_retweeters = total_retweeters + retweeters

        self.convert_twitter_users_to_authors_and_save(total_retweeters, author_type,
                                                       bad_actors_collector_insertion_type)

    def remove_already_crawled_authors(self, total_user_ids):
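        """Filter out user ids whose authors were already crawled, returning
        only the ids that still need to be fetched."""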
        print("remove_already_crawled_authors")
        number_of_extracted_users = len(total_user_ids)
        print("Total number of extracted users is: " + str(number_of_extracted_users))
        total_user_ids_set = set(total_user_ids)

        already_crawled_author_ids = self._db.get_already_crawled_author_ids()
        number_of_already_crawled_authors = len(already_crawled_author_ids)
        print("Total number of already crawled users is: " + str(number_of_already_crawled_authors))
        already_crawled_author_ids_set = set(already_crawled_author_ids)

        authors_ids_to_crawl_set = total_user_ids_set - already_crawled_author_ids_set
        number_of_remaining_authors_ids_to_crawl = len(authors_ids_to_crawl_set)
        print("Total number of remaining users to crawl is: " + str(number_of_remaining_authors_ids_to_crawl))

        authors_ids_to_crawl = list(authors_ids_to_crawl_set)

        return authors_ids_to_crawl

    def get_timeline_by_author_id(self, author_id):
        """Return the timeline (recent tweets) of the given author id."""
        author_timeline = self._twitter_api_requester.get_timeline_by_user_id(author_id)
        return author_timeline

    def get_status_by_twitter_status_id(self, status_id):
        """Return the status (tweet) for the given Twitter status id,
        waiting out the rate-limit window when necessary."""
        if self._num_of_twitter_status_id_requests >= self._num_of_twitter_status_id_requests_without_checking:
            seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_twitter_status_id()
            if seconds_to_wait > 0:
                self.count_down_time(seconds_to_wait)
                self._num_of_twitter_status_id_requests = 0
        self._num_of_twitter_status_id_requests += 1
        return self._twitter_api_requester.get_status(status_id)

    def get_timeline_by_author_name(self, author_name, maximal_tweets_count_in_timeline):
        """Return the timeline of the given screen name, handling rate limits
        and returning None for unauthorized or missing accounts."""
        try:
            print("Number of timeline requests is: " + str(self._num_of_twitter_timeline_requests))
            if self._num_of_twitter_timeline_requests >= self._num_of_twitter_timeline_requests_without_checking:
                seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_twitter_timeline_request()
                if seconds_to_wait > 0:
                    self.count_down_time(seconds_to_wait)
                    self._num_of_twitter_timeline_requests = 0
            self._num_of_twitter_timeline_requests += 1
            return self._twitter_api_requester.get_timeline(author_name, maximal_tweets_count_in_timeline)

        except TwitterError as e:
            if e.message == "Not authorized.":
                logging.info("Not authorized for user: " + str(author_name))
                return None

            exception_response = e[0][0]
            logging.info("e.message = " + exception_response["message"])
            code = exception_response["code"]
            logging.info("e.code = " + str(code))

            # Twitter API error code 34: the requested page (user) does not exist.
            if code == 34:
                return None

            sec = self._twitter_api_requester.get_sleep_time_for_twitter_timeline_request()
            logging.info("Seconds to wait after caught exception: " + str(sec))
            if sec != 0:
                self.count_down_time(sec)
                self._num_of_twitter_timeline_requests = 0
            timeline = self._twitter_api_requester.get_timeline(author_name, maximal_tweets_count_in_timeline)
            return timeline

    def get_active_users_names_by_screen_names(self, chunk_of_names):
        """Return the screen names, out of the given chunk, that belong to
        existing (active) Twitter accounts."""
        try:
            users = self._twitter_api_requester.get_users_by_screen_names(chunk_of_names)
        except TwitterError as e:
            logging.info(e.message)
            sec = self._twitter_api_requester.get_sleep_time_for_get_users_request()
            logging.info("Seconds to wait after caught exception: " + str(sec))
            self.count_down_time(sec)
            users = self._twitter_api_requester.get_users_by_screen_names(chunk_of_names)
        return [user.screen_name for user in users]

    def get_sleep_time_for_twitter_status_id(self):
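        """Return the seconds to wait before the next get-status request is
        allowed (delegates to the API requester)."""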
        return self._twitter_api_requester.get_sleep_time_for_twitter_status_id()

    def get_status(self, status_id):
        return self._twitter_api_requester.get_status(status_id)