class RedditFeatureGeneratorTest(TestCase): def setUp(self): self._db = DB() self._db.setUp() self._posts = [] self._author = None self._init_authors() self._init_posts() self._init_claims() self._reddit_post_by_claim_feature_generator = RedditPostByClaimFeatureGenerator( self._db, **self._get_params()) self._reddit_author_by_claim_feature_generator = RedditAuthorByClaimFeatureGenerator( self._db, **self._get_params()) def tearDown(self): self._db.session.close() pass def test_karma_by_submission_and_comment(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_karma_by_submission_and_comment', 'expected': -13 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_karma_by_submission_and_comment', 'expected': -321 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_karma_by_submission_and_comment', 'expected': 1 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_karma_by_submission_and_comment', 'expected': 52312 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_karma_by_submission_and_comment', 'expected': 102 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_karma_by_submission_and_comment', 'expected': 234 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_karma_by_submission_and_comment', 'expected': 5904.222222 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_karma_by_submission_and_comment', 'expected': -19.55555556 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_karma_by_submission_and_comment', 'expected': 38.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_karma_by_submission_and_comment', 'expected': 27 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_karma_by_submission_and_comment', 'expected': 7 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_karma_by_submission_and_comment', 'expected': 5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'skew_karma_by_submission_and_comment', 'expected': 2.998904337 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'skew_karma_by_submission_and_comment', 'expected': -2.525365088 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'skew_karma_by_submission_and_comment', 'expected': 2.234762661 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'kurtosis_karma_by_submission_and_comment', 'expected': 8.995080203 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'kurtosis_karma_by_submission_and_comment', 'expected': 7.357797068 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'kurtosis_karma_by_submission_and_comment', 'expected': 4.503581242 }] self._reddit_post_by_claim_feature_generator._measure_names = [ 'karma_by_submission_and_comment' ] self._reddit_post_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_post_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_post_by_claim_feature_generator. __class__.__name__) def test_karma_by_submission(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_karma_by_submission', 'expected': 738 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_karma_by_submission', 'expected': -321 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_karma_by_submission', 'expected': 123 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_karma_by_submission', 'expected': 52312 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_karma_by_submission', 'expected': 102 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_karma_by_submission', 'expected': 234 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_karma_by_submission', 'expected': 26525 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_karma_by_submission', 'expected': -109.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_karma_by_submission', 'expected': 178.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_karma_by_submission', 'expected': 26525 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_karma_by_submission', 'expected': -109.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_karma_by_submission', 'expected': 178.5 }] self._reddit_post_by_claim_feature_generator._measure_names = [ 'karma_by_submission' ] self._reddit_post_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_post_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_post_by_claim_feature_generator. __class__.__name__) def test_upvotes_by_submission(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_upvotes_by_submission', 'expected': 762 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_upvotes_by_submission', 'expected': 112 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_upvotes_by_submission', 'expected': 369 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_upvotes_by_submission', 'expected': 74593 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_upvotes_by_submission', 'expected': 241 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_upvotes_by_submission', 'expected': 2067 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_upvotes_by_submission', 'expected': 37677.5 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_upvotes_by_submission', 'expected': 176.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_upvotes_by_submission', 'expected': 1218 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_upvotes_by_submission', 'expected': 37677.5 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_upvotes_by_submission', 'expected': 176.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_upvotes_by_submission', 'expected': 1218 }] self._reddit_post_by_claim_feature_generator._measure_names = [ 'upvotes_by_submission' ] self._reddit_post_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_post_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_post_by_claim_feature_generator. __class__.__name__) def test_downvotes_by_submission(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_downvotes_by_submission', 'expected': 24 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_downvotes_by_submission', 'expected': 10 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_downvotes_by_submission', 'expected': 246 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_downvotes_by_submission', 'expected': 22281 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_downvotes_by_submission', 'expected': 562 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_downvotes_by_submission', 'expected': 1833 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_downvotes_by_submission', 'expected': 11152.5 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_downvotes_by_submission', 'expected': 286 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_downvotes_by_submission', 'expected': 1039.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_downvotes_by_submission', 'expected': 11152.5 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_downvotes_by_submission', 'expected': 286 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_downvotes_by_submission', 'expected': 1039.5 }] self._reddit_post_by_claim_feature_generator._measure_names = [ 'downvotes_by_submission' ] self._reddit_post_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_post_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_post_by_claim_feature_generator. __class__.__name__) def test_author_comment_karma(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_comment_karma', 'expected': 2261 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_comment_karma', 'expected': 2842 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_comment_karma', 'expected': 2842 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_comment_karma', 'expected': 37027 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_comment_karma', 'expected': 35111 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_comment_karma', 'expected': 30880 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_comment_karma', 'expected': 19096.66667 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_comment_karma', 'expected': 18031 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_comment_karma', 'expected': 11833.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_comment_karma', 'expected': 22588 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_comment_karma', 'expected': 16555 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_comment_karma', 'expected': 6806 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'skew_comment_karma', 'expected': -0.018614054 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'skew_comment_karma', 'expected': 0.128211429 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'skew_comment_karma', 'expected': 1.862860226 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'kurtosis_comment_karma', 'expected': -1.992620739 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'kurtosis_comment_karma', 'expected': -2.723581645 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'kurtosis_comment_karma', 'expected': 3.595027437 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'comment_karma' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_link_karma(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_link_karma', 'expected': 1 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_link_karma', 'expected': 1 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_link_karma', 'expected': 90 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_link_karma', 'expected': 171576 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_link_karma', 'expected': 171576 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_link_karma', 'expected': 5897 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_link_karma', 'expected': 20565.77778 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_link_karma', 'expected': 29840.16667 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_link_karma', 'expected': 1866 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_link_karma', 'expected': 1341 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_link_karma', 'expected': 738.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_link_karma', 'expected': 738.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'skew_link_karma', 'expected': 2.991811692 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'skew_link_karma', 'expected': 2.443747273 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'skew_link_karma', 'expected': 1.751305522 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'kurtosis_link_karma', 'expected': 8.963145712 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'kurtosis_link_karma', 'expected': 5.977609271 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'kurtosis_link_karma', 'expected': 3.018013716 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'link_karma' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_total_karma(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'min_total_karma', 'expected': 2435 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'min_total_karma', 'expected': 6379 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'min_total_karma', 'expected': 6379 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'max_total_karma', 'expected': 206687 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'max_total_karma', 'expected': 206687 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'max_total_karma', 'expected': 32221 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'mean_total_karma', 'expected': 39662.44444 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'mean_total_karma', 'expected': 47871.16667 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'mean_total_karma', 'expected': 13699.5 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'median_total_karma', 'expected': 22589 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'median_total_karma', 'expected': 17240.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'median_total_karma', 'expected': 8099 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'skew_total_karma', 'expected': 2.767953592 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'skew_total_karma', 'expected': 2.349097328 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'skew_total_karma', 'expected': 1.963784833 }, { 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': 'kurtosis_total_karma', 'expected': 7.954685555 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': 'kurtosis_total_karma', 'expected': 5.605190323 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': 'kurtosis_total_karma', 'expected': 3.878351431 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'total_karma' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [ 'min', 'max', 'mean', 'median', 'skew', 'kurtosis' ] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_count_is_gold(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_count_is_gold', 'expected': 3 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_count_is_gold', 'expected': 3 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_count_is_gold', 'expected': 3 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'count_is_gold' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_count_is_moderator(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_count_is_moderator', 'expected': 2 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_count_is_moderator', 'expected': 1 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_count_is_moderator', 'expected': 0 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'count_is_moderator' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_count_is_employee(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_count_is_employee', 'expected': 3 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_count_is_employee', 'expected': 1 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_count_is_employee', 'expected': 1 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'count_is_employee' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_ratio_is_gold(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_ratio_is_gold', 'expected': 0.333333333 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_ratio_is_gold', 'expected': 0.5 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_ratio_is_gold', 'expected': 0.75 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'ratio_is_gold' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_ratio_is_moderator(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_ratio_is_moderator', 'expected': 0.222222222 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_ratio_is_moderator', 'expected': 0.166666667 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_ratio_is_moderator', 'expected': 0 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'ratio_is_moderator' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def test_author_ratio_is_employee(self): test_cases = [{ 'claim_id': 'cd2e1978-4dfa-3a40-b62f-71153001629c', 'test_name': '_ratio_is_employee', 'expected': 0.333333333 }, { 'claim_id': 'a4beae51-463f-33fc-bbf6-20eca5104afe', 'test_name': '_ratio_is_employee', 'expected': 0.166666667 }, { 'claim_id': '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', 'test_name': '_ratio_is_employee', 'expected': 0.25 }] self._reddit_author_by_claim_feature_generator._measure_names = [ 'ratio_is_employee' ] self._reddit_author_by_claim_feature_generator._aggregation_functions = [] self._reddit_author_by_claim_feature_generator.execute() for test_case in test_cases: self.assert_author_feature_test_case( test_case, self._reddit_author_by_claim_feature_generator. __class__.__name__) def assert_author_feature_test_case(self, test_case, class_name): self.assert_author_feature_number( test_case['claim_id'], "{}_{}".format(class_name, test_case['test_name']), test_case['expected']) def assert_author_feature_number(self, author_guid, attribute_name, expected): result_feature = self._db.get_author_feature(author_guid, attribute_name) feature_value = getattr(result_feature, 'attribute_value') self.assertAlmostEqual(float(expected), float(feature_value), places=2) def _add_author(self, name=None, link_karma=None, comment_karma=None, is_employee=0, is_mod=0, is_gold=0, author_osn_id=None): author = Author() reddit_author = RedditAuthor() author.name = name author.author_screen_name = author.name author.author_guid = compute_author_guid_by_author_name(author.name) author.domain = 'reddit' author.author_osn_id = author_osn_id author.author_full_name = name author.url = 'https://www.reddit.com/user/' + name reddit_author.name = author.name reddit_author.author_guid = author.author_guid reddit_author.comments_count = None reddit_author.comment_karma = comment_karma reddit_author.link_karma = link_karma reddit_author.is_gold = is_gold reddit_author.is_moderator = is_mod reddit_author.is_employee = is_employee self._db.add_authors([author]) self._db.add_reddit_authors([reddit_author]) # self._author = author def _add_post(self, author, date, post_osn_id, score=0, upvote_ratio=-1): post = Post() post.post_osn_id = post_osn_id post.author = str(author) post.author_guid = compute_author_guid_by_author_name(post.author) post.created_at = str_to_date(date, formate="%d/%m/%Y %H:%M") post.url = 'https://www.reddit.com{}'.format( post.author) # just for test post.guid = compute_post_guid(post.url, post.post_osn_id, date_to_str(post.created_at)) post.domain = 'reddit_comment' post.post_type = 'reddit_comment' post.post_id = post.guid reddit_post = RedditPost() reddit_post.post_id = post.post_id reddit_post.guid = post.guid reddit_post.score = score if upvote_ratio != -1: post.domain = 'reddit_post' post.post_type = 'reddit_post' reddit_post.upvote_ratio = upvote_ratio reddit_post.ups = int( round((reddit_post.upvote_ratio * reddit_post.score) / (2 * reddit_post.upvote_ratio - 1)) if reddit_post.upvote_ratio != 0.5 else round(reddit_post.score / 2)) reddit_post.downs = reddit_post.ups - reddit_post.score else: reddit_post.ups = -1 reddit_post.downs = -1 reddit_post.upvote_ratio = -1 self._db.addPosts([post, reddit_post]) return post, reddit_post def _add_claim_tweet_connection(self, claim_id, post_id): connection = Claim_Tweet_Connection() connection.claim_id = claim_id connection.post_id = post_id self._db.add_claim_connections([connection]) def _add_claim(self, claim_id): claim = Claim() claim.claim_id = claim_id self._db.addPosts([claim]) def _init_authors(self): self._add_author('Smile_lifeisgood', comment_karma=30880, link_karma=1341, is_gold=1, is_mod=0, is_employee=0) self._add_author('Cunty_Balls', comment_karma=7369, link_karma=90, is_gold=1, is_mod=0, is_employee=0) self._add_author('I_kick_fuck_nuns', comment_karma=2842, link_karma=5897, is_gold=1, is_mod=0, is_employee=0) self._add_author('TheRiseofMindhawk', comment_karma=2261, link_karma=174, is_gold=1, is_mod=1, is_employee=0) self._add_author('dialog2011', comment_karma=37027, link_karma=4582, is_gold=0, is_mod=0, is_employee=1) self._add_author('chrmanyaki', comment_karma=22588, link_karma=1, is_gold=0, is_mod=0, is_employee=1) self._add_author('Undertakerjoe', comment_karma=9177, link_karma=1384, is_gold=0, is_mod=0, is_employee=0) self._add_author('Lmb2298', comment_karma=25741, link_karma=1, is_gold=0, is_mod=0, is_employee=0) self._add_author('azzazaz', comment_karma=35111, link_karma=171576, is_gold=0, is_mod=1, is_employee=0) self._add_author('juanwonone1', comment_karma=6243, link_karma=136, is_gold=0, is_mod=0, is_employee=1) def _init_posts(self): self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('juanwonone1', '15/10/2017 21:44', '76ksr4', 738, 0.97)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('Lmb2298', '01/10/2017 22:24', 'dferfgh', 52312, 0.77)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('dialog2011', '12/06/2017 23:45', '6gv0vk', 27)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('chrmanyaki', '15/10/2017 21:58', 'doeq8ke', 27)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('azzazaz', '12/06/2018 10:50', 'e0j4zkz', 32)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('Smile_lifeisgood', '12/06/2018 20:08', 'e0in2zm', 11)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('Undertakerjoe', '15/10/2017 22:17', 'doerbqu', -13)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('I_kick_fuck_nuns', '18/06/2017 3:39', 'dj1qid5', 2)[0].guid) self._add_claim_tweet_connection( 'cd2e1978-4dfa-3a40-b62f-71153001629c', self._add_post('TheRiseofMindhawk', '13/06/2017 8:17', 'ditymrc', 2)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('I_kick_fuck_nuns', '11/06/2018 18:49', '8qal3m', 102, 0.92)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('juanwonone1', '16/10/2017 2:23', 'dof4fen', -321, 0.3)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('Smile_lifeisgood', '13/06/2017 0:29', 'dditbt8r', 11)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('Lmb2298', '15/10/2017 22:38', 'doeslie', 11)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('azzazaz', '16/10/2017 0:30', 'doeyvtb', 9)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('juanwonone1', '15/10/2017 22:50', 'doetc6j', 7)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('Cunty_Balls', '16/10/2017 1:52', 'dof2x1x', 2)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('Cunty_Balls', '16/10/2017 2:43', 'dof5cpo', 2)[0].guid) self._add_claim_tweet_connection( 'a4beae51-463f-33fc-bbf6-20eca5104afe', self._add_post('juanwonone1', '16/10/2017 3:45', 'dof84f8', 1)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('Cunty_Balls', '15/10/2017 22:24', 'doerqsj', 234, 0.53)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('I_kick_fuck_nuns', '16/10/2017 21:44', '76ksr2', 123, 0.6)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('Smile_lifeisgood', '13/06/2017 7:04', 'ditvpox', 7)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('Smile_lifeisgood', '13/06/2017 0:51', 'ditcy28', 5)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('juanwonone1', '15/10/2017 23:36', 'doevzsq', 5)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('juanwonone1', '16/10/2017 0:26', 'doeynrr', 5)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('I_kick_fuck_nuns', '11/06/2018 21:55', 'e0hy5he', 1)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('I_kick_fuck_nuns', '11/06/2018 22:04', 'e0hyrhi', 1)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('I_kick_fuck_nuns', '12/06/2018 1:31', 'e0icveq', 1)[0].guid) self._add_claim_tweet_connection( '9e875999-9a3e-3357-bfa6-ede4fe67c1c9', self._add_post('Cunty_Balls', '13/06/2017 7:55', 'ditxua6', 3)[0].guid) def _init_claims(self): self._add_claim('cd2e1978-4dfa-3a40-b62f-71153001629c') self._add_claim('a4beae51-463f-33fc-bbf6-20eca5104afe') self._add_claim('9e875999-9a3e-3357-bfa6-ede4fe67c1c9') def _get_params(self): return {'authors': [], 'posts': []} # test_cases = [ # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'min_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'min_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'min_', # 'expected': # }, # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'max_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'max_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'max_', # 'expected': # }, # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'mean_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'mean_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'mean_', # 'expected': # }, # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'median_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'median_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'median_', # 'expected': # }, # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'skew_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'skew_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'skew_', # 'expected': # }, # { # 'claim_id': u'cd2e1978-4dfa-3a40-b62f-71153001629c', # 'test_name': 'kurtosis_', # 'expected': # }, # { # 'claim_id': u'a4beae51-463f-33fc-bbf6-20eca5104afe', # 'test_name': 'kurtosis_', # 'expected': # }, # { # 'claim_id': u'9e875999-9a3e-3357-bfa6-ede4fe67c1c9', # 'test_name': 'kurtosis_', # 'expected': # } # ]
class Twitter_Rest_Api(AbstractExecutor): def __init__(self, db): AbstractExecutor.__init__(self, db) self._working_app_number = self._config_parser.eval(self.__class__.__name__, "working_app_number") self._maximal_get_friend_ids_requests_in_window = self._config_parser.eval(self.__class__.__name__, "maximal_get_friend_ids_requests_in_window") self._maximal_get_follower_ids_requests_in_window = self._config_parser.eval(self.__class__.__name__, "maximal_get_follower_ids_requests_in_window") self._maximal_get_user_requests_in_window = self._config_parser.eval(self.__class__.__name__, "maximal_get_user_requests_in_window") self._maximal_user_ids_allowed_in_single_get_user_request = self._config_parser.eval(self.__class__.__name__, "maximal_user_ids_allowed_in_single_get_user_request") self._num_of_twitter_status_id_requests_without_checking = self._config_parser.eval(self.__class__.__name__, "num_of_twitter_status_id_requests_without_checking") self._num_of_twitter_timeline_requests_without_checking = self._config_parser.eval(self.__class__.__name__, "num_of_twitter_timeline_requests_without_checking") self._num_of_get_friend_ids_requests = 0 self._num_of_get_follower_ids_requests = 0 self._num_of_get_timeline_statuses = 0 self._num_of_twitter_status_id_requests = 0 self._num_of_twitter_timeline_requests = 0 self._total_author_connections = [] print("Creating TwitterApiRequester") self._twitter_api_requester = TwitterApiRequester() # self._find_source_twitter_id() logging.info("Setup DB...") print("Setup DB...") self._db = DB() self._db.setUp() def get_timeline_by_user_id(self, user_id): try: if self._num_of_get_timeline_statuses > self._num_of_twitter_timeline_requests_without_checking: seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_timeline() if seconds_to_wait != 0: self.count_down_time(seconds_to_wait) self._num_of_get_timeline_statuses = 0 timeline = self._twitter_api_requester.get_timeline_by_user_id(user_id) self._num_of_get_timeline_statuses += 1 print("Number of get timeline requests is: " + str(self._num_of_get_timeline_statuses)) return timeline except TwitterError as e: logging.info(e.message) if e.message == "Not authorized.": logging.info("Not authorized for user id: " + str(user_id)) return None sec = self._twitter_api_requester.get_sleep_time_for_timeline() logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) self._num_of_get_timeline_statuses = 0 timeline = self._twitter_api_requester.get_timeline_by_user_id(user_id) return timeline def handle_get_follower_ids_request(self, source_id): print("--- handle_get_follower_ids_request ---") logging.info("--- handle_get_follower_ids_request ---") follower_ids = self._twitter_api_requester.get_follower_ids_by_user_id(source_id) follower_connection_type = unicode(Author_Connection_Type.FOLLOWER) temp_author_connections = self._db.create_temp_author_connections(source_id, follower_ids, follower_connection_type) self._total_author_connections = self._total_author_connections + temp_author_connections return follower_ids def handle_get_user_ids_request(self, source_id, author_type): print("--- handle_get_user_ids_request ---") if author_type == Author_Connection_Type.FOLLOWER: user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(source_id) elif author_type == Author_Connection_Type.FRIEND: user_ids = self._twitter_api_requester.get_friend_ids_by_user_id(source_id) author_connections = self.create_author_connections(source_id, user_ids, author_type) self._total_author_connections = self._total_author_connections + author_connections return user_ids def handle_get_friend_ids_request(self, source_id): friend_ids = self._twitter_api_requester.get_friend_ids_by_user_id(source_id) friend_connection_type = unicode(Author_Connection_Type.FRIEND) author_connections = self.create_author_connections(source_id, friend_ids, friend_connection_type) self._total_author_connections = self._total_author_connections + author_connections return friend_ids def crawl_users_by_author_ids(self, author_ids, connection_type, author_type, are_user_ids, insertion_type): self._total_author_connections = [] total_user_ids = self.crawl_users(author_ids, connection_type) self._db.save_author_connections(self._total_author_connections) total_user_ids_to_crawl = self.remove_already_crawled_authors(total_user_ids) users = self.handle_get_users_request(total_user_ids_to_crawl, are_user_ids, author_type, insertion_type) self.convert_twitter_users_to_authors_and_save(users, author_type, insertion_type) def crawl_users(self, author_ids, author_type): print("--- crawl_users ---") total_user_ids = [] for author_id in author_ids: print("--- crawl_user_ids for author id : " + str(author_id)) get_sleep_function_name = "get_sleep_time_for_get_" + author_type + "_ids_request" seconds_to_wait = getattr(self._twitter_api_requester, get_sleep_function_name)() if seconds_to_wait != 0: self.save_connections_and_wait(seconds_to_wait) init_num_of_get_user_ids_requests_func_name = "init_num_of_get_" + author_type + "_ids_requests" getattr(self._twitter_api_requester, init_num_of_get_user_ids_requests_func_name)() get_user_ids_by_given_user_id_function_name = "get_" + author_type + "_ids_by_user_id" user_ids = getattr(self._twitter_api_requester, get_user_ids_by_given_user_id_function_name)(author_id) temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type, self._window_start) self._total_author_connections = self._total_author_connections + temp_author_connections total_user_ids = list(set(total_user_ids + user_ids)) return total_user_ids def check_already_crawled_author_guids(self, author_guids): print("--- check_already_crawled_author_ids ----") author_ids_to_crawl = [] for author_guid in author_guids: authors_connections = self._db.get_author_connections_by_author_guid(author_guid) num_of_authors_connections = len(authors_connections) if num_of_authors_connections == 0: author_ids_to_crawl.append(author_guid) print("Number of authors ids to crawl is: " + str(len(author_ids_to_crawl))) logging.info("Number of authors ids to crawl is: " + str(len(author_ids_to_crawl))) print(author_ids_to_crawl) logging.info(author_ids_to_crawl) return author_ids_to_crawl def check_already_crawled_post_id(self, post_id): post_retweeter_connections = self._db.get_post_retweeter_connections_by_post_id(post_id) num_of_post_retweeter_connections = len(post_retweeter_connections) if num_of_post_retweeter_connections == 0: return False return True def crawl_retweeters_by_post_id(self, post_ids, are_user_ids, author_type, bad_actors_collector_inseration_type): self._total_author_connections = [] total_retweeter_ids = [] for post_id in post_ids: retweeter_ids = self._twitter_api_requester.get_retweeter_ids_by_status_id(post_id) total_retweeter_ids = list(set(total_retweeter_ids + retweeter_ids)) post_retweeter_connections = self._db.create_post_retweeter_connections(post_id, retweeter_ids) self._total_author_connections = self._total_author_connections + post_retweeter_connections self._db.save_author_connections(self._total_author_connections) self._total_author_connections = [] users = self.handle_get_users_request(total_retweeter_ids, are_user_ids, author_type, bad_actors_collector_inseration_type) self.convert_twitter_users_to_authors_and_save(users, author_type, bad_actors_collector_inseration_type) def get_retweets_by_post_id(self, post_id): retweets = self._twitter_api_requester.get_retweets_by_status_id(post_id) print(retweets) def create_author_connections(self, source_author_id, destination_author_ids, author_connection_type): print("---create_author_connections---") logging.info("---create_author_connections---") author_connections = [] for destination_author_id in destination_author_ids: author_connection = self.create_author_connection(source_author_id, destination_author_id, author_connection_type) author_connections.append(author_connection) return author_connections def create_author_connection(self, source_author_id, destination_author_id, connection_type): print("---create_author_connection---") author_connection = AuthorConnection() print("Author connection: source -> " + str(source_author_id) + ", dest -> " + str(destination_author_id) + ", connection type = " + connection_type) author_connection.source_author_osn_id = source_author_id author_connection.destination_author_osn_id = destination_author_id author_connection.connection_type = unicode(connection_type) author_connection.insertion_date = self._window_start return author_connection def count_down_time(self, seconds_to_wait): if seconds_to_wait is not 0: print("Seconds to wait is lower than 300: " + str(seconds_to_wait)) logging.info("Seconds to wait is lower than 300: " + str(seconds_to_wait)) seconds_to_wait += 100 print("Seconds to wait were increased to: " + str(seconds_to_wait)) logging.info("Seconds to wait were increased to: " + str(seconds_to_wait)) elif seconds_to_wait is not 0 and seconds_to_wait < 400: print("Seconds to wait is lower than 400: " + str(seconds_to_wait)) logging.info("Seconds to wait is lower than 400: " + str(seconds_to_wait)) seconds_to_wait += 90 print("Seconds to wait were increased to: " + str(seconds_to_wait)) logging.info("Seconds to wait were increased to: " + str(seconds_to_wait)) for i in xrange(seconds_to_wait, 0, -1): time.sleep(1) msg = "\r Count down: [{}]".format(i) print(msg, end="") # sys.stdout.write(str(i)+' ') # sys.stdout.flush() def convert_twitter_users_to_authors_and_save(self, total_twitter_users, author_type, inseration_type): authors = self.convert_twitter_users_to_authors(total_twitter_users, author_type, inseration_type) print("Total converted Twitter users into authors is: " + str(len(authors))) self.save_authors(authors) self._db.save_author_connections(self._total_author_connections) self._total_author_connections = [] def convert_twitter_users_to_authors(self, total_twitter_users, author_type, inseration_type): print("---Converting Twitter users to authors---") convert_twitter_users_to_authors_start_time = time.time() authors = self._db.convert_twitter_users_to_authors(total_twitter_users, self._domain, author_type, inseration_type) convert_twitter_users_to_authors_end_time = time.time() convert_twitter_users_to_authors_time = convert_twitter_users_to_authors_end_time - convert_twitter_users_to_authors_start_time print("Convert Twitter users to authors took in seconds: " + str(convert_twitter_users_to_authors_time)) return authors def save_authors(self, authors): print("---Saving authors in DB---") print("Number of authors to save is: " + str(len(authors))) save_authors_start_time = time.time() self._db.add_authors(authors) save_authors_end_time = time.time() save_authors_time = save_authors_end_time - save_authors_start_time print("Saving authors in DB took in seconds: " + str(save_authors_time)) def save_author_connections(self): print("---Saving author connections in DB---") save_author_connections_start_time = time.time() self._db.add_author_connections(self._total_author_connections) save_author_connections_end_time = time.time() save_author_connections_time = save_author_connections_end_time - save_author_connections_start_time print("Saving author connections in DB took in seconds: " + str(save_author_connections_time)) self._total_author_connections = [] def handle_get_users_request(self, ids, are_user_ids, author_type, insertion_type): total_users = [] users = [] ids_in_chunks = split_into_equal_chunks(ids, self._maximal_user_ids_allowed_in_single_get_user_request) seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_users_request() total_chunks = list(ids_in_chunks) ids_in_chunks = split_into_equal_chunks(ids, self._maximal_user_ids_allowed_in_single_get_user_request) print("Total authors ids in chunk from twitter API: " + str(len(total_chunks))) i = 1 for ids_in_chunk in ids_in_chunks: print("Chunk of authors ids: " + str(i) + "/" + str(len(total_chunks))) i += 1 try: num_of_get_users_requests = self._twitter_api_requester.get_num_of_get_users_requests() if seconds_to_wait != 0: self.save_authors_and_connections_and_wait(users, author_type, insertion_type) users = [] self._twitter_api_requester.init_num_of_get_users_requests() users = self.send_get_users_request_and_add_users(ids_in_chunk, are_user_ids, users) total_users = list(set(total_users + users)) except TwitterError as e: logging.info(e.message) sec = self._twitter_api_requester.get_sleep_time_for_get_users_request() logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) users = self.send_get_users_request_and_add_users(ids_in_chunk, are_user_ids, users) total_users = list(set(total_users + users)) except Exception, e: logging.info(e.message) sec = self._twitter_api_requester.get_sleep_time_for_get_users_request() logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) users = self.send_get_users_request_and_add_users(ids_in_chunk, are_user_ids, users) total_users = list(set(total_users + users)) print("--- Finishing handle_get_users_request --- ") logging.info("--- Finishing handle_get_users_request --- ") # self.save_authors_and_connections(users, author_type, insertion_type) return total_users
class Twitter_Rest_Api(AbstractController): def __init__(self, db): AbstractController.__init__(self, db) self._working_app_number = self._config_parser.eval( self.__class__.__name__, "working_app_number") self._maximal_get_friend_ids_requests_in_window = self._config_parser.eval( self.__class__.__name__, "maximal_get_friend_ids_requests_in_window") self._maximal_get_follower_ids_requests_in_window = self._config_parser.eval( self.__class__.__name__, "maximal_get_follower_ids_requests_in_window") self._maximal_get_user_requests_in_window = self._config_parser.eval( self.__class__.__name__, "maximal_get_user_requests_in_window") self._maximal_user_ids_allowed_in_single_get_user_request = self._config_parser.eval( self.__class__.__name__, "maximal_user_ids_allowed_in_single_get_user_request") self._num_of_twitter_status_id_requests_without_checking = self._config_parser.eval( self.__class__.__name__, "num_of_twitter_status_id_requests_without_checking") self._num_of_twitter_timeline_requests_without_checking = self._config_parser.eval( self.__class__.__name__, "num_of_twitter_timeline_requests_without_checking") self._max_tweet_ids_allowed_in_single_get_tweets_by_tweet_ids_request = self._config_parser.eval( self.__class__.__name__, "max_tweet_ids_allowed_in_single_get_tweets_by_tweet_ids_request") self._max_num_of_tweet_ids_requests_without_checking = self._config_parser.eval( self.__class__.__name__, "max_num_of_tweet_ids_requests_without_checking") self._num_of_get_friend_ids_requests = 0 self._num_of_get_follower_ids_requests = 0 self._num_of_get_timeline_statuses = 0 self._num_of_twitter_status_id_requests = 0 self._num_of_twitter_timeline_requests = 0 self._num_of_get_tweet_ids_requests = 0 self._total_author_connections = [] print("Creating TwitterApiRequester") self._twitter_api_requester = TwitterApiRequester() # self._find_source_twitter_id() logging.info("Setup DB...") print("Setup DB...") self._db = DB() self._db.setUp() def get_timeline_by_user_id(self, user_id): try: if self._num_of_get_timeline_statuses > self._num_of_twitter_timeline_requests_without_checking: seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_timeline( ) if seconds_to_wait != 0: self.count_down_time(seconds_to_wait) self._num_of_get_timeline_statuses = 0 timeline = self._twitter_api_requester.get_timeline_by_user_id( user_id) self._num_of_get_timeline_statuses += 1 print("Number of get timeline requests is: " + str(self._num_of_get_timeline_statuses)) return timeline except TwitterError as e: logging.info(e.message) if e.message == "Not authorized.": logging.info("Not authorized for user id: " + str(user_id)) return None sec = self._twitter_api_requester.get_sleep_time_for_timeline() logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) self._num_of_get_timeline_statuses = 0 timeline = self._twitter_api_requester.get_timeline_by_user_id( user_id) return timeline def handle_get_follower_ids_request(self, source_id): print("--- handle_get_follower_ids_request ---") logging.info("--- handle_get_follower_ids_request ---") follower_ids = self._twitter_api_requester.get_follower_ids_by_user_id( source_id) follower_connection_type = unicode(Author_Connection_Type.FOLLOWER) temp_author_connections = self._db.create_temp_author_connections( source_id, follower_ids, follower_connection_type) self._total_author_connections = self._total_author_connections + temp_author_connections return follower_ids def handle_get_user_ids_request(self, source_id, author_type): print("--- handle_get_user_ids_request ---") if author_type == Author_Connection_Type.FOLLOWER: user_ids = self._twitter_api_requester.get_follower_ids_by_user_id( source_id) elif author_type == Author_Connection_Type.FRIEND: user_ids = self._twitter_api_requester.get_friend_ids_by_user_id( source_id) author_connections = self.create_author_connections( source_id, user_ids, author_type) self._total_author_connections = self._total_author_connections + author_connections return user_ids def handle_get_friend_ids_request(self, source_id): friend_ids = self._twitter_api_requester.get_friend_ids_by_user_id( source_id) friend_connection_type = unicode(Author_Connection_Type.FRIEND) author_connections = self.create_author_connections( source_id, friend_ids, friend_connection_type) self._total_author_connections = self._total_author_connections + author_connections return friend_ids def crawl_users_by_author_ids(self, author_ids, connection_type, author_type, are_user_ids, insertion_type): self._total_author_connections = [] total_user_ids = self.crawl_users(author_ids, connection_type) self._db.save_author_connections(self._total_author_connections) total_user_ids_to_crawl = self.remove_already_crawled_authors( total_user_ids) users = self.handle_get_users_request(total_user_ids_to_crawl, are_user_ids, author_type, insertion_type) self.convert_twitter_users_to_authors_and_save(users, author_type, insertion_type) def crawl_author_connections_by_author_ids(self, author_ids, connection_type, author_type, are_user_ids, insertion_type): self._total_author_connections = [] total_user_ids = self.crawl_users_restricted(author_ids, connection_type, restriction=0) #self.remove_already_crawled_authors(total_user_ids) - TBD self._db.save_author_connections(self._total_author_connections) def crawl_users(self, author_ids, author_type): print("--- crawl_users ---") total_user_ids = [] for author_id in author_ids: try: print("--- crawl_user_ids for author id : " + str(author_id)) get_sleep_function_name = "get_sleep_time_for_get_" + author_type + "_ids_request" seconds_to_wait = getattr(self._twitter_api_requester, get_sleep_function_name)() if seconds_to_wait != 0: self.save_connections_and_wait(seconds_to_wait) init_num_of_get_user_ids_requests_func_name = "init_num_of_get_" + author_type + "_ids_requests" getattr(self._twitter_api_requester, init_num_of_get_user_ids_requests_func_name)() get_user_ids_by_given_user_id_function_name = "get_" + author_type + "_ids_by_user_id" user_ids = getattr( self._twitter_api_requester, get_user_ids_by_given_user_id_function_name)(author_id) temp_author_connections = self._db.create_temp_author_connections( author_id, user_ids, author_type, self._window_start) self._total_author_connections = self._total_author_connections + temp_author_connections total_user_ids = list(set(total_user_ids + user_ids)) except Exception as e: logging.exception( "Failed getting followers or friends for user : {0}". format(author_id)) return total_user_ids def crawl_users_restricted(self, author_ids, author_type, restriction): print("--- crawl_users restricted---") total_user_ids = [] for author_id in author_ids: try: print("--- crawl_user_ids for author id : " + str(author_id)) get_sleep_function_name = "get_sleep_time_for_get_" + author_type + "_ids_request" seconds_to_wait = getattr(self._twitter_api_requester, get_sleep_function_name)() if seconds_to_wait != 0: self.save_connections_and_wait(seconds_to_wait) init_num_of_get_user_ids_requests_func_name = "init_num_of_get_" + author_type + "_ids_requests" getattr(self._twitter_api_requester, init_num_of_get_user_ids_requests_func_name)() get_user_ids_by_given_user_id_function_name = "get_" + author_type + "_ids_by_user_id" user_ids = getattr( self._twitter_api_requester, get_user_ids_by_given_user_id_function_name)(author_id) temp_author_connections = self._db.create_temp_author_connections( author_id, user_ids, author_type, self._window_start) self._total_author_connections = self._total_author_connections + temp_author_connections total_user_ids = list(set(total_user_ids + user_ids)) except Exception as e: logging.exception( "Failed getting followers or friends for user : {0}". format(author_id)) return total_user_ids def check_already_crawled_author_guids(self, author_guids): print("--- check_already_crawled_author_ids ----") author_ids_to_crawl = [] for author_guid in author_guids: authors_connections = self._db.get_author_connections_by_author_guid( author_guid) num_of_authors_connections = len(authors_connections) if num_of_authors_connections == 0: author_ids_to_crawl.append(author_guid) print("Number of authors ids to crawl is: " + str(len(author_ids_to_crawl))) logging.info("Number of authors ids to crawl is: " + str(len(author_ids_to_crawl))) print(author_ids_to_crawl) logging.info(author_ids_to_crawl) return author_ids_to_crawl def check_already_crawled_post_id(self, post_id): post_retweeter_connections = self._db.get_post_retweeter_connections_by_post_id( post_id) num_of_post_retweeter_connections = len(post_retweeter_connections) if num_of_post_retweeter_connections == 0: return False return True def crawl_retweeters_by_post_id(self, post_ids, are_user_ids, author_type, bad_actors_collector_inseration_type): self._total_author_connections = [] total_retweeter_ids = [] for post_id in post_ids: retweeter_ids = self._twitter_api_requester.get_retweeter_ids_by_status_id( post_id) total_retweeter_ids = list(set(total_retweeter_ids + retweeter_ids)) post_retweeter_connections = self._db.create_post_retweeter_connections( post_id, retweeter_ids) self._total_author_connections = self._total_author_connections + post_retweeter_connections self._db.save_author_connections(self._total_author_connections) self._total_author_connections = [] users = self.handle_get_users_request( total_retweeter_ids, are_user_ids, author_type, bad_actors_collector_inseration_type) self.convert_twitter_users_to_authors_and_save( users, author_type, bad_actors_collector_inseration_type) def get_retweets_by_post_id(self, post_id): retweets = self._twitter_api_requester.get_retweets_by_status_id( post_id) print(retweets) def create_author_connections(self, source_author_id, destination_author_ids, author_connection_type): print("---create_author_connections---") logging.info("---create_author_connections---") author_connections = [] for destination_author_id in destination_author_ids: author_connection = self.create_author_connection( source_author_id, destination_author_id, author_connection_type) author_connections.append(author_connection) return author_connections def create_author_connection(self, source_author_id, destination_author_id, connection_type): print("---create_author_connection---") author_connection = AuthorConnection() print("Author connection: source -> " + str(source_author_id) + ", dest -> " + str(destination_author_id) + ", connection type = " + connection_type) author_connection.source_author_osn_id = source_author_id author_connection.destination_author_osn_id = destination_author_id author_connection.connection_type = unicode(connection_type) author_connection.insertion_date = self._window_start return author_connection def count_down_time(self, seconds_to_wait): if seconds_to_wait is not 0: print("Seconds to wait is lower than 300: " + str(seconds_to_wait)) logging.info("Seconds to wait is lower than 300: " + str(seconds_to_wait)) seconds_to_wait += 100 print("Seconds to wait were increased to: " + str(seconds_to_wait)) logging.info("Seconds to wait were increased to: " + str(seconds_to_wait)) elif seconds_to_wait is not 0 and seconds_to_wait < 400: print("Seconds to wait is lower than 400: " + str(seconds_to_wait)) logging.info("Seconds to wait is lower than 400: " + str(seconds_to_wait)) seconds_to_wait += 90 print("Seconds to wait were increased to: " + str(seconds_to_wait)) logging.info("Seconds to wait were increased to: " + str(seconds_to_wait)) for i in xrange(seconds_to_wait, 0, -1): time.sleep(1) msg = "\r Count down: [{}]".format(i) print(msg, end="") # sys.stdout.write(str(i)+' ') # sys.stdout.flush() def convert_twitter_users_to_authors_and_save(self, total_twitter_users, author_type, inseration_type): authors = self.convert_twitter_users_to_authors( total_twitter_users, author_type, inseration_type) print("Total converted Twitter users into authors is: " + str(len(authors))) self.save_authors(authors) self._db.save_author_connections(self._total_author_connections) self._total_author_connections = [] def convert_twitter_users_to_authors(self, total_twitter_users, author_type, inseration_type): print("---Converting Twitter users to authors---") convert_twitter_users_to_authors_start_time = time.time() authors = self._db.convert_twitter_users_to_authors( total_twitter_users, self._domain, author_type, inseration_type) convert_twitter_users_to_authors_end_time = time.time() convert_twitter_users_to_authors_time = convert_twitter_users_to_authors_end_time - convert_twitter_users_to_authors_start_time print("Convert Twitter users to authors took in seconds: " + str(convert_twitter_users_to_authors_time)) return authors def save_authors(self, authors): print("---Saving authors in DB---") print("Number of authors to save is: " + str(len(authors))) save_authors_start_time = time.time() self._db.add_authors(authors) save_authors_end_time = time.time() save_authors_time = save_authors_end_time - save_authors_start_time print("Saving authors in DB took in seconds: " + str(save_authors_time)) def save_author_connections(self): print("---Saving author connections in DB---") save_author_connections_start_time = time.time() self._db.add_author_connections(self._total_author_connections) save_author_connections_end_time = time.time() save_author_connections_time = save_author_connections_end_time - save_author_connections_start_time print("Saving author connections in DB took in seconds: " + str(save_author_connections_time)) self._total_author_connections = [] def handle_get_users_request(self, ids, are_user_ids, author_type, insertion_type): total_users = [] users = [] ids_in_chunks = split_into_equal_chunks( ids, self._maximal_user_ids_allowed_in_single_get_user_request) total_chunks = list(ids_in_chunks) ids_in_chunks = split_into_equal_chunks( ids, self._maximal_user_ids_allowed_in_single_get_user_request) print("Total authors ids in chunk from twitter API: " + str(len(total_chunks))) i = 0 for ids_in_chunk in ids_in_chunks: i += 1 print("Chunk of authors ids: " + str(i) + "/" + str(len(total_chunks))) try: users = self.send_get_users_request_and_add_users( ids_in_chunk, are_user_ids, users) total_users = total_users + users except TwitterError as e: print(e) error_messages = e.message error_message_dict = error_messages[0] error_code = error_message_dict['code'] if error_code == 88: # Rate limit exceeded self.convert_twitter_users_to_authors_and_save( total_users, author_type, insertion_type) total_users = [] seconds_to_wait_object = self._twitter_api_requester.get_sleep_time_for_get_users_request( ) if seconds_to_wait_object > 0: count_down_time(seconds_to_wait_object) #epoch_timestamp = seconds_to_wait_object.reset #current_timestamp = time.time() #seconds_to_wait = int(epoch_timestamp - current_timestamp + 5) #count_down_time(seconds_to_wait) users = self.send_get_users_request_and_add_users( ids_in_chunk, are_user_ids, users) total_users = total_users + users print("--- Finishing handle_get_users_request --- ") logging.info("--- Finishing handle_get_users_request --- ") # self.save_authors_and_connections(users, author_type, insertion_type) return total_users def save_authors_and_connections_and_wait(self, total_twitter_users, author_type, inseration_type): self.save_authors_and_connections(total_twitter_users, author_type, inseration_type) seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_users_request( ) self.count_down_time(seconds_to_wait) def save_authors_and_connections(self, total_twitter_users, author_type, inseration_type): self.convert_twitter_users_to_authors_and_save(total_twitter_users, author_type, inseration_type) def send_get_users_request_and_add_users(self, ids_in_chunk, are_user_ids, total_twitter_users): twitter_users = self.send_get_users_request(ids_in_chunk, are_user_ids) return twitter_users def save_connections_and_wait(self, seconds_to_wait): self.save_author_connections() self.count_down_time(seconds_to_wait) def send_get_users_request(self, ids_in_chunk, are_user_ids): if are_user_ids is True: twitter_users = self._twitter_api_requester.get_users_by_ids( ids_in_chunk) else: twitter_users = self._twitter_api_requester.get_users_by_screen_names( ids_in_chunk) return twitter_users def handle_retweeters_request(self, retweeter_ids, author_type, bad_actors_collector_inseration_type): total_retweeters = [] retweeter_ids_in_chunks = split_into_equal_chunks( retweeter_ids, self._maximal_user_ids_allowed_in_single_get_user_request) for retweeter_ids_in_chunk in retweeter_ids_in_chunks: retweeters = self._twitter_api_requester.get_users_by_ids( retweeter_ids_in_chunk) total_retweeters = total_retweeters + retweeters self.convert_twitter_users_to_authors_and_save( total_retweeters, author_type, bad_actors_collector_inseration_type) def remove_already_crawled_authors(self, total_user_ids): print("remove_already_crawled_authors") number_of_extracted_users = len(total_user_ids) print("Total number of extracted users is: " + str(number_of_extracted_users)) total_follower_ids_set = set(total_user_ids) already_crawled_author_ids = self._db.get_already_crawled_author_ids() number_of_already_crawled_authors = len(already_crawled_author_ids) print("Total number of already crawled users is: " + str(number_of_already_crawled_authors)) already_crawled_author_ids_set = set(already_crawled_author_ids) authors_ids_to_crawl_set = total_follower_ids_set - already_crawled_author_ids_set number_of_remaining_authors_ids_to_crawl = len( authors_ids_to_crawl_set) print("Total number of remaining users to crawl is: " + str(number_of_remaining_authors_ids_to_crawl)) authors_ids_to_crawl = list(authors_ids_to_crawl_set) return authors_ids_to_crawl def get_timline_by_author_id(self, author_id): author_timeline = self._twitter_api_requester.get_timeline_by_user_id( author_id) return author_timeline def get_status_by_twitter_status_id(self, id): # try: if self._num_of_twitter_status_id_requests >= self._num_of_twitter_status_id_requests_without_checking: seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_twitter_status_id( ) if seconds_to_wait > 0: self.count_down_time(seconds_to_wait) self._num_of_twitter_status_id_requests = 0 self._num_of_twitter_status_id_requests = self._num_of_twitter_status_id_requests + 1 return self._twitter_api_requester.get_status(id) # except TwitterError as e: # exception_response = e[0][0] # logging.info("e.massage =" + exception_response["message"]) # code = exception_response["code"] # logging.info("e.code =" + str(exception_response["code"])) # # if code == 88: # sec = self._twitter_api_requester.get_sleep_time_for_twitter_status_id() # logging.info("Seconds to wait from catched crush is: " + str(sec)) # if sec != 0: # count_down_time(sec) # self._num_of_twitter_status_id_requests = 0 # return self._twitter_api_requester.get_status(id) def get_timeline_by_author_name(self, author_name, maximal_tweets_count_in_timeline): try: print("Number of timeline requests is: " + str(self._num_of_twitter_timeline_requests)) if self._num_of_twitter_timeline_requests >= self._num_of_twitter_timeline_requests_without_checking: seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_twitter_timeline_request( ) if seconds_to_wait > 0: self.count_down_time(seconds_to_wait) self._num_of_twitter_timeline_requests = 0 self._num_of_twitter_timeline_requests = self._num_of_twitter_timeline_requests + 1 return self._twitter_api_requester.get_timeline( author_name, maximal_tweets_count_in_timeline) except TwitterError as e: if e.message == "Not authorized.": logging.info("Not authorized for user id: " + str(author_name)) return None exception_response = e[0][0] logging.info("e.massage =" + exception_response["message"]) code = exception_response["code"] logging.info("e.code =" + str(exception_response["code"])) if code == 34: return None sec = self._twitter_api_requester.get_sleep_time_for_twitter_timeline_request( ) logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) if sec != 0: self._num_of_twitter_timeline_requests = 0 timeline = self._twitter_api_requester.get_timeline( author_name, maximal_tweets_count_in_timeline) return timeline def get_active_users_names_by_screen_names(self, chunk_of_names): try: users = self._twitter_api_requester.get_users_by_screen_names( chunk_of_names) except TwitterError as e: logging.info(e.message) sec = self._twitter_api_requester.get_sleep_time_for_get_users_request( ) logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) users = self._twitter_api_requester.get_users_by_screen_names( chunk_of_names) return [user.screen_name for user in users] def get_sleep_time_for_twitter_status_id(self): return self._twitter_api_requester.get_sleep_time_for_twitter_status_id( ) def get_status(self, id): return self._twitter_api_requester.get_status(id) def get_posts_by_terms(self, terms): posts = { term: self._twitter_api_requester.get_tweets_by_term(term, 'recent') for term in terms } return posts def get_post_by_post_id(self, post_id): return self._twitter_api_requester.get_tweet_by_post_id(post_id) def get_tweets_by_tweet_ids_and_add_to_db(self, tweet_ids): total_tweets = self.get_tweets_by_ids(tweet_ids) posts, authors = self._db.convert_tweets_to_posts_and_authors( total_tweets, self._domain) self._db.addPosts(posts) self._db.add_authors(authors) return total_tweets # move to schema definition def get_tweets_by_ids(self, tweet_ids, author_type=""): total_tweets = [] ids_in_chunks = split_into_equal_chunks( tweet_ids, self. _max_tweet_ids_allowed_in_single_get_tweets_by_tweet_ids_request) # seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_tweets_by_tweet_ids_request() total_chunks = list(ids_in_chunks) ids_in_chunks = split_into_equal_chunks( tweet_ids, self. _max_tweet_ids_allowed_in_single_get_tweets_by_tweet_ids_request) i = 0 for ids_in_chunk in ids_in_chunks: i += 1 print("Chunk of tweet ids: " + str(i) + "/" + str(len(total_chunks))) try: tweets = self._twitter_api_requester.get_tweets_by_post_ids( ids_in_chunk) total_tweets = list(set(total_tweets + tweets)) num_of_tweets = len(total_tweets) if num_of_tweets > 10000: self._save_posts_and_authors(total_tweets, author_type) total_tweets = [] except TwitterError as e: print(e) error_messages = e.message error_message_dict = error_messages[0] error_code = error_message_dict['code'] if error_code == 88: # Rate limit exceeded self._save_posts_and_authors(total_tweets, author_type) total_tweets = [] seconds_to_wait_object = self._twitter_api_requester.get_sleep_time_for_get_tweets_by_tweet_ids_request( ) epoch_timestamp = seconds_to_wait_object.reset current_timestamp = time.time() seconds_to_wait = int(epoch_timestamp - current_timestamp + 5) count_down_time(seconds_to_wait) tweets = self._twitter_api_requester.get_tweets_by_post_ids( ids_in_chunk) total_tweets = list(set(total_tweets + tweets)) return total_tweets # def create_post_from_tweet_data(self, tweet_data): # author_name = tweet_data.user.screen_name # tweet_author_guid = compute_author_guid_by_author_name(author_name) # tweet_author_guid = cleanForAuthor(tweet_author_guid) # tweet_post_twitter_id = str(tweet_data.id) # tweet_url = generate_tweet_url(tweet_post_twitter_id, author_name) # tweet_creation_time = tweet_data.created_at # tweet_str_publication_date = extract_tweet_publiction_date(tweet_creation_time) # tweet_guid = compute_post_guid(post_url=tweet_url, author_name=author_name, # str_publication_date=tweet_str_publication_date) # # post = Post(guid=tweet_guid, post_id=tweet_guid, url=unicode(tweet_url), # date=str_to_date(tweet_str_publication_date), # title=unicode(tweet_data.text), content=unicode(tweet_data.text), # post_osn_id=tweet_post_twitter_id, # author=unicode(author_name), author_guid=unicode(tweet_author_guid), # domain=unicode(self._domain), # retweet_count=unicode(tweet_data.retweet_count), # favorite_count=unicode(tweet_data.favorite_count), # timeline_importer_insertion_date=unicode(get_current_time_as_string())) # return post def _save_posts_and_authors(self, total_tweets, author_type=None): posts, authors = self._db.convert_tweets_to_posts_and_authors( total_tweets, self._domain) for author in authors: author.author_type = author_type self._db.addPosts(posts) self._db.addPosts(authors)
class SocialNetworkCrawler(AbstractController): def __init__(self, db): AbstractController.__init__(self, db) self._working_app_number = self._config_parser.eval(self.__class__.__name__, "working_app_number") self._maximal_get_friend_ids_requests_in_window = self._config_parser.eval(self.__class__.__name__, "maximal_get_friend_ids_requests_in_window") self._maximal_get_follower_ids_requests_in_window = self._config_parser.eval(self.__class__.__name__, "maximal_get_follower_ids_requests_in_window") self._maximal_get_user_requests_in_window = self._config_parser.eval(self.__class__.__name__, "maximal_get_user_requests_in_window") self._maximal_user_ids_allowed_in_single_get_user_request = self._config_parser.eval(self.__class__.__name__, "maximal_user_ids_allowed_in_single_get_user_request") self._num_of_twitter_status_id_requests_without_checking = self._config_parser.eval(self.__class__.__name__, "num_of_twitter_status_id_requests_without_checking") self._num_of_twitter_timeline_requests_without_checking = self._config_parser.eval(self.__class__.__name__, "num_of_twitter_timeline_requests_without_checking") self._num_of_get_friend_ids_requests = 0 self._num_of_get_follower_ids_requests = 0 self._num_of_get_timeline_statuses = 0 self._num_of_twitter_status_id_requests = 0 self._num_of_twitter_timeline_requests = 0 self._num_of_get_twitter_users_requests = 0 self._total_author_connections = [] self._total_follower_ids = [] print("Creating TwitterApiRequester") self._twitter_api_requester = TwitterApiRequester(self._working_app_number) # self._find_source_twitter_id() logging.info("Setup DB...") print("Setup DB...") self._db = DB() self._db.setUp() def fill_followers_ids_only(self, author_ids): for i, author_id in enumerate(author_ids): print("author_id: {0} {1}/{2}".format(author_id, i, len(author_ids))) follower_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id) temp_author_connections = self._db.create_temp_author_connections(author_id, follower_ids, "follower", self._window_start) self._total_author_connections = self._total_author_connections + temp_author_connections if len(self._total_author_connections) > 1000000: self._db.addPosts(self._total_author_connections) self._total_author_connections = [] self._db.addPosts(self._total_author_connections) def fill_followers_and_their_data_simultaneously(self, author_ids): for i, author_id in enumerate(author_ids): if self._num_of_get_follower_ids_requests < self._maximal_get_follower_ids_requests_in_window: self._send_get_follower_ids_for_author_id(author_id, i, author_ids) else: author_type = None are_user_ids = True insertion_type = DB_Insertion_Type.MISSING_DATA_COMPLEMENTOR users = self.handle_get_users_request(self._total_follower_ids, are_user_ids, author_type, insertion_type) self.convert_twitter_users_to_authors_and_save(users, "follower", insertion_type) #self._db.addPosts(self._total_author_connections) #self._db.save_author_connections(self._total_author_connections) self._num_of_get_twitter_users_requests = 0 self._total_follower_ids = [] #self._total_author_connections = [] T = time.time() diff = (T - self._last_follower_request_time) / 60 # window time to wait if diff < 15: count_down_time(diff * 60) self._send_get_follower_ids_for_author_id(author_id, i, author_ids) def _send_get_follower_ids_for_author_id(self, author_id, i, author_ids): follower_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id) self._last_follower_request_time = time.time() self._num_of_get_follower_ids_requests += 1 print("Bring followers {0}:{1}/{2}".format(author_id, i, len(author_ids))) self._total_follower_ids = self._total_follower_ids + follower_ids temp_author_connections = self._db.create_temp_author_connections(author_id, follower_ids, "follower", self._window_start) self._total_author_connections = self._total_author_connections + temp_author_connections def get_timeline_by_user_id(self, user_id): try: if self._num_of_get_timeline_statuses > self._num_of_twitter_timeline_requests_without_checking: seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_timeline() if seconds_to_wait != 0: self.count_down_time(seconds_to_wait) self._num_of_get_timeline_statuses = 0 timeline = self._twitter_api_requester.get_timeline_by_user_id(user_id) self._num_of_get_timeline_statuses += 1 print("Number of get timeline requests is: " + str(self._num_of_get_timeline_statuses)) return timeline except TwitterError as e: logging.info(e.message) if e.message == "Not authorized.": logging.info("Not authorized for user id: " + str(user_id)) return None sec = self._twitter_api_requester.get_sleep_time_for_timeline() logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) self._num_of_get_timeline_statuses = 0 timeline = self._twitter_api_requester.get_timeline_by_user_id(user_id) return timeline def handle_get_follower_ids_request(self, source_id): print("--- handle_get_follower_ids_request ---") logging.info("--- handle_get_follower_ids_request ---") follower_ids = self._twitter_api_requester.get_follower_ids_by_user_id(source_id) follower_connection_type = str(Author_Connection_Type.FOLLOWER) temp_author_connections = self._db.create_temp_author_connections(source_id, follower_ids, follower_connection_type) self._total_author_connections = self._total_author_connections + temp_author_connections return follower_ids def handle_get_user_ids_request(self, source_id, author_type): print("--- handle_get_user_ids_request ---") if author_type == Author_Connection_Type.FOLLOWER: user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(source_id) elif author_type == Author_Connection_Type.FRIEND: user_ids = self._twitter_api_requester.get_friend_ids_by_user_id(source_id) author_connections = self.create_author_connections(source_id, user_ids, author_type) self._total_author_connections = self._total_author_connections + author_connections return user_ids def handle_get_friend_ids_request(self, source_id): friend_ids = self._twitter_api_requester.get_friend_ids_by_user_id(source_id) friend_connection_type = str(Author_Connection_Type.FRIEND) author_connections = self.create_author_connections(source_id, friend_ids, friend_connection_type) self._total_author_connections = self._total_author_connections + author_connections return friend_ids def crawl_users_by_author_ids(self, author_ids, connection_type, author_type, are_user_ids, insertion_type): self._total_author_connections = [] total_follower_ids, already_checked_author_ids = self.get_followers_until_exception(author_ids, connection_type) self._db.save_author_connections(self._total_author_connections) total_user_ids_to_crawl = self.remove_already_crawled_authors(total_follower_ids) users = self.handle_get_users_request(total_user_ids_to_crawl, are_user_ids, author_type, insertion_type) self.convert_twitter_users_to_authors_and_save(users, author_type, insertion_type) return total_follower_ids, already_checked_author_ids def get_follower_ids(self, author_id): user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id) return user_ids def get_sleep_time_for_follower_ids(self): user_ids = self._twitter_api_requester.get_sleep_time_for_follower_ids() return user_ids # def crawl_followers_by_twitter_author_ids(self, author_ids, author_type, are_user_ids, inseration_type): # print("--- crawl_followers_by_twitter_author_ids ---") # # #authors_ids_to_crawl = self.check_already_crawled_author_ids(author_ids) # total_follower_ids = self.crawl_followers_ids(author_ids) # # self.save_author_connections() # # total_follower_ids_to_crawl = self.remove_already_crawled_authors(total_follower_ids) # # self.handle_get_users_request(total_follower_ids_to_crawl, are_user_ids, author_type, inseration_type) # #self.convert_twitter_users_to_authors_and_save(followers, author_type, inseration_type) # def crawl_friends_by_twitter_author_ids(self, author_ids, author_type, are_user_ids, inseration_type): # # authors_ids_to_crawl = self.check_already_crawled_author_ids(author_ids) # total_friends_ids = self.crawl_friends_ids(author_ids) # # self.save_author_connections() # # total_friends_ids_to_crawl = self.remove_already_crawled_authors(total_friends_ids) # # friends = self.handle_get_users_request(total_friends_ids_to_crawl, are_user_ids, author_type, inseration_type) # self.convert_twitter_users_to_authors_and_save(friends, author_type, inseration_type) # def crawl_retweeters_by_twitter_post_ids(self, post_ids, author_type, inseration_type): # #authors_ids_to_crawl = self.check_already_crawled_author_ids(post_ids) # total_follower_ids = self.crawl_retweeters_ids(post_ids) # # self.save_author_connections() # are_user_ids = True # followers = self.handle_get_users_request(total_follower_ids, are_user_ids, author_type, inseration_type) # self.convert_twitter_users_to_authors_and_save(followers, author_type, inseration_type) # def crawl_retweeters_ids(self, posts_ids): # total_retweeter_ids = [] # for posts_id in posts_ids: # seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_retweeter_ids_request() # if seconds_to_wait == 0: # retweeter_ids = self.handle_get_retweeter_ids_request(posts_id) # total_retweeter_ids = list(set(total_retweeter_ids + retweeter_ids)) # else: # self.save_connections_and_wait(seconds_to_wait) # self._twitter_api_requester.init_num_of_get_follower_ids_requests() # retweeter_ids = self.handle_get_retweeter_ids_request(posts_id) # total_retweeter_ids = list(set(total_retweeter_ids + retweeter_ids)) # return total_retweeter_ids # def crawl_users(self, author_ids, author_type): # print("--- crawl_users ---") # total_user_ids = [] # for author_id in author_ids: # print("--- crawl_user_ids for author id : " + str(author_id)) # # get_sleep_function_name = "get_sleep_time_for_get_" + author_type + "_ids_request" # seconds_to_wait = getattr(self._twitter_api_requester, get_sleep_function_name)() # if seconds_to_wait != 0: # self.save_connections_and_wait(seconds_to_wait) # init_num_of_get_user_ids_requests_func_name = "init_num_of_get_" + author_type + "_ids_requests" # getattr(self._twitter_api_requester, init_num_of_get_user_ids_requests_func_name)() # # get_user_ids_by_given_user_id_function_name = "get_" + author_type + "_ids_by_user_id" # user_ids = getattr(self._twitter_api_requester, get_user_ids_by_given_user_id_function_name)(author_id) # # temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type, # self._window_start) # self._total_author_connections = self._total_author_connections + temp_author_connections # # total_user_ids = list(set(total_user_ids + user_ids)) # # return total_user_ids def crawl_users(self, author_ids, author_type): total_user_ids = [] for i, author_id in enumerate(author_ids): msg = "\r Bring followers for authors: {0}/{1}".format(i, len(author_ids)) print(msg, end="") try: user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id) temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type, self._window_start) self._total_author_connections = self._total_author_connections + temp_author_connections total_user_ids = list(set(total_user_ids + user_ids)) except TwitterError as e: exception_response = e[0][0] logging.info("e.massage =" + exception_response["message"]) code = exception_response["code"] logging.info("e.code =" + str(exception_response["code"])) if code == 88: sec = self._twitter_api_requester.get_sleep_time_for_get_follower_ids_request() sec = sec + 100 logging.info("Seconds to wait from catched crush is: " + str(sec)) if sec != 0: count_down_time(sec) user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id) temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type, self._window_start) self._total_author_connections = self._total_author_connections + temp_author_connections total_user_ids = list(set(total_user_ids + user_ids)) return total_user_ids def get_followers_until_exception(self, author_ids, author_type): total_follower_ids = [] already_checked_author_ids = [] for i, author_id in enumerate(author_ids): msg = "\r Bring followers for authors: {0}/{1}".format(i, len(author_ids)) print(msg, end="") try: user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id) already_checked_author_ids.append(author_id) if len(user_ids) > 0: temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type, self._window_start) self._total_author_connections = self._total_author_connections + temp_author_connections total_follower_ids = list(set(total_follower_ids + user_ids)) except TwitterError as e: if e.message == "Not authorized.": logging.info("Not authorized for user id: {0}".format(author_id)) return total_follower_ids, already_checked_author_ids exception_response = e[0][0] logging.info("e.massage =" + exception_response["message"]) code = exception_response["code"] logging.info("e.code =" + str(exception_response["code"])) if code == 34: return total_follower_ids, already_checked_author_ids if code == 88: sec = self._twitter_api_requester.get_sleep_time_for_get_follower_ids_request() sec = sec + 10 # logging.info("Seconds to wait from catched crush is: " + str(sec)) # if sec != 0: print("Number of seconds to wait: {0}".format(sec)) count_down_time(sec) try: user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id) already_checked_author_ids.append(author_id) temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type, self._window_start) self._total_author_connections = self._total_author_connections + temp_author_connections total_follower_ids = list(set(total_follower_ids + user_ids)) except TwitterError as e: if e.message == "Not authorized.": logging.info("Not authorized for user id: {0}".format(author_id)) return total_follower_ids, already_checked_author_ids # except TwitterError as e: # exception_response = e[0][0] # logging.info("e.massage =" + exception_response["message"]) # code = exception_response["code"] # logging.info("e.code =" + str(exception_response["code"])) # # if code == 88 and len(already_checked_author_ids) != 0: # return total_follower_ids, already_checked_author_ids # elif code == 88 and len(already_checked_author_ids) == 0: # sec = self._twitter_api_requester.get_sleep_time_for_get_follower_ids_request() # sec = sec + 3100 # # logging.info("Seconds to wait from catched crush is: " + str(sec)) # # if sec != 0: # print("Number of seconds to wait: {0}".format(sec)) # count_down_time(sec) # try: # user_ids = self._twitter_api_requester.get_follower_ids_by_user_id(author_id) # already_checked_author_ids.append(author_id) # # temp_author_connections = self._db.create_temp_author_connections(author_id, user_ids, author_type, # self._window_start) # self._total_author_connections = self._total_author_connections + temp_author_connections # # total_follower_ids = list(set(total_follower_ids + user_ids)) # return total_follower_ids, already_checked_author_ids # except TwitterError as e: # if e.message == "Not authorized.": # logging.info("Not authorized for user id: {0}".format(author_id)) # return total_follower_ids, already_checked_author_ids return total_follower_ids, already_checked_author_ids def check_already_crawled_author_guids(self, author_guids): print("--- check_already_crawled_author_ids ----") author_ids_to_crawl = [] for author_guid in author_guids: authors_connections = self._db.get_author_connections_by_author_guid(author_guid) num_of_authors_connections = len(authors_connections) if num_of_authors_connections == 0: author_ids_to_crawl.append(author_guid) print("Number of authors ids to crawl is: " + str(len(author_ids_to_crawl))) logging.info("Number of authors ids to crawl is: " + str(len(author_ids_to_crawl))) print(author_ids_to_crawl) logging.info(author_ids_to_crawl) return author_ids_to_crawl def check_already_crawled_post_id(self, post_id): post_retweeter_connections = self._db.get_post_retweeter_connections_by_post_id(post_id) num_of_post_retweeter_connections = len(post_retweeter_connections) if num_of_post_retweeter_connections == 0: return False return True def crawl_retweeters_by_post_id(self, post_ids, are_user_ids, author_type, bad_actors_collector_inseration_type): self._total_author_connections = [] total_retweeter_ids = [] for post_id in post_ids: retweeter_ids = self._twitter_api_requester.get_retweeter_ids_by_status_id(post_id) total_retweeter_ids = list(set(total_retweeter_ids + retweeter_ids)) post_retweeter_connections = self._db.create_post_retweeter_connections(post_id, retweeter_ids) self._total_author_connections = self._total_author_connections + post_retweeter_connections self._db.save_author_connections(self._total_author_connections) self._total_author_connections = [] users = self.handle_get_users_request(total_retweeter_ids, are_user_ids, author_type, bad_actors_collector_inseration_type) self.convert_twitter_users_to_authors_and_save(users, author_type, bad_actors_collector_inseration_type) def get_retweets_by_post_id(self, post_id): retweets = self._twitter_api_requester.get_retweets_by_status_id(post_id) print(retweets) # def create_author_connections(self, source_author_id, destination_author_ids, author_connection_type): # print("---create_author_connections---") # logging.info("---create_author_connections---") # author_connections = [] # for destination_author_id in destination_author_ids: # author_connection = self.create_author_connection(source_author_id, destination_author_id, author_connection_type) # author_connections.append(author_connection) # # return author_connections # def create_author_connection(self, source_author_id, destination_author_id, connection_type): # print("---create_author_connection---") # author_connection = AuthorConnection() # print("Author connection: source -> " + str(source_author_id) + ", dest -> " + str(destination_author_id) + ", connection type = " + connection_type) # author_connection.source_author_osn_id = source_author_id # author_connection.destination_author_osn_id = destination_author_id # author_connection.connection_type = unicode(connection_type) # author_connection.insertion_date = self._window_start # # return author_connection def count_down_time(self, seconds_to_wait): if seconds_to_wait is not 0: print("Seconds to wait is lower than 300: " + str(seconds_to_wait)) logging.info("Seconds to wait is lower than 300: " + str(seconds_to_wait)) seconds_to_wait += 100 print("Seconds to wait were increased to: " + str(seconds_to_wait)) logging.info("Seconds to wait were increased to: " + str(seconds_to_wait)) elif seconds_to_wait is not 0 and seconds_to_wait < 400: print("Seconds to wait is lower than 400: " + str(seconds_to_wait)) logging.info("Seconds to wait is lower than 400: " + str(seconds_to_wait)) seconds_to_wait += 90 print("Seconds to wait were increased to: " + str(seconds_to_wait)) logging.info("Seconds to wait were increased to: " + str(seconds_to_wait)) for i in range(seconds_to_wait, 0, -1): time.sleep(1) msg = "\r Count down: [{}]".format(i) print(msg, end="") # sys.stdout.write(str(i)+' ') # sys.stdout.flush() def convert_twitter_users_to_authors_and_save(self, total_twitter_users, author_type, inseration_type): authors = self.convert_twitter_users_to_authors(total_twitter_users, author_type, inseration_type) print("Total converted Twitter users into authors is: " + str(len(authors))) self.save_authors(authors) self._db.save_author_connections(self._total_author_connections) self._total_author_connections = [] def convert_twitter_users_to_authors(self, total_twitter_users, author_type, inseration_type): print("---Converting Twitter users to authors---") convert_twitter_users_to_authors_start_time = time.time() authors = self._db.convert_twitter_users_to_authors(total_twitter_users, self._domain, author_type, inseration_type) convert_twitter_users_to_authors_end_time = time.time() convert_twitter_users_to_authors_time = convert_twitter_users_to_authors_end_time - convert_twitter_users_to_authors_start_time print("Convert Twitter users to authors took in seconds: " + str(convert_twitter_users_to_authors_time)) return authors def save_authors(self, authors): print("---Saving authors in DB---") print("Number of authors to save is: " + str(len(authors))) save_authors_start_time = time.time() self._db.add_authors(authors) save_authors_end_time = time.time() save_authors_time = save_authors_end_time - save_authors_start_time print("Saving authors in DB took in seconds: " + str(save_authors_time)) def save_author_connections(self): print("---Saving author connections in DB---") save_author_connections_start_time = time.time() self._db.add_author_connections(self._total_author_connections) save_author_connections_end_time = time.time() save_author_connections_time = save_author_connections_end_time - save_author_connections_start_time print("Saving author connections in DB took in seconds: " + str(save_author_connections_time)) self._total_author_connections = [] def handle_get_users_request(self, ids, are_user_ids, author_type, insertion_type): total_users = [] users = [] ids_in_chunks = split_into_equal_chunks(ids, self._maximal_user_ids_allowed_in_single_get_user_request) #seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_users_request() total_chunks = list(ids_in_chunks) ids_in_chunks = split_into_equal_chunks(ids, self._maximal_user_ids_allowed_in_single_get_user_request) print("Total authors ids in chunk from twitter API: " + str(len(total_chunks))) i = 1 for ids_in_chunk in ids_in_chunks: print("Chunk of authors ids: " + str(i) + "/" + str(len(total_chunks))) i += 1 try: #num_of_get_users_requests = self._twitter_api_requester.get_num_of_get_users_requests() # if seconds_to_wait != 0: # self.save_authors_and_connections_and_wait(users, author_type, insertion_type) # users = [] # self._twitter_api_requester.init_num_of_get_users_requests() if self._num_of_get_twitter_users_requests < self._maximal_get_user_requests_in_window: users = self.send_get_users_request_and_add_users(ids_in_chunk, are_user_ids, users) total_users = list(set(total_users + users)) self._num_of_get_twitter_users_requests += 1 except TwitterError as e: logging.info(e.message) sec = self._twitter_api_requester.get_sleep_time_for_get_users_request() logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) users = self.send_get_users_request_and_add_users(ids_in_chunk, are_user_ids, users) total_users = list(set(total_users + users)) except Exception as e: logging.info(e.message) sec = self._twitter_api_requester.get_sleep_time_for_get_users_request() logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) users = self.send_get_users_request_and_add_users(ids_in_chunk, are_user_ids, users) total_users = list(set(total_users + users)) print("--- Finishing handle_get_users_request --- ") logging.info("--- Finishing handle_get_users_request --- ") # self.save_authors_and_connections(users, author_type, insertion_type) return total_users def save_authors_and_connections_and_wait(self, total_twitter_users, author_type, inseration_type): self.save_authors_and_connections(total_twitter_users, author_type, inseration_type) seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_get_users_request() self.count_down_time(seconds_to_wait) def save_authors_and_connections(self, total_twitter_users, author_type, inseration_type): self.convert_twitter_users_to_authors_and_save(total_twitter_users, author_type, inseration_type) def send_get_users_request_and_add_users(self, ids_in_chunk, are_user_ids, total_twitter_users): twitter_users = self.send_get_users_request(ids_in_chunk, are_user_ids) total_twitter_users = total_twitter_users + twitter_users return total_twitter_users def save_connections_and_wait(self, seconds_to_wait): self.save_author_connections() self.count_down_time(seconds_to_wait) def send_get_users_request(self, ids_in_chunk, are_user_ids): if are_user_ids is True: twitter_users = self._twitter_api_requester.get_users_by_ids(ids_in_chunk) else: twitter_users = self._twitter_api_requester.get_users_by_screen_names(ids_in_chunk) return twitter_users def handle_retweeters_request(self, retweeter_ids, author_type, bad_actors_collector_inseration_type): total_retweeters = [] retweeter_ids_in_chunks = split_into_equal_chunks(retweeter_ids, self._maximal_user_ids_allowed_in_single_get_user_request) for retweeter_ids_in_chunk in retweeter_ids_in_chunks: retweeters = self._twitter_api_requester.get_users_by_ids(retweeter_ids_in_chunk) total_retweeters = total_retweeters + retweeters self.convert_twitter_users_to_authors_and_save(total_retweeters, author_type, bad_actors_collector_inseration_type) def remove_already_crawled_authors(self, total_user_ids): print("remove_already_crawled_authors") number_of_extracted_users = len(total_user_ids) print("Total number of extracted users is: " + str(number_of_extracted_users)) total_follower_ids_set = set(total_user_ids) already_crawled_author_ids = self._db.get_already_crawled_author_ids() number_of_already_crawled_authors = len(already_crawled_author_ids) print("Total number of already crawled users is: " + str(number_of_already_crawled_authors)) already_crawled_author_ids_set = set(already_crawled_author_ids) authors_ids_to_crawl_set = total_follower_ids_set - already_crawled_author_ids_set number_of_remaining_authors_ids_to_crawl = len(authors_ids_to_crawl_set) print("Total number of remaining users to crawl is: " + str(number_of_remaining_authors_ids_to_crawl)) authors_ids_to_crawl = list(authors_ids_to_crawl_set) return authors_ids_to_crawl def get_timline_by_author_id(self, author_id): author_timeline = self._twitter_api_requester.get_timeline_by_user_id(author_id) return author_timeline def get_status_by_twitter_status_id(self, id): # try: if self._num_of_twitter_status_id_requests >= self._num_of_twitter_status_id_requests_without_checking: seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_twitter_status_id() if seconds_to_wait > 0: self.count_down_time(seconds_to_wait) self._num_of_twitter_status_id_requests = 0 self._num_of_twitter_status_id_requests = self._num_of_twitter_status_id_requests + 1 return self._twitter_api_requester.get_status(id) # except TwitterError as e: # exception_response = e[0][0] # logging.info("e.massage =" + exception_response["message"]) # code = exception_response["code"] # logging.info("e.code =" + str(exception_response["code"])) # # if code == 88: # sec = self._twitter_api_requester.get_sleep_time_for_twitter_status_id() # logging.info("Seconds to wait from catched crush is: " + str(sec)) # if sec != 0: # count_down_time(sec) # self._num_of_twitter_status_id_requests = 0 # return self._twitter_api_requester.get_status(id) def get_timeline_by_author_name(self, author_name, maximal_tweets_count_in_timeline): try: print("Number of timeline requests is: " + str(self._num_of_twitter_timeline_requests)) if self._num_of_twitter_timeline_requests >= self._num_of_twitter_timeline_requests_without_checking: seconds_to_wait = self._twitter_api_requester.get_sleep_time_for_twitter_timeline_request() if seconds_to_wait > 0: self.count_down_time(seconds_to_wait) self._num_of_twitter_timeline_requests = 0 self._num_of_twitter_timeline_requests = self._num_of_twitter_timeline_requests + 1 return self._twitter_api_requester.get_timeline(author_name, maximal_tweets_count_in_timeline) except TwitterError as e: if e.message == "Not authorized.": logging.info("Not authorized for user id: " + str(author_name)) return None exception_response = e[0][0] logging.info("e.massage =" + exception_response["message"]) code = exception_response["code"] logging.info("e.code =" + str(exception_response["code"])) if code == 34: return None sec = self._twitter_api_requester.get_sleep_time_for_twitter_timeline_request() logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) if sec != 0: self._num_of_twitter_timeline_requests = 0 timeline = self._twitter_api_requester.get_timeline(author_name, maximal_tweets_count_in_timeline) return timeline def get_active_users_names_by_screen_names(self, chunk_of_names): try: users = self._twitter_api_requester.get_users_by_screen_names(chunk_of_names) except TwitterError as e: logging.info(e.message) sec = self._twitter_api_requester.get_sleep_time_for_get_users_request() logging.info("Seconds to wait from catched crush is: " + str(sec)) count_down_time(sec) users = self._twitter_api_requester.get_users_by_screen_names(chunk_of_names) return [user.screen_name for user in users] def get_sleep_time_for_twitter_status_id(self): return self._twitter_api_requester.get_sleep_time_for_twitter_status_id() def get_status(self, id): return self._twitter_api_requester.get_status(id)