Exemple #1
0
class TestTwitterHarvester(tests.TestCase):
    def setUp(self):
        """Create a TwitterHarvester in a new temp directory and give it baseline collaborators."""
        self.working_path = tempfile.mkdtemp()

        harvester = self.harvester = TwitterHarvester(self.working_path)
        # Every test starts from the same baseline: a DictHarvestStateStore, the base
        # search message, a new HarvestResult and a stop event that has not been set.
        harvester.state_store = DictHarvestStateStore()
        harvester.message = base_search_message
        harvester.result = HarvestResult()
        harvester.stop_harvest_seeds_event = threading.Event()

    def tearDown(self):
        """Delete the per-test working directory if it is still on disk."""
        if not os.path.exists(self.working_path):
            # Nothing left to clean up.
            return
        shutil.rmtree(self.working_path)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_search(self, mock_twarc_class):
        """Harvest the base search message and check the Twarc calls and the tweet count."""
        # The patched Twarc constructor hands back this stand-in exactly once.
        twarc_double = MagicMock(spec=Twarc)
        twarc_double.search.side_effect = [(tweet1, tweet2)]
        mock_twarc_class.side_effect = [twarc_double]

        self.harvester.message = base_search_message
        self.harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5,
            tweet_mode="extended")
        # Non-incremental harvest: a single search with neither geocode nor since_id.
        expected_searches = [call("gelman", geocode=None, since_id=None)]
        self.assertEqual(expected_searches, twarc_double.search.mock_calls)
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_incremental_search(self, twarc_class):
        """An incremental search passes the since_id saved in the state store to Twarc.search."""
        saved_since_id = 605726286741434400
        # Copy before editing so the module-level base message is left untouched.
        incremental_message = copy.deepcopy(base_search_message)
        incremental_message["options"]["incremental"] = True

        # The patched Twarc constructor hands back this stand-in exactly once.
        twarc_double = MagicMock(spec=Twarc)
        twarc_double.search.side_effect = [(tweet2,)]
        twarc_class.side_effect = [twarc_double]

        self.harvester.state_store.set_state("twitter_harvester", "gelman.since_id", saved_since_id)
        self.harvester.message = incremental_message
        self.harvester.harvest_seeds()

        twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5,
            tweet_mode="extended")
        expected_searches = [call("gelman", geocode=None, since_id=saved_since_id)]
        self.assertEqual(expected_searches, twarc_double.search.mock_calls)
        self.assertDictEqual({"tweets": 1}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_new_search(self, mock_twarc_class):
        """A dict seed token supplies separate query and geocode arguments to the search."""
        # New-style tokens carry the query and the geocode separately; the legacy
        # style is still accepted.
        geocode = "38.899434,-77.036449,50mi"

        # The patched Twarc constructor hands back this stand-in exactly once.
        twarc_double = MagicMock(spec=Twarc)
        twarc_double.search.side_effect = [(tweet1, tweet2)]
        mock_twarc_class.side_effect = [twarc_double]

        new_style_message = copy.deepcopy(base_search_message)
        new_style_message["seeds"][0]["token"] = {"query": "gelman", "geocode": geocode}

        self.harvester.message = new_style_message
        self.harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5,
            tweet_mode="extended")
        expected_searches = [call("gelman", since_id=None, geocode=geocode)]
        self.assertEqual(expected_searches, twarc_double.search.mock_calls)
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline(self, mock_twarc_class):
        """Harvest the base timeline message: two timelines are requested, two tweets counted."""
        twarc_double = MagicMock(spec=Twarc)
        # Two timelines are expected: the first yields 2 tweets, the second none.
        twarc_double.timeline.side_effect = [(tweet1, tweet2), ()]

        # Two user lookups through Twarc.get are expected, each answered with a 200.
        lookup_responses = []
        for user_json in ({"screen_name": "gwtweets", "protected": False},
                          {"id_str": "9710852", "protected": False}):
            response = MagicMock()
            response.status_code = 200
            response.json.return_value = user_json
            lookup_responses.append(response)
        twarc_double.get.side_effect = lookup_responses

        # The patched Twarc constructor hands back the stand-in exactly once.
        mock_twarc_class.side_effect = [twarc_double]

        self.harvester.message = base_timeline_message
        self.harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5,
            tweet_mode="extended")
        expected_timelines = [
            call(user_id="28101965", since_id=None),
            call(user_id="9710852", since_id=None),
        ]
        self.assertEqual(expected_timelines, twarc_double.timeline.mock_calls)
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_incremental_user_timeline(self, twarc_class):
        """An incremental timeline harvest passes the stored since_id only for the user that has one."""
        saved_since_id = 605726286741434400
        # Copy before editing so the module-level base message is left untouched.
        incremental_message = copy.deepcopy(base_timeline_message)
        incremental_message["options"]["incremental"] = True

        twarc_double = MagicMock(spec=Twarc)
        # Two timelines are expected: the first yields 1 tweet, the second none.
        twarc_double.timeline.side_effect = [(tweet2,), ()]

        # Two user lookups through Twarc.get are expected, each answered with a 200.
        lookup_responses = []
        for user_json in ({"screen_name": "gwtweets", "protected": False},
                          {"id_str": "9710852", "protected": False}):
            response = MagicMock()
            response.status_code = 200
            response.json.return_value = user_json
            lookup_responses.append(response)
        twarc_double.get.side_effect = lookup_responses

        # The patched Twarc constructor hands back the stand-in exactly once.
        twarc_class.side_effect = [twarc_double]

        self.harvester.message = incremental_message
        self.harvester.state_store.set_state("twitter_harvester", "timeline.28101965.since_id", saved_since_id)
        self.harvester.harvest_seeds()

        twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5,
            tweet_mode="extended")
        # Only user 28101965 has saved state, so only that call carries a since_id.
        expected_timelines = [
            call(user_id="28101965", since_id=saved_since_id),
            call(user_id="9710852", since_id=None),
        ]
        self.assertEqual(expected_timelines, twarc_double.timeline.mock_calls)
        self.assertDictEqual({"tweets": 1}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline_with_missing_users(self, mock_twarc_class):
        """Seeds whose screen names cannot be found produce token-not-found warnings.

        Every user lookup (Twarc.get) raises an HTTPError carrying a 404 response.
        The test checks that one lookup is attempted per seed, that two warnings
        are recorded, and that the first warning points at the first seed.
        """
        mock_twarc = MagicMock(spec=Twarc)
        # Return mock_twarc when instantiating a twarc.
        mock_twarc_class.side_effect = [mock_twarc]

        # An exception instance used as side_effect is raised on every call, so this
        # single error covers the lookups for both seeds.
        mock_response = MagicMock()
        mock_response.status_code = 404
        mock_response.json.return_value = {"errors": [{"code": 50, "message": "User not found."}]}
        mock_twarc.get.side_effect = HTTPError(response=mock_response)

        message = copy.deepcopy(base_timeline_message)
        message["seeds"] = [
            {
                "id": "seed_id1",
                "token": "missing1"
            },
            {
                "id": "seed_id2",
                "token": "missing2"
            }

        ]
        self.harvester.message = message
        self.harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET,
                                                 tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET,
                                                 http_errors=5, connection_errors=5, tweet_mode="extended")

        # One lookup by screen name per seed.
        self.assertEqual(
            [call('https://api.twitter.com/1.1/users/show.json', allow_404=True, params={'screen_name': 'missing1'}),
             call('https://api.twitter.com/1.1/users/show.json', allow_404=True, params={'screen_name': 'missing2'})],
            mock_twarc.get.mock_calls)
        self.assertEqual(2, len(self.harvester.result.warnings))
        self.assertEqual(CODE_TOKEN_NOT_FOUND, self.harvester.result.warnings[0].code)
        self.assertEqual("seed_id1", self.harvester.result.warnings[0].extras["seed_id"])

    def test_lookup_screen_name(self):
        """Looking up an unprotected account by user id returns 'OK' plus the user JSON."""
        response = MagicMock()
        response.status_code = 200
        response.json.return_value = {"screen_name": "justin_littman", "protected": False}
        twarc_double = MagicMock(spec=Twarc)
        twarc_double.get.return_value = response

        self.harvester.twarc = twarc_double
        lookup_result = self.harvester._lookup_user(id="481186914", id_type="user_id")

        self.assertEqual(('OK', {'protected': False, 'screen_name': 'justin_littman'}), lookup_result)
        twarc_double.get.assert_called_once_with(
            'https://api.twitter.com/1.1/users/show.json',
            allow_404=True,
            params={'user_id': '481186914'})

    def test_lookup_protected_screen_name(self):
        """Looking up a protected account returns 'unauthorized' along with the user JSON."""
        response = MagicMock()
        response.status_code = 200
        response.json.return_value = {"screen_name": "justin_littman", "protected": True}
        twarc_double = MagicMock(spec=Twarc)
        twarc_double.get.return_value = response

        self.harvester.twarc = twarc_double
        lookup_result = self.harvester._lookup_user(id="481186914", id_type="user_id")

        self.assertEqual(('unauthorized', {'protected': True, 'screen_name': 'justin_littman'}), lookup_result)
        twarc_double.get.assert_called_once_with(
            'https://api.twitter.com/1.1/users/show.json',
            allow_404=True,
            params={'user_id': '481186914'})

    def test_lookup_missing_screen_name(self):
        """A lookup by user id that fails with a 404 HTTPError returns ('not_found', None)."""
        not_found_response = MagicMock()
        not_found_response.status_code = 404
        not_found_response.json.return_value = {"errors": [{"code": 50, "message": "User not found."}]}
        twarc_double = MagicMock(spec=Twarc)
        # The lookup request itself raises, carrying the 404 response.
        twarc_double.get.side_effect = HTTPError(response=not_found_response)

        self.harvester.twarc = twarc_double
        lookup_result = self.harvester._lookup_user(id="481186914", id_type="user_id")

        self.assertEqual(('not_found', None), lookup_result)
        twarc_double.get.assert_called_once_with(
            'https://api.twitter.com/1.1/users/show.json',
            allow_404=True,
            params={'user_id': '481186914'})

    def test_lookup_suspended_screen_name(self):
        """A lookup that fails with a 403 'User has been suspended.' error returns ('suspended', None)."""
        suspended_response = MagicMock()
        suspended_response.status_code = 403
        suspended_response.json.return_value = {"errors": [{"code": 63, "message": "User has been suspended."}]}
        twarc_double = MagicMock(spec=Twarc)
        # The lookup request itself raises, carrying the 403 response.
        twarc_double.get.side_effect = HTTPError(response=suspended_response)

        self.harvester.twarc = twarc_double
        lookup_result = self.harvester._lookup_user(id="481186914", id_type="user_id")

        self.assertEqual(('suspended', None), lookup_result)
        twarc_double.get.assert_called_once_with(
            'https://api.twitter.com/1.1/users/show.json',
            allow_404=True,
            params={'user_id': '481186914'})

    def test_lookup_user_id(self):
        """Looking up an unprotected account by screen name returns 'OK' plus the user JSON."""
        response = MagicMock()
        response.status_code = 200
        response.json.return_value = {"user_id": "481186914", "protected": False}
        twarc_double = MagicMock(spec=Twarc)
        twarc_double.get.return_value = response

        self.harvester.twarc = twarc_double
        lookup_result = self.harvester._lookup_user(id="justin_littman", id_type="screen_name")

        self.assertEqual(('OK', {'protected': False, 'user_id': '481186914'}), lookup_result)
        twarc_double.get.assert_called_once_with(
            'https://api.twitter.com/1.1/users/show.json',
            allow_404=True,
            params={'screen_name': 'justin_littman'})

    def test_lookup_missing_user_id(self):
        """A lookup by screen name that fails with a 404 HTTPError returns ('not_found', None)."""
        not_found_response = MagicMock()
        not_found_response.status_code = 404
        not_found_response.json.return_value = {"errors": [{"code": 50, "message": "User not found."}]}
        twarc_double = MagicMock(spec=Twarc)
        # Single-item side_effect list: the one expected call raises this error.
        twarc_double.get.side_effect = [HTTPError(response=not_found_response)]

        self.harvester.twarc = twarc_double
        lookup_result = self.harvester._lookup_user(id="justin_littman", id_type="screen_name")

        self.assertEqual(('not_found', None), lookup_result)
        twarc_double.get.assert_called_once_with(
            'https://api.twitter.com/1.1/users/show.json',
            allow_404=True,
            params={'screen_name': 'justin_littman'})

    @staticmethod
    def _iter_items(items):
        """Wrap each item in an IterItem (item in the fifth field, the rest None) and return the list.

        Handy for faking what a warc iterator would yield.
        """
        return [IterItem(None, None, None, None, entry) for entry in items]

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_search(self, iter_class):
        """Processing a WARC for a non-incremental search counts tweets and stores no since_id."""
        mock_iter = MagicMock(spec=TwitterRestWarcIter)
        mock_iter.__iter__.side_effect = [iter(self._iter_items([tweet2]))]
        # Return mock_iter when instantiating a TwitterRestWarcIter.
        iter_class.side_effect = [mock_iter]

        self.harvester.message = base_search_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # State NOT updated: the harvest is not incremental, so no since_id is saved.
        self.assertIsNone(self.harvester.state_store.get_state("twitter_harvester", "gelman.since_id"))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_search_incremental(self, iter_class):
        """Processing a WARC for an incremental search replaces the stored since_id."""
        # Copy before editing so the module-level base message is left untouched.
        incremental_message = copy.deepcopy(base_search_message)
        incremental_message["options"]["incremental"] = True

        # The patched TwitterRestWarcIter constructor hands back this stand-in once.
        warc_iter_double = MagicMock(spec=TwitterRestWarcIter)
        warc_iter_double.__iter__.side_effect = [iter(self._iter_items([tweet2]))]
        iter_class.side_effect = [warc_iter_double]

        state_store = self.harvester.state_store
        state_store.set_state("twitter_harvester", "gelman.since_id", 605726286741434400)
        self.harvester.message = incremental_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # The saved since_id now holds a different, larger value.
        stored_since_id = self.harvester.state_store.get_state("twitter_harvester", "gelman.since_id")
        self.assertEqual(660065173563158500, stored_since_id)

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_user_timeline(self, iter_class):
        """Processing a WARC for a non-incremental timeline counts tweets and leaves the state store empty."""
        # The patched TwitterRestWarcIter constructor hands back this stand-in once.
        warc_iter_double = MagicMock(spec=TwitterRestWarcIter)
        warc_iter_double.__iter__.side_effect = [iter(self._iter_items([tweet1, tweet2]))]
        iter_class.side_effect = [warc_iter_double]

        self.harvester.message = base_timeline_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 2}, self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # Nothing added to state.
        saved_state = self.harvester.state_store.state
        self.assertEqual(0, len(saved_state))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_incremental_user_timeline(self, iter_class):
        """Processing a WARC for an incremental timeline replaces the stored per-user since_id."""
        state_key = "timeline.481186914.since_id"
        # Copy before editing so the module-level base message is left untouched.
        incremental_message = copy.deepcopy(base_timeline_message)
        incremental_message["options"]["incremental"] = True

        # The patched TwitterRestWarcIter constructor hands back this stand-in once.
        warc_iter_double = MagicMock(spec=TwitterRestWarcIter)
        warc_iter_double.__iter__.side_effect = [iter(self._iter_items([tweet2]))]
        iter_class.side_effect = [warc_iter_double]

        self.harvester.message = incremental_message
        self.harvester.state_store.set_state("twitter_harvester", state_key, 605726286741434400)
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # The saved since_id now holds a different, larger value.
        stored_since_id = self.harvester.state_store.get_state("twitter_harvester", state_key)
        self.assertEqual(660065173563158500, stored_since_id)
Exemple #2
0
class TestTwitterHarvester(tests.TestCase):
    def setUp(self):
        """Create a TwitterHarvester in a new temp directory and give it baseline collaborators."""
        self.working_path = tempfile.mkdtemp()

        harvester = self.harvester = TwitterHarvester(self.working_path)
        # Every test starts from the same baseline: a DictHarvestStateStore, the base
        # search message, a new HarvestResult and a stop event that has not been set.
        harvester.state_store = DictHarvestStateStore()
        harvester.message = base_search_message
        harvester.result = HarvestResult()
        harvester.stop_harvest_seeds_event = threading.Event()

    def tearDown(self):
        """Delete the per-test working directory if it is still on disk."""
        if not os.path.exists(self.working_path):
            # Nothing left to clean up.
            return
        shutil.rmtree(self.working_path)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_search(self, mock_twarc_class):
        """Harvest the base search message and check the Twarc calls and the tweet count."""
        # The patched Twarc constructor hands back this stand-in exactly once.
        twarc_double = MagicMock(spec=Twarc)
        twarc_double.search.side_effect = [(tweet1, tweet2)]
        mock_twarc_class.side_effect = [twarc_double]

        self.harvester.message = base_search_message
        self.harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5, connection_errors=5)
        # Non-incremental harvest: a single search without a since_id.
        expected_searches = [call("gelman", since_id=None)]
        self.assertEqual(expected_searches, twarc_double.search.mock_calls)
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_incremental_search(self, twarc_class):
        """An incremental search passes the since_id saved in the state store to Twarc.search."""
        saved_since_id = 605726286741434400
        # Copy before editing so the module-level base message is left untouched.
        incremental_message = copy.deepcopy(base_search_message)
        incremental_message["options"]["incremental"] = True

        # The patched Twarc constructor hands back this stand-in exactly once.
        twarc_double = MagicMock(spec=Twarc)
        twarc_double.search.side_effect = [(tweet2, )]
        twarc_class.side_effect = [twarc_double]

        self.harvester.state_store.set_state("twitter_harvester", "gelman.since_id", saved_since_id)
        self.harvester.message = incremental_message
        self.harvester.harvest_seeds()

        twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5, connection_errors=5)
        expected_searches = [call("gelman", since_id=saved_since_id)]
        self.assertEqual(expected_searches, twarc_double.search.mock_calls)
        self.assertDictEqual({"tweets": 1}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline(self, mock_twarc_class):
        """Harvest the base timeline message: two timelines are requested, two tweets counted."""
        twarc_double = MagicMock(spec=Twarc)
        # Two timelines are expected: the first yields 2 tweets, the second none.
        twarc_double.timeline.side_effect = [(tweet1, tweet2), ()]
        # Two user_lookup calls are expected, each answered with a one-user list.
        first_lookup = [{"screen_name": "gwtweets"}]
        second_lookup = [{"id_str": "9710852"}]
        twarc_double.user_lookup.side_effect = [first_lookup, second_lookup]
        # The patched Twarc constructor hands back the stand-in exactly once.
        mock_twarc_class.side_effect = [twarc_double]

        self.harvester.message = base_timeline_message
        self.harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5, connection_errors=5)
        expected_timelines = [call(user_id="28101965", since_id=None), call(user_id="9710852", since_id=None)]
        self.assertEqual(expected_timelines, twarc_double.timeline.mock_calls)
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_incremental_user_timeline(self, twarc_class):
        """An incremental timeline harvest passes the stored since_id only for the user that has one."""
        saved_since_id = 605726286741434400
        # Copy before editing so the module-level base message is left untouched.
        incremental_message = copy.deepcopy(base_timeline_message)
        incremental_message["options"]["incremental"] = True

        twarc_double = MagicMock(spec=Twarc)
        # Two timelines are expected: the first yields 1 tweet, the second none.
        twarc_double.timeline.side_effect = [(tweet2, ), ()]
        # Two user_lookup calls are expected, each answered with a one-user list.
        first_lookup = [{"screen_name": "gwtweets"}]
        second_lookup = [{"id_str": "9710852"}]
        twarc_double.user_lookup.side_effect = [first_lookup, second_lookup]
        # The patched Twarc constructor hands back the stand-in exactly once.
        twarc_class.side_effect = [twarc_double]

        self.harvester.message = incremental_message
        self.harvester.state_store.set_state("twitter_harvester", "timeline.28101965.since_id", saved_since_id)
        self.harvester.harvest_seeds()

        twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5, connection_errors=5)
        # Only user 28101965 has saved state, so only that call carries a since_id.
        expected_timelines = [
            call(user_id="28101965", since_id=saved_since_id),
            call(user_id="9710852", since_id=None),
        ]
        self.assertEqual(expected_timelines, twarc_double.timeline.mock_calls)
        self.assertDictEqual({"tweets": 1}, self.harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline_with_missing_users(self, mock_twarc_class):
        """Seeds whose user_lookup comes back empty produce token-not-found warnings."""
        twarc_double = MagicMock(spec=Twarc)
        # Two user_lookup calls are expected and both return no users.
        twarc_double.user_lookup.side_effect = [[], []]
        # The patched Twarc constructor hands back the stand-in exactly once.
        mock_twarc_class.side_effect = [twarc_double]

        missing_users_message = copy.deepcopy(base_timeline_message)
        missing_users_message["seeds"] = [
            {"id": "seed_id1", "token": "missing1"},
            {"id": "seed_id2", "token": "missing2"},
        ]
        self.harvester.message = missing_users_message
        self.harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5, connection_errors=5)

        # One lookup by screen name per seed.
        expected_lookups = [call(screen_names=("missing1", )), call(screen_names=("missing2", ))]
        self.assertEqual(expected_lookups, twarc_double.user_lookup.mock_calls)

        warnings = self.harvester.result.warnings
        self.assertEqual(2, len(warnings))
        self.assertEqual(CODE_TOKEN_NOT_FOUND, warnings[0].code)
        self.assertEqual("seed_id1", warnings[0].extras["seed_id"])

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline_with_private_timeline(self, mock_twarc_class):
        """A timeline request rejected with a 401 yields an unauthorized warning for that seed."""
        unauthorized_response = MagicMock()
        unauthorized_response.status_code = 401

        twarc_double = MagicMock(spec=Twarc)
        # Two timelines are expected: the first yields 2 tweets, the second raises
        # an HTTPError carrying the 401 response.
        twarc_double.timeline.side_effect = [(tweet1, tweet2), HTTPError(response=unauthorized_response)]
        # Two user_lookup calls are expected, each answered with a one-user list.
        first_lookup = [{"screen_name": "gwtweets"}]
        second_lookup = [{"id_str": "9710852"}]
        twarc_double.user_lookup.side_effect = [first_lookup, second_lookup]
        # The patched Twarc constructor hands back the stand-in exactly once.
        mock_twarc_class.side_effect = [twarc_double]

        self.harvester.message = base_timeline_message
        self.harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY, tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN, tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5, connection_errors=5)
        expected_timelines = [call(user_id="28101965", since_id=None), call(user_id="9710852", since_id=None)]
        self.assertEqual(expected_timelines, twarc_double.timeline.mock_calls)

        # Exactly one warning, attributed to the seed whose timeline was rejected.
        warnings = self.harvester.result.warnings
        self.assertEqual(1, len(warnings))
        self.assertEqual(CODE_TOKEN_UNAUTHORIZED, warnings[0].code)
        self.assertEqual("seed_id2", warnings[0].extras["seed_id"])
        # Tweets from the first timeline are still counted.
        self.assertDictEqual({"tweets": 2}, self.harvester.result.harvest_counter)

    def test_lookup_screen_name(self):
        """_lookup_screen_name returns the screen_name of the user found for a user id."""
        twarc_double = MagicMock(spec=Twarc)
        twarc_double.user_lookup.side_effect = [[{"screen_name": "justin_littman"}]]

        self.harvester.twarc = twarc_double
        found_screen_name = self.harvester._lookup_screen_name("481186914")

        self.assertEqual("justin_littman", found_screen_name)
        twarc_double.user_lookup.assert_called_once_with(user_ids=("481186914", ))

    def test_lookup_missing_screen_name(self):
        """_lookup_screen_name returns None when user_lookup raises an HTTPError with a 404 response."""
        not_found_response = MagicMock()
        not_found_response.status_code = 404
        twarc_double = MagicMock(spec=Twarc)
        # Single-item side_effect list: the one expected call raises this error.
        twarc_double.user_lookup.side_effect = [HTTPError(response=not_found_response)]

        self.harvester.twarc = twarc_double
        found_screen_name = self.harvester._lookup_screen_name("481186914")

        self.assertIsNone(found_screen_name)
        twarc_double.user_lookup.assert_called_once_with(user_ids=("481186914", ))

    def test_lookup_user_id(self):
        """_lookup_user_id returns the id_str of the user found for a screen name."""
        twarc_double = MagicMock(spec=Twarc)
        twarc_double.user_lookup.side_effect = [[{"id_str": "481186914"}]]

        self.harvester.twarc = twarc_double
        found_user_id = self.harvester._lookup_user_id("justin_littman")

        self.assertEqual("481186914", found_user_id)
        twarc_double.user_lookup.assert_called_once_with(screen_names=("justin_littman", ))

    def test_lookup_missing_user_id(self):
        """_lookup_user_id returns None when user_lookup raises an HTTPError with a 404 response."""
        not_found_response = MagicMock()
        not_found_response.status_code = 404
        twarc_double = MagicMock(spec=Twarc)
        # Single-item side_effect list: the one expected call raises this error.
        twarc_double.user_lookup.side_effect = [HTTPError(response=not_found_response)]

        self.harvester.twarc = twarc_double
        found_user_id = self.harvester._lookup_user_id("justin_littman")

        self.assertIsNone(found_user_id)
        twarc_double.user_lookup.assert_called_once_with(screen_names=("justin_littman", ))

    @staticmethod
    def _iter_items(items):
        """Wrap each item in an IterItem (item in the fifth field, the rest None) and return the list.

        Handy for faking what a warc iterator would yield.
        """
        return [IterItem(None, None, None, None, entry) for entry in items]

    def test_harvest_options_web(self):
        """With only extract_web_resources enabled, the expected web URLs are collected."""
        harvester = self.harvester
        harvester.extract_media = False
        harvester.extract_web_resources = True
        harvester.extract_user_profile_images = False

        # _process_tweets would normally be handed a warc iter.
        tweets = self._iter_items([tweet2, tweet3, tweet4, tweet5])
        harvester._process_tweets(tweets)

        expected_urls = {
            'http://bit.ly/1ipwd0B',  # url
            'http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html',  # from retweet
            'http://bit.ly/1NoNeBF'  # from base tweet of quoted status
        }
        self.assertSetEqual(expected_urls, harvester.result.urls_as_set())

    def test_harvest_options_media(self):
        """With only extract_media enabled, the expected media URLs are collected."""
        harvester = self.harvester
        harvester.extract_media = True
        harvester.extract_web_resources = False
        harvester.extract_user_profile_images = False

        tweets = self._iter_items([tweet2, tweet3, tweet4, tweet5])
        harvester._process_tweets(tweets)

        expected_urls = {
            'http://pbs.twimg.com/tweet_video_thumb/Chn_42fWwAASuva.jpg',  # media/extended entity
            'http://pbs.twimg.com/media/Bv4ekbqIYAAcmXY.jpg',  # from quoted status
        }
        self.assertSetEqual(expected_urls, harvester.result.urls_as_set())

    def test_harvest_options_user__images(self):
        """With only extract_user_profile_images enabled, the expected image URLs are collected."""
        harvester = self.harvester
        harvester.extract_media = False
        harvester.extract_web_resources = False
        harvester.extract_user_profile_images = True

        harvester._process_tweets(self._iter_items([tweet2]))

        expected_urls = {
            'http://pbs.twimg.com/profile_images/496478011533713408/GjecBUNj_normal.jpeg',
            'http://abs.twimg.com/images/themes/theme1/bg.png'
        }
        self.assertSetEqual(expected_urls, harvester.result.urls_as_set())

    def test_default_harvest_options(self):
        """With media and web-resource extraction switched off, no URLs are collected."""
        harvester = self.harvester
        harvester.extract_media = False
        harvester.extract_web_resources = False

        tweets = self._iter_items([tweet2, tweet3, tweet4, tweet5])
        harvester._process_tweets(tweets)

        self.assertSetEqual(set(), harvester.result.urls_as_set())

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_search(self, iter_class):
        """Processing a WARC for a non-incremental search counts tweets, collects no URLs and stores no since_id."""
        mock_iter = MagicMock(spec=TwitterRestWarcIter)
        mock_iter.__iter__.side_effect = [iter(self._iter_items([tweet2]))]
        # Return mock_iter when instantiating a TwitterRestWarcIter.
        iter_class.side_effect = [mock_iter]

        self.harvester.message = base_search_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1},
                             self.harvester.result.stats_summary())
        self.assertEqual(0, len(self.harvester.result.urls_as_set()))
        iter_class.assert_called_once_with("test.warc.gz")
        # State NOT updated: the harvest is not incremental, so no since_id is saved.
        self.assertIsNone(
            self.harvester.state_store.get_state("twitter_harvester",
                                                 "gelman.since_id"))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_search_incremental(self, iter_class):
        """Processing a WARC for an incremental search collects web URLs and replaces the stored since_id."""
        # Copy before editing so the module-level base message is left untouched.
        incremental_message = copy.deepcopy(base_search_message)
        incremental_message["options"]["incremental"] = True

        self.harvester.extract_media = False
        self.harvester.extract_web_resources = True

        # The patched TwitterRestWarcIter constructor hands back this stand-in once.
        warc_iter_double = MagicMock(spec=TwitterRestWarcIter)
        warc_iter_double.__iter__.side_effect = [iter(self._iter_items([tweet2]))]
        iter_class.side_effect = [warc_iter_double]

        self.harvester.state_store.set_state("twitter_harvester", "gelman.since_id", 605726286741434400)
        self.harvester.message = incremental_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        self.assertSetEqual({"http://bit.ly/1ipwd0B"}, self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        # The saved since_id now holds a different, larger value.
        stored_since_id = self.harvester.state_store.get_state("twitter_harvester", "gelman.since_id")
        self.assertEqual(660065173563158500, stored_since_id)

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_user_timeline(self, iter_class):
        """Processing a WARC for a non-incremental timeline collects web URLs and leaves the state store empty."""
        # The patched TwitterRestWarcIter constructor hands back this stand-in once.
        warc_iter_double = MagicMock(spec=TwitterRestWarcIter)
        warc_iter_double.__iter__.side_effect = [iter(self._iter_items([tweet1, tweet2]))]
        iter_class.side_effect = [warc_iter_double]

        self.harvester.extract_media = False
        self.harvester.extract_web_resources = True

        self.harvester.message = base_timeline_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 2}, self.harvester.result.stats_summary())
        self.assertSetEqual({"http://bit.ly/1ipwd0B"}, self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        # Nothing added to state.
        saved_state = self.harvester.state_store.state
        self.assertEqual(0, len(saved_state))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_incremental_user_timeline(self, iter_class):
        """Processing a WARC for an incremental timeline collects web URLs and replaces the per-user since_id."""
        state_key = "timeline.481186914.since_id"
        # Copy before editing so the module-level base message is left untouched.
        incremental_message = copy.deepcopy(base_timeline_message)
        incremental_message["options"]["incremental"] = True

        # The patched TwitterRestWarcIter constructor hands back this stand-in once.
        warc_iter_double = MagicMock(spec=TwitterRestWarcIter)
        warc_iter_double.__iter__.side_effect = [iter(self._iter_items([tweet2]))]
        iter_class.side_effect = [warc_iter_double]

        self.harvester.extract_media = False
        self.harvester.extract_web_resources = True

        self.harvester.message = incremental_message
        self.harvester.state_store.set_state("twitter_harvester", state_key, 605726286741434400)
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        self.assertSetEqual({"http://bit.ly/1ipwd0B"}, self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        # The saved since_id now holds a different, larger value.
        stored_since_id = self.harvester.state_store.get_state("twitter_harvester", state_key)
        self.assertEqual(660065173563158500, stored_since_id)
class TestTwitterHarvester(tests.TestCase):
    def setUp(self):
        # Give every test its own scratch directory and a freshly configured
        # harvester (state store, message, result and stop event all reset).
        self.working_path = tempfile.mkdtemp()

        harvester = self.harvester = TwitterHarvester(self.working_path)
        harvester.state_store = DictHarvestStateStore()
        harvester.message = base_search_message
        harvester.result = HarvestResult()
        harvester.stop_harvest_seeds_event = threading.Event()

    def tearDown(self):
        # Remove the per-test scratch directory, if it is still on disk.
        scratch_dir = self.working_path
        if not os.path.exists(scratch_dir):
            return
        shutil.rmtree(scratch_dir)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_search(self, mock_twarc_class):
        # A search harvest should create one Twarc client, issue one search
        # for the query without a since_id, and count both returned tweets.
        twarc_client = MagicMock(spec=Twarc)
        twarc_client.search.side_effect = [(tweet1, tweet2)]
        # Instantiating the patched Twarc class yields twarc_client.
        mock_twarc_class.side_effect = [twarc_client]

        harvester = self.harvester
        harvester.message = base_search_message
        harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5)
        expected_searches = [call("gelman", since_id=None)]
        self.assertEqual(expected_searches, twarc_client.search.mock_calls)
        self.assertDictEqual({"tweets": 2}, harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_incremental_search(self, twarc_class):
        # An incremental search should hand the since_id held in the state
        # store to Twarc.search and count the single tweet returned.
        stored_since_id = 605726286741434400

        incremental_message = copy.deepcopy(base_search_message)
        incremental_message["options"]["incremental"] = True

        twarc_client = MagicMock(spec=Twarc)
        twarc_client.search.side_effect = [(tweet2,)]
        # Instantiating the patched Twarc class yields twarc_client.
        twarc_class.side_effect = [twarc_client]

        harvester = self.harvester
        harvester.state_store.set_state("twitter_harvester", "gelman.since_id", stored_since_id)
        harvester.message = incremental_message
        harvester.harvest_seeds()

        twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5)
        expected_searches = [call("gelman", since_id=stored_since_id)]
        self.assertEqual(expected_searches, twarc_client.search.mock_calls)
        self.assertDictEqual({"tweets": 1}, harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline(self, mock_twarc_class):
        # A timeline harvest should request one timeline per user, without a
        # since_id, and count the tweets that come back.
        twarc_client = MagicMock(spec=Twarc)
        # Two timeline calls are expected: two tweets, then nothing.
        twarc_client.timeline.side_effect = [(tweet1, tweet2), ()]
        # Two user_lookup calls are expected as well.
        twarc_client.user_lookup.side_effect = [[{"screen_name": "gwtweets"}], [{"id_str": "9710852"}]]
        # Instantiating the patched Twarc class yields twarc_client.
        mock_twarc_class.side_effect = [twarc_client]

        harvester = self.harvester
        harvester.message = base_timeline_message
        harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5)
        expected_timelines = [
            call(user_id="28101965", since_id=None),
            call(user_id="9710852", since_id=None),
        ]
        self.assertEqual(expected_timelines, twarc_client.timeline.mock_calls)
        self.assertDictEqual({"tweets": 2}, harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_incremental_user_timeline(self, twarc_class):
        # An incremental timeline harvest should pass the stored since_id for
        # the user that has one and None for the user that does not.
        stored_since_id = 605726286741434400

        incremental_message = copy.deepcopy(base_timeline_message)
        incremental_message["options"]["incremental"] = True

        twarc_client = MagicMock(spec=Twarc)
        # Two timeline calls are expected: one tweet, then nothing.
        twarc_client.timeline.side_effect = [(tweet2,), ()]
        # Two user_lookup calls are expected as well.
        twarc_client.user_lookup.side_effect = [[{"screen_name": "gwtweets"}], [{"id_str": "9710852"}]]
        # Instantiating the patched Twarc class yields twarc_client.
        twarc_class.side_effect = [twarc_client]

        harvester = self.harvester
        harvester.message = incremental_message
        harvester.state_store.set_state("twitter_harvester", "timeline.28101965.since_id", stored_since_id)
        harvester.harvest_seeds()

        twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5)
        expected_timelines = [
            call(user_id="28101965", since_id=stored_since_id),
            call(user_id="9710852", since_id=None),
        ]
        self.assertEqual(expected_timelines, twarc_client.timeline.mock_calls)
        self.assertDictEqual({"tweets": 1}, harvester.result.harvest_counter)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline_with_missing_users(self, mock_twarc_class):
        # When user_lookup finds nothing for either seed token, the harvest
        # should record one warning per seed, coded CODE_TOKEN_NOT_FOUND.
        twarc_client = MagicMock(spec=Twarc)
        # Two user_lookup calls are expected; both come back empty.
        twarc_client.user_lookup.side_effect = [[], []]
        # Instantiating the patched Twarc class yields twarc_client.
        mock_twarc_class.side_effect = [twarc_client]

        missing_users_message = copy.deepcopy(base_timeline_message)
        missing_users_message["seeds"] = [
            {"id": "seed_id1", "token": "missing1"},
            {"id": "seed_id2", "token": "missing2"},
        ]
        harvester = self.harvester
        harvester.message = missing_users_message
        harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5)

        expected_lookups = [
            call(screen_names=("missing1",)),
            call(screen_names=("missing2",)),
        ]
        self.assertEqual(expected_lookups, twarc_client.user_lookup.mock_calls)
        self.assertEqual(2, len(harvester.result.warnings))
        self.assertEqual(CODE_TOKEN_NOT_FOUND, harvester.result.warnings[0].code)

    @patch("twitter_harvester.Twarc", autospec=True)
    def test_user_timeline_with_private_timeline(self, mock_twarc_class):
        # A timeline request that fails with HTTP 401 should produce a single
        # CODE_TOKEN_UNAUTHORIZED warning while the other timeline's tweets
        # are still counted.
        unauthorized_response = MagicMock()
        unauthorized_response.status_code = 401

        twarc_client = MagicMock(spec=Twarc)
        # Two timeline calls are expected: two tweets, then an HTTPError
        # carrying the 401 response.
        twarc_client.timeline.side_effect = [(tweet1, tweet2), HTTPError(response=unauthorized_response)]
        # Two user_lookup calls are expected as well.
        twarc_client.user_lookup.side_effect = [[{"screen_name": "gwtweets"}], [{"id_str": "9710852"}]]
        # Instantiating the patched Twarc class yields twarc_client.
        mock_twarc_class.side_effect = [twarc_client]

        harvester = self.harvester
        harvester.message = base_timeline_message
        harvester.harvest_seeds()

        mock_twarc_class.assert_called_once_with(
            tests.TWITTER_CONSUMER_KEY,
            tests.TWITTER_CONSUMER_SECRET,
            tests.TWITTER_ACCESS_TOKEN,
            tests.TWITTER_ACCESS_TOKEN_SECRET,
            http_errors=5,
            connection_errors=5)
        expected_timelines = [
            call(user_id="28101965", since_id=None),
            call(user_id="9710852", since_id=None),
        ]
        self.assertEqual(expected_timelines, twarc_client.timeline.mock_calls)
        self.assertEqual(1, len(harvester.result.warnings))
        self.assertEqual(CODE_TOKEN_UNAUTHORIZED, harvester.result.warnings[0].code)
        self.assertDictEqual({"tweets": 2}, harvester.result.harvest_counter)

    def test_lookup_screen_name(self):
        # _lookup_screen_name should return the screen_name of the user that
        # user_lookup reports for the given user id.
        twarc_client = MagicMock(spec=Twarc)
        twarc_client.user_lookup.side_effect = [[{"screen_name": "justin_littman"}]]
        self.harvester.twarc = twarc_client

        screen_name = self.harvester._lookup_screen_name("481186914")

        self.assertEqual("justin_littman", screen_name)
        twarc_client.user_lookup.assert_called_once_with(user_ids=("481186914",))

    def test_lookup_missing_screen_name(self):
        # _lookup_screen_name should return None when user_lookup finds no user.
        twarc_client = MagicMock(spec=Twarc)
        twarc_client.user_lookup.side_effect = [[]]
        self.harvester.twarc = twarc_client

        screen_name = self.harvester._lookup_screen_name("481186914")

        self.assertIsNone(screen_name)
        twarc_client.user_lookup.assert_called_once_with(user_ids=("481186914",))

    def test_lookup_user_id(self):
        # _lookup_user_id should return the id_str of the user that
        # user_lookup reports for the given screen name.
        twarc_client = MagicMock(spec=Twarc)
        twarc_client.user_lookup.side_effect = [[{"id_str": "481186914"}]]
        self.harvester.twarc = twarc_client

        user_id = self.harvester._lookup_user_id("justin_littman")

        self.assertEqual("481186914", user_id)
        twarc_client.user_lookup.assert_called_once_with(screen_names=("justin_littman",))

    def test_lookup_missing_user_id(self):
        # _lookup_user_id should return None when user_lookup finds no user.
        twarc_client = MagicMock(spec=Twarc)
        twarc_client.user_lookup.side_effect = [[]]
        self.harvester.twarc = twarc_client

        user_id = self.harvester._lookup_user_id("justin_littman")

        self.assertIsNone(user_id)
        twarc_client.user_lookup.assert_called_once_with(screen_names=("justin_littman",))

    @staticmethod
    def _iter_items(items):
        # Wrap each item in an IterItem (the first four fields are None) and
        # return them as a list; handy when mocking out a warc iter.
        return [IterItem(None, None, None, None, wrapped) for wrapped in items]

    def test_harvest_options_web(self):
        # With only web-resource extraction switched on, _process_tweets
        # should collect exactly the three link URLs listed below.
        harvester = self.harvester
        harvester.extract_media = False
        harvester.extract_web_resources = True
        harvester.extract_user_profile_images = False

        # _process_tweets would normally be handed a warc iter.
        tweet_items = self._iter_items([tweet2, tweet3, tweet4, tweet5])
        harvester._process_tweets(tweet_items)

        expected_urls = {
            'http://bit.ly/1ipwd0B',  # url
            'http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html',  # from retweet
            'http://bit.ly/1NoNeBF',  # from base tweet of quoted status
        }
        self.assertSetEqual(expected_urls, harvester.result.urls_as_set())

    def test_harvest_options_media(self):
        # With only media extraction switched on, _process_tweets should
        # collect exactly the two media URLs listed below.
        harvester = self.harvester
        harvester.extract_media = True
        harvester.extract_web_resources = False
        harvester.extract_user_profile_images = False

        tweet_items = self._iter_items([tweet2, tweet3, tweet4, tweet5])
        harvester._process_tweets(tweet_items)

        expected_urls = {
            'http://pbs.twimg.com/tweet_video_thumb/Chn_42fWwAASuva.jpg',  # media/extended entity
            'http://pbs.twimg.com/media/Bv4ekbqIYAAcmXY.jpg',  # from quoted status
        }
        self.assertSetEqual(expected_urls, harvester.result.urls_as_set())

    def test_harvest_options_user__images(self):
        # With only user-profile-image extraction switched on,
        # _process_tweets should collect exactly the two image URLs below.
        harvester = self.harvester
        harvester.extract_media = False
        harvester.extract_web_resources = False
        harvester.extract_user_profile_images = True

        tweet_items = self._iter_items([tweet2])
        harvester._process_tweets(tweet_items)

        expected_urls = {
            'http://pbs.twimg.com/profile_images/496478011533713408/GjecBUNj_normal.jpeg',
            'http://abs.twimg.com/images/themes/theme1/bg.png',
        }
        self.assertSetEqual(expected_urls, harvester.result.urls_as_set())

    def test_default_harvest_options(self):
        # With media and web-resource extraction both off, _process_tweets
        # should collect no URLs at all.
        # NOTE(review): extract_user_profile_images is not set here, so it
        # keeps whatever value the harvester already has.
        harvester = self.harvester
        harvester.extract_media = False
        harvester.extract_web_resources = False

        tweet_items = self._iter_items([tweet2, tweet3, tweet4, tweet5])
        harvester._process_tweets(tweet_items)

        no_urls = set()
        self.assertSetEqual(no_urls, harvester.result.urls_as_set())

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_search(self, iter_class):
        # Processing a WARC for the base search message should count the
        # tweet, collect no URLs and leave the search since_id unset.
        mock_iter = MagicMock(spec=TwitterRestWarcIter)
        mock_iter.__iter__.side_effect = [iter(self._iter_items([tweet2]))]
        # Return mock_iter when instantiating a TwitterRestWarcIter.
        iter_class.side_effect = [mock_iter]

        self.harvester.message = base_search_message
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"tweets": 1}, self.harvester.result.stats_summary())
        # Same empty-set check the other URL tests in this class use.
        self.assertSetEqual(set(), self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        # No since_id should have been stored for this message.
        self.assertIsNone(self.harvester.state_store.get_state("twitter_harvester", "gelman.since_id"))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_search_incremental(self, iter_class):
        # Processing a WARC for an incremental search message should count
        # the tweet, collect the expected URL and replace the stored since_id.
        harvester = self.harvester
        since_id_key = "gelman.since_id"

        incremental_message = copy.deepcopy(base_search_message)
        incremental_message["options"]["incremental"] = True

        harvester.extract_media = False
        harvester.extract_web_resources = True

        warc_iter = MagicMock(spec=TwitterRestWarcIter)
        warc_iter.__iter__.side_effect = [iter(self._iter_items([tweet2]))]
        # Instantiating the patched TwitterRestWarcIter yields warc_iter.
        iter_class.side_effect = [warc_iter]

        # Start from a smaller since_id than the one asserted below.
        harvester.state_store.set_state("twitter_harvester", since_id_key, 605726286741434400)
        harvester.message = incremental_message
        harvester.process_warc("test.warc.gz")

        tweet_stats = harvester.result.stats_summary()
        self.assertDictEqual({"tweets": 1}, tweet_stats)
        self.assertSetEqual({"http://bit.ly/1ipwd0B"}, harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        # The store must now hold the new since_id.
        stored_since_id = harvester.state_store.get_state("twitter_harvester", since_id_key)
        self.assertEqual(660065173563158500, stored_since_id)

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_user_timeline(self, iter_class):
        # Processing a WARC for the base timeline message should count both
        # tweets, collect the expected URL and leave the state store empty.
        warc_iter = MagicMock(spec=TwitterRestWarcIter)
        warc_iter.__iter__.side_effect = [iter(self._iter_items([tweet1, tweet2]))]
        # Instantiating the patched TwitterRestWarcIter yields warc_iter.
        iter_class.side_effect = [warc_iter]

        harvester = self.harvester
        harvester.extract_media = False
        harvester.extract_web_resources = True

        harvester.message = base_timeline_message
        harvester.process_warc("test.warc.gz")

        tweet_stats = harvester.result.stats_summary()
        self.assertDictEqual({"tweets": 2}, tweet_stats)
        self.assertSetEqual({"http://bit.ly/1ipwd0B"}, harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        # Nothing added to state
        self.assertEqual(0, len(harvester.state_store.state))

    @patch("twitter_harvester.TwitterRestWarcIter", autospec=True)
    def test_process_incremental_user_timeline(self, iter_class):
        # Processing a WARC for an incremental timeline message should count
        # the tweet, collect the expected URL and replace the stored since_id.
        harvester = self.harvester
        since_id_key = "timeline.481186914.since_id"

        incremental_message = copy.deepcopy(base_timeline_message)
        incremental_message["options"]["incremental"] = True

        warc_iter = MagicMock(spec=TwitterRestWarcIter)
        warc_iter.__iter__.side_effect = [iter(self._iter_items([tweet2]))]
        # Instantiating the patched TwitterRestWarcIter yields warc_iter.
        iter_class.side_effect = [warc_iter]

        harvester.extract_media = False
        harvester.extract_web_resources = True

        harvester.message = incremental_message
        # Start from a smaller since_id than the one asserted below.
        harvester.state_store.set_state("twitter_harvester", since_id_key, 605726286741434400)
        harvester.process_warc("test.warc.gz")

        tweet_stats = harvester.result.stats_summary()
        self.assertDictEqual({"tweets": 1}, tweet_stats)
        self.assertSetEqual({"http://bit.ly/1ipwd0B"}, harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        # The store must now hold the new since_id.
        stored_since_id = harvester.state_store.get_state("twitter_harvester", since_id_key)
        self.assertEqual(660065173563158500, stored_since_id)