def setUp(self):
    """Create a temp working directory and a harvester wired with test doubles.

    The harvester gets a DictHarvestStateStore, a new HarvestResult, a new
    threading.Event as its stop event, and ``base_timeline_message`` as its
    message.

    NOTE(review): this definition sits outside any visible class and repeats
    a class-level ``setUp`` found later in the file -- presumably leftover
    residue; confirm whether it can be removed.
    """
    workdir = tempfile.mkdtemp()
    self.working_path = workdir

    harvester = WeiboHarvester(workdir)
    harvester.state_store = DictHarvestStateStore()
    harvester.result = HarvestResult()
    harvester.stop_harvest_seeds_event = threading.Event()
    harvester.message = base_timeline_message
    self.harvester = harvester
class TestWeiboHarvesterVCR(tests.TestCase):
    """Cassette-backed tests that run ``harvest_seeds`` on a timeline message."""

    def setUp(self):
        """Build a harvester over a fresh temp directory with a per-test message."""
        workdir = tempfile.mkdtemp()
        self.working_path = workdir

        harvester = WeiboHarvester(workdir)
        harvester.state_store = DictHarvestStateStore()
        harvester.result = HarvestResult()
        harvester.stop_harvest_seeds_event = threading.Event()
        # The dict is created anew for every test, so tests may mutate it.
        harvester.message = {
            "id": "test:2",
            "type": "weibo_timeline",
            "path": "/collections/test_collection_set",
            "credentials": {"access_token": tests.WEIBO_ACCESS_TOKEN},
            "collection_set": {"id": "test_collection_set"},
            "collection": {"id": "test_collection"},
            "options": {},
        }
        self.harvester = harvester

    def tearDown(self):
        """Remove the temp working directory if it is still present."""
        if not os.path.exists(self.working_path):
            return
        shutil.rmtree(self.working_path)

    @vcr.use_cassette(filter_query_parameters=["access_token"])
    def test_search_vcr(self):
        """A plain harvest counts 181 weibos and reports success."""
        harvester = self.harvester
        harvester.harvest_seeds()

        outcome = harvester.result
        # Expected total for this test; the original author noted it is
        # unclear how to verify the count for a new user.
        self.assertEqual(outcome.harvest_counter["weibos"], 181)
        # The harvest must have finished successfully.
        self.assertTrue(outcome.success)

    @vcr.use_cassette(filter_query_parameters=["access_token"])
    def test_incremental_search_vcr(self):
        """An incremental harvest with a stored since_id counts 5 weibos."""
        harvester = self.harvester
        harvester.message["options"]["incremental"] = True

        # Seed the since_id under the key derived from the collection set id.
        state_key = u"{}.since_id".format(harvester.message["collection_set"]["id"])
        harvester.state_store.set_state("weibo_harvester", state_key, 3935747172100551)

        harvester.harvest_seeds()

        outcome = harvester.result
        # The harvest must have finished successfully.
        self.assertTrue(outcome.success)
        # Number of weibos counted by this incremental harvest.
        self.assertEqual(outcome.harvest_counter["weibos"], 5)
 def setUp(self):
     """Create a temp working directory and a harvester with a timeline message.

     The message enables ``web_resources`` and requests the ``Large`` image
     size in its options.

     NOTE(review): this definition is indented by a single space and is not
     inside any visible class; it repeats the ``setUp`` of the class that
     follows -- presumably leftover residue; confirm whether it can be removed.
     """
     workdir = tempfile.mkdtemp()
     self.working_path = workdir

     harvester = WeiboHarvester(workdir)
     harvester.state_store = DictHarvestStateStore()
     harvester.result = HarvestResult()
     harvester.stop_harvest_seeds_event = threading.Event()
     harvester.message = {
         "id": "test:1",
         "type": "weibo_timeline",
         "path": "/collections/test_collection_set",
         "credentials": {"access_token": tests.WEIBO_ACCESS_TOKEN},
         "collection_set": {"id": "test_collection_set"},
         "collection": {"id": "test_collection"},
         "options": {"web_resources": True, "image_sizes": ["Large"]},
     }
     self.harvester = harvester
class TestWeiboHarvester(tests.TestCase):
    """Mock-based tests for WeiboHarvester's ``harvest_seeds`` and ``process_warc``."""

    def setUp(self):
        """Build a harvester over a fresh temp directory with a per-test message."""
        workdir = tempfile.mkdtemp()
        self.working_path = workdir

        harvester = WeiboHarvester(workdir)
        harvester.state_store = DictHarvestStateStore()
        harvester.result = HarvestResult()
        harvester.stop_harvest_seeds_event = threading.Event()
        harvester.message = {
            "id": "test:1",
            "type": "weibo_timeline",
            "path": "/collections/test_collection_set",
            "credentials": {"access_token": tests.WEIBO_ACCESS_TOKEN},
            "collection_set": {"id": "test_collection_set"},
            "collection": {"id": "test_collection"},
            "options": {"web_resources": True, "image_sizes": ["Large"]},
        }
        self.harvester = harvester

    def tearDown(self):
        """Remove the temp working directory if it is still present."""
        if not os.path.exists(self.working_path):
            return
        shutil.rmtree(self.working_path)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_search_timeline(self, mock_weiboarc_class):
        """A non-incremental harvest calls search_friendships once with since_id=None."""
        weiboarc = MagicMock(spec=Weiboarc)
        # The patched class hands back this instance when it is instantiated.
        mock_weiboarc_class.side_effect = [weiboarc]
        # First search_friendships call yields two weibos; a second would yield none.
        weiboarc.search_friendships.side_effect = [(weibo1, weibo2), ()]

        self.harvester.harvest_seeds()

        self.assertDictEqual({"weibos": 2}, self.harvester.result.harvest_counter)
        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)
        self.assertEqual([call(since_id=None)], weiboarc.search_friendships.mock_calls)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_incremental_search(self, mock_weiboarc_class):
        """An incremental harvest passes the stored since_id to search_friendships."""
        weiboarc = MagicMock(spec=Weiboarc)
        # The patched class hands back this instance when it is instantiated.
        mock_weiboarc_class.side_effect = [weiboarc]
        # First search_friendships call yields one weibo; a second would yield none.
        weiboarc.search_friendships.side_effect = [(weibo2,), ()]

        # Incremental means that only new results are retrieved.
        self.harvester.message["options"] = {"incremental": True}

        state_key = u"{}.since_id".format(self.harvester.message["collection_set"]["id"])
        self.harvester.state_store.set_state("weibo_harvester", state_key, 3927348724716740)
        self.harvester.harvest_seeds()

        self.assertDictEqual({"weibos": 1}, self.harvester.result.harvest_counter)
        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)

        # The stored since_id, not None, must appear in the recorded call.
        recorded_calls = weiboarc.search_friendships.mock_calls
        self.assertEqual([call(since_id=3927348724716740)], recorded_calls)
        self.assertNotEqual([call(since_id=None)], recorded_calls)

    @staticmethod
    def _iter_items(items):
        """Wrap each item in an IterItem, as a stand-in for what a warc iter yields."""
        return [IterItem(None, None, None, None, entry) for entry in items]

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process(self, iter_class):
        """With extraction off and incremental off: no URLs, 3 weibos, no state."""
        warc_iter = MagicMock(spec=WeiboWarcIter)
        warc_iter.__iter__.side_effect = [iter(self._iter_items([weibo3, weibo4, weibo5]))]
        iter_class.side_effect = [warc_iter]

        # Options for this run: nothing extracted, not incremental.
        self.harvester.incremental = False
        self.harvester.extract_images_sizes = []
        self.harvester.extract_web_resources = False

        self.harvester.process_warc("test.warc.gz")

        # With extraction switched off no URLs are collected.
        self.assertSetEqual(set(), self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        self.assertEqual(3, self.harvester.result.stats_summary()["weibos"])
        # State not set
        stored = self.harvester.state_store.get_state("weibo_harvester", "test_collection_set.since_id")
        self.assertIsNone(stored)

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_incremental(self, iter_class):
        """With incremental on, processing updates the stored since_id."""
        warc_iter = MagicMock(spec=WeiboWarcIter)
        warc_iter.__iter__.side_effect = [iter(self._iter_items([weibo3, weibo4, weibo5]))]
        iter_class.side_effect = [warc_iter]

        # Options for this run: nothing extracted, incremental enabled.
        self.harvester.incremental = True
        self.harvester.extract_images_sizes = []
        self.harvester.extract_web_resources = False
        self.harvester.state_store.set_state("weibo_harvester", "test_collection_set.since_id", 3927348724716740)

        self.harvester.process_warc("test.warc.gz")

        # With extraction switched off no URLs are collected.
        self.assertSetEqual(set(), self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
        self.assertEqual(3, self.harvester.result.stats_summary()["weibos"])
        # State updated
        stored = self.harvester.state_store.get_state("weibo_harvester", "test_collection_set.since_id")
        self.assertEqual(3973784090711192, stored)

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_harvest_options_web(self, iter_class):
        """With web resource extraction on, the expected two URLs are collected."""
        warc_iter = MagicMock(spec=WeiboWarcIter)
        warc_iter.__iter__.side_effect = [iter(self._iter_items([weibo3, weibo4, weibo5]))]
        iter_class.side_effect = [warc_iter]

        # Options for this run: web resources only.
        self.harvester.incremental = False
        self.harvester.extract_images_sizes = []
        self.harvester.extract_web_resources = True

        self.harvester.process_warc("test.warc.gz")

        # Testing URL1&URL2
        expected_urls = {"http://t.cn/RqmQ3ko", "http://m.weibo.cn/1618051664/3973767505640890"}
        self.assertSetEqual(expected_urls, self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_harvest_options_media(self, iter_class):
        """With all three image sizes requested, the expected image URLs are collected."""
        warc_iter = MagicMock(spec=WeiboWarcIter)
        warc_iter.__iter__.side_effect = [iter(self._iter_items([weibo3, weibo4, weibo5]))]
        iter_class.side_effect = [warc_iter]

        # Options for this run: images only, in every size.
        self.harvester.incremental = False
        self.harvester.extract_images_sizes = ["Large", "Medium", "Thumbnail"]
        self.harvester.extract_web_resources = False

        self.harvester.process_warc("test.warc.gz")

        # Testing URL3 photos URLs
        expected_urls = {
            "http://ww2.sinaimg.cn/large/6b23a52bgw1f3pjhhyofnj208p06c3yq.jpg",
            "http://ww4.sinaimg.cn/large/60718250jw1f3qtzyhai3j20de0vin32.jpg",
            "http://ww2.sinaimg.cn/bmiddle/6b23a52bgw1f3pjhhyofnj208p06c3yq.jpg",
            "http://ww4.sinaimg.cn/bmiddle/60718250jw1f3qtzyhai3j20de0vin32.jpg",
            "http://ww2.sinaimg.cn/thumbnail/6b23a52bgw1f3pjhhyofnj208p06c3yq.jpg",
            "http://ww4.sinaimg.cn/thumbnail/60718250jw1f3qtzyhai3j20de0vin32.jpg",
        }
        self.assertSetEqual(expected_urls, self.harvester.result.urls_as_set())
        iter_class.assert_called_once_with("test.warc.gz")
class TestWeiboHarvesterVCR(tests.TestCase):
    """Cassette-backed tests running ``harvest_seeds`` for timeline and search messages.

    Every test works on its own deep copy of the module-level base message,
    so an in-place change made by a test or by the harvester cannot leak
    into another test through the shared dict.
    """

    def setUp(self):
        """Create a temp working directory and a harvester with a private timeline message."""
        self.working_path = tempfile.mkdtemp()
        self.harvester = WeiboHarvester(self.working_path)
        self.harvester.state_store = DictHarvestStateStore()
        self.harvester.result = HarvestResult()
        self.harvester.stop_harvest_seeds_event = threading.Event()
        # Copy rather than alias the shared module-level fixture.
        self.harvester.message = copy.deepcopy(base_timeline_message)

    def tearDown(self):
        """Remove the temp working directory if it is still present."""
        if os.path.exists(self.working_path):
            shutil.rmtree(self.working_path)

    @vcr.use_cassette(filter_query_parameters=['access_token'])
    def test_timeline_vcr(self):
        """A plain timeline harvest counts 181 weibos and reports success."""
        # setUp already installed a copy of the timeline message.
        self.harvester.harvest_seeds()
        # Expected total for this test; the original author noted it is
        # unclear how to verify the count for a new user.
        self.assertEqual(self.harvester.result.harvest_counter["weibos"], 181)
        # The harvest must have finished successfully.
        self.assertTrue(self.harvester.result.success)

    @vcr.use_cassette(filter_query_parameters=['access_token'])
    def test_incremental_timeline_vcr(self):
        """An incremental timeline harvest with a stored since_id counts 5 weibos."""
        message = copy.deepcopy(base_timeline_message)
        message["options"]["incremental"] = True
        self.harvester.message = message
        # The since_id is stored under a key derived from the collection set id.
        collection_set_id = self.harvester.message["collection_set"]["id"]
        self.harvester.state_store.set_state(
            "weibo_harvester", u"{}.since_id".format(collection_set_id),
            3935747172100551)
        self.harvester.harvest_seeds()

        # The harvest must have finished successfully.
        self.assertTrue(self.harvester.result.success)
        # Number of weibos counted by this incremental harvest.
        self.assertEqual(self.harvester.result.harvest_counter["weibos"], 5)

    @vcr.use_cassette(filter_query_parameters=['access_token'])
    def test_search_topic_vcr(self):
        """A topic search harvest counts 200 weibos and reports success."""
        self.harvester.message = copy.deepcopy(base_search_message)
        self.harvester.harvest_seeds()
        # One search returns 200 weibos.
        self.assertEqual(self.harvester.result.harvest_counter["weibos"], 200)
        self.assertTrue(self.harvester.result.success)

    @vcr.use_cassette(filter_query_parameters=['access_token'])
    def test_search_topic_empty_vcr(self):
        """A topic search that returns an empty list counts 0 weibos and still succeeds."""
        self.harvester.message = copy.deepcopy(base_search_message)
        self.harvester.harvest_seeds()
        # One search returns an empty list, so nothing is counted.
        self.assertEqual(self.harvester.result.harvest_counter["weibos"], 0)
        self.assertTrue(self.harvester.result.success)

    @vcr.use_cassette(filter_query_parameters=['access_token'])
    def test_incremental_search_topic_vcr(self):
        """An incremental topic search with a stored since_id counts 6 weibos."""
        message = copy.deepcopy(base_search_message)
        message["options"]["incremental"] = True
        self.harvester.message = message
        # For searches the since_id is stored under a key derived from the seed token.
        query = self.harvester.message["seeds"][0]["token"]
        self.harvester.state_store.set_state("weibo_harvester",
                                             u"{}.since_id".format(query),
                                             4061065610091375)
        self.harvester.harvest_seeds()

        # The harvest must have finished successfully.
        self.assertTrue(self.harvester.result.success)
        # Number of weibos counted by this incremental harvest.
        self.assertEqual(self.harvester.result.harvest_counter["weibos"], 6)
class TestWeiboHarvester(tests.TestCase):
    """Mock-based tests for WeiboHarvester's ``harvest_seeds`` and ``process_warc``.

    Every test works on its own deep copy of the module-level base message,
    so an in-place change made by a test or by the harvester cannot leak
    into another test through the shared dict.
    """

    def setUp(self):
        """Create a temp working directory and a harvester with a private timeline message."""
        self.working_path = tempfile.mkdtemp()
        self.harvester = WeiboHarvester(self.working_path)
        self.harvester.state_store = DictHarvestStateStore()
        self.harvester.result = HarvestResult()
        self.harvester.stop_harvest_seeds_event = threading.Event()
        # Copy rather than alias the shared module-level fixture.
        self.harvester.message = copy.deepcopy(base_timeline_message)

    def tearDown(self):
        """Remove the temp working directory if it is still present."""
        if os.path.exists(self.working_path):
            shutil.rmtree(self.working_path)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_search_timeline(self, mock_weiboarc_class):
        """A non-incremental timeline harvest calls search_friendships once with since_id=None."""
        mock_weiboarc = MagicMock(spec=Weiboarc)
        # First search_friendships call yields two weibos; a second would yield none.
        mock_weiboarc.search_friendships.side_effect = [(weibo1, weibo2), ()]
        # Return mock_weiboarc when instantiating a Weiboarc.
        mock_weiboarc_class.side_effect = [mock_weiboarc]

        # setUp already installed a copy of the timeline message.
        self.harvester.harvest_seeds()
        self.assertDictEqual({"weibos": 2},
                             self.harvester.result.harvest_counter)
        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)

        self.assertEqual([call(since_id=None)],
                         mock_weiboarc.search_friendships.mock_calls)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_incremental_search_timeline(self, mock_weiboarc_class):
        """An incremental timeline harvest passes the stored since_id to search_friendships."""
        mock_weiboarc = MagicMock(spec=Weiboarc)
        # First search_friendships call yields one weibo; a second would yield none.
        mock_weiboarc.search_friendships.side_effect = [(weibo2, ), ()]
        # Return mock_weiboarc when instantiating a Weiboarc.
        mock_weiboarc_class.side_effect = [mock_weiboarc]

        message = copy.deepcopy(base_timeline_message)
        message["options"]["incremental"] = True
        self.harvester.message = message
        # The since_id is stored under a key derived from the collection set id.
        collection_set_id = self.harvester.message["collection_set"]["id"]
        self.harvester.state_store.set_state(
            "weibo_harvester", u"{}.since_id".format(collection_set_id),
            3927348724716740)
        self.harvester.harvest_seeds()

        self.assertDictEqual({"weibos": 1},
                             self.harvester.result.harvest_counter)
        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)

        # The stored since_id, not None, must appear in the recorded call.
        self.assertEqual([call(since_id=3927348724716740)],
                         mock_weiboarc.search_friendships.mock_calls)
        self.assertNotEqual([call(since_id=None)],
                            mock_weiboarc.search_friendships.mock_calls)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_search_topic(self, mock_weiboarc_class):
        """A non-incremental search harvest calls search_topic once with the seed token."""
        mock_weiboarc = MagicMock(spec=Weiboarc)
        # First search_topic call yields two weibos; a second would yield none.
        mock_weiboarc.search_topic.side_effect = [(weibo6, weibo7), ()]
        # Return mock_weiboarc when instantiating a Weiboarc.
        mock_weiboarc_class.side_effect = [mock_weiboarc]

        self.harvester.message = copy.deepcopy(base_search_message)
        self.harvester.harvest_seeds()
        query = self.harvester.message["seeds"][0]["token"]

        self.assertDictEqual({"weibos": 2},
                             self.harvester.result.harvest_counter)
        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)

        self.assertEqual([call(query, since_id=None)],
                         mock_weiboarc.search_topic.mock_calls)

    @patch("weibo_harvester.Weiboarc", autospec=True)
    def test_incremental_search_topic(self, mock_weiboarc_class):
        """An incremental search harvest passes the stored since_id to search_topic."""
        mock_weiboarc = MagicMock(spec=Weiboarc)
        # First search_topic call yields one weibo; a second would yield none.
        mock_weiboarc.search_topic.side_effect = [(weibo7, ), ()]
        # Return mock_weiboarc when instantiating a Weiboarc.
        mock_weiboarc_class.side_effect = [mock_weiboarc]

        message = copy.deepcopy(base_search_message)
        message["options"]["incremental"] = True
        self.harvester.message = message

        # For searches the since_id is stored under a key derived from the seed token.
        query = self.harvester.message["seeds"][0]["token"]
        self.harvester.state_store.set_state("weibo_harvester",
                                             u"{}.since_id".format(query),
                                             4060927646547531)
        self.harvester.harvest_seeds()

        mock_weiboarc_class.assert_called_once_with(tests.WEIBO_ACCESS_TOKEN)
        self.assertEqual([call(query, since_id=4060927646547531)],
                         mock_weiboarc.search_topic.mock_calls)
        self.assertDictEqual({"weibos": 1},
                             self.harvester.result.harvest_counter)

    @staticmethod
    def _iter_items(items):
        """Wrap each item in an IterItem, as a stand-in for what a warc iter yields."""
        return [IterItem(None, None, None, None, item) for item in items]

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_timeline(self, iter_class):
        """Processing a timeline warc non-incrementally counts 3 weibos and stores no state."""
        mock_iter = MagicMock(spec=WeiboWarcIter)
        mock_iter.__iter__.side_effect = [
            iter(self._iter_items([weibo3, weibo4, weibo5]))
        ]
        iter_class.side_effect = [mock_iter]

        # Non-incremental run.
        self.harvester.incremental = False

        # setUp already installed a copy of the timeline message.
        self.harvester.process_warc("test.warc.gz")

        iter_class.assert_called_once_with("test.warc.gz")
        self.assertEqual(3, self.harvester.result.stats_summary()["weibos"])
        # State not set
        self.assertIsNone(
            self.harvester.state_store.get_state(
                "weibo_harvester", "test_collection_set.since_id"))

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_timeline_incremental(self, iter_class):
        """Processing a timeline warc incrementally updates the stored since_id."""
        mock_iter = MagicMock(spec=WeiboWarcIter)
        mock_iter.__iter__.side_effect = [
            iter(self._iter_items([weibo3, weibo4, weibo5]))
        ]
        iter_class.side_effect = [mock_iter]

        # Incremental run starting from a previously stored since_id.
        self.harvester.incremental = True
        self.harvester.state_store.set_state("weibo_harvester",
                                             "test_collection_set.since_id",
                                             3927348724716740)
        # setUp already installed a copy of the timeline message.
        self.harvester.process_warc("test.warc.gz")

        iter_class.assert_called_once_with("test.warc.gz")
        self.assertEqual(3, self.harvester.result.stats_summary()["weibos"])
        # State updated
        self.assertEqual(
            3973784090711192,
            self.harvester.state_store.get_state(
                "weibo_harvester", "test_collection_set.since_id"))

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_search_topic(self, iter_class):
        """Processing a search warc counts 2 weibos and stores no since_id."""
        mock_iter = MagicMock(spec=WeiboWarcIter)
        mock_iter.__iter__.side_effect = [
            iter(self._iter_items([weibo6, weibo7]))
        ]
        iter_class.side_effect = [mock_iter]

        # NOTE(review): ``incremental`` is not set here, so this relies on the
        # harvester's own default -- presumably off; confirm.
        self.harvester.message = copy.deepcopy(base_search_message)
        self.harvester.process_warc("test.warc.gz")
        self.assertDictEqual({"weibos": 2},
                             self.harvester.result.stats_summary())
        iter_class.assert_called_once_with("test.warc.gz")
        # State not set
        query = self.harvester.message["seeds"][0]["token"]
        self.assertIsNone(
            self.harvester.state_store.get_state("weibo_harvester",
                                                 u"{}.since_id".format(query)))

    @patch("weibo_harvester.WeiboWarcIter", autospec=True)
    def test_process_search_topic_incremental(self, iter_class):
        """Processing a search warc incrementally updates the since_id stored for the query."""
        mock_iter = MagicMock(spec=WeiboWarcIter)
        mock_iter.__iter__.side_effect = [
            iter(self._iter_items([weibo6, weibo7]))
        ]
        iter_class.side_effect = [mock_iter]

        self.harvester.message = copy.deepcopy(base_search_message)
        self.harvester.incremental = True

        # Seed a previous since_id under the key derived from the seed token.
        query = self.harvester.message["seeds"][0]["token"]
        self.harvester.state_store.set_state("weibo_harvester",
                                             u"{}.since_id".format(query),
                                             4060927646547530)
        self.harvester.process_warc("test.warc.gz")

        self.assertDictEqual({"weibos": 2},
                             self.harvester.result.stats_summary())

        iter_class.assert_called_once_with("test.warc.gz")
        # State updated
        self.assertEqual(
            4060928330955796,
            self.harvester.state_store.get_state("weibo_harvester",
                                                 u"{}.since_id".format(query)))