def test_export_collection_missing_warc(self, mock_api_client_cls):
    """An export message naming both a collection and a seed is rejected.

    Runs ``on_message`` and expects an unsuccessful result whose first
    error carries ``CODE_BAD_REQUEST``.
    """
    api_client = MagicMock(spec=ApiClient)
    mock_api_client_cls.side_effect = [api_client]

    # The same id is used for the collection and for the single seed.
    shared_id = "005b131f5f854402afa2b08a4b7ba960"
    request = {
        "id": "test3",
        "type": "test_user",
        "collection": {"id": shared_id},
        "seeds": [{"id": shared_id, "uid": "uid1"}],
        "format": "csv",
        "segment_size": None,
        "path": self.export_path,
    }

    exporter = BaseExporter(
        "http://test",
        None,
        None,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter.routing_key = "export.start.test.test_user"
    exporter.message = request
    exporter.on_message()

    mock_api_client_cls.assert_called_once_with("http://test")
    outcome = exporter.result
    self.assertFalse(outcome.success)
    self.assertEqual(CODE_BAD_REQUEST, outcome.errors[0].code)
def test_export_full_json(self):
    """``_full_json_export`` writes one JSON object per line.

    Two iterator items are exported with ``None`` as the final argument
    (presumably the segment size -- confirm against ``BaseExporter``), and
    the first line of ``test_001.json`` is compared with the first item.
    """
    warc_iter = MagicMock()
    warc_iter.iter.return_value = [
        IterItem(None, None, None, None, {"key1": "k1v%d" % n, "key2": "k2v%d" % n, "key3": "k3v%d" % n})
        for n in (1, 2)
    ]
    mock_warc_iter_cls = MagicMock()
    mock_warc_iter_cls.side_effect = [warc_iter]

    export_filepath = os.path.join(self.export_path, "test")
    started = datetime_now()
    limit_uids = [11, 14]

    exporter = BaseExporter(
        None,
        mock_warc_iter_cls,
        None,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter._full_json_export(self.warcs, export_filepath, True, started, None, limit_uids, None)

    mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
    warc_iter.iter.assert_called_once_with(
        dedupe=True,
        item_date_start=started,
        item_date_end=None,
        limit_item_types=None,
    )

    json_path = "{}_001.json".format(export_filepath)
    self.assertTrue(os.path.exists(json_path))
    with open(json_path, "r") as json_file:
        lines = json_file.readlines()
    self.assertEqual(2, len(lines))
    self.assertDictEqual({"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}, json.loads(lines[0]))
def __init__(self, api_base_url, working_path, mq_config=None, warc_base_path=None):
    """Initialise the base exporter with the Twitter REST iterator and table.

    ``TwitterRestWarcIter`` and ``TwitterRestStatusTable`` are supplied as
    the WARC-iterator and table classes; every other argument is passed
    through to ``BaseExporter.__init__`` unchanged.
    """
    passthrough = {"mq_config": mq_config, "warc_base_path": warc_base_path}
    BaseExporter.__init__(
        self,
        api_base_url,
        TwitterRestWarcIter,
        TwitterRestStatusTable,
        working_path,
        **passthrough
    )
def test_export_seeds(self, mock_api_client_cls):
    """CSV export restricted to two seeds.

    Checks the WARC lookup arguments (``exclude_web=True``, the two seed
    ids, no collection), the table constructor arguments (the two seed
    uids) and that ``test2_001.csv`` ends up with three lines.
    """
    first_seed_id = "005b131f5f854402afa2b08a4b7ba960"
    second_seed_id = "105b131f5f854402afa2b08a4b7ba960"

    mock_warc_iter_cls = MagicMock()
    table = MagicMock(spec=BaseTable)
    # Iterating the table yields a single list of three row tuples.
    table.__iter__ = Mock(return_value=iter([[("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]]))
    mock_table_cls = MagicMock()
    mock_table_cls.side_effect = [table]

    api_client = MagicMock(spec=ApiClient)
    api_client.warcs.side_effect = [self.warcs]
    mock_api_client_cls.side_effect = [api_client]

    request = {
        "id": "test2",
        "type": "test_user",
        "seeds": [
            {"id": first_seed_id, "uid": "uid1"},
            {"id": second_seed_id, "uid": "uid2"},
        ],
        "format": "csv",
        "segment_size": None,
        "path": self.export_path,
    }

    exporter = BaseExporter(
        "http://test",
        mock_warc_iter_cls,
        mock_table_cls,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter.routing_key = "export.start.test.test_user"
    exporter.message = request
    exporter.on_message()

    mock_api_client_cls.assert_called_once_with("http://test")
    api_client.warcs.assert_called_once_with(
        exclude_web=True,
        collection_id=None,
        seed_ids=[first_seed_id, second_seed_id],
        harvest_date_start=None,
        harvest_date_end=None,
    )
    mock_table_cls.assert_called_once_with(self.warc_filepaths, False, None, None, ["uid1", "uid2"], None)

    self.assertTrue(exporter.result.success)
    csv_path = os.path.join(self.export_path, "test2_001.csv")
    self.assertTrue(os.path.exists(csv_path))
    with open(csv_path, "r") as csv_file:
        lines = csv_file.readlines()
    self.assertEqual(3, len(lines))
def __init__(self, api_base_url, working_path, mq_config=None, warc_base_path=None):
    """Initialise the base exporter with the Twitter stream iterator and table.

    Logs an informational message, then delegates to
    ``BaseExporter.__init__`` with ``TwitterStreamWarcIter`` and
    ``TwitterStreamStatusTable``; the remaining arguments are passed through.
    """
    log.info("Initing TwitterStreamExporter")
    passthrough = {"mq_config": mq_config, "warc_base_path": warc_base_path}
    BaseExporter.__init__(
        self,
        api_base_url,
        TwitterStreamWarcIter,
        TwitterStreamStatusTable,
        working_path,
        **passthrough
    )
def __init__(self, api_base_url, working_path, mq_config=None, warc_base_path=None):
    """Initialise the base exporter with the Flickr iterator and photo table.

    Delegates to ``BaseExporter.__init__`` with ``FlickrWarcIter`` and
    ``FlickrPhotoTable`` and limits item types to ``TYPE_FLICKR_PHOTO``;
    the remaining arguments are passed through unchanged.
    """
    passthrough = {
        "mq_config": mq_config,
        "warc_base_path": warc_base_path,
        "limit_item_types": [TYPE_FLICKR_PHOTO],
    }
    BaseExporter.__init__(
        self,
        api_base_url,
        FlickrWarcIter,
        FlickrPhotoTable,
        working_path,
        **passthrough
    )
def test_export_full_json(self):
    """``_full_json_export`` produces ``test_001.json`` with one line per item.

    The mocked WARC iterator yields two items; the test checks how the
    iterator class and its ``iter`` method were called, that the output
    file has two lines, and that the first line decodes to the first item.
    """
    first_item = IterItem(None, None, None, None, {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"})
    second_item = IterItem(None, None, None, None, {"key1": "k1v2", "key2": "k2v2", "key3": "k3v2"})

    iterator = MagicMock()
    iterator.iter.return_value = [first_item, second_item]
    mock_warc_iter_cls = MagicMock()
    mock_warc_iter_cls.side_effect = [iterator]

    export_filepath = os.path.join(self.export_path, "test")
    now = datetime_now()
    limit_uids = [11, 14]

    exporter = BaseExporter(
        None,
        mock_warc_iter_cls,
        None,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter._full_json_export(self.warcs, export_filepath, True, now, None, limit_uids, None)

    mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
    expected_iter_kwargs = {
        "dedupe": True,
        "item_date_start": now,
        "item_date_end": None,
        "limit_item_types": None,
    }
    iterator.iter.assert_called_once_with(**expected_iter_kwargs)

    output_path = export_filepath + "_001.json"
    self.assertTrue(os.path.exists(output_path))
    with open(output_path, "r") as handle:
        written = handle.readlines()
    self.assertEqual(2, len(written))
    self.assertDictEqual({"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}, json.loads(written[0]))
def test_export_full_json_segment(self):
    """``_full_json_export`` splits seven items into segments of three.

    Expects ``test_001.json`` and ``test_002.json`` with three lines each
    and ``test_003.json`` with the single remaining line; the first line of
    every segment is compared with the item that should open it.
    """
    mock_warc_iter_cls = MagicMock()
    mock_warc_iter = MagicMock()
    mock_warc_iter_cls.side_effect = [mock_warc_iter]
    mock_warc_iter.iter.return_value = [
        IterItem(None, None, None, None, {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}),
        IterItem(None, None, None, None, {"key1": "k1v2", "key2": "k2v2", "key3": "k3v2"}),
        IterItem(None, None, None, None, {"key1": "k1v3", "key2": "k2v3", "key3": "k3v3"}),
        IterItem(None, None, None, None, {"key1": "k1v4", "key2": "k2v4", "key3": "k3v4"}),
        IterItem(None, None, None, None, {"key1": "k1v5", "key2": "k2v5", "key3": "k3v5"}),
        IterItem(None, None, None, None, {"key1": "k1v6", "key2": "k2v6", "key3": "k3v6"}),
        IterItem(None, None, None, None, {"key1": "k1v7", "key2": "k2v7", "key3": "k3v7"}),
    ]
    export_filepath = os.path.join(self.export_path, "test")
    now = datetime_now()
    limit_uids = [11, 14]

    exporter = BaseExporter(
        None, mock_warc_iter_cls, None, self.working_path, warc_base_path=self.warc_base_path, host="testhost"
    )
    exporter._full_json_export(self.warcs, export_filepath, True, now, None, limit_uids, 3)

    mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
    mock_warc_iter.iter.assert_called_once_with(
        dedupe=True, item_date_start=now, item_date_end=None, limit_item_types=None
    )

    # Segment files are test_001.json, test_002.json and test_003.json.
    # ``range`` (not the Python-2-only ``xrange``) keeps this runnable on
    # Python 3 while behaving the same on Python 2.
    for idx in range(3):
        file_path = export_filepath + "_" + str(idx + 1).zfill(3) + ".json"
        self.assertTrue(os.path.exists(file_path))
        with open(file_path, "r") as f:
            lines = f.readlines()
        # Seven items in segments of three leave one row for the last file.
        if idx == 2:
            self.assertEqual(1, len(lines))
        else:
            self.assertEqual(3, len(lines))
        first_item_no = str(1 + idx * 3)
        self.assertDictEqual(
            {"key1": "k1v" + first_item_no, "key2": "k2v" + first_item_no, "key3": "k3v" + first_item_no},
            json.loads(lines[0]),
        )
def test_export_collection_missing_warc(self, mock_api_client_cls):
    """A request carrying a collection *and* seeds yields ``CODE_BAD_REQUEST``.

    After ``on_message`` the result must be unsuccessful and its first
    error code must equal ``CODE_BAD_REQUEST``.
    """
    client = MagicMock(spec=ApiClient)
    mock_api_client_cls.side_effect = [client]

    collection = {"id": "005b131f5f854402afa2b08a4b7ba960"}
    seed = {"id": "005b131f5f854402afa2b08a4b7ba960", "uid": "uid1"}
    export_message = dict(
        [
            ("id", "test3"),
            ("type", "test_user"),
            ("collection", collection),
            ("seeds", [seed]),
            ("format", "csv"),
            ("segment_size", None),
            ("path", self.export_path),
        ]
    )

    exporter = BaseExporter(
        "http://test",
        None,
        None,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter.routing_key = "export.start.test.test_user"
    exporter.message = export_message
    exporter.on_message()

    mock_api_client_cls.assert_called_once_with("http://test")
    self.assertFalse(exporter.result.success)
    first_error = exporter.result.errors[0]
    self.assertEqual(CODE_BAD_REQUEST, first_error.code)
def __init__(self, api_base_url, working_path, mq_config=None, warc_base_path=None):
    """Initialise the base exporter with the Tumblr iterator and status table.

    Delegates to ``BaseExporter.__init__`` with ``TumblrWarcIter`` and
    ``TumblrStatusTable``; the remaining arguments are passed through
    unchanged.
    """
    passthrough = {"mq_config": mq_config, "warc_base_path": warc_base_path}
    BaseExporter.__init__(
        self,
        api_base_url,
        TumblrWarcIter,
        TumblrStatusTable,
        working_path,
        **passthrough
    )
def test_export_collection(self, mock_producer_cls, mock_api_client_cls):
    """CSV export of a collection with dedupe and item/harvest date filters.

    Verifies the arguments handed to the API client and to the table
    class, that ``test1_001.csv`` has three lines, and the first two calls
    recorded on the mocked producer (a "running" and a "completed success"
    status message).
    """
    collection_id = "005b131f5f854402afa2b08a4b7ba960"
    item_date_start = "2007-01-25T12:00:00Z"
    item_date_end = "2008-02-25T12:00:00Z"
    harvest_date_start = "2007-03-25T12:00:00Z"
    harvest_date_end = "2008-04-25T12:00:00Z"

    mock_warc_iter_cls = MagicMock()
    table = MagicMock(spec=BaseTable)
    # Iterating the table yields a single list of three row tuples.
    table.__iter__ = Mock(return_value=iter([[("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]]))
    mock_table_cls = MagicMock()
    mock_table_cls.side_effect = [table]

    api_client = MagicMock(spec=ApiClient)
    api_client.warcs.side_effect = [self.warcs]
    mock_api_client_cls.side_effect = [api_client]

    connection = MagicMock(spec=Connection)
    exchange = MagicMock(spec=Exchange)
    exchange.name = "test exchange"
    producer = MagicMock(spec=Producer)
    mock_producer_cls.return_value = producer

    exporter = BaseExporter(
        "http://test",
        mock_warc_iter_cls,
        mock_table_cls,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter.mq_config = True
    exporter._producer_connection = connection
    exporter.exchange = exchange
    exporter.routing_key = "export.start.test.test_user"
    exporter.message = {
        "id": "test1",
        "type": "test_user",
        "collection": {"id": collection_id},
        "format": "csv",
        "segment_size": None,
        "path": self.export_path,
        "dedupe": True,
        "item_date_start": item_date_start,
        "item_date_end": item_date_end,
        "harvest_date_start": harvest_date_start,
        "harvest_date_end": harvest_date_end,
    }
    exporter.on_message()

    mock_api_client_cls.assert_called_once_with("http://test")
    api_client.warcs.assert_called_once_with(
        exclude_web=True,
        collection_id=collection_id,
        seed_ids=[],
        harvest_date_start=harvest_date_start,
        harvest_date_end=harvest_date_end,
    )
    # The table is expected to receive the item dates parsed into datetimes.
    mock_table_cls.assert_called_once_with(
        self.warc_filepaths,
        True,
        iso8601.parse_date(item_date_start),
        iso8601.parse_date(item_date_end),
        [],
        None,
    )

    self.assertTrue(exporter.result.success)
    csv_path = os.path.join(self.export_path, "test1_001.csv")
    self.assertTrue(os.path.exists(csv_path))
    with open(csv_path, "r") as csv_file:
        lines = csv_file.readlines()
    self.assertEqual(3, len(lines))

    # (index into producer.mock_calls, expected status, check date_ended?)
    expectations = ((0, "running", False), (1, "completed success", True))
    for call_index, expected_status, has_end_date in expectations:
        name, _, kwargs = producer.mock_calls[call_index]
        self.assertEqual("publish", name)
        self.assertEqual("export.status.test.test_user", kwargs["routing_key"])
        status_message = kwargs["body"]
        self.assertEqual(expected_status, status_message["status"])
        self.assertTrue(iso8601.parse_date(status_message["date_started"]))
        if has_end_date:
            self.assertTrue(iso8601.parse_date(status_message["date_ended"]))
        self.assertEqual("test1", status_message["id"])
        self.assertEqual("Base Exporter", status_message["service"])
        self.assertEqual("testhost", status_message["host"])
        self.assertTrue(status_message["instance"])
def test_export_seeds(self, mock_api_client_cls):
    """CSV export limited to two seeds, without a collection.

    Checks that the API client is asked for WARCs by seed id, that the
    table class receives the seed uids, and that ``test2_001.csv`` is
    written with three lines.
    """
    seed_ids = ("005b131f5f854402afa2b08a4b7ba960", "105b131f5f854402afa2b08a4b7ba960")
    seed_uids = ("uid1", "uid2")

    mock_warc_iter_cls = MagicMock()
    table = MagicMock(spec=BaseTable)
    # Iterating the table yields a single list of three row tuples.
    table.__iter__ = Mock(return_value=iter([[("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]]))
    mock_table_cls = MagicMock()
    mock_table_cls.side_effect = [table]

    api_client = MagicMock(spec=ApiClient)
    api_client.warcs.side_effect = [self.warcs]
    mock_api_client_cls.side_effect = [api_client]

    export_message = {
        "id": "test2",
        "type": "test_user",
        "seeds": [{"id": seed_id, "uid": seed_uid} for seed_id, seed_uid in zip(seed_ids, seed_uids)],
        "format": "csv",
        "segment_size": None,
        "path": self.export_path,
    }

    exporter = BaseExporter(
        "http://test",
        mock_warc_iter_cls,
        mock_table_cls,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter.routing_key = "export.start.test.test_user"
    exporter.message = export_message
    exporter.on_message()

    mock_api_client_cls.assert_called_once_with("http://test")
    api_client.warcs.assert_called_once_with(
        collection_id=None,
        seed_ids=list(seed_ids),
        harvest_date_start=None,
        harvest_date_end=None,
    )
    mock_table_cls.assert_called_once_with(self.warc_filepaths, False, None, None, list(seed_uids), None)

    self.assertTrue(exporter.result.success)
    csv_path = os.path.join(self.export_path, "test2_001.csv")
    self.assertTrue(os.path.exists(csv_path))
    with open(csv_path, "r") as csv_file:
        lines = csv_file.readlines()
    self.assertEqual(3, len(lines))
def test_export_collection_and_seeds(self, mock_producer_cls, mock_api_client_cls):
    """A collection export fails when the API's only WARC cannot be used.

    The mocked API client returns one WARC record (path ``xtest_1-...``).
    The export must finish unsuccessfully and the second call recorded on
    the mocked producer must be a "completed failure" status message whose
    errors include both ``CODE_WARC_MISSING`` and ``CODE_NO_WARCS``.

    NOTE(review): the method name mentions seeds, but the message only
    carries a collection; the name is kept so test selection is unchanged.
    """
    mock_api_client = MagicMock(spec=ApiClient)
    mock_api_client_cls.side_effect = [mock_api_client]
    warcs = [
        {
            "warc_id": "9dc0b9c3a93a49eb8f713330b43f954c",
            "path": "xtest_1-20151202165907873-00000-306-60892de9dfc6-8001.warc.gz",
            "sha1": "000ffb3371eadb507d77d181ca3f0c5d3c74a2fc",
            "bytes": 460518,
            "date_created": "2016-02-22T14:49:07Z",
        }
    ]
    mock_api_client.warcs.side_effect = [warcs]

    mock_connection = MagicMock(spec=Connection)
    mock_exchange = MagicMock(spec=Exchange)
    mock_exchange.name = "test exchange"
    mock_producer = MagicMock(spec=Producer)
    mock_producer_cls.return_value = mock_producer

    export_message = {
        "id": "test2",
        "type": "test_user",
        "collection": {"id": "005b131f5f854402afa2b08a4b7ba960"},
        "format": "csv",
        "segment_size": None,
        "path": self.export_path,
    }

    exporter = BaseExporter(
        "http://test", None, None, self.working_path, warc_base_path=self.warc_base_path, host="testhost"
    )
    exporter.mq_config = True
    exporter._producer_connection = mock_connection
    exporter.exchange = mock_exchange
    exporter.routing_key = "export.start.test.test_user"
    exporter.message = export_message
    exporter.on_message()

    mock_api_client_cls.assert_called_once_with("http://test")
    mock_api_client.warcs.assert_called_once_with(
        exclude_web=True,
        collection_id="005b131f5f854402afa2b08a4b7ba960",
        seed_ids=[],
        harvest_date_end=None,
        harvest_date_start=None,
    )

    self.assertFalse(exporter.result.success)

    name, _, kwargs = mock_producer.mock_calls[1]
    self.assertEqual("publish", name)
    self.assertEqual("export.status.test.test_user", kwargs["routing_key"])
    export_status_message = kwargs["body"]
    self.assertEqual("completed failure", export_status_message["status"])
    self.assertTrue(iso8601.parse_date(export_status_message["date_started"]))
    self.assertTrue(iso8601.parse_date(export_status_message["date_ended"]))
    self.assertEqual("test2", export_status_message["id"])

    # The previous ``assertTrue(CODE, value)`` calls treated ``value`` as the
    # failure message and passed for any truthy code, so nothing was compared.
    # Check membership instead of a fixed index: both codes were asserted
    # against errors[0], which cannot hold two different values.
    reported_codes = [error["code"] for error in export_status_message["errors"]]
    self.assertIn(CODE_WARC_MISSING, reported_codes)
    self.assertIn(CODE_NO_WARCS, reported_codes)
def test_export_dehydrate(self, mock_producer, mock_api_client_cls):
    """"dehydrate" export writes the table's id-field column to a text file.

    With ``id_field`` mocked to return ``"key2"``, ``test1_001.txt`` must
    contain two lines starting with ``k2v1``; the call at index 3 recorded
    on the mocked producer must carry a "completed success" status.
    """
    collection_id = "005b131f5f854402afa2b08a4b7ba960"

    mock_warc_iter_cls = MagicMock()
    table = MagicMock(spec=BaseTable)
    # Iterating the table yields a single list of three row tuples.
    table.__iter__ = Mock(return_value=iter([[("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]]))
    table.id_field.return_value = "key2"
    mock_table_cls = MagicMock()
    mock_table_cls.side_effect = [table]

    api_client = MagicMock(spec=ApiClient)
    api_client.warcs.side_effect = [self.warcs]
    mock_api_client_cls.side_effect = [api_client]

    connection = MagicMock(spec=Connection)
    exchange = MagicMock(spec=Exchange)
    exchange.name = "test exchange"

    exporter = BaseExporter(
        "http://test",
        mock_warc_iter_cls,
        mock_table_cls,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter.mq_config = True
    exporter._producer_connection = connection
    exporter.exchange = exchange
    exporter.routing_key = "export.start.test.test_user"
    exporter.message = {
        "id": "test1",
        "type": "test_user",
        "collection": {"id": collection_id},
        "format": "dehydrate",
        "segment_size": None,
        "path": self.export_path,
    }
    exporter.on_message()

    mock_api_client_cls.assert_called_once_with("http://test")
    api_client.warcs.assert_called_once_with(
        collection_id=collection_id,
        seed_ids=[],
        harvest_date_end=None,
        harvest_date_start=None,
    )
    mock_table_cls.assert_called_once_with(self.warc_filepaths, False, None, None, [], None)

    self.assertTrue(exporter.result.success)
    txt_path = os.path.join(self.export_path, "test1_001.txt")
    self.assertTrue(os.path.exists(txt_path))
    with open(txt_path, "r") as txt_file:
        lines = txt_file.readlines()
    self.assertEqual(2, len(lines))
    self.assertEqual("k2v1\n", lines[0])

    _, _, kwargs = mock_producer.mock_calls[3]
    self.assertEqual("export.status.test.test_user", kwargs["routing_key"])
    status_message = kwargs["body"]
    self.assertEqual("completed success", status_message["status"])
    self.assertEqual("test1", status_message["id"])
def test_export_dehydrate(self, mock_producer_cls, mock_api_client_cls):
    """"dehydrate" export writes the table's id-field column to a text file.

    With ``id_field`` mocked to return ``"key2"``, ``test1_001.txt`` must
    contain two lines starting with ``k2v1``; the second call recorded on
    the mocked producer must be a ``publish`` with status
    "completed success".
    """
    collection_id = "005b131f5f854402afa2b08a4b7ba960"

    mock_warc_iter_cls = MagicMock()
    table = MagicMock(spec=BaseTable)
    # Iterating the table yields a single list of three row tuples.
    table.__iter__ = Mock(return_value=iter([[("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]]))
    table.id_field.return_value = "key2"
    mock_table_cls = MagicMock()
    mock_table_cls.side_effect = [table]

    api_client = MagicMock(spec=ApiClient)
    api_client.warcs.side_effect = [self.warcs]
    mock_api_client_cls.side_effect = [api_client]

    connection = MagicMock(spec=Connection)
    exchange = MagicMock(spec=Exchange)
    exchange.name = "test exchange"
    producer = MagicMock(spec=Producer)
    mock_producer_cls.return_value = producer

    exporter = BaseExporter(
        "http://test",
        mock_warc_iter_cls,
        mock_table_cls,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter.mq_config = True
    exporter._producer_connection = connection
    exporter.exchange = exchange
    exporter.routing_key = "export.start.test.test_user"
    exporter.message = {
        "id": "test1",
        "type": "test_user",
        "collection": {"id": collection_id},
        "format": "dehydrate",
        "segment_size": None,
        "path": self.export_path,
    }
    exporter.on_message()

    mock_api_client_cls.assert_called_once_with("http://test")
    api_client.warcs.assert_called_once_with(
        exclude_web=True,
        collection_id=collection_id,
        seed_ids=[],
        harvest_date_end=None,
        harvest_date_start=None,
    )
    mock_table_cls.assert_called_once_with(self.warc_filepaths, False, None, None, [], None)

    self.assertTrue(exporter.result.success)
    txt_path = os.path.join(self.export_path, "test1_001.txt")
    self.assertTrue(os.path.exists(txt_path))
    with open(txt_path, "r") as txt_file:
        lines = txt_file.readlines()
    self.assertEqual(2, len(lines))
    self.assertEqual("k2v1\n", lines[0])

    name, _, kwargs = producer.mock_calls[1]
    self.assertEqual("publish", name)
    self.assertEqual("export.status.test.test_user", kwargs["routing_key"])
    status_message = kwargs["body"]
    self.assertEqual("completed success", status_message["status"])
    self.assertEqual("test1", status_message["id"])
def test_export_collection(self, mock_producer, mock_api_client_cls):
    """CSV export of a collection with dedupe and item/harvest date filters.

    Verifies the arguments handed to the API client and to the table
    class, that ``test1_001.csv`` has three lines, and the calls at
    indexes 1 and 3 recorded on the mocked producer (a "running" and a
    "completed success" status message).
    """
    collection_id = "005b131f5f854402afa2b08a4b7ba960"
    item_date_start = "2007-01-25T12:00:00Z"
    item_date_end = "2008-02-25T12:00:00Z"
    harvest_date_start = "2007-03-25T12:00:00Z"
    harvest_date_end = "2008-04-25T12:00:00Z"

    mock_warc_iter_cls = MagicMock()
    table = MagicMock(spec=BaseTable)
    # Iterating the table yields a single list of three row tuples.
    table.__iter__ = Mock(return_value=iter([[("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]]))
    mock_table_cls = MagicMock()
    mock_table_cls.side_effect = [table]

    api_client = MagicMock(spec=ApiClient)
    api_client.warcs.side_effect = [self.warcs]
    mock_api_client_cls.side_effect = [api_client]

    connection = MagicMock(spec=Connection)
    exchange = MagicMock(spec=Exchange)
    exchange.name = "test exchange"

    exporter = BaseExporter(
        "http://test",
        mock_warc_iter_cls,
        mock_table_cls,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter.mq_config = True
    exporter._producer_connection = connection
    exporter.exchange = exchange
    exporter.routing_key = "export.start.test.test_user"
    exporter.message = {
        "id": "test1",
        "type": "test_user",
        "collection": {"id": collection_id},
        "format": "csv",
        "segment_size": None,
        "path": self.export_path,
        "dedupe": True,
        "item_date_start": item_date_start,
        "item_date_end": item_date_end,
        "harvest_date_start": harvest_date_start,
        "harvest_date_end": harvest_date_end,
    }
    exporter.on_message()

    mock_api_client_cls.assert_called_once_with("http://test")
    api_client.warcs.assert_called_once_with(
        collection_id=collection_id,
        seed_ids=[],
        harvest_date_start=harvest_date_start,
        harvest_date_end=harvest_date_end,
    )
    # The table is expected to receive the item dates parsed into datetimes.
    mock_table_cls.assert_called_once_with(
        self.warc_filepaths,
        True,
        iso8601.parse_date(item_date_start),
        iso8601.parse_date(item_date_end),
        [],
        None,
    )

    self.assertTrue(exporter.result.success)
    csv_path = os.path.join(self.export_path, "test1_001.csv")
    self.assertTrue(os.path.exists(csv_path))
    with open(csv_path, "r") as csv_file:
        lines = csv_file.readlines()
    self.assertEqual(3, len(lines))

    # (index into mock_producer.mock_calls, expected status, check date_ended?)
    expectations = ((1, "running", False), (3, "completed success", True))
    for call_index, expected_status, has_end_date in expectations:
        _, _, kwargs = mock_producer.mock_calls[call_index]
        self.assertEqual("export.status.test.test_user", kwargs["routing_key"])
        status_message = kwargs["body"]
        self.assertEqual(expected_status, status_message["status"])
        self.assertTrue(iso8601.parse_date(status_message["date_started"]))
        if has_end_date:
            self.assertTrue(iso8601.parse_date(status_message["date_ended"]))
        self.assertEqual("test1", status_message["id"])
        self.assertEqual("Base Exporter", status_message["service"])
        self.assertEqual("testhost", status_message["host"])
        self.assertTrue(status_message["instance"])
def test_export_full_json_segment(self):
    """``_full_json_export`` splits seven items into segments of three.

    Expects ``test_001.json`` and ``test_002.json`` with three lines each
    and ``test_003.json`` with one line; the first line of every segment is
    compared with the item that should open it.
    """
    iterator = MagicMock()
    iterator.iter.return_value = [
        IterItem(None, None, None, None, {"key1": "k1v%d" % n, "key2": "k2v%d" % n, "key3": "k3v%d" % n})
        for n in range(1, 8)
    ]
    mock_warc_iter_cls = MagicMock()
    mock_warc_iter_cls.side_effect = [iterator]

    export_filepath = os.path.join(self.export_path, "test")
    now = datetime_now()
    limit_uids = [11, 14]

    exporter = BaseExporter(
        None,
        mock_warc_iter_cls,
        None,
        self.working_path,
        warc_base_path=self.warc_base_path,
        host="testhost",
    )
    exporter._full_json_export(self.warcs, export_filepath, True, now, None, limit_uids, 3)

    mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
    iterator.iter.assert_called_once_with(
        dedupe=True,
        item_date_start=now,
        item_date_end=None,
        limit_item_types=None,
    )

    # Segment number -> number of lines that segment file should hold.
    expected_line_counts = ((1, 3), (2, 3), (3, 1))
    for segment_no, expected_count in expected_line_counts:
        segment_path = "{}_{}.json".format(export_filepath, str(segment_no).zfill(3))
        self.assertTrue(os.path.exists(segment_path))
        with open(segment_path, "r") as segment_file:
            lines = segment_file.readlines()
        self.assertEqual(expected_count, len(lines))
        # Each segment starts at item 1, 4, 7 respectively.
        suffix = str(1 + (segment_no - 1) * 3)
        self.assertDictEqual(
            {"key1": "k1v" + suffix, "key2": "k2v" + suffix, "key3": "k3v" + suffix},
            json.loads(lines[0]),
        )
def test_export_collection_and_seeds(self, mock_producer, mock_api_client_cls):
    """A collection export fails when the API's only WARC cannot be used.

    The mocked API client returns one WARC record (path ``xtest_1-...``).
    The export must finish unsuccessfully and the call at index 3 recorded
    on the mocked producer must carry a "completed failure" status message
    whose errors include both ``CODE_WARC_MISSING`` and ``CODE_NO_WARCS``.

    NOTE(review): the method name mentions seeds, but the message only
    carries a collection; the name is kept so test selection is unchanged.
    """
    mock_api_client = MagicMock(spec=ApiClient)
    mock_api_client_cls.side_effect = [mock_api_client]
    warcs = [
        {
            "warc_id": "9dc0b9c3a93a49eb8f713330b43f954c",
            "path": "xtest_1-20151202165907873-00000-306-60892de9dfc6-8001.warc.gz",
            "sha1": "000ffb3371eadb507d77d181ca3f0c5d3c74a2fc",
            "bytes": 460518,
            "date_created": "2016-02-22T14:49:07Z",
        }
    ]
    mock_api_client.warcs.side_effect = [warcs]

    mock_connection = MagicMock(spec=Connection)
    mock_exchange = MagicMock(spec=Exchange)
    mock_exchange.name = "test exchange"

    export_message = {
        "id": "test2",
        "type": "test_user",
        "collection": {"id": "005b131f5f854402afa2b08a4b7ba960"},
        "format": "csv",
        "segment_size": None,
        "path": self.export_path,
    }

    exporter = BaseExporter(
        "http://test", None, None, self.working_path, warc_base_path=self.warc_base_path, host="testhost"
    )
    exporter.mq_config = True
    exporter._producer_connection = mock_connection
    exporter.exchange = mock_exchange
    exporter.routing_key = "export.start.test.test_user"
    exporter.message = export_message
    exporter.on_message()

    mock_api_client_cls.assert_called_once_with("http://test")
    mock_api_client.warcs.assert_called_once_with(
        collection_id="005b131f5f854402afa2b08a4b7ba960",
        seed_ids=[],
        harvest_date_end=None,
        harvest_date_start=None,
    )

    self.assertFalse(exporter.result.success)

    name, _, kwargs = mock_producer.mock_calls[3]
    self.assertEqual("export.status.test.test_user", kwargs["routing_key"])
    export_status_message = kwargs["body"]
    self.assertEqual("completed failure", export_status_message["status"])
    self.assertTrue(iso8601.parse_date(export_status_message["date_started"]))
    self.assertTrue(iso8601.parse_date(export_status_message["date_ended"]))
    self.assertEqual("test2", export_status_message["id"])

    # The previous ``assertTrue(CODE, value)`` calls treated ``value`` as the
    # failure message and passed for any truthy code, so nothing was compared.
    # Check membership instead of a fixed index: both codes were asserted
    # against errors[0], which cannot hold two different values.
    reported_codes = [error["code"] for error in export_status_message["errors"]]
    self.assertIn(CODE_WARC_MISSING, reported_codes)
    self.assertIn(CODE_NO_WARCS, reported_codes)