Esempio n. 1
0
    def test_export_collection_missing_warc(self, mock_api_client_cls):
        """Send an export message that names both a collection and a seed.

        The exporter is built without WARC-iterator or table classes. The run
        is expected to be unsuccessful, with CODE_BAD_REQUEST as the code of
        the first recorded error.
        """
        api_client = MagicMock(spec=ApiClient)
        mock_api_client_cls.side_effect = [api_client]

        # The collection and the seed carry the same identifier in this message.
        shared_id = "005b131f5f854402afa2b08a4b7ba960"
        message = dict(
            id="test3",
            type="test_user",
            collection={"id": shared_id},
            seeds=[{"id": shared_id, "uid": "uid1"}],
            format="csv",
            segment_size=None,
            path=self.export_path,
        )

        exporter = BaseExporter(
            "http://test",
            None,
            None,
            self.working_path,
            warc_base_path=self.warc_base_path,
            host="testhost",
        )
        exporter.routing_key = "export.start.test.test_user"
        exporter.message = message
        exporter.on_message()

        # Exactly one API client must have been built, for the configured URL.
        mock_api_client_cls.assert_called_once_with("http://test")

        outcome = exporter.result
        self.assertFalse(outcome.success)
        self.assertEqual(CODE_BAD_REQUEST, outcome.errors[0].code)
Esempio n. 2
0
    def test_export_collection_missing_warc(self, mock_api_client_cls):
        """Check that a message with both ``collection`` and ``seeds`` is rejected.

        Asserts that the export result is unsuccessful and that its first
        error carries CODE_BAD_REQUEST.
        """
        stub_client = MagicMock(spec=ApiClient)
        mock_api_client_cls.side_effect = [stub_client]

        collection = {"id": "005b131f5f854402afa2b08a4b7ba960"}
        seed = {"id": "005b131f5f854402afa2b08a4b7ba960", "uid": "uid1"}
        request = {
            "id": "test3",
            "type": "test_user",
            "collection": collection,
            "seeds": [seed],
            "format": "csv",
            "segment_size": None,
            "path": self.export_path,
        }

        # No WARC-iterator class and no table class are supplied here.
        exporter = BaseExporter(
            "http://test", None, None, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost",
        )
        exporter.routing_key = "export.start.test.test_user"
        exporter.message = request
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")

        self.assertFalse(exporter.result.success)
        leading_error = exporter.result.errors[0]
        self.assertEqual(CODE_BAD_REQUEST, leading_error.code)
Esempio n. 3
0
    def test_export_seeds(self, mock_api_client_cls):
        """Run a CSV export selected by two seeds.

        Checks the WARC query issued to the API client, the arguments used to
        construct the table, and that a three-line CSV file is written to the
        export path.
        """
        # The table mock yields a single chunk of three rows.
        chunk = [("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]
        table = MagicMock(spec=BaseTable)
        table.__iter__ = Mock(return_value=iter([chunk]))
        table_cls = MagicMock(side_effect=[table])
        warc_iter_cls = MagicMock()

        api_client = MagicMock(spec=ApiClient)
        api_client.warcs.side_effect = [self.warcs]
        mock_api_client_cls.side_effect = [api_client]

        first_seed_id = "005b131f5f854402afa2b08a4b7ba960"
        second_seed_id = "105b131f5f854402afa2b08a4b7ba960"
        message = dict(
            id="test2",
            type="test_user",
            seeds=[
                {"id": first_seed_id, "uid": "uid1"},
                {"id": second_seed_id, "uid": "uid2"},
            ],
            format="csv",
            segment_size=None,
            path=self.export_path,
        )

        exporter = BaseExporter(
            "http://test", warc_iter_cls, table_cls, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost",
        )
        exporter.routing_key = "export.start.test.test_user"
        exporter.message = message
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        # Seed ids go to the WARC query; seed uids go to the table constructor.
        api_client.warcs.assert_called_once_with(
            collection_id=None,
            seed_ids=[first_seed_id, second_seed_id],
            harvest_date_start=None,
            harvest_date_end=None,
            exclude_web=True,
        )
        table_cls.assert_called_once_with(
            self.warc_filepaths, False, None, None, ["uid1", "uid2"], None
        )

        self.assertTrue(exporter.result.success)
        csv_path = os.path.join(self.export_path, "test2_001.csv")
        self.assertTrue(os.path.exists(csv_path))
        with open(csv_path, "r") as csv_file:
            written = csv_file.readlines()
        self.assertEqual(3, len(written))
Esempio n. 4
0
    def test_export_collection(self, mock_producer_cls, mock_api_client_cls):
        """Export a whole collection as CSV with dedupe and date filters.

        Checks the WARC query sent to the API client, the arguments used to
        build the table, the CSV written to the export path, and the first two
        calls recorded on the producer mock ("running", then
        "completed success").
        """
        # The table mock yields a single chunk of three rows.
        chunk = [("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]
        table = MagicMock(spec=BaseTable)
        table.__iter__ = Mock(return_value=iter([chunk]))
        table_cls = MagicMock(side_effect=[table])
        warc_iter_cls = MagicMock()

        api_client = MagicMock(spec=ApiClient)
        api_client.warcs.side_effect = [self.warcs]
        mock_api_client_cls.side_effect = [api_client]

        # Messaging doubles, so status messages are captured rather than sent.
        connection = MagicMock(spec=Connection)
        exchange = MagicMock(spec=Exchange)
        exchange.name = "test exchange"
        producer = MagicMock(spec=Producer)
        mock_producer_cls.return_value = producer

        item_start = "2007-01-25T12:00:00Z"
        item_end = "2008-02-25T12:00:00Z"
        harvest_start = "2007-03-25T12:00:00Z"
        harvest_end = "2008-04-25T12:00:00Z"
        # Item dates are expected to reach the table as parsed datetimes.
        parsed_item_start = iso8601.parse_date(item_start)
        parsed_item_end = iso8601.parse_date(item_end)

        collection_id = "005b131f5f854402afa2b08a4b7ba960"
        message = dict(
            id="test1",
            type="test_user",
            collection={"id": collection_id},
            format="csv",
            segment_size=None,
            path=self.export_path,
            dedupe=True,
            item_date_start=item_start,
            item_date_end=item_end,
            harvest_date_start=harvest_start,
            harvest_date_end=harvest_end,
        )

        exporter = BaseExporter(
            "http://test", warc_iter_cls, table_cls, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost",
        )
        exporter.mq_config = True
        exporter._producer_connection = connection
        exporter.exchange = exchange

        exporter.routing_key = "export.start.test.test_user"
        exporter.message = message
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        # Harvest dates are passed to the API as the original strings.
        api_client.warcs.assert_called_once_with(
            collection_id=collection_id,
            seed_ids=[],
            harvest_date_start=harvest_start,
            harvest_date_end=harvest_end,
            exclude_web=True,
        )
        table_cls.assert_called_once_with(
            self.warc_filepaths, True, parsed_item_start, parsed_item_end, [], None
        )

        self.assertTrue(exporter.result.success)
        csv_path = os.path.join(self.export_path, "test1_001.csv")
        self.assertTrue(os.path.exists(csv_path))
        with open(csv_path, "r") as csv_file:
            written = csv_file.readlines()
        self.assertEqual(3, len(written))

        # First producer call: the "running" status message.
        call_name, _, call_kwargs = producer.mock_calls[0]
        self.assertEqual("publish", call_name)
        self.assertEqual("export.status.test.test_user", call_kwargs["routing_key"])
        status = call_kwargs["body"]
        self.assertEqual("running", status["status"])
        self.assertTrue(iso8601.parse_date(status["date_started"]))
        self.assertEqual("test1", status["id"])
        self.assertEqual("Base Exporter", status["service"])
        self.assertEqual("testhost", status["host"])
        self.assertTrue(status["instance"])

        # Second producer call: the final "completed success" status message.
        call_name, _, call_kwargs = producer.mock_calls[1]
        self.assertEqual("publish", call_name)
        self.assertEqual("export.status.test.test_user", call_kwargs["routing_key"])
        status = call_kwargs["body"]
        self.assertEqual("completed success", status["status"])
        self.assertTrue(iso8601.parse_date(status["date_started"]))
        self.assertTrue(iso8601.parse_date(status["date_ended"]))
        self.assertEqual("test1", status["id"])
        self.assertEqual("Base Exporter", status["service"])
        self.assertEqual("testhost", status["host"])
        self.assertTrue(status["instance"])
Esempio n. 5
0
    def test_export_collection_and_seeds(self, mock_producer_cls, mock_api_client_cls):
        """Export a collection whose only listed WARC should make the export fail.

        The API client mock returns a single WARC record whose path starts
        with "xtest_" (presumably a file that is absent under
        ``warc_base_path`` -- confirm against the fixture). The test checks
        the WARC query, that the result is unsuccessful, and that the second
        call recorded on the producer mock is a "completed failure" status
        message reporting both CODE_WARC_MISSING and CODE_NO_WARCS.
        """
        mock_api_client = MagicMock(spec=ApiClient)
        mock_api_client_cls.side_effect = [mock_api_client]
        warcs = [
            {
                "warc_id": "9dc0b9c3a93a49eb8f713330b43f954c",
                "path": "xtest_1-20151202165907873-00000-306-60892de9dfc6-8001.warc.gz",
                "sha1": "000ffb3371eadb507d77d181ca3f0c5d3c74a2fc",
                "bytes": 460518,
                "date_created": "2016-02-22T14:49:07Z",
            }
        ]
        mock_api_client.warcs.side_effect = [warcs]

        # Messaging doubles, so status messages are captured rather than sent.
        mock_connection = MagicMock(spec=Connection)
        mock_exchange = MagicMock(spec=Exchange)
        mock_exchange.name = "test exchange"
        mock_producer = MagicMock(spec=Producer)
        mock_producer_cls.return_value = mock_producer

        export_message = {
            "id": "test2",
            "type": "test_user",
            "collection": {"id": "005b131f5f854402afa2b08a4b7ba960"},
            "format": "csv",
            "segment_size": None,
            "path": self.export_path,
        }

        exporter = BaseExporter(
            "http://test", None, None, self.working_path, warc_base_path=self.warc_base_path, host="testhost"
        )
        exporter.mq_config = True
        exporter._producer_connection = mock_connection
        exporter.exchange = mock_exchange

        exporter.routing_key = "export.start.test.test_user"
        exporter.message = export_message
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        mock_api_client.warcs.assert_called_once_with(
            exclude_web=True,
            collection_id="005b131f5f854402afa2b08a4b7ba960",
            seed_ids=[],
            harvest_date_end=None,
            harvest_date_start=None,
        )

        self.assertFalse(exporter.result.success)

        # The second producer call is expected to carry the final status.
        name, _, kwargs = mock_producer.mock_calls[1]
        self.assertEqual("publish", name)
        self.assertEqual("export.status.test.test_user", kwargs["routing_key"])
        export_status_message = kwargs["body"]
        self.assertEqual("completed failure", export_status_message["status"])
        self.assertTrue(iso8601.parse_date(export_status_message["date_started"]))
        self.assertTrue(iso8601.parse_date(export_status_message["date_ended"]))
        self.assertEqual("test2", export_status_message["id"])

        # Use real comparisons here: assertTrue(expected, actual) would treat
        # `actual` as the failure message and pass for any truthy `expected`.
        # NOTE(review): the position of each code in the error list is not
        # pinned, since both cannot be at index 0 -- confirm the order against
        # the exporter if a stricter check is wanted.
        error_codes = [error["code"] for error in export_status_message["errors"]]
        self.assertIn(CODE_WARC_MISSING, error_codes)
        self.assertIn(CODE_NO_WARCS, error_codes)
Esempio n. 6
0
    def test_export_dehydrate(self, mock_producer_cls, mock_api_client_cls):
        """Export a collection in "dehydrate" format.

        The table mock reports "key2" as its id field. The test expects a
        two-line text file whose first line is "k2v1", plus a
        "completed success" status as the second call recorded on the producer
        mock.
        """
        # The table mock yields a single chunk of three rows.
        chunk = [("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]
        table = MagicMock(spec=BaseTable)
        table.__iter__ = Mock(return_value=iter([chunk]))
        table.id_field.return_value = "key2"
        table_cls = MagicMock(side_effect=[table])
        warc_iter_cls = MagicMock()

        api_client = MagicMock(spec=ApiClient)
        api_client.warcs.side_effect = [self.warcs]
        mock_api_client_cls.side_effect = [api_client]

        # Messaging doubles, so status messages are captured rather than sent.
        connection = MagicMock(spec=Connection)
        exchange = MagicMock(spec=Exchange)
        exchange.name = "test exchange"
        producer = MagicMock(spec=Producer)
        mock_producer_cls.return_value = producer

        collection_id = "005b131f5f854402afa2b08a4b7ba960"
        message = dict(
            id="test1",
            type="test_user",
            collection={"id": collection_id},
            format="dehydrate",
            segment_size=None,
            path=self.export_path,
        )

        exporter = BaseExporter(
            "http://test", warc_iter_cls, table_cls, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost",
        )
        exporter.mq_config = True
        exporter._producer_connection = connection
        exporter.exchange = exchange

        exporter.routing_key = "export.start.test.test_user"
        exporter.message = message
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        api_client.warcs.assert_called_once_with(
            collection_id=collection_id,
            seed_ids=[],
            harvest_date_start=None,
            harvest_date_end=None,
            exclude_web=True,
        )
        table_cls.assert_called_once_with(self.warc_filepaths, False, None, None, [], None)

        self.assertTrue(exporter.result.success)
        txt_path = os.path.join(self.export_path, "test1_001.txt")
        self.assertTrue(os.path.exists(txt_path))
        with open(txt_path, "r") as txt_file:
            written = txt_file.readlines()
        self.assertEqual(2, len(written))
        self.assertEqual("k2v1\n", written[0])

        # Second producer call: the final status message.
        call_name, _, call_kwargs = producer.mock_calls[1]
        self.assertEqual("publish", call_name)
        self.assertEqual("export.status.test.test_user", call_kwargs["routing_key"])
        status = call_kwargs["body"]
        self.assertEqual("completed success", status["status"])
        self.assertEqual("test1", status["id"])
Esempio n. 7
0
    def test_export_collection(self, mock_producer, mock_api_client_cls):
        """Export a whole collection as CSV with dedupe and date filters.

        Checks the WARC query sent to the API client, the arguments used to
        build the table, the CSV written to the export path, and the status
        messages found at indexes 1 and 3 of the calls recorded on
        ``mock_producer`` ("running", then "completed success").
        """
        # The table mock yields a single chunk of three rows.
        chunk = [("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]
        table = MagicMock(spec=BaseTable)
        table.__iter__ = Mock(return_value=iter([chunk]))
        table_cls = MagicMock(side_effect=[table])
        warc_iter_cls = MagicMock()

        api_client = MagicMock(spec=ApiClient)
        api_client.warcs.side_effect = [self.warcs]
        mock_api_client_cls.side_effect = [api_client]

        connection = MagicMock(spec=Connection)
        exchange = MagicMock(spec=Exchange)
        exchange.name = "test exchange"

        item_start = "2007-01-25T12:00:00Z"
        item_end = "2008-02-25T12:00:00Z"
        harvest_start = "2007-03-25T12:00:00Z"
        harvest_end = "2008-04-25T12:00:00Z"
        # Item dates are expected to reach the table as parsed datetimes.
        parsed_item_start = iso8601.parse_date(item_start)
        parsed_item_end = iso8601.parse_date(item_end)

        collection_id = "005b131f5f854402afa2b08a4b7ba960"
        message = dict(
            id="test1",
            type="test_user",
            collection={"id": collection_id},
            format="csv",
            segment_size=None,
            path=self.export_path,
            dedupe=True,
            item_date_start=item_start,
            item_date_end=item_end,
            harvest_date_start=harvest_start,
            harvest_date_end=harvest_end,
        )

        exporter = BaseExporter(
            "http://test", warc_iter_cls, table_cls, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost",
        )
        exporter.mq_config = True
        exporter._producer_connection = connection
        exporter.exchange = exchange

        exporter.routing_key = "export.start.test.test_user"
        exporter.message = message
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        # Harvest dates are passed to the API as the original strings.
        api_client.warcs.assert_called_once_with(
            harvest_date_start=harvest_start,
            harvest_date_end=harvest_end,
            collection_id=collection_id,
            seed_ids=[],
        )
        table_cls.assert_called_once_with(
            self.warc_filepaths, True, parsed_item_start, parsed_item_end, [], None
        )

        self.assertTrue(exporter.result.success)
        csv_path = os.path.join(self.export_path, "test1_001.csv")
        self.assertTrue(os.path.exists(csv_path))
        with open(csv_path, "r") as csv_file:
            written = csv_file.readlines()
        self.assertEqual(3, len(written))

        # Recorded call at index 1: the "running" status message.
        _, _, call_kwargs = mock_producer.mock_calls[1]
        self.assertEqual("export.status.test.test_user", call_kwargs["routing_key"])
        status = call_kwargs["body"]
        self.assertEqual("running", status["status"])
        self.assertTrue(iso8601.parse_date(status["date_started"]))
        self.assertEqual("test1", status["id"])
        self.assertEqual("Base Exporter", status["service"])
        self.assertEqual("testhost", status["host"])
        self.assertTrue(status["instance"])

        # Recorded call at index 3: the final "completed success" message.
        _, _, call_kwargs = mock_producer.mock_calls[3]
        self.assertEqual("export.status.test.test_user", call_kwargs["routing_key"])
        status = call_kwargs["body"]
        self.assertEqual("completed success", status["status"])
        self.assertTrue(iso8601.parse_date(status["date_started"]))
        self.assertTrue(iso8601.parse_date(status["date_ended"]))
        self.assertEqual("test1", status["id"])
        self.assertEqual("Base Exporter", status["service"])
        self.assertEqual("testhost", status["host"])
        self.assertTrue(status["instance"])
Esempio n. 8
0
    def test_export_collection_and_seeds(self, mock_producer,
                                         mock_api_client_cls):
        """Export a collection whose only listed WARC should make the export fail.

        The API client mock returns a single WARC record whose path starts
        with "xtest_" (presumably a file that is absent under
        ``warc_base_path`` -- confirm against the fixture). The test checks
        the WARC query, that the result is unsuccessful, and that the call
        recorded at index 3 on ``mock_producer`` is a "completed failure"
        status message reporting both CODE_WARC_MISSING and CODE_NO_WARCS.
        """
        mock_api_client = MagicMock(spec=ApiClient)
        mock_api_client_cls.side_effect = [mock_api_client]
        warcs = [{
            "warc_id": "9dc0b9c3a93a49eb8f713330b43f954c",
            "path":
            "xtest_1-20151202165907873-00000-306-60892de9dfc6-8001.warc.gz",
            "sha1": "000ffb3371eadb507d77d181ca3f0c5d3c74a2fc",
            "bytes": 460518,
            "date_created": "2016-02-22T14:49:07Z"
        }]
        mock_api_client.warcs.side_effect = [warcs]

        mock_connection = MagicMock(spec=Connection)
        mock_exchange = MagicMock(spec=Exchange)
        mock_exchange.name = "test exchange"

        export_message = {
            "id": "test2",
            "type": "test_user",
            "collection": {
                "id": "005b131f5f854402afa2b08a4b7ba960"
            },
            "format": "csv",
            "segment_size": None,
            "path": self.export_path
        }

        exporter = BaseExporter("http://test",
                                None,
                                None,
                                self.working_path,
                                warc_base_path=self.warc_base_path,
                                host="testhost")
        exporter.mq_config = True
        exporter._producer_connection = mock_connection
        exporter.exchange = mock_exchange

        exporter.routing_key = "export.start.test.test_user"
        exporter.message = export_message
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        mock_api_client.warcs.assert_called_once_with(
            collection_id="005b131f5f854402afa2b08a4b7ba960",
            seed_ids=[],
            harvest_date_end=None,
            harvest_date_start=None)

        self.assertFalse(exporter.result.success)

        # The call recorded at index 3 is expected to carry the final status.
        _, _, kwargs = mock_producer.mock_calls[3]
        self.assertEqual("export.status.test.test_user", kwargs["routing_key"])
        export_status_message = kwargs["body"]
        self.assertEqual("completed failure", export_status_message["status"])
        self.assertTrue(
            iso8601.parse_date(export_status_message["date_started"]))
        self.assertTrue(iso8601.parse_date(
            export_status_message["date_ended"]))
        self.assertEqual("test2", export_status_message["id"])

        # Use real comparisons here: assertTrue(expected, actual) would treat
        # `actual` as the failure message and pass for any truthy `expected`.
        # NOTE(review): the position of each code in the error list is not
        # pinned, since both cannot be at index 0 -- confirm the order against
        # the exporter if a stricter check is wanted.
        error_codes = [
            error["code"] for error in export_status_message["errors"]
        ]
        self.assertIn(CODE_WARC_MISSING, error_codes)
        self.assertIn(CODE_NO_WARCS, error_codes)
Esempio n. 9
0
    def test_export_seeds(self, mock_api_client_cls):
        """Run a CSV export selected by two seeds.

        Checks the WARC query issued to the API client, the arguments used to
        construct the table, and that a three-line CSV file is written to the
        export path.
        """
        # The table mock yields a single chunk of three rows.
        chunk = [("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]
        table = MagicMock(spec=BaseTable)
        table.__iter__ = Mock(return_value=iter([chunk]))
        table_cls = MagicMock(side_effect=[table])
        warc_iter_cls = MagicMock()

        api_client = MagicMock(spec=ApiClient)
        api_client.warcs.side_effect = [self.warcs]
        mock_api_client_cls.side_effect = [api_client]

        first_seed_id = "005b131f5f854402afa2b08a4b7ba960"
        second_seed_id = "105b131f5f854402afa2b08a4b7ba960"
        message = {
            "id": "test2",
            "type": "test_user",
            "seeds": [
                {"id": first_seed_id, "uid": "uid1"},
                {"id": second_seed_id, "uid": "uid2"},
            ],
            "format": "csv",
            "segment_size": None,
            "path": self.export_path,
        }

        exporter = BaseExporter(
            "http://test", warc_iter_cls, table_cls, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost",
        )
        exporter.routing_key = "export.start.test.test_user"
        exporter.message = message
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        # Seed ids go to the WARC query; seed uids go to the table constructor.
        api_client.warcs.assert_called_once_with(
            harvest_date_start=None,
            harvest_date_end=None,
            collection_id=None,
            seed_ids=[first_seed_id, second_seed_id],
        )
        table_cls.assert_called_once_with(
            self.warc_filepaths, False, None, None, ["uid1", "uid2"], None
        )

        self.assertTrue(exporter.result.success)
        csv_path = os.path.join(self.export_path, "test2_001.csv")
        self.assertTrue(os.path.exists(csv_path))
        with open(csv_path, "r") as csv_file:
            written = csv_file.readlines()
        self.assertEqual(3, len(written))
Esempio n. 10
0
    def test_export_dehydrate(self, mock_producer, mock_api_client_cls):
        """Export a collection in "dehydrate" format.

        The table mock reports "key2" as its id field. The test expects a
        two-line text file whose first line is "k2v1", plus a
        "completed success" status in the call recorded at index 3 on
        ``mock_producer``.
        """
        # The table mock yields a single chunk of three rows.
        chunk = [("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")]
        table = MagicMock(spec=BaseTable)
        table.__iter__ = Mock(return_value=iter([chunk]))
        table.id_field.return_value = "key2"
        table_cls = MagicMock(side_effect=[table])
        warc_iter_cls = MagicMock()

        api_client = MagicMock(spec=ApiClient)
        api_client.warcs.side_effect = [self.warcs]
        mock_api_client_cls.side_effect = [api_client]

        connection = MagicMock(spec=Connection)
        exchange = MagicMock(spec=Exchange)
        exchange.name = "test exchange"

        collection_id = "005b131f5f854402afa2b08a4b7ba960"
        message = dict(
            id="test1",
            type="test_user",
            collection={"id": collection_id},
            format="dehydrate",
            segment_size=None,
            path=self.export_path,
        )

        exporter = BaseExporter(
            "http://test", warc_iter_cls, table_cls, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost",
        )
        exporter.mq_config = True
        exporter._producer_connection = connection
        exporter.exchange = exchange

        exporter.routing_key = "export.start.test.test_user"
        exporter.message = message
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        api_client.warcs.assert_called_once_with(
            harvest_date_start=None,
            harvest_date_end=None,
            collection_id=collection_id,
            seed_ids=[],
        )
        table_cls.assert_called_once_with(self.warc_filepaths, False, None, None, [], None)

        self.assertTrue(exporter.result.success)
        txt_path = os.path.join(self.export_path, "test1_001.txt")
        self.assertTrue(os.path.exists(txt_path))
        with open(txt_path, "r") as txt_file:
            written = txt_file.readlines()
        self.assertEqual(2, len(written))
        self.assertEqual("k2v1\n", written[0])

        # Recorded call at index 3: the final status message.
        _, _, call_kwargs = mock_producer.mock_calls[3]
        self.assertEqual("export.status.test.test_user", call_kwargs["routing_key"])
        status = call_kwargs["body"]
        self.assertEqual("completed success", status["status"])
        self.assertEqual("test1", status["id"])