def test_export_full_json(self):
        mock_warc_iter_cls = MagicMock()
        mock_warc_iter = MagicMock()
        mock_warc_iter_cls.side_effect = [mock_warc_iter]
        mock_warc_iter.iter.return_value = [
            IterItem(None, None, None, None, {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}),
            IterItem(None, None, None, None, {"key1": "k1v2", "key2": "k2v2", "key3": "k3v2"}),
        ]

        export_filepath = os.path.join(self.export_path, "test")
        now = datetime_now()
        limit_uids = [11, 14]

        exporter = BaseExporter(
            None, mock_warc_iter_cls, None, self.working_path, warc_base_path=self.warc_base_path, host="testhost"
        )

        exporter._full_json_export(self.warcs, export_filepath, True, now, None, limit_uids, None)

        mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
        mock_warc_iter.iter.assert_called_once_with(
            dedupe=True, item_date_start=now, item_date_end=None, limit_item_types=None
        )

        file_path = export_filepath + "_001.json"
        self.assertTrue(os.path.exists(file_path))
        with open(file_path, "r") as f:
            lines = f.readlines()
        self.assertEqual(2, len(lines))
        self.assertDictEqual({"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}, json.loads(lines[0]))
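For context, here is a minimal sketch of what a line-oriented full-JSON export can look like. This is a hypothetical stand-in rather than sfm-utils' actual BaseExporter._full_json_export; it assumes the parsed item dict is the last field of IterItem and reproduces the "_NNN.json" naming and segmenting behavior that the tests in this listing exercise.

import json

def full_json_export(warc_iter_cls, warc_paths, export_filepath, dedupe,
                     item_date_start, item_date_end, limit_uids,
                     segment_row_size=None):
    # Hypothetical sketch: write one JSON object per line, starting a new
    # "<export_filepath>_NNN.json" file whenever segment_row_size is reached.
    warc_iter = warc_iter_cls(warc_paths, limit_uids)
    file_num, row_count, f = 0, 0, None
    try:
        for item in warc_iter.iter(dedupe=dedupe, item_date_start=item_date_start,
                                   item_date_end=item_date_end, limit_item_types=None):
            if f is None or (segment_row_size and row_count >= segment_row_size):
                if f is not None:
                    f.close()
                file_num += 1
                f = open("{}_{}.json".format(export_filepath, str(file_num).zfill(3)), "w")
                row_count = 0
            # item[-1] is assumed to be the parsed item dict (see the IterItem mocks).
            f.write(json.dumps(item[-1]))
            f.write("\n")
            row_count += 1
    finally:
        if f is not None:
            f.close()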
Example #2
def _write_readme(model_name, obj, path):
    readme_template = get_template('readme/{}.txt'.format(model_name))
    readme_txt = readme_template.render(Context({model_name: obj, "now": datetime_now()}))
    readme_filepath = os.path.join(path, "README.txt")
    log.debug("Writing %s README to %s: %s", model_name, readme_filepath, readme_txt)
    with codecs.open(readme_filepath, "w", encoding="utf-8") as f:
        f.write(readme_txt)
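To see the shape of this helper without a configured Django project, here is the same write-out with the standard library's string.Template standing in for the Django template. The template text and values are purely illustrative:

import codecs
import os
import string
import tempfile

# string.Template stands in for the Django template used above.
readme_template = string.Template("README for collection $name, generated $now.\n")
readme_txt = readme_template.substitute(name="my-collection", now="2024-01-01T00:00:00")

path = tempfile.mkdtemp()
readme_filepath = os.path.join(path, "README.txt")
with codecs.open(readme_filepath, "w", encoding="utf-8") as f:
    f.write(readme_txt)
print("Wrote", readme_filepath)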
Example #3
    def on_persist_exception(self, exception):
        log.error("Handling on persist exception for %s", self.message["id"])
        message = {
            "id": self.message["id"],
            "status": STATUS_FAILURE,
            "errors": [Msg(CODE_MSG_PERSIST_ERROR, str(exception)).to_map()],
            "date_started": datetime_now().isoformat(),
            "date_ended": datetime_now().isoformat(),
            # This will add spaces before caps
            "service": re.sub(r'(?<=[a-z])(?=[A-Z])', ' ',
                              self.__class__.__name__),
            "host": self.host,
            "instance": str(os.getpid())
        }

        # Routing key may be None
        status_routing_key = self.routing_key.replace("start", "status")
        self._publish_message(status_routing_key, message)
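The regex in the "service" field inserts a space at every lowercase-to-uppercase boundary, turning a CamelCase class name into a readable label. A quick standalone check (the class names are just examples):

import re

def spaced_name(name):
    # Insert a space at each lowercase-to-uppercase boundary.
    return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', name)

print(spaced_name("TwitterHarvester"))  # Twitter Harvester
print(spaced_name("BaseExporter"))      # Base Exporter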
Example #4
    def serialize_collection(self, collection, force_serialize=False):
        records_path = os.path.join(
            get_collection_path(collection, sfm_data_dir=self.data_dir),
            RECORD_DIR)
        log.debug("Collection records path is %s", records_path)

        # Determine whether to serialize
        if not force_serialize and not self._should_serialize(
                collection, records_path):
            log.info(
                "Skipping serialization of %s since no update since last serialization.",
                collection)
            return

        log.info("Serializing %s", collection)
        serialization_date = datetime_now()

        # Initialize records dir
        self._initialize_records_dir(records_path)

        # Serialize collection set, historical collection sets
        self._serialize_collection_set(collection.collection_set, records_path)

        # Serialize credentials, users, groups
        self._serialize_credentials(collection, records_path)

        # Collection
        collection_record_filepath = os.path.join(records_path,
                                                  COLLECTION_FILENAME)
        self._serialize_objs((collection, ), collection_record_filepath)
        log.debug("Serialized collection to %s", collection_record_filepath)

        # Historical collection
        historical_collection_record_filepath = os.path.join(
            records_path, HISTORICAL_COLLECTION_FILENAME)
        self._serialize_objs(collection.history.all(),
                             historical_collection_record_filepath)
        log.debug("Serialized historical collection to %s",
                  historical_collection_record_filepath)

        # Seeds
        self._serialize_seeds(collection, records_path)

        # Harvests, harvest stats, and warcs
        self._serialize_harvests(collection, records_path)

        # Info file
        self._write_info(serialization_date, records_path)

        # README
        self._write_readme(
            "collection", collection,
            get_collection_path(collection, sfm_data_dir=self.data_dir))
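_serialize_objs is not shown in this listing. Assuming the records are Django model instances, a plausible sketch using Django's built-in JSON serializer (an assumption, not necessarily the project's implementation):

import codecs
from django.core import serializers

def _serialize_objs(objs, filepath):
    # Plausible sketch: write Django model instances to a JSON record file.
    with codecs.open(filepath, "w", encoding="utf-8") as f:
        f.write(serializers.serialize("json", objs))

The matching read side would then recreate the objects with serializers.deserialize("json", ...).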
Example #5
    def test_table(self):

        mock_warc_iter_cls = MagicMock()
        mock_warc_iter = MagicMock()
        mock_warc_iter_cls.side_effect = [mock_warc_iter]
        mock_warc_iter.iter.return_value = [
            IterItem(None, None, None, None, {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}),
            IterItem(None, None, None, None, {"key1": "k1v2", "key2": "k2v2", "key3": "k3v2"}),
            IterItem(None, None, None, None, {"key1": "k1v3", "key2": "k2v3", "key3": "k3v3"}),
            IterItem(None, None, None, None, {"key1": "k1v4", "key2": "k2v4", "key3": "k3v4"}),
            IterItem(None, None, None, None, {"key1": "k1v5", "key2": "k2v5", "key3": "k3v5"}),
            IterItem(None, None, None, None, {"key1": "k1v6", "key2": "k2v6", "key3": "k3v6"}),
            IterItem(None, None, None, None, {"key1": "k1v7", "key2": "k2v7", "key3": "k3v7"}),
        ]
        now = datetime_now()
        limit_uids = [11, 14]

        tables = TestableTable(self.warc_paths, True, now, None, limit_uids, mock_warc_iter_cls, segment_row_size=2)
        chunk_cnt = 0
        for idx, table in enumerate(tables):
            chunk_cnt += 1
            for count, row in enumerate(table):
                # Every chunk should start with a header row.
                if count == 0:
                    # Header row
                    # All three header columns should match.
                    self.assertEqual("key1", row[0])
                    self.assertEqual("key2", row[1])
                    self.assertEqual("key3", row[2])
                # chunk 1 and row 2
                if idx == 0 and count == 1:
                    # First row
                    self.assertEqual("k1v1", row[0])
                    self.assertEqual("k2v1", row[1])
                    self.assertEqual("k3v1", row[2])
                # chunk 3 and row 3
                if idx == 2 and count == 2:
                    self.assertEqual("k1v6", row[0])
                    self.assertEqual("k2v6", row[1])
                    self.assertEqual("k3v6", row[2])
                # chunk 4 and row 2
                if idx == 3 and count == 1:
                    self.assertEqual("k1v7", row[0])
                    self.assertEqual("k2v7", row[1])
                    self.assertEqual("k3v7", row[2])

        self.assertEqual(4, chunk_cnt)

        mock_warc_iter_cls.assert_called_with(self.warc_paths, limit_uids)
        mock_warc_iter.iter.assert_called_once_with(
            dedupe=True, item_date_end=None, item_date_start=now, limit_item_types=None
        )
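The chunking this test expects (7 items with segment_row_size=2 yield 4 chunks, each beginning with a header row) can be sketched independently of the sfm-utils table class. This is hypothetical code with illustrative field names:

def segmented_tables(items, fieldnames, segment_row_size):
    # Hypothetical sketch: yield tables of at most segment_row_size data rows,
    # each prefixed with a header row, mirroring the chunking the test asserts.
    chunk = [fieldnames]
    for item in items:
        chunk.append([item[name] for name in fieldnames])
        if len(chunk) - 1 == segment_row_size:
            yield chunk
            chunk = [fieldnames]
    if len(chunk) > 1:
        yield chunk

items = [{"key1": "k1v%d" % i, "key2": "k2v%d" % i, "key3": "k3v%d" % i}
         for i in range(1, 8)]
tables = list(segmented_tables(items, ["key1", "key2", "key3"], 2))
assert len(tables) == 4
assert tables[3] == [["key1", "key2", "key3"], ["k1v7", "k2v7", "k3v7"]]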
Example #6
    def _finish_processing(self):
        # Otherwise, we will not get the last WARC on a stop.
        # A missing end time is OK on a container kill, since the harvest will resume and process the last file.
        # Queue any new files.
        # Wait for processing to complete.
        log.debug("Waiting for processing to complete.")
        self.warc_processing_queue.join()
        log.debug("Processing complete.")
        self.result.ended = datetime_now()

        # Send final message
        self._send_status_message(STATUS_SUCCESS if self.result.success else STATUS_FAILURE)

        # Delete result file
        if os.path.exists(self.result_filepath):
            os.remove(self.result_filepath)
Example #7
    def test_export_full_json(self):
        mock_warc_iter_cls = MagicMock()
        mock_warc_iter = MagicMock()
        mock_warc_iter_cls.side_effect = [mock_warc_iter]
        mock_warc_iter.iter.return_value = [
            IterItem(None, None, None, None, {
                "key1": "k1v1",
                "key2": "k2v1",
                "key3": "k3v1"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v2",
                "key2": "k2v2",
                "key3": "k3v2"
            })
        ]

        export_filepath = os.path.join(self.export_path, "test")
        now = datetime_now()
        limit_uids = [11, 14]

        exporter = BaseExporter(None,
                                mock_warc_iter_cls,
                                None,
                                self.working_path,
                                warc_base_path=self.warc_base_path,
                                host="testhost")

        exporter._full_json_export(self.warcs, export_filepath, True, now,
                                   None, limit_uids, None)

        mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
        mock_warc_iter.iter.assert_called_once_with(dedupe=True,
                                                    item_date_start=now,
                                                    item_date_end=None,
                                                    limit_item_types=None)

        file_path = export_filepath + '_001.json'
        self.assertTrue(os.path.exists(file_path))
        with open(file_path, "r") as f:
            lines = f.readlines()
        self.assertEqual(2, len(lines))
        self.assertDictEqual({
            "key1": "k1v1",
            "key2": "k2v1",
            "key3": "k3v1"
        }, json.loads(lines[0]))
Example #8
    def test_export_full_json_segment(self):
        mock_warc_iter_cls = MagicMock()
        mock_warc_iter = MagicMock()
        mock_warc_iter_cls.side_effect = [mock_warc_iter]
        mock_warc_iter.iter.return_value = [
            IterItem(None, None, None, None, {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}),
            IterItem(None, None, None, None, {"key1": "k1v2", "key2": "k2v2", "key3": "k3v2"}),
            IterItem(None, None, None, None, {"key1": "k1v3", "key2": "k2v3", "key3": "k3v3"}),
            IterItem(None, None, None, None, {"key1": "k1v4", "key2": "k2v4", "key3": "k3v4"}),
            IterItem(None, None, None, None, {"key1": "k1v5", "key2": "k2v5", "key3": "k3v5"}),
            IterItem(None, None, None, None, {"key1": "k1v6", "key2": "k2v6", "key3": "k3v6"}),
            IterItem(None, None, None, None, {"key1": "k1v7", "key2": "k2v7", "key3": "k3v7"}),
        ]

        export_filepath = os.path.join(self.export_path, "test")
        now = datetime_now()
        limit_uids = [11, 14]

        exporter = BaseExporter(
            None, mock_warc_iter_cls, None, self.working_path, warc_base_path=self.warc_base_path, host="testhost"
        )

        exporter._full_json_export(self.warcs, export_filepath, True, now, None, limit_uids, 3)

        mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
        mock_warc_iter.iter.assert_called_once_with(
            dedupe=True, item_date_start=now, item_date_end=None, limit_item_types=None
        )

        # Files test_001.json, test_002.json, and test_003.json
        for idx in range(3):
            file_path = export_filepath + "_" + str(idx + 1).zfill(3) + ".json"
            self.assertTrue(os.path.exists(file_path))
            with open(file_path, "r") as f:
                lines = f.readlines()
            # test_003.json has only 1 row
            if idx == 2:
                self.assertEqual(1, len(lines))
            else:
                self.assertEqual(3, len(lines))
            self.assertDictEqual(
                {"key1": "k1v" + str(1 + idx * 3), "key2": "k2v" + str(1 + idx * 3), "key3": "k3v" + str(1 + idx * 3)},
                json.loads(lines[0]),
            )
Example #9
    def _finish_processing(self):
        # Otherwise, we will not get the last WARC on a stop.
        # A missing end time is OK on a container kill, since the harvest will resume and process the last file.
        # Queue any new files.
        # Wait for processing to complete.
        log.debug("Waiting for processing to complete.")
        self.warc_processing_queue.join()
        log.debug("Processing complete.")

        if not self.is_pause:
            self.result.ended = datetime_now()

            # Send final message
            self._send_status_message(
                STATUS_SUCCESS if self.result.success else STATUS_FAILURE)

            # Delete result file
            if os.path.exists(self.result_filepath):
                os.remove(self.result_filepath)
        else:
            log.info("Pausing this harvest.")

            # Send final message
            self._send_status_message(STATUS_PAUSED)
Example #10
    def on_message(self):
        assert self.message

        export_id = self.message["id"]
        log.info("Performing export %s", export_id)

        self.result = ExportResult()
        self.result.started = datetime_now()

        # Send status indicating that it is running
        self._send_response_message(STATUS_RUNNING, self.routing_key,
                                    export_id, self.result)

        # Get the WARCs from the API
        collection_id = self.message.get("collection", {}).get("id")
        seed_ids = []
        seed_uids = []
        for seed in self.message.get("seeds", []):
            seed_ids.append(seed["id"])
            seed_uids.append(seed["uid"])

        if (collection_id or seed_ids) and not (collection_id and seed_ids):
            harvest_date_start = self.message.get("harvest_date_start")
            harvest_date_end = self.message.get("harvest_date_end")
            # Only pass seed ids if there are 20 or fewer; requesting too many causes problems calling the API.
            # 20 is an arbitrary cutoff.
            warc_paths = self._get_warc_paths(
                collection_id, seed_ids if len(seed_ids) <= 20 else None,
                harvest_date_start, harvest_date_end)
            export_format = self.message["format"]
            export_segment_size = self.message["segment_size"]
            export_path = self.message["path"]
            dedupe = self.message.get("dedupe", False)
            item_date_start = iso8601.parse_date(
                self.message["item_date_start"]
            ) if "item_date_start" in self.message else None
            item_date_end = iso8601.parse_date(
                self.message["item_date_end"]
            ) if "item_date_end" in self.message else None
            temp_path = os.path.join(self.working_path, "tmp")
            base_filepath = os.path.join(temp_path, export_id)

            if warc_paths:

                # Clean the temp directory
                if os.path.exists(temp_path):
                    shutil.rmtree(temp_path)
                os.makedirs(temp_path)

                # We get a lot of bang from PETL
                export_formats = {
                    "csv": ("csv", petl.tocsv),
                    "tsv": ("tsv", petl.totsv),
                    "html": ("html", petl.tohtml),
                    "xlsx": ("xlsx", to_xlsx),
                    "json": ("json", to_lineoriented_json)
                }
                # Other possibilities: XML, databases, HDFS
                if export_format == "json_full":
                    self._full_json_export(warc_paths, base_filepath, dedupe,
                                           item_date_start, item_date_end,
                                           seed_uids, export_segment_size)
                elif export_format == "dehydrate":
                    tables = self.table_cls(warc_paths, dedupe,
                                            item_date_start, item_date_end,
                                            seed_uids, export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.txt".format(base_filepath,
                                                      str(idx + 1).zfill(3))
                        log.info("Exporting to %s", filepath)
                        petl.totext(table,
                                    filepath,
                                    template="{{{}}}\n".format(
                                        tables.id_field()))
                elif export_format in export_formats:
                    tables = self.table_cls(warc_paths, dedupe,
                                            item_date_start, item_date_end,
                                            seed_uids, export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.{}".format(
                            base_filepath,
                            str(idx + 1).zfill(3),
                            export_formats[export_format][0])
                        log.info("Exporting to %s", filepath)
                        export_formats[export_format][1](table, filepath)
                        if export_format == 'html':
                            self._file_fix(
                                filepath,
                                prefix="<html><head><meta charset='utf-8'></head>\n",
                                suffix="</html>")
                else:
                    self.result.errors.append(
                        Msg(CODE_UNSUPPORTED_EXPORT_FORMAT,
                            "{} is not supported".format(export_format)))
                    self.result.success = False

                # Move files from temp path to export path
                if os.path.exists(export_path):
                    shutil.rmtree(export_path)
                shutil.move(temp_path, export_path)

            else:
                self.result.errors.append(
                    Msg(CODE_NO_WARCS, "No WARC files from which to export"))
                self.result.success = False

        else:
            self.result.errors.append(
                Msg(CODE_BAD_REQUEST,
                    "Request export of a seed or collection."))
            self.result.success = False

        self.result.ended = datetime_now()
        self._send_response_message(
            STATUS_SUCCESS if self.result.success else STATUS_FAILURE,
            self.routing_key, export_id, self.result)
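to_xlsx and to_lineoriented_json are project helpers rather than petl functions, and their implementations are not shown here. A plausible sketch of to_lineoriented_json using petl's public header() and data() accessors (an assumption, not the project's code):

import json
import petl

def to_lineoriented_json(table, filepath):
    # Plausible sketch: one JSON object per line, zipping petl's header row
    # with each data row.
    fieldnames = petl.header(table)
    with open(filepath, "w") as f:
        for row in petl.data(table):
            f.write(json.dumps(dict(zip(fieldnames, row))))
            f.write("\n")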
Example #11
def create_readme_for_collection(collection):
    return _create_readme(collection, None, datetime_now())
Example #12
    def on_message(self):
        assert self.message

        log.info("Harvesting by message with id %s", self.message["id"])

        self.result_filepath = os.path.join(self.working_path, "{}_result.json".format(safe_string(self.message["id"])))

        # Create a temp directory for WARCs
        self.warc_temp_dir = self._create_warc_temp_dir()
        self._create_state_store()

        # Possibly resume a harvest
        self.result = HarvestResult()
        self.result.started = datetime_now()

        if os.path.exists(self.result_filepath) or len(self._list_warcs(self.warc_temp_dir)) > 0:
            self._load_result()
            self.result.warnings.append(
                Msg(CODE_HARVEST_RESUMED, "Harvest resumed on {}".format(datetime_now())))
            # Send a status message. This gives an immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)
            self._queue_warc_files()
        else:
            # Send a status message. This gives an immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)

        # stop_harvest_loop_event tells the harvester to stop looping.
        # Only streaming harvesters loop.
        # For other harvesters, this is tripped after the first pass through the loop.
        self.stop_harvest_loop_event = threading.Event()

        # Supervisor sends a signal, indicating that the harvester should stop.
        # This is a graceful shutdown. Harvesting seeds is stopped and processing
        # is finished. This may take some time.
        def shutdown(signal_number, stack_frame):
            log.debug("Shutdown triggered")
            self.stop_harvest_loop_event.set()
            # stop_harvest_seeds_event tells the harvester to stop harvest_seeds.
            # This will allow warcprox to exit.
            self.stop_harvest_seeds_event.set()
            if self.restart_stream_timer:
                self.restart_stream_timer.cancel()
            if self.queue_warc_files_timer:
                self.queue_warc_files_timer.cancel()

        signal.signal(signal.SIGTERM, shutdown)
        signal.signal(signal.SIGINT, shutdown)

        log.debug("Message is %s" % json.dumps(self.message, indent=4))

        # Setup the restart timer for streams
        # The restart timer stops and restarts the stream periodically.
        # This makes sure that each HTTP response is limited in size.
        if self.is_streaming:
            self.restart_stream_timer = threading.Timer(self.stream_restart_interval_secs, self._restart_stream)
            self.restart_stream_timer.start()

        # Start a queue warc files timer
        self.queue_warc_files_timer = threading.Timer(self.queue_warc_files_interval_secs, self._queue_warc_files)
        self.queue_warc_files_timer.start()

        while not self.stop_harvest_loop_event.is_set():
            # Reset the stop_harvest_seeds_event
            self.stop_harvest_seeds_event = threading.Event()

            # If this isn't streaming, set stop_harvest_loop_event so that looping doesn't occur.
            if not self.is_streaming:
                self.stop_harvest_loop_event.set()

            # Here is where the harvesting happens.
            try_count = 0
            done = False
            while not done:
                try_count += 1
                log.debug("Try {} of {}".format(try_count, self.tries))
                try:
                    if self.use_warcprox:
                        with warced(safe_string(self.message["id"]), self.warc_temp_dir, debug=self.debug_warcprox,
                                    interrupt=self.is_streaming,
                                    rollover_time=self.warc_rollover_secs if not self.is_streaming else None):
                            self.harvest_seeds()
                    else:
                        self.harvest_seeds()
                    done = True
                    log.debug("Done harvesting seeds.")
                except Exception as e:
                    log.exception("Unknown error raised during harvest: %s", e)
                    if try_count == self.tries:
                        # Give up trying
                        log.debug("Too many retries, so giving up on harvesting seeds.")
                        done = True
                        self.result.success = False
                        self.result.errors.append(Msg(CODE_UNKNOWN_ERROR, str(e)))
                        self.stop_harvest_loop_event.set()
                    else:
                        # Retry
                        # Queue any WARC files
                        self._queue_warc_files()
                        # Wait for any WARC files to be processed
                        log.debug("Waiting for processing to complete.")
                        self.warc_processing_queue.join()
                        log.debug("Processing complete.")

            # Queue any WARC files
            self._queue_warc_files()

        # Turn off the restart_stream_timer.
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()

        # Turn off the queue WARC files timer
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

        # Finish processing
        self._finish_processing()

        # Delete temp dir
        if os.path.exists(self.warc_temp_dir):
            shutil.rmtree(self.warc_temp_dir)

        log.info("Done harvesting by message with id %s", self.message["id"])
Example #13
    def on_message(self):
        assert self.message

        export_id = self.message["id"]
        log.info("Performing export %s", export_id)

        self.result = ExportResult()
        self.result.started = datetime_now()

        # Send status indicating that it is running
        self._send_response_message(STATUS_RUNNING, self.routing_key, export_id, self.result)

        # Get the WARCs from the API
        collection_id = self.message.get("collection", {}).get("id")
        seed_ids = []
        seed_uids = []
        for seed in self.message.get("seeds", []):
            seed_ids.append(seed["id"])
            seed_uids.append(seed["uid"])

        if (collection_id or seed_ids) and not (collection_id and seed_ids):
            harvest_date_start = self.message.get("harvest_date_start")
            harvest_date_end = self.message.get("harvest_date_end")
            warc_paths = self._get_warc_paths(collection_id, seed_ids, harvest_date_start, harvest_date_end)
            export_format = self.message["format"]
            export_segment_size = self.message["segment_size"]
            export_path = self.message["path"]
            dedupe = self.message.get("dedupe", False)
            item_date_start = iso8601.parse_date(
                self.message["item_date_start"]) if "item_date_start" in self.message else None
            item_date_end = iso8601.parse_date(
                self.message["item_date_end"]) if "item_date_end" in self.message else None
            temp_path = os.path.join(self.working_path, "tmp")
            base_filepath = os.path.join(temp_path, export_id)

            if warc_paths:

                # Clean the temp directory
                if os.path.exists(temp_path):
                    shutil.rmtree(temp_path)
                os.makedirs(temp_path)

                # We get a lot of bang from PETL
                export_formats = {
                    "csv": ("csv", petl.tocsv),
                    "tsv": ("tsv", petl.totsv),
                    "html": ("html", petl.tohtml),
                    "xlsx": ("xlsx", to_xlsx),
                    "json": ("json", to_lineoriented_json)
                }
                # Other possibilities: XML, databases, HDFS
                if export_format == "json_full":
                    self._full_json_export(warc_paths, base_filepath, dedupe, item_date_start, item_date_end, seed_uids,
                                           export_segment_size)
                elif export_format == "dehydrate":
                    tables = self.table_cls(warc_paths, dedupe, item_date_start, item_date_end, seed_uids,
                                            export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.txt".format(base_filepath, str(idx + 1).zfill(3))
                        log.info("Exporting to %s", filepath)
                        petl.totext(table, filepath, template="{{{}}}\n".format(tables.id_field()))
                elif export_format in export_formats:
                    tables = self.table_cls(warc_paths, dedupe, item_date_start, item_date_end, seed_uids,
                                            export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.{}".format(base_filepath, str(idx + 1).zfill(3),
                                                     export_formats[export_format][0])
                        log.info("Exporting to %s", filepath)
                        export_formats[export_format][1](table, filepath)
                        if export_format == 'html':
                            self._file_fix(filepath, prefix="<html><head><meta charset='utf-8'></head>\n",
                                           suffix="</html>")
                else:
                    self.result.errors.append(
                        Msg(CODE_UNSUPPORTED_EXPORT_FORMAT, "{} is not supported".format(export_format)))
                    self.result.success = False

                # Move files from temp path to export path
                if os.path.exists(export_path):
                    shutil.rmtree(export_path)
                shutil.move(temp_path, export_path)

            else:
                self.result.errors.append(Msg(CODE_NO_WARCS, "No WARC files from which to export"))
                self.result.success = False

        else:
            self.result.errors.append(Msg(CODE_BAD_REQUEST, "Request export of a seed or collection."))
            self.result.success = False

        self.result.ended = datetime_now()
        self._send_response_message(STATUS_SUCCESS if self.result.success else STATUS_FAILURE, self.routing_key,
                                    export_id, self.result)
Example #14
    def test_table(self):

        mock_warc_iter_cls = MagicMock()
        mock_warc_iter = MagicMock()
        mock_warc_iter_cls.side_effect = [mock_warc_iter]
        mock_warc_iter.iter.return_value = [
            IterItem(None, None, None, None, {
                "key1": "k1v1",
                "key2": "k2v1",
                "key3": "k3v1"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v2",
                "key2": "k2v2",
                "key3": "k3v2"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v3",
                "key2": "k2v3",
                "key3": "k3v3"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v4",
                "key2": "k2v4",
                "key3": "k3v4"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v5",
                "key2": "k2v5",
                "key3": "k3v5"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v6",
                "key2": "k2v6",
                "key3": "k3v6"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v7",
                "key2": "k2v7",
                "key3": "k3v7"
            })
        ]
        now = datetime_now()
        limit_uids = [11, 14]

        tables = TestableTable(self.warc_paths,
                               True,
                               now,
                               None,
                               limit_uids,
                               mock_warc_iter_cls,
                               segment_row_size=2)
        chunk_cnt = 0
        for idx, table in enumerate(tables):
            chunk_cnt += 1
            for count, row in enumerate(table):
                # Every chunk should start with a header row.
                if count == 0:
                    # Header row
                    # All three header columns should match.
                    self.assertEqual("key1", row[0])
                    self.assertEqual("key2", row[1])
                    self.assertEqual("key3", row[2])
                # chunk 1 and row 2
                if idx == 0 and count == 1:
                    # First row
                    self.assertEqual("k1v1", row[0])
                    self.assertEqual("k2v1", row[1])
                    self.assertEqual("k3v1", row[2])
                # chunk 3 and row 3
                if idx == 2 and count == 2:
                    self.assertEqual("k1v6", row[0])
                    self.assertEqual("k2v6", row[1])
                    self.assertEqual("k3v6", row[2])
                # chunk 4 and row 2
                if idx == 3 and count == 1:
                    self.assertEqual("k1v7", row[0])
                    self.assertEqual("k2v7", row[1])
                    self.assertEqual("k3v7", row[2])

        self.assertEqual(4, chunk_cnt)

        mock_warc_iter_cls.assert_called_with(self.warc_paths, limit_uids)
        mock_warc_iter.iter.assert_called_once_with(dedupe=True,
                                                    item_date_end=None,
                                                    item_date_start=now,
                                                    limit_item_types=None)
Example #15
    def test_export_full_json_segment(self):
        mock_warc_iter_cls = MagicMock()
        mock_warc_iter = MagicMock()
        mock_warc_iter_cls.side_effect = [mock_warc_iter]
        mock_warc_iter.iter.return_value = [
            IterItem(None, None, None, None, {
                "key1": "k1v1",
                "key2": "k2v1",
                "key3": "k3v1"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v2",
                "key2": "k2v2",
                "key3": "k3v2"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v3",
                "key2": "k2v3",
                "key3": "k3v3"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v4",
                "key2": "k2v4",
                "key3": "k3v4"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v5",
                "key2": "k2v5",
                "key3": "k3v5"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v6",
                "key2": "k2v6",
                "key3": "k3v6"
            }),
            IterItem(None, None, None, None, {
                "key1": "k1v7",
                "key2": "k2v7",
                "key3": "k3v7"
            })
        ]

        export_filepath = os.path.join(self.export_path, "test")
        now = datetime_now()
        limit_uids = [11, 14]

        exporter = BaseExporter(None,
                                mock_warc_iter_cls,
                                None,
                                self.working_path,
                                warc_base_path=self.warc_base_path,
                                host="testhost")

        exporter._full_json_export(self.warcs, export_filepath, True, now,
                                   None, limit_uids, 3)

        mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
        mock_warc_iter.iter.assert_called_once_with(dedupe=True,
                                                    item_date_start=now,
                                                    item_date_end=None,
                                                    limit_item_types=None)

        # Files test_001.json, test_002.json, and test_003.json
        for idx in range(3):
            file_path = export_filepath + '_' + str(idx + 1).zfill(3) + '.json'
            self.assertTrue(os.path.exists(file_path))
            with open(file_path, "r") as f:
                lines = f.readlines()
            # test_003.json has only 1 row
            if idx == 2:
                self.assertEqual(1, len(lines))
            else:
                self.assertEqual(3, len(lines))
            self.assertDictEqual(
                {
                    "key1": "k1v" + str(1 + idx * 3),
                    "key2": "k2v" + str(1 + idx * 3),
                    "key3": "k3v" + str(1 + idx * 3)
                }, json.loads(lines[0]))
Example #16
    def on_message(self):
        assert self.message

        log.info("Harvesting by message with id %s", self.message["id"])

        self.result_filepath = os.path.join(
            self.working_path,
            "{}_result.json".format(safe_string(self.message["id"])))

        # Create a temp directory for WARCs
        self.warc_temp_dir = self._create_warc_temp_dir()
        self._create_state_store()

        # Possibly resume a harvest
        self.result = HarvestResult()
        self.result.started = datetime_now()

        if os.path.exists(self.result_filepath) or len(
                self._list_warcs(self.warc_temp_dir)) > 0:
            self._load_result()
            self.result.warnings.append(
                Msg(CODE_HARVEST_RESUMED,
                    "Harvest resumed on {}".format(datetime_now())))
            # Send a status message. This gives an immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)
            self._queue_warc_files()
        else:
            # Send a status message. This gives an immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)

        # stop_harvest_loop_event tells the harvester to stop looping.
        # Only streaming harvesters loop.
        # For other harvesters, this is tripped after the first pass through the loop.
        self.stop_harvest_loop_event = threading.Event()

        # Supervisor sends a signal, indicating that the harvester should stop.
        # This is a graceful shutdown. Harvesting seeds is stopped and processing
        # is finished. This may take some time.
        def shutdown(signal_number, stack_frame):
            log.info("Shutdown triggered")
            # This is for the consumer.
            self.should_stop = True
            if self.is_pause:
                log.info("This will be a pause of the harvest.")
            self.stop_harvest_loop_event.set()
            # stop_harvest_seeds_event tells the harvester to stop harvest_seeds.
            # This will allow warcprox to exit.
            self.stop_harvest_seeds_event.set()
            if self.restart_stream_timer:
                self.restart_stream_timer.cancel()
            if self.queue_warc_files_timer:
                self.queue_warc_files_timer.cancel()

        signal.signal(signal.SIGTERM, shutdown)
        signal.signal(signal.SIGINT, shutdown)

        def pause(signal_number, stack_frame):
            self.is_pause = True

        signal.signal(signal.SIGUSR1, pause)

        log.debug("Message is %s" % json.dumps(self.message, indent=4))

        # Setup the restart timer for streams
        # The restart timer stops and restarts the stream periodically.
        # This makes sure that each HTTP response is limited in size.
        if self.is_streaming:
            self.restart_stream_timer = threading.Timer(
                self.stream_restart_interval_secs, self._restart_stream)
            self.restart_stream_timer.start()

        # Start a queue warc files timer
        self.queue_warc_files_timer = threading.Timer(
            self.queue_warc_files_interval_secs, self._queue_warc_files)
        self.queue_warc_files_timer.start()

        while not self.stop_harvest_loop_event.is_set():
            # Reset the stop_harvest_seeds_event
            self.stop_harvest_seeds_event = threading.Event()

            # If this isn't streaming, set stop_harvest_loop_event so that looping doesn't occur.
            if not self.is_streaming:
                self.stop_harvest_loop_event.set()

            # Here is where the harvesting happens.
            try_count = 0
            done = False
            while not done:
                try_count += 1
                log.debug("Try {} of {}".format(try_count, self.tries))
                try:
                    if self.use_warcprox:
                        with warced(safe_string(self.message["id"]),
                                    self.warc_temp_dir,
                                    debug=self.debug_warcprox,
                                    interrupt=self.is_streaming,
                                    rollover_time=self.warc_rollover_secs
                                    if not self.is_streaming else None):
                            self.harvest_seeds()
                    else:
                        self.harvest_seeds()
                    done = True
                    log.debug("Done harvesting seeds.")
                except Exception as e:
                    log.exception("Unknown error raised during harvest: %s", e)
                    if try_count == self.tries:
                        # Give up trying
                        log.debug(
                            "Too many retries, so giving up on harvesting seeds."
                        )
                        done = True
                        self.result.success = False
                        self.result.errors.append(
                            Msg(CODE_UNKNOWN_ERROR, str(e)))
                        self.stop_harvest_loop_event.set()
                    else:
                        # Retry
                        # Queue any WARC files
                        self._queue_warc_files()
                        # Wait for any WARC files to be processed
                        log.debug("Waiting for processing to complete.")
                        self.warc_processing_queue.join()
                        log.debug("Processing complete.")

            # Queue any WARC files
            self._queue_warc_files()

        # Turn off the restart_stream_timer.
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()

        # Turn off the queue WARC files timer
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

        # Finish processing
        self._finish_processing()

        # Delete temp dir
        if os.path.exists(self.warc_temp_dir):
            shutil.rmtree(self.warc_temp_dir)

        log.info("Done harvesting by message with id %s", self.message["id"])