def test_export_full_json(self):
    """Exporting two items with no segment size writes one line-oriented JSON file."""
    iter_cls_mock = MagicMock()
    iter_mock = MagicMock()
    iter_cls_mock.side_effect = [iter_mock]
    iter_mock.iter.return_value = [
        IterItem(None, None, None, None, {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}),
        IterItem(None, None, None, None, {"key1": "k1v2", "key2": "k2v2", "key3": "k3v2"}),
    ]
    out_base = os.path.join(self.export_path, "test")
    start_date = datetime_now()
    uid_filter = [11, 14]
    exporter = BaseExporter(
        None, iter_cls_mock, None, self.working_path,
        warc_base_path=self.warc_base_path, host="testhost"
    )
    # Last argument None -> no segmenting, so everything lands in the _001 file.
    exporter._full_json_export(self.warcs, out_base, True, start_date, None, uid_filter, None)
    iter_cls_mock.assert_called_once_with(self.warcs, uid_filter)
    iter_mock.iter.assert_called_once_with(
        dedupe=True, item_date_start=start_date, item_date_end=None, limit_item_types=None)
    json_path = out_base + "_001.json"
    self.assertTrue(os.path.exists(json_path))
    with open(json_path, "r") as json_file:
        exported_lines = json_file.readlines()
        self.assertEqual(2, len(exported_lines))
        self.assertDictEqual(
            {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"},
            json.loads(exported_lines[0]))
def _write_readme(model_name, obj, path):
    """Render the README template for model_name around obj and write it under path."""
    template = get_template('readme/{}.txt'.format(model_name))
    rendered = template.render(Context({model_name: obj, "now": datetime_now()}))
    target = os.path.join(path, "README.txt")
    log.debug("Writing %s README to %s: %s", model_name, target, rendered)
    # codecs.open guarantees UTF-8 output regardless of platform default.
    with codecs.open(target, "w", encoding="utf-8") as readme_file:
        readme_file.write(rendered)
def on_persist_exception(self, exception):
    """Publish a failure status message when persisting the current message fails."""
    log.error("Handling on persist exception for %s", self.message["id"])
    status_message = {
        "id": self.message["id"],
        "status": STATUS_FAILURE,
        "errors": [Msg(CODE_MSG_PERSIST_ERROR, str(exception)).to_map()],
        "date_started": datetime_now().isoformat(),
        "date_ended": datetime_now().isoformat(),
        # Insert a space before each capital letter to prettify the class name.
        "service": re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', self.__class__.__name__),
        "host": self.host,
        "instance": str(os.getpid()),
    }
    # Derive the status routing key from the start routing key.
    # Routing key may be none
    status_routing_key = self.routing_key.replace("start", "status")
    self._publish_message(status_routing_key, status_message)
def serialize_collection(self, collection, force_serialize=False):
    """Serialize a collection and its related records to the collection's record directory.

    Writes, in order: the collection set (with history), credentials/users/groups,
    the collection itself, the collection's history, seeds, harvests (with stats
    and WARCs), an info file recording the serialization time, and a README.

    :param collection: collection model instance to serialize.
    :param force_serialize: if True, serialize even when nothing has changed
        since the last serialization.
    """
    records_path = os.path.join(
        get_collection_path(collection, sfm_data_dir=self.data_dir), RECORD_DIR)
    log.debug("Collection records path is %s", records_path)

    # Determine whether to serialize at all; skip when unchanged unless forced.
    if not force_serialize and not self._should_serialize(collection, records_path):
        log.info(
            "Skipping serialization of %s since no update since last serialization.",
            collection)
        return

    log.info("Serializing %s", collection)
    # Captured before serialization starts; written to the info file at the end.
    serialization_date = datetime_now()

    # Initialize records dir
    self._initialize_records_dir(records_path)

    # Serialize collection set, historical collection sets
    self._serialize_collection_set(collection.collection_set, records_path)

    # Serialize credentials, users, groups
    self._serialize_credentials(collection, records_path)

    # Collection (serialized as a 1-tuple since _serialize_objs takes an iterable)
    collection_record_filepath = os.path.join(records_path, COLLECTION_FILENAME)
    self._serialize_objs((collection, ), collection_record_filepath)
    log.debug("Serialized collection to %s", collection_record_filepath)

    # Historical collection
    historical_collection_record_filepath = os.path.join(
        records_path, HISTORICAL_COLLECTION_FILENAME)
    self._serialize_objs(collection.history.all(),
                         historical_collection_record_filepath)
    log.debug("Serialized historical collection to %s",
              historical_collection_record_filepath)

    # Seeds
    self._serialize_seeds(collection, records_path)

    # Harvests, harvest stats, and warcs
    self._serialize_harvests(collection, records_path)

    # Info file (records when this serialization happened)
    self._write_info(serialization_date, records_path)

    # README goes in the collection path itself, not the records dir.
    self._write_readme(
        "collection", collection,
        get_collection_path(collection, sfm_data_dir=self.data_dir))
def test_table(self):
    """Seven items with segment_row_size=2 produce 4 chunks, each led by a header row."""
    mock_warc_iter_cls = MagicMock()
    mock_warc_iter = MagicMock()
    mock_warc_iter_cls.side_effect = [mock_warc_iter]
    mock_warc_iter.iter.return_value = [
        IterItem(None, None, None, None, {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}),
        IterItem(None, None, None, None, {"key1": "k1v2", "key2": "k2v2", "key3": "k3v2"}),
        IterItem(None, None, None, None, {"key1": "k1v3", "key2": "k2v3", "key3": "k3v3"}),
        IterItem(None, None, None, None, {"key1": "k1v4", "key2": "k2v4", "key3": "k3v4"}),
        IterItem(None, None, None, None, {"key1": "k1v5", "key2": "k2v5", "key3": "k3v5"}),
        IterItem(None, None, None, None, {"key1": "k1v6", "key2": "k2v6", "key3": "k3v6"}),
        IterItem(None, None, None, None, {"key1": "k1v7", "key2": "k2v7", "key3": "k3v7"}),
    ]
    now = datetime_now()
    limit_uids = [11, 14]
    tables = TestableTable(self.warc_paths, True, now, None, limit_uids, mock_warc_iter_cls,
                           segment_row_size=2)
    chunk_cnt = 0
    for idx, table in enumerate(tables):
        chunk_cnt += 1
        for count, row in enumerate(table):
            # Every chunk should start with a header row.
            if count == 0:
                # Header row.
                # Just testing first and last, figuring these might change often.
                self.assertEqual("key1", row[0])
                self.assertEqual("key2", row[1])
                self.assertEqual("key3", row[2])
            # Chunk 1, row 2 (first data row overall).
            if idx == 0 and count == 1:
                # First row
                self.assertEqual("k1v1", row[0])
                self.assertEqual("k2v1", row[1])
                self.assertEqual("k3v1", row[2])
            # Chunk 3, row 3 (sixth data row overall).
            if idx == 2 and count == 2:
                self.assertEqual("k1v6", row[0])
                self.assertEqual("k2v6", row[1])
                self.assertEqual("k3v6", row[2])
            # Chunk 4, row 2 (last data row; final chunk has only one item).
            if idx == 3 and count == 1:
                self.assertEqual("k1v7", row[0])
                self.assertEqual("k2v7", row[1])
                self.assertEqual("k3v7", row[2])
    self.assertEqual(4, chunk_cnt)
    mock_warc_iter_cls.assert_called_with(self.warc_paths, limit_uids)
    mock_warc_iter.iter.assert_called_once_with(
        dedupe=True, item_date_end=None, item_date_start=now, limit_item_types=None
    )
def _finish_processing(self):
    """Drain the WARC processing queue, send a final status, and clean up the result file."""
    # Otherwise, will not get the last WARC on a stop.
    # No ended time is OK on a container kill, because the harvest will resume
    # and process the last file.
    log.debug("Waiting for processing to complete.")
    self.warc_processing_queue.join()
    log.debug("Processing complete.")
    self.result.ended = datetime_now()
    # Final status reflects the overall success of the run.
    final_status = STATUS_SUCCESS if self.result.success else STATUS_FAILURE
    self._send_status_message(final_status)
    # The result file is only needed for resuming, so remove it now.
    if os.path.exists(self.result_filepath):
        os.remove(self.result_filepath)
def test_export_full_json(self):
    """Exporting two items with no segment size writes one line-oriented JSON file."""
    mock_warc_iter_cls = MagicMock()
    mock_warc_iter = MagicMock()
    mock_warc_iter_cls.side_effect = [mock_warc_iter]
    mock_warc_iter.iter.return_value = [
        IterItem(None, None, None, None, {
            "key1": "k1v1",
            "key2": "k2v1",
            "key3": "k3v1"
        }),
        IterItem(None, None, None, None, {
            "key1": "k1v2",
            "key2": "k2v2",
            "key3": "k3v2"
        })
    ]
    export_filepath = os.path.join(self.export_path, "test")
    now = datetime_now()
    limit_uids = [11, 14]
    exporter = BaseExporter(None,
                            mock_warc_iter_cls,
                            None,
                            self.working_path,
                            warc_base_path=self.warc_base_path,
                            host="testhost")
    # Final argument None -> no segmenting; a single _001 file is produced.
    exporter._full_json_export(self.warcs, export_filepath, True, now, None,
                               limit_uids, None)
    mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
    mock_warc_iter.iter.assert_called_once_with(dedupe=True,
                                                item_date_start=now,
                                                item_date_end=None,
                                                limit_item_types=None)
    file_path = export_filepath + '_001.json'
    self.assertTrue(os.path.exists(file_path))
    with open(file_path, "r") as f:
        lines = f.readlines()
        # Both items present, one JSON object per line.
        self.assertEqual(2, len(lines))
        self.assertDictEqual({
            "key1": "k1v1",
            "key2": "k2v1",
            "key3": "k3v1"
        }, json.loads(lines[0]))
def test_export_full_json_segment(self):
    """Exporting 7 items with segment size 3 yields files of 3, 3, and 1 lines.

    BUG FIX: the loop used Python-2-only ``xrange``, which raises NameError on
    Python 3; replaced with ``range`` (consistent with the other copy of this
    test in this file, which already uses ``range``).
    """
    mock_warc_iter_cls = MagicMock()
    mock_warc_iter = MagicMock()
    mock_warc_iter_cls.side_effect = [mock_warc_iter]
    mock_warc_iter.iter.return_value = [
        IterItem(None, None, None, None, {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}),
        IterItem(None, None, None, None, {"key1": "k1v2", "key2": "k2v2", "key3": "k3v2"}),
        IterItem(None, None, None, None, {"key1": "k1v3", "key2": "k2v3", "key3": "k3v3"}),
        IterItem(None, None, None, None, {"key1": "k1v4", "key2": "k2v4", "key3": "k3v4"}),
        IterItem(None, None, None, None, {"key1": "k1v5", "key2": "k2v5", "key3": "k3v5"}),
        IterItem(None, None, None, None, {"key1": "k1v6", "key2": "k2v6", "key3": "k3v6"}),
        IterItem(None, None, None, None, {"key1": "k1v7", "key2": "k2v7", "key3": "k3v7"}),
    ]
    export_filepath = os.path.join(self.export_path, "test")
    now = datetime_now()
    limit_uids = [11, 14]
    exporter = BaseExporter(
        None, mock_warc_iter_cls, None, self.working_path,
        warc_base_path=self.warc_base_path, host="testhost"
    )
    # Segment size 3 -> 7 items split across 3 files.
    exporter._full_json_export(self.warcs, export_filepath, True, now, None, limit_uids, 3)
    mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
    mock_warc_iter.iter.assert_called_once_with(
        dedupe=True, item_date_start=now, item_date_end=None, limit_item_types=None
    )
    # Expect test_001.json, test_002.json, test_003.json.
    for idx in range(3):
        file_path = export_filepath + "_" + str(idx + 1).zfill(3) + ".json"
        self.assertTrue(os.path.exists(file_path))
        with open(file_path, "r") as f:
            lines = f.readlines()
            # The last file holds only the single remaining item.
            if idx == 2:
                self.assertEqual(1, len(lines))
            else:
                self.assertEqual(3, len(lines))
            # First line of each file is the (1 + idx*3)-th item.
            self.assertDictEqual(
                {"key1": "k1v" + str(1 + idx * 3), "key2": "k2v" + str(1 + idx * 3),
                 "key3": "k3v" + str(1 + idx * 3)},
                json.loads(lines[0]),
            )
def _finish_processing(self):
    """Drain queued WARCs, then either finalize the harvest or report it paused."""
    # Otherwise, will not get the last WARC on a stop.
    # No ended time is OK on a container kill, because the harvest will resume
    # and process the last file.
    log.debug("Waiting for processing to complete.")
    self.warc_processing_queue.join()
    log.debug("Processing complete.")
    if self.is_pause:
        # A pause keeps the result file around so the harvest can resume later.
        log.info("Pausing this harvest.")
        # Send final message
        self._send_status_message(STATUS_PAUSED)
        return
    self.result.ended = datetime_now()
    # Send final message reflecting overall success.
    final_status = STATUS_SUCCESS if self.result.success else STATUS_FAILURE
    self._send_status_message(final_status)
    # Delete result file; it is only needed for resuming.
    if os.path.exists(self.result_filepath):
        os.remove(self.result_filepath)
def on_message(self):
    """Perform the export described by self.message and report final status.

    The message must specify either a collection or seeds (not both). WARC
    paths are fetched from the API, items are exported into a temp directory
    in the requested format, and the files are then moved to the export path.
    Errors are accumulated on self.result and reported in the final message.
    """
    assert self.message
    export_id = self.message["id"]
    log.info("Performing export %s", export_id)
    self.result = ExportResult()
    self.result.started = datetime_now()
    # Send status indicating that it is running
    self._send_response_message(STATUS_RUNNING, self.routing_key, export_id, self.result)

    # Get the WARCs from the API
    collection_id = self.message.get("collection", {}).get("id")
    seed_ids = []
    seed_uids = []
    for seed in self.message.get("seeds", []):
        seed_ids.append(seed["id"])
        seed_uids.append(seed["uid"])

    # Exactly one of collection or seeds must be provided.
    if (collection_id or seed_ids) and not (collection_id and seed_ids):
        harvest_date_start = self.message.get("harvest_date_start")
        harvest_date_end = self.message.get("harvest_date_end")
        # Only request seed ids if < 20. If use too many, will cause problems
        # calling API. 20 is an arbitrary number.
        warc_paths = self._get_warc_paths(
            collection_id, seed_ids if len(seed_ids) <= 20 else None,
            harvest_date_start, harvest_date_end)
        export_format = self.message["format"]
        export_segment_size = self.message["segment_size"]
        export_path = self.message["path"]
        dedupe = self.message.get("dedupe", False)
        # Optional ISO 8601 item date bounds.
        item_date_start = iso8601.parse_date(
            self.message["item_date_start"]
        ) if "item_date_start" in self.message else None
        item_date_end = iso8601.parse_date(
            self.message["item_date_end"]
        ) if "item_date_end" in self.message else None
        temp_path = os.path.join(self.working_path, "tmp")
        base_filepath = os.path.join(temp_path, export_id)

        if warc_paths:
            # Clean the temp directory
            if os.path.exists(temp_path):
                shutil.rmtree(temp_path)
            os.makedirs(temp_path)

            # We get a lot of bang from PETL: format name -> (extension, writer).
            export_formats = {
                "csv": ("csv", petl.tocsv),
                "tsv": ("tsv", petl.totsv),
                "html": ("html", petl.tohtml),
                "xlsx": ("xlsx", to_xlsx),
                "json": ("json", to_lineoriented_json)
            }
            # Other possibilities: XML, databases, HDFS
            if export_format == "json_full":
                self._full_json_export(warc_paths, base_filepath, dedupe,
                                       item_date_start, item_date_end, seed_uids,
                                       export_segment_size)
            elif export_format == "dehydrate":
                # Dehydrate writes one id-field value per line.
                tables = self.table_cls(warc_paths, dedupe, item_date_start,
                                        item_date_end, seed_uids,
                                        export_segment_size)
                for idx, table in enumerate(tables):
                    filepath = "{}_{}.txt".format(base_filepath,
                                                  str(idx + 1).zfill(3))
                    log.info("Exporting to %s", filepath)
                    petl.totext(table, filepath,
                                template="{{{}}}\n".format(tables.id_field()))
            elif export_format in export_formats:
                tables = self.table_cls(warc_paths, dedupe, item_date_start,
                                        item_date_end, seed_uids,
                                        export_segment_size)
                for idx, table in enumerate(tables):
                    filepath = "{}_{}.{}".format(
                        base_filepath, str(idx + 1).zfill(3),
                        export_formats[export_format][0])
                    log.info("Exporting to %s", filepath)
                    export_formats[export_format][1](table, filepath)
                    if export_format == 'html':
                        # petl emits a bare table; wrap it in a minimal document.
                        self._file_fix(
                            filepath,
                            prefix="<html><head><meta charset='utf-8'></head>\n",
                            suffix="</html>")
            else:
                self.result.errors.append(
                    Msg(CODE_UNSUPPORTED_EXPORT_FORMAT,
                        "{} is not supported".format(export_format)))
                self.result.success = False

            # Move files from temp path to export path
            if os.path.exists(export_path):
                shutil.rmtree(export_path)
            shutil.move(temp_path, export_path)
        else:
            self.result.errors.append(
                Msg(CODE_NO_WARCS, "No WARC files from which to export"))
            self.result.success = False
    else:
        self.result.errors.append(
            Msg(CODE_BAD_REQUEST, "Request export of a seed or collection."))
        self.result.success = False

    self.result.ended = datetime_now()
    self._send_response_message(
        STATUS_SUCCESS if self.result.success else STATUS_FAILURE,
        self.routing_key, export_id, self.result)
def create_readme_for_collection(collection):
    """Build a README for a collection, stamped with the current time."""
    timestamp = datetime_now()
    return _create_readme(collection, None, timestamp)
def on_message(self):
    """Run the harvest described by self.message, processing WARCs as produced.

    Resumes a prior harvest when a leftover result file or WARCs exist,
    installs SIGTERM/SIGINT handlers for graceful shutdown, loops for
    streaming harvesters (single pass otherwise), and finishes by queueing
    and processing any remaining WARC files.
    """
    assert self.message
    log.info("Harvesting by message with id %s", self.message["id"])
    self.result_filepath = os.path.join(self.working_path,
                                        "{}_result.json".format(safe_string(self.message["id"])))
    # Create a temp directory for WARCs
    self.warc_temp_dir = self._create_warc_temp_dir()
    self._create_state_store()
    # Possibly resume a harvest: a leftover result file or WARCs indicate an
    # interrupted run.
    self.result = HarvestResult()
    self.result.started = datetime_now()
    if os.path.exists(self.result_filepath) or len(self._list_warcs(self.warc_temp_dir)) > 0:
        self._load_result()
        self.result.warnings.append(
            Msg(CODE_HARVEST_RESUMED, "Harvest resumed on {}".format(datetime_now())))
        # Send a status message. This will give immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)
        self._queue_warc_files()
    else:
        # Send a status message. This will give immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)

    # stop_harvest_loop_event tells the harvester to stop looping.
    # Only streaming harvesters loop.
    # For other harvesters, this is tripped after the first entrance into the loop.
    self.stop_harvest_loop_event = threading.Event()

    # Supervisor sends a signal, indicating that the harvester should stop.
    # This is a graceful shutdown. Harvesting seeds is stopped and processing
    # is finished. This may take some time.
    def shutdown(signal_number, stack_frame):
        log.debug("Shutdown triggered")
        self.stop_harvest_loop_event.set()
        # stop_harvest_seeds_event tells the harvester to stop harvest_seeds.
        # This will allow warcprox to exit.
        self.stop_harvest_seeds_event.set()
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    log.debug("Message is %s" % json.dumps(self.message, indent=4))

    # Setup the restart timer for streams.
    # The restart timer stops and restarts the stream periodically.
    # This makes sure that each HTTP response is limited in size.
    if self.is_streaming:
        self.restart_stream_timer = threading.Timer(self.stream_restart_interval_secs,
                                                    self._restart_stream)
        self.restart_stream_timer.start()

    # Start a queue warc files timer
    self.queue_warc_files_timer = threading.Timer(self.queue_warc_files_interval_secs,
                                                  self._queue_warc_files)
    self.queue_warc_files_timer.start()

    while not self.stop_harvest_loop_event.is_set():
        # Reset the stop_harvest_seeds_event
        self.stop_harvest_seeds_event = threading.Event()

        # If this isn't streaming then set stop_harvest_loop_event so that
        # looping doesn't occur.
        if not self.is_streaming:
            self.stop_harvest_loop_event.set()

        # Here is where the harvesting happens.
        try_count = 0
        done = False
        while not done:
            try_count += 1
            log.debug("Try {} of {}".format(try_count, self.tries))
            try:
                if self.use_warcprox:
                    # Streaming harvests interrupt warcprox; non-streaming ones
                    # roll WARCs over on a timer instead.
                    with warced(safe_string(self.message["id"]), self.warc_temp_dir,
                                debug=self.debug_warcprox,
                                interrupt=self.is_streaming,
                                rollover_time=self.warc_rollover_secs
                                if not self.is_streaming else None):
                        self.harvest_seeds()
                else:
                    self.harvest_seeds()
                done = True
                log.debug("Done harvesting seeds.")
            except Exception as e:
                log.exception("Unknown error raised during harvest: %s", e)
                if try_count == self.tries:
                    # Give up trying
                    log.debug("Too many retries, so giving up on harvesting seeds.")
                    done = True
                    self.result.success = False
                    self.result.errors.append(Msg(CODE_UNKNOWN_ERROR, str(e)))
                    self.stop_harvest_loop_event.set()
                else:
                    # Retry: queue any WARC files produced so far before the next attempt.
                    self._queue_warc_files()

        # Wait for any WARC files to be processed
        log.debug("Waiting for processing to complete.")
        self.warc_processing_queue.join()
        log.debug("Processing complete.")

    # Queue any WARC files
    self._queue_warc_files()

    # Turn off the restart_stream_timer.
    if self.restart_stream_timer:
        self.restart_stream_timer.cancel()

    # Turn off the queue WARC files timer
    if self.queue_warc_files_timer:
        self.queue_warc_files_timer.cancel()

    # Finish processing
    self._finish_processing()

    # Delete temp dir
    if os.path.exists(self.warc_temp_dir):
        shutil.rmtree(self.warc_temp_dir)

    log.info("Done harvesting by message with id %s", self.message["id"])
def on_message(self):
    """Perform the export described by self.message and report final status.

    The message must specify either a collection or seeds (not both). WARC
    paths are fetched from the API, items are exported into a temp directory
    in the requested format, and the files are then moved to the export path.
    Errors are accumulated on self.result and reported in the final message.
    """
    assert self.message
    export_id = self.message["id"]
    log.info("Performing export %s", export_id)
    self.result = ExportResult()
    self.result.started = datetime_now()
    # Send status indicating that it is running
    self._send_response_message(STATUS_RUNNING, self.routing_key, export_id, self.result)

    # Get the WARCs from the API
    collection_id = self.message.get("collection", {}).get("id")
    seed_ids = []
    seed_uids = []
    for seed in self.message.get("seeds", []):
        seed_ids.append(seed["id"])
        seed_uids.append(seed["uid"])

    # Exactly one of collection or seeds must be provided.
    if (collection_id or seed_ids) and not (collection_id and seed_ids):
        harvest_date_start = self.message.get("harvest_date_start")
        harvest_date_end = self.message.get("harvest_date_end")
        warc_paths = self._get_warc_paths(collection_id, seed_ids,
                                          harvest_date_start, harvest_date_end)
        export_format = self.message["format"]
        export_segment_size = self.message["segment_size"]
        export_path = self.message["path"]
        dedupe = self.message.get("dedupe", False)
        # Optional ISO 8601 item date bounds.
        item_date_start = iso8601.parse_date(
            self.message["item_date_start"]) if "item_date_start" in self.message else None
        item_date_end = iso8601.parse_date(
            self.message["item_date_end"]) if "item_date_end" in self.message else None
        temp_path = os.path.join(self.working_path, "tmp")
        base_filepath = os.path.join(temp_path, export_id)

        if warc_paths:
            # Clean the temp directory
            if os.path.exists(temp_path):
                shutil.rmtree(temp_path)
            os.makedirs(temp_path)

            # We get a lot of bang from PETL: format name -> (extension, writer).
            export_formats = {
                "csv": ("csv", petl.tocsv),
                "tsv": ("tsv", petl.totsv),
                "html": ("html", petl.tohtml),
                "xlsx": ("xlsx", to_xlsx),
                "json": ("json", to_lineoriented_json)
            }
            # Other possibilities: XML, databases, HDFS
            if export_format == "json_full":
                self._full_json_export(warc_paths, base_filepath, dedupe,
                                       item_date_start, item_date_end, seed_uids,
                                       export_segment_size)
            elif export_format == "dehydrate":
                # Dehydrate writes one id-field value per line.
                tables = self.table_cls(warc_paths, dedupe, item_date_start,
                                        item_date_end, seed_uids,
                                        export_segment_size)
                for idx, table in enumerate(tables):
                    filepath = "{}_{}.txt".format(base_filepath,
                                                  str(idx + 1).zfill(3))
                    log.info("Exporting to %s", filepath)
                    petl.totext(table, filepath,
                                template="{{{}}}\n".format(tables.id_field()))
            elif export_format in export_formats:
                tables = self.table_cls(warc_paths, dedupe, item_date_start,
                                        item_date_end, seed_uids,
                                        export_segment_size)
                for idx, table in enumerate(tables):
                    filepath = "{}_{}.{}".format(base_filepath,
                                                 str(idx + 1).zfill(3),
                                                 export_formats[export_format][0])
                    log.info("Exporting to %s", filepath)
                    export_formats[export_format][1](table, filepath)
                    if export_format == 'html':
                        # petl emits a bare table; wrap it in a minimal document.
                        self._file_fix(filepath,
                                       prefix="<html><head><meta charset='utf-8'></head>\n",
                                       suffix="</html>")
            else:
                self.result.errors.append(
                    Msg(CODE_UNSUPPORTED_EXPORT_FORMAT,
                        "{} is not supported".format(export_format)))
                self.result.success = False

            # Move files from temp path to export path
            if os.path.exists(export_path):
                shutil.rmtree(export_path)
            shutil.move(temp_path, export_path)
        else:
            self.result.errors.append(Msg(CODE_NO_WARCS, "No WARC files from which to export"))
            self.result.success = False
    else:
        self.result.errors.append(Msg(CODE_BAD_REQUEST, "Request export of a seed or collection."))
        self.result.success = False

    self.result.ended = datetime_now()
    self._send_response_message(STATUS_SUCCESS if self.result.success else STATUS_FAILURE,
                                self.routing_key, export_id, self.result)
def test_table(self):
    """Seven items with segment_row_size=2 produce 4 chunks, each led by a header row."""
    iter_cls_mock = MagicMock()
    iter_mock = MagicMock()
    iter_cls_mock.side_effect = [iter_mock]
    iter_mock.iter.return_value = [
        IterItem(None, None, None, None, {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}),
        IterItem(None, None, None, None, {"key1": "k1v2", "key2": "k2v2", "key3": "k3v2"}),
        IterItem(None, None, None, None, {"key1": "k1v3", "key2": "k2v3", "key3": "k3v3"}),
        IterItem(None, None, None, None, {"key1": "k1v4", "key2": "k2v4", "key3": "k3v4"}),
        IterItem(None, None, None, None, {"key1": "k1v5", "key2": "k2v5", "key3": "k3v5"}),
        IterItem(None, None, None, None, {"key1": "k1v6", "key2": "k2v6", "key3": "k3v6"}),
        IterItem(None, None, None, None, {"key1": "k1v7", "key2": "k2v7", "key3": "k3v7"}),
    ]
    started = datetime_now()
    uid_filter = [11, 14]
    tables = TestableTable(self.warc_paths, True, started, None, uid_filter,
                           iter_cls_mock, segment_row_size=2)
    # (chunk index, row index) -> expected row values. Spot-checking only a few
    # rows since the keys might change often.
    expected_rows = {
        (0, 1): ("k1v1", "k2v1", "k3v1"),
        (2, 2): ("k1v6", "k2v6", "k3v6"),
        (3, 1): ("k1v7", "k2v7", "k3v7"),
    }
    chunk_count = 0
    for chunk_idx, table in enumerate(tables):
        chunk_count += 1
        for row_idx, row in enumerate(table):
            if row_idx == 0:
                # Every chunk starts with the header row.
                self.assertEqual("key1", row[0])
                self.assertEqual("key2", row[1])
                self.assertEqual("key3", row[2])
            if (chunk_idx, row_idx) in expected_rows:
                expect = expected_rows[(chunk_idx, row_idx)]
                self.assertEqual(expect[0], row[0])
                self.assertEqual(expect[1], row[1])
                self.assertEqual(expect[2], row[2])
    self.assertEqual(4, chunk_count)
    iter_cls_mock.assert_called_with(self.warc_paths, uid_filter)
    iter_mock.iter.assert_called_once_with(dedupe=True,
                                           item_date_end=None,
                                           item_date_start=started,
                                           limit_item_types=None)
def test_export_full_json_segment(self):
    """Exporting 7 items with segment size 3 yields files of 3, 3, and 1 lines."""
    mock_warc_iter_cls = MagicMock()
    mock_warc_iter = MagicMock()
    mock_warc_iter_cls.side_effect = [mock_warc_iter]
    mock_warc_iter.iter.return_value = [
        IterItem(None, None, None, None, {
            "key1": "k1v1",
            "key2": "k2v1",
            "key3": "k3v1"
        }),
        IterItem(None, None, None, None, {
            "key1": "k1v2",
            "key2": "k2v2",
            "key3": "k3v2"
        }),
        IterItem(None, None, None, None, {
            "key1": "k1v3",
            "key2": "k2v3",
            "key3": "k3v3"
        }),
        IterItem(None, None, None, None, {
            "key1": "k1v4",
            "key2": "k2v4",
            "key3": "k3v4"
        }),
        IterItem(None, None, None, None, {
            "key1": "k1v5",
            "key2": "k2v5",
            "key3": "k3v5"
        }),
        IterItem(None, None, None, None, {
            "key1": "k1v6",
            "key2": "k2v6",
            "key3": "k3v6"
        }),
        IterItem(None, None, None, None, {
            "key1": "k1v7",
            "key2": "k2v7",
            "key3": "k3v7"
        })
    ]
    export_filepath = os.path.join(self.export_path, "test")
    now = datetime_now()
    limit_uids = [11, 14]
    exporter = BaseExporter(None,
                            mock_warc_iter_cls,
                            None,
                            self.working_path,
                            warc_base_path=self.warc_base_path,
                            host="testhost")
    # Segment size 3 -> 7 items split across 3 files.
    exporter._full_json_export(self.warcs, export_filepath, True, now, None,
                               limit_uids, 3)
    mock_warc_iter_cls.assert_called_once_with(self.warcs, limit_uids)
    mock_warc_iter.iter.assert_called_once_with(dedupe=True,
                                                item_date_start=now,
                                                item_date_end=None,
                                                limit_item_types=None)
    # Expect test_001.json, test_002.json, test_003.json.
    for idx in range(3):
        file_path = export_filepath + '_' + str(idx + 1).zfill(3) + '.json'
        self.assertTrue(os.path.exists(file_path))
        with open(file_path, "r") as f:
            lines = f.readlines()
            # The last file holds only the single remaining item.
            if idx == 2:
                self.assertEqual(1, len(lines))
            else:
                self.assertEqual(3, len(lines))
            # First line of each file is the (1 + idx*3)-th item.
            self.assertDictEqual(
                {
                    "key1": "k1v" + str(1 + idx * 3),
                    "key2": "k2v" + str(1 + idx * 3),
                    "key3": "k3v" + str(1 + idx * 3)
                }, json.loads(lines[0]))
def on_message(self):
    """Run the harvest described by self.message, processing WARCs as produced.

    Pause-aware variant: SIGUSR1 marks the harvest as paused, and the
    SIGTERM/SIGINT shutdown path defers pause-vs-finish handling to
    _finish_processing via self.is_pause. Resumes a prior harvest when a
    leftover result file or WARCs exist.
    """
    assert self.message
    log.info("Harvesting by message with id %s", self.message["id"])
    self.result_filepath = os.path.join(
        self.working_path,
        "{}_result.json".format(safe_string(self.message["id"])))
    # Create a temp directory for WARCs
    self.warc_temp_dir = self._create_warc_temp_dir()
    self._create_state_store()
    # Possibly resume a harvest: a leftover result file or WARCs indicate an
    # interrupted run.
    self.result = HarvestResult()
    self.result.started = datetime_now()
    if os.path.exists(self.result_filepath) or len(
            self._list_warcs(self.warc_temp_dir)) > 0:
        self._load_result()
        self.result.warnings.append(
            Msg(CODE_HARVEST_RESUMED,
                "Harvest resumed on {}".format(datetime_now())))
        # Send a status message. This will give immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)
        self._queue_warc_files()
    else:
        # Send a status message. This will give immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)

    # stop_harvest_loop_event tells the harvester to stop looping.
    # Only streaming harvesters loop.
    # For other harvesters, this is tripped after the first entrance into the loop.
    self.stop_harvest_loop_event = threading.Event()

    # Supervisor sends a signal, indicating that the harvester should stop.
    # This is a graceful shutdown. Harvesting seeds is stopped and processing
    # is finished. This may take some time.
    def shutdown(signal_number, stack_frame):
        log.info("Shutdown triggered")
        # This is for the consumer.
        self.should_stop = True
        if self.is_pause:
            log.info("This will be a pause of the harvest.")
        self.stop_harvest_loop_event.set()
        # stop_harvest_seeds_event tells the harvester to stop harvest_seeds.
        # This will allow warcprox to exit.
        self.stop_harvest_seeds_event.set()
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    # SIGUSR1 only flags the harvest as paused.
    # NOTE(review): presumably the supervisor follows SIGUSR1 with SIGTERM so
    # that the shutdown handler actually stops the loop — confirm against the
    # supervisor configuration.
    def pause(signal_number, stack_frame):
        self.is_pause = True

    signal.signal(signal.SIGUSR1, pause)

    log.debug("Message is %s" % json.dumps(self.message, indent=4))

    # Setup the restart timer for streams.
    # The restart timer stops and restarts the stream periodically.
    # This makes sure that each HTTP response is limited in size.
    if self.is_streaming:
        self.restart_stream_timer = threading.Timer(
            self.stream_restart_interval_secs, self._restart_stream)
        self.restart_stream_timer.start()

    # Start a queue warc files timer
    self.queue_warc_files_timer = threading.Timer(
        self.queue_warc_files_interval_secs, self._queue_warc_files)
    self.queue_warc_files_timer.start()

    while not self.stop_harvest_loop_event.is_set():
        # Reset the stop_harvest_seeds_event
        self.stop_harvest_seeds_event = threading.Event()

        # If this isn't streaming then set stop_harvest_loop_event so that
        # looping doesn't occur.
        if not self.is_streaming:
            self.stop_harvest_loop_event.set()

        # Here is where the harvesting happens.
        try_count = 0
        done = False
        while not done:
            try_count += 1
            log.debug("Try {} of {}".format(try_count, self.tries))
            try:
                if self.use_warcprox:
                    # Streaming harvests interrupt warcprox; non-streaming ones
                    # roll WARCs over on a timer instead.
                    with warced(safe_string(self.message["id"]),
                                self.warc_temp_dir,
                                debug=self.debug_warcprox,
                                interrupt=self.is_streaming,
                                rollover_time=self.warc_rollover_secs
                                if not self.is_streaming else None):
                        self.harvest_seeds()
                else:
                    self.harvest_seeds()
                done = True
                log.debug("Done harvesting seeds.")
            except Exception as e:
                log.exception("Unknown error raised during harvest: %s", e)
                if try_count == self.tries:
                    # Give up trying
                    log.debug("Too many retries, so giving up on harvesting seeds.")
                    done = True
                    self.result.success = False
                    self.result.errors.append(
                        Msg(CODE_UNKNOWN_ERROR, str(e)))
                    self.stop_harvest_loop_event.set()
                else:
                    # Retry: queue any WARC files produced so far before the next attempt.
                    self._queue_warc_files()

        # Wait for any WARC files to be processed
        log.debug("Waiting for processing to complete.")
        self.warc_processing_queue.join()
        log.debug("Processing complete.")

    # Queue any WARC files
    self._queue_warc_files()

    # Turn off the restart_stream_timer.
    if self.restart_stream_timer:
        self.restart_stream_timer.cancel()

    # Turn off the queue WARC files timer
    if self.queue_warc_files_timer:
        self.queue_warc_files_timer.cancel()

    # Finish processing (handles the pause-vs-finish distinction).
    self._finish_processing()

    # Delete temp dir
    if os.path.exists(self.warc_temp_dir):
        shutil.rmtree(self.warc_temp_dir)

    log.info("Done harvesting by message with id %s", self.message["id"])