def test_generate_commandline(self):
    """Check the warcprox command line built for default and customised options."""
    # Default construction: the expected command line ends with -z.
    default_proxy = warced("test", "/test")
    expected_default = (
        "warcprox -c {} --certs-dir {} --dedup-db-file /dev/null "
        "-d /test -n test -p {} -z"
    ).format(default_proxy.ca_bundle, default_proxy.ca_dir, default_proxy.port)
    actual_default = default_proxy._generate_commandline()
    self.assertEqual(expected_default, actual_default)

    # compress=False drops -z; interrupt=True and rollover_time=60 are
    # expected to show up as -i and --rollover-time 60.
    custom_proxy = warced(
        "test",
        "/test",
        compress=False,
        interrupt=True,
        rollover_time=60,
    )
    expected_custom = (
        "warcprox -c {} --certs-dir {} --dedup-db-file /dev/null "
        "-d /test -n test -p {} -i --rollover-time 60"
    ).format(custom_proxy.ca_bundle, custom_proxy.ca_dir, custom_proxy.port)
    actual_custom = custom_proxy._generate_commandline()
    self.assertEqual(expected_custom, actual_custom)
def test_generate_commandline(self):
    """Check the generated warcprox command line, including the stats-db option."""
    # Case 1: defaults only. The expected command line ends with -z.
    plain = warced("test", "/test")
    plain_template = (
        "warcprox -c {} --certs-dir {} "
        "--dedup-db-file /dev/null --stats-db-file /dev/null "
        "-d /test -n test -p {} -z"
    )
    plain_expected = plain_template.format(plain.ca_bundle, plain.ca_dir, plain.port)
    self.assertEqual(plain_expected, plain._generate_commandline())

    # Case 2: compression off, interrupt on, WARC rollover every 60 seconds.
    # The expected command line carries -i and --rollover-time 60 instead of -z.
    tuned = warced("test", "/test", compress=False, interrupt=True, rollover_time=60)
    tuned_template = (
        "warcprox -c {} --certs-dir {} "
        "--dedup-db-file /dev/null --stats-db-file /dev/null "
        "-d /test -n test -p {} -i --rollover-time 60"
    )
    tuned_expected = tuned_template.format(tuned.ca_bundle, tuned.ca_dir, tuned.port)
    self.assertEqual(tuned_expected, tuned._generate_commandline())
def test_with(self):
    """
    Use warced as a context manager around a real HTTP request.

    Expects a 200 response and, once the context has exited, exactly one
    file in the temp directory whose name starts with "test" and ends with
    ".warc.gz". The request goes to a public URL, so this test needs
    network access.
    """
    warc_dir = tempfile.mkdtemp()
    try:
        with warced("test", warc_dir):
            response = requests.get("http://www.gwu.edu")
            self.assertEqual(200, response.status_code)

        # Inspect the directory only after the context manager has exited.
        warc_files = os.listdir(warc_dir)
        self.assertEqual(1, len(warc_files))
        warc_name = warc_files[0]
        self.assertTrue(warc_name.startswith("test"))
        self.assertTrue(warc_name.endswith(".warc.gz"))
    finally:
        # Remove the temp directory whether or not the assertions passed.
        shutil.rmtree(warc_dir)
def test_set_env(self):
    """
    Verify that _set_envs() exports the proxy settings and _unset_envs() clears them.

    HTTP_PROXY, HTTPS_PROXY and REQUESTS_CA_BUNDLE must be absent before the
    call and absent again after _unset_envs(). While set, both proxy
    variables must equal "localhost:1234".
    """
    env_names = ("HTTP_PROXY", "HTTPS_PROXY", "REQUESTS_CA_BUNDLE")

    # Precondition: a clean environment.
    for env_name in env_names:
        self.assertIsNone(os.environ.get(env_name), env_name)

    w = warced(None, None, port=1234)
    w._set_envs()
    try:
        self.assertEqual("localhost:1234", os.environ["HTTP_PROXY"])
        self.assertEqual("localhost:1234", os.environ["HTTPS_PROXY"])
    finally:
        # Always undo the environment changes: a failed assertion above must
        # not leave proxy variables behind for the tests that run afterwards.
        w._unset_envs()

    # Postcondition: everything is removed again.
    for env_name in env_names:
        self.assertIsNone(os.environ.get(env_name), env_name)
def on_message(self):
    """
    Perform the harvest described by self.message.

    Steps, in order:
      1. Compute the result file path, create the WARC temp directory and the state store.
      2. If a result file or WARC files already exist, call _load_result(), record a
         CODE_HARVEST_RESUMED warning and call _queue_warc_files(). A STATUS_RUNNING
         status message is sent in either case.
      3. Install SIGTERM/SIGINT handlers that request a graceful stop.
      4. Start the stream-restart timer (streaming harvesters only) and the
         queue-WARC-files timer.
      5. Call harvest_seeds(), inside a warced context when self.use_warcprox is set,
         with up to self.tries attempts. Non-streaming harvesters make one pass through
         the outer loop; streaming harvesters repeat until the loop event is set.
      6. Cancel the timers, call _finish_processing() and delete the WARC temp directory.

    Exceptions raised by harvest_seeds() are logged and retried; after the last attempt
    the result is marked unsuccessful with a CODE_UNKNOWN_ERROR message.
    """
    assert self.message

    log.info("Harvesting by message with id %s", self.message["id"])

    self.result_filepath = os.path.join(self.working_path, "{}_result.json".format(safe_string(self.message["id"])))

    # Create a temp directory for WARCs
    self.warc_temp_dir = self._create_warc_temp_dir()
    self._create_state_store()

    # Possibly resume a harvest
    self.result = HarvestResult()
    self.result.started = datetime_now()

    # An existing result file or leftover WARC files are treated as an earlier, unfinished harvest.
    if os.path.exists(self.result_filepath) or len(self._list_warcs(self.warc_temp_dir)) > 0:
        self._load_result()
        self.result.warnings.append(
            Msg(CODE_HARVEST_RESUMED, "Harvest resumed on {}".format(datetime_now())))
        # Send a status message. This will give immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)
        self._queue_warc_files()
    else:
        # Send a status message. This will give immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)

    # stop_harvest_loop_event tells the harvester to stop looping.
    # Only streaming harvesters loop.
    # For other harvesters, this is tripped after the first entrance into loop.
    self.stop_harvest_loop_event = threading.Event()

    # Supervisor sends a signal, indicating that the harvester should stop.
    # This is a graceful shutdown. Harvesting seeds is stopped and processing
    # is finished. This may take some time.
    # NOTE(review): the handler reads self.stop_harvest_seeds_event, self.restart_stream_timer
    # and self.queue_warc_files_timer, which this method only assigns further down; they are
    # presumably initialised elsewhere (e.g. in __init__) -- confirm.
    def shutdown(signal_number, stack_frame):
        log.debug("Shutdown triggered")
        self.stop_harvest_loop_event.set()
        # stop_harvest_seeds_event tells the harvester to stop harvest_seeds.
        # This will allow warcprox to exit.
        self.stop_harvest_seeds_event.set()
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    log.debug("Message is %s" % json.dumps(self.message, indent=4))

    # Setup the restart timer for streams
    # The restart timer stops and restarts the stream periodically.
    # This makes sure that each HTTP response is limited in size.
    # NOTE(review): threading.Timer fires once; any repetition would have to come from the
    # callback rescheduling itself, which is not visible here -- confirm.
    if self.is_streaming:
        self.restart_stream_timer = threading.Timer(self.stream_restart_interval_secs, self._restart_stream)
        self.restart_stream_timer.start()

    # Start a queue warc files timer
    self.queue_warc_files_timer = threading.Timer(self.queue_warc_files_interval_secs, self._queue_warc_files)
    self.queue_warc_files_timer.start()

    while not self.stop_harvest_loop_event.is_set():
        # Reset the stop_harvest_seeds_event
        # NOTE(review): a shutdown signal handled after the loop condition was checked but
        # before this assignment sets the previous Event object; the new one starts unset.
        # Confirm that this window is acceptable.
        self.stop_harvest_seeds_event = threading.Event()

        # If this isn't streaming then set stop_harvest_loop_event so that looping doesn't occur.
        if not self.is_streaming:
            self.stop_harvest_loop_event.set()

        # Here is where the harvesting happens.
        try_count = 0
        done = False
        while not done:
            try_count += 1
            log.debug("Try {} of {}".format(try_count, self.tries))
            try:
                if self.use_warcprox:
                    # Streaming harvests pass interrupt=True and no rollover time;
                    # other harvests pass self.warc_rollover_secs.
                    with warced(safe_string(self.message["id"]), self.warc_temp_dir, debug=self.debug_warcprox,
                                interrupt=self.is_streaming,
                                rollover_time=self.warc_rollover_secs if not self.is_streaming else None):
                        self.harvest_seeds()
                else:
                    self.harvest_seeds()
                done = True
                log.debug("Done harvesting seeds.")
            except Exception as e:
                log.exception("Unknown error raised during harvest: %s", e)
                if try_count == self.tries:
                    # Give up trying
                    log.debug("Too many retries, so giving up on harvesting seeds.")
                    done = True
                    self.result.success = False
                    self.result.errors.append(Msg(CODE_UNKNOWN_ERROR, str(e)))
                    # Also end the outer loop so a streaming harvester does not start another pass.
                    self.stop_harvest_loop_event.set()
                else:
                    # Retry
                    # Queue any WARC files
                    self._queue_warc_files()
                    # Wait for any WARC files to be processed
                    log.debug("Waiting for processing to complete.")
                    self.warc_processing_queue.join()
                    log.debug("Processing complete.")

        # Queue any WARC files
        self._queue_warc_files()

    # Turn off the restart_stream_timer.
    if self.restart_stream_timer:
        self.restart_stream_timer.cancel()

    # Turn off the queue WARC files timer
    if self.queue_warc_files_timer:
        self.queue_warc_files_timer.cancel()

    # Finish processing
    self._finish_processing()

    # Delete temp dir
    if os.path.exists(self.warc_temp_dir):
        shutil.rmtree(self.warc_temp_dir)

    log.info("Done harvesting by message with id %s", self.message["id"])
def on_message(self):
    """
    Perform the harvest described by self.message.

    Steps, in order:
      1. Compute the result file path, create the WARC temp directory and the state store.
      2. If a result file or WARC files already exist, call _load_result(), record a
         CODE_HARVEST_RESUMED warning and call _queue_warc_files(). A STATUS_RUNNING
         status message is sent in either case.
      3. Install SIGTERM/SIGINT handlers that request a graceful stop, and a SIGUSR1
         handler that sets self.is_pause.
      4. Start the stream-restart timer (streaming harvesters only) and the
         queue-WARC-files timer.
      5. Call harvest_seeds(), inside a warced context when self.use_warcprox is set,
         with up to self.tries attempts. Non-streaming harvesters make one pass through
         the outer loop; streaming harvesters repeat until the loop event is set.
      6. Cancel the timers, call _finish_processing() and delete the WARC temp directory.

    Exceptions raised by harvest_seeds() are logged and retried; after the last attempt
    the result is marked unsuccessful with a CODE_UNKNOWN_ERROR message.
    """
    assert self.message

    log.info("Harvesting by message with id %s", self.message["id"])

    self.result_filepath = os.path.join(
        self.working_path,
        "{}_result.json".format(safe_string(self.message["id"])))

    # Create a temp directory for WARCs
    self.warc_temp_dir = self._create_warc_temp_dir()
    self._create_state_store()

    # Possibly resume a harvest
    self.result = HarvestResult()
    self.result.started = datetime_now()

    # An existing result file or leftover WARC files are treated as an earlier, unfinished harvest.
    if os.path.exists(self.result_filepath) or len(
            self._list_warcs(self.warc_temp_dir)) > 0:
        self._load_result()
        self.result.warnings.append(
            Msg(CODE_HARVEST_RESUMED,
                "Harvest resumed on {}".format(datetime_now())))
        # Send a status message. This will give immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)
        self._queue_warc_files()
    else:
        # Send a status message. This will give immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)

    # stop_harvest_loop_event tells the harvester to stop looping.
    # Only streaming harvesters loop.
    # For other harvesters, this is tripped after the first entrance into loop.
    self.stop_harvest_loop_event = threading.Event()

    # Supervisor sends a signal, indicating that the harvester should stop.
    # This is a graceful shutdown. Harvesting seeds is stopped and processing
    # is finished. This may take some time.
    # NOTE(review): the handler reads self.is_pause, self.stop_harvest_seeds_event,
    # self.restart_stream_timer and self.queue_warc_files_timer, which this method only
    # assigns further down (or not at all); they are presumably initialised elsewhere
    # (e.g. in __init__) -- confirm.
    def shutdown(signal_number, stack_frame):
        log.info("Shutdown triggered")
        # This is for the consumer.
        self.should_stop = True
        if self.is_pause:
            log.info("This will be a pause of the harvest.")
        self.stop_harvest_loop_event.set()
        # stop_harvest_seeds_event tells the harvester to stop harvest_seeds.
        # This will allow warcprox to exit.
        self.stop_harvest_seeds_event.set()
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    # SIGUSR1 only records the pause request; the shutdown handler above
    # reads the flag to log that the stop is a pause.
    def pause(signal_number, stack_frame):
        self.is_pause = True

    signal.signal(signal.SIGUSR1, pause)

    log.debug("Message is %s" % json.dumps(self.message, indent=4))

    # Setup the restart timer for streams
    # The restart timer stops and restarts the stream periodically.
    # This makes sure that each HTTP response is limited in size.
    # NOTE(review): threading.Timer fires once; any repetition would have to come from the
    # callback rescheduling itself, which is not visible here -- confirm.
    if self.is_streaming:
        self.restart_stream_timer = threading.Timer(
            self.stream_restart_interval_secs, self._restart_stream)
        self.restart_stream_timer.start()

    # Start a queue warc files timer
    self.queue_warc_files_timer = threading.Timer(
        self.queue_warc_files_interval_secs, self._queue_warc_files)
    self.queue_warc_files_timer.start()

    while not self.stop_harvest_loop_event.is_set():
        # Reset the stop_harvest_seeds_event
        # NOTE(review): a shutdown signal handled after the loop condition was checked but
        # before this assignment sets the previous Event object; the new one starts unset.
        # Confirm that this window is acceptable.
        self.stop_harvest_seeds_event = threading.Event()

        # If this isn't streaming then set stop_harvest_loop_event so that looping doesn't occur.
        if not self.is_streaming:
            self.stop_harvest_loop_event.set()

        # Here is where the harvesting happens.
        try_count = 0
        done = False
        while not done:
            try_count += 1
            log.debug("Try {} of {}".format(try_count, self.tries))
            try:
                if self.use_warcprox:
                    # Streaming harvests pass interrupt=True and no rollover time;
                    # other harvests pass self.warc_rollover_secs.
                    with warced(safe_string(self.message["id"]),
                                self.warc_temp_dir,
                                debug=self.debug_warcprox,
                                interrupt=self.is_streaming,
                                rollover_time=self.warc_rollover_secs
                                if not self.is_streaming else None):
                        self.harvest_seeds()
                else:
                    self.harvest_seeds()
                done = True
                log.debug("Done harvesting seeds.")
            except Exception as e:
                log.exception("Unknown error raised during harvest: %s", e)
                if try_count == self.tries:
                    # Give up trying
                    log.debug(
                        "Too many retries, so giving up on harvesting seeds."
                    )
                    done = True
                    self.result.success = False
                    self.result.errors.append(
                        Msg(CODE_UNKNOWN_ERROR, str(e)))
                    # Also end the outer loop so a streaming harvester does not start another pass.
                    self.stop_harvest_loop_event.set()
                else:
                    # Retry
                    # Queue any WARC files
                    self._queue_warc_files()
                    # Wait for any WARC files to be processed
                    log.debug("Waiting for processing to complete.")
                    self.warc_processing_queue.join()
                    log.debug("Processing complete.")

        # Queue any WARC files
        self._queue_warc_files()

    # Turn off the restart_stream_timer.
    if self.restart_stream_timer:
        self.restart_stream_timer.cancel()

    # Turn off the queue WARC files timer
    if self.queue_warc_files_timer:
        self.queue_warc_files_timer.cancel()

    # Finish processing
    self._finish_processing()

    # Delete temp dir
    if os.path.exists(self.warc_temp_dir):
        shutil.rmtree(self.warc_temp_dir)

    log.info("Done harvesting by message with id %s", self.message["id"])