def test_generate_commandline(self):
        """Check the warcprox command line built for default and customised options."""
        # Defaults: compression flag (-z) is expected on the command line.
        default_proxy = warced("test", "/test")
        expected_default = "warcprox -c {} --certs-dir {} --dedup-db-file /dev/null -d /test -n test -p {} -z".format(
            default_proxy.ca_bundle, default_proxy.ca_dir, default_proxy.port)
        self.assertEqual(expected_default, default_proxy._generate_commandline())

        # No compression, interrupt enabled and a rollover time of 60.
        custom_proxy = warced("test", "/test", compress=False, interrupt=True, rollover_time=60)
        expected_custom = (
            "warcprox -c {} --certs-dir {} --dedup-db-file /dev/null -d /test -n test -p {} -i "
            "--rollover-time 60"
        ).format(custom_proxy.ca_bundle, custom_proxy.ca_dir, custom_proxy.port)
        self.assertEqual(expected_custom, custom_proxy._generate_commandline())
    def test_generate_commandline(self):
        """Check the warcprox command line built for default and customised options."""
        # Defaults: compression flag (-z) is expected on the command line.
        default_proxy = warced("test", "/test")
        expected_default = (
            "warcprox -c {} --certs-dir {} --dedup-db-file /dev/null --stats-db-file /dev/null "
            "-d /test -n test -p {} -z"
        ).format(default_proxy.ca_bundle, default_proxy.ca_dir, default_proxy.port)
        self.assertEqual(expected_default, default_proxy._generate_commandline())

        # No compression, interrupt enabled and a rollover time of 60.
        custom_options = dict(compress=False, interrupt=True, rollover_time=60)
        custom_proxy = warced("test", "/test", **custom_options)
        expected_custom = (
            "warcprox -c {} --certs-dir {} --dedup-db-file /dev/null --stats-db-file /dev/null "
            "-d /test -n test -p {} -i --rollover-time 60"
        ).format(custom_proxy.ca_bundle, custom_proxy.ca_dir, custom_proxy.port)
        self.assertEqual(expected_custom, custom_proxy._generate_commandline())
 def test_with(self):
     """Fetch a page inside a ``warced`` context and check the WARC file left behind."""
     scratch_dir = tempfile.mkdtemp()
     try:
         # The request is made while the warced context is active.
         with warced("test", scratch_dir):
             response = requests.get("http://www.gwu.edu")
             self.assertEqual(200, response.status_code)

         # Exactly one file named test*.warc.gz is expected in the directory.
         produced = os.listdir(scratch_dir)
         self.assertEqual(1, len(produced))
         warc_name = produced[0]
         self.assertTrue(warc_name.startswith("test"))
         self.assertTrue(warc_name.endswith(".warc.gz"))
     finally:
         # Always remove the temporary directory, even when an assertion fails.
         shutil.rmtree(scratch_dir)
 def test_with(self):
     """Fetch a page inside a ``warced`` context and check the WARC file left behind."""
     scratch_dir = tempfile.mkdtemp()
     try:
         # The request is made while the warced context is active.
         with warced("test", scratch_dir):
             response = requests.get("http://www.gwu.edu")
             self.assertEqual(200, response.status_code)

         # Exactly one file named test*.warc.gz is expected in the directory.
         produced = os.listdir(scratch_dir)
         self.assertEqual(1, len(produced))
         warc_name = produced[0]
         self.assertTrue(warc_name.startswith("test"))
         self.assertTrue(warc_name.endswith(".warc.gz"))
     finally:
         # Always remove the temporary directory, even when an assertion fails.
         shutil.rmtree(scratch_dir)
    def test_set_env(self):
        """Check that ``_set_envs`` / ``_unset_envs`` set and clear the proxy variables.

        The environment is expected to be clean on entry. ``_unset_envs`` is
        called from a ``finally`` clause so that a failing assertion does not
        leave the proxy variables in ``os.environ`` for later tests.
        """
        proxy_vars = ("HTTP_PROXY", "HTTPS_PROXY")
        all_vars = proxy_vars + ("REQUESTS_CA_BUNDLE",)

        # Precondition: none of the variables is set.
        for name in all_vars:
            self.assertIsNone(os.environ.get(name))

        w = warced(None, None, port=1234)
        w._set_envs()
        try:
            for name in proxy_vars:
                self.assertEqual("localhost:1234", os.environ[name])
        finally:
            # Restore the environment even when an assertion above fails.
            w._unset_envs()

        # After unsetting, the environment is clean again.
        for name in all_vars:
            self.assertIsNone(os.environ.get(name))
    def test_set_env(self):
        """Check that ``_set_envs`` / ``_unset_envs`` set and clear the proxy variables.

        The environment is expected to be clean on entry. ``_unset_envs`` is
        called from a ``finally`` clause so that a failing assertion does not
        leave the proxy variables in ``os.environ`` for later tests.
        """
        proxy_vars = ("HTTP_PROXY", "HTTPS_PROXY")
        all_vars = proxy_vars + ("REQUESTS_CA_BUNDLE",)

        # Precondition: none of the variables is set.
        for name in all_vars:
            self.assertIsNone(os.environ.get(name))

        w = warced(None, None, port=1234)
        w._set_envs()
        try:
            for name in proxy_vars:
                self.assertEqual("localhost:1234", os.environ[name])
        finally:
            # Restore the environment even when an assertion above fails.
            w._unset_envs()

        # After unsetting, the environment is clean again.
        for name in all_vars:
            self.assertIsNone(os.environ.get(name))
Exemple #7
0
    def on_message(self):
        """
        Run the harvest described by self.message from start to finish.

        Steps, in order:
          1. Derive the result file path from the message id, create the WARC
             temp directory and the state store.
          2. Create a fresh HarvestResult; if a result file or WARC files
             already exist, load the previous result, record a
             CODE_HARVEST_RESUMED warning and queue the existing WARC files.
          3. Install SIGTERM/SIGINT handlers that request a graceful stop.
          4. Start the stream restart timer (streaming only) and the
             queue-WARC-files timer.
          5. Call harvest_seeds() (inside a warced context when
             self.use_warcprox is set), retrying up to self.tries times.
             Streaming harvests repeat this until a stop is requested.
          6. Cancel the timers, finish processing and delete the WARC temp
             directory.

        If every try fails, the failure is recorded on self.result
        (success = False plus a CODE_UNKNOWN_ERROR message) rather than raised.
        """
        # NOTE: assert statements are skipped when Python runs with -O.
        assert self.message

        log.info("Harvesting by message with id %s", self.message["id"])

        self.result_filepath = os.path.join(self.working_path, "{}_result.json".format(safe_string(self.message["id"])))

        # Create a temp directory for WARCs
        self.warc_temp_dir = self._create_warc_temp_dir()
        self._create_state_store()

        # Possibly resume a harvest
        self.result = HarvestResult()
        self.result.started = datetime_now()

        # A leftover result file or leftover WARC files are treated as a harvest to resume.
        if os.path.exists(self.result_filepath) or len(self._list_warcs(self.warc_temp_dir)) > 0:
            self._load_result()
            self.result.warnings.append(
                Msg(CODE_HARVEST_RESUMED, "Harvest resumed on {}".format(datetime_now())))
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)
            self._queue_warc_files()
        else:
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)

        # stop_harvest_loop_event tells the harvester to stop looping.
        # Only streaming harvesters loop.
        # For other harvesters, this is tripped after the first entrance into loop.
        self.stop_harvest_loop_event = threading.Event()

        # Supervisor sends a signal, indicating that the harvester should stop.
        # This is a graceful shutdown. Harvesting seeds is stopped and processing
        # is finished. This may take some time.
        # NOTE(review): within this method stop_harvest_seeds_event is first assigned
        # inside the harvest loop and restart_stream_timer only when streaming;
        # presumably all three attributes read by the handler are initialised
        # elsewhere (e.g. __init__) -- confirm.
        def shutdown(signal_number, stack_frame):
            log.debug("Shutdown triggered")
            self.stop_harvest_loop_event.set()
            # stop_event tells the harvester to stop harvest_seeds.
            # This will allow warcprox to exit.
            self.stop_harvest_seeds_event.set()
            if self.restart_stream_timer:
                self.restart_stream_timer.cancel()
            if self.queue_warc_files_timer:
                self.queue_warc_files_timer.cancel()

        signal.signal(signal.SIGTERM, shutdown)
        signal.signal(signal.SIGINT, shutdown)

        log.debug("Message is %s" % json.dumps(self.message, indent=4))

        # Setup the restart timer for streams
        # The restart timer stops and restarts the stream periodically.
        # This makes sure that each HTTP response is limited in size.
        if self.is_streaming:
            self.restart_stream_timer = threading.Timer(self.stream_restart_interval_secs, self._restart_stream)
            self.restart_stream_timer.start()

        # Start a queue warc files timer
        # NOTE(review): threading.Timer fires only once; presumably
        # _queue_warc_files re-arms the timer -- confirm.
        self.queue_warc_files_timer = threading.Timer(self.queue_warc_files_interval_secs, self._queue_warc_files)
        self.queue_warc_files_timer.start()

        while not self.stop_harvest_loop_event.is_set():
            # Reset the stop_harvest_seeds_event
            self.stop_harvest_seeds_event = threading.Event()

            # If this isn't streaming then set stop_harvest_loop_event so that looping doesn't occur.
            if not self.is_streaming:
                self.stop_harvest_loop_event.set()

            # Here is where the harvesting happens.
            # harvest_seeds() is attempted until it succeeds or self.tries attempts have failed.
            try_count = 0
            done = False
            while not done:
                try_count += 1
                log.debug("Try {} of {}".format(try_count, self.tries))
                try:
                    if self.use_warcprox:
                        # rollover_time is only passed for non-streaming harvests;
                        # streaming harvests pass interrupt=True instead.
                        with warced(safe_string(self.message["id"]), self.warc_temp_dir, debug=self.debug_warcprox,
                                    interrupt=self.is_streaming,
                                    rollover_time=self.warc_rollover_secs if not self.is_streaming else None):
                            self.harvest_seeds()
                    else:
                        self.harvest_seeds()
                    done = True
                    log.debug("Done harvesting seeds.")
                except Exception as e:
                    log.exception("Unknown error raised during harvest: %s", e)
                    if try_count == self.tries:
                        # Give up trying
                        log.debug("Too many retries, so giving up on harvesting seeds.")
                        done = True
                        self.result.success = False
                        self.result.errors.append(Msg(CODE_UNKNOWN_ERROR, str(e)))
                        # Also leave the outer loop so a streaming harvest does not start over.
                        self.stop_harvest_loop_event.set()
                    else:
                        # Retry
                        # Queue any WARC files
                        self._queue_warc_files()
                        # Wait for any WARC files to be processed
                        log.debug("Waiting for processing to complete.")
                        self.warc_processing_queue.join()
                        log.debug("Processing complete.")

            # Queue any WARC files
            self._queue_warc_files()

        # Turn off the restart_stream_timer.
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()

        # Turn off the queue WARC files timer
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

        # Finish processing
        self._finish_processing()

        # Delete temp dir
        if os.path.exists(self.warc_temp_dir):
            shutil.rmtree(self.warc_temp_dir)

        log.info("Done harvesting by message with id %s", self.message["id"])
Exemple #8
0
    def on_message(self):
        """
        Run the harvest described by self.message from start to finish.

        Steps, in order:
          1. Derive the result file path from the message id, create the WARC
             temp directory and the state store.
          2. Create a fresh HarvestResult; if a result file or WARC files
             already exist, load the previous result, record a
             CODE_HARVEST_RESUMED warning and queue the existing WARC files.
          3. Install SIGTERM/SIGINT handlers that request a graceful stop and
             a SIGUSR1 handler that marks the stop as a pause.
          4. Start the stream restart timer (streaming only) and the
             queue-WARC-files timer.
          5. Call harvest_seeds() (inside a warced context when
             self.use_warcprox is set), retrying up to self.tries times.
             Streaming harvests repeat this until a stop is requested.
          6. Cancel the timers, finish processing and delete the WARC temp
             directory.

        If every try fails, the failure is recorded on self.result
        (success = False plus a CODE_UNKNOWN_ERROR message) rather than raised.
        """
        # NOTE: assert statements are skipped when Python runs with -O.
        assert self.message

        log.info("Harvesting by message with id %s", self.message["id"])

        self.result_filepath = os.path.join(
            self.working_path,
            "{}_result.json".format(safe_string(self.message["id"])))

        # Create a temp directory for WARCs
        self.warc_temp_dir = self._create_warc_temp_dir()
        self._create_state_store()

        # Possibly resume a harvest
        self.result = HarvestResult()
        self.result.started = datetime_now()

        # A leftover result file or leftover WARC files are treated as a harvest to resume.
        if os.path.exists(self.result_filepath) or len(
                self._list_warcs(self.warc_temp_dir)) > 0:
            self._load_result()
            self.result.warnings.append(
                Msg(CODE_HARVEST_RESUMED,
                    "Harvest resumed on {}".format(datetime_now())))
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)
            self._queue_warc_files()
        else:
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)

        # stop_harvest_loop_event tells the harvester to stop looping.
        # Only streaming harvesters loop.
        # For other harvesters, this is tripped after the first entrance into loop.
        self.stop_harvest_loop_event = threading.Event()

        # Supervisor sends a signal, indicating that the harvester should stop.
        # This is a graceful shutdown. Harvesting seeds is stopped and processing
        # is finished. This may take some time.
        # NOTE(review): within this method stop_harvest_seeds_event is first assigned
        # inside the harvest loop and restart_stream_timer only when streaming;
        # presumably these attributes and is_pause are initialised elsewhere
        # (e.g. __init__) -- confirm.
        def shutdown(signal_number, stack_frame):
            log.info("Shutdown triggered")
            # This is for the consumer.
            self.should_stop = True
            if self.is_pause:
                log.info("This will be a pause of the harvest.")
            self.stop_harvest_loop_event.set()
            # stop_event tells the harvester to stop harvest_seeds.
            # This will allow warcprox to exit.
            self.stop_harvest_seeds_event.set()
            if self.restart_stream_timer:
                self.restart_stream_timer.cancel()
            if self.queue_warc_files_timer:
                self.queue_warc_files_timer.cancel()

        signal.signal(signal.SIGTERM, shutdown)
        signal.signal(signal.SIGINT, shutdown)

        # SIGUSR1 only sets the is_pause flag; it does not stop anything by itself.
        # The shutdown handler reads the flag when SIGTERM/SIGINT arrives.
        def pause(signal_number, stack_frame):
            self.is_pause = True

        signal.signal(signal.SIGUSR1, pause)

        log.debug("Message is %s" % json.dumps(self.message, indent=4))

        # Setup the restart timer for streams
        # The restart timer stops and restarts the stream periodically.
        # This makes sure that each HTTP response is limited in size.
        if self.is_streaming:
            self.restart_stream_timer = threading.Timer(
                self.stream_restart_interval_secs, self._restart_stream)
            self.restart_stream_timer.start()

        # Start a queue warc files timer
        # NOTE(review): threading.Timer fires only once; presumably
        # _queue_warc_files re-arms the timer -- confirm.
        self.queue_warc_files_timer = threading.Timer(
            self.queue_warc_files_interval_secs, self._queue_warc_files)
        self.queue_warc_files_timer.start()

        while not self.stop_harvest_loop_event.is_set():
            # Reset the stop_harvest_seeds_event
            self.stop_harvest_seeds_event = threading.Event()

            # If this isn't streaming then set stop_harvest_loop_event so that looping doesn't occur.
            if not self.is_streaming:
                self.stop_harvest_loop_event.set()

            # Here is where the harvesting happens.
            # harvest_seeds() is attempted until it succeeds or self.tries attempts have failed.
            try_count = 0
            done = False
            while not done:
                try_count += 1
                log.debug("Try {} of {}".format(try_count, self.tries))
                try:
                    if self.use_warcprox:
                        # rollover_time is only passed for non-streaming harvests;
                        # streaming harvests pass interrupt=True instead.
                        with warced(safe_string(self.message["id"]),
                                    self.warc_temp_dir,
                                    debug=self.debug_warcprox,
                                    interrupt=self.is_streaming,
                                    rollover_time=self.warc_rollover_secs
                                    if not self.is_streaming else None):
                            self.harvest_seeds()
                    else:
                        self.harvest_seeds()
                    done = True
                    log.debug("Done harvesting seeds.")
                except Exception as e:
                    log.exception("Unknown error raised during harvest: %s", e)
                    if try_count == self.tries:
                        # Give up trying
                        log.debug(
                            "Too many retries, so giving up on harvesting seeds."
                        )
                        done = True
                        self.result.success = False
                        self.result.errors.append(
                            Msg(CODE_UNKNOWN_ERROR, str(e)))
                        # Also leave the outer loop so a streaming harvest does not start over.
                        self.stop_harvest_loop_event.set()
                    else:
                        # Retry
                        # Queue any WARC files
                        self._queue_warc_files()
                        # Wait for any WARC files to be processed
                        log.debug("Waiting for processing to complete.")
                        self.warc_processing_queue.join()
                        log.debug("Processing complete.")

            # Queue any WARC files
            self._queue_warc_files()

        # Turn off the restart_stream_timer.
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()

        # Turn off the queue WARC files timer
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

        # Finish processing
        self._finish_processing()

        # Delete temp dir
        if os.path.exists(self.warc_temp_dir):
            shutil.rmtree(self.warc_temp_dir)

        log.info("Done harvesting by message with id %s", self.message["id"])