Beispiel #1
0
 def _create_state_store(self):
     """Set up the per-harvest state store."""
     # Wrap the JSON-backed store so that writes are deferred until the
     # current WARC file has been fully processed.
     json_store = JsonHarvestStateStore(self.message["path"])
     self.state_store = DelayedSetStateStoreAdapter(json_store)
Beispiel #2
0
 def _create_state_store(self):
     """
     Create the state store used by this harvest.

     The JSON-backed store is wrapped in a delayed-set adapter so that
     state is not persisted until the current WARC file has been processed.
     """
     store = JsonHarvestStateStore(self.message["path"])
     self.state_store = DelayedSetStateStoreAdapter(store)
Beispiel #3
0
class BaseHarvester(BaseConsumer):
    """
    Base class for a harvester, allowing harvesting from a queue or from a file.

    Note that streams should only be harvested from a file as this does not support
    harvest stop messages. (See sfm-utils.stream_consumer.StreamConsumer.)

    Subclasses should override harvest_seeds() and process_warc().
    """
    def __init__(self, working_path, mq_config=None, stream_restart_interval_secs=30 * 60, debug=False,
                 use_warcprox=True, queue_warc_files_interval_secs=5 * 60, warc_rollover_secs=30 * 60,
                 debug_warcprox=False, tries=3, host=None):
        """
        :param working_path: path under which temp files and the result file are kept
        :param mq_config: MqConfig for connecting to the message queue, or None
        :param stream_restart_interval_secs: how often to stop/restart a streaming harvest
        :param debug: True to enable debug logging
        :param use_warcprox: True to record harvest_seeds() traffic with warcprox
        :param queue_warc_files_interval_secs: how often to queue completed WARC files
        :param warc_rollover_secs: warcprox WARC rollover time (non-streaming harvests only)
        :param debug_warcprox: True to enable warcprox debug logging
        :param tries: number of attempts at harvest_seeds() before giving up
        :param host: host name reported in status messages; defaults to $HOSTNAME or "localhost"
        """
        BaseConsumer.__init__(self, working_path=working_path, mq_config=mq_config, persist_messages=True)
        self.stream_restart_interval_secs = stream_restart_interval_secs
        self.is_streaming = False
        self.routing_key = ""
        self.warc_temp_dir = None
        self.stop_harvest_seeds_event = None
        self.stop_harvest_loop_event = None
        self.restart_stream_timer = None
        self.state_store = None
        self.debug = debug
        self.debug_warcprox = debug_warcprox
        self.use_warcprox = use_warcprox
        self.warc_processing_queue = Queue()
        self.result_filepath = None
        self.queue_warc_files_interval_secs = queue_warc_files_interval_secs
        self.queue_warc_files_timer = None
        self.warc_rollover_secs = warc_rollover_secs
        self.tries = tries

        # Create and start warc processing thread. It is a daemon thread, so it
        # runs until the process exits.
        self.warc_processing_thread = threading.Thread(target=self._process_warc_thread, name="warc_processing_thread")
        self.warc_processing_thread.daemon = True
        self.warc_processing_thread.start()
        self.host = host or os.environ.get("HOSTNAME", "localhost")

    def on_message(self):
        """
        Handle a harvest start message (available as self.message): run the
        harvest loop, queueing completed WARC files for processing and sending
        status messages as the harvest progresses.
        """
        assert self.message

        log.info("Harvesting by message with id %s", self.message["id"])

        self.result_filepath = os.path.join(self.working_path, "{}_result.json".format(safe_string(self.message["id"])))

        # Create a temp directory for WARCs
        self.warc_temp_dir = self._create_warc_temp_dir()
        self._create_state_store()

        # Possibly resume a harvest
        self.result = HarvestResult()
        self.result.started = datetime_now()

        # A leftover result file or leftover WARCs indicate an interrupted harvest to resume.
        if os.path.exists(self.result_filepath) or len(self._list_warcs(self.warc_temp_dir)) > 0:
            self._load_result()
            self.result.warnings.append(
                Msg(CODE_HARVEST_RESUMED, "Harvest resumed on {}".format(datetime_now())))
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)
            self._queue_warc_files()
        else:
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)

        # stop_harvest_loop_event tells the harvester to stop looping.
        # Only streaming harvesters loop.
        # For other harvesters, this is tripped after the first entrance into loop.
        self.stop_harvest_loop_event = threading.Event()

        # Supervisor sends a signal, indicating that the harvester should stop.
        # This is a graceful shutdown. Harvesting seeds is stopped and processing
        # is finished. This may take some time.
        def shutdown(signal_number, stack_frame):
            log.debug("Shutdown triggered")
            self.stop_harvest_loop_event.set()
            # stop_event tells the harvester to stop harvest_seeds.
            # This will allow warcprox to exit.
            self.stop_harvest_seeds_event.set()
            if self.restart_stream_timer:
                self.restart_stream_timer.cancel()
            if self.queue_warc_files_timer:
                self.queue_warc_files_timer.cancel()

        signal.signal(signal.SIGTERM, shutdown)
        signal.signal(signal.SIGINT, shutdown)

        # Lazy %-style args so the dump is only built when debug logging is on.
        log.debug("Message is %s", json.dumps(self.message, indent=4))

        # Setup the restart timer for streams
        # The restart timer stops and restarts the stream periodically.
        # This makes sure that each HTTP response is limited in size.
        if self.is_streaming:
            self.restart_stream_timer = threading.Timer(self.stream_restart_interval_secs, self._restart_stream)
            self.restart_stream_timer.start()

        # Start a queue warc files timer
        self.queue_warc_files_timer = threading.Timer(self.queue_warc_files_interval_secs, self._queue_warc_files)
        self.queue_warc_files_timer.start()

        while not self.stop_harvest_loop_event.is_set():
            # Reset the stop_harvest_seeds_event
            self.stop_harvest_seeds_event = threading.Event()

            # If this isn't streaming then set stop_harvest_loop_event so that looping doesn't occur.
            if not self.is_streaming:
                self.stop_harvest_loop_event.set()

            # Here is where the harvesting happens.
            try_count = 0
            done = False
            while not done:
                try_count += 1
                log.debug("Try {} of {}".format(try_count, self.tries))
                try:
                    if self.use_warcprox:
                        with warced(safe_string(self.message["id"]), self.warc_temp_dir, debug=self.debug_warcprox,
                                    interrupt=self.is_streaming,
                                    rollover_time=self.warc_rollover_secs if not self.is_streaming else None):
                            self.harvest_seeds()
                    else:
                        self.harvest_seeds()
                    done = True
                    log.debug("Done harvesting seeds.")
                except Exception as e:
                    log.exception("Unknown error raised during harvest: %s", e)
                    if try_count == self.tries:
                        # Give up trying
                        log.debug("Too many retries, so giving up on harvesting seeds.")
                        done = True
                        self.result.success = False
                        self.result.errors.append(Msg(CODE_UNKNOWN_ERROR, str(e)))
                        self.stop_harvest_loop_event.set()
                    else:
                        # Retry. First queue any WARC files and wait for them
                        # to be processed so that the retry starts clean.
                        self._queue_warc_files()
                        log.debug("Waiting for processing to complete.")
                        self.warc_processing_queue.join()
                        log.debug("Processing complete.")

            # Queue any WARC files
            self._queue_warc_files()

        # Turn off the restart_stream_timer.
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()

        # Turn off the queue WARC files timer
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

        # Finish processing
        self._finish_processing()

        # Delete temp dir
        if os.path.exists(self.warc_temp_dir):
            shutil.rmtree(self.warc_temp_dir)

        log.info("Done harvesting by message with id %s", self.message["id"])

    def _finish_processing(self):
        """
        Queue and wait for remaining WARC files, send the final status
        message, and remove the persisted result file.
        """
        # Otherwise, will not get the last WARC on a stop.
        # No time is OK on a container kill because will resume and process last file.
        # Queue any new files.
        # Wait for processing to complete.
        log.debug("Waiting for processing to complete.")
        self.warc_processing_queue.join()
        log.debug("Processing complete.")
        self.result.ended = datetime_now()

        # Send final message
        self._send_status_message(STATUS_SUCCESS if self.result.success else STATUS_FAILURE)

        # Delete result file
        if os.path.exists(self.result_filepath):
            os.remove(self.result_filepath)

    def _queue_warc_files(self):
        """
        Queue completed WARC files for processing and restart the queueing timer.
        """
        log.debug("Queueing WARC files")
        # Stop the timer so it does not fire while queueing.
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

        # Queue warc files
        for warc_filename in self._list_warcs(self.warc_temp_dir):
            log.debug("Queueing %s", warc_filename)
            self.warc_processing_queue.put(warc_filename)

        # Restart the timer (only if one had been started).
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer = threading.Timer(self.queue_warc_files_interval_secs, self._queue_warc_files)
            self.queue_warc_files_timer.start()

    def harvest_from_file(self, filepath, is_streaming=False, delete=False):
        """
        Performs a harvest based on a harvest start message contained in the
        provided filepath.

        :param filepath: filepath of the harvest start message
        :param is_streaming: True to run in streaming mode
        :param delete: True to delete the file when completed
        """
        self.is_streaming = is_streaming
        return self.message_from_file(filepath, delete=delete)

    def harvest_seeds(self):
        """
        Performs a harvest based on the seeds contained in the message.

        Subclasses must override this; the base implementation does nothing.

        When called, self.message, self.routing_key,
        and self.is_streaming will be populated.
        """
        pass

    def _create_state_store(self):
        """
        Creates a state store for the harvest.
        """
        # We'll be delaying writing to the state store until done processing the warc file.
        self.state_store = DelayedSetStateStoreAdapter(JsonHarvestStateStore(self.message["path"]))

    @staticmethod
    def _list_warcs(path):
        """
        Return the names of WARC files (.warc or .warc.gz) directly inside path.
        """
        warcs = []
        if os.path.exists(path):
            warcs = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and
                     (f.endswith(".warc") or f.endswith(".warc.gz"))]
            log.debug("Found following WARCs in %s: %s", path, warcs)
        else:
            # log.warn is a deprecated alias of log.warning.
            log.warning("Warc path %s does not exist. This may be OK.", path)
        return warcs

    @staticmethod
    def _path_for_warc(harvest_path, filename):
        """
        Return the destination path for a WARC file, derived from the
        timestamp (yyyy/mm/dd/hh) embedded in the warcprox filename.
        """
        # Raw string: "\d" is an invalid escape sequence in a plain string literal.
        m = re.search(r"-(\d{4})(\d{2})(\d{2})(\d{2})\d{7}-", filename)
        assert m
        return "/".join([harvest_path, m.group(1), m.group(2), m.group(3), m.group(4)])

    def _send_web_harvest_message(self):
        """
        Publish a harvest.start.web message containing the URLs collected so
        far, so that a web harvester can fetch them. No-op if there are no URLs.
        """
        urls_set = self.result.urls_as_set()
        if urls_set:
            message = {
                "id": uuid.uuid4().hex,
                "parent_id": self.message["id"],
                "path": self.message["path"],
                "type": "web",
                "seeds": [{"token": url} for url in urls_set],
                "collection_set": {
                    "id": self.message["collection_set"]["id"]
                },
                "collection": {
                    "id": self.message["collection"]["id"]
                }
            }

            # NOTE(review): "trunate_debug_length" matches _publish_message's
            # parameter spelling elsewhere in the project — confirm before renaming.
            self._publish_message("harvest.start.web", message, trunate_debug_length=5000)
        else:
            log.debug("No urls, so not sending a web harvest message.")

    def _send_status_message(self, status):
        """
        Publish a harvest status message (infos/warnings/errors, stats, WARC
        counts) with the provided status.
        """
        message = {
            "id": self.message["id"],
            "status": status,
            "infos": [msg.to_map() for msg in self.result.infos],
            "warnings": [msg.to_map() for msg in self.result.warnings],
            "errors": [msg.to_map() for msg in self.result.errors],
            "date_started": self.result.started.isoformat(),
            "stats": dict(),
            "token_updates": self.result.token_updates,
            "uids": self.result.uids,
            "warcs": {
                "count": len(self.result.warcs),
                "bytes": self.result.warc_bytes
            },
            # This will add spaces before caps
            "service": self._clean_name(self.__class__.__name__),
            "host": self.host,
            "instance": str(os.getpid())
        }

        for day, stats in self.result.stats().items():
            message["stats"][day.isoformat()] = dict(stats)

        if self.result.ended:
            message["date_ended"] = self.result.ended.isoformat()

        # Routing key may be none
        status_routing_key = self.routing_key.replace("start", "status")
        self._publish_message(status_routing_key, message)

    def _clean_name(self, name):
        """
        Return name with a space inserted before each capital letter,
        e.g. "TwitterHarvester" -> "Twitter Harvester".
        """
        # BUG FIX: the substituted string was previously computed but not returned.
        return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', name)

    @staticmethod
    def _sha1_of_file(filepath):
        """
        Return the hex SHA1 digest of a file, reading it in binary chunks so
        large WARCs are not loaded into memory at once.
        """
        sha1 = hashlib.sha1()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(64 * 1024), b""):
                sha1.update(chunk)
        return sha1.hexdigest()

    def _send_warc_created_message(self, warc_path):
        """
        Publish a warc_created message describing the moved WARC file.
        """
        message = {
            "harvest": {
                "id": self.message["id"],
                "type": self.message["type"]
            },
            "collection_set": {
                "id": self.message["collection_set"]["id"]
            },
            "collection": {
                "id": self.message["collection"]["id"]
            },
            "warc": {
                "id": uuid.uuid4().hex,
                "path": warc_path,
                "date_created": datetime_from_stamp(os.path.getctime(warc_path)).isoformat(),
                "bytes": os.path.getsize(warc_path),
                # BUG FIX: the file was previously opened in text mode and never
                # closed; hashing text fails in Python 3 and leaks the handle.
                "sha1": self._sha1_of_file(warc_path)
            }
        }
        self._publish_message("warc_created", message)

    def _create_warc_temp_dir(self):
        """
        Create temporary directory for WARC files.

        :return: the directory path
        """
        path = os.path.join(self.working_path, "tmp", safe_string(self.message["id"]))
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    def _restart_stream(self):
        """
        Stop harvest_seeds (the harvest loop will restart it) and reschedule
        this timer. Keeps each streaming HTTP response limited in size.
        """
        log.debug("Restarting stream.")
        self.stop_harvest_seeds_event.set()
        self.restart_stream_timer = threading.Timer(self.stream_restart_interval_secs, self._restart_stream)
        self.restart_stream_timer.start()

    def _save_result(self):
        """
        Persist the harvest result to the result file so that an interrupted
        harvest can be resumed (see _load_result).
        """
        result_message = {
            "warcs": self.result.warcs,
            "warc_bytes": self.result.warc_bytes,
            "stats": [],
            "started": self.result.started.isoformat(),
            "infos": [msg.to_map() for msg in self.result.infos],
            "warnings": [msg.to_map() for msg in self.result.warnings],
            "errors": [msg.to_map() for msg in self.result.errors]
        }

        for day, stats in self.result.stats().items():
            result_message["stats"].append((day.isoformat(), dict(stats)))

        with codecs.open(self.result_filepath, 'w') as f:
            json.dump(result_message, f)

        log.debug("Persisted result to %s", self.result_filepath)

    def _load_result(self):
        """
        Restore a previously persisted harvest result (see _save_result) into
        self.result. No-op if there is no result file.
        """
        if os.path.exists(self.result_filepath):
            log.info("Resuming from previous results")
            with codecs.open(self.result_filepath, 'r') as f:
                result_message = json.load(f)
            log.debug("Previous results: {}".format(json.dumps(result_message, indent=4)))
            self.result.warcs = result_message["warcs"]
            self.result.warc_bytes = result_message["warc_bytes"]
            self.result.started = iso8601.parse_date(result_message["started"])
            self.result.infos = list([Msg(msg["code"], msg["message"]) for msg in result_message["infos"]])
            self.result.warnings = list([Msg(msg["code"], msg["message"]) for msg in result_message["warnings"]])
            self.result.errors = list([Msg(msg["code"], msg["message"]) for msg in result_message["errors"]])

            for day, stats in result_message["stats"]:
                for item, count in stats.items():
                    self.result.increment_stats(item, count=count, day=iso8601.parse_date(day).date())

    def _process_warc_thread(self):
        """
        Worker loop (run on a daemon thread) that takes WARC filenames from
        warc_processing_queue, processes each WARC, moves it into place, and
        sends the associated messages.
        """
        log.info("Starting WARC processing thread")
        # This will continue until harvester is killed (the loop never exits;
        # the thread is a daemon).
        while True:
            # This will block
            try:
                warc_filename = self.warc_processing_queue.get(timeout=1)
            except Empty:
                continue
            # Make sure file exists. Possible that same file will be put in queue multiple times.
            warc_filepath = os.path.join(self.warc_temp_dir, warc_filename)
            if os.path.exists(warc_filepath):
                log.info("Processing %s", warc_filename)

                # Process the warc
                self.process_warc(warc_filepath)

                # Send web harvest message
                self._send_web_harvest_message()

                # Since the urls were sent, clear them
                self.result.urls = []

                # Move the warc
                dest_path = self._path_for_warc(self.message["path"], warc_filename)
                dest_warc_filepath = os.path.join(dest_path, warc_filename)
                log.debug("Moving %s to %s", warc_filepath, dest_warc_filepath)
                if not os.path.exists(dest_path):
                    os.makedirs(dest_path)
                shutil.move(warc_filepath, dest_warc_filepath)

                # Persist the state
                self.state_store.pass_state()

                # Add it to result
                self.result.add_warc(dest_warc_filepath)

                # Send warc created message
                self._send_warc_created_message(dest_warc_filepath)

                # Send status message
                self._send_status_message(STATUS_RUNNING)

                # Since these were sent, clear them.
                self.result.token_updates = {}
                self.result.uids = {}

                # Persist the result for resuming
                self._save_result()

            else:
                log.debug("Skipping processing %s", warc_filename)
            # Mark this as done.
            self.warc_processing_queue.task_done()

    def process_warc(self, warc_filepath):
        """
        Processes the provided WARC file.

        Subclasses must override this; the base implementation does nothing.

        Processing involves:
        * Extracting URLs and adding to self.result.
        * Save state to self.state_store.
        * Increment counts in self.result.
        """
        pass

    @staticmethod
    def main(cls, queue, routing_keys):
        """
        A configurable main() for a harvester.

        For example:
            if __name__ == "__main__":
                TwitterHarvester.main(TwitterHarvester, QUEUE, [ROUTING_KEY])

        :param cls: the harvester class
        :param queue: queue for the harvester
        :param routing_keys: list of routing keys for the harvester
        """

        # Logging
        logging.basicConfig(format='%(asctime)s: %(name)s --> %(message)s', level=logging.DEBUG)

        # Arguments
        parser = argparse.ArgumentParser()
        parser.add_argument("--debug", type=lambda v: v.lower() in ("yes", "true", "t", "1"), nargs="?",
                            default="False", const="True")
        parser.add_argument("--debug-http", type=lambda v: v.lower() in ("yes", "true", "t", "1"), nargs="?",
                            default="False", const="True")
        parser.add_argument("--debug-warcprox", type=lambda v: v.lower() in ("yes", "true", "t", "1"), nargs="?",
                            default="False", const="True")

        subparsers = parser.add_subparsers(dest="command")

        service_parser = subparsers.add_parser("service", help="Run harvesting service that consumes messages from "
                                                               "messaging queue.")
        service_parser.add_argument("host")
        service_parser.add_argument("username")
        service_parser.add_argument("password")
        service_parser.add_argument("working_path")
        service_parser.add_argument("--skip-resume", action="store_true")
        service_parser.add_argument("--tries", type=int, default=3, help="Number of times to try harvests if errors.")

        seed_parser = subparsers.add_parser("seed", help="Harvest based on a seed file.")
        seed_parser.add_argument("filepath", help="Filepath of the seed file.")
        seed_parser.add_argument("working_path")
        seed_parser.add_argument("--streaming", action="store_true", help="Run in streaming mode.")
        seed_parser.add_argument("--host")
        seed_parser.add_argument("--username")
        seed_parser.add_argument("--password")
        seed_parser.add_argument("--tries", type=int, default=3, help="Number of times to try harvests if errors.")

        args = parser.parse_args()

        # Logging
        # BUG FIX: these previously passed logging.debug (the module-level
        # function) to setLevel instead of the logging.DEBUG level constant.
        logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
        logging.getLogger("requests").setLevel(logging.DEBUG if args.debug_http else logging.INFO)
        logging.getLogger("requests_oauthlib").setLevel(logging.DEBUG if args.debug_http else logging.INFO)
        logging.getLogger("oauthlib").setLevel(logging.DEBUG if args.debug_http else logging.INFO)

        if args.command == "service":
            harvester = cls(args.working_path, mq_config=MqConfig(args.host, args.username, args.password, EXCHANGE,
                                                                  {queue: routing_keys}),
                            debug=args.debug, debug_warcprox=args.debug_warcprox, tries=args.tries)
            if not args.skip_resume:
                harvester.resume_from_file()
            harvester.run()
        elif args.command == "seed":
            mq_config = MqConfig(args.host, args.username, args.password, EXCHANGE, None) \
                if args.host and args.username and args.password else None
            harvester = cls(args.working_path, mq_config=mq_config, debug=args.debug,
                            debug_warcprox=args.debug_warcprox, tries=args.tries)
            harvester.harvest_from_file(args.filepath, is_streaming=args.streaming)
            # BUG FIX: this exit-code logic was previously guarded by
            # "if __name__ == '__main__':", which is always False inside an
            # imported module, so the exit codes were never produced.
            if harvester.result:
                log.info("Result is: %s", harvester.result)
                sys.exit(0)
            else:
                log.warning("Result is: %s", harvester.result)
                sys.exit(1)
Beispiel #4
0
class BaseHarvester(BaseConsumer):
    """
    Base class for a harvester, allowing harvesting from a queue or from a file.

    Note that streams should only be harvested from a file as this does not support
    harvest stop messages. (See sfm-utils.stream_consumer.StreamConsumer.)

    Subclasses should overrride harvest_seeds().
    """
    def __init__(self,
                 working_path,
                 mq_config=None,
                 stream_restart_interval_secs=30 * 60,
                 debug=False,
                 use_warcprox=True,
                 queue_warc_files_interval_secs=5 * 60,
                 warc_rollover_secs=30 * 60,
                 debug_warcprox=False,
                 tries=3,
                 host=None):
        """
        :param working_path: path under which temp files and the result file are kept
        :param mq_config: MqConfig for connecting to the message queue, or None
        :param stream_restart_interval_secs: how often to stop/restart a streaming harvest
        :param debug: True to enable debug logging
        :param use_warcprox: True to record harvest_seeds() traffic with warcprox
        :param queue_warc_files_interval_secs: how often to queue completed WARC files
        :param warc_rollover_secs: warcprox WARC rollover time (non-streaming harvests only)
        :param debug_warcprox: True to enable warcprox debug logging
        :param tries: number of attempts at harvest_seeds() before giving up
        :param host: host name reported in status messages; defaults to $HOSTNAME or "localhost"
        """
        BaseConsumer.__init__(self,
                              working_path=working_path,
                              mq_config=mq_config,
                              persist_messages=True)
        self.stream_restart_interval_secs = stream_restart_interval_secs
        self.is_streaming = False
        self.routing_key = ""
        self.warc_temp_dir = None
        self.stop_harvest_seeds_event = threading.Event()
        self.stop_harvest_loop_event = threading.Event()
        self.restart_stream_timer = None
        self.state_store = None
        self.debug = debug
        self.debug_warcprox = debug_warcprox
        self.use_warcprox = use_warcprox
        self.warc_processing_queue = Queue()
        self.result_filepath = None
        self.queue_warc_files_interval_secs = queue_warc_files_interval_secs
        self.queue_warc_files_timer = None
        self.warc_rollover_secs = warc_rollover_secs
        self.tries = tries

        # Create and start warc processing thread.
        # Daemon thread: it runs until the process exits (see _process_warc_thread).
        self.warc_processing_thread = threading.Thread(
            target=self._process_warc_thread, name="warc_processing_thread")
        self.warc_processing_thread.daemon = True
        self.warc_processing_thread.start()
        self.host = host or os.environ.get("HOSTNAME", "localhost")
        # Indicates that the next shutdown should be treated as a pause of the harvest, rather than a completion.
        # Set by the SIGUSR1 handler registered in on_message.
        self.is_pause = False

    def on_message(self):
        """
        Handle a harvest start message (available as self.message): run the
        harvest loop, queueing completed WARC files for processing and sending
        status messages as the harvest progresses. Registers SIGTERM/SIGINT
        handlers for graceful shutdown and SIGUSR1 to mark the shutdown as a
        pause (see _finish_processing).
        """
        assert self.message

        log.info("Harvesting by message with id %s", self.message["id"])

        self.result_filepath = os.path.join(
            self.working_path,
            "{}_result.json".format(safe_string(self.message["id"])))

        # Create a temp directory for WARCs
        self.warc_temp_dir = self._create_warc_temp_dir()
        self._create_state_store()

        # Possibly resume a harvest
        self.result = HarvestResult()
        self.result.started = datetime_now()

        # A leftover result file or leftover WARCs indicate an interrupted
        # (or paused) harvest to resume.
        if os.path.exists(self.result_filepath) or len(
                self._list_warcs(self.warc_temp_dir)) > 0:
            self._load_result()
            self.result.warnings.append(
                Msg(CODE_HARVEST_RESUMED,
                    "Harvest resumed on {}".format(datetime_now())))
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)
            self._queue_warc_files()
        else:
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)

        # stop_harvest_loop_event tells the harvester to stop looping.
        # Only streaming harvesters loop.
        # For other harvesters, this is tripped after the first entrance into loop.
        self.stop_harvest_loop_event = threading.Event()

        # Supervisor sends a signal, indicating that the harvester should stop.
        # This is a graceful shutdown. Harvesting seeds is stopped and processing
        # is finished. This may take some time.
        def shutdown(signal_number, stack_frame):
            log.info("Shutdown triggered")
            # This is for the consumer.
            self.should_stop = True
            if self.is_pause:
                log.info("This will be a pause of the harvest.")
            self.stop_harvest_loop_event.set()
            # stop_event tells the harvester to stop harvest_seeds.
            # This will allow warcprox to exit.
            self.stop_harvest_seeds_event.set()
            if self.restart_stream_timer:
                self.restart_stream_timer.cancel()
            if self.queue_warc_files_timer:
                self.queue_warc_files_timer.cancel()

        signal.signal(signal.SIGTERM, shutdown)
        signal.signal(signal.SIGINT, shutdown)

        # SIGUSR1 marks the next shutdown as a pause rather than a completion.
        def pause(signal_number, stack_frame):
            self.is_pause = True

        signal.signal(signal.SIGUSR1, pause)

        log.debug("Message is %s" % json.dumps(self.message, indent=4))

        # Setup the restart timer for streams
        # The restart timer stops and restarts the stream periodically.
        # This makes sure that each HTTP response is limited in size.
        if self.is_streaming:
            self.restart_stream_timer = threading.Timer(
                self.stream_restart_interval_secs, self._restart_stream)
            self.restart_stream_timer.start()

        # Start a queue warc files timer
        self.queue_warc_files_timer = threading.Timer(
            self.queue_warc_files_interval_secs, self._queue_warc_files)
        self.queue_warc_files_timer.start()

        while not self.stop_harvest_loop_event.is_set():
            # Reset the stop_harvest_seeds_event
            self.stop_harvest_seeds_event = threading.Event()

            # If this isn't streaming then set stop_harvest_seeds_event so that looping doesn't occur.
            if not self.is_streaming:
                self.stop_harvest_loop_event.set()

            # Here is where the harvesting happens.
            try_count = 0
            done = False
            while not done:
                try_count += 1
                log.debug("Try {} of {}".format(try_count, self.tries))
                try:
                    if self.use_warcprox:
                        with warced(safe_string(self.message["id"]),
                                    self.warc_temp_dir,
                                    debug=self.debug_warcprox,
                                    interrupt=self.is_streaming,
                                    rollover_time=self.warc_rollover_secs
                                    if not self.is_streaming else None):
                            self.harvest_seeds()
                    else:
                        self.harvest_seeds()
                    done = True
                    log.debug("Done harvesting seeds.")
                except Exception as e:
                    log.exception("Unknown error raised during harvest: %s", e)
                    if try_count == self.tries:
                        # Give up trying
                        log.debug(
                            "Too many retries, so giving up on harvesting seeds."
                        )
                        done = True
                        self.result.success = False
                        self.result.errors.append(
                            Msg(CODE_UNKNOWN_ERROR, str(e)))
                        self.stop_harvest_loop_event.set()
                    else:
                        # Retry
                        # Queue any WARC files
                        self._queue_warc_files()
                        # Wait for any WARC files to be processed
                        log.debug("Waiting for processing to complete.")
                        self.warc_processing_queue.join()
                        log.debug("Processing complete.")

            # Queue any WARC files
            self._queue_warc_files()

        # Turn off the restart_stream_timer.
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()

        # Turn off the queue WARC files timer
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

        # Finish processing
        self._finish_processing()

        # Delete temp dir
        if os.path.exists(self.warc_temp_dir):
            shutil.rmtree(self.warc_temp_dir)

        log.info("Done harvesting by message with id %s", self.message["id"])

    def _finish_processing(self):
        """
        Wait for queued WARC files to finish processing, then either complete
        the harvest (send the final status and delete the result file) or,
        when a pause was requested via SIGUSR1, send a paused status and keep
        the result file so the harvest can be resumed.
        """
        # Otherwise, will not get the last WARC on a stop.
        # No time is OK on a container kill because will resume and process last file.
        # Queue any new files.
        # Wait for processing to complete.
        log.debug("Waiting for processing to complete.")
        self.warc_processing_queue.join()
        log.debug("Processing complete.")

        if not self.is_pause:
            self.result.ended = datetime_now()

            # Send final message
            self._send_status_message(
                STATUS_SUCCESS if self.result.success else STATUS_FAILURE)

            # Delete result file
            if os.path.exists(self.result_filepath):
                os.remove(self.result_filepath)
        else:
            log.info("Pausing this harvest.")

            # Send final message
            # The result file is intentionally kept so the harvest can resume.
            self._send_status_message(STATUS_PAUSED)

    def _queue_warc_files(self):
        """
        Puts any WARC files found in the temp directory on the processing queue.

        The periodic timer (if one is running) is suspended while queueing and
        rescheduled afterwards.
        """
        log.debug("Queueing WARC files")
        # Suspend the periodic timer while queueing.
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

        # Hand each WARC in the temp directory to the processing thread.
        for filename in self._list_warcs(self.warc_temp_dir):
            log.debug("Queueing %s", filename)
            self.warc_processing_queue.put(filename)

        # Reschedule the timer only if one was previously in use.
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer = threading.Timer(
                self.queue_warc_files_interval_secs, self._queue_warc_files)
            self.queue_warc_files_timer.start()

    def harvest_from_file(self, filepath, is_streaming=False, delete=False):
        """
        Performs a harvest driven by a harvest start message read from a file.

        :param filepath: filepath of the harvest start message
        :param is_streaming: True to run in streaming mode
        :param delete: True to delete the file when completed
        :return: the result of processing the message
        """
        # Record the streaming flag before the message is processed so that
        # harvest_seeds() sees it.
        self.is_streaming = is_streaming
        return self.message_from_file(filepath, delete=delete)

    def harvest_seeds(self):
        """
        Harvests the seeds contained in the message.

        Subclasses override this; by the time it is invoked, self.message,
        self.routing_key, and self.is_streaming have been populated.
        """
        pass

    def _create_state_store(self):
        """
        Creates the state store used for this harvest.
        """
        # Writes are delayed until the WARC file has been fully processed.
        store = JsonHarvestStateStore(self.message["path"])
        self.state_store = DelayedSetStateStoreAdapter(store)

    @staticmethod
    def _list_warcs(path):
        """
        Lists the WARC files (.warc or .warc.gz) directly within a directory.

        :param path: the directory to scan; may not exist
        :return: list of WARC filenames (not full paths); empty if none are
            found or the directory does not exist
        """
        warcs = []
        if os.path.exists(path):
            # str.endswith accepts a tuple, so one call covers both extensions.
            warcs = [
                f for f in os.listdir(path)
                if os.path.isfile(os.path.join(path, f))
                and f.endswith((".warc", ".warc.gz"))
            ]
            log.debug("Found following WARCs in %s: %s", path, warcs)
        else:
            log.warning("Warc path %s does not exist. This may be OK.", path)
        return warcs

    @staticmethod
    def _path_for_warc(harvest_path, filename):
        m = re.search("-(\d{4})(\d{2})(\d{2})(\d{2})\d{7}-", filename)
        assert m
        return "/".join(
            [harvest_path,
             m.group(1),
             m.group(2),
             m.group(3),
             m.group(4)])

    def _send_status_message(self, status):
        """
        Publishes a harvest status message summarizing the result so far.

        :param status: the status value to report
        """
        # Insert spaces before interior capitals to turn the class name into
        # a human-readable service label.
        service_name = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ',
                              self.__class__.__name__)

        message = {
            "id": self.message["id"],
            "status": status,
            "infos": [msg.to_map() for msg in self.result.infos],
            "warnings": [msg.to_map() for msg in self.result.warnings],
            "errors": [msg.to_map() for msg in self.result.errors],
            "date_started": self.result.started.isoformat(),
            "stats": {
                day.isoformat(): dict(day_stats)
                for day, day_stats in self.result.stats().items()
            },
            "token_updates": self.result.token_updates,
            "uids": self.result.uids,
            "warcs": {
                "count": len(self.result.warcs),
                "bytes": self.result.warc_bytes
            },
            "service": service_name,
            "host": self.host,
            "instance": str(os.getpid())
        }

        if self.result.ended:
            message["date_ended"] = self.result.ended.isoformat()

        # Routing key may be none
        log.info("Sending status message for harvest %s: %s",
                 self.message["id"], status)
        status_routing_key = self.routing_key.replace("start", "status")
        self._publish_message(status_routing_key, message)

    @staticmethod
    def _clean_name(name):
        re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', name)

    def _send_warc_created_message(self, warc_path):
        """
        Publishes a warc_created message describing a newly written WARC.

        :param warc_path: path of the WARC file
        """
        # Fix: the file handle was opened without being closed, and the whole
        # WARC was read into memory at once. Hash in chunks inside a context
        # manager instead.
        sha1 = hashlib.sha1()
        with open(warc_path, "rb") as f:
            for chunk in iter(lambda: f.read(65536), b""):
                sha1.update(chunk)

        message = {
            "harvest": {
                "id": self.message["id"],
                "type": self.message["type"]
            },
            "collection_set": {
                "id": self.message["collection_set"]["id"]
            },
            "collection": {
                "id": self.message["collection"]["id"]
            },
            "warc": {
                "id": uuid.uuid4().hex,
                "path": warc_path,
                "date_created":
                    datetime_from_stamp(os.path.getctime(warc_path)).isoformat(),
                "bytes": os.path.getsize(warc_path),
                "sha1": sha1.hexdigest()
            }
        }
        self._publish_message("warc_created", message)

    def _create_warc_temp_dir(self):
        """
        Creates (if necessary) the temporary directory for WARC files.

        :return: the directory path
        """
        path = os.path.join(self.working_path, "tmp",
                            safe_string(self.message["id"]))
        # exist_ok avoids a race between the existence check and creation.
        os.makedirs(path, exist_ok=True)
        return path

    def _restart_stream(self):
        """
        Signals the running stream to stop and schedules the next restart.
        """
        log.debug("Restarting stream.")
        # Ask harvest_seeds() to wind down the current streaming connection.
        self.stop_harvest_seeds_event.set()
        # Arm the timer again for the next periodic restart.
        timer = threading.Timer(self.stream_restart_interval_secs,
                                self._restart_stream)
        self.restart_stream_timer = timer
        timer.start()

    def _save_result(self):
        """
        Persists the harvest result to self.result_filepath as JSON so the
        harvest can be resumed after an interruption.
        """
        # Serialize per-day stats as (iso-date, counts) pairs.
        stats = [(day.isoformat(), dict(day_stats))
                 for day, day_stats in self.result.stats().items()]

        result_message = {
            "warcs": self.result.warcs,
            "warc_bytes": self.result.warc_bytes,
            "stats": stats,
            "started": self.result.started.isoformat(),
            "infos": [msg.to_map() for msg in self.result.infos],
            "warnings": [msg.to_map() for msg in self.result.warnings],
            "errors": [msg.to_map() for msg in self.result.errors]
        }

        with codecs.open(self.result_filepath, 'w') as f:
            json.dump(result_message, f)

        log.debug("Persisted result to %s", self.result_filepath)

    def _load_result(self):
        """
        Restores a previously persisted harvest result from
        self.result_filepath, if it exists, so the harvest can resume.
        """
        # Guard clause: nothing to resume from.
        if not os.path.exists(self.result_filepath):
            return
        log.info("Resuming from previous results")
        with codecs.open(self.result_filepath, 'r') as f:
            result_message = json.load(f)
        log.debug("Previous results: {}".format(
            json.dumps(result_message, indent=4)))
        self.result.warcs = result_message["warcs"]
        self.result.warc_bytes = result_message["warc_bytes"]
        self.result.started = iso8601.parse_date(result_message["started"])
        # A comprehension already yields a list; the list(...) wrappers that
        # copied each one were redundant.
        self.result.infos = [
            Msg(msg["code"], msg["message"])
            for msg in result_message["infos"]
        ]
        self.result.warnings = [
            Msg(msg["code"], msg["message"])
            for msg in result_message["warnings"]
        ]
        self.result.errors = [
            Msg(msg["code"], msg["message"])
            for msg in result_message["errors"]
        ]

        # Rebuild the per-day stats counters.
        for day, stats in result_message["stats"]:
            for item, count in stats.items():
                self.result.increment_stats(
                    item, count=count, day=iso8601.parse_date(day).date())

    def _process_warc_thread(self):
        """
        Worker loop that consumes WARC filenames from warc_processing_queue,
        processes each WARC, moves it to its destination, and reports status.

        Runs forever; intended to be executed on a separate thread.
        """
        log.info("Starting WARC processing thread")
        # This will continue until harvester is killed.
        while True:
            # This will block
            # The 1-second timeout keeps the thread responsive instead of
            # blocking indefinitely on an empty queue.
            try:
                warc_filename = self.warc_processing_queue.get(timeout=1)
            except Empty:
                continue
            # Make sure file exists. Possible that same file will be put in queue multiple times.
            warc_filepath = os.path.join(self.warc_temp_dir, warc_filename)
            if os.path.exists(warc_filepath):
                # Process the warc
                self.process_warc(warc_filepath)

                # Move the warc
                # Destination is derived from the timestamp in the filename.
                dest_path = self._path_for_warc(self.message["path"],
                                                warc_filename)
                dest_warc_filepath = os.path.join(dest_path, warc_filename)
                log.debug("Moving %s to %s", warc_filepath, dest_warc_filepath)
                if not os.path.exists(dest_path):
                    os.makedirs(dest_path)
                shutil.move(warc_filepath, dest_warc_filepath)

                # Persist the state
                # (state writes were delayed until the WARC was fully processed)
                self.state_store.pass_state()

                # Add it to result
                self.result.add_warc(dest_warc_filepath)

                # Send warc created message
                self._send_warc_created_message(dest_warc_filepath)

                # Send status message
                self._send_status_message(
                    STATUS_STOPPING if self.stop_harvest_seeds_event.is_set(
                    ) else STATUS_RUNNING)

                # Since these were sent, clear them.
                self.result.token_updates = {}
                self.result.uids = {}

                # Persist the result for resuming
                self._save_result()

            else:
                log.debug("Skipping processing %s", warc_filename)
            # Mark this as done.
            # Required so that join() on the queue can eventually return.
            self.warc_processing_queue.task_done()

    def on_persist_exception(self, exception):
        """
        Publishes a failure status message when persisting a message fails.

        :param exception: the exception raised while persisting
        """
        log.error("Handling on persist exception for %s", self.message["id"])
        # Insert spaces before interior capitals to turn the class name into
        # a human-readable service label.
        service_name = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ',
                              self.__class__.__name__)
        message = {
            "id": self.message["id"],
            "status": STATUS_FAILURE,
            "errors": [Msg(CODE_MSG_PERSIST_ERROR, str(exception)).to_map()],
            "date_started": datetime_now().isoformat(),
            "date_ended": datetime_now().isoformat(),
            "service": service_name,
            "host": self.host,
            "instance": str(os.getpid())
        }

        # Routing key may be none
        status_routing_key = self.routing_key.replace("start", "status")
        self._publish_message(status_routing_key, message)

    def process_warc(self, warc_filepath):
        """
        Processes a single WARC file.

        Subclasses are expected to save state to self.state_store and to
        increment counts in self.result.

        :param warc_filepath: path of the WARC file to process
        """
        pass

    @staticmethod
    def main(cls, queue, routing_keys):
        """
        A configurable main() for a harvester.

        For example:
            if __name__ == "__main__":
                TwitterHarvester.main(TwitterHarvester, QUEUE, [ROUTING_KEY])

        :param cls: the harvester class
        :param queue: queue for the harvester
        :param routing_keys: list of routing keys for the harvester
        """
        # Logging
        logging.basicConfig(format='%(asctime)s: %(name)s --> %(message)s',
                            level=logging.DEBUG)

        # Parses "yes"/"true"/"t"/"1" (any case) as True, anything else False.
        def bool_arg(v):
            return v.lower() in ("yes", "true", "t", "1")

        # Arguments
        parser = argparse.ArgumentParser()
        parser.add_argument("--debug", type=bool_arg, nargs="?",
                            default="False", const="True")
        parser.add_argument("--debug-http", type=bool_arg, nargs="?",
                            default="False", const="True")
        parser.add_argument("--debug-warcprox", type=bool_arg, nargs="?",
                            default="False", const="True")

        subparsers = parser.add_subparsers(dest="command")

        service_parser = subparsers.add_parser(
            "service",
            help="Run harvesting service that consumes messages from "
            "messaging queue.")
        service_parser.add_argument("host")
        service_parser.add_argument("username")
        service_parser.add_argument("password")
        service_parser.add_argument("working_path")
        service_parser.add_argument("--skip-resume", action="store_true")
        service_parser.add_argument(
            "--tries",
            type=int,
            default="3",
            help="Number of times to try harvests if errors.")
        service_parser.add_argument("--priority-queues", type=bool_arg,
                                    nargs="?", default="False", const="True")

        seed_parser = subparsers.add_parser(
            "seed", help="Harvest based on a seed file.")
        seed_parser.add_argument("filepath", help="Filepath of the seed file.")
        seed_parser.add_argument("working_path")
        seed_parser.add_argument("--streaming",
                                 action="store_true",
                                 help="Run in streaming mode.")
        seed_parser.add_argument("--host")
        seed_parser.add_argument("--username")
        seed_parser.add_argument("--password")
        seed_parser.add_argument(
            "--tries",
            type=int,
            default="3",
            help="Number of times to try harvests if errors.")

        args = parser.parse_args()

        # Quiet down chatty HTTP libraries unless --debug-http was given.
        for logger_name in ("requests", "requests_oauthlib", "oauthlib",
                            "urllib3"):
            logging.getLogger(logger_name).setLevel(
                logging.DEBUG if args.debug_http else logging.INFO)

        if args.command == "service":
            # Optionally add priority to queues
            if args.priority_queues:
                for i, key in enumerate(routing_keys):
                    routing_keys[i] = key + ".priority"
                queue += "_priority"

            harvester = cls(args.working_path,
                            mq_config=MqConfig(args.host, args.username,
                                               args.password, EXCHANGE,
                                               {queue: routing_keys}),
                            debug=args.debug,
                            debug_warcprox=args.debug_warcprox,
                            tries=args.tries)
            if not args.skip_resume:
                harvester.resume_from_file()
            harvester.run()
        elif args.command == "seed":
            mq_config = MqConfig(args.host, args.username, args.password, EXCHANGE, None) \
                if args.host and args.username and args.password else None
            harvester = cls(args.working_path,
                            mq_config=mq_config,
                            debug=args.debug,
                            debug_warcprox=args.debug_warcprox,
                            tries=args.tries)
            harvester.harvest_from_file(args.filepath,
                                        is_streaming=args.streaming)
            # Bug fix: this block was wrapped in `if __name__ == '__main__':`,
            # which is never true inside this library module (it reflects this
            # module's name, not the calling script's), so the exit status was
            # never reported for the seed command.
            if harvester.result:
                log.info("Result is: %s", harvester.result)
                sys.exit(0)
            else:
                log.warning("Result is: %s", harvester.result)
                sys.exit(1)