Ejemplo n.º 1
0
    def __init__(self, settings):
        """Wire up the queues, the two processors and their processes.

        Nothing is started here: the Process objects are only created, each
        targeting the `run` method of its processor.

        Args:
            settings: settings object; `thread_count` is read from it and the
                      object itself is handed to both processors
        """
        self.settings = settings
        self.pid = os.getpid()
        self.is_stopped = Event()

        # channels: one inbound queue per processor plus the shared queue
        # both processors are given as their second queue argument
        self.url_queue, self.content_queue, self.orch_queue = (
            Queue(), Queue(), Queue())

        worker_threads = self.settings.thread_count

        # content side; the literal 1 sits where the url processor receives
        # its thread count (presumably a single worker -- confirm)
        content_args = ("content_processor", self.content_queue,
                        self.orch_queue, 1, self.settings)
        self.content_processor = ContentProcessor(*content_args)
        self.content_process = Process(target=self.content_processor.run)

        # url side
        url_args = ("url_processor", self.url_queue, self.orch_queue,
                    worker_threads, self.settings)
        self.url_processor = UrlProcessor(*url_args)
        self.url_process = Process(target=self.url_processor.run)
Ejemplo n.º 2
0
class Orchestrator(object):
    """Orchestrator

    Orchestrator creates and starts the control flow. It builds a multiprocess
    framework, in which there are three objects: the orchestrator, the url
    processor, and the content processor. The url processor and the content
    processor each run in a subprocess, receive jobs from the orchestrator,
    execute the jobs, and send the results back to the orchestrator. They
    communicate through queues.

    Attributes:
        settings: a Settings instance for the orchestrator
        is_stopped: event that is set once stop() has taken effect
        pid: pid of the process that created this instance; stop() only acts
             when called from that same process
        url_queue: queue, the channel for the orchestrator sending jobs
                   (materials) to the url processor
        content_queue: queue, the channel for the orchestrator sending jobs
                       (materials) to the content processor
        orch_queue: queue, the channel for the orchestrator receiving the
                    executing result from the url/content processor
    """

    # seconds the main loop waits for a product before giving up
    IDLE_TIMEOUT = 5
    # seconds stop() waits for each sub process to exit
    JOIN_TIMEOUT = 3

    def __init__(self, settings):
        """Create the queues, the processors and their (unstarted) processes.

        Args:
            settings: settings object; `thread_count` and `url_list_file` are
                      read from it, and it is passed on to both processors
        """
        self.settings = settings
        self.is_stopped = Event()
        self.pid = os.getpid()
        # channels
        self.url_queue = Queue()
        self.content_queue = Queue()
        self.orch_queue = Queue()

        thread_count = self.settings.thread_count
        # create sub processes
        self.content_processor = ContentProcessor(
            "content_processor", self.content_queue, self.orch_queue, 1,
            self.settings)
        self.content_process = Process(target=self.content_processor.run)
        self.url_processor = UrlProcessor(
            "url_processor", self.url_queue, self.orch_queue, thread_count,
            self.settings)
        self.url_process = Process(target=self.url_processor.run)

    def _feed_seed_urls(self):
        """Read urls from url_list_file and put them into url_queue

        Every non-blank line of the file is stripped of surrounding
        whitespace and queued as a UrlMaterial with depth 0.
        """
        # read seed urls from file; `open` is used because the `file`
        # builtin only exists on Python 2
        url_list_file = self.settings.url_list_file
        with open(url_list_file) as urls_file:
            urls = [line.strip() for line in urls_file]

        # put the urls to url_queue
        for url in urls:
            if not url:
                # blank line (e.g. trailing newline): nothing to crawl
                continue
            material = UrlMaterial(url=url, depth=0)
            self.url_queue.put(material)
            logging.debug("feed seed url %s", material.url)

    def _loop(self):
        """Main loop of this instance

        The loop reads products from `orch_queue`. For every product, if it is
            1) a UrlProduct instance, then build a content material and send
               it to content_queue
            2) a ContentProduct instance, then build one url material per
               url in the product, with depth = product depth + 1, and send
               each of them to url_queue
        Products of any other type are ignored.

        No depth limit is applied here.
        NOTE(review): presumably the processors enforce the maximum depth --
        confirm.

        The loop ends when `is_stopped` is set, or when no product arrives
        within IDLE_TIMEOUT seconds, in which case stop() is called first.
        """
        while not self.is_stopped.is_set():
            try:
                product = self.orch_queue.get(timeout=self.IDLE_TIMEOUT)
            except Empty:
                logging.info("orchestrator loop broken for timeout")
                self.stop()
                break

            # dispatching
            if isinstance(product, UrlProduct):
                material = ContentMaterial(url=product.url,
                                           depth=product.depth,
                                           filepath=product.filepath,
                                           encoding=product.encoding)
                self.content_queue.put(material)
            elif isinstance(product, ContentProduct):
                for url in product.urls:
                    material = UrlMaterial(url=url, depth=product.depth + 1)
                    self.url_queue.put(material)

    def start(self):
        """Start the orchestrator

        Feeds the seed urls, starts both sub processes and runs the main
        loop until it ends. stop() is always called afterwards, including
        when the loop raises.
        """
        # feed seed urls
        self._feed_seed_urls()

        # start processors and looping
        self.content_process.start()
        self.url_process.start()
        logging.info("orchestrator loop")
        try:
            self._loop()
        finally:
            # make sure the processors are told to stop even if the loop
            # fails; stop() is a no-op when it already ran
            self.stop()

    def stop(self):
        """Stop the orchestrator

        Only takes effect once, and only in the process that created the
        instance (the pid check keeps other processes holding a copy of
        this object from acting). Each sub process is joined for at most
        JOIN_TIMEOUT seconds.
        """
        # close processors and wait
        if not self.is_stopped.is_set() and self.pid == os.getpid():
            self.is_stopped.set()

            # stop processors
            self.url_processor.stop()
            self.content_processor.stop()

            # wait processes exiting
            self.content_process.join(timeout=self.JOIN_TIMEOUT)
            self.url_process.join(timeout=self.JOIN_TIMEOUT)

            logging.info("orchestrator exit")