Example #1
File: engine.py Project: estin/pomp
    def pump(self, crawler):
        """Start crawling

        :param crawler: instance of :class:`pomp.core.base.BaseCrawler`
        """
        self.prepare(crawler)

        # add ENTRY_REQUESTS to the queue
        next_requests = getattr(crawler, 'ENTRY_REQUESTS', None)
        if next_requests:
            self._put_requests(
                iterator(next_requests), None,
            )

        while True:

            if self.queue_lock and self.queue_semaphore_value <= 0:
                self.queue_lock.acquire(True)

            next_requests = self.queue.get_requests(
                count=self.queue_semaphore_value
            )

            if isinstance(next_requests, StopCommand):
                break

            self.process_requests(
                iterator(next_requests), crawler,
            )

        self.finish(crawler)
Example #2
    def pump(self, crawler):
        """Start crawling

        :param crawler: instance of :class:`pomp.core.base.BaseCrawler`
        """
        self.prepare(crawler)

        # add ENTRY_REQUESTS to the queue
        next_requests = getattr(crawler, 'ENTRY_REQUESTS', None)
        if next_requests:
            self._put_requests(
                iterator(next_requests),
                None,
            )

        while True:

            if self.queue_lock and self.queue_semaphore_value <= 0:
                self.queue_lock.acquire(True)

            next_requests = self.queue.get_requests(
                count=self.queue_semaphore_value)

            if isinstance(next_requests, StopCommand):
                break

            self.process_requests(
                iterator(next_requests),
                crawler,
            )

        self.finish(crawler)
Example #3
    def on_parse_result(self, crawler, result, response):
        requests_from_items = ()
        if isinstance(result, types.GeneratorType):
            for items in result:
                requests_from_items = itertools.chain(
                    requests_from_items,
                    self._process_result(
                        crawler, iterator(items)
                    )
                )
        else:
            requests_from_items = self._process_result(
                crawler, iterator(result)
            )

        next_requests = crawler.next_requests(response)

        if requests_from_items:
            if next_requests:
                # chain the results of the crawler's extract_items
                # and next_requests methods
                next_requests = itertools.chain(
                    requests_from_items,
                    iterator(next_requests),
                )
            else:
                next_requests = requests_from_items

        return next_requests
Example #4
def _run_crawler_worker(params, response):
    pid = os.getpid()
    log.debug("Crawler worker pid=%s params=%s", pid, params)
    try:
        # Initialize crawler worker
        worker = params['worker_class'](**params.get('worker_kwargs', {}))

        # process response
        items = worker.extract_items(response)
        next_requests = worker.next_requests(response)

        if next_requests:
            return list(
                itertools.chain(
                    iterator(items),
                    iterator(next_requests),
                )
            )
        return list(iterator(items))

    except Exception:
        log.exception(
            "Exception on crawler worker pid=%s request=%s", pid, response
        )
        raise
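The `itertools.chain(iterator(items), iterator(next_requests))` pattern above lets `extract_items` and `next_requests` return either a single object or a sequence without special-casing in the worker. Below is a minimal self-contained sketch of the idea; the `iterator` helper here is a stand-in modelled on the behaviour asserted in the test examples below, not pomp's actual implementation.

import itertools

def iterator(var):
    # Stand-in helper: wrap scalars (including strings) into a
    # one-element iterator, pass real iterables through unchanged.
    if isinstance(var, (str, bytes)) or not hasattr(var, '__iter__'):
        return iter((var,))
    return var

items = 'item: page title'              # worker returned a single item
next_requests = ['/page/2', '/page/3']   # ...and a list of follow-up requests

combined = list(itertools.chain(iterator(items), iterator(next_requests)))
assert combined == ['item: page title', '/page/2', '/page/3']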
Example #5
def test_iterator():

    assert hasattr(iterator('a'), '__iter__')
    assert list(iterator('a')) == ['a']

    assert hasattr(iterator(1), '__iter__')

    assert hasattr(iterator(iterator('b')), '__iter__')
Example #6
def test_iterator():

    assert hasattr(iterator('a'), '__iter__')
    assert list(iterator('a')) == ['a']

    assert hasattr(iterator(1), '__iter__')

    assert hasattr(iterator(iterator('b')), '__iter__')
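The assertions above pin down the contract of the `iterator` helper: a single value, including a string, comes back wrapped in a one-element iterable, while objects that already support iteration pass through unchanged. A minimal sketch that satisfies those assertions, shown for illustration only and not taken from the project source:

def iterator(var):
    # Strings (and bytes) are treated as single values, not as
    # iterables of characters.
    if isinstance(var, (str, bytes)):
        return iter((var,))
    # Objects that are already iterable pass through unchanged.
    if hasattr(var, '__iter__'):
        return var
    # Any other single value is wrapped into a one-element iterator.
    return iter((var,))

assert list(iterator('a')) == ['a']
assert hasattr(iterator(1), '__iter__')
assert hasattr(iterator(iterator('b')), '__iter__')

Under this reading, callers such as `_put_requests(iterator(next_requests))` never need to distinguish between one request and many.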
Example #7
    async def pump(self, crawler):
        """Start crawling

        :param crawler: instance of :class:`pomp.core.base.BaseCrawler`
        """
        await self.prepare(crawler)

        # add ENTRY_REQUESTS to the queue
        next_requests = getattr(crawler, 'ENTRY_REQUESTS', None)
        if next_requests:
            await self._put_requests(iterator(next_requests))

        _pending_iteration_tasks = []

        def _on_iterations_task_done(task, future):
            _pending_iteration_tasks.remove(task)

        while True:

            if self.queue_lock and self.queue_semaphore_value <= 0:
                await self.queue_lock.acquire()

            next_requests = await self.queue.get_requests(
                count=self.queue_semaphore_value
            )

            if isinstance(next_requests, StopCommand):
                break

            # process requests and do not block loop
            task = asyncio.ensure_future(
                self.process_requests(
                    iterator(next_requests), crawler,
                )
            )
            _pending_iteration_tasks.append(task)
            task.add_done_callback(
                partial(_on_iterations_task_done, task)
            )

        # loop ended, but we have pending tasks - wait
        if _pending_iteration_tasks:
            log.debug(
                "Wait pending iteration tasks: %s",
                len(_pending_iteration_tasks),
            )
            await asyncio.wait(_pending_iteration_tasks)

        await self.finish(crawler)
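The async `pump` above schedules each batch of requests with `asyncio.ensure_future` so the queue-reading loop never blocks, keeps the resulting tasks in `_pending_iteration_tasks`, and waits for whatever is still in flight once a `StopCommand` arrives. A generic, self-contained sketch of that pattern, using hypothetical `consume` and `handle` coroutines rather than pomp's API:

import asyncio
from functools import partial

async def handle(item):
    # placeholder for real per-item processing
    await asyncio.sleep(0.01)

async def consume(queue):
    pending = []

    def on_done(task, future):
        pending.remove(task)

    while True:
        item = await queue.get()
        if item is None:  # stand-in for StopCommand
            break
        # schedule processing without blocking this loop
        task = asyncio.ensure_future(handle(item))
        pending.append(task)
        task.add_done_callback(partial(on_done, task))

    # the loop ended, but some tasks may still be running - wait for them
    if pending:
        await asyncio.wait(pending)

async def main():
    queue = asyncio.Queue()
    for item in (1, 2, 3, None):
        await queue.put(item)
    await consume(queue)

asyncio.run(main())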
Example #8
    def response_callback(self, crawler, response):

        log.info('Process %s', response)
        items = crawler.process(response)

        # pipe items
        for pipe in self.pipelines:
            items = filter(
                None,
                [pipe.process(crawler, i) for i in items],
            )

        # get next requests
        next_requests = crawler.next_requests(response)

        if self.queue:
            return next_requests  # return requests to pass through queue
        else:  # execute requests by `width first` or `depth first` methods
            if crawler.is_depth_first():
                if next_requests:

                    # next recursion step
                    next_requests = self.downloader.process(
                        iterator(next_requests), self.response_callback,
                        crawler)

                    self._sync_or_async(next_requests, crawler,
                                        self._on_next_requests)
                else:
                    if not self.stoped and not crawler.in_process():
                        self._stop(crawler)

                return None  # end of recursion
            else:
                return next_requests
Example #9
 def _do(self, requests, crawler):
     # execute request by downloader
     for req in filter_requests(requests):
         _requests = self.downloader.process(iterator(req),
                                             self.response_callback,
                                             crawler)
         self._sync_or_async(_requests, crawler, self._on_next_requests)
Example #10
    def put_requests(self, requests):
        if not self.script_sha:
            yield from self.register_script()

        for item in iterator(requests):
            log.debug("Put to queue: %s", item)
            data = self.serializer.dumps(item)
            yield from self.redis.evalsha(
                self.script_sha, keys=[], args=[item.get_identity(), data, ],
            )
Example #11
    def pump(self, crawler):
        """Start crawling

        :param crawler: crawler to execute :class:`pomp.core.base.BaseCrawler`
        """
        log.info('Prepare downloader: %s', self.downloader)
        self.downloader.prepare()

        self.stoped = False
        crawler._reset_state()

        log.info('Start crawler: %s', crawler)

        for pipe in self.pipelines:
            log.info('Start pipe: %s', pipe)
            pipe.start(crawler)

        self.stop_deferred = defer.Deferred()

        next_requests = getattr(crawler, 'ENTRY_REQUESTS', None)

        # process ENTRY_REQUESTS
        if next_requests:
            next_requests = self.downloader.process(
                iterator(crawler.ENTRY_REQUESTS),
                self.response_callback,
                crawler
            )

        if self.queue:
            if not next_requests:
                # no entry requests - get them from the queue
                next_requests = self._queue_get_requests()

            self._sync_or_async(
                next_requests,
                crawler,
                self._on_next_requests
            )
        else:  # recursive process
            if not crawler.is_depth_first():
                self._sync_or_async(
                    next_requests,
                    crawler,
                    self._on_next_requests
                )
            else:
                # width first method:
                # execute the generator
                if isinstance(next_requests, types.GeneratorType):
                    list(next_requests)  # fire generator
        return self.stop_deferred
Example #12
 def _do(self, requests, crawler):
     # execute request by downloader
     for req in filter_requests(requests):
         _requests = self.downloader.process(
             iterator(req),
             self.response_callback,
             crawler
         )
         self._sync_or_async(
             _requests,
             crawler,
             self._on_next_requests
         )
Example #13
def _run_crawler_worker(params, response):
    pid = os.getpid()
    log.debug("Crawler worker pid=%s params=%s", pid, params)
    try:
        # Initialize crawler worker
        worker = params['worker_class'](**params.get('worker_kwargs', {}))

        # process response
        items = worker.extract_items(response)
        next_requests = worker.next_requests(response)

        if next_requests:
            return list(
                itertools.chain(
                    iterator(items),
                    iterator(next_requests),
                ))
        return list(iterator(items))

    except Exception:
        log.exception("Exception on crawler worker pid=%s request=%s", pid,
                      response)
        raise
Example #14
    def put_requests(self, requests):
        if not self.script_sha:
            yield from self.register_script()

        for item in iterator(requests):
            log.debug("Put to queue: %s", item)
            data = self.serializer.dumps(item)
            yield from self.redis.evalsha(
                self.script_sha,
                keys=[],
                args=[
                    item.get_identity(),
                    data,
                ],
            )
Example #15
    def prepare(self, crawler):
        log.info('Prepare downloader: %s', self.downloader)
        self.downloader.prepare()
        self.in_progress = 0

        log.info('Start crawler: %s', crawler)

        for pipe in self.pipelines:
            log.info('Start pipe: %s', pipe)
            pipe.start(crawler)

        # add ENTRY_REQUESTS to the queue
        next_requests = getattr(crawler, 'ENTRY_REQUESTS', None)
        if next_requests:
            self._put_requests(iterator(next_requests), request_done=False)

        # configure queue semaphore
        self.queue_semaphore = self.get_queue_semaphore()
Example #16
    def pump(self, crawler):
        """Start crawling

        :param crawler: crawler to execute :class:`pomp.core.base.BaseCrawler`
        """
        log.info('Prepare downloader: %s', self.downloader)
        self.downloader.prepare()

        self.stoped = False
        crawler._reset_state()

        log.info('Start crawler: %s', crawler)

        for pipe in self.pipelines:
            log.info('Start pipe: %s', pipe)
            pipe.start(crawler)

        self.stop_deferred = defer.Deferred()

        next_requests = getattr(crawler, 'ENTRY_REQUESTS', None)

        # process ENTRY_REQUESTS
        if next_requests:
            next_requests = self.downloader.process(
                iterator(crawler.ENTRY_REQUESTS), self.response_callback,
                crawler)

        if self.queue:
            if not next_requests:
                # no entry requests - get them from the queue
                next_requests = self._queue_get_requests()

            self._sync_or_async(next_requests, crawler, self._on_next_requests)
        else:  # recursive process
            if not crawler.is_depth_first():
                self._sync_or_async(next_requests, crawler,
                                    self._on_next_requests)
            else:
                # width first method:
                # execute the generator
                if isinstance(next_requests, types.GeneratorType):
                    list(next_requests)  # fire generator
        return self.stop_deferred
Example #17
    def pump(self, crawler):
        """Start crawling

        :param crawler: instance of :class:`pomp.core.base.BaseCrawler`
        """
        self.prepare(crawler)

        while True:

            # do not fetch more requests from the queue than the
            # downloader can process
            if self.queue_semaphore:
                self.queue_semaphore.acquire(blocking=True)

            next_requests = self.queue.get_requests()
            if isinstance(next_requests, StopCommand):
                break
            self.process_requests(
                iterator(next_requests), crawler,
            )

        self.finish(crawler)
Example #18
    def response_callback(self, crawler, response):

        log.info('Process %s', response)
        items = crawler.process(response)

        # pipe items
        for pipe in self.pipelines:
            items = filter(
                None,
                [pipe.process(crawler, i) for i in items],
            )

        # get next requests
        next_requests = crawler.next_requests(response)

        if self.queue:
            return next_requests  # return requests to pass through queue
        else:  # execute requests by `width first` or `depth first` methods
            if crawler.is_depth_first():
                if next_requests:

                    # next recursion step
                    next_requests = self.downloader.process(
                        iterator(next_requests),
                        self.response_callback,
                        crawler
                    )

                    self._sync_or_async(
                        next_requests,
                        crawler,
                        self._on_next_requests
                    )
                else:
                    if not self.stoped and not crawler.in_process():
                        self._stop(crawler)

                return None  # end of recursion
            else:
                return next_requests
Example #19
    def on_parse_result(self, crawler, result, response):  # asyncio: async
        if isinstance(result, types.GeneratorType):
            for items in result:
                for request in self._process_items(
                        crawler, iterator(items),
                        response=response):  # asyncio: async  # noqa
                    self._put_requests(  # asyncio: await
                        iterator(request),
                        crawler=crawler,
                        response=response,
                    )
        else:
            for request in self._process_items(
                    crawler, iterator(result),
                    response=response):  # asyncio: async  # noqa
                self._put_requests(  # asyncio: await
                    iterator(request),
                    crawler=crawler,
                    response=response,
                )

        next_requests = (
            crawler.next_requests(response)  # asyncio: await _co(REPLACE)
        )

        if hasattr(next_requests, '__anext__'):  # support async generators
            for request in next_requests:  # asyncio: async
                self._put_requests(  # asyncio: await
                    iterator(request),
                    crawler=crawler,
                    response=response,
                )
        else:
            self._put_requests(  # asyncio: await
                iterator(next_requests),
                crawler=crawler,
                response=response,
            )

        self._request_done(response, crawler)  # asyncio: await
Example #20
File: engine.py Project: estin/pomp
    def on_parse_result(self, crawler, result, response):  # asyncio: async
        if isinstance(result, types.GeneratorType):
            for items in result:
                for request in self._process_items(crawler, iterator(items), response=response):  # asyncio: async  # noqa
                    self._put_requests(   # asyncio: await
                        iterator(request),
                        crawler=crawler,
                        response=response,
                    )
        else:
            for request in self._process_items(crawler, iterator(result), response=response):  # asyncio: async  # noqa
                self._put_requests(   # asyncio: await
                    iterator(request),
                    crawler=crawler,
                    response=response,
                )

        next_requests = (
            crawler.next_requests(response)  # asyncio: await _co(REPLACE)
        )

        if hasattr(next_requests, '__anext__'):  # support async generators
            for request in next_requests:  # asyncio: async
                self._put_requests(  # asyncio: await
                    iterator(request),
                    crawler=crawler,
                    response=response,
                )
        else:
            self._put_requests(  # asyncio: await
                iterator(next_requests),
                crawler=crawler,
                response=response,
            )

        self._request_done(response, crawler)  # asyncio: await
Example #21
def filter_requests(requests):
    return filter(lambda x: True if x else False, iterator(requests))
Example #22
 def get(self, requests):
     responses = []
     for request in iterator(requests):
         response = self._fetch(request)
         responses.append(response)
     return responses
Example #23
 def get(self, requests):
     for request in iterator(requests):
         yield self._fetch(request)
Example #24
 def get(self, requests):
     responses = []
     for request in iterator(requests):
         response = self._fetch(request)
         responses.append(response)
     return responses
Example #25
def filter_requests(requests):
    return filter(
        lambda x: True if x else False,
        iterator(requests)
    )
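`filter_requests` drops falsy entries, for example `None` placeholders for requests that should be skipped, while still accepting a single request thanks to `iterator`. A small usage sketch, reusing the same stand-in `iterator` helper as above rather than the project's code:

def iterator(var):
    # stand-in helper, see the test examples above
    if isinstance(var, (str, bytes)) or not hasattr(var, '__iter__'):
        return iter((var,))
    return var

def filter_requests(requests):
    return filter(lambda x: True if x else False, iterator(requests))

assert list(filter_requests(['GET /a', None, 'GET /b'])) == ['GET /a', 'GET /b']
assert list(filter_requests('GET /a')) == ['GET /a']
assert list(filter_requests(None)) == []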
Example #26
 def get(self, requests):
     for request in iterator(requests):
         yield self.worker.get_one(request)
Example #27
 def get(self, requests):
     for request in iterator(requests):
         yield self._fetch(request)