def pump(self, crawler):
    """Start crawling

    :param crawler: instance of :class:`pomp.core.base.BaseCrawler`
    """
    self.prepare(crawler)

    # add ENTRY_REQUESTS to the queue
    next_requests = getattr(crawler, 'ENTRY_REQUESTS', None)
    if next_requests:
        self._put_requests(
            iterator(next_requests), None,
        )

    while True:

        if self.queue_lock and self.queue_semaphore_value <= 0:
            self.queue_lock.acquire(True)

        next_requests = self.queue.get_requests(
            count=self.queue_semaphore_value
        )

        if isinstance(next_requests, StopCommand):
            break

        self.process_requests(
            iterator(next_requests), crawler,
        )

    self.finish(crawler)

def on_parse_result(self, crawler, result, response):
    requests_from_items = ()
    if isinstance(result, types.GeneratorType):
        for items in result:
            requests_from_items = itertools.chain(
                requests_from_items,
                self._process_result(crawler, iterator(items)),
            )
    else:
        requests_from_items = self._process_result(
            crawler, iterator(result)
        )

    next_requests = crawler.next_requests(response)

    if requests_from_items:
        if next_requests:
            # chain the results of the crawler's extract_items
            # and next_requests methods
            next_requests = itertools.chain(
                requests_from_items,
                iterator(next_requests),
            )
        else:
            next_requests = requests_from_items

    return next_requests

def _run_crawler_worker(params, response):
    pid = os.getpid()
    log.debug("Crawler worker pid=%s params=%s", pid, params)
    try:
        # initialize crawler worker
        worker = params['worker_class'](**params.get('worker_kwargs', {}))

        # process response
        items = worker.extract_items(response)
        next_requests = worker.next_requests(response)

        if next_requests:
            return list(
                itertools.chain(
                    iterator(items),
                    iterator(next_requests),
                )
            )
        return list(iterator(items))
    except Exception:
        log.exception(
            "Exception on crawler worker pid=%s request=%s", pid, response
        )
        raise

def test_iterator():
    assert hasattr(iterator('a'), '__iter__')
    assert list(iterator('a')) == ['a']
    assert hasattr(iterator(1), '__iter__')
    assert hasattr(iterator(iterator('b')), '__iter__')

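# The test above pins down the contract of ``iterator``: strings and
# scalars are wrapped into a single-element iterable, while objects that
# already iterate pass through. A minimal sketch of a helper satisfying
# that contract (an illustration, not necessarily pomp's actual
# implementation):
def iterator(var):
    # strings iterate char-by-char, so treat them as scalars here
    if isinstance(var, str):
        return iter((var,))
    try:
        return iter(var)
    except TypeError:
        # non-iterable scalar (e.g. an int) - wrap it as well
        return iter((var,))
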
async def pump(self, crawler):
    """Start crawling

    :param crawler: instance of :class:`pomp.core.base.BaseCrawler`
    """
    await self.prepare(crawler)

    # add ENTRY_REQUESTS to the queue
    next_requests = getattr(crawler, 'ENTRY_REQUESTS', None)
    if next_requests:
        await self._put_requests(iterator(next_requests))

    _pending_iteration_tasks = []

    def _on_iterations_task_done(task, future):
        _pending_iteration_tasks.remove(task)

    while True:

        if self.queue_lock and self.queue_semaphore_value <= 0:
            await self.queue_lock.acquire()

        next_requests = await self.queue.get_requests(
            count=self.queue_semaphore_value
        )

        if isinstance(next_requests, StopCommand):
            break

        # process requests without blocking the event loop
        task = asyncio.ensure_future(
            self.process_requests(
                iterator(next_requests), crawler,
            )
        )
        _pending_iteration_tasks.append(task)
        task.add_done_callback(
            partial(_on_iterations_task_done, task)
        )

    # the loop has ended, but some tasks may still be pending - wait
    if _pending_iteration_tasks:
        log.debug(
            "Wait pending iteration tasks: %s",
            len(_pending_iteration_tasks),
        )
        await asyncio.wait(_pending_iteration_tasks)

    await self.finish(crawler)

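# Driving the asyncio ``pump`` above from a script might look like the
# sketch below; ``AioPomp`` and ``MyCrawler`` are hypothetical names
# used only for illustration.
import asyncio

def main():
    engine = AioPomp()     # hypothetical asyncio engine class
    crawler = MyCrawler()  # hypothetical BaseCrawler subclass
    loop = asyncio.get_event_loop()
    loop.run_until_complete(engine.pump(crawler))
    loop.close()
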
def response_callback(self, crawler, response):
    log.info('Process %s', response)
    items = crawler.process(response)

    # pipe items
    for pipe in self.pipelines:
        items = filter(
            None,
            [pipe.process(crawler, i) for i in items],
        )

    # get next requests
    next_requests = crawler.next_requests(response)

    if self.queue:
        return next_requests  # return requests to pass through the queue
    else:
        # execute requests by the `width first` or `depth first` method
        if crawler.is_depth_first():
            if next_requests:
                # next recursion step
                next_requests = self.downloader.process(
                    iterator(next_requests),
                    self.response_callback,
                    crawler,
                )
                self._sync_or_async(
                    next_requests, crawler, self._on_next_requests
                )
            else:
                if not self.stoped and not crawler.in_process():
                    self._stop(crawler)
            return None  # end of recursion
        else:
            return next_requests

def _do(self, requests, crawler):
    # execute each request through the downloader
    for req in filter_requests(requests):
        _requests = self.downloader.process(
            iterator(req), self.response_callback, crawler
        )
        self._sync_or_async(_requests, crawler, self._on_next_requests)

def put_requests(self, requests):
    if not self.script_sha:
        yield from self.register_script()
    for item in iterator(requests):
        log.debug("Put to queue: %s", item)
        data = self.serializer.dumps(item)
        yield from self.redis.evalsha(
            self.script_sha,
            keys=[],
            args=[
                item.get_identity(),
                data,
            ],
        )

def pump(self, crawler):
    """Start crawling

    :param crawler: crawler to execute, a :class:`pomp.core.base.BaseCrawler`
    """
    log.info('Prepare downloader: %s', self.downloader)
    self.downloader.prepare()

    self.stoped = False
    crawler._reset_state()

    log.info('Start crawler: %s', crawler)

    for pipe in self.pipelines:
        log.info('Start pipe: %s', pipe)
        pipe.start(crawler)

    self.stop_deferred = defer.Deferred()

    next_requests = getattr(crawler, 'ENTRY_REQUESTS', None)

    # process ENTRY_REQUESTS
    if next_requests:
        next_requests = self.downloader.process(
            iterator(crawler.ENTRY_REQUESTS),
            self.response_callback,
            crawler,
        )

    if self.queue:
        if not next_requests:
            # no entry requests - fetch the next batch from the queue
            next_requests = self._queue_get_requests()
        self._sync_or_async(
            next_requests, crawler, self._on_next_requests
        )
    else:  # recursive processing
        if not crawler.is_depth_first():
            self._sync_or_async(
                next_requests, crawler, self._on_next_requests
            )
        else:
            # depth-first method: recursion is driven from
            # response_callback, so just fire the generator
            if isinstance(next_requests, types.GeneratorType):
                list(next_requests)  # fire generator

    return self.stop_deferred

def prepare(self, crawler):
    log.info('Prepare downloader: %s', self.downloader)
    self.downloader.prepare()

    self.in_progress = 0

    log.info('Start crawler: %s', crawler)

    for pipe in self.pipelines:
        log.info('Start pipe: %s', pipe)
        pipe.start(crawler)

    # add ENTRY_REQUESTS to the queue
    next_requests = getattr(crawler, 'ENTRY_REQUESTS', None)
    if next_requests:
        self._put_requests(iterator(next_requests), request_done=False)

    # configure queue semaphore
    self.queue_semaphore = self.get_queue_semaphore()

def pump(self, crawler):
    """Start crawling

    :param crawler: instance of :class:`pomp.core.base.BaseCrawler`
    """
    self.prepare(crawler)

    while True:

        # do not fetch more requests from the queue than the
        # downloader can process
        if self.queue_semaphore:
            self.queue_semaphore.acquire(blocking=True)

        next_requests = self.queue.get_requests()

        if isinstance(next_requests, StopCommand):
            break

        self.process_requests(
            iterator(next_requests), crawler,
        )

    self.finish(crawler)

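# ``get_queue_semaphore`` is not shown in these snippets; a plausible
# sketch (an assumption, not the actual implementation) bounds the
# number of requests in flight by the downloader's concurrency and
# returns None when there is no limit, so ``pump`` skips the acquire:
import threading

def get_queue_semaphore(self):
    # ``get_workers_count`` is an assumed downloader attribute
    workers = getattr(self.downloader, 'get_workers_count', lambda: 0)()
    return threading.BoundedSemaphore(workers) if workers > 0 else None
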
def on_parse_result(self, crawler, result, response):  # asyncio: async
    if isinstance(result, types.GeneratorType):
        for items in result:
            for request in self._process_items(
                    crawler, iterator(items),
                    response=response):  # asyncio: async # noqa
                self._put_requests(  # asyncio: await
                    iterator(request), crawler=crawler, response=response,
                )
    else:
        for request in self._process_items(
                crawler, iterator(result),
                response=response):  # asyncio: async # noqa
            self._put_requests(  # asyncio: await
                iterator(request), crawler=crawler, response=response,
            )

    next_requests = (
        crawler.next_requests(response)  # asyncio: await _co(REPLACE)
    )

    if hasattr(next_requests, '__anext__'):  # support async generators
        for request in next_requests:  # asyncio: async
            self._put_requests(  # asyncio: await
                iterator(request), crawler=crawler, response=response,
            )
    else:
        self._put_requests(  # asyncio: await
            iterator(next_requests), crawler=crawler, response=response,
        )

    self._request_done(response, crawler)  # asyncio: await

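# The ``# asyncio:`` trailing comments above look like markers for a
# code-generation step that derives an asyncio engine from this blocking
# source: ``# asyncio: async`` presumably adds the ``async`` keyword to
# the marked statement (``async def`` / ``async for``) and
# ``# asyncio: await`` prefixes the marked call with ``await``. A toy
# rewrite for the simple cases only (markers on continuation lines, and
# the ``_co(REPLACE)`` form, would need a real parser):
def to_asyncio(line):
    code, _, marker = line.partition('# asyncio:')
    marker = marker.strip()
    indent = code[:len(code) - len(code.lstrip())]
    if marker.startswith('async'):
        return indent + 'async ' + code.strip()
    if marker.startswith('await'):
        return indent + 'await ' + code.strip()
    return line.rstrip()
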
def filter_requests(requests):
    # drop falsy entries (None, empty strings, ...) and keep real requests
    return filter(lambda x: True if x else False, iterator(requests))

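# Expected behavior of ``filter_requests`` (illustrative): falsy entries
# such as None or '' are dropped, real requests pass through.
assert list(
    filter_requests(['http://a', None, '', 'http://b'])
) == ['http://a', 'http://b']
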
def get(self, requests):
    responses = []
    for request in iterator(requests):
        response = self._fetch(request)
        responses.append(response)
    return responses

def get(self, requests):
    for request in iterator(requests):
        yield self._fetch(request)

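# The two ``get`` variants above differ only in evaluation strategy: the
# first materializes every response eagerly into a list, while the
# generator form fetches lazily, one request at a time, as the caller
# advances the iterator - useful when the consumer may stop early.
# Illustrative consumption (``downloader`` and ``handle`` are
# hypothetical placeholders):
def consume(downloader, requests, handle):
    for response in downloader.get(requests):
        handle(response)  # each response is fetched on demand
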
def get(self, requests):
    for request in iterator(requests):
        yield self.worker.get_one(request)