def get_next_requests(self, max_next_requests, partition_id, **kwargs):
    """Fetch up to *max_next_requests* requests for *partition_id* from the HCF server.

    Batches are read repeatedly until enough requests have been collected
    or the server returns a short (final) batch.  All consumed batch ids
    are deleted from the server before returning.

    :param max_next_requests: maximum number of requests to return.
    :param partition_id: HCF partition (slot) to read from.
    :return: list of :class:`Request` objects (possibly empty).
    """
    return_requests = []
    data = True
    while data and len(return_requests) < max_next_requests:
        data = False
        consumed = []
        for batch in self.hcf.read(partition_id, max_next_requests):
            batch_id = batch['id']
            requests = batch['requests']
            # A full batch suggests the server may hold more data, so the
            # outer loop should poll again.
            data = len(requests) == max_next_requests
            # Lazy %-args: the message is only formatted if DEBUG is enabled.
            self.logger.debug("got batch %s of size %d from HCF server",
                              batch_id, len(requests))
            for fingerprint, qdata in requests:
                decoded = _convert_from_saved_type(qdata)
                # Fall back to the fingerprint when no URL was saved.
                request = Request(decoded.get('url', fingerprint),
                                  **decoded['request'])
                # Removed dead `if request is not None` guard: the
                # constructor call above can never produce None.
                request.meta.update({
                    'created_at': datetime.utcnow(),
                    'depth': 0,
                })
                # NOTE(review): bytes key here vs. str keys above — looks
                # intentional (scrapy meta is bytes-keyed), but confirm.
                request.meta.setdefault(b'scrapy_meta', {})
                return_requests.append(request)
            consumed.append(batch_id)
        if consumed:
            # Acknowledge processed batches so HCF does not re-deliver them.
            self.hcf.delete(partition_id, consumed)
    return return_requests
def get_next_requests(self, max_n_requests, partition_id, **kwargs):
    """Return up to *max_n_requests* requests that are due for revisiting.

    Queries the revisiting queue for rows in *partition_id* whose
    ``crawl_at`` time has passed, converts each row into a
    :class:`Request`, deletes the row, and commits.  On any error the
    transaction is rolled back and whatever was collected so far is
    returned.

    Bug fixes vs. original:
    - the original never returned ``results`` (items were deleted from
      the queue and then silently lost);
    - ``except Exception, exc`` (Python 2-only syntax) replaced with
      ``except Exception as exc``.

    :param max_n_requests: maximum number of requests to return.
    :param partition_id: queue partition to read from.
    :return: list of :class:`Request` objects (possibly empty).
    """
    results = []
    try:
        query = self.session.query(self.queue_model).\
            filter(RevisitingQueueModel.crawl_at <= datetime.utcnow(),
                   RevisitingQueueModel.partition_id == partition_id).\
            limit(max_n_requests)
        for item in query:
            # Empty/None method defaults to GET.
            method = 'GET' if not item.method else item.method
            results.append(Request(item.url, method=method, meta=item.meta,
                                   headers=item.headers, cookies=item.cookies))
            # Remove the row so the same request is not handed out twice.
            self.session.delete(item)
        self.session.commit()
    except Exception as exc:
        # Broad catch is deliberate: the scheduler must not crash on a
        # transient DB failure; log and roll back instead.
        self.logger.exception(exc)
        self.session.rollback()
    return results
# Example/demo script: run a memory-FIFO frontier against a fake crawl
# graph stored in SQLite.  TEST_MODE and the LOGGING_* flags enable
# verbose manager/backend/debug logging for demonstration purposes.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = True
SETTINGS.TEST_MODE = True

if __name__ == '__main__':
    # Create graph: fake site graph backed by a local SQLite database.
    graph = graphs.Manager('sqlite:///data/graph.db')
    # Create frontier from settings
    frontier = FrontierManager.from_settings(SETTINGS)
    # Add seeds: one Request per seed URL stored in the graph.
    frontier.add_seeds([Request(seed.url) for seed in graph.seeds])
    # Get next requests
    next_requests = frontier.get_next_requests()
    # Crawl pages
    for request in next_requests:
        # Fake page crawling: look the page up in the graph instead of
        # performing a real download.
        crawled_page = graph.get_page(request.url)
        # Create response
        response = Response(url=request.url, status_code=crawled_page.status,
                            request=request)
        # Create page links