def get_next_requests(self, max_next_requests, partition_id, **kwargs):
     """Read request batches for one partition from HCF and build requests.

     Keeps calling ``self.hcf.read`` while the last batch seen in a pass
     held exactly ``max_next_requests`` entries and fewer than
     ``max_next_requests`` requests have been gathered. Every batch that
     was read in a pass is deleted through ``self.hcf.delete`` afterwards.

     :param max_next_requests: limit handed to ``self.hcf.read`` and
         threshold that stops further read passes.
     :param partition_id: partition handed to ``self.hcf.read`` and
         ``self.hcf.delete``.
     :param kwargs: accepted but not used here.
     :return: list of the ``Request`` objects that were built.
     """
     collected = []
     keep_reading = True
     while keep_reading and len(collected) < max_next_requests:
         # Only a completely full batch (checked below) triggers another pass.
         keep_reading = False
         finished_ids = []
         for hcf_batch in self.hcf.read(partition_id, max_next_requests):
             current_id, entries = hcf_batch['id'], hcf_batch['requests']
             keep_reading = (len(entries) == max_next_requests)
             message = ("got batch %s of size %d from HCF server" %
                        (current_id, len(entries)))
             self.logger.debug(message)
             for fp, saved in entries:
                 payload = _convert_from_saved_type(saved)
                 # The fingerprint stands in when no 'url' was stored.
                 target = payload.get('url', fp)
                 req = Request(target, **payload['request'])
                 if req is None:
                     continue
                 fresh_meta = {
                     'created_at': datetime.utcnow(),
                     'depth': 0,
                 }
                 req.meta.update(fresh_meta)
                 req.meta.setdefault(b'scrapy_meta', {})
                 collected.append(req)
             finished_ids.append(current_id)
         if finished_ids:
             # Remove the batches that were just read from the HCF queue.
             self.hcf.delete(partition_id, finished_ids)
     return collected
Beispiel #2
0
 def get_next_requests(self, max_n_requests, partition_id, **kwargs):
     """Dequeue up to ``max_n_requests`` due requests for one partition.

     Selects rows of ``self.queue_model`` whose ``crawl_at`` is not later
     than the current UTC time and whose ``partition_id`` matches, turns
     each row into a ``Request`` and deletes the row. The deletions are
     committed once all selected rows have been processed.

     :param max_n_requests: maximum number of rows to fetch (query limit).
     :param partition_id: partition whose queued rows are read.
     :param kwargs: accepted but not used here.
     :return: list of ``Request`` objects, empty when nothing is due. If an
         error occurs it is logged, the session is rolled back and whatever
         requests were built before the failure are returned.
     """
     results = []
     try:
         due_items = self.session.query(self.queue_model).\
             filter(RevisitingQueueModel.crawl_at <= datetime.utcnow(),
                    RevisitingQueueModel.partition_id == partition_id).\
             limit(max_n_requests)
         for item in due_items:
             # Rows stored without a method fall back to GET.
             method = item.method or 'GET'
             results.append(Request(item.url, method=method, meta=item.meta,
                                    headers=item.headers,
                                    cookies=item.cookies))
             self.session.delete(item)
         self.session.commit()
     except Exception as exc:
         # Best-effort: log and undo the pending deletes rather than raise.
         self.logger.exception(exc)
         self.session.rollback()
     # The original never returned the list, so callers always got None.
     return results
Beispiel #3
0
# Frontier configuration for this example script: start from a default
# Settings object and override individual options below.
SETTINGS = Settings()
# Dotted path of the backend class to use (the in-memory FIFO backend).
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
# Turn on the manager, backend and debugging logging options.
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = True
# NOTE(review): exact effect of TEST_MODE is not visible here — confirm
# against the Settings documentation.
SETTINGS.TEST_MODE = True

if __name__ == '__main__':
    # Create graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Create frontier from settings
    frontier = FrontierManager.from_settings(SETTINGS)

    # Add seeds
    frontier.add_seeds([Request(seed.url) for seed in graph.seeds])

    # Get next requests
    next_requests = frontier.get_next_requests()

    # Crawl pages
    for request in next_requests:

        # Fake page crawling
        crawled_page = graph.get_page(request.url)

        # Create response
        response = Response(url=request.url,
                            status_code=crawled_page.status,
                            request=request)
        # Create page links