Beispiel #1
0
 def emit(self,
          rule="pass",
          stage=None,
          data=None,
          delay=None,
          optional=False):
     """Invoke the next stage, either based on a handling rule, or by
     calling the `pass` rule by default.

     Args:
         rule: Key looked up in ``self.stage.handlers`` to find the next
             stage when ``stage`` is not given explicitly.
         stage: The stage to invoke. When ``None`` it is resolved from
             ``rule``.
         data: Payload handed to ``Queue.queue`` for the next stage.
             Defaults to a new empty dict on every call.
         delay: Value passed to ``self.sleep`` before queueing. Any falsy
             value (including ``0``) falls back to the ``"delay"`` entry
             of ``self.params`` and then to ``self.crawler.delay``.
         optional: When true, return silently (without logging) if no
             stage could be resolved.

     Returns:
         None. The method returns early, without queueing anything, when
         no valid next stage exists or when the task is dropped by debug
         sampling.
     """
     if data is None:
         # A `{}` default in the signature would be a single dict object
         # shared by every call that omits `data`; build a new one here.
         data = {}
     if stage is None:
         stage = self.stage.handlers.get(rule)
     if optional and stage is None:
         return
     if stage is None or stage not in self.crawler.stages:
         self.log.info("No next stage: %s (%s)" % (stage, rule))
         return
     if settings.DEBUG:
         # sampling rate is a float between 0.0 to 1.0. If it's 0.2, we
         # aim to execute only 20% of the crawler's tasks.
         sampling_rate = self.get("sampling_rate")
         if sampling_rate and random.random() > float(sampling_rate):
             self.log.info("Skipping emit due to sampling rate")
             return
     if is_sync_mode():
         # In sync mode we use a in-memory backend for the task queue.
         # Make a copy of the data to avoid mutation in that case.
         data = deepcopy(data)
     # Capture the state before `stage` is rebound from its name to the
     # object returned by the crawler.
     state = self.dump_state()
     stage = self.crawler.get(stage)
     delay = delay or self.params.get("delay", 0) or self.crawler.delay
     self.sleep(delay)
     Queue.queue(stage, state, data)
Beispiel #2
0
def run(crawler):
    """Run a specified crawler.

    The argument is resolved through `get_crawler` and the result's
    `run()` method is called. When `is_sync_mode()` reports true, the
    worker returned by `get_worker()` is then asked to `sync()`.
    """
    resolved = get_crawler(crawler)
    resolved.run()
    if not is_sync_mode():
        return
    # Sync mode only: hand over to the worker after the crawler has run.
    get_worker().sync()
Beispiel #3
0
def run(crawler):
    """Run a specified crawler.

    The argument is resolved through `get_crawler` and the result's
    `run()` method is called. When `is_sync_mode()` reports true,
    `TaskRunner.run_sync()` is invoked afterwards.
    """
    resolved = get_crawler(crawler)
    resolved.run()
    if not is_sync_mode():
        return
    # Sync mode only: trigger the task runner after the crawler has run.
    TaskRunner.run_sync()