def emit(self, rule="pass", stage=None, data=None, delay=None, optional=False):
    """Invoke the next stage, either based on a handling rule, or by calling
    the `pass` rule by default.

    Args:
        rule: Key used to look up the next stage in `self.stage.handlers`
            when `stage` is not given.
        stage: Name of the stage to invoke. When None, it is resolved
            from `rule`.
        data: Payload queued for the next stage. A new empty dict is used
            for each call when omitted.
        delay: Value handed to `self.sleep` before queueing. A falsy value
            falls back to the `delay` entry of `self.params`, then to
            `self.crawler.delay`.
        optional: When true, return without logging if no stage could be
            resolved.

    Returns:
        None. Nothing is queued when the stage cannot be resolved, is not
        one of `self.crawler.stages`, or the emit is dropped by sampling.
    """
    # A literal `{}` default would be one dict object shared by every call
    # that omits `data`; create a fresh one per call instead.
    if data is None:
        data = {}

    if stage is None:
        stage = self.stage.handlers.get(rule)
    if optional and stage is None:
        return
    if stage is None or stage not in self.crawler.stages:
        self.log.info("No next stage: %s (%s)" % (stage, rule))
        return

    if settings.DEBUG:
        # The sampling rate is a float between 0.0 and 1.0. If it is 0.2,
        # we aim to execute only 20% of the crawler's tasks.
        sampling_rate = self.get("sampling_rate")
        if sampling_rate and random.random() > float(sampling_rate):
            self.log.info("Skipping emit due to sampling rate")
            return

    if is_sync_mode():
        # In sync mode we use an in-memory backend for the task queue.
        # Make a copy of the data to avoid mutation in that case.
        data = deepcopy(data)

    state = self.dump_state()
    stage = self.crawler.get(stage)
    # Any falsy `delay` (including an explicit 0) falls through to the
    # configured values.
    delay = delay or self.params.get("delay", 0) or self.crawler.delay
    self.sleep(delay)
    Queue.queue(stage, state, data)
def run(crawler):
    """Run a specified crawler."""
    # Look the crawler up through get_crawler() and start it.
    target = get_crawler(crawler)
    target.run()

    # Nothing further to do unless sync mode is active.
    if not is_sync_mode():
        return

    # In sync mode, fetch a worker and call its sync() once the crawler's
    # run() has returned.
    get_worker().sync()
def run(crawler):
    """Run a specified crawler."""
    # Look the crawler up through get_crawler() and start it.
    selected = get_crawler(crawler)
    selected.run()

    # Nothing further to do unless sync mode is active.
    if not is_sync_mode():
        return

    # In sync mode, call TaskRunner.run_sync() once the crawler's run()
    # has returned.
    TaskRunner.run_sync()