Beispiel #1
0
 def emit(self,
          rule="pass",
          stage=None,
          data=None,
          delay=None,
          optional=False):
     """Invoke the next stage, either based on a handling rule, or by
     calling the `pass` rule by default.

     :param rule: key looked up in ``self.stage.handlers`` when no explicit
         ``stage`` is given.
     :param stage: name of the stage to queue; it has to be listed in
         ``self.crawler.stages``, otherwise nothing is queued.
     :param data: payload handed to ``Queue.queue``. ``None`` (the default)
         means a fresh empty dict, so calls never share one default object.
     :param delay: value passed to ``self.sleep`` before queueing. A falsy
         value falls back to the ``delay`` param and then to
         ``self.crawler.delay``.
     :param optional: if true, return silently (no log message) when no
         stage was given and the rule has no handler.
     """
     if stage is None:
         stage = self.stage.handlers.get(rule)
     if optional and stage is None:
         return
     if stage is None or stage not in self.crawler.stages:
         self.log.info("No next stage: %s (%s)" % (stage, rule))
         return
     if settings.DEBUG:
         # sampling rate is a float between 0.0 to 1.0. If it's 0.2, we
         # aim to execute only 20% of the crawler's tasks.
         sampling_rate = self.get("sampling_rate")
         if sampling_rate and random.random() > float(sampling_rate):
             self.log.info("Skipping emit due to sampling rate")
             return
     # The default is created per call instead of in the signature: a `{}`
     # default is a single dict shared by every call that omits `data`.
     if data is None:
         data = {}
     if is_sync_mode():
         # In sync mode we use a in-memory backend for the task queue.
         # Make a copy of the data to avoid mutation in that case.
         data = deepcopy(data)
     state = self.dump_state()
     stage = self.crawler.get(stage)
     delay = delay or self.params.get("delay", 0) or self.crawler.delay
     self.sleep(delay)
     Queue.queue(stage, state, data)
Beispiel #2
0
 def emit(self, rule='pass', stage=None, data=None, delay=None):
     """Invoke the next stage, either based on a handling rule, or by calling
     the `pass` rule by default.

     :param rule: key looked up in ``self.stage.handlers`` when no explicit
         ``stage`` is given.
     :param stage: name of the stage to queue; it has to be listed in
         ``self.crawler.stages``, otherwise nothing is queued.
     :param data: payload handed to ``Queue.queue``. ``None`` (the default)
         means a fresh empty dict, so calls never share one default object.
     :param delay: delay handed to ``Queue.queue``; a falsy value falls back
         to ``self.crawler.delay``.
     """
     if stage is None:
         stage = self.stage.handlers.get(rule)
     if stage is None or stage not in self.crawler.stages:
         self.log.info("No next stage: %s (%s)" % (stage, rule))
         return
     # The default is created per call instead of in the signature: a `{}`
     # default is a single dict shared by every call that omits `data`.
     if data is None:
         data = {}
     state = self.dump_state()
     delay = delay or self.crawler.delay
     Queue.queue(stage, state, data, delay)
Beispiel #3
0
    def run(self, incremental=None, run_id=None):
        """Queue a new run of this crawler, starting at its initial stage.

        :param incremental: overrides ``settings.INCREMENTAL`` in the run
            state unless it is ``None``.
        :param run_id: identifier stored in the run state; a falsy value is
            replaced with ``Job.random_id()``.
        """
        crawler_name = self.name
        job_id = run_id or Job.random_id()
        incremental_flag = settings.INCREMENTAL
        continue_flag = settings.CONTINUE_ON_ERROR
        if incremental is not None:
            incremental_flag = incremental
        run_state = {
            "crawler": crawler_name,
            "run_id": job_id,
            "incremental": incremental_flag,
            "continue_on_error": continue_flag,
        }

        # Previous runs are cancelled before the new one is queued.
        self.cancel()
        first_stage = self.get(self.init_stage)
        # The initial stage starts with an empty data payload.
        Queue.queue(first_stage, run_state, {})
Beispiel #4
0
    def run(self, incremental=None, run_id=None):
        """Queue a new run of this crawler, starting at its initial stage.

        :param incremental: overrides ``settings.INCREMENTAL`` in the run
            state unless it is ``None``.
        :param run_id: identifier stored in the run state; a falsy value is
            replaced with ``Job.random_id()``.
        """
        crawler_name = self.name
        job_id = run_id or Job.random_id()
        incremental_flag = settings.INCREMENTAL
        if incremental is not None:
            incremental_flag = incremental
        run_state = {
            'crawler': crawler_name,
            'run_id': job_id,
            'incremental': incremental_flag
        }

        # Previous runs are cancelled before the new one is queued.
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        # The initial stage starts with an empty data payload.
        Queue.queue(self.init_stage, run_state, {})
Beispiel #5
0
    def run(self, incremental=None, run_id=None):
        """Queue a new run of this crawler, starting at its initial stage.

        When ``settings.REDIS_HOST`` is falsy, ``TaskRunner.run()`` is called
        right after queueing.

        :param incremental: overrides ``settings.INCREMENTAL`` in the run
            state unless it is ``None``.
        :param run_id: identifier stored in the run state as given.
        """
        crawler_name = self.name
        incremental_flag = settings.INCREMENTAL
        if incremental is not None:
            incremental_flag = incremental
        run_state = {
            'crawler': crawler_name,
            'run_id': run_id,
            'incremental': incremental_flag
        }

        # The initial stage starts with an empty data payload.
        Queue.queue(self.init_stage, run_state, {})

        if settings.REDIS_HOST:
            return
        TaskRunner.run()
Beispiel #6
0
    def run(self, incremental=None, run_id=None):
        """Queue a new run of this crawler, starting at its initial stage.

        :param incremental: overrides ``settings.INCREMENTAL`` in the run
            state unless it is ``None``.
        :param run_id: identifier stored in the run state; a falsy value is
            replaced with ``Job.random_id()``.
        """
        crawler_name = self.name
        job_id = run_id or Job.random_id()
        incremental_flag = settings.INCREMENTAL
        if incremental is not None:
            incremental_flag = incremental
        run_state = {
            "crawler": crawler_name,
            "run_id": job_id,
            "incremental": incremental_flag,
        }

        # Previous runs are cancelled before the new one is queued.
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        first_stage = self.get(self.init_stage)
        # The initial stage starts with an empty data payload.
        Queue.queue(first_stage, run_state, {})
Beispiel #7
0
    def run(self, incremental=None, run_id=None):
        """Queue a new run of this crawler, starting at its initial stage.

        When ``settings.REDIS_HOST`` is falsy, ``TaskRunner.run()`` is called
        right after queueing.

        :param incremental: overrides ``settings.INCREMENTAL`` in the run
            state unless it is ``None``.
        :param run_id: identifier stored in the run state as given.
        """
        crawler_name = self.name
        incremental_flag = settings.INCREMENTAL
        if incremental is not None:
            incremental_flag = incremental
        run_state = {
            'crawler': crawler_name,
            'run_id': run_id,
            'incremental': incremental_flag
        }

        # Previous runs are cancelled before the new one is queued.
        self.cancel()
        # Flush out previous events:
        Event.delete(self)
        # The initial stage starts with an empty data payload.
        Queue.queue(self.init_stage, run_state, {})

        if settings.REDIS_HOST:
            return
        TaskRunner.run()
Beispiel #8
0
 def emit(self, rule='pass', stage=None, data=None, delay=None,
          optional=False):
     """Invoke the next stage, either based on a handling rule, or by
     calling the `pass` rule by default.

     :param rule: key looked up in ``self.stage.handlers`` when no explicit
         ``stage`` is given.
     :param stage: name of the stage to queue; it has to be listed in
         ``self.crawler.stages``, otherwise nothing is queued.
     :param data: payload handed to ``Queue.queue``. ``None`` (the default)
         means a fresh empty dict, so calls never share one default object.
     :param delay: value passed to ``self.sleep`` before queueing. A falsy
         value falls back to the ``delay`` param and then to
         ``self.crawler.delay``.
     :param optional: if true, return silently (no log message) when no
         stage was given and the rule has no handler.
     """
     if stage is None:
         stage = self.stage.handlers.get(rule)
     if optional and stage is None:
         return
     if stage is None or stage not in self.crawler.stages:
         self.log.info("No next stage: %s (%s)" % (stage, rule))
         return
     if settings.DEBUG:
         # sampling rate is a float between 0.0 to 1.0. If it's 0.2, we
         # aim to execute only 20% of the crawler's tasks.
         sampling_rate = self.get('sampling_rate')
         if sampling_rate and random.random() > float(sampling_rate):
             self.log.info("Skipping emit due to sampling rate")
             return
     # The default is created per call instead of in the signature: a `{}`
     # default is a single dict shared by every call that omits `data`.
     if data is None:
         data = {}
     state = self.dump_state()
     stage = self.crawler.get(stage)
     delay = delay or self.params.get('delay', 0) or self.crawler.delay
     self.sleep(delay)
     Queue.queue(stage, state, data)