def execute(cls, stage, state, data, next_allowed_exec_time=None):
    """Execute the operation, rate limiting allowing.

    The task is handled in one of four ways:

    * If ``next_allowed_exec_time`` is set and still in the future, the
      task is put back in the queue instead of being run.
    * If the crawler is disabled, nothing is executed.
    * If the stage defines a rate limit, the task runs inside
      ``rate_limiter``; on ``RateLimitException`` it is re-queued with a
      random delay of at least one second.
    * Otherwise the task is executed directly.

    Any exception raised along the way is logged and not propagated.
    Afterwards the crawler's pending-task counter is decremented and, if
    the crawler is no longer running, its ``aggregate`` hook is called.

    :param stage: stage identifier passed to ``Context.from_state``.
    :param state: serialized context state passed to
        ``Context.from_state``.
    :param data: payload handed to ``context.execute``.
    :param next_allowed_exec_time: optional point in time (compared
        against ``datetime.utcnow()``) before which the task must not run.
    """
    # Bind the name before the ``try`` block: if ``Context.from_state``
    # fails, the ``finally`` clause must be able to tell that there is no
    # context to clean up instead of failing on an unbound local.
    context = None
    try:
        context = Context.from_state(state, stage)
        now = datetime.utcnow()
        if next_allowed_exec_time and now < next_allowed_exec_time:
            # task not allowed to run yet; put it back in the queue
            Queue.queue(stage, state, data, delay=next_allowed_exec_time)
        elif context.crawler.disabled:
            pass
        elif context.stage.rate_limit:
            try:
                with rate_limiter(context):
                    context.execute(data)
            except RateLimitException:
                # Upper bound for the back-off: the inverse of the rate
                # limit, but never less than one second.
                delay = max(1, 1.0 / context.stage.rate_limit)
                delay = random.randint(1, int(delay))
                context.log.info("Rate limit exceeded, delaying %d sec.",
                                 delay)
                Queue.queue(stage, state, data, delay=delay)
        else:
            context.execute(data)
    except Exception:
        log.exception("Task failed to execute:")
    finally:
        if context is not None:
            # Decrease the pending task count after executing a task.
            Queue.decr_pending(context.crawler)
            # If we don't have any more tasks to execute, time to clean up.
            if not context.crawler.is_running:
                context.crawler.aggregate(context)
Exemple #2
0
    def execute(cls, stage, state, data, next_allowed_exec_time=None):
        """Execute the operation, rate limiting allowing.

        The task is handled in one of four ways:

        * If ``next_allowed_exec_time`` is set and still in the future,
          the task is put back in the queue and nothing is executed.
        * If the crawler is disabled, nothing is executed.
        * If the stage defines no rate limit, the task is executed
          directly.
        * Otherwise the task runs inside ``rate_limiter``; on
          ``RateLimitException`` it is re-queued with a random delay of at
          least one second and is NOT executed in this call.

        :param stage: stage identifier passed to ``Context.from_state``.
        :param state: serialized context state passed to
            ``Context.from_state``.
        :param data: payload handed to ``context.execute``.
        :param next_allowed_exec_time: optional point in time (compared
            against ``datetime.utcnow()``) before which the task must not
            run.
        """
        now = datetime.utcnow()
        if next_allowed_exec_time and now < next_allowed_exec_time:
            # task not allowed to run yet; put it back in the queue
            Queue.queue(stage, state, data, delay=next_allowed_exec_time)
            return

        context = Context.from_state(state, stage)
        if context.crawler.disabled:
            return

        if not context.stage.rate_limit:
            context.execute(data)
            return

        try:
            with rate_limiter(context):
                context.execute(data)
        except RateLimitException:
            # Upper bound for the back-off: the inverse of the rate limit,
            # but never less than one second.
            delay = max(1, 1.0 / context.stage.rate_limit)
            delay = random.randint(1, int(delay))
            context.log.info("Rate limit exceeded, delaying %d sec.",
                             delay)
            # The task has been handed back to the queue; it must not also
            # be executed now, so there is deliberately no fall-through to
            # another ``context.execute`` call.
            Queue.queue(stage, state, data, delay=delay)
Exemple #3
0
 def after_task(self, task):
     """Post-task hook: aggregate once the job is done, then check timeouts.

     When ``task.job.is_done()`` is true, a ``Context`` is rebuilt from the
     task's context and its stage name (with the namespace detached), and
     the crawler's ``aggregate`` hook is called with it. The timeout
     expiration check runs on every call.
     """
     job_finished = task.job.is_done()
     if job_finished:
         stage_name = CrawlerStage.detach_namespace(task.stage.stage)
         ctx = Context.from_state(task.context, stage_name)
         ctx.crawler.aggregate(ctx)
     self.timeout_expiration_check()
Exemple #4
0
 def handle(self, task):
     """Execute the crawler stage that ``task`` refers to.

     Applies the task context, rebuilds a ``Context`` from the task's
     context and its stage name (with the namespace detached), and
     executes it with the task payload.
     """
     apply_task_context(task)
     payload = task.payload
     stage_name = CrawlerStage.detach_namespace(task.stage.stage)
     ctx = Context.from_state(task.context, stage_name)
     ctx.execute(payload)
Exemple #5
0
 def handle(self, task):
     """Execute the crawler stage that ``task`` refers to.

     Rebuilds a ``Context`` from the task's context and stage, then
     executes it with the task payload unless the crawler is disabled.
     """
     payload = task.payload
     stage_name = task.stage.stage
     ctx = Context.from_state(task.context, stage_name)
     if not ctx.crawler.disabled:
         ctx.execute(payload)
 def test_dump_load_state(self, context, crawler, stage):
     """A context restored from its dumped state matches the original."""
     restored = Context.from_state(context.dump_state(), stage.name)
     assert isinstance(restored, Context)
     assert restored.run_id == context.run_id
     assert restored.crawler.name == crawler.name
     assert restored.stage.name == stage.name
     # Every key/value pair of the original state must survive the
     # round trip.
     for key, value in context.state.items():
         assert (key, value) in restored.state.items()
Exemple #7
0
def context():
    """Build a ``Context`` for tests with a fake operation id."""
    instance = Context(crawler(), stage(), {"foo": "bar"})
    # A fake operation id keeps the DB from complaining about a NOT NULL
    # constraint while saving events etc.
    instance.operation_id = randint(1, 99999)
    return instance
Exemple #8
0
def get_context():
    """Build a ``Context`` for tests with a freshly generated run id."""
    instance = Context(get_crawler(), get_stage(), {"foo": "bar"})
    # The run id is a random UUID4 rendered as a string.
    instance.run_id = str(uuid.uuid4())
    return instance
Exemple #9
0
 def after_task(self, task):
     """Post-task hook: aggregate crawler results once the job is done.

     Does nothing while ``task.job.is_done()`` is false. Otherwise a
     ``Context`` is rebuilt from the task's context and stage, and the
     crawler's ``aggregate`` hook is called with it.
     """
     if not task.job.is_done():
         return
     stage_name = task.stage.stage
     ctx = Context.from_state(task.context, stage_name)
     ctx.crawler.aggregate(ctx)
Exemple #10
0
 def handle(self, task):
     """Execute the crawler stage that ``task`` refers to.

     Rebuilds a ``Context`` from the task's context and stage and
     executes it with the task payload.
     """
     payload = task.payload
     stage_name = task.stage.stage
     ctx = Context.from_state(task.context, stage_name)
     ctx.execute(payload)