def _enqueue_message(redis, logger, msg, message_key, message_hash_key, queue_name): timestamp = to_unix(datetime.utcnow()) pipe = redis.pipeline() pipe.set(message_key, msg.serialize()) pipe.zadd(message_hash_key, {msg.id: timestamp}) pipe.lpush(queue_name, msg.id) if msg.cron_str is not None: next_dt = get_next_cron_timestamp(msg.cron_str) timestamp = to_unix(next_dt) _enqueue_at_timestamp( pipe, logger, msg.queue, Queue.get_message_name, timestamp, msg.category, msg.cron_str, *msg.args, **msg.kwargs, ) return pipe.execute()
def enqueue_in(self, incr, category, *args, **kw): start_in = parse_time(incr) future_date = datetime.now(tz=timezone.utc) + start_in timestamp = to_unix(future_date) return self.__enqueue_at_timestamp(timestamp, category, None, *args, **kw)
def test_move_jobs(client): """Test moving jobs from scheduled to enqueued""" redis = client.application.redis jobs_queue = client.application.jobs_queue enqueued_ids = [] for offset in range(-5, 6): if offset == 0: continue timestamp = to_unix(datetime.utcnow()) + offset * 2 enqueued_ids.append( jobs_queue.enqueue_at( timestamp, Categories.Notify, f"something {offset}", other=f"öther {offset}", )) expect(redis.exists(Queue.get_message_name( enqueued_ids[-1]))).to_be_true() expect(redis.zcard(Queue.SCHEDULED_QUEUE_NAME)).to_equal(10) moved = client.application.queue_group.move_jobs() expect(moved).to_length(5) expect(redis.llen(jobs_queue.queue_name)).to_equal(5) expect(redis.zcard(Queue.SCHEDULED_QUEUE_NAME)).to_equal(5) for enqueued_id in enqueued_ids: expect(redis.exists(Queue.get_message_name(enqueued_id))).to_be_true()
def validate_expiration(job, ex, logger): now = datetime.utcnow() unixtime = to_unix(now) if ( job.metadata.get("expiration") is not None and job.metadata["expiration"] < unixtime ): expiration_utc = from_unix(job.metadata["expiration"]) ex.status = JobExecution.Status.expired ex.error = ( f"Job was supposed to be done before {expiration_utc.isoformat()}, " f"but was started at {from_unix(unixtime).isoformat()}." ) ex.finished_at = now job.save() logger.info( "Job execution canceled due to being expired.", job_expiration=job.metadata["expiration"], current_ts=unixtime, ) return False return True
def reenqueue_job_due_to_break(task_id, job_id, execution_id, image, command): args = [task_id, job_id, execution_id, image, command] delta = timedelta(seconds=1.0) future_date = to_unix(datetime.utcnow() + delta) enqueued = current_app.jobs_queue.enqueue_at(future_date, Categories.Job, *args) return enqueued
def enqueue_message(self, message): timestamp = to_unix(datetime.utcnow()) pipe = self.redis.pipeline() pipe.set(message.message_key(), message.serialize()) pipe.zadd(message.message_hash_key(), {message.id: timestamp}) pipe.lpush(message.queue, message.id) if message.cron_str: next_dt = get_next_cron_timestamp(message.cron_str) timestamp = to_unix(next_dt) msg_schedule = Message(message.queue, message.category, message.cron_str, *message.args, **message.kwargs) self._pipe_at_timestamp(pipe, msg_schedule, timestamp) return pipe.execute()
def move_jobs(self): logger = self.logger.bind(operation="move_jobs") lock = self.redis.lock( Queue.MOVE_JOBS_LOCK_NAME, timeout=5, sleep=0.1, blocking_timeout=500, thread_local=False, ) if not lock.acquire(): logger.info( "Lock could not be acquired. Trying to move jobs again later.") return None moved_items = [] try: timestamp = to_unix(datetime.utcnow()) pipe = self.redis.pipeline() pipe.zrangebyscore(Queue.SCHEDULED_QUEUE_NAME, "-inf", timestamp) pipe.zremrangebyscore(Queue.SCHEDULED_QUEUE_NAME, "-inf", timestamp) items, _ = pipe.execute() if not items: logger.info("No jobs in need of moving right now.") return moved_items logger.info("Found jobs in need of moving.", job_count=len(items)) for message_id in items: key = Queue.get_message_name(message_id.decode("utf-8")) data = self.redis.get(key) msg = Message.deserialize(data.decode("utf-8")) logger.info( "Moving job to queue.", queue_name=msg.queue, job_id=msg.id, msg=msg.serialize(), ) queue_hash_name = f"{Queue.QUEUE_HASH_NAME}:{msg.queue}" _enqueue_message(self.redis, self.logger, msg, key, queue_hash_name, msg.queue) moved_items.append(msg) logger.info("Moved jobs successfully.") return moved_items finally: lock.release()
def test_schedule3(client): """Test Scheduling a job with cron""" queue = client.application.jobs_queue cron_str = "* * * * *" timestamp = to_unix(get_next_cron_timestamp(cron_str)) enqueued_id = queue.enqueue_cron(cron_str, Categories.Monitor, "something", other="öther") redis = client.application.redis expect(redis.llen(queue.queue_name)).to_equal(0) expect(redis.zcard(Queue.SCHEDULED_QUEUE_NAME)).to_equal(1) items = redis.zrange(Queue.SCHEDULED_QUEUE_NAME, 0, -1, withscores=True) expect(items).to_length(1) message_id, actual_ts = items[0] expect(actual_ts).to_equal(timestamp) expect(message_id).to_equal(enqueued_id) message_key = queue.get_message_name(enqueued_id) expect(redis.exists(message_key)).to_be_true() data = redis.get(message_key) job = Message.deserialize(data) job = Message.deserialize(data) expect(job.id).to_equal(enqueued_id) expect(job.category).to_equal(Categories.Monitor) expect(job.args).to_length(1) expect(job.args[0]).to_equal("something") expect(job.kwargs).to_include("other") expect(job.kwargs["other"]).to_equal("öther")
def test_schedule2(client): """Test Scheduling a job in a few seconds""" queue = client.application.jobs_queue timestamp = to_unix(datetime.utcnow()) + 5 enqueued_id = queue.enqueue_in("5s", Categories.Monitor, "something", other="öther") redis = client.application.redis expect(redis.llen(queue.queue_name)).to_equal(0) expect(redis.zcard(Queue.SCHEDULED_QUEUE_NAME)).to_equal(1) items = redis.zrange(Queue.SCHEDULED_QUEUE_NAME, 0, -1, withscores=True) expect(items).to_length(1) message_id, actual_ts = items[0] expect(actual_ts).to_equal(timestamp) expect(message_id).to_equal(enqueued_id) message_key = Message.generate_message_key(enqueued_id) expect(redis.exists(message_key)).to_be_true() data = redis.get(message_key) job = Message.deserialize(data) expect(job.id).to_equal(enqueued_id) expect(job.category).to_equal(Categories.Monitor) expect(job.args).to_length(1) expect(job.args[0]).to_equal("something") expect(job.kwargs).to_include("other") expect(job.kwargs["other"]).to_equal("öther")
def enqueue_cron(self, cron, category, *args, **kw): next_dt = get_next_cron_timestamp(cron) timestamp = to_unix(next_dt) return self.__enqueue_at_timestamp(timestamp, category, cron, *args, **kw)
def send_webhook( task_id, job_id, execution_id, method, url, headers, retries, retry_count ): app = current_app job = Job.get_by_id(task_id, job_id) logger = app.logger.bind( operation="send_webhook", task_id=task_id, job_id=job_id, execution_id=execution_id, method=method, url=url, headers=headers, retries=retries, retry_count=retry_count, ) if job is None: logger.error("Failed to retrieve task or job.") return False execution = job.get_execution_by_id(execution_id) logger.info("Execution loaded successfully") data = execution.to_dict(include_log=True, include_error=True) data = loads(dumps(data)) if "webhookDispatch" in data["metadata"]: del data["metadata"]["webhookDispatch"] data["metadata"]["custom"] = job.metadata.get("custom", {}) data["job_id"] = job_id data = dumps(data) try: dispatcher = WebhooksDispatcher() response = dispatcher.dispatch(method, url, data, headers) execution.metadata.setdefault("webhookDispatch", []) execution.metadata["webhookDispatch"].append( { "timestamp": datetime.utcnow().isoformat(), "url": url, "statusCode": response.status_code, "body": response.body, "headers": response.headers, } ) execution.save() job.save() logger.info("Webhook dispatched successfully.") except WebhooksDispatchError as err: error = traceback.format_exc() execution.metadata.setdefault("webhookDispatch", []) execution.metadata["webhookDispatch"].append( { "timestamp": datetime.utcnow().isoformat(), "url": url, "statusCode": err.status_code, "body": err.body, "headers": err.headers, "error": error, } ) execution.metadata["webhookDispatch"] = execution.metadata["webhookDispatch"][ -3: ] execution.save() job.save() logger.error("Failed to dispatch webhook.", err=error) if retry_count < retries: logger.debug("Retrying...") args = [ task_id, job_id, execution_id, method, url, headers, retries, retry_count + 1, ] factor = app.config["WEBHOOKS_EXPONENTIAL_BACKOFF_FACTOR"] min_backoff = app.config["WEBHOOKS_EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0 delta = to_unix( datetime.utcnow() + timedelta(seconds=math.pow(factor, retry_count) * min_backoff) ) current_app.webhooks_queue.enqueue_at(delta, Categories.Webhook, *args) logger.info("Webhook dispatch retry scheduled.", date=delta) return True
def monitor_job(task_id, job_id, execution_id): try: app = current_app executor = app.executor job = Job.get_by_id(task_id, job_id) logger = app.logger.bind( operation="monitor_job", task_id=task_id, job_id=job_id, execution_id=execution_id, ) if job is None: logger.error("Failed to retrieve task or job.") return False execution = job.get_execution_by_id(execution_id) if execution.status not in (JobExecution.Status.running,): logger.error("Execution result already retrieved. Skipping monitoring...") return False try: result = executor.get_result(job.task, job, execution) except HostUnavailableError as err: error = traceback.format_exc() logger.error("Failed to get results.", error=error) current_app.report_error( err, metadata=dict( operation="Monitoring Job", task_id=task_id, job_id=job_id, execution_id=execution_id, ), ) reenqueue_monitor_due_to_break(task_id, job_id, execution_id) logger.warn("Job monitor re-enqueued successfully.") return False if result is None: execution.finished_at = datetime.utcnow() execution.exit_code = result.exit_code execution.status = JobExecution.Status.failed execution.log = "" execution.error = ( "Job failed since container could not be found in docker host." ) logger.debug( "Job failed, since container could not be found in host.", status="failed", ) execution.save() job.save() send_webhooks(job.task, job, execution, logger) notify_users(job.task, job, execution, logger) return False logger.info( "Container result obtained.", container_status=result.status, container_exit_code=result.exit_code, ) if result.status in ( ExecutionResult.Status.created, ExecutionResult.Status.running, ): ellapsed = (datetime.utcnow() - execution.started_at).total_seconds() if ellapsed > job.metadata["timeout"]: execution.finished_at = datetime.utcnow() execution.status = JobExecution.Status.timedout execution.error = "Job execution timed out after %d seconds." % ellapsed try: executor.stop_job(job.task, job, execution) except HostUnavailableError as err: error = traceback.format_exc() logger.error("Failed to stop job.", error=error) current_app.report_error( err, metadata=dict( operation="Monitoring Job", task_id=task_id, job_id=job_id, execution_id=execution_id, ), ) reenqueue_monitor_due_to_break(task_id, job_id, execution_id) logger.warn("Job monitor re-enqueued successfully.") return False logger.debug( "Job execution timed out. Storing job details in mongo db.", status=execution.status, ellapsed=ellapsed, error=result.error, ) execution.save() job.save() logger.info("Job execution timed out.", status=execution.status) send_webhooks(job.task, job, execution, logger) notify_users(job.task, job, execution, logger) return False logger.info( "Job has not finished. Retrying monitoring in the future.", container_status=result.status, seconds=1, ) current_app.monitor_queue.enqueue_in( "5s", Categories.Monitor, task_id, job_id, execution_id ) return True if ( result.exit_code != 0 and "retry_count" in job.metadata and job.metadata["retry_count"] < job.metadata["retries"] ): retry_logger = logger.bind( exit_code=result.exit_code, retry_count=job.metadata["retry_count"], retries=job.metadata["retries"], ) retry_logger.debug("Job failed. Enqueuing job retry...") job.metadata["retry_count"] += 1 new_exec = job.create_execution(execution.image, execution.command) new_exec.status = JobExecution.Status.enqueued args = [ task_id, job_id, new_exec.execution_id, execution.image, execution.command, ] factor = app.config["EXPONENTIAL_BACKOFF_FACTOR"] min_backoff = app.config["EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0 delta = timedelta(seconds=min_backoff) if job.metadata["retries"] > 0: delta = timedelta( seconds=math.pow(factor, job.metadata["retry_count"]) * min_backoff ) future_date = datetime.utcnow() + delta enqueued_id = current_app.jobs_queue.enqueue_at( to_unix(future_date), Categories.Job, *args ) job.metadata["enqueued_id"] = enqueued_id job.save() retry_logger.info("Job execution enqueued successfully.") # still need to finish current execution as the retry # will be a new execution execution.finished_at = datetime.utcnow() execution.exit_code = result.exit_code execution.status = ( JobExecution.Status.done if execution.exit_code == 0 else JobExecution.Status.failed ) execution.log = result.log.decode("utf-8") execution.error = result.error.decode("utf-8") logger.debug( "Job finished. Storing job details in mongo db.", status=execution.status, log=result.log, error=result.error, ) execution.save() job.save() logger.info("Job details stored in mongo db.", status=execution.status) try: executor.mark_as_done(job.task, job, execution) except HostUnavailableError: error = traceback.format_exc() logger.error("Failed to mark job as done.", error=error) reenqueue_monitor_due_to_break(task_id, job_id, execution_id) logger.warn("Job monitor re-enqueued successfully.") return False send_webhooks(job.task, job, execution, logger) notify_users(job.task, job, execution, logger) return True except Exception as err: error = traceback.format_exc() logger.error("Failed to monitor job", error=error) current_app.report_error( err, metadata=dict( operation="Monitoring Job", task_id=task_id, job_id=job_id, execution_id=execution_id, ), ) raise err