def test_job_get_by_job_id(client):  # pylint: disable=unused-argument
    """A job can be fetched by (task_id, job_id); unknown ids yield None."""
    task_id = str(uuid4())
    parent_task = Task.create_task(task_id)
    created = parent_task.create_job()

    # Known ids resolve to the job that was just created.
    found = Job.get_by_id(task_id, created.job_id)
    expect(found).not_to_be_null()
    expect(found.job_id).to_equal(str(created.job_id))

    # Unknown ids resolve to nothing.
    missing = Job.get_by_id("invalid", "invalid")
    expect(missing).to_be_null()
def test_get_unfinished_executions(client):
    """Only pulling/running executions are reported as unfinished."""
    with client.application.app_context():
        app = client.application
        app.redis.flushall()

        all_statuses = (
            JobExecution.Status.enqueued,
            JobExecution.Status.pulling,
            JobExecution.Status.running,
            JobExecution.Status.done,
            JobExecution.Status.failed,
        )
        # One execution per status; only two of them count as unfinished.
        for current_status in all_statuses:
            _, job, execution = JobExecutionFixture.new_defaults()
            execution.status = current_status
            job.save()

        unfinished = Job.get_unfinished_executions()
        expect(unfinished).to_length(2)

        for _, found_execution in unfinished:
            expect(found_execution).to_be_instance_of(JobExecution)
            is_active = found_execution.status in [
                JobExecution.Status.pulling,
                JobExecution.Status.running,
            ]
            expect(is_active).to_be_true()
def process_job_execution_logs(websocket, task_id, job_id, execution_id, logger):
    """Stream logs for a job execution to *websocket* from a child process.

    When the execution id is None the last execution is streamed. Closes the
    websocket (asking the client to retry) when the job or execution cannot
    be found, and terminates the streaming process once the client is gone.
    """
    job = Job.get_by_id(task_id=task_id, job_id=job_id)
    if job is None:
        logger.error(f"Job ({job_id}) not found in task ({task_id}).")
        websocket.close()
        return

    if execution_id is None:
        execution = job.get_last_execution()
    else:
        execution = job.get_execution_by_id(execution_id)

    if execution is None:
        # BUG FIX: the message was missing the f-prefix, so the literal text
        # "({execution_id})" was logged instead of the actual id.
        logger.error(f"No executions found in job ({execution_id}).")
        websocket.close(message="wsretry")
        return

    executor = current_app.executor
    process = Process(target=stream_log, args=(executor, task_id, job, execution, websocket))
    process.start()

    # Keep the child alive while the client is connected, then reap it.
    while not websocket.closed:
        time.sleep(10)

    process.terminate()
def test_enqueue12_2(client):
    """Updating a scheduled job drops the old schedule and creates a new one."""
    with client.application.app_context():
        task_id = str(uuid4())
        job_id = str(uuid4())
        payload = {"image": "ubuntu", "command": "ls", "startIn": "6h"}

        first_response = client.put(
            f"/tasks/{task_id}/jobs/{job_id}/", data=dumps(payload), follow_redirects=True
        )
        expect(first_response.status_code).to_equal(200)

        job = Job.get_by_id(task_id, job_id)
        expect(job).not_to_be_null()

        first_enqueued_id = job["metadata"]["enqueued_id"]
        expect(first_enqueued_id).not_to_be_null()

        queue = client.application.jobs_queue
        expect(queue.is_scheduled(first_enqueued_id)).to_be_true()

        second_response = client.put(
            f"/tasks/{task_id}/jobs/{job_id}/", data=dumps(payload), follow_redirects=True
        )
        expect(second_response.status_code).to_equal(200)

        job.reload()
        # The update replaced the old schedule entry with a fresh one.
        expect(job.metadata["enqueued_id"]).not_to_equal(first_enqueued_id)
        expect(queue.is_scheduled(first_enqueued_id)).to_be_false()
        expect(queue.is_scheduled(job.metadata["enqueued_id"])).to_be_true()
def stop_job(task_id, job_id):
    """Stop the latest execution of a job, including its schedule.

    Returns a 404 error when the job does not exist, the stop-operation error
    response when stopping fails, and the job summary otherwise.
    """
    logger = g.logger.bind(operation="stop_job", task_id=task_id, job_id=job_id)

    logger.debug("Getting job...")
    job = Job.get_by_id(task_id=task_id, job_id=job_id)
    if job is None:
        return return_error("Job not found in task.", "stop_job", status=404, logger=logger)

    last_execution = job.get_last_execution()
    _, error_response = perform_stop_job_execution(
        job, execution=last_execution, logger=logger, stop_schedule=True
    )
    if error_response is not None:
        return error_response

    return get_job_summary(task_id, job_id)
def test_enqueue12(client):
    """Enqueuing a job with PUT honors the caller-provided job id."""
    with client.application.app_context():
        task_id = str(uuid4())
        job_id = str(uuid4())
        body = {"image": "ubuntu", "command": "ls"}

        response = client.put(
            f"/tasks/{task_id}/jobs/{job_id}/", data=dumps(body), follow_redirects=True
        )
        expect(response.status_code).to_equal(200)

        result = loads(response.data)
        expect(result).not_to_be_null()
        # The server must keep the id supplied in the URL.
        expect(result["jobId"]).to_equal(job_id)
        expect(result["queueJobId"]).not_to_be_null()
        expect(result["executionId"]).not_to_be_null()

        task = Task.get_by_task_id(result["taskId"])
        expect(task).not_to_be_null()
        expect(task.jobs).not_to_be_empty()

        job_ref = task.jobs[0]
        job = Job.objects(id=job_ref.id).first()
        expect(str(job.job_id)).to_equal(job_id)
        expect(result["taskUrl"]).to_equal(task.get_url())
        expect(result).to_be_enqueued()

        expect(Task.objects.count()).to_equal(1)
def get_job_execution(task_id, job_id, execution_id):
    """Render details for one execution of a job; 404 when either is missing."""
    logger = g.logger.bind(
        operation="get_job_execution",
        task_id=task_id,
        job_id=job_id,
        execution_id=execution_id,
    )
    logger.debug("Getting job...")

    job = Job.get_by_id(task_id=task_id, job_id=job_id)
    if job is None:
        return return_error(
            f"Task ({task_id}) or Job ({job_id}) not found.",
            "get_job_execution",
            status=404,
            logger=logger,
        )

    execution = job.get_execution_by_id(execution_id)
    if execution is None:
        return return_error(
            f"Job Execution ({execution_id}) not found in job ({job_id}).",
            "get_job_execution",
            status=404,
            logger=logger,
        )

    logger.debug("Job execution retrieved successfully...")
    return format_execution_details(job.task, job, execution)
def test_enqueue7(client):
    """User-supplied metadata is stored under the job's "custom" metadata key."""
    with client.application.app_context():
        client.application.redis.flushall()
        task_id = str(uuid4())
        payload = {"image": "ubuntu", "command": "ls", "metadata": {"a": 123, "b": 456}}

        response = client.post(
            f"/tasks/{task_id}/",
            data=dumps(payload),
            headers={"Content-Type": "application/json"},
            follow_redirects=True,
        )
        expect(response.status_code).to_equal(200)

        result = loads(response.data)
        job_id = result["jobId"]
        expect(job_id).not_to_be_null()
        expect(result["queueJobId"]).not_to_be_null()

        job = Job.get_by_id(task_id, job_id)
        expect(job.metadata).to_include("custom")
        expect(job.metadata["custom"]).to_be_like(payload["metadata"])
def get_task(task_id):
    """List every job of a task as {id, createdAt, url} entries (JSON)."""
    logger = g.logger.bind(operation="get_task", task_id=task_id)
    logger.debug("Getting job...")

    task = Task.get_by_task_id(task_id)
    if task is None:
        return return_error("Task not found.", "get_task", status=404, logger=logger)

    logger.debug("Task retrieved successfully...")

    task_jobs = Job.objects(id__in=[str(ref.id) for ref in task.jobs])
    jobs = [
        {
            "id": str(task_job.job_id),
            "createdAt": task_job.created_at.isoformat(),
            "url": url_for(
                "task.get_job",
                task_id=task_id,
                job_id=str(task_job.job_id),
                _external=True,
            ),
        }
        for task_job in task_jobs
    ]

    return jsonify({"taskId": task_id, "jobs": jobs})
def get_job(task_id, job_id):
    """Return full details for a job, including its executions with URLs.

    Sensitive environment words are blacklisted out of the serialized job.
    Returns a 404 error response when the job does not exist in the task.
    """
    logger = g.logger.bind(operation="get_job", task_id=task_id, job_id=job_id)
    logger.debug("Getting job...")
    job = Job.get_by_id(task_id=task_id, job_id=job_id)

    if job is None:
        # BUG FIX: the error used operation name "get_task" (copy-paste);
        # it must report this endpoint's own operation, matching siblings.
        return return_error(
            "Job not found in task.", "get_job", status=404, logger=logger
        )

    logger.debug("Job retrieved successfully...")

    details = job.to_dict(
        include_log=True,
        include_error=True,
        blacklist=current_app.config["ENV_BLACKLISTED_WORDS"].lower().split(","),
    )

    # Attach a fully-qualified URL to each serialized execution.
    for execution in details["executions"]:
        execution["url"] = url_for(
            "execution.get_job_execution",
            task_id=task_id,
            job_id=job_id,
            execution_id=execution["executionId"],
            _external=True,
        )

    task_url = url_for("task.get_task", task_id=task_id, _external=True)

    return jsonify({"task": {"id": task_id, "url": task_url}, "job": details})
def test_enqueue1(client):
    """Test enqueue a job works"""
    with client.application.app_context():
        task_id = str(uuid4())
        data = {"image": "ubuntu", "command": "ls"}
        response = client.post(f"/tasks/{task_id}/", data=dumps(data), follow_redirects=True)
        expect(response.status_code).to_equal(200)
        obj = loads(response.data)
        # The response must point back at the task and at the new job/execution.
        expect(obj["taskUrl"]).to_equal(
            f"http://localhost:10000/tasks/{task_id}/")
        job_id = obj["jobId"]
        expect(job_id).not_to_be_null()
        expect(obj["jobUrl"]).to_equal(
            f"http://localhost:10000/tasks/{task_id}/jobs/{job_id}/")
        expect(obj["queueJobId"]).not_to_be_null()
        expect(obj["executionId"]).not_to_be_null()
        execution_id = obj["executionId"]
        expect(obj["executionUrl"]).to_equal(
            f"http://localhost:10000/tasks/{task_id}/jobs/{job_id}/executions/{execution_id}/"
        )
        # The task and job must be persisted and linked.
        task = Task.get_by_task_id(obj["taskId"])
        expect(task).not_to_be_null()
        expect(task.jobs).not_to_be_empty()
        j = task.jobs[0]
        job = Job.objects(id=j.id).first()
        expect(str(job.job_id)).to_equal(job_id)
        expect(obj["taskUrl"]).to_equal(task.get_url())
        # The queue entry must carry the expected payload fields.
        expect(obj).to_be_enqueued()
        expect(obj).to_be_enqueued_with_value("status", "queued")
        expect(obj).to_be_enqueued_with_value("created_at")
        expect(obj).to_be_enqueued_with_value("enqueued_at")
        expect(obj).to_be_enqueued_with_value("data")
        expect(obj).to_be_enqueued_with_value("origin", "jobs")
        # The worker invocation is fully specified in the queue description.
        expect(obj).to_be_enqueued_with_value(
            "description",
            f"fastlane.worker.job.run_job('{obj['taskId']}', '{job_id}', '{execution_id}', 'ubuntu', 'ls')",
        )
        expect(obj).to_be_enqueued_with_value("timeout", "-1")
        count = Task.objects.count()
        expect(count).to_equal(1)
def enqueue_missing_monitor_jobs(app):
    """Re-enqueue monitoring for unfinished executions that lost their monitor.

    Guarded by a redis lock so only one scheduler instance performs the sweep
    at a time; gives up quietly when the lock cannot be acquired.
    """
    lock = app.redis.lock(
        "EnqueueMissingMonitorJobs",
        timeout=7,
        sleep=0.2,
        blocking_timeout=500,
        thread_local=False,
    )
    if not lock.acquire():
        app.logger.info(
            "Lock could not be acquired. Trying to enqueue missing monitor jobs later."
        )
        return

    try:
        # find running/created executions
        executions = Job.get_unfinished_executions(app)
        queue = app.monitor_queue

        executions_to_monitor = []
        for job, execution in executions:
            # Skip executions that already have a scheduled monitor entry.
            if "enqueued_id" in job.metadata and queue.is_scheduled(
                job.metadata["enqueued_id"]
            ):
                continue
            executions_to_monitor.append((job, execution))

        if not executions_to_monitor:
            return

        # CONSISTENCY FIX: use the explicit `app` handle throughout instead
        # of mixing it with `current_app` (both name the same application
        # inside this worker).
        app.logger.info(
            "Found executions missing monitoring. Enqueueing monitor.",
            executions=len(executions_to_monitor),
        )

        # enqueue if execution not scheduled to be monitored
        for job, execution in executions_to_monitor:
            app.monitor_queue.enqueue_in(
                "5s",
                Categories.Monitor,
                job.task.task_id,
                job.job_id,
                execution.execution_id,
            )
    finally:
        try:
            lock.release()
        except Exception as err:
            # Best-effort release: the lock expires on its own via `timeout`.
            app.logger.error("Lock release error", error=err)
def retrieve_execution_details(task_id, job_id, execution_id=None, get_data_fn=None):
    """Respond with data extracted from a job execution (its log by default).

    The execution's exit code travels in the ``Fastlane-Exit-Code`` header.
    Falls back to the last execution when no execution id is given, and
    returns an error response when the job or execution cannot be found.
    """
    if get_data_fn is None:
        def get_data_fn(execution):
            return execution.log

    logger = g.logger.bind(operation="get_response", task_id=task_id, job_id=job_id)
    logger.debug("Getting job...")

    job = Job.get_by_id(task_id=task_id, job_id=job_id)
    if job is None:
        return return_error(
            f"Task ({task_id}) or Job ({job_id}) not found.",
            "retrieve_execution_details",
            status=404,
            logger=logger,
        )

    if not job.executions:
        return return_error(
            f"No executions found in job ({job_id}).",
            "retrieve_execution_details",
            status=400,
            logger=logger,
        )

    if execution_id is None:
        execution = job.get_last_execution()
    else:
        execution = job.get_execution_by_id(execution_id)

    if not execution:
        return return_error(
            "No executions found in job with specified arguments.",
            "retrieve_execution_details",
            status=400,
            logger=logger,
        )

    headers = {"Fastlane-Exit-Code": str(execution.exit_code)}

    # Data is only exposed once the execution has left the queued/running phase.
    unfinished = (JobExecution.Status.running, JobExecution.Status.enqueued)
    logs = "" if execution.status in unfinished else get_data_fn(execution)

    return Response(headers=headers, response=logs, status=200)
def test_get_unscheduled_jobs1(client):
    """Only cron jobs lacking an enqueued_id are reported as unscheduled."""
    with client.application.app_context():
        task_id = str(uuid4())
        payload = {"image": "ubuntu", "command": "ls", "cron": "* * * * *"}

        response = client.post(
            f"/tasks/{task_id}/", data=dumps(payload), follow_redirects=True
        )
        expect(response.status_code).to_equal(200)

        result = loads(response.data)
        Job.get_by_id(task_id, result["jobId"])

        # This fixture job has cron metadata but no enqueued_id.
        fixture_job = JobFixture.new(metadata={"cron": "* * * * *"})

        unscheduled_jobs = Job.get_unscheduled_jobs(client.application)
        expect(unscheduled_jobs).to_length(1)
        expect(unscheduled_jobs[0].job_id).to_equal(fixture_job.job_id)
def retry_job(task_id, job_id):
    """Re-run the last execution of a job, stopping it first if still running.

    Scheduled jobs cannot be retried. Creates a new execution with the same
    image/command and enqueues it, recording the queue id in the job metadata.
    """
    logger = g.logger.bind(operation="retry", task_id=task_id, job_id=job_id)
    logger.debug("Getting job...")

    job = Job.get_by_id(task_id=task_id, job_id=job_id)
    if job is None:
        return return_error("Job not found in task.", "retry_job", status=404, logger=logger)

    execution = job.get_last_execution()
    if execution is None:
        return return_error("No execution yet to retry.", "retry_job", status=400, logger=logger)

    is_scheduled = "enqueued_id" in job.metadata and current_app.jobs_queue.is_scheduled(
        job.metadata["enqueued_id"]
    )
    if is_scheduled:
        return return_error("Can't retry a scheduled job.", "retry_job", status=400, logger=logger)

    if execution.status == JobExecution.Status.running:
        # A running execution must be stopped before retrying.
        logger.debug("Stopping current execution...")
        current_app.executor.stop_job(job.task, job, execution)
        logger.debug("Current execution stopped.")

    execution.status = JobExecution.Status.failed
    job.save()

    new_execution = job.create_execution(execution.image, execution.command)
    new_execution.status = JobExecution.Status.enqueued

    logger.debug("Enqueuing job execution...")
    enqueue_args = [
        task_id,
        job_id,
        new_execution.execution_id,
        execution.image,
        execution.command,
    ]
    queue_result = current_app.jobs_queue.enqueue(Categories.Job, *enqueue_args)
    job.metadata["enqueued_id"] = queue_result.id
    job.save()
    logger.info("Job execution enqueued successfully.")

    return get_job_summary(task_id, job_id)
def test_enqueue6(client):
    """Webhook configuration is preserved verbatim in the job metadata."""
    with client.application.app_context():
        client.application.redis.flushall()
        task_id = str(uuid4())
        webhooks = {
            "succeeds": [{"method": "GET", "url": "http://some.test.url"}],
            "fails": [{"method": "GET", "url": "http://some.test.url"}],
            "finishes": [{"method": "POST", "url": "http://some.test.url"}],
        }
        payload = {"image": "ubuntu", "command": "ls", "webhooks": webhooks}

        response = client.post(
            f"/tasks/{task_id}/",
            data=dumps(payload),
            headers={"Content-Type": "application/json"},
            follow_redirects=True,
        )
        expect(response.status_code).to_equal(200)

        result = loads(response.data)
        job_id = result["jobId"]
        expect(job_id).not_to_be_null()
        expect(result["queueJobId"]).not_to_be_null()

        job = Job.get_by_id(task_id, job_id)
        expect(job).not_to_be_null()
        expect(job.metadata).to_include("webhooks")
        expect(job.metadata["webhooks"]).to_be_like(webhooks)
def enqueue(client, input_data):
    """Post a task with ``input_data`` as metadata and assert it is dropped.

    Helper for tests exercising metadata payloads the server should not
    store: the resulting job must not carry a "custom" metadata entry.
    """
    app = client.application
    app.redis.flushall()

    task_id = str(uuid4())
    payload = {"image": "ubuntu", "command": "ls", "metadata": input_data}
    response = client.post(
        f"/tasks/{task_id}/",
        data=dumps(payload),
        headers={"Content-Type": "application/json"},
        follow_redirects=True,
    )
    expect(response.status_code).to_equal(200)

    result = loads(response.data)
    job_id = result["jobId"]
    expect(job_id).not_to_be_null()
    expect(result["queueJobId"]).not_to_be_null()

    job = Job.get_by_id(task_id, job_id)
    expect(job.metadata).not_to_include("custom")
def stop_job_execution(task_id, job_id, execution_id):
    """Stop one specific execution of a job without touching its schedule."""
    logger = g.logger.bind(
        operation="stop_job_execution",
        task_id=task_id,
        job_id=job_id,
        execution_id=execution_id,
    )
    logger.debug("Getting job...")

    job = Job.get_by_id(task_id=task_id, job_id=job_id)
    if job is None:
        return return_error(
            f"Task ({task_id}) or Job ({job_id}) not found.",
            "stop_job_execution",
            status=404,
            logger=logger,
        )

    execution = job.get_execution_by_id(execution_id)
    if execution is None:
        return return_error(
            f"Job Execution ({execution_id}) not found in Job ({job_id}).",
            "stop_job_execution",
            status=404,
            logger=logger,
        )

    _, error_response = perform_stop_job_execution(
        job, execution=execution, logger=logger, stop_schedule=False
    )
    if error_response is not None:
        return error_response

    return format_execution_details(job.task, job, execution, shallow=True)
def process_job_execution_logs(websocket, task_id, job_id, execution_id, logger):
    """Stream execution logs to *websocket* inline (no child process).

    When the execution id is None the last execution is streamed. Closes the
    websocket — asking the client to retry — when the job or execution is
    missing.
    """
    job = Job.get_by_id(task_id=task_id, job_id=job_id)
    if job is None:
        logger.error(f"Job ({job_id}) not found in task ({task_id}).")
        websocket.close(code=CODE_OK)
        return

    if execution_id is None:
        execution = job.get_last_execution()
    else:
        execution = job.get_execution_by_id(execution_id)

    if execution is None:
        # BUG FIX: the message was missing the f-prefix, so the literal text
        # "({execution_id})" was logged instead of the actual id.
        logger.error(f"No executions found in job ({execution_id}).")
        websocket.close(message="wsretry", code=CODE_OK)
        return

    executor = current_app.executor
    stream_log(executor, task_id, job, execution, websocket)
def send_email(task_id, job_id, execution_id, subject, to_email):
    """E-mail the details of a job execution to *to_email*.

    Returns True when the message was sent, False when the job cannot be
    found or SMTP is not configured. Any SMTP failure is logged and
    re-raised so the queue can retry.
    """
    job = Job.get_by_id(task_id, job_id)
    logger = current_app.logger.bind(
        operation="send_email",
        task_id=task_id,
        job_id=job_id,
        to_email=to_email,
        execution_id=execution_id,
        subject=subject,
    )

    if job is None:
        logger.error("Failed to retrieve task or job.")
        return False

    execution = job.get_execution_by_id(execution_id)
    logger.info("Execution loaded successfully")

    smtp_host = current_app.config["SMTP_HOST"]
    smtp_port = current_app.config["SMTP_PORT"]
    smtp_from = current_app.config["SMTP_FROM"]

    if smtp_host is None or smtp_port is None or smtp_from is None:
        logger.error(
            "SMTP_HOST, SMTP_PORT and SMTP_FROM must be configured. Skipping sending e-mail."
        )
        return False

    try:
        smtp_port = int(smtp_port)
        logger = logger.bind(smtp_host=smtp_host, smtp_port=smtp_port)

        logger.info("Connecting to SMTP Server...")
        server = smtplib.SMTP(smtp_host, smtp_port)
        server.set_debuglevel(0)

        if current_app.config.get("SMTP_USE_SSL"):
            logger.info("Starting TLS...")
            server.starttls()

        smtp_user = current_app.config.get("SMTP_USER")
        smtp_password = current_app.config.get("SMTP_PASSWORD")
        if smtp_user and smtp_password:
            # SECURITY FIX: never write the SMTP password to the logs.
            logger.info("Authenticating with SMTP...", smtp_user=smtp_user)
            server.login(smtp_user, smtp_password)

        from_email = current_app.config["SMTP_FROM"]
        task_url = url_for("task.get_task", task_id=task_id, _external=True)
        job_url = url_for(
            "task.get_job", task_id=task_id, job_id=job_id, _external=True
        )
        job_data = dumps(
            execution.to_dict(include_log=True, include_error=True),
            indent=4,
            sort_keys=True,
        )

        # NOTE(review): reconstructed multi-line literal — original line
        # breaks were lost in the file flattening; confirm exact wording.
        body = (
            """
Automatic message. Please do not reply to this!

Job Details:
%s
"""
            % job_data
        )

        subj = "[Fastlane] %s" % subject

        msg = MIMEMultipart("alternative")
        msg["Subject"] = subj
        msg["From"] = from_email
        msg["To"] = to_email

        part1 = MIMEText(body, "plain")
        html_body = """<html><body>
<h2>Job Details:</h2>
<div><pre><code>%s</code></pre></div>
<div>---</div>
<p><a href="%s">[View Task Details]</a> | <a href="%s">[View Job Details]</a></p>
<div>---</div>
<p>Automatic message. Please do not reply to this!</p>
</body></html>
""" % (
            job_data,
            task_url,
            job_url,
        )
        part2 = MIMEText(html_body, "html")

        msg.attach(part1)
        msg.attach(part2)

        logger.info("Sending email...")
        server.sendmail(from_email, to_email, msg.as_string())
        server.quit()
        logger.info("Email sent successfully.")
    except Exception as exc:
        logger.error(
            "Sending e-mail failed with exception!", error=traceback.format_exc()
        )
        raise exc

    return True
def run_job(task_id, job_id, execution_id, image, command):
    """Worker entry point: load/create an execution, pull the image, start it.

    Returns True when the container was started and monitoring scheduled,
    False on any validation failure or error (errors are reported).
    """
    app = current_app
    logger = app.logger.bind(
        operation="run_job",
        task_id=task_id,
        job_id=job_id,
        image=image,
        command=command,
    )

    try:
        executor = app.executor

        job = Job.get_by_id(task_id, job_id)
        if job is None:
            logger.error("Job was not found with task id and job id.")
            return False

        if not validate_max_concurrent(
            executor, task_id, job, execution_id, image, command, logger
        ):
            return False

        # BUG FIX: split only at the LAST colon. A plain split(":") raised
        # ValueError for images with more than one colon (e.g.
        # "registry:5000/app:tag"); rsplit keeps single-colon behavior.
        tag = "latest"
        if ":" in image:
            image, tag = image.rsplit(":", 1)

        logger = logger.bind(image=image, tag=tag)

        logger.debug("Changing job status...", status=JobExecution.Status.enqueued)
        if execution_id is None:
            ex = job.create_execution(image=image, command=command)
            ex.status = JobExecution.Status.enqueued
            ex.save()
        else:
            ex = job.get_execution_by_id(execution_id)
        logger.debug("Job status changed successfully.", status=ex.status)

        logger = logger.bind(execution_id=ex.execution_id)
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to create job execution. Skipping job...", error=error)
        current_app.report_error(
            err,
            metadata=dict(task_id=task_id, job_id=job_id, image=image, command=command),
        )
        return False

    try:
        if not validate_expiration(job, ex, logger):
            return False

        logger.info("Started processing job.")

        if not download_image(executor, job, ex, image, tag, command, logger):
            return False

        if not run_container(executor, job, ex, image, tag, command, logger):
            return False

        logger.debug("Changing job status...", status=JobExecution.Status.running)
        ex.status = JobExecution.Status.running
        ex.save()
        job.save()
        logger.debug(
            "Job status changed successfully.", status=JobExecution.Status.running
        )

        # Hand off to the monitor to track the container's progress.
        current_app.monitor_queue.enqueue_in(
            "1s", Categories.Monitor, task_id, job_id, ex.execution_id
        )

        return True
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to run job", error=error)
        ex.status = JobExecution.Status.failed
        ex.error = "Job failed to run with error: %s" % error
        ex.save()
        job.save()
        current_app.report_error(
            err,
            metadata=dict(
                operation="Running Container",
                task_id=task_id,
                job_id=job_id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )
def status():
    """Operational snapshot: hosts, running containers, queue sizes,
    scheduled jobs and version info, rendered as JSON with HTTP 200."""
    executor = current_app.executor
    version = pkg_resources.get_distribution("fastlane").version
    metadata = {"hosts": [], "containers": {"running": []}}

    containers = executor.get_running_containers()

    for host, port, container_id in containers["running"]:
        metadata["containers"]["running"].append({
            "host": host,
            "port": port,
            "id": container_id
        })

    # Report every known host, reachable or not.
    metadata[
        "hosts"] = [] + containers["available"] + containers["unavailable"]

    metadata["queues"] = {}

    # One length entry per application queue (redis list length).
    for queue_name in [
        QueueNames.Job,
        QueueNames.Monitor,
        QueueNames.Webhook,
        QueueNames.Notify,
    ]:
        queue = getattr(current_app, f"{queue_name}_queue")
        jobs_queue_size = current_app.redis.llen(queue.queue_name)
        metadata["queues"][queue_name] = {"length": jobs_queue_size}

    # Peek at the next scheduled item (sorted-set score is its timestamp).
    next_scheduled = current_app.redis.zrange(Queue.SCHEDULED_QUEUE_NAME, 0, 0, withscores=True)

    if not next_scheduled:
        next_timestamp = None
        next_human = None
    else:
        next_timestamp = next_scheduled[0][1]
        next_human = from_unix(next_timestamp).isoformat()

    metadata["queues"]["scheduled"] = {
        "length": current_app.redis.zcard(Queue.SCHEDULED_QUEUE_NAME),
        "nextTimeStamp": next_timestamp,
        "nextHumanReadableDate": next_human,
    }

    metadata["tasks"] = {"count": Task.objects.count()}

    metadata["jobs"] = {"count": Job.objects.count()}
    metadata["jobs"]["scheduled"] = []
    scheduled_jobs = Job.objects(scheduled=True).all()

    metadata["fastlane"] = {
        "version": version,
        "executor": current_app.config["EXECUTOR"],
    }

    # Serialize each cron-scheduled job with its next run time and URLs.
    for job in scheduled_jobs:
        j = job.to_dict(include_executions=False, blacklist_fn=current_app.blacklist_words_fn)
        itr = croniter.croniter(job.metadata["cron"], datetime.utcnow())
        j["nextScheduledAt"] = itr.get_next(datetime).isoformat()

        task_id = job.task.task_id
        job_url = url_for("task.get_job", task_id=task_id, job_id=str(job.job_id), _external=True)
        j["url"] = job_url
        stop_job_url = url_for("task.stop_job", task_id=task_id, job_id=str(job.job_id), _external=True)
        j["stopUrl"] = stop_job_url
        task_url = url_for("task.get_task", task_id=task_id, _external=True)
        # Replace the flat taskId with a nested task object carrying its URL.
        del j["taskId"]
        j["task"] = {"id": task_id, "url": task_url}
        metadata["jobs"]["scheduled"].append(j)

    return jsonify(metadata), 200
def status():
    """Operational snapshot: hosts, containers, rq queue lengths and
    scheduled jobs, rendered as JSON with HTTP 200."""
    executor = current_app.executor
    version = pkg_resources.get_distribution("fastlane").version
    metadata = {"hosts": [], "containers": {"running": []}}

    containers = executor.get_running_containers()
    for host, port, container_id in containers["running"]:
        metadata["containers"]["running"].append({
            "host": host,
            "port": port,
            "id": container_id
        })

    # Report every known host, reachable or not.
    metadata[
        "hosts"] = [] + containers["available"] + containers["unavailable"]

    metadata["queues"] = {"jobs": {}, "monitor": {}, "error": {}}
    for queue in ["jobs", "monitor", "error"]:
        jobs_queue_size = current_app.redis.llen(f"rq:queue:{queue}")
        metadata["queues"][queue]["length"] = jobs_queue_size

    metadata["tasks"] = {"count": Task.objects.count()}
    metadata["jobs"] = {"count": Job.objects.count()}
    metadata["jobs"]["scheduled"] = []
    scheduled_jobs = Job.objects(scheduled=True).all()

    metadata["fastlane"] = {
        "version": version,
        "executor": current_app.config["EXECUTOR"],
    }

    for job in scheduled_jobs:
        j = job.to_dict(include_executions=False)
        itr = croniter.croniter(job.metadata["cron"], datetime.utcnow())
        j["nextScheduledAt"] = itr.get_next(datetime).isoformat()

        task_id = job.task.task_id
        # BUG FIX: routes identify jobs by job_id, not the database id —
        # every other URL builder here (e.g. get_task) passes str(job.job_id).
        job_url = url_for("task.get_job", task_id=task_id,
                          job_id=str(job.job_id), _external=True)
        j["url"] = job_url
        stop_job_url = url_for("task.stop_job", task_id=task_id,
                               job_id=str(job.job_id), _external=True)
        j["stopUrl"] = stop_job_url
        task_url = url_for("task.get_task", task_id=task_id, _external=True)

        # Replace the flat taskId with a nested task object carrying its URL.
        del j["taskId"]
        j["task"] = {"id": task_id, "url": task_url}
        metadata["jobs"]["scheduled"].append(j)

    return jsonify(metadata), 200
def send_webhook(
    task_id, job_id, execution_id, method, url, headers, retries, retry_count
):
    """Dispatch a webhook carrying the execution's serialized details.

    Records every dispatch attempt (success or failure) in the execution's
    "webhookDispatch" metadata. On dispatch failure, schedules a retry with
    exponential backoff until *retries* is exhausted. Returns True unless
    the job cannot be found.
    """
    app = current_app
    job = Job.get_by_id(task_id, job_id)
    logger = app.logger.bind(
        operation="send_webhook",
        task_id=task_id,
        job_id=job_id,
        execution_id=execution_id,
        method=method,
        url=url,
        headers=headers,
        retries=retries,
        retry_count=retry_count,
    )

    if job is None:
        logger.error("Failed to retrieve task or job.")
        return False

    execution = job.get_execution_by_id(execution_id)
    logger.info("Execution loaded successfully")

    data = execution.to_dict(include_log=True, include_error=True)
    # Round-trip through JSON to get plain serializable types.
    data = loads(dumps(data))
    # Do not leak previous dispatch bookkeeping to the receiver.
    if "webhookDispatch" in data["metadata"]:
        del data["metadata"]["webhookDispatch"]
    data["metadata"]["custom"] = job.metadata.get("custom", {})
    data["job_id"] = job_id
    data = dumps(data)

    try:
        dispatcher = WebhooksDispatcher()
        response = dispatcher.dispatch(method, url, data, headers)
        execution.metadata.setdefault("webhookDispatch", [])
        execution.metadata["webhookDispatch"].append(
            {
                "timestamp": datetime.utcnow().isoformat(),
                "url": url,
                "statusCode": response.status_code,
                "body": response.body,
                "headers": response.headers,
            }
        )
        execution.save()
        job.save()
        logger.info("Webhook dispatched successfully.")
    except WebhooksDispatchError as err:
        error = traceback.format_exc()
        execution.metadata.setdefault("webhookDispatch", [])
        execution.metadata["webhookDispatch"].append(
            {
                "timestamp": datetime.utcnow().isoformat(),
                "url": url,
                "statusCode": err.status_code,
                "body": err.body,
                "headers": err.headers,
                "error": error,
            }
        )
        # Keep only the last 3 dispatch records on the failure path.
        # NOTE(review): the success path above does not trim — confirm
        # whether unbounded growth there is intentional.
        execution.metadata["webhookDispatch"] = execution.metadata["webhookDispatch"][
            -3:
        ]
        execution.save()
        job.save()
        logger.error("Failed to dispatch webhook.", err=error)

        if retry_count < retries:
            logger.debug("Retrying...")
            args = [
                task_id,
                job_id,
                execution_id,
                method,
                url,
                headers,
                retries,
                retry_count + 1,
            ]
            # Exponential backoff: min_backoff * factor ** retry_count.
            factor = app.config["WEBHOOKS_EXPONENTIAL_BACKOFF_FACTOR"]
            min_backoff = app.config["WEBHOOKS_EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0
            delta = to_unix(
                datetime.utcnow()
                + timedelta(seconds=math.pow(factor, retry_count) * min_backoff)
            )
            current_app.webhooks_queue.enqueue_at(delta, Categories.Webhook, *args)
            logger.info("Webhook dispatch retry scheduled.", date=delta)

    return True
def monitor_job(task_id, job_id, execution_id):
    """Poll the executor for an execution's result and finalize or reschedule.

    Handles missing containers, timeouts, retries with exponential backoff,
    unreachable hosts (monitoring is re-enqueued), final status persistence
    and webhook/notification fan-out. Returns True when the execution was
    finalized or monitoring continues, False on any terminal failure path.
    """
    try:
        app = current_app
        executor = app.executor

        # BUG FIX: bind the logger before any call that can raise, so the
        # outer except clause can never hit an unbound `logger`.
        logger = app.logger.bind(
            operation="monitor_job",
            task_id=task_id,
            job_id=job_id,
            execution_id=execution_id,
        )

        job = Job.get_by_id(task_id, job_id)
        if job is None:
            logger.error("Failed to retrieve task or job.")
            return False

        execution = job.get_execution_by_id(execution_id)
        if execution.status not in (JobExecution.Status.running,):
            logger.error("Execution result already retrieved. Skipping monitoring...")
            return False

        try:
            result = executor.get_result(job.task, job, execution)
        except HostUnavailableError as err:
            error = traceback.format_exc()
            logger.error("Failed to get results.", error=error)
            current_app.report_error(
                err,
                metadata=dict(
                    operation="Monitoring Job",
                    task_id=task_id,
                    job_id=job_id,
                    execution_id=execution_id,
                ),
            )
            reenqueue_monitor_due_to_break(task_id, job_id, execution_id)
            logger.warn("Job monitor re-enqueued successfully.")
            return False

        if result is None:
            # The container vanished from the docker host: fail the execution.
            # BUG FIX: the exit code was read from `result`, which is None on
            # this branch and raised AttributeError before anything was saved.
            execution.finished_at = datetime.utcnow()
            execution.status = JobExecution.Status.failed
            execution.log = ""
            execution.error = (
                "Job failed since container could not be found in docker host."
            )
            logger.debug(
                "Job failed, since container could not be found in host.",
                status="failed",
            )
            execution.save()
            job.save()
            send_webhooks(job.task, job, execution, logger)
            notify_users(job.task, job, execution, logger)
            return False

        logger.info(
            "Container result obtained.",
            container_status=result.status,
            container_exit_code=result.exit_code,
        )

        if result.status in (
            ExecutionResult.Status.created,
            ExecutionResult.Status.running,
        ):
            ellapsed = (datetime.utcnow() - execution.started_at).total_seconds()

            if ellapsed > job.metadata["timeout"]:
                execution.finished_at = datetime.utcnow()
                execution.status = JobExecution.Status.timedout
                execution.error = "Job execution timed out after %d seconds." % ellapsed

                try:
                    executor.stop_job(job.task, job, execution)
                except HostUnavailableError as err:
                    error = traceback.format_exc()
                    logger.error("Failed to stop job.", error=error)
                    current_app.report_error(
                        err,
                        metadata=dict(
                            operation="Monitoring Job",
                            task_id=task_id,
                            job_id=job_id,
                            execution_id=execution_id,
                        ),
                    )
                    reenqueue_monitor_due_to_break(task_id, job_id, execution_id)
                    logger.warn("Job monitor re-enqueued successfully.")
                    return False

                logger.debug(
                    "Job execution timed out. Storing job details in mongo db.",
                    status=execution.status,
                    ellapsed=ellapsed,
                    error=result.error,
                )
                execution.save()
                job.save()
                logger.info("Job execution timed out.", status=execution.status)
                send_webhooks(job.task, job, execution, logger)
                notify_users(job.task, job, execution, logger)
                return False

            # Still running within its timeout: check back later.
            logger.info(
                "Job has not finished. Retrying monitoring in the future.",
                container_status=result.status,
                seconds=1,
            )
            current_app.monitor_queue.enqueue_in(
                "5s", Categories.Monitor, task_id, job_id, execution_id
            )
            return True

        if (
            result.exit_code != 0
            and "retry_count" in job.metadata
            and job.metadata["retry_count"] < job.metadata["retries"]
        ):
            retry_logger = logger.bind(
                exit_code=result.exit_code,
                retry_count=job.metadata["retry_count"],
                retries=job.metadata["retries"],
            )
            retry_logger.debug("Job failed. Enqueuing job retry...")
            job.metadata["retry_count"] += 1

            new_exec = job.create_execution(execution.image, execution.command)
            new_exec.status = JobExecution.Status.enqueued

            args = [
                task_id,
                job_id,
                new_exec.execution_id,
                execution.image,
                execution.command,
            ]

            # Exponential backoff before the retry runs.
            factor = app.config["EXPONENTIAL_BACKOFF_FACTOR"]
            min_backoff = app.config["EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0
            delta = timedelta(seconds=min_backoff)
            if job.metadata["retries"] > 0:
                delta = timedelta(
                    seconds=math.pow(factor, job.metadata["retry_count"]) * min_backoff
                )
            future_date = datetime.utcnow() + delta
            enqueued_id = current_app.jobs_queue.enqueue_at(
                to_unix(future_date), Categories.Job, *args
            )
            job.metadata["enqueued_id"] = enqueued_id
            job.save()
            retry_logger.info("Job execution enqueued successfully.")

            # still need to finish current execution as the retry
            # will be a new execution

        execution.finished_at = datetime.utcnow()
        execution.exit_code = result.exit_code
        execution.status = (
            JobExecution.Status.done
            if execution.exit_code == 0
            else JobExecution.Status.failed
        )
        execution.log = result.log.decode("utf-8")
        execution.error = result.error.decode("utf-8")

        logger.debug(
            "Job finished. Storing job details in mongo db.",
            status=execution.status,
            log=result.log,
            error=result.error,
        )
        execution.save()
        job.save()
        logger.info("Job details stored in mongo db.", status=execution.status)

        try:
            executor.mark_as_done(job.task, job, execution)
        except HostUnavailableError:
            error = traceback.format_exc()
            logger.error("Failed to mark job as done.", error=error)
            reenqueue_monitor_due_to_break(task_id, job_id, execution_id)
            logger.warn("Job monitor re-enqueued successfully.")
            return False

        send_webhooks(job.task, job, execution, logger)
        notify_users(job.task, job, execution, logger)

        return True
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to monitor job", error=error)
        current_app.report_error(
            err,
            metadata=dict(
                operation="Monitoring Job",
                task_id=task_id,
                job_id=job_id,
                execution_id=execution_id,
            ),
        )
        raise err