def test_job_get_by_job_id(client):  # pylint: disable=unused-argument
    """Test getting a job by id"""
    task_id = str(uuid4())
    task = Task.create_task(task_id)
    job = task.create_job()

    topic = Job.get_by_id(task_id, job.job_id)
    expect(topic).not_to_be_null()
    expect(topic.job_id).to_equal(str(job.job_id))

    topic = Job.get_by_id("invalid", "invalid")
    expect(topic).to_be_null()
def test_enqueue7(client):
    """Test enqueue with metadata"""
    with client.application.app_context():
        app = client.application
        app.redis.flushall()

        task_id = str(uuid4())
        data = {"image": "ubuntu", "command": "ls", "metadata": {"a": 123, "b": 456}}
        options = dict(
            data=dumps(data),
            headers={"Content-Type": "application/json"},
            follow_redirects=True,
        )

        response = client.post(f"/tasks/{task_id}/", **options)
        expect(response.status_code).to_equal(200)
        obj = loads(response.data)
        job_id = obj["jobId"]
        expect(job_id).not_to_be_null()
        expect(obj["queueJobId"]).not_to_be_null()

        j = Job.get_by_id(task_id, job_id)
        expect(j.metadata).to_include("custom")
        metadata = j.metadata["custom"]
        expect(metadata).to_be_like(data["metadata"])
def get_job_execution(task_id, job_id, execution_id):
    logger = g.logger.bind(
        operation="get_job_execution",
        task_id=task_id,
        job_id=job_id,
        execution_id=execution_id,
    )

    logger.debug("Getting job...")
    job = Job.get_by_id(task_id=task_id, job_id=job_id)

    if job is None:
        msg = f"Task ({task_id}) or Job ({job_id}) not found."
        return return_error(msg, "get_job_execution", status=404, logger=logger)

    execution = job.get_execution_by_id(execution_id)

    if execution is None:
        msg = f"Job Execution ({execution_id}) not found in job ({job_id})."
        return return_error(msg, "get_job_execution", status=404, logger=logger)

    logger.debug("Job execution retrieved successfully...")

    return format_execution_details(job.task, job, execution)
def process_job_execution_logs(websocket, task_id, job_id, execution_id, logger):
    job = Job.get_by_id(task_id=task_id, job_id=job_id)

    if job is None:
        logger.error(f"Job ({job_id}) not found in task ({task_id}).")
        websocket.close()
        return

    if execution_id is None:
        execution = job.get_last_execution()
    else:
        execution = job.get_execution_by_id(execution_id)

    if execution is None:
        logger.error(f"No executions found in job ({execution_id}).")
        websocket.close(message="wsretry")
        return

    executor = current_app.executor

    # stream logs from a separate process so the loop below can watch the socket
    process = Process(
        target=stream_log, args=(executor, task_id, job, execution, websocket)
    )
    process.start()

    while not websocket.closed:
        time.sleep(10)

    process.terminate()
def test_enqueue12_2(client):
    """Test updating a scheduled job removes the scheduling and re-schedules"""
    with client.application.app_context():
        task_id = str(uuid4())
        job_id = str(uuid4())
        data = {"image": "ubuntu", "command": "ls", "startIn": "6h"}

        response = client.put(
            f"/tasks/{task_id}/jobs/{job_id}/", data=dumps(data), follow_redirects=True
        )
        expect(response.status_code).to_equal(200)

        job = Job.get_by_id(task_id, job_id)
        expect(job).not_to_be_null()

        enqueued_id = job["metadata"]["enqueued_id"]
        expect(enqueued_id).not_to_be_null()

        queue = client.application.jobs_queue
        expect(queue.is_scheduled(enqueued_id)).to_be_true()

        response = client.put(
            f"/tasks/{task_id}/jobs/{job_id}/", data=dumps(data), follow_redirects=True
        )
        expect(response.status_code).to_equal(200)

        job.reload()
        expect(job.metadata["enqueued_id"]).not_to_equal(enqueued_id)  # reschedule job
        expect(queue.is_scheduled(enqueued_id)).to_be_false()
        expect(queue.is_scheduled(job.metadata["enqueued_id"])).to_be_true()
def stop_job(task_id, job_id):
    logger = g.logger.bind(operation="stop_job", task_id=task_id, job_id=job_id)

    logger.debug("Getting job...")
    job = Job.get_by_id(task_id=task_id, job_id=job_id)

    if job is None:
        return return_error(
            "Job not found in task.", "stop_job", status=404, logger=logger
        )

    execution = job.get_last_execution()
    _, response = perform_stop_job_execution(
        job, execution=execution, logger=logger, stop_schedule=True
    )

    if response is not None:
        return response

    return get_job_summary(task_id, job_id)
def get_job(task_id, job_id):
    logger = g.logger.bind(operation="get_job", task_id=task_id, job_id=job_id)
    logger.debug("Getting job...")
    job = Job.get_by_id(task_id=task_id, job_id=job_id)

    if job is None:
        return return_error(
            "Job not found in task.", "get_job", status=404, logger=logger
        )

    logger.debug("Job retrieved successfully...")

    details = job.to_dict(
        include_log=True,
        include_error=True,
        blacklist=current_app.config["ENV_BLACKLISTED_WORDS"].lower().split(","),
    )

    for execution in details["executions"]:
        exec_url = url_for(
            "execution.get_job_execution",
            task_id=task_id,
            job_id=job_id,
            execution_id=execution["executionId"],
            _external=True,
        )
        execution["url"] = exec_url

    task_url = url_for("task.get_task", task_id=task_id, _external=True)

    return jsonify({"task": {"id": task_id, "url": task_url}, "job": details})
def test_get_unscheduled_jobs1(client):
    """Test that get_unscheduled_jobs only returns jobs without an enqueued_id"""
    with client.application.app_context():
        task_id = str(uuid4())
        data = {"image": "ubuntu", "command": "ls", "cron": "* * * * *"}
        response = client.post(
            f"/tasks/{task_id}/", data=dumps(data), follow_redirects=True
        )
        expect(response.status_code).to_equal(200)
        obj = loads(response.data)
        job_id = obj["jobId"]
        Job.get_by_id(task_id, job_id)

        # a fixture job with cron metadata but no enqueued_id counts as unscheduled
        job = JobFixture.new(metadata={"cron": "* * * * *"})

        unscheduled_jobs = Job.get_unscheduled_jobs(client.application)
        expect(unscheduled_jobs).to_length(1)
        expect(unscheduled_jobs[0].job_id).to_equal(job.job_id)
def retrieve_execution_details(task_id, job_id, execution_id=None, get_data_fn=None):
    if get_data_fn is None:
        get_data_fn = lambda execution: execution.log  # noqa: E731

    logger = g.logger.bind(operation="get_response", task_id=task_id, job_id=job_id)
    logger.debug("Getting job...")
    job = Job.get_by_id(task_id=task_id, job_id=job_id)

    if job is None:
        msg = f"Task ({task_id}) or Job ({job_id}) not found."
        return return_error(msg, "retrieve_execution_details", status=404, logger=logger)

    if not job.executions:
        msg = f"No executions found in job ({job_id})."
        return return_error(msg, "retrieve_execution_details", status=400, logger=logger)

    if execution_id is None:
        execution = job.get_last_execution()
    else:
        execution = job.get_execution_by_id(execution_id)

    if not execution:
        msg = "No executions found in job with specified arguments."
        return return_error(msg, "retrieve_execution_details", status=400, logger=logger)

    headers = {"Fastlane-Exit-Code": str(execution.exit_code)}

    if execution.status in [
        JobExecution.Status.running,
        JobExecution.Status.enqueued,
    ]:
        logs = ""
    else:
        logs = get_data_fn(execution)

    return Response(headers=headers, response=logs, status=200)
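# A minimal usage sketch (hypothetical wrappers, not part of this module): the
# get_data_fn hook lets the same lookup back both a stdout-style endpoint
# (execution.log, the default above) and a stderr-style one (execution.error).
def get_job_stdout(task_id, job_id):
    return retrieve_execution_details(task_id, job_id)


def get_job_stderr(task_id, job_id):
    return retrieve_execution_details(
        task_id, job_id, get_data_fn=lambda execution: execution.error
    )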
def retry_job(task_id, job_id):
    logger = g.logger.bind(operation="retry", task_id=task_id, job_id=job_id)
    logger.debug("Getting job...")
    job = Job.get_by_id(task_id=task_id, job_id=job_id)

    if job is None:
        return return_error(
            "Job not found in task.", "retry_job", status=404, logger=logger
        )

    execution = job.get_last_execution()

    if execution is None:
        return return_error(
            "No execution yet to retry.", "retry_job", status=400, logger=logger
        )

    if "enqueued_id" in job.metadata and current_app.jobs_queue.is_scheduled(
        job.metadata["enqueued_id"]
    ):
        msg = "Can't retry a scheduled job."
        return return_error(msg, "retry_job", status=400, logger=logger)

    if execution.status == JobExecution.Status.running:
        logger.debug("Stopping current execution...")
        executor = current_app.executor
        executor.stop_job(job.task, job, execution)
        logger.debug("Current execution stopped.")
        execution.status = JobExecution.Status.failed
        job.save()

    new_exec = job.create_execution(execution.image, execution.command)
    new_exec.status = JobExecution.Status.enqueued

    logger.debug("Enqueuing job execution...")
    args = [task_id, job_id, new_exec.execution_id, execution.image, execution.command]
    result = current_app.jobs_queue.enqueue(Categories.Job, *args)
    job.metadata["enqueued_id"] = result.id
    job.save()
    logger.info("Job execution enqueued successfully.")

    return get_job_summary(task_id, job_id)
def test_enqueue6(client):
    """Test enqueue with webhooks"""
    with client.application.app_context():
        app = client.application
        app.redis.flushall()

        task_id = str(uuid4())
        data = {
            "image": "ubuntu",
            "command": "ls",
            "webhooks": {
                "succeeds": [{"method": "GET", "url": "http://some.test.url"}],
                "fails": [{"method": "GET", "url": "http://some.test.url"}],
                "finishes": [{"method": "POST", "url": "http://some.test.url"}],
            },
        }
        options = dict(
            data=dumps(data),
            headers={"Content-Type": "application/json"},
            follow_redirects=True,
        )

        response = client.post(f"/tasks/{task_id}/", **options)
        expect(response.status_code).to_equal(200)
        obj = loads(response.data)
        job_id = obj["jobId"]
        expect(job_id).not_to_be_null()
        expect(obj["queueJobId"]).not_to_be_null()

        j = Job.get_by_id(task_id, job_id)
        expect(j).not_to_be_null()
        expect(j.metadata).to_include("webhooks")
        webhooks = j.metadata["webhooks"]
        expect(webhooks).to_be_like(data["webhooks"])
def enqueue(client, input_data):
    app = client.application
    app.redis.flushall()

    task_id = str(uuid4())
    data = {"image": "ubuntu", "command": "ls", "metadata": input_data}
    options = dict(
        data=dumps(data),
        headers={"Content-Type": "application/json"},
        follow_redirects=True,
    )

    response = client.post(f"/tasks/{task_id}/", **options)
    expect(response.status_code).to_equal(200)
    obj = loads(response.data)
    job_id = obj["jobId"]
    expect(job_id).not_to_be_null()
    expect(obj["queueJobId"]).not_to_be_null()

    j = Job.get_by_id(task_id, job_id)
    expect(j.metadata).not_to_include("custom")
def stop_job_execution(task_id, job_id, execution_id):
    logger = g.logger.bind(
        operation="stop_job_execution",
        task_id=task_id,
        job_id=job_id,
        execution_id=execution_id,
    )

    logger.debug("Getting job...")
    job = Job.get_by_id(task_id=task_id, job_id=job_id)

    if job is None:
        msg = f"Task ({task_id}) or Job ({job_id}) not found."
        return return_error(msg, "stop_job_execution", status=404, logger=logger)

    execution = job.get_execution_by_id(execution_id)

    if execution is None:
        msg = f"Job Execution ({execution_id}) not found in Job ({job_id})."
        return return_error(msg, "stop_job_execution", status=404, logger=logger)

    _, response = perform_stop_job_execution(
        job, execution=execution, logger=logger, stop_schedule=False
    )

    if response is not None:
        return response

    return format_execution_details(job.task, job, execution, shallow=True)
def process_job_execution_logs(websocket, task_id, job_id, execution_id, logger):
    job = Job.get_by_id(task_id=task_id, job_id=job_id)

    if job is None:
        logger.error(f"Job ({job_id}) not found in task ({task_id}).")
        websocket.close(code=CODE_OK)
        return

    if execution_id is None:
        execution = job.get_last_execution()
    else:
        execution = job.get_execution_by_id(execution_id)

    if execution is None:
        logger.error(f"No executions found in job ({execution_id}).")
        websocket.close(message="wsretry", code=CODE_OK)
        return

    executor = current_app.executor
    stream_log(executor, task_id, job, execution, websocket)
def send_email(task_id, job_id, execution_id, subject, to_email):
    job = Job.get_by_id(task_id, job_id)
    logger = current_app.logger.bind(
        operation="send_email",
        task_id=task_id,
        job_id=job_id,
        to_email=to_email,
        execution_id=execution_id,
        subject=subject,
    )

    if job is None:
        logger.error("Failed to retrieve task or job.")
        return False

    execution = job.get_execution_by_id(execution_id)
    logger.info("Execution loaded successfully")

    smtp_host = current_app.config["SMTP_HOST"]
    smtp_port = current_app.config["SMTP_PORT"]
    smtp_from = current_app.config["SMTP_FROM"]

    if smtp_host is None or smtp_port is None or smtp_from is None:
        logger.error(
            "SMTP_HOST, SMTP_PORT and SMTP_FROM must be configured. "
            "Skipping sending e-mail."
        )
        return False

    try:
        smtp_port = int(smtp_port)
        logger = logger.bind(smtp_host=smtp_host, smtp_port=smtp_port)
        logger.info("Connecting to SMTP Server...")
        server = smtplib.SMTP(smtp_host, smtp_port)
        server.set_debuglevel(0)

        if current_app.config.get("SMTP_USE_SSL"):
            logger.info("Starting TLS...")
            server.starttls()

        smtp_user = current_app.config.get("SMTP_USER")
        smtp_password = current_app.config.get("SMTP_PASSWORD")

        if smtp_user and smtp_password:
            # do not log the password itself
            logger.info("Authenticating with SMTP...", smtp_user=smtp_user)
            server.login(smtp_user, smtp_password)

        from_email = current_app.config["SMTP_FROM"]
        task_url = url_for("task.get_task", task_id=task_id, _external=True)
        job_url = url_for(
            "task.get_job", task_id=task_id, job_id=job_id, _external=True
        )
        job_data = dumps(
            execution.to_dict(include_log=True, include_error=True),
            indent=4,
            sort_keys=True,
        )

        body = (
            """
Automatic message. Please do not reply to this!

Job Details:
%s
"""
            % job_data
        )

        subj = "[Fastlane] %s" % subject

        msg = MIMEMultipart("alternative")
        msg["Subject"] = subj
        msg["From"] = from_email
        msg["To"] = to_email

        part1 = MIMEText(body, "plain")
        html_body = """<html><body>
<h2>Job Details:</h2>
<div><pre><code>%s</code></pre></div>
<div>---</div>
<p><a href="%s">[View Task Details]</a> | <a href="%s">[View Job Details]</a></p>
<div>---</div>
<p>Automatic message. Please do not reply to this!</p>
</body></html>
""" % (
            job_data,
            task_url,
            job_url,
        )
        part2 = MIMEText(html_body, "html")

        msg.attach(part1)
        msg.attach(part2)

        logger.info("Sending email...")
        server.sendmail(from_email, to_email, msg.as_string())
        server.quit()
        logger.info("Email sent successfully.")
    except Exception as exc:
        logger.error(
            "Sending e-mail failed with exception!", error=traceback.format_exc()
        )
        raise exc

    return True
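# A minimal configuration sketch for the SMTP settings read above (the values
# are illustrative assumptions, not project defaults):
#
#   SMTP_HOST = "localhost"
#   SMTP_PORT = 25
#   SMTP_FROM = "fastlane@example.com"
#   SMTP_USE_SSL = False   # when truthy, STARTTLS is issued before login
#   SMTP_USER = None       # login is skipped unless both user and password are set
#   SMTP_PASSWORD = None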
def run_job(task_id, job_id, execution_id, image, command):
    app = current_app
    logger = app.logger.bind(
        operation="run_job",
        task_id=task_id,
        job_id=job_id,
        image=image,
        command=command,
    )

    try:
        executor = app.executor
        job = Job.get_by_id(task_id, job_id)

        if job is None:
            logger.error("Job was not found with task id and job id.")
            return False

        if not validate_max_concurrent(
            executor, task_id, job, execution_id, image, command, logger
        ):
            return False

        tag = "latest"

        if ":" in image:
            # split on the last colon so registry hosts with ports survive
            image, tag = image.rsplit(":", 1)

        logger = logger.bind(image=image, tag=tag)

        logger.debug("Changing job status...", status=JobExecution.Status.enqueued)

        if execution_id is None:
            ex = job.create_execution(image=image, command=command)
            ex.status = JobExecution.Status.enqueued
            ex.save()
        else:
            ex = job.get_execution_by_id(execution_id)

        logger.debug("Job status changed successfully.", status=ex.status)
        logger = logger.bind(execution_id=ex.execution_id)
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to create job execution. Skipping job...", error=error)
        current_app.report_error(
            err,
            metadata=dict(
                task_id=task_id, job_id=job_id, image=image, command=command
            ),
        )

        return False

    try:
        if not validate_expiration(job, ex, logger):
            return False

        logger.info("Started processing job.")

        if not download_image(executor, job, ex, image, tag, command, logger):
            return False

        if not run_container(executor, job, ex, image, tag, command, logger):
            return False

        logger.debug("Changing job status...", status=JobExecution.Status.running)
        ex.status = JobExecution.Status.running
        ex.save()
        job.save()
        logger.debug(
            "Job status changed successfully.", status=JobExecution.Status.running
        )

        current_app.monitor_queue.enqueue_in(
            "1s", Categories.Monitor, task_id, job_id, ex.execution_id
        )

        return True
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to run job", error=error)
        ex.status = JobExecution.Status.failed
        ex.error = "Job failed to run with error: %s" % error
        ex.save()
        job.save()
        current_app.report_error(
            err,
            metadata=dict(
                operation="Running Container",
                task_id=task_id,
                job_id=job_id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )

        return False
def monitor_job(task_id, job_id, execution_id):
    try:
        app = current_app
        executor = app.executor
        job = Job.get_by_id(task_id, job_id)
        logger = app.logger.bind(
            operation="monitor_job",
            task_id=task_id,
            job_id=job_id,
            execution_id=execution_id,
        )

        if job is None:
            logger.error("Failed to retrieve task or job.")
            return False

        execution = job.get_execution_by_id(execution_id)

        if execution.status not in (JobExecution.Status.running,):
            logger.error("Execution result already retrieved. Skipping monitoring...")
            return False

        try:
            result = executor.get_result(job.task, job, execution)
        except HostUnavailableError as err:
            error = traceback.format_exc()
            logger.error("Failed to get results.", error=error)
            current_app.report_error(
                err,
                metadata=dict(
                    operation="Monitoring Job",
                    task_id=task_id,
                    job_id=job_id,
                    execution_id=execution_id,
                ),
            )
            reenqueue_monitor_due_to_break(task_id, job_id, execution_id)
            logger.warn("Job monitor re-enqueued successfully.")

            return False

        if result is None:
            execution.finished_at = datetime.utcnow()
            execution.exit_code = None  # no container result to read an exit code from
            execution.status = JobExecution.Status.failed
            execution.log = ""
            execution.error = (
                "Job failed since container could not be found in docker host."
            )
            logger.debug(
                "Job failed, since container could not be found in host.",
                status="failed",
            )
            execution.save()
            job.save()
            send_webhooks(job.task, job, execution, logger)
            notify_users(job.task, job, execution, logger)

            return False

        logger.info(
            "Container result obtained.",
            container_status=result.status,
            container_exit_code=result.exit_code,
        )

        if result.status in (
            ExecutionResult.Status.created,
            ExecutionResult.Status.running,
        ):
            elapsed = (datetime.utcnow() - execution.started_at).total_seconds()

            if elapsed > job.metadata["timeout"]:
                execution.finished_at = datetime.utcnow()
                execution.status = JobExecution.Status.timedout
                execution.error = "Job execution timed out after %d seconds." % elapsed

                try:
                    executor.stop_job(job.task, job, execution)
                except HostUnavailableError as err:
                    error = traceback.format_exc()
                    logger.error("Failed to stop job.", error=error)
                    current_app.report_error(
                        err,
                        metadata=dict(
                            operation="Monitoring Job",
                            task_id=task_id,
                            job_id=job_id,
                            execution_id=execution_id,
                        ),
                    )
                    reenqueue_monitor_due_to_break(task_id, job_id, execution_id)
                    logger.warn("Job monitor re-enqueued successfully.")

                    return False

                logger.debug(
                    "Job execution timed out. Storing job details in mongo db.",
                    status=execution.status,
                    elapsed=elapsed,
                    error=result.error,
                )
                execution.save()
                job.save()
                logger.info("Job execution timed out.", status=execution.status)
                send_webhooks(job.task, job, execution, logger)
                notify_users(job.task, job, execution, logger)

                return False

            logger.info(
                "Job has not finished. Retrying monitoring in the future.",
                container_status=result.status,
                seconds=1,
            )
            current_app.monitor_queue.enqueue_in(
                "5s", Categories.Monitor, task_id, job_id, execution_id
            )

            return True

        if (
            result.exit_code != 0
            and "retry_count" in job.metadata
            and job.metadata["retry_count"] < job.metadata["retries"]
        ):
            retry_logger = logger.bind(
                exit_code=result.exit_code,
                retry_count=job.metadata["retry_count"],
                retries=job.metadata["retries"],
            )
            retry_logger.debug("Job failed. Enqueuing job retry...")
            job.metadata["retry_count"] += 1

            new_exec = job.create_execution(execution.image, execution.command)
            new_exec.status = JobExecution.Status.enqueued

            args = [
                task_id,
                job_id,
                new_exec.execution_id,
                execution.image,
                execution.command,
            ]

            factor = app.config["EXPONENTIAL_BACKOFF_FACTOR"]
            min_backoff = app.config["EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0
            delta = timedelta(seconds=min_backoff)

            if job.metadata["retries"] > 0:
                delta = timedelta(
                    seconds=math.pow(factor, job.metadata["retry_count"]) * min_backoff
                )

            future_date = datetime.utcnow() + delta
            enqueued_id = current_app.jobs_queue.enqueue_at(
                to_unix(future_date), Categories.Job, *args
            )
            job.metadata["enqueued_id"] = enqueued_id
            job.save()
            retry_logger.info("Job execution enqueued successfully.")

        # still need to finish current execution as the retry
        # will be a new execution
        execution.finished_at = datetime.utcnow()
        execution.exit_code = result.exit_code
        execution.status = (
            JobExecution.Status.done
            if execution.exit_code == 0
            else JobExecution.Status.failed
        )
        execution.log = result.log.decode("utf-8")
        execution.error = result.error.decode("utf-8")

        logger.debug(
            "Job finished. Storing job details in mongo db.",
            status=execution.status,
            log=result.log,
            error=result.error,
        )
        execution.save()
        job.save()
        logger.info("Job details stored in mongo db.", status=execution.status)

        try:
            executor.mark_as_done(job.task, job, execution)
        except HostUnavailableError:
            error = traceback.format_exc()
            logger.error("Failed to mark job as done.", error=error)
            reenqueue_monitor_due_to_break(task_id, job_id, execution_id)
            logger.warn("Job monitor re-enqueued successfully.")

            return False

        send_webhooks(job.task, job, execution, logger)
        notify_users(job.task, job, execution, logger)

        return True
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to monitor job", error=error)
        current_app.report_error(
            err,
            metadata=dict(
                operation="Monitoring Job",
                task_id=task_id,
                job_id=job_id,
                execution_id=execution_id,
            ),
        )
        raise err
def send_webhook(
    task_id, job_id, execution_id, method, url, headers, retries, retry_count
):
    app = current_app
    job = Job.get_by_id(task_id, job_id)
    logger = app.logger.bind(
        operation="send_webhook",
        task_id=task_id,
        job_id=job_id,
        execution_id=execution_id,
        method=method,
        url=url,
        headers=headers,
        retries=retries,
        retry_count=retry_count,
    )

    if job is None:
        logger.error("Failed to retrieve task or job.")
        return False

    execution = job.get_execution_by_id(execution_id)
    logger.info("Execution loaded successfully")

    data = execution.to_dict(include_log=True, include_error=True)
    data = loads(dumps(data))  # JSON round-trip to get a plain, serializable copy

    if "webhookDispatch" in data["metadata"]:
        del data["metadata"]["webhookDispatch"]

    data["metadata"]["custom"] = job.metadata.get("custom", {})
    data["job_id"] = job_id
    data = dumps(data)

    try:
        dispatcher = WebhooksDispatcher()
        response = dispatcher.dispatch(method, url, data, headers)
        execution.metadata.setdefault("webhookDispatch", [])
        execution.metadata["webhookDispatch"].append(
            {
                "timestamp": datetime.utcnow().isoformat(),
                "url": url,
                "statusCode": response.status_code,
                "body": response.body,
                "headers": response.headers,
            }
        )
        execution.save()
        job.save()
        logger.info("Webhook dispatched successfully.")
    except WebhooksDispatchError as err:
        error = traceback.format_exc()
        execution.metadata.setdefault("webhookDispatch", [])
        execution.metadata["webhookDispatch"].append(
            {
                "timestamp": datetime.utcnow().isoformat(),
                "url": url,
                "statusCode": err.status_code,
                "body": err.body,
                "headers": err.headers,
                "error": error,
            }
        )
        # keep only the three most recent dispatch attempts
        execution.metadata["webhookDispatch"] = execution.metadata["webhookDispatch"][
            -3:
        ]
        execution.save()
        job.save()
        logger.error("Failed to dispatch webhook.", err=error)

        if retry_count < retries:
            logger.debug("Retrying...")
            args = [
                task_id,
                job_id,
                execution_id,
                method,
                url,
                headers,
                retries,
                retry_count + 1,
            ]
            factor = app.config["WEBHOOKS_EXPONENTIAL_BACKOFF_FACTOR"]
            min_backoff = app.config["WEBHOOKS_EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0
            delta = to_unix(
                datetime.utcnow()
                + timedelta(seconds=math.pow(factor, retry_count) * min_backoff)
            )
            current_app.webhooks_queue.enqueue_at(delta, Categories.Webhook, *args)
            logger.info("Webhook dispatch retry scheduled.", date=delta)

    return True