def download_image(executor, job, ex, image, tag, command, logger):
    try:
        logger.debug("Downloading updated container image...", image=image, tag=tag)
        before = time.time()
        executor.update_image(job.task, job, ex, image, tag)
        ellapsed = time.time() - before
        logger.info(
            "Image downloaded successfully.", image=image, tag=tag, ellapsed=ellapsed
        )
    except Exception as err:
        logger.error("Failed to download image.", error=err)
        ex.error = str(err)
        ex.status = JobExecution.Status.failed
        job.save()
        current_app.report_error(
            err,
            metadata=dict(
                operation="Downloading Image",
                task_id=job.task.task_id,
                job_id=job.id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )

        return False

    return True
def run_container(executor, job, ex, image, tag, command, logger):
    logger.debug("Running command in container...", image=image, tag=tag, command=command)

    try:
        ex.started_at = datetime.utcnow()
        job.save()
        before = time.time()
        executor.run(job.task, job, ex, image, tag, command)
        ellapsed = time.time() - before
        logger.info(
            "Container started successfully.",
            image=image,
            tag=tag,
            command=command,
            ellapsed=ellapsed,
        )
    except HostUnavailableError:
        error = traceback.format_exc()
        logger.error("Host is unavailable.", error=error)
        enqueued_id = reenqueue_job_due_to_break(
            job.task.task_id, str(job.job_id), ex.execution_id, image, command
        )
        job.metadata["enqueued_id"] = enqueued_id
        job.save()
        logger.warn("Job execution re-enqueued successfully.")

        return False
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to run command", error=error)
        ex.error = error
        ex.status = JobExecution.Status.failed
        job.save()
        current_app.report_error(
            err,
            metadata=dict(
                operation="Running Container",
                task_id=job.task.task_id,
                job_id=job.id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )

        return False

    return True
def download_image(executor, job, ex, image, tag, command, logger): try: logger.debug("Changing job status...", status=JobExecution.Status.pulling) ex.status = JobExecution.Status.pulling ex.save() logger.debug( "Job status changed successfully.", status=ex.status ) logger.debug("Downloading updated container image...", image=image, tag=tag) before = time.time() executor.update_image(job.task, job, ex, image, tag) ellapsed = time.time() - before logger.info( "Image downloaded successfully.", image=image, tag=tag, ellapsed=ellapsed ) except HostUnavailableError: error = traceback.format_exc() logger.error("Host is unavailable.", error=error) enqueued_id = reenqueue_job_due_to_break( job.task.task_id, str(job.job_id), ex.execution_id, image, command ) job.metadata["enqueued_id"] = enqueued_id job.save() logger.warn("Job execution re-enqueued successfully.") return False except Exception as err: error = traceback.format_exc() logger.error("Failed to download image.", error=error) ex.error = error ex.status = JobExecution.Status.failed ex.save() current_app.report_error( err, metadata=dict( operation="Downloading Image", task_id=job.task.task_id, job_id=job.id, execution_id=ex.execution_id, image=image, tag=tag, command=command, ), ) return False return True
def stream_log(executor, task_id, job, ex, websocket):
    try:
        # Parenthesized so the closed-socket check applies to both final states,
        # rather than binding only to the "done" comparison.
        if not websocket.closed and (
            ex.status == JobExecution.Status.done
            or ex.status == JobExecution.Status.failed
        ):
            websocket.send("EXIT CODE: %s\n" % ex.exit_code)
            websocket.send(ex.log)
            websocket.send("\n-=-\n")
            websocket.send(ex.error)
            websocket.close(message="wsdone", code=CODE_NOT_FOUND)

            return

        if not websocket.closed and ex.status != JobExecution.Status.running:
            websocket.close(message="wsretry", code=CODE_NOT_FOUND)

            return

        try:
            for log in executor.get_streaming_logs(task_id, job, ex):
                if websocket.closed:
                    return

                websocket.send(log)
        except geventwebsocket.exceptions.WebSocketError:
            return
    except BrokenPipeError:
        websocket.close(message="wsretry")

        return
    except ContainerUnavailableError as err:
        current_app.report_error(
            err,
            metadata=dict(
                operation="Job Execution Stream",
                task_id=task_id,
                job_id=job.job_id,
                execution_id=ex.execution_id,
            ),
        )
        websocket.close(message="wsretry")

        return

    websocket.close(message="wsdone")
def run_container(executor, job, ex, image, tag, command, logger):
    logger.debug("Running command in container...", image=image, tag=tag, command=command)

    try:
        ex.started_at = datetime.utcnow()
        job.save()
        before = time.time()
        executor.run(job.task, job, ex, image, tag, command)
        ellapsed = time.time() - before
        logger.info(
            "Container started successfully.",
            image=image,
            tag=tag,
            command=command,
            ellapsed=ellapsed,
        )
    except Exception as err:
        logger.error("Failed to run command", error=err)
        ex.error = str(err)
        ex.status = JobExecution.Status.failed
        job.save()
        current_app.report_error(
            err,
            metadata=dict(
                operation="Running Container",
                task_id=job.task.task_id,
                job_id=job.id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )

        return False

    return True
def monitor_job(task_id, job_id, execution_id):
    try:
        app = current_app
        executor = app.executor
        job = Job.get_by_id(task_id, job_id)
        logger = app.logger.bind(
            operation="monitor_job",
            task_id=task_id,
            job_id=job_id,
            execution_id=execution_id,
        )

        if job is None:
            logger.error("Failed to retrieve task or job.")

            return False

        execution = job.get_execution_by_id(execution_id)

        if execution.status not in (JobExecution.Status.running,):
            logger.error("Execution result already retrieved. Skipping monitoring...")

            return False

        try:
            result = executor.get_result(job.task, job, execution)
        except HostUnavailableError as err:
            error = traceback.format_exc()
            logger.error("Failed to get results.", error=error)
            current_app.report_error(
                err,
                metadata=dict(
                    operation="Monitoring Job",
                    task_id=task_id,
                    job_id=job_id,
                    execution_id=execution_id,
                ),
            )
            reenqueue_monitor_due_to_break(task_id, job_id, execution_id)
            logger.warn("Job monitor re-enqueued successfully.")

            return False

        if result is None:
            execution.finished_at = datetime.utcnow()
            # No container result is available, so there is no exit code to read.
            execution.exit_code = None
            execution.status = JobExecution.Status.failed
            execution.log = ""
            execution.error = (
                "Job failed since container could not be found in docker host."
            )
            logger.debug(
                "Job failed, since container could not be found in host.",
                status="failed",
            )
            execution.save()
            job.save()
            send_webhooks(job.task, job, execution, logger)
            notify_users(job.task, job, execution, logger)

            return False

        logger.info(
            "Container result obtained.",
            container_status=result.status,
            container_exit_code=result.exit_code,
        )

        if result.status in (
            ExecutionResult.Status.created,
            ExecutionResult.Status.running,
        ):
            ellapsed = (datetime.utcnow() - execution.started_at).total_seconds()

            if ellapsed > job.metadata["timeout"]:
                execution.finished_at = datetime.utcnow()
                execution.status = JobExecution.Status.timedout
                execution.error = "Job execution timed out after %d seconds." % ellapsed

                try:
                    executor.stop_job(job.task, job, execution)
                except HostUnavailableError as err:
                    error = traceback.format_exc()
                    logger.error("Failed to stop job.", error=error)
                    current_app.report_error(
                        err,
                        metadata=dict(
                            operation="Monitoring Job",
                            task_id=task_id,
                            job_id=job_id,
                            execution_id=execution_id,
                        ),
                    )
                    reenqueue_monitor_due_to_break(task_id, job_id, execution_id)
                    logger.warn("Job monitor re-enqueued successfully.")

                    return False

                logger.debug(
                    "Job execution timed out. Storing job details in mongo db.",
                    status=execution.status,
                    ellapsed=ellapsed,
                    error=result.error,
                )
                execution.save()
                job.save()
                logger.info("Job execution timed out.", status=execution.status)
                send_webhooks(job.task, job, execution, logger)
                notify_users(job.task, job, execution, logger)

                return False

            logger.info(
                "Job has not finished. Retrying monitoring in the future.",
                container_status=result.status,
                seconds=5,
            )
            current_app.monitor_queue.enqueue_in(
                "5s", Categories.Monitor, task_id, job_id, execution_id
            )

            return True

        if (
            result.exit_code != 0
            and "retry_count" in job.metadata
            and job.metadata["retry_count"] < job.metadata["retries"]
        ):
            retry_logger = logger.bind(
                exit_code=result.exit_code,
                retry_count=job.metadata["retry_count"],
                retries=job.metadata["retries"],
            )
            retry_logger.debug("Job failed. Enqueuing job retry...")
            job.metadata["retry_count"] += 1

            new_exec = job.create_execution(execution.image, execution.command)
            new_exec.status = JobExecution.Status.enqueued

            args = [
                task_id,
                job_id,
                new_exec.execution_id,
                execution.image,
                execution.command,
            ]
            factor = app.config["EXPONENTIAL_BACKOFF_FACTOR"]
            min_backoff = app.config["EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0
            delta = timedelta(seconds=min_backoff)

            if job.metadata["retries"] > 0:
                delta = timedelta(
                    seconds=math.pow(factor, job.metadata["retry_count"]) * min_backoff
                )
            future_date = datetime.utcnow() + delta
            enqueued_id = current_app.jobs_queue.enqueue_at(
                to_unix(future_date), Categories.Job, *args
            )
            job.metadata["enqueued_id"] = enqueued_id
            job.save()
            retry_logger.info("Job execution enqueued successfully.")

        # still need to finish current execution as the retry
        # will be a new execution
        execution.finished_at = datetime.utcnow()
        execution.exit_code = result.exit_code
        execution.status = (
            JobExecution.Status.done
            if execution.exit_code == 0
            else JobExecution.Status.failed
        )
        execution.log = result.log.decode("utf-8")
        execution.error = result.error.decode("utf-8")
        logger.debug(
            "Job finished. Storing job details in mongo db.",
            status=execution.status,
            log=result.log,
            error=result.error,
        )
        execution.save()
        job.save()
        logger.info("Job details stored in mongo db.", status=execution.status)

        try:
            executor.mark_as_done(job.task, job, execution)
        except HostUnavailableError:
            error = traceback.format_exc()
            logger.error("Failed to mark job as done.", error=error)
            reenqueue_monitor_due_to_break(task_id, job_id, execution_id)
            logger.warn("Job monitor re-enqueued successfully.")

            return False

        send_webhooks(job.task, job, execution, logger)
        notify_users(job.task, job, execution, logger)

        return True
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to monitor job", error=error)
        current_app.report_error(
            err,
            metadata=dict(
                operation="Monitoring Job",
                task_id=task_id,
                job_id=job_id,
                execution_id=execution_id,
            ),
        )
        raise err
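# Illustrative sketch (not part of the worker): how the retry back-off computed in
# monitor_job above grows. It assumes the same config semantics, i.e.
# EXPONENTIAL_BACKOFF_FACTOR is the exponential base and EXPONENTIAL_BACKOFF_MIN_MS
# is the minimum delay in milliseconds; the concrete defaults below are made-up
# example values, not the project's configuration.
import math
from datetime import timedelta


def example_backoff_delay(retry_count, factor=2, min_backoff_ms=5000.0):
    """Mirror the delta computed in monitor_job for a given retry_count."""
    min_backoff = min_backoff_ms / 1000.0

    return timedelta(seconds=math.pow(factor, retry_count) * min_backoff)


# With factor=2 and a 5s minimum back-off, retries 1, 2 and 3 are delayed by
# 10s, 20s and 40s respectively.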
def run_job(task_id, job_id, execution_id, image, command):
    app = current_app
    logger = app.logger.bind(
        operation="run_job",
        task_id=task_id,
        job_id=job_id,
        image=image,
        command=command,
    )

    try:
        executor = app.executor
        job = Job.get_by_id(task_id, job_id)

        if job is None:
            logger.error("Job was not found with task id and job id.")

            return False

        if not validate_max_concurrent(
            executor, task_id, job, execution_id, image, command, logger
        ):
            return False

        tag = "latest"

        if ":" in image:
            image, tag = image.split(":")

        logger = logger.bind(image=image, tag=tag)

        logger.debug("Changing job status...", status=JobExecution.Status.enqueued)

        if execution_id is None:
            ex = job.create_execution(image=image, command=command)
            ex.status = JobExecution.Status.enqueued
            ex.save()
        else:
            ex = job.get_execution_by_id(execution_id)
        logger.debug("Job status changed successfully.", status=ex.status)

        logger = logger.bind(execution_id=ex.execution_id)
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to create job execution. Skipping job...", error=error)
        current_app.report_error(
            err,
            metadata=dict(task_id=task_id, job_id=job_id, image=image, command=command),
        )

        return False

    try:
        if not validate_expiration(job, ex, logger):
            return False

        logger.info("Started processing job.")

        if not download_image(executor, job, ex, image, tag, command, logger):
            return False

        if not run_container(executor, job, ex, image, tag, command, logger):
            return False

        logger.debug("Changing job status...", status=JobExecution.Status.running)
        ex.status = JobExecution.Status.running
        ex.save()
        job.save()
        logger.debug(
            "Job status changed successfully.", status=JobExecution.Status.running
        )

        current_app.monitor_queue.enqueue_in(
            "1s", Categories.Monitor, task_id, job_id, ex.execution_id
        )

        return True
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to run job", error=error)
        ex.status = JobExecution.Status.failed
        ex.error = "Job failed to run with error: %s" % error
        ex.save()
        job.save()
        current_app.report_error(
            err,
            metadata=dict(
                operation="Running Container",
                task_id=task_id,
                job_id=job_id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )
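# Illustrative sketch (not part of the worker): the image/tag split that run_job
# performs above. A bare image name falls back to the "latest" tag; note the single
# split assumes the reference contains at most one ":" (no registry-port prefixes).
def example_split_image(image):
    tag = "latest"

    if ":" in image:
        image, tag = image.split(":")

    return image, tag


# example_split_image("ubuntu")        -> ("ubuntu", "latest")
# example_split_image("ubuntu:20.04")  -> ("ubuntu", "20.04")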
def monitor_job(task_id, job_id, execution_id):
    try:
        app = current_app
        executor = app.load_executor()
        job = Job.get_by_id(task_id, job_id)
        logger = app.logger.bind(task_id=task_id, job_id=job_id)

        if job is None:
            logger.error("Failed to retrieve task or job.")

            return False

        execution = job.get_execution_by_id(execution_id)
        result = executor.get_result(job.task, job, execution)
        logger.info(
            "Container result obtained.",
            container_status=result.status,
            container_exit_code=result.exit_code,
        )

        if result.status in (
            ExecutionResult.Status.created,
            ExecutionResult.Status.running,
        ):
            ellapsed = (datetime.utcnow() - execution.started_at).total_seconds()

            if ellapsed > job.metadata["timeout"]:
                execution.finished_at = datetime.utcnow()
                execution.status = JobExecution.Status.timedout
                execution.error = f"Job execution timed out after {ellapsed} seconds."
                executor.stop_job(job.task, job, execution)
                logger.debug(
                    "Job execution timed out. Storing job details in mongo db.",
                    status=execution.status,
                    ellapsed=ellapsed,
                    error=result.error,
                )
                job.save()
                logger.info("Job execution timed out.", status=execution.status)

                return False

            scheduler = Scheduler("monitor", connection=app.redis)
            logger.info(
                "Job has not finished. Retrying monitoring in the future.",
                container_status=result.status,
                seconds=5,
            )

            interval = timedelta(seconds=5)
            scheduler.enqueue_in(interval, monitor_job, task_id, job_id, execution_id)

            return True

        if (
            result.exit_code != 0
            and "retry_count" in job.metadata
            and job.metadata["retry_count"] < job.metadata["retries"]
        ):
            retry_logger = logger.bind(
                exit_code=result.exit_code,
                retry_count=job.metadata["retry_count"],
                retries=job.metadata["retries"],
            )
            retry_logger.debug("Job failed. Enqueuing job retry...")
            job.metadata["retry_count"] += 1

            scheduler = Scheduler("jobs", connection=current_app.redis)
            args = [task_id, job_id, execution.image, execution.command]
            factor = app.config["EXPONENTIAL_BACKOFF_FACTOR"]
            min_backoff = app.config["EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0
            delta = timedelta(seconds=min_backoff)

            if job.metadata["retries"] > 0:
                delta = timedelta(
                    seconds=math.pow(factor, job.metadata["retry_count"]) * min_backoff
                )
            dt = datetime.utcnow() + delta
            enqueued = scheduler.enqueue_at(dt, run_job, *args)
            job.metadata["enqueued_id"] = enqueued.id
            job.save()
            retry_logger.info("Job execution enqueued successfully.")

        # still need to finish current execution as the retry
        # will be a new execution
        execution.finished_at = datetime.utcnow()
        execution.exit_code = result.exit_code
        execution.status = (
            JobExecution.Status.done
            if execution.exit_code == 0
            else JobExecution.Status.failed
        )
        execution.log = result.log.decode("utf-8")
        execution.error = result.error.decode("utf-8")
        logger.debug(
            "Job finished. Storing job details in mongo db.",
            status=execution.status,
            log=result.log,
            error=result.error,
        )
        job.save()
        logger.info("Job details stored in mongo db.", status=execution.status)

        return True
    except Exception as err:
        logger.error("Failed to monitor job", error=err)
        current_app.report_error(
            err,
            metadata=dict(
                operation="Monitoring Job",
                task_id=task_id,
                job_id=job_id,
                execution_id=execution_id,
            ),
        )
        raise err