Example #1
0
def download_image(executor, job, ex, image, tag, command, logger):
    """Pull the container image for a job execution.

    Delegates the pull to ``executor.update_image``. Returns ``True`` on
    success; on any failure the execution is marked as failed, the error
    is reported to the app, and ``False`` is returned.
    """
    try:
        logger.debug(
            "Downloading updated container image...", image=image, tag=tag
        )
        start = time.time()
        executor.update_image(job.task, job, ex, image, tag)
        logger.info(
            "Image downloaded successfully.",
            image=image,
            tag=tag,
            ellapsed=time.time() - start,
        )
    except Exception as err:
        logger.error("Failed to download image.", error=err)
        ex.error = str(err)
        ex.status = JobExecution.Status.failed
        job.save()

        current_app.report_error(
            err,
            metadata=dict(
                operation="Downloading Image",
                task_id=job.task.task_id,
                job_id=job.id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )

        return False

    return True
Example #2
0
def run_container(executor, job, ex, image, tag, command, logger):
    """Start the job's command inside a container.

    Records the start time on the execution, then asks the executor to run
    the container. Returns ``True`` on success. When the docker host is
    unavailable the execution is re-enqueued and ``False`` is returned; on
    any other error the execution is marked failed, the error is reported,
    and ``False`` is returned.
    """
    logger.debug(
        "Running command in container...", image=image, tag=tag, command=command
    )

    try:
        ex.started_at = datetime.utcnow()
        job.save()

        start = time.time()
        executor.run(job.task, job, ex, image, tag, command)
        logger.info(
            "Container started successfully.",
            image=image,
            tag=tag,
            command=command,
            ellapsed=time.time() - start,
        )
    except HostUnavailableError:
        failure = traceback.format_exc()
        logger.error("Host is unavailable.", error=failure)

        # The host dropped out mid-run: push the execution back onto the
        # queue instead of failing it.
        job.metadata["enqueued_id"] = reenqueue_job_due_to_break(
            job.task.task_id, str(job.job_id), ex.execution_id, image, command
        )
        job.save()

        logger.warn("Job execution re-enqueued successfully.")

        return False
    except Exception as err:
        failure = traceback.format_exc()
        logger.error("Failed to run command", error=failure)
        ex.error = failure
        ex.status = JobExecution.Status.failed
        job.save()

        current_app.report_error(
            err,
            metadata=dict(
                operation="Running Container",
                task_id=job.task.task_id,
                job_id=job.id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )

        return False

    return True
Example #3
0
def download_image(executor, job, ex, image, tag, command, logger):
    """Flag the execution as pulling, then download its container image.

    Returns ``True`` on success. When the docker host is unavailable the
    execution is re-enqueued and ``False`` is returned; on any other error
    the execution is marked failed, the error is reported, and ``False``
    is returned.
    """
    try:
        logger.debug("Changing job status...", status=JobExecution.Status.pulling)
        ex.status = JobExecution.Status.pulling
        ex.save()
        logger.debug("Job status changed successfully.", status=ex.status)

        logger.debug("Downloading updated container image...", image=image, tag=tag)
        start = time.time()
        executor.update_image(job.task, job, ex, image, tag)
        logger.info(
            "Image downloaded successfully.",
            image=image,
            tag=tag,
            ellapsed=time.time() - start,
        )
    except HostUnavailableError:
        failure = traceback.format_exc()
        logger.error("Host is unavailable.", error=failure)

        # The docker host dropped out mid-pull; retry later instead of failing.
        job.metadata["enqueued_id"] = reenqueue_job_due_to_break(
            job.task.task_id, str(job.job_id), ex.execution_id, image, command
        )
        job.save()

        logger.warn("Job execution re-enqueued successfully.")

        return False
    except Exception as err:
        failure = traceback.format_exc()
        logger.error("Failed to download image.", error=failure)
        ex.error = failure
        ex.status = JobExecution.Status.failed
        ex.save()

        current_app.report_error(
            err,
            metadata=dict(
                operation="Downloading Image",
                task_id=job.task.task_id,
                job_id=job.id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )

        return False

    return True
Example #4
0
def stream_log(executor, task_id, job, ex, websocket):
    """Stream a job execution's logs over a websocket.

    For finished executions (done or failed) the stored exit code, log and
    error are sent and the socket is closed with "wsdone"; for executions
    that are not running yet the socket is closed with "wsretry" so the
    client can reconnect later. Otherwise logs are streamed live from the
    executor until the stream ends or the client disconnects.
    """
    try:
        # BUG FIX: the original condition read
        #     not websocket.closed and ex.status == done or ex.status == failed
        # which `and`/`or` precedence groups as
        #     (not closed and done) or failed
        # so a *closed* websocket with a failed execution still fell through
        # to the send() calls below. The intended grouping (mirroring the
        # `not websocket.closed and ...` guard used just after) is applied.
        if not websocket.closed and ex.status in (
            JobExecution.Status.done,
            JobExecution.Status.failed,
        ):
            websocket.send("EXIT CODE: %s\n" % ex.exit_code)
            websocket.send(ex.log)
            websocket.send("\n-=-\n")
            websocket.send(ex.error)
            websocket.close(message="wsdone", code=CODE_NOT_FOUND)

            return

        if not websocket.closed and ex.status != JobExecution.Status.running:
            # Not finished and not running yet: ask the client to retry later.
            websocket.close(message="wsretry", code=CODE_NOT_FOUND)

            return

        try:
            for log in executor.get_streaming_logs(task_id, job, ex):
                if websocket.closed:
                    return
                websocket.send(log)
        except geventwebsocket.exceptions.WebSocketError:
            # Client went away mid-stream; nothing left to do.
            return

    except BrokenPipeError:
        websocket.close(message="wsretry")

        return
    except ContainerUnavailableError as err:
        current_app.report_error(
            err,
            metadata=dict(
                operation="Job Execution Stream",
                task_id=task_id,
                job_id=job.job_id,
                execution_id=ex.execution_id,
            ),
        )
        websocket.close(message="wsretry")

        return

    websocket.close(message="wsdone")
Example #5
0
def run_container(executor, job, ex, image, tag, command, logger):
    """Start the job's command inside a container.

    Stamps the execution start time, then delegates to ``executor.run``.
    Returns ``True`` on success; on any error the execution is marked
    failed, the error is reported, and ``False`` is returned.
    """
    logger.debug(
        "Running command in container...", image=image, tag=tag, command=command
    )

    try:
        ex.started_at = datetime.utcnow()
        job.save()

        start = time.time()
        executor.run(job.task, job, ex, image, tag, command)
        logger.info(
            "Container started successfully.",
            image=image,
            tag=tag,
            command=command,
            ellapsed=time.time() - start,
        )
    except Exception as err:
        logger.error("Failed to run command", error=err)
        ex.error = str(err)
        ex.status = JobExecution.Status.failed
        job.save()

        current_app.report_error(
            err,
            metadata=dict(
                operation="Running Container",
                task_id=job.task.task_id,
                job_id=job.id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )

        return False

    return True
Example #6
0
def monitor_job(task_id, job_id, execution_id):
    """Poll a running job execution and persist its outcome.

    Re-schedules itself while the container is still created/running,
    enforces the job's timeout, enqueues a retry execution when the job
    failed and retries remain, and stores the final result otherwise.

    Returns ``True`` when monitoring succeeded (finished, or re-scheduled
    because the container is still running) and ``False`` when the job
    could not be monitored or was finalized on an error path. Unexpected
    exceptions are reported and re-raised.
    """
    app = current_app
    # BUG FIX: the logger used to be bound only *after* Job.get_by_id; if
    # that lookup raised, the outer `except` referenced an unbound `logger`
    # and masked the original error with a NameError.
    logger = app.logger.bind(
        operation="monitor_job",
        task_id=task_id,
        job_id=job_id,
        execution_id=execution_id,
    )

    try:
        executor = app.executor

        job = Job.get_by_id(task_id, job_id)

        if job is None:
            logger.error("Failed to retrieve task or job.")

            return False

        execution = job.get_execution_by_id(execution_id)

        if execution.status not in (JobExecution.Status.running,):
            logger.error("Execution result already retrieved. Skipping monitoring...")

            return False

        try:
            result = executor.get_result(job.task, job, execution)
        except HostUnavailableError as err:
            error = traceback.format_exc()
            logger.error("Failed to get results.", error=error)
            current_app.report_error(
                err,
                metadata=dict(
                    operation="Monitoring Job",
                    task_id=task_id,
                    job_id=job_id,
                    execution_id=execution_id,
                ),
            )

            reenqueue_monitor_due_to_break(task_id, job_id, execution_id)

            logger.warn("Job monitor re-enqueued successfully.")

            return False

        if result is None:
            # Container vanished from the docker host: finalize as failed.
            execution.finished_at = datetime.utcnow()
            # BUG FIX: this branch used to read `result.exit_code`, which is
            # a guaranteed AttributeError when `result` is None; there is no
            # exit code to record for a missing container.
            execution.exit_code = None
            execution.status = JobExecution.Status.failed
            execution.log = ""
            execution.error = (
                "Job failed since container could not be found in docker host."
            )

            logger.debug(
                "Job failed, since container could not be found in host.",
                status="failed",
            )
            execution.save()
            job.save()

            send_webhooks(job.task, job, execution, logger)
            notify_users(job.task, job, execution, logger)

            return False

        logger.info(
            "Container result obtained.",
            container_status=result.status,
            container_exit_code=result.exit_code,
        )

        if result.status in (
            ExecutionResult.Status.created,
            ExecutionResult.Status.running,
        ):
            ellapsed = (datetime.utcnow() - execution.started_at).total_seconds()

            if ellapsed > job.metadata["timeout"]:
                # Ran past the configured timeout: stop the container and
                # record the timeout on the execution.
                execution.finished_at = datetime.utcnow()
                execution.status = JobExecution.Status.timedout
                execution.error = "Job execution timed out after %d seconds." % ellapsed

                try:
                    executor.stop_job(job.task, job, execution)
                except HostUnavailableError as err:
                    error = traceback.format_exc()
                    logger.error("Failed to stop job.", error=error)
                    current_app.report_error(
                        err,
                        metadata=dict(
                            operation="Monitoring Job",
                            task_id=task_id,
                            job_id=job_id,
                            execution_id=execution_id,
                        ),
                    )

                    reenqueue_monitor_due_to_break(task_id, job_id, execution_id)

                    logger.warn("Job monitor re-enqueued successfully.")

                    return False

                logger.debug(
                    "Job execution timed out. Storing job details in mongo db.",
                    status=execution.status,
                    ellapsed=ellapsed,
                    error=result.error,
                )
                execution.save()
                job.save()
                logger.info("Job execution timed out.", status=execution.status)

                send_webhooks(job.task, job, execution, logger)
                notify_users(job.task, job, execution, logger)

                return False

            # Still running and within the timeout: check again shortly.
            logger.info(
                "Job has not finished. Retrying monitoring in the future.",
                container_status=result.status,
                seconds=1,
            )

            current_app.monitor_queue.enqueue_in(
                "5s", Categories.Monitor, task_id, job_id, execution_id
            )

            return True

        if (
            result.exit_code != 0
            and "retry_count" in job.metadata
            and job.metadata["retry_count"] < job.metadata["retries"]
        ):
            # Failed with retries remaining: schedule a fresh execution with
            # exponential backoff.
            retry_logger = logger.bind(
                exit_code=result.exit_code,
                retry_count=job.metadata["retry_count"],
                retries=job.metadata["retries"],
            )
            retry_logger.debug("Job failed. Enqueuing job retry...")
            job.metadata["retry_count"] += 1

            new_exec = job.create_execution(execution.image, execution.command)
            new_exec.status = JobExecution.Status.enqueued

            args = [
                task_id,
                job_id,
                new_exec.execution_id,
                execution.image,
                execution.command,
            ]
            factor = app.config["EXPONENTIAL_BACKOFF_FACTOR"]
            min_backoff = app.config["EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0
            delta = timedelta(seconds=min_backoff)

            if job.metadata["retries"] > 0:
                delta = timedelta(
                    seconds=math.pow(factor, job.metadata["retry_count"]) * min_backoff
                )
            future_date = datetime.utcnow() + delta
            enqueued_id = current_app.jobs_queue.enqueue_at(
                to_unix(future_date), Categories.Job, *args
            )

            job.metadata["enqueued_id"] = enqueued_id
            job.save()

            retry_logger.info("Job execution enqueued successfully.")

            # still need to finish current execution as the retry
            # will be a new execution

        execution.finished_at = datetime.utcnow()
        execution.exit_code = result.exit_code
        execution.status = (
            JobExecution.Status.done

            if execution.exit_code == 0
            else JobExecution.Status.failed
        )
        execution.log = result.log.decode("utf-8")
        execution.error = result.error.decode("utf-8")

        logger.debug(
            "Job finished. Storing job details in mongo db.",
            status=execution.status,
            log=result.log,
            error=result.error,
        )
        execution.save()
        job.save()
        logger.info("Job details stored in mongo db.", status=execution.status)

        try:
            executor.mark_as_done(job.task, job, execution)
        except HostUnavailableError:
            error = traceback.format_exc()
            logger.error("Failed to mark job as done.", error=error)
            reenqueue_monitor_due_to_break(task_id, job_id, execution_id)

            logger.warn("Job monitor re-enqueued successfully.")

            return False

        send_webhooks(job.task, job, execution, logger)
        notify_users(job.task, job, execution, logger)

        return True
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to monitor job", error=error)
        current_app.report_error(
            err,
            metadata=dict(
                operation="Monitoring Job",
                task_id=task_id,
                job_id=job_id,
                execution_id=execution_id,
            ),
        )

        raise err
Example #7
0
def run_job(task_id, job_id, execution_id, image, command):
    """Create (or resume) a job execution and start its container.

    Validates concurrency and expiration, pulls the image, runs the
    container and schedules the monitor. Returns ``True`` when the
    container was started and monitoring was scheduled, ``False`` on any
    failure (which is logged and reported).
    """
    app = current_app
    logger = app.logger.bind(
        operation="run_job",
        task_id=task_id,
        job_id=job_id,
        image=image,
        command=command,
    )

    try:
        executor = app.executor

        job = Job.get_by_id(task_id, job_id)

        if job is None:
            logger.error("Job was not found with task id and job id.")

            return False

        if not validate_max_concurrent(
            executor, task_id, job, execution_id, image, command, logger
        ):
            return False

        tag = "latest"

        if ":" in image:
            # BUG FIX: plain `image.split(":")` raised ValueError for images
            # with more than one colon (e.g. "registry:5000/repo:tag");
            # splitting on the *last* colon keeps the tag separate.
            # NOTE(review): an untagged image on a ported registry
            # ("registry:5000/repo") is still mis-split, exactly as before.
            image, tag = image.rsplit(":", 1)

        logger = logger.bind(image=image, tag=tag)

        logger.debug("Changing job status...", status=JobExecution.Status.enqueued)
        if execution_id is None:
            ex = job.create_execution(image=image, command=command)
            ex.status = JobExecution.Status.enqueued
            ex.save()
        else:
            ex = job.get_execution_by_id(execution_id)

        logger.debug(
            "Job status changed successfully.", status=ex.status
        )
        logger = logger.bind(execution_id=ex.execution_id)
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to create job execution. Skipping job...", error=error)
        current_app.report_error(
            err,
            metadata=dict(task_id=task_id, job_id=job_id, image=image, command=command),
        )

        return False

    try:
        if not validate_expiration(job, ex, logger):
            return False

        logger.info("Started processing job.")

        if not download_image(executor, job, ex, image, tag, command, logger):
            return False

        if not run_container(executor, job, ex, image, tag, command, logger):
            return False

        logger.debug("Changing job status...", status=JobExecution.Status.running)
        ex.status = JobExecution.Status.running
        ex.save()
        job.save()
        logger.debug(
            "Job status changed successfully.", status=JobExecution.Status.running
        )

        # Kick off the monitor loop that will collect the container result.
        current_app.monitor_queue.enqueue_in(
            "1s", Categories.Monitor, task_id, job_id, ex.execution_id
        )

        return True
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to run job", error=error)
        ex.status = JobExecution.Status.failed
        ex.error = "Job failed to run with error: %s" % error
        ex.save()
        job.save()

        current_app.report_error(
            err,
            metadata=dict(
                operation="Running Container",
                task_id=task_id,
                job_id=job_id,
                execution_id=ex.execution_id,
                image=image,
                tag=tag,
                command=command,
            ),
        )

        # BUG FIX: this path used to fall off the end and return None;
        # return False explicitly like every other failure path.
        return False
Example #8
0
def monitor_job(task_id, job_id, execution_id):
    """Check a job execution's container and persist its result.

    Re-schedules itself (via the "monitor" scheduler) while the container
    is still created/running, enforces the job's timeout, enqueues a retry
    of the job with exponential backoff when it failed and retries remain,
    and stores the final result otherwise. Returns ``True``/``False``
    accordingly; unexpected exceptions are reported and re-raised.
    """
    app = current_app
    # BUG FIX: the logger used to be bound only *after* Job.get_by_id; if
    # that lookup raised, the outer `except` referenced an unbound `logger`
    # and masked the original error with a NameError.
    logger = app.logger.bind(task_id=task_id, job_id=job_id)

    try:
        executor = app.load_executor()

        job = Job.get_by_id(task_id, job_id)

        if job is None:
            logger.error("Failed to retrieve task or job.")

            return False

        execution = job.get_execution_by_id(execution_id)
        result = executor.get_result(job.task, job, execution)
        logger.info(
            "Container result obtained.",
            container_status=result.status,
            container_exit_code=result.exit_code,
        )

        if result.status in (
                ExecutionResult.Status.created,
                ExecutionResult.Status.running,
        ):
            ellapsed = (datetime.utcnow() -
                        execution.started_at).total_seconds()

            if ellapsed > job.metadata["timeout"]:
                # Ran past the configured timeout: stop it and record why.
                execution.finished_at = datetime.utcnow()
                execution.status = JobExecution.Status.timedout
                execution.error = f"Job execution timed out after {ellapsed} seconds."

                executor.stop_job(job.task, job, execution)

                logger.debug(
                    "Job execution timed out. Storing job details in mongo db.",
                    status=execution.status,
                    ellapsed=ellapsed,
                    error=result.error,
                )
                job.save()
                logger.info("Job execution timed out.",
                            status=execution.status)

                return False

            # Still running and within the timeout: check again in 5s.
            scheduler = Scheduler("monitor", connection=app.redis)
            logger.info(
                "Job has not finished. Retrying monitoring in the future.",
                container_status=result.status,
                seconds=1,
            )

            interval = timedelta(seconds=5)
            scheduler.enqueue_in(interval, monitor_job, task_id, job_id,
                                 execution_id)

            return True

        if (result.exit_code != 0 and "retry_count" in job.metadata
                and job.metadata["retry_count"] < job.metadata["retries"]):
            # Failed with retries remaining: schedule a new run of the job
            # with exponential backoff.
            retry_logger = logger.bind(
                exit_code=result.exit_code,
                retry_count=job.metadata["retry_count"],
                retries=job.metadata["retries"],
            )
            retry_logger.debug("Job failed. Enqueuing job retry...")
            job.metadata["retry_count"] += 1

            scheduler = Scheduler("jobs", connection=current_app.redis)

            args = [task_id, job_id, execution.image, execution.command]
            factor = app.config["EXPONENTIAL_BACKOFF_FACTOR"]
            min_backoff = app.config["EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0
            delta = timedelta(seconds=min_backoff)

            if job.metadata["retries"] > 0:
                delta = timedelta(
                    seconds=math.pow(factor, job.metadata["retry_count"]) *
                    min_backoff)
            dt = datetime.utcnow() + delta
            enqueued = scheduler.enqueue_at(dt, run_job, *args)

            job.metadata["enqueued_id"] = enqueued.id
            job.save()

            retry_logger.info("Job execution enqueued successfully.")

            # still need to finish current execution as the retry
            # will be a new execution

        execution.finished_at = datetime.utcnow()
        execution.exit_code = result.exit_code
        execution.status = (JobExecution.Status.done if execution.exit_code
                            == 0 else JobExecution.Status.failed)
        execution.log = result.log.decode("utf-8")
        execution.error = result.error.decode("utf-8")

        logger.debug(
            "Job finished. Storing job details in mongo db.",
            status=execution.status,
            log=result.log,
            error=result.error,
        )
        job.save()
        logger.info("Job details stored in mongo db.", status=execution.status)

        return True
    except Exception as err:
        logger.error("Failed to monitor job", error=err)
        current_app.report_error(
            err,
            metadata=dict(
                operation="Monitoring Job",
                task_id=task_id,
                job_id=job_id,
                execution_id=execution_id,
            ),
        )

        raise err