Example #1
    def test_master_success_influences_other_experiment_workers_status(self):
        with patch('scheduler.tasks.experiments.experiments_build.apply_async'
                   ) as _:  # noqa
            with patch.object(Experiment, 'set_status') as _:  # noqa
                experiment = ExperimentFactory()

        assert ExperimentLifeCycle.is_done(experiment.last_status) is False
        # Add jobs
        master = ExperimentJobFactory(experiment=experiment,
                                      role=TaskType.MASTER)
        assert JobLifeCycle.is_done(master.last_status) is False
        workers = [
            ExperimentJobFactory(experiment=experiment, role=TaskType.WORKER)
            for _ in range(2)
        ]
        for worker in workers:
            worker.refresh_from_db()
            assert JobLifeCycle.is_done(worker.last_status) is False

        # Set master to succeeded
        ExperimentJobStatusFactory(job=master, status=JobLifeCycle.SUCCEEDED)

        # All workers should have a succeeded status
        for worker in workers:
            worker.refresh_from_db()
            assert worker.last_status == JobLifeCycle.SUCCEEDED

        # Experiment last status should be succeeded
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.SUCCEEDED
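Every snippet on this page gates on JobLifeCycle.is_done. As a reference point, here is a minimal sketch of what such a helper presumably looks like; the status constants and the terminal set are assumptions inferred from the STOPPED/FAILED/SUCCEEDED branches in the handlers below, not the library's actual definition:

class JobLifeCycle(object):
    CREATED = 'created'
    RUNNING = 'running'
    SUCCEEDED = 'succeeded'
    FAILED = 'failed'
    STOPPED = 'stopped'

    # Assumed terminal statuses, inferred from the post_save handlers
    # further down this page.
    DONE_STATUS = {SUCCEEDED, FAILED, STOPPED}

    @classmethod
    def is_done(cls, status):
        return status in cls.DONE_STATUS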
Example #2
async def log_job(request, ws, job, pod_id, namespace, container):
    job_uuid = job.uuid.hex
    if job_uuid in request.app.job_logs_ws_managers:
        ws_manager = request.app.job_logs_ws_managers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_logs_ws_managers[job_uuid] = ws_manager

    ws_manager.add_socket(ws)

    # Stream phase changes
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify_ws(ws=ws, message=get_status_message(status))
            if should_disconnect(ws=ws, ws_manager=ws_manager):
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        await notify_ws(ws=ws, message=get_status_message(status))
        return

    config.load_incluster_config()
    k8s_api = client.CoreV1Api()
    await log_job_pod(k8s_api=k8s_api,
                      ws=ws,
                      ws_manager=ws_manager,
                      pod_id=pod_id,
                      container=container,
                      namespace=namespace)
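The polling loop above leans on two helpers that are not shown on this page. A hedged sketch of what they presumably do; the names come from the snippet, but the bodies are assumptions modeled on the inline should_disconnect in the job_logs example further down:

async def notify_ws(ws, message):
    # Push a status message to a single websocket.
    await ws.send(message)


def should_disconnect(ws, ws_manager):
    # Assumed: disconnect once the socket is no longer registered on the
    # manager (mirrors the consumer.ws check in the job_logs example).
    return ws not in ws_manager.ws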
Example #3
def update_job_containers(event: Mapping, status: str,
                          job_container_name: str) -> None:
    if JobLifeCycle.is_done(status):
        # Remove the job monitoring
        job_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', job_uuid)
        RedisJobContainers.remove_job(job_uuid)

    if event['status']['container_statuses'] is None:
        return

    def get_container_id(container_id):
        if not container_id:
            return None
        if container_id.startswith('docker://'):
            return container_id[len('docker://'):]
        return container_id

    for container_status in event['status']['container_statuses']:
        if container_status['name'] != job_container_name:
            continue

        container_id = get_container_id(container_status['container_id'])
        if container_id:
            job_uuid = event['metadata']['labels']['job_uuid']
            if container_status['state']['running'] is not None:
                logger.info('Monitoring (container_id, job_uuid): (%s, %s)',
                            container_id, job_uuid)
                RedisJobContainers.monitor(container_id=container_id,
                                           job_uuid=job_uuid)
            else:
                RedisJobContainers.remove_container(container_id=container_id)
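The nested get_container_id helper only strips the docker:// prefix that Kubernetes reports in container IDs. An illustration with hypothetical values (the helper is local to the function, so this is for reading purposes only):

get_container_id('docker://abc123')  # -> 'abc123'
get_container_id('abc123')           # -> 'abc123' (already bare)
get_container_id(None)               # -> None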
Example #4
def build_job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status

    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'finished_at'])
    auditor.record(event_type=BUILD_JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status)
    if instance.status == JobLifeCycle.CREATED:
        auditor.record(event_type=BUILD_JOB_CREATED, instance=job)
    elif instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=BUILD_JOB_STOPPED,
                       instance=job,
                       previous_status=previous_status)
    elif instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=BUILD_JOB_FAILED,
                       instance=job,
                       previous_status=previous_status)
    elif instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=BUILD_JOB_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status)

    # handle done status
    if JobLifeCycle.is_done(instance.status):
        auditor.record(event_type=BUILD_JOB_DONE,
                       instance=job,
                       previous_status=previous_status)
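For this receiver to fire, it has to be connected to Django's post_save signal for the status model. A minimal sketch, assuming the status model is named BuildJobStatus (the model name, import location, and dispatch_uid are assumptions):

from django.db.models.signals import post_save

post_save.connect(build_job_status_post_save,
                  sender=BuildJobStatus,
                  dispatch_uid='build_job_status_post_save')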
Example #5
def build_handle_done_status(sender, **kwargs):
    instance = kwargs['instance']
    build_job_id = instance.job_id

    if JobLifeCycle.is_done(instance.status):
        celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                             kwargs={'build_job_id': build_job_id})
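celery_app.send_task dispatches by name, so a task must be registered under SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE on the scheduler side. A hedged sketch of what the receiving task might look like; the task body and the notify_build_job_done helper are hypothetical:

@celery_app.task(name=SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                 ignore_result=True)
def build_jobs_notify_done(build_job_id):
    # Hypothetical body: load the build job and fan out "done" notifications.
    build_job = BuildJob.objects.get(id=build_job_id)
    notify_build_job_done(build_job)  # hypothetical helper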
Example #6
def update_job_containers(event, status, job_container_name):
    if JobLifeCycle.is_done(status):
        # Remove the job monitoring
        job_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', job_uuid)
        RedisJobContainers.remove_job(job_uuid)

    if event['status']['container_statuses'] is None:
        return

    def get_container_id(container_id):
        if not container_id:
            return None
        if container_id.startswith('docker://'):
            return container_id[len('docker://'):]
        return container_id

    for container_status in event['status']['container_statuses']:
        if container_status['name'] != job_container_name:
            continue

        container_id = get_container_id(container_status['container_id'])
        if container_id:
            job_uuid = event['metadata']['labels']['job_uuid']
            if container_status['state']['running'] is not None:
                logger.info('Monitoring (container_id, job_uuid): (%s, %s)',
                            container_id, job_uuid)
                RedisJobContainers.monitor(container_id=container_id, job_uuid=job_uuid)
            else:
                RedisJobContainers.remove_container(container_id=container_id)
Example #7
def calculated_status(self):
    master_status = self.jobs.filter(role=TaskType.MASTER)[0].last_status
    calculated_status = master_status if JobLifeCycle.is_done(
        master_status) else None
    if calculated_status is None:
        calculated_status = ExperimentLifeCycle.jobs_status(
            self.last_job_statuses)
    if calculated_status is None:
        return self.last_status
    return calculated_status
Example #8
def build_handle_done_status(sender, **kwargs):
    instance = kwargs['instance']
    build_job_id = instance.job_id

    if JobLifeCycle.is_done(instance.status):
        celery_app.send_task(
            SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
            kwargs={'build_job_id': build_job_id})
Example #9
def calculated_status(self) -> str:
    master_status = self.jobs.order_by('created_at').first().last_status
    calculated_status = master_status if JobLifeCycle.is_done(
        master_status) else None
    if calculated_status is None:
        calculated_status = ExperimentLifeCycle.jobs_status(
            self.last_job_statuses)
    if calculated_status is None:
        return self.last_status
    return calculated_status
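When the master job is not yet done, the property falls back to ExperimentLifeCycle.jobs_status, which aggregates the last statuses of all jobs. That aggregator is not shown on this page; a hedged sketch of plausible semantics, where the precedence rules are assumptions:

@staticmethod
def jobs_status(job_statuses):
    # Assumed precedence: any failure fails the experiment, all-succeeded
    # succeeds it, anything running keeps it running; otherwise undecided.
    if not job_statuses:
        return None
    if JobLifeCycle.FAILED in job_statuses:
        return ExperimentLifeCycle.FAILED
    if all(status == JobLifeCycle.SUCCEEDED for status in job_statuses):
        return ExperimentLifeCycle.SUCCEEDED
    if JobLifeCycle.RUNNING in job_statuses:
        return ExperimentLifeCycle.RUNNING
    return None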
Example #10
def build_job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status

    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'finished_at'])
    auditor.record(event_type=BUILD_JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status)
    if instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=BUILD_JOB_STOPPED,
                       instance=job,
                       previous_status=previous_status)

    if instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=BUILD_JOB_FAILED,
                       instance=job,
                       previous_status=previous_status)

    if instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=BUILD_JOB_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status)

    # Check if we need to schedule a job stop
    if instance.status in (JobLifeCycle.FAILED, JobLifeCycle.SUCCEEDED):
        _logger.info(
            'The build job `%s` failed or is done, '
            'sending signal to stop.', job.unique_name)
        # Schedule stop for this job
        celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_STOP,
                             kwargs={
                                 'project_name': job.project.unique_name,
                                 'project_uuid': job.project.uuid.hex,
                                 'build_job_name': job.unique_name,
                                 'build_job_uuid': job.uuid.hex,
                                 'update_status': False,
                                 'collect_logs': True,
                             },
                             countdown=RedisTTL.get_for_build(build_id=job.id))

    # handle done status
    if JobLifeCycle.is_done(instance.status):
        auditor.record(event_type=BUILD_JOB_DONE,
                       instance=job,
                       previous_status=previous_status)
        celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                             kwargs={'build_job_id': job.id})
Example #11
def job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status
    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'finished_at'])
    auditor.record(event_type=JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status)
    if instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=JOB_STOPPED,
                       instance=job,
                       previous_status=previous_status)

    if instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=JOB_FAILED,
                       instance=job,
                       previous_status=previous_status)

    if instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=JOB_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status)
    if JobLifeCycle.is_done(instance.status):
        auditor.record(event_type=JOB_DONE,
                       instance=job,
                       previous_status=previous_status)

    # Check if we need to schedule a job stop
    if not job.specification:
        return

    if instance.status in (JobLifeCycle.FAILED, JobLifeCycle.SUCCEEDED):
        _logger.debug('The job `%s` failed or is done, '
                      'sending signal to stop.', job.unique_name)
        # Schedule stop for this job
        celery_app.send_task(
            SchedulerCeleryTasks.JOBS_STOP,
            kwargs={
                'project_name': job.project.unique_name,
                'project_uuid': job.project.uuid.hex,
                'job_name': job.unique_name,
                'job_uuid': job.uuid.hex,
                'specification': job.config,
                'update_status': False
            },
            countdown=RedisTTL.get_for_job(job_id=job.id))
Example #12
def build_job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status
    # Update job last_status
    job.status = instance
    job.save()
    auditor.record(event_type=BUILD_JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status,
                   target='project')
    if instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=BUILD_JOB_STOPPED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')

    if instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=BUILD_JOB_FAILED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')

    if instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=BUILD_JOB_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')

    # Check if we need to schedule a job stop
    if instance.status in (JobLifeCycle.FAILED, JobLifeCycle.SUCCEEDED):
        _logger.info(
            'The build job `%s` failed or is done, '
            'sending signal to stop.', job.unique_name)
        # Schedule stop for this job
        celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_STOP,
                             kwargs={
                                 'project_name': job.project.unique_name,
                                 'project_uuid': job.project.uuid.hex,
                                 'build_job_name': job.unique_name,
                                 'build_job_uuid': job.uuid.hex,
                                 'update_status': False
                             })

    # handle done status
    if JobLifeCycle.is_done(instance.status):
        celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                             kwargs={'build_job_id': job.id})
Example #13
async def job_logs(
        request,  # pylint:disable=too-many-branches
        ws,
        username,
        project_name,
        job_id):
    job, message = validate_job(request=request,
                                username=username,
                                project_name=project_name,
                                job_id=job_id)
    if job is None:
        await ws.send(get_error_message(message))
        return

    job_uuid = job.uuid.hex

    auditor.record(event_type=JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        logger.info('Job uuid `%s` logs are now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # start consumer
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}'.format(RoutingKeys.STREAM_LOGS_SIDECARS_JOBS,
                                       job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    def should_disconnect():
        if not consumer.ws:
            logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # if job_uuid in request.app.job_logs_consumers:
            #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False

    # add socket manager
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0

    # Stream phase changes
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify(consumer=consumer, message=get_status_message(status))
            if should_disconnect():
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        await notify(consumer=consumer, message=get_status_message(status))
        RedisToStream.remove_job_logs(job_uuid=job_uuid)
        return

    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            await notify(consumer=consumer, message=message)

        # After a couple of empty retries, check the status of the job
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                logger.info('Removing all sockets because the job `%s` is done',
                            job_uuid)
                consumer.ws = set()
            else:
                num_message_retries -= CHECK_DELAY

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({
                ws,
            })
            should_quit = True

        if should_disconnect():
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)
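The notify coroutine used throughout this example presumably broadcasts a message to every socket registered on the consumer and drops sockets whose connections have closed. A minimal sketch under that assumption:

async def notify(consumer, message):
    closed = set()
    for socket in consumer.ws:
        try:
            await socket.send(message)
        except Exception:  # connection already gone
            closed.add(socket)
    if closed:
        consumer.remove_sockets(closed)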