Beispiel #1
0
    def test_job_monitoring(self):
        """Job resources and job logs can be monitored, then removed."""
        job_uuid = uuid.uuid4().hex
        # Each entry is (is-monitored check, start monitoring, stop monitoring).
        trackers = (
            (RedisToStream.is_monitored_job_resources,
             RedisToStream.monitor_job_resources,
             RedisToStream.remove_job_resources),
            (RedisToStream.is_monitored_job_logs,
             RedisToStream.monitor_job_logs,
             RedisToStream.remove_job_logs),
        )
        for is_monitored, monitor, remove in trackers:
            # A fresh uuid starts unmonitored, flips on, then flips back off.
            assert is_monitored(job_uuid) is False
            monitor(job_uuid)
            assert is_monitored(job_uuid) is True
            remove(job_uuid)
            assert is_monitored(job_uuid) is False
    def publish_experiment_job_log(self,
                                   log_lines,
                                   status,
                                   experiment_uuid,
                                   experiment_name,
                                   job_uuid,
                                   task_type=None,
                                   task_idx=None):
        """Send job log lines to the events task and stream them if watched.

        The log lines are always handed to the
        `EventsCeleryTasks.EVENTS_HANDLE_LOGS_EXPERIMENT_JOB` celery task.
        They are additionally published on `settings.INTERNAL_EXCHANGE` when
        `RedisToStream` reports that the logs of the job or of the experiment
        are being monitored.

        Streaming is best effort: a `RedisError` while checking the
        monitoring flags, or a `TimeoutError`/`AMQPError` while publishing,
        is logged as a warning and never propagated to the caller.

        Args:
            log_lines: the log content to forward.
            status: included in the streamed message only (it is not part of
                the task payload).
            experiment_uuid: uuid of the experiment owning the job.
            experiment_name: name of the experiment, sent to the task.
            job_uuid: uuid of the job that produced the logs.
            task_type: optional task type, forwarded as is.
            task_idx: optional task index, forwarded as is.
        """
        self._logger.debug("Publishing log event for task: %s.%s, %s",
                           task_type, task_idx, experiment_name)
        celery_app.send_task(
            EventsCeleryTasks.EVENTS_HANDLE_LOGS_EXPERIMENT_JOB,
            kwargs={
                'experiment_name': experiment_name,
                'experiment_uuid': experiment_uuid,
                'job_uuid': job_uuid,
                'log_lines': log_lines,
                'task_type': task_type,
                'task_idx': task_idx
            })
        try:
            should_stream = (
                RedisToStream.is_monitored_job_logs(job_uuid)
                or RedisToStream.is_monitored_experiment_logs(experiment_uuid))
        except RedisError as e:
            # Without the monitoring flags we cannot tell whether anybody is
            # listening, so skip streaming but leave a trace of the failure.
            self._logger.warning(
                "Could not check log monitoring for experiment: %s "
                "job: %s (%s)",
                experiment_uuid, job_uuid, e)
            should_stream = False
        if not should_stream:
            return

        self._logger.info(
            "Streaming new log event for experiment: %s job: %s",
            experiment_uuid, job_uuid)

        with celery_app.producer_or_acquire(None) as producer:
            try:
                producer.publish(
                    {
                        'experiment_uuid': experiment_uuid,
                        'job_uuid': job_uuid,
                        'log_lines': log_lines,
                        'status': status,
                        'task_type': task_type,
                        'task_idx': task_idx
                    },
                    retry=True,
                    routing_key='{}.{}.{}'.format(
                        RoutingKeys.LOGS_SIDECARS_EXPERIMENTS,
                        experiment_uuid, job_uuid),
                    exchange=settings.INTERNAL_EXCHANGE,
                )
            except (TimeoutError, AMQPError) as e:
                # Best effort: report the failed publish instead of raising.
                self._logger.warning(
                    "Could not stream log event for experiment: %s "
                    "job: %s (%s)",
                    experiment_uuid, job_uuid, e)
Beispiel #3
0
    def publish_experiment_job_log(self,
                                   log_lines,
                                   experiment_uuid,
                                   experiment_name,
                                   job_uuid,
                                   send_task=True):
        """Send job log lines to the logs task and stream them if watched.

        When `send_task` is true, the log lines are handed to the
        `LogsCeleryTasks.LOGS_HANDLE_EXPERIMENT_JOB` task through
        `workers.send`. Independently of `send_task`, the lines are published
        on `settings.INTERNAL_EXCHANGE` when `RedisToStream` reports that the
        logs of the job or of the experiment are being monitored.

        Streaming is best effort: a `RedisError` while checking the
        monitoring flags, or a `TimeoutError`/`AMQPError` while publishing,
        is logged as a warning and never propagated to the caller.

        Args:
            log_lines: the log content to forward.
            experiment_uuid: uuid of the experiment owning the job.
            experiment_name: name of the experiment, sent to the task.
            job_uuid: uuid of the job that produced the logs; used for the
                monitoring check and the streamed message.
            send_task: set to False to skip the task and only stream.
        """
        self._logger.debug("Publishing log event for task: %s, %s", job_uuid,
                           experiment_name)

        if send_task:
            workers.send(LogsCeleryTasks.LOGS_HANDLE_EXPERIMENT_JOB,
                         kwargs={
                             'experiment_name': experiment_name,
                             'experiment_uuid': experiment_uuid,
                             'log_lines': log_lines,
                             'temp': True
                         },
                         countdown=None)
        try:
            should_stream = (
                RedisToStream.is_monitored_job_logs(job_uuid)
                or RedisToStream.is_monitored_experiment_logs(experiment_uuid))
        except RedisError as e:
            # Without the monitoring flags we cannot tell whether anybody is
            # listening, so skip streaming but leave a trace of the failure.
            self._logger.warning(
                "Could not check log monitoring for experiment: %s "
                "job: %s (%s)",
                experiment_uuid, job_uuid, e)
            should_stream = False
        if not should_stream:
            return

        self._logger.info(
            "Streaming new log event for experiment: %s job: %s",
            experiment_uuid, job_uuid)

        with workers.app.producer_or_acquire(None) as producer:
            try:
                producer.publish(
                    {
                        'experiment_uuid': experiment_uuid,
                        'job_uuid': job_uuid,
                        'log_lines': log_lines,
                    },
                    retry=True,
                    routing_key='{}.{}.{}'.format(
                        RoutingKeys.STREAM_LOGS_SIDECARS_EXPERIMENTS,
                        experiment_uuid, job_uuid),
                    exchange=settings.INTERNAL_EXCHANGE,
                )
            except (TimeoutError, AMQPError) as e:
                # Best effort: report the failed publish instead of raising.
                self._logger.warning(
                    "Could not stream log event for experiment: %s "
                    "job: %s (%s)",
                    experiment_uuid, job_uuid, e)
Beispiel #4
0
    def _stream_job_log(self, job_uuid, log_lines, routing_key):
        """Publish job log lines on the internal exchange if they are watched.

        Nothing is published unless `RedisToStream.is_monitored_job_logs`
        returns a truthy value for `job_uuid`. The message is sent to
        `settings.INTERNAL_EXCHANGE` with the routing key
        `<routing_key>.<job_uuid>`.

        Streaming is best effort: a `RedisError` while checking the
        monitoring flag, or a `TimeoutError`/`AMQPError` while publishing,
        is logged as a warning and never propagated to the caller.

        Args:
            job_uuid: uuid of the job that produced the logs.
            log_lines: the log content to stream.
            routing_key: routing key prefix; the job uuid is appended to it.
        """
        try:
            should_stream = RedisToStream.is_monitored_job_logs(job_uuid)
        except RedisError as e:
            # Without the monitoring flag we cannot tell whether anybody is
            # listening, so skip streaming but leave a trace of the failure.
            self._logger.warning(
                "Could not check log monitoring for job: %s (%s)",
                job_uuid, e)
            should_stream = False
        if not should_stream:
            return

        self._logger.info("Streaming new log event for job: %s", job_uuid)

        with celery_app.producer_or_acquire(None) as producer:
            try:
                producer.publish(
                    {
                        'job_uuid': job_uuid,
                        'log_lines': log_lines,
                    },
                    routing_key='{}.{}'.format(routing_key, job_uuid),
                    exchange=settings.INTERNAL_EXCHANGE,
                )
            except (TimeoutError, AMQPError) as e:
                # Best effort: report the failed publish instead of raising.
                self._logger.warning(
                    "Could not stream log event for job: %s (%s)",
                    job_uuid, e)
Beispiel #5
0
async def job_logs(
        request,  # pylint:disable=too-many-branches
        ws,
        username,
        project_name,
        job_id):
    """Stream a job's status changes and log messages over a websocket.

    Steps, in order:

    1. Validate the job with `validate_job`; when no job is returned, send
       the error message on `ws` and stop.
    2. Record a `JOB_LOGS_VIEWED` audit event for the requesting user.
    3. Flag the job's logs as monitored in `RedisToStream` (if not already).
    4. Reuse the consumer registered for this job uuid in
       `request.app.job_logs_consumers`, or create, register and run a new
       one, then attach `ws` to it.
    5. Poll the job status every `SOCKET_SLEEP` seconds, notifying on each
       change, until the job is running or done.
    6. If the job is done, send its status once more, unflag the logs and
       stop. Otherwise forward consumer messages until the socket is lost or
       the consumer has no sockets left.

    Args:
        request: incoming request; `request.app` provides the current user
            and the `job_logs_consumers` registry.
        ws: the websocket to stream to.
        username: passed to `validate_job`.
        project_name: passed to `validate_job`.
        job_id: passed to `validate_job`.
    """
    job, message = validate_job(request=request,
                                username=username,
                                project_name=project_name,
                                job_id=job_id)
    if job is None:
        # Validation failed: `message` carries the reason sent to the client.
        await ws.send(get_error_message(message))
        return

    job_uuid = job.uuid.hex

    auditor.record(event_type=JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    # Flag the job so its logs get streamed while somebody is watching.
    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        logger.info('Job uuid `%s` logs is now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # Start the consumer: one consumer per job uuid, shared through
    # `request.app.job_logs_consumers`; it is only run when newly created.
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}'.format(RoutingKeys.STREAM_LOGS_SIDECARS_JOBS,
                                       job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    def should_disconnect():
        """Return True, and unflag the job's logs, once no socket is left."""
        if not consumer.ws:
            logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # NOTE(review): the consumer teardown below is disabled, so the
            # consumer stays registered in `request.app.job_logs_consumers`
            # -- confirm whether this is intentional.
            # if job_uuid in request.app.job_logs_consumers:
            #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False

    # Add this socket to the consumer's socket manager.
    consumer.add_socket(ws)
    should_quite = False
    # Incremented on every pass of the streaming loop, reset by each message.
    num_message_retries = 0

    # Stream phase changes: poll until the job is running or done, notifying
    # only when the status differs from the last one seen.
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify(consumer=consumer, message=get_status_message(status))
            if should_disconnect():
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        # The job is already done: send its status and stop monitoring
        # without entering the streaming loop.
        await notify(consumer=consumer, message=get_status_message(status))
        RedisToStream.remove_job_logs(job_uuid=job_uuid)
        return

    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            await notify(consumer=consumer, message=message)

        # After more than MAX_RETRIES passes without a reset by a message,
        # check the status of the job.
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                logger.info('removing all socket because the job `%s` is done',
                            job_uuid)
                # Emptying the socket set makes should_disconnect() below
                # return True and end this handler.
                consumer.ws = set([])
            else:
                # Still running: lower the counter by CHECK_DELAY so the
                # status is not re-checked on the very next pass.
                num_message_retries -= CHECK_DELAY

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({
                ws,
            })
            should_quite = True

        # Always evaluated: it also unflags the job's logs when this was the
        # last socket attached to the consumer.
        if should_disconnect():
            should_quite = True

        if should_quite:
            return

        await asyncio.sleep(SOCKET_SLEEP)
Beispiel #6
0
 def test_monitor_job_logs(self):
     """A job's logs stay flagged as monitored until explicitly removed."""
     job_id = uuid.uuid4().hex

     RedisToStream.monitor_job_logs(job_id)
     monitored_after_add = RedisToStream.is_monitored_job_logs(job_id)
     assert monitored_after_add is True

     RedisToStream.remove_job_logs(job_id)
     monitored_after_remove = RedisToStream.is_monitored_job_logs(job_id)
     assert monitored_after_remove is False