def test_job_monitoring(self):
    """Resource and log monitoring flags toggle on and off per job uuid."""
    job_uuid = uuid.uuid4().hex
    # (is_monitored, monitor, remove) triplets for both monitored aspects.
    monitors = (
        (RedisToStream.is_monitored_job_resources,
         RedisToStream.monitor_job_resources,
         RedisToStream.remove_job_resources),
        (RedisToStream.is_monitored_job_logs,
         RedisToStream.monitor_job_logs,
         RedisToStream.remove_job_logs),
    )
    for is_monitored, monitor, remove in monitors:
        assert is_monitored(job_uuid) is False
        monitor(job_uuid)
        assert is_monitored(job_uuid) is True
        remove(job_uuid)
        assert is_monitored(job_uuid) is False
async def job_logs(request, ws, username, project_name, experiment_sequence, job_sequence):
    """Stream an experiment job's logs over a websocket.

    Validates access to the project, ensures the job's logs are being
    monitored in Redis, attaches the websocket to a (possibly shared)
    log consumer, then loops forwarding consumed messages to every
    attached socket until the socket closes or the job finishes.

    Raises:
        exceptions.Forbidden: if the user has no GET access to the project.
    """
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        # Bug fix: the exception was previously instantiated but never
        # raised, which silently let unauthorized users through.
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex
    auditor.record(event_type=EXPERIMENT_JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)
    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        logger.info('Job uuid `%s` logs is now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # Reuse an existing consumer for this job if one is already running,
    # otherwise start a new one bound to this job's sidecar routing key.
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}.{}'.format(RoutingKeys.LOGS_SIDECARS,
                                          experiment.uuid.hex,
                                          job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    # Register this socket with the consumer's socket manager.
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0
    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            disconnected_ws = set()
            for _ws in consumer.ws:
                try:
                    await _ws.send(message)
                except ConnectionClosed:
                    disconnected_ws.add(_ws)
            consumer.remove_sockets(disconnected_ws)

        # After several empty polls, check whether the job is done; if so,
        # drop all sockets so the loop below terminates the stream.
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                logger.info('removing all socket because the job `%s` is done', job_uuid)
                consumer.ws = set([])
            else:
                num_message_retries -= CHECK_DELAY

        # Detect a silently closed connection on our own socket.
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({ws, })
            should_quit = True

        if not consumer.ws:
            logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # TODO: also pop the consumer from request.app.job_logs_consumers
            # and stop it once no sockets remain, to free the queue binding.
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)
def test_monitor_job_logs(self):
    """Log monitoring for a job can be enabled and then removed."""
    job_uuid = uuid.uuid4().hex
    RedisToStream.monitor_job_logs(job_uuid)
    monitored_after_add = RedisToStream.is_monitored_job_logs(job_uuid)
    assert monitored_after_add is True
    RedisToStream.remove_job_logs(job_uuid)
    monitored_after_remove = RedisToStream.is_monitored_job_logs(job_uuid)
    assert monitored_after_remove is False
async def build_logs(
        request,  # pylint:disable=too-many-branches
        ws,
        username,
        project_name,
        build_id):
    """Stream a build job's logs over a websocket.

    Validates the build, ensures its logs are monitored in Redis, attaches
    the websocket to a (possibly shared) log consumer, streams status
    changes until the job is running (or done), then forwards log messages
    until the socket closes or the job finishes.
    """
    job, message = validate_build(request=request,
                                  username=username,
                                  project_name=project_name,
                                  build_id=build_id)
    if job is None:
        await ws.send(get_error_message(message))
        return

    job_uuid = job.uuid.hex

    auditor.record(event_type=BUILD_JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        _logger.info('Job uuid `%s` logs is now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # Reuse an existing consumer for this job if one is already running,
    # otherwise start a new one bound to the build sidecar routing key.
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        _logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}'.format(RoutingKeys.LOGS_SIDECARS_BUILDS, job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    def should_disconnect():
        # When no sockets remain, stop monitoring and signal the caller to quit.
        if not consumer.ws:
            _logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # TODO: also pop the consumer from request.app.job_logs_consumers
            # and stop it once no sockets remain, to free the queue binding.
            return True
        return False

    # Register this socket with the consumer's socket manager.
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0

    # Stream phase changes until the job is running (or already done).
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify(consumer=consumer, message=get_status_message(status))
            if should_disconnect():
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        await notify(consumer=consumer, message=get_status_message(status))
        RedisToStream.remove_job_logs(job_uuid=job_uuid)
        return

    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            await notify(consumer=consumer, message=message)

        # After several empty polls, check whether the job is done; if so,
        # drop all sockets so should_disconnect() ends the stream.
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                _logger.info(
                    'removing all socket because the job `%s` is done', job_uuid)
                consumer.ws = set([])
            else:
                num_message_retries -= CHECK_DELAY

        # Detect a silently closed connection on our own socket.
        if ws._connection_lost:  # pylint:disable=protected-access
            _logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({ws, })
            should_quit = True

        if should_disconnect():
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)
async def job_logs(request, ws, username, project_name, experiment_sequence, job_sequence):
    """Stream an experiment job's logs over a websocket.

    Checks project permissions, ensures the job's logs are monitored in
    Redis, attaches the websocket to a (possibly shared) log consumer,
    then forwards consumed messages to every attached socket until the
    socket closes or the job finishes.

    Raises:
        exceptions.Forbidden: if the user has no GET access to the project.
    """
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        # Bug fix: the exception was previously instantiated but never
        # raised, which silently let unauthorized users through.
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex
    auditor.record(event_type=EXPERIMENT_JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)
    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        logger.info('Job uuid `%s` logs is now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # Reuse an existing consumer for this job if one is already running,
    # otherwise start a new one bound to this job's sidecar routing key.
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}.{}'.format(RoutingKeys.LOGS_SIDECARS,
                                          experiment.uuid.hex,
                                          job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    # Register this socket with the consumer's socket manager.
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0
    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            disconnected_ws = set()
            for _ws in consumer.ws:
                try:
                    await _ws.send(message)
                except ConnectionClosed:
                    disconnected_ws.add(_ws)
            consumer.remove_sockets(disconnected_ws)

        # After several empty polls, check whether the job is done; if so,
        # drop all sockets so the loop below terminates the stream.
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                logger.info('removing all socket because the job `%s` is done', job_uuid)
                consumer.ws = set([])
            else:
                num_message_retries -= CHECK_DELAY

        # Detect a silently closed connection on our own socket.
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({ws, })
            should_quit = True

        if not consumer.ws:
            logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # TODO: also pop the consumer from request.app.job_logs_consumers
            # and stop it once no sockets remain, to free the queue binding.
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)