def experiments_schedule_deletion(experiment_id, immediate=False):
    """Archive an experiment and dispatch the follow-up stop/delete tasks.

    Args:
        experiment_id: Identifier handed to ``get_valid_experiment``; the
            lookup is done with ``include_deleted=True``.
        immediate: When truthy, a ``DELETE_ARCHIVED_EXPERIMENT`` task is also
            sent, with a countdown read from
            ``SCHEDULER_GLOBAL_COUNTDOWN_DELAYED``.

    Returns:
        None. Logs and returns early when the lookup yields nothing.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id,
                                      include_deleted=True)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.',
            experiment_id,
        )
        return

    experiment.archive()

    if experiment.is_stoppable:
        owner_project = experiment.project
        stop_kwargs = {
            'project_name': owner_project.unique_name,
            'project_uuid': owner_project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': None,
            'experiment_group_uuid': None,
            'specification': experiment.content,
            'update_status': True,
            'collect_logs': False,
            'message': 'Experiment is scheduled for deletion.',
            'is_managed': experiment.is_managed,
        }
        workers.send(SchedulerCeleryTasks.EXPERIMENTS_STOP, kwargs=stop_kwargs)

    # NOTE(review): the deletion task is treated as independent of
    # `is_stoppable`; confirm against the original (un-flattened) layout.
    if not immediate:
        return

    workers.send(
        SchedulerCeleryTasks.DELETE_ARCHIVED_EXPERIMENT,
        kwargs={'experiment_id': experiment_id},
        countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED),
    )
def experiments_set_metrics(experiment_id, data):
    """Validate and persist metrics for an experiment.

    Args:
        experiment_id: Identifier handed to ``get_valid_experiment``.
        data: Metric payload. A ``list`` is validated with ``many=True`` and
            stored through ``bulk_create``; anything else is saved through
            the serializer.

    Returns:
        None. Nothing is written when the experiment cannot be resolved or
        when the payload fails validation.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    is_list = isinstance(data, list)
    serializer_kwargs = {'many': True} if is_list else {}
    serializer = ExperimentMetricSerializer(data=data, **serializer_kwargs)
    try:
        serializer.is_valid(raise_exception=True)
    except ValidationError:
        _logger.error(
            'Could not create metrics, a validation error was raised.')
        # Stop here: the serializer is invalid, so neither `serializer.data`
        # nor `serializer.save()` may be used to persist anything.
        return

    if not is_list:
        serializer.save(experiment=experiment)
        return

    # Bulk path: create every metric row in one query and fold all reported
    # values into a single dict (later entries override earlier ones).
    merged_metrics = {}
    metrics_instances = []
    for metric_data in serializer.data:
        metrics_instances.append(
            ExperimentMetric(experiment=experiment, **metric_data))
        merged_metrics.update(metric_data['values'])
    ExperimentMetric.objects.bulk_create(metrics_instances)
    experiment.set_metric(merged_metrics)
def experiments_stop(project_name,
                     project_uuid,
                     experiment_name,
                     experiment_group_name,
                     experiment_group_uuid,
                     experiment_uuid,
                     specification,
                     update_status=True):
    """Stop an experiment through the scheduler and optionally mark it STOPPED.

    The raw ``specification`` is parsed with ``ExperimentSpecification.read``
    before being forwarded to ``experiment_scheduler.stop_experiment``. When
    ``update_status`` is truthy the experiment is looked up by uuid and, if
    found, its status is set to ``ExperimentLifeCycle.STOPPED``.
    """
    parsed_specification = ExperimentSpecification.read(specification)
    experiment_scheduler.stop_experiment(
        project_name=project_name,
        project_uuid=project_uuid,
        experiment_name=experiment_name,
        experiment_group_name=experiment_group_name,
        experiment_group_uuid=experiment_group_uuid,
        experiment_uuid=experiment_uuid,
        specification=parsed_specification,
    )

    if update_status:
        experiment = get_valid_experiment(experiment_uuid=experiment_uuid)
        if experiment:
            # Reflect the stop in the experiment's status.
            experiment.set_status(ExperimentLifeCycle.STOPPED)
        else:
            _logger.info(
                'Something went wrong, '
                'the Experiment `%s` does not exist anymore.',
                experiment_uuid,
            )
def experiments_schedule_deletion(experiment_id):
    """Archive an experiment and, if it is running, send a stop task for it.

    Args:
        experiment_id: Identifier handed to ``get_valid_experiment``; the
            lookup is done with ``include_deleted=True``.

    Returns:
        None. Logs and returns early when the lookup yields nothing.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id,
                                      include_deleted=True)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.',
            experiment_id,
        )
        return

    experiment.archive()

    if experiment.is_running:
        owner_project = experiment.project
        stop_kwargs = {
            'project_name': owner_project.unique_name,
            'project_uuid': owner_project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': None,
            'experiment_group_uuid': None,
            'specification': experiment.config,
            'update_status': True,
            'collect_logs': False,
            'message': 'Experiment is scheduled for deletion.'
        }
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                             kwargs=stop_kwargs)
def build_experiment(self, experiment_id):
    """Build the image for an experiment, then send its start task.

    Flow, as implemented below:
      * missing experiment: retry while ``self.request.retries < 2``,
        otherwise log and return;
      * specification lacks ``build`` or ``run``: send ``EXPERIMENTS_START``
        directly;
      * transition to BUILDING not allowed: log and return;
      * otherwise set BUILDING, run ``experiments_builder.build_experiment``
        and either mark the experiment FAILED or send ``EXPERIMENTS_START``.

    Args:
        self: Object exposing ``request.retries`` and ``retry`` (presumably a
            bound Celery task — confirm at the registration site).
        experiment_id: Identifier handed to ``get_valid_experiment``.
    """
    start_kwargs = {'experiment_id': experiment_id}
    build_failed_message = 'Failed to build image for experiment.'

    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            _logger.info('Trying again for Experiment `%s`.', experiment_id)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.',
            experiment_id,
        )
        return

    # Nothing to build: hand over to the start task straight away.
    if not experiment.specification.build or not experiment.specification.run:
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START,
                             kwargs=start_kwargs)
        return

    transition_allowed = ExperimentLifeCycle.can_transition(
        status_from=experiment.last_status,
        status_to=ExperimentLifeCycle.BUILDING,
    )
    if not transition_allowed:
        _logger.info(
            'Experiment id `%s` cannot transition from `%s` to `%s`.',
            experiment_id,
            experiment.last_status,
            ExperimentLifeCycle.BUILDING,
        )
        return

    # Publish the BUILDING status before the (potentially long) build.
    experiment.set_status(ExperimentLifeCycle.BUILDING)

    try:
        status = experiments_builder.build_experiment(experiment)
    except DockerException as docker_error:
        _logger.warning('Failed to build experiment %s', docker_error)
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message=build_failed_message)
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='No code was found for to build this experiment.')
        return
    except Exception:  # Any other failure: log the full traceback
        _logger.error(
            'Failed to build experiment, unexpected error occurred.\n%s',
            traceback.format_exc())
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message=build_failed_message)
        return

    if status:
        # Build succeeded: the experiment can be started.
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START,
                             kwargs=start_kwargs)
    else:
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message=build_failed_message)
def experiments_stop(experiment_id, update_status=True):
    """Look up an experiment by id and ask the scheduler to stop it.

    Args:
        experiment_id: Identifier handed to ``get_valid_experiment``.
        update_status: Forwarded unchanged to
            ``experiment_scheduler.stop_experiment``.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if experiment:
        experiment_scheduler.stop_experiment(experiment,
                                             update_status=update_status)
        return

    _logger.info(
        'Something went wrong, '
        'the Experiment `%s` does not exist anymore.',
        experiment_id,
    )
def experiments_set_metrics(experiment_uuid, metrics, created_at=None):
    """Create one ``ExperimentMetric`` row for the given experiment.

    Args:
        experiment_uuid: Identifier handed to ``get_valid_experiment``.
        metrics: Stored as the ``values`` field of the new row.
        created_at: Optional; passed to ``create`` only when truthy.

    Returns:
        None. Nothing is created when the experiment cannot be resolved.
    """
    experiment = get_valid_experiment(experiment_uuid=experiment_uuid)
    if not experiment:
        return

    extra_fields = {'created_at': created_at} if created_at else {}
    ExperimentMetric.objects.create(experiment=experiment,
                                    values=metrics,
                                    **extra_fields)
def _run(task_bind, *args, **kwargs):
    """Send the build task for the experiment named in ``kwargs``.

    Args:
        task_bind: Unused by this function.
        *args: Unused by this function.
        **kwargs: Must contain ``experiment_id``.

    Raises:
        KeyError: If ``experiment_id`` is missing from ``kwargs``.
        OperationRunError: If ``get_valid_experiment`` yields nothing.
    """
    experiment_id = kwargs['experiment_id']
    if not get_valid_experiment(experiment_id=experiment_id):
        error_message = 'The Experiment `{}` does not exist anymore.'.format(
            experiment_id)
        raise OperationRunError(error_message)

    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_BUILD,
        kwargs={'experiment_id': experiment_id},
    )
def experiments_stop(self,
                     project_name,
                     project_uuid,
                     experiment_name,
                     experiment_group_name,
                     experiment_group_uuid,
                     experiment_uuid,
                     specification,
                     update_status=True,
                     collect_logs=True,
                     is_managed=True,
                     message=None):
    """Collect logs, stop the experiment and optionally mark it STOPPED.

    Steps, as implemented below:
      1. If ``collect_logs`` and ``is_managed`` are truthy, try to collect
         the experiment job logs; listed store/OS errors are only logged.
      2. If ``specification`` and ``is_managed`` are truthy, compile the
         specification and call ``experiment_scheduler.stop_experiment``;
         otherwise the stop is treated as already done.
      3. If the scheduler reported a falsy result and
         ``self.request.retries < 2``, call ``self.retry`` and return.
      4. If ``update_status`` is truthy, look the experiment up (including
         deleted ones) and set its status to STOPPED with ``message`` or a
         default text.

    Args:
        self: Object exposing ``request.retries`` and ``retry`` (presumably a
            bound Celery task — confirm at the registration site).
    """
    if collect_logs and is_managed:
        try:
            collectors.logs_collect_experiment_jobs(
                experiment_uuid=experiment_uuid)
        except (OSError, StoreNotFoundError, PolyaxonStoresException):
            _logger.warning(
                'Scheduler could not collect '
                'the logs for experiment `%s`.',
                experiment_name,
            )

    deleted = True
    if specification and is_managed:
        specification = compiler.compile(kind=kinds.EXPERIMENT,
                                         values=specification)
        deleted = experiment_scheduler.stop_experiment(
            project_name=project_name,
            project_uuid=project_uuid,
            experiment_name=experiment_name,
            experiment_group_name=experiment_group_name,
            experiment_group_uuid=experiment_group_uuid,
            experiment_uuid=experiment_uuid,
            specification=specification,
        )

    if not deleted and self.request.retries < 2:
        _logger.info('Trying again to delete job `%s` in experiment.',
                     experiment_name)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    if not update_status:
        return

    experiment = get_valid_experiment(experiment_uuid=experiment_uuid,
                                      include_deleted=True)
    if experiment:
        # Reflect the stop in the experiment's status.
        experiment.set_status(ExperimentLifeCycle.STOPPED,
                              message=message or 'Experiment was stopped')
        return

    _logger.info(
        'Something went wrong, '
        'the Experiment `%s` does not exist anymore.',
        experiment_uuid,
    )
def experiments_check_heartbeat(experiment_id):
    """Mark an experiment FAILED when no heartbeat is recorded for it.

    Args:
        experiment_id: Identifier checked with
            ``RedisHeartBeat.experiment_is_alive`` and then handed to
            ``get_valid_experiment``.

    Returns:
        None. Nothing happens when the heartbeat check passes or the
        experiment cannot be resolved.
    """
    is_alive = RedisHeartBeat.experiment_is_alive(experiment_id=experiment_id)
    if is_alive:
        return

    experiment = get_valid_experiment(experiment_id=experiment_id)
    if experiment:
        # No heartbeat: the experiment is treated as a zombie.
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Experiment is in zombie state (no heartbeat was reported).',
        )
def build_experiment(self, experiment_id):
    """Build the image for an experiment, then send its start task.

    Flow, as implemented below:
      * missing experiment: retry while ``self.request.retries < 2``,
        otherwise log and return;
      * specification lacks ``build`` or ``run``: send ``EXPERIMENTS_START``
        directly;
      * transition to BUILDING not allowed: log and return;
      * otherwise set BUILDING, run ``experiments_builder.build_experiment``
        and either mark the experiment FAILED or send ``EXPERIMENTS_START``.

    Args:
        self: Object exposing ``request.retries`` and ``retry`` (presumably a
            bound Celery task — confirm at the registration site).
        experiment_id: Identifier handed to ``get_valid_experiment``.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            _logger.info('Trying again for Experiment `%s`.', experiment_id)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

        _logger.info('Something went wrong, '
                     'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    # No need to build the image, start the experiment directly
    if not (experiment.specification.build and experiment.specification.run):
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status, ExperimentLifeCycle.BUILDING)
        return None

    # Update experiment status to show that it is building
    experiment.set_status(ExperimentLifeCycle.BUILDING)

    # Building the docker image
    try:
        status = experiments_builder.build_experiment(experiment)
    except DockerException as e:
        _logger.warning('Failed to build experiment %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='No code was found for to build this experiment.')
        return
    except Exception as e:  # Other exceptions
        _logger.warning('Failed to build experiment %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return

    if not status:
        # The builder reported failure without raising. Record it, otherwise
        # the experiment would stay in BUILDING with no task left to move it.
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return

    # Now we can start the experiment
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_START,
        kwargs={'experiment_id': experiment_id})
def experiments_start(experiment_id):
    """Hand an experiment to the scheduler if it may move to SCHEDULED.

    Args:
        experiment_id: Identifier handed to ``get_valid_experiment``.

    Returns:
        None. Logs and returns early when the experiment cannot be resolved
        or the status transition is not allowed.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.',
            experiment_id,
        )
        return

    transition_allowed = ExperimentLifeCycle.can_transition(
        status_from=experiment.last_status,
        status_to=ExperimentLifeCycle.SCHEDULED,
    )
    if transition_allowed:
        experiment_scheduler.start_experiment(experiment)
        return

    _logger.info(
        'Experiment `%s` cannot transition from `%s` to `%s`.',
        experiment.unique_name,
        experiment.last_status,
        ExperimentLifeCycle.SCHEDULED,
    )
def experiments_set_metrics(experiment_id, data):
    """Validate and save metrics for an experiment through the serializer.

    Args:
        experiment_id: Identifier handed to ``get_valid_experiment``.
        data: Metric payload; a ``list`` is validated with ``many=True``.

    Returns:
        None. Nothing is saved when the experiment cannot be resolved or
        when the payload fails validation.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    kwargs = {}
    if isinstance(data, list):
        kwargs['many'] = True
    serializer = ExperimentMetricSerializer(data=data, **kwargs)
    try:
        serializer.is_valid(raise_exception=True)
    except ValidationError:
        _logger.error('Could not create metrics, a validation error was raised.')
        # Stop here: calling `save()` on a serializer that failed validation
        # is an error in its own right, so there is nothing left to do.
        return

    serializer.save(experiment=experiment)
def experiments_build(experiment_id):
    """Create the build job for an experiment, or start it if none is needed.

    Flow, as implemented below:
      * specification lacks ``build`` or ``run``: send ``EXPERIMENTS_START``;
      * transition to BUILDING not allowed: log and return;
      * otherwise create a build job, store it on the experiment, then start
        the experiment if the image already exists, mark it FAILED if the
        build did not start, or mark it BUILDING.

    Args:
        experiment_id: Identifier handed to ``get_valid_experiment``.
    """
    def _send_start():
        # Start task, delayed by the configured global countdown.
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN))

    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    # Nothing to build: start the experiment directly.
    if not experiment.specification.build or not experiment.specification.run:
        _send_start()
        return

    last_status = experiment.last_status
    transition_allowed = ExperimentLifeCycle.can_transition(
        status_from=last_status,
        status_to=ExperimentLifeCycle.BUILDING,
    )
    if not transition_allowed:
        _logger.info(
            'Experiment id `%s` cannot transition from `%s` to `%s`.',
            experiment_id,
            last_status,
            ExperimentLifeCycle.BUILDING,
        )
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=experiment.user,
        project=experiment.project,
        config=experiment.specification.build,
        configmap_refs=experiment.specification.configmap_refs,
        secret_refs=experiment.specification.secret_refs,
        code_reference=experiment.code_reference,
    )

    experiment.build_job = build_job
    experiment.save(update_fields=['build_job'])

    if image_exists:
        # The image is already available, so the experiment can start now.
        _send_start()
    elif not build_status:
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Could not start build process.')
    else:
        # Build is under way: publish the BUILDING status.
        experiment.set_status(ExperimentLifeCycle.BUILDING)
def experiments_stop(self,
                     project_name,
                     project_uuid,
                     experiment_name,
                     experiment_group_name,
                     experiment_group_uuid,
                     experiment_uuid,
                     specification,
                     update_status=True):
    """Stop an experiment through the scheduler and optionally mark it STOPPED.

    When ``specification`` is truthy it is parsed with
    ``ExperimentSpecification.read`` and the scheduler is asked to stop the
    experiment; otherwise the stop is treated as already done. A falsy
    scheduler result triggers ``self.retry`` while
    ``self.request.retries < 2``. When ``update_status`` is truthy the
    experiment is looked up by uuid and set to STOPPED.

    Args:
        self: Object exposing ``request.retries`` and ``retry`` (presumably a
            bound Celery task — confirm at the registration site).
    """
    deleted = True
    if specification:
        specification = ExperimentSpecification.read(specification)
        deleted = experiment_scheduler.stop_experiment(
            project_name=project_name,
            project_uuid=project_uuid,
            experiment_name=experiment_name,
            experiment_group_name=experiment_group_name,
            experiment_group_uuid=experiment_group_uuid,
            experiment_uuid=experiment_uuid,
            specification=specification,
        )

    if not deleted and self.request.retries < 2:
        _logger.info('Trying again to delete job `%s` in experiment.',
                     experiment_name)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    if not update_status:
        return

    experiment = get_valid_experiment(experiment_uuid=experiment_uuid)
    if experiment:
        # Reflect the stop in the experiment's status.
        experiment.set_status(ExperimentLifeCycle.STOPPED,
                              message='Experiment was stopped')
        return

    _logger.info(
        'Something went wrong, '
        'the Experiment `%s` does not exist anymore.',
        experiment_uuid,
    )
def experiments_start(experiment_id):
    """Prepare an experiment's outputs and hand it to the scheduler.

    If the experiment may move to SCHEDULED, a copy experiment is processed
    with ``copy_experiment`` while any other experiment gets its outputs path
    created; the experiment is then passed to
    ``experiment_scheduler.start_experiment``.

    Args:
        experiment_id: Identifier handed to ``get_valid_experiment``.

    Returns:
        None. Logs and returns early when the experiment cannot be resolved
        or the status transition is not allowed.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.',
            experiment_id,
        )
        return

    transition_allowed = ExperimentLifeCycle.can_transition(
        status_from=experiment.last_status,
        status_to=ExperimentLifeCycle.SCHEDULED,
    )
    if not transition_allowed:
        _logger.info(
            'Experiment `%s` cannot transition from `%s` to `%s`.',
            experiment.unique_name,
            experiment.last_status,
            ExperimentLifeCycle.SCHEDULED,
        )
        return

    # Copies go through `copy_experiment`; everything else only needs its
    # outputs path created.
    if not experiment.is_copy:
        create_experiment_outputs_path(experiment.unique_name)
    else:
        copy_experiment(experiment)

    experiment_scheduler.start_experiment(experiment)
def experiments_build(experiment_id):
    """Create the build job for an experiment, or start it if none is needed.

    Flow, as implemented below:
      * specification lacks ``build`` or ``run``: send ``EXPERIMENTS_START``;
      * transition to BUILDING not allowed: log and return;
      * otherwise create a build job, store it on the experiment, then start
        the experiment if the image already exists, mark it FAILED if the
        build did not start, or mark it BUILDING.

    Args:
        experiment_id: Identifier handed to ``get_valid_experiment``.
    """
    def _send_start():
        # Dispatch the start task for this experiment.
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})

    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    # Nothing to build: start the experiment directly.
    if not experiment.specification.build or not experiment.specification.run:
        _send_start()
        return

    transition_allowed = ExperimentLifeCycle.can_transition(
        status_from=experiment.last_status,
        status_to=ExperimentLifeCycle.BUILDING,
    )
    if not transition_allowed:
        _logger.info(
            'Experiment id `%s` cannot transition from `%s` to `%s`.',
            experiment_id,
            experiment.last_status,
            ExperimentLifeCycle.BUILDING,
        )
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=experiment.user,
        project=experiment.project,
        config=experiment.specification.build,
        code_reference=experiment.code_reference,
    )

    experiment.build_job = build_job
    experiment.save()

    if image_exists:
        # The image is already available, so the experiment can start now.
        _send_start()
    elif not build_status:
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Could not start build process.')
    else:
        # Build is under way: publish the BUILDING status.
        experiment.set_status(ExperimentLifeCycle.BUILDING)
def experiments_check_status(experiment_uuid=None, experiment_id=None):
    """Look up an experiment and call ``update_status`` on it.

    Args:
        experiment_uuid: Optional uuid handed to ``get_valid_experiment``.
        experiment_id: Optional id handed to ``get_valid_experiment``.

    Returns:
        None. Nothing happens when the experiment cannot be resolved.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id,
                                      experiment_uuid=experiment_uuid)
    if experiment:
        experiment.update_status()