def test_master_success_influences_other_experiment_workers_status(self):
    # A master job reaching SUCCEEDED is expected to mark every worker job
    # and the experiment itself as SUCCEEDED.
    #
    # The experiment is created while `start_experiment.delay` and
    # `Experiment.set_status` are patched.
    # NOTE(review): the patches are assumed to wrap only the factory call,
    # since the final assertion needs a real experiment status -- confirm.
    with patch('runner.tasks.experiments.start_experiment.delay'), \
            patch.object(Experiment, 'set_status'):
        experiment = ExperimentFactory()

    assert ExperimentLifeCycle.is_done(experiment.last_status) is False

    # One master job plus two worker jobs, none of them done yet.
    master = ExperimentJobFactory(experiment=experiment, role=TaskType.MASTER)
    assert JobLifeCycle.is_done(master.last_status) is False

    workers = []
    for _ in range(2):
        workers.append(
            ExperimentJobFactory(experiment=experiment, role=TaskType.WORKER))

    for job in workers:
        job.refresh_from_db()
        assert JobLifeCycle.is_done(job.last_status) is False

    # Mark the master as succeeded.
    ExperimentJobStatusFactory(job=master, status=JobLifeCycle.SUCCEEDED)

    # Every worker must now report success ...
    for job in workers:
        job.refresh_from_db()
        assert job.last_status == JobLifeCycle.SUCCEEDED

    # ... and so must the experiment.
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.SUCCEEDED
def calculated_status(self):
    """Derive the experiment status from the statuses of its jobs.

    Resolution order:

    1. the master job's last status, when ``JobLifeCycle.is_done`` reports
       it as done;
    2. otherwise ``ExperimentLifeCycle.jobs_status`` applied to
       ``self.last_job_statuses``;
    3. otherwise the experiment's own ``last_status``.

    An experiment without a master job no longer raises ``IndexError``;
    step 1 is skipped and resolution continues with step 2.
    """
    try:
        master = self.jobs.filter(role=TaskType.MASTER)[0]
    except IndexError:
        # No master job exists for this experiment.
        master = None

    status = None
    if master is not None:
        master_status = master.last_status
        if JobLifeCycle.is_done(master_status):
            status = master_status

    if status is None:
        status = ExperimentLifeCycle.jobs_status(self.last_job_statuses)
    if status is None:
        return self.last_status
    return status
def build_experiment(self, experiment_id):
    """Build the image for an experiment, then queue its start.

    Steps, in order:

    * Look the experiment up with ``get_valid_experiment``. When it is
      missing and ``self.request.retries`` is below 2, ``self.retry`` is
      called; otherwise the absence is logged and the task returns.
    * When the specification has no ``run_exec``, no build is needed and
      ``start_experiment`` is queued directly.
    * When the experiment cannot transition to ``BUILDING``, log and return.
    * Otherwise set the status to ``BUILDING`` and run the builder. Any
      exception marks the experiment ``FAILED``; a falsy build result
      returns without starting the experiment.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            logger.info('Trying again for Experiment `%s`.', experiment_id)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

        logger.info('Something went wrong, '
                    'the Experiment `%s` does not exist anymore.', experiment_id)
        return None

    needs_build = experiment.specification.run_exec
    if not needs_build:
        # Nothing to build: start the experiment right away.
        start_experiment.delay(experiment_id=experiment_id)
        return None

    building = ExperimentLifeCycle.BUILDING
    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=building):
        logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                    experiment_id, experiment.last_status, building)
        return None

    # Surface the build phase through the experiment status.
    experiment.set_status(building)

    # Build the docker image; each handler only logs and records the
    # message, the status update itself happens once below.
    failure_message = None
    built = None
    try:
        built = experiments_builder.build_experiment(experiment)
    except DockerException as e:
        logger.warning('Failed to build experiment %s', e)
        failure_message = 'Failed to build image for experiment.'
    except Repo.DoesNotExist:
        logger.warning('No code was found for this project')
        failure_message = 'No code was found for to build this experiment.'
    except Exception as e:  # Other exceptions
        logger.warning('Failed to build experiment %s', e)
        failure_message = 'Failed to build image for experiment.'

    if failure_message is not None:
        experiment.set_status(ExperimentLifeCycle.FAILED, message=failure_message)
        return None

    if built:
        # The image is ready, the experiment can be started.
        start_experiment.delay(experiment_id=experiment_id)
    return None
def start_experiment(experiment_id):
    """Hand an experiment over to the scheduler.

    The experiment is looked up with ``get_valid_experiment``; a missing
    experiment, or one that cannot transition to ``SCHEDULED``, is logged
    and the function returns without scheduling anything.

    For an experiment flagged ``is_copy``, ``copy_experiment`` is called;
    otherwise ``create_experiment_outputs_path`` is called with the
    experiment's ``unique_name``. Finally the experiment is passed to
    ``experiment_scheduler.start_experiment``.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        logger.info('Something went wrong, '
                    'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.SCHEDULED):
        # Report the status that was actually checked (SCHEDULED); the log
        # previously named BUILDING as the target.
        logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                    experiment_id,
                    experiment.last_status,
                    ExperimentLifeCycle.SCHEDULED)
        return None

    # Check if we need to copy an experiment
    if experiment.is_copy:
        copy_experiment(experiment)
    else:
        create_experiment_outputs_path(experiment.unique_name)

    experiment_scheduler.start_experiment(experiment)