def test_independent_experiment_creation_with_run_triggers_experiment_building_scheduling(self): config = ExperimentSpecification.read(exec_experiment_spec_content) # Create a repo for the project repo = RepoFactory() with patch('scheduler.tasks.experiments.experiments_build.apply_async') as mock_build: experiment = ExperimentFactory(config=config.parsed_data, project=repo.project) assert mock_build.call_count == 1 assert experiment.project.repo is not None assert experiment.is_independent is True assert ExperimentStatus.objects.filter(experiment=experiment).count() == 1 assert list(ExperimentStatus.objects.filter(experiment=experiment).values_list( 'status', flat=True)) == [ExperimentLifeCycle.CREATED] with patch('dockerizer.builders.experiments.build_experiment') as mock_build: build_experiment(experiment_id=experiment.id) assert mock_build.call_count == 1 assert ExperimentStatus.objects.filter(experiment=experiment).count() == 4 assert list(ExperimentStatus.objects.filter(experiment=experiment).values_list( 'status', flat=True)) == [ExperimentLifeCycle.CREATED, ExperimentLifeCycle.BUILDING, ExperimentLifeCycle.SCHEDULED, ExperimentLifeCycle.FAILED] experiment.refresh_from_db() assert experiment.last_status == ExperimentLifeCycle.FAILED
def test_create_experiment_with_resources_spec(self, spawner_mock):
    """An experiment whose spec declares resources creates 3 jobs, each with
    an attached JobResources row, and reaches the STARTING status.
    """
    config = ExperimentSpecification.read(exec_experiment_resources_content)
    mock_instance = spawner_mock.return_value
    mock_instance.start_experiment.return_value = start_experiment_value
    mock_instance.spec = config

    experiment = ExperimentFactory(config=config.parsed_data)
    assert experiment.is_independent is True

    # Status history: CREATED -> SCHEDULED -> STARTING.
    assert ExperimentStatus.objects.filter(experiment=experiment).count() == 3
    assert list(ExperimentStatus.objects.filter(experiment=experiment).values_list(
        'status', flat=True)) == [ExperimentLifeCycle.CREATED,
                                  ExperimentLifeCycle.SCHEDULED,
                                  ExperimentLifeCycle.STARTING]
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.STARTING

    # Assert 3 jobs were created with resources
    assert ExperimentJob.objects.filter(experiment=experiment).count() == 3
    assert JobResources.objects.count() == 3
    jobs_statuses = ExperimentJob.objects.values_list('statuses__status', flat=True)
    assert set(jobs_statuses) == {JobLifeCycle.CREATED, }
    jobs = ExperimentJob.objects.filter(experiment=experiment)
    assert experiment.calculated_status == ExperimentLifeCycle.STARTING

    for job in jobs:
        # Assert the jobs status is created
        assert job.last_status == JobLifeCycle.CREATED
def test_set_metrics(self):
    """`experiments_set_metrics` accepts either a single metric dict or a
    list of metric dicts, creating one metric row per entry.
    """
    config = ExperimentSpecification.read(experiment_spec_content)
    experiment = ExperimentFactory(config=config.parsed_data)
    assert experiment.metrics.count() == 0

    create_at = timezone.now()
    # Single dict payload -> one metric row.
    experiments_set_metrics(experiment_id=experiment.id,
                            data={'created_at': create_at,
                                  'values': {'accuracy': 0.9, 'precision': 0.9}})
    assert experiment.metrics.count() == 1

    # List payload -> one row per entry (two more rows here).
    experiments_set_metrics(experiment_id=experiment.id,
                            data=[{'created_at': create_at,
                                   'values': {'accuracy': 0.9, 'precision': 0.9}},
                                  {'created_at': create_at,
                                   'values': {'accuracy': 0.9, 'precision': 0.9}}])
    assert experiment.metrics.count() == 3
def experiments_stop(project_name, project_uuid, experiment_name, experiment_group_name,
                     experiment_group_uuid, experiment_uuid, specification, update_status=True):
    """Tear down an experiment on the scheduler and, unless `update_status`
    is False, record a STOPPED status on the experiment.
    """
    parsed_spec = ExperimentSpecification.read(specification)
    experiment_scheduler.stop_experiment(
        project_name=project_name,
        project_uuid=project_uuid,
        experiment_name=experiment_name,
        experiment_group_name=experiment_group_name,
        experiment_group_uuid=experiment_group_uuid,
        experiment_uuid=experiment_uuid,
        specification=parsed_spec,
    )

    if not update_status:
        return

    experiment = get_valid_experiment(experiment_uuid=experiment_uuid)
    if not experiment:
        # The experiment disappeared between the stop call and now.
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_uuid)
        return

    # Update experiment status to show that its stopped
    experiment.set_status(ExperimentLifeCycle.STOPPED,
                          message='Experiment was stopped')
def test_serialize_with_environment_section(self):
    """Serializing an experiment created from a spec with an `environment`
    section exposes `resources` in the serialized data.
    """
    # NOTE(review): the YAML indentation below was reconstructed from a
    # whitespace-mangled source — confirm it still parses to the intended
    # nesting (environment.resources + environment.pytorch).
    spec_content = """---
    version: 1

    kind: experiment

    environment:
      resources:
        cpu:
          requests: 2
          limits: 4
        memory:
          requests: 4096
          limits: 10240

      pytorch:
        n_workers: 2
        default_worker:
          resources:
            cpu:
              requests: 2
              limits: 4
            memory:
              requests: 4096
              limits: 10240

    run:
      image: my_image
      cmd: video_prediction_train --model=DNA --num_masks=1
"""
    spec = ExperimentSpecification.read(spec_content)
    obj = self.factory_class(config=spec.parsed_data)
    serializer = self.serializer_class(obj)
    data = serializer.data
    assert 'resources' in data
def test_create_experiment_with_resources_spec(self, spawner_mock):
    """Distributed experiment with resources: the spawner mock reports
    master/worker/ps job uuids; 3 jobs are created, each with JobResources.
    """
    config = ExperimentSpecification.read(exec_experiment_resources_content)
    mock_instance = spawner_mock.return_value
    mock_instance.start_experiment.return_value = start_experiment_value
    # Fake uuids for the spawned task pods, one per task type.
    mock_instance.job_uuids = {
        'master': ['fa6203c189a855dd977019854a7ffcc3'],
        'worker': ['3a9c9b0bd56b5e9fbdbd1a3d43d57960'],
        'ps': ['59e3601232b85a3d8be2511f23a62945']}
    mock_instance.spec = config

    experiment = ExperimentFactory(config=config.parsed_data)
    assert experiment.is_independent is True

    # Status history: CREATED -> SCHEDULED -> STARTING.
    assert ExperimentStatus.objects.filter(experiment=experiment).count() == 3
    assert list(ExperimentStatus.objects.filter(experiment=experiment).values_list(
        'status', flat=True)) == [ExperimentLifeCycle.CREATED,
                                  ExperimentLifeCycle.SCHEDULED,
                                  ExperimentLifeCycle.STARTING]
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.STARTING

    # Assert 3 jobs were created with resources
    assert ExperimentJob.objects.filter(experiment=experiment).count() == 3
    assert JobResources.objects.count() == 3
    jobs_statuses = ExperimentJob.objects.values_list('statuses__status', flat=True)
    assert set(jobs_statuses) == {JobLifeCycle.CREATED, }
    jobs = ExperimentJob.objects.filter(experiment=experiment)
    assert experiment.calculated_status == ExperimentLifeCycle.STARTING

    for job in jobs:
        # Assert the jobs status is created
        assert job.last_status == JobLifeCycle.CREATED
def test_independent_experiment_creation_with_run_triggers_experiment_scheduling(self): config = ExperimentSpecification.read(exec_experiment_spec_content) # Create a repo for the project repo = RepoFactory() with patch('scheduler.tasks.experiments.experiments_build.apply_async') as mock_build: experiment = ExperimentFactory(config=config.parsed_data, project=repo.project) assert mock_build.call_count == 1 assert experiment.project.repo is not None assert experiment.is_independent is True assert ExperimentStatus.objects.filter(experiment=experiment).count() == 1 assert list(ExperimentStatus.objects.filter(experiment=experiment).values_list( 'status', flat=True)) == [ExperimentLifeCycle.CREATED] with patch('scheduler.dockerizer_scheduler.create_build_job') as mock_start: build = BuildJobFactory() BuildJobStatus.objects.create(status=JobLifeCycle.SUCCEEDED, job=build) mock_start.return_value = build, True, True experiments_build(experiment_id=experiment.id) assert mock_start.call_count == 1 assert ExperimentStatus.objects.filter(experiment=experiment).count() == 3 assert list(ExperimentStatus.objects.filter(experiment=experiment).values_list( 'status', flat=True)) == [ExperimentLifeCycle.CREATED, ExperimentLifeCycle.SCHEDULED, ExperimentLifeCycle.FAILED] experiment.refresh_from_db() assert experiment.last_status == ExperimentLifeCycle.FAILED
def test_independent_experiment_creation_triggers_experiment_scheduling(self):
    """Creating an independent experiment records the CREATED, SCHEDULED and
    FAILED statuses in order, leaving last_status == FAILED.
    """
    spec = ExperimentSpecification.read(experiment_spec_content)
    experiment = ExperimentFactory(config=spec.parsed_data)
    assert experiment.is_independent is True

    statuses = ExperimentStatus.objects.filter(experiment=experiment)
    assert statuses.count() == 3
    expected_history = [ExperimentLifeCycle.CREATED,
                        ExperimentLifeCycle.SCHEDULED,
                        ExperimentLifeCycle.FAILED]
    assert list(statuses.values_list('status', flat=True)) == expected_history

    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.FAILED
def experiments_stop(self,
                     project_name,
                     project_uuid,
                     experiment_name,
                     experiment_group_name,
                     experiment_group_uuid,
                     experiment_uuid,
                     specification,
                     update_status=True,
                     collect_logs=True,
                     message=None):
    """Stop an experiment: optionally archive its job logs, tear down its
    scheduled resources, and record a STOPPED status.

    Retries the task (while `self.request.retries < 2`) when the scheduler
    reports the delete did not happen.
    """
    if collect_logs:
        # Best effort: a log-collection failure must not block the stop.
        try:
            collectors.logs_collect_experiment_jobs(
                experiment_uuid=experiment_uuid)
        except OSError:
            _logger.warning(
                'Scheduler could not collect '
                'the logs for experiment `%s`.', experiment_name)
    if specification:
        specification = ExperimentSpecification.read(specification)
        deleted = experiment_scheduler.stop_experiment(
            project_name=project_name,
            project_uuid=project_uuid,
            experiment_name=experiment_name,
            experiment_group_name=experiment_group_name,
            experiment_group_uuid=experiment_group_uuid,
            experiment_uuid=experiment_uuid,
            specification=specification,
        )
    else:
        # No specification: skip the scheduler call and treat the delete as done.
        deleted = True
    if not deleted and self.request.retries < 2:
        _logger.info('Trying again to delete job `%s` in experiment.', experiment_name)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return
    if not update_status:
        return
    experiment = get_valid_experiment(experiment_uuid=experiment_uuid, include_deleted=True)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_uuid)
        return
    # Update experiment status to show that its stopped
    experiment.set_status(ExperimentLifeCycle.STOPPED,
                          message=message or 'Experiment was stopped')
def validate_experiment_spec_config(config, raise_for_rest: bool = False):
    """Parse `config` into an ExperimentSpecification.

    On a parsing/validation failure, raises a DRF `ValidationError` when
    `raise_for_rest` is True, otherwise a Django `ValidationError`, with the
    underlying error appended to the message.
    """
    try:
        return ExperimentSpecification.read(config)
    except (MarshmallowValidationError,
            PolyaxonfileError,
            PolyaxonConfigurationError) as e:
        message_error = 'Received non valid specification config. %s' % e
        # Pick the error type matching the caller's context (REST vs Django).
        error_class = ValidationError if raise_for_rest else DjangoValidationError
        raise error_class(message_error)
def test_set_metrics(self):
    """`experiments_set_metrics` (uuid-based signature) creates a single
    metric row for the experiment.
    """
    config = ExperimentSpecification.read(experiment_spec_content)
    experiment = ExperimentFactory(config=config.parsed_data)
    assert experiment.metrics.count() == 0

    create_at = timezone.now()
    experiments_set_metrics(experiment_uuid=experiment.uuid.hex,
                            created_at=create_at,
                            metrics={'accuracy': 0.9, 'precision': 0.9})
    assert experiment.metrics.count() == 1
def test_serialize_with_environment_section(self):
    """Serializing an experiment (fetched through the details query) from a
    spec with an `environment` section exposes `resources` in the output.
    """
    # NOTE(review): the YAML indentation below was reconstructed from a
    # whitespace-mangled source — confirm it still parses to the intended
    # nesting (environment.resources + environment.replicas).
    spec_content = """---
    version: 1

    kind: experiment

    framework: pytorch

    environment:
      resources:
        cpu:
          requests: 2
          limits: 4
        memory:
          requests: 4096
          limits: 10240

      replicas:
        n_workers: 2
        default_worker:
          resources:
            cpu:
              requests: 2
              limits: 4
            memory:
              requests: 4096
              limits: 10240

    build:
      image: foo

    run:
      cmd: video_prediction_train --model=DNA --num_masks=1
"""
    spec = ExperimentSpecification.read(spec_content)
    obj = self.factory_class(config=spec.parsed_data)
    # Serialize the object fetched via the details query, not the raw instance.
    obj1_query = queries.experiments_details.get(id=obj.id)
    serializer = self.serializer_class(obj1_query)
    data = serializer.data
    assert 'resources' in data
def experiments_stop(self,
                     project_name,
                     project_uuid,
                     experiment_name,
                     experiment_group_name,
                     experiment_group_uuid,
                     experiment_uuid,
                     specification,
                     update_status=True):
    """Stop a scheduled experiment and record a STOPPED status.

    Retries the task (while `self.request.retries < 2`) when the scheduler
    reports the delete did not happen.
    """
    specification = ExperimentSpecification.read(specification)
    deleted = experiment_scheduler.stop_experiment(
        project_name=project_name,
        project_uuid=project_uuid,
        experiment_name=experiment_name,
        experiment_group_name=experiment_group_name,
        experiment_group_uuid=experiment_group_uuid,
        experiment_uuid=experiment_uuid,
        specification=specification,
    )
    if not deleted and self.request.retries < 2:
        _logger.info('Trying again to delete job `%s` in experiment.', experiment_name)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return
    if not update_status:
        return
    experiment = get_valid_experiment(experiment_uuid=experiment_uuid)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_uuid)
        return
    # Update experiment status to show that its stopped
    experiment.set_status(ExperimentLifeCycle.STOPPED, message='Experiment was stopped')
- Flatten: - Dense: units: 10 activation: softmax train: data_pipeline: TFRecordImagePipeline: batch_size: 64 num_epochs: 1 shuffle: true dynamic_pad: false data_files: ["../data/mnist/mnist_train.tfrecord"] meta_data_file: "../data/mnist/meta_data.json" """ experiment_spec_parsed_content = ExperimentSpecification.read( experiment_spec_content) exec_experiment_spec_content = """--- version: 1 kind: experiment tags: [fixtures] build: image: my_image run: cmd: video_prediction_train --model=DNA --num_masks=1 """
def create_experiment(self, config):
    """Build an Experiment in this test's project from a raw spec payload."""
    spec = ExperimentSpecification.read(config)
    return ExperimentFactory(config=spec.parsed_data,
                             project=self.project)