def start_experiment_run(spec_config, experiment_idx, task_type, task_id, schedule):
    """Prepare the experiment for (task_type, task_id) and invoke the requested schedule.

    `schedule` names a method on the prepared Experiment (e.g. 'train'); its
    result is returned to the caller.
    """
    specification = Specification.read(spec_config)
    experiment = prepare_experiment_run(
        specification, experiment_idx, task_type, int(task_id))
    # Resolve the schedule by name and run it.
    return getattr(experiment, schedule)()
def run_experiment(polyaxonfile, xp):
    """Run experiment `xp` from the polyaxonfile.

    Non-distributed experiments run synchronously in-process; distributed ones
    spawn one process per cluster task (master, workers, parameter servers)
    and join them all before returning.
    """
    plx_file = Specification.read(polyaxonfile)
    logging.info("running Experiment n: {}".format(xp))
    cluster, is_distributed = plx_file.get_cluster_def_at(xp)
    if not is_distributed:
        # Single machine: run the master schedule synchronously.
        start_experiment_run(plx_file, xp, TaskType.MASTER, 0,
                             'continuous_train_and_eval')
        current_run['finished'] = True
        return
    env = {
        'polyaxonfile': json.dumps(polyaxonfile.get_parsed_data_at(xp)),
        'task_type': TaskType.MASTER,
        'experiment_id': xp,
        'task_id': 0,
        'schedule': 'train_and_evaluate'
    }
    create_process(env)
    # BUG FIX: `xrange` is Python 2 only; sibling functions in this file use
    # `range`, so use it here as well for Python 3 compatibility.
    # NOTE(review): the same `env` dict is mutated and re-passed for each
    # task — assumes create_process copies/serializes it before returning.
    for i in range(cluster.get(TaskType.WORKER, 0)):
        env['task_id'] = i
        env['task_type'] = TaskType.WORKER
        env['schedule'] = 'train'
        create_process(env)
    for i in range(cluster.get(TaskType.PS, 0)):
        env['task_id'] = i
        env['task_type'] = TaskType.PS
        env['schedule'] = 'run_std_server'
        create_process(env)
    # Wait for every spawned task process to terminate.
    for job in jobs:
        job.join()
def parse(cls, data):
    """Validate raw Polyaxonfile data and parse it section by section.

    Raises PolyaxonfileError on any section name not declared by the
    Specification; returns a dict of parsed sections.
    """
    cls.validate_version(data)
    known_sections = Specification.sections()
    # Reject any section the specification does not declare.
    for key in set(six.iterkeys(data)) - known_sections:
        raise PolyaxonfileError(
            "Unexpected section `{}` in Polyaxonfile version `{}`."
            "Please check the Polyaxonfile specification "
            "for this version.".format(key, 'v1'))
    parsed_data = {'version': data['version']}
    # Declarations are parsed against themselves; matrix against declarations.
    if 'declarations' in data:
        parsed_data['declarations'] = cls.parse_expression(
            data['declarations'], data['declarations'])
    if 'matrix' in data:
        parsed_data['matrix'] = cls.parse_expression(
            data['matrix'], parsed_data.get('declarations'))
    default_declarations = parsed_data.get('declarations', {})
    for section in Specification.SECTIONS:
        if section in data:
            parsed_data[section] = cls.parse_expression(
                data[section], default_declarations)
    for section in Specification.GRAPH_SECTIONS:
        if section in data:
            parsed_data[section] = cls.parse_expression(
                data[section], default_declarations, True, True)
    return parsed_data
def test_create_experiment_with_resources_spec(self, spawner_mock):
    """Creating an experiment with a resources spec creates jobs with resources."""
    content = Specification.read(exec_experiment_resources_content)
    mock_instance = spawner_mock.return_value
    mock_instance.start_experiment.return_value = start_experiment_value
    mock_instance.spec = content
    experiment = ExperimentFactory(config=content.parsed_data)
    assert experiment.is_independent is True
    assert ExperimentStatus.objects.filter(experiment=experiment).count() == 3
    recorded_statuses = list(
        ExperimentStatus.objects.filter(experiment=experiment).values_list(
            'status', flat=True))
    assert recorded_statuses == [
        ExperimentLifeCycle.CREATED,
        ExperimentLifeCycle.SCHEDULED,
        ExperimentLifeCycle.STARTING,
    ]
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.STARTING
    # Assert 3 job were created with resources
    assert ExperimentJob.objects.filter(experiment=experiment).count() == 3
    assert JobResources.objects.count() == 3
    jobs_statuses = ExperimentJob.objects.values_list('statuses__status', flat=True)
    assert set(jobs_statuses) == {JobLifeCycle.CREATED}
    jobs = ExperimentJob.objects.filter(experiment=experiment)
    assert experiment.calculated_status == ExperimentLifeCycle.STARTING
    for job in jobs:
        # Assert the jobs status is created
        assert job.last_status == JobLifeCycle.CREATED
def test_independent_experiment_creation_with_run_triggers_experiment_building_scheduling(
        self):
    """An experiment with a `run` section triggers a docker build then scheduling."""
    content = Specification.read(exec_experiment_spec_content)
    # Create a repo for the project
    repo = RepoFactory()
    with patch('dockerizer.builders.experiments.build_experiment'
               ) as mock_docker_build:
        experiment = ExperimentFactory(config=content.parsed_data,
                                       project=repo.project)
    assert mock_docker_build.call_count == 1
    assert experiment.project.repo is not None
    assert experiment.is_independent is True
    assert ExperimentStatus.objects.filter(experiment=experiment).count() == 3
    recorded_statuses = list(
        ExperimentStatus.objects.filter(experiment=experiment).values_list(
            'status', flat=True))
    assert recorded_statuses == [
        ExperimentLifeCycle.CREATED,
        ExperimentLifeCycle.BUILDING,
        ExperimentLifeCycle.SCHEDULED,
    ]
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.SCHEDULED
def _get_run_configs(spec_config, experiment_idx):
    """Build per-task RunConfigs from a specification.

    Returns (configs, is_distributed) where `configs` maps task type to a
    list of RunConfigs indexed by task id.

    NOTE(review): `experiment_idx` is not used by this body — kept for
    signature compatibility with callers; confirm whether it should select
    an experiment.
    """
    spec = Specification.read(spec_config)
    environment = spec.environment
    cluster_def, is_distributed = spec.cluster_def

    def get_master_config(config, task_type=None, task_id=None):
        # Clone the config; only tag it with task identity when given.
        config = RunConfig.from_config(config)
        if task_type is None and task_id is None:
            return config
        return config.replace(task_type=task_type, task_id=task_id)

    config = environment.run_config or RunConfig.CONFIG()
    if not is_distributed:
        return {TaskType.MASTER: [get_master_config(config)]}, False

    if spec.is_local:
        config.cluster = spec.get_local_cluster()
    else:
        # Get value from env
        cluster_dict = get_cluster_def()
        config.cluster = ClusterConfig.from_dict(cluster_dict)

    def build_task_configs(task_type, session_configs):
        # One tagged config per task id; attach a session config when declared.
        task_configs = []
        for idx in range(cluster_def.get(task_type, 0)):
            task_config = get_master_config(config, task_type=task_type, task_id=idx)
            session_config = session_configs.get(idx)
            if session_config:
                task_config = task_config.replace(
                    session_config=RunConfig.get_session_config(session_config))
            task_configs.append(task_config)
        return task_configs

    configs = {TaskType.MASTER: [get_master_config(config, TaskType.MASTER, 0)]}
    # Only create entries for task types that the cluster actually declares.
    if cluster_def.get(TaskType.WORKER, 0) > 0:
        configs[TaskType.WORKER] = build_task_configs(
            TaskType.WORKER, spec.worker_configs)
    if cluster_def.get(TaskType.PS, 0) > 0:
        configs[TaskType.PS] = build_task_configs(TaskType.PS, spec.ps_configs)
    return configs, True
def _get_run_configs(polyaxonfile, experiment_id):
    """Build per-task RunConfigs for one experiment of a polyaxonfile.

    Returns (configs, is_distributed) where `configs` maps task type to a
    list of RunConfigs indexed by task id; workers/PS fall back to the
    environment's default session configs when no per-index config exists.
    """
    plx_file = Specification.read(polyaxonfile)
    environment = plx_file.get_environment_at(experiment_id)
    cluster_def, is_distributed = plx_file.get_cluster_def_at(experiment_id)

    def get_master_config(config, task_type=None, task_id=None):
        # Clone the config; only tag it with task identity when given.
        config = RunConfig.from_config(config)
        if task_type is None and task_id is None:
            return config
        return config.replace(task_type=task_type, task_id=task_id)

    config = environment.run_config or RunConfig.CONFIG()
    if not is_distributed:
        return {TaskType.MASTER: [get_master_config(config)]}, False

    config.cluster = plx_file.get_cluster(experiment=experiment_id)

    # TODO: Replace with plxfile.get_worker_configs_at
    worker_session_configs = {
        session_config.index: session_config
        for session_config in environment.worker_configs or []
    }
    ps_session_configs = {
        session_config.index: session_config
        for session_config in environment.ps_configs or []
    }

    def build_task_configs(task_type, session_configs, default_session_config):
        # One tagged config per task id; per-index session config wins over
        # the environment default.
        task_configs = []
        for idx in range(cluster_def.get(task_type, 0)):
            task_config = get_master_config(config, task_type=task_type, task_id=idx)
            session_config = session_configs.get(idx, default_session_config)
            if session_config:
                task_config = task_config.replace(
                    session_config=RunConfig.get_session_config(session_config))
            task_configs.append(task_config)
        return task_configs

    configs = {TaskType.MASTER: [get_master_config(config, TaskType.MASTER, 0)]}
    # Only create entries for task types that the cluster actually declares.
    if cluster_def.get(TaskType.WORKER, 0) > 0:
        configs[TaskType.WORKER] = build_task_configs(
            TaskType.WORKER, worker_session_configs,
            environment.default_worker_config)
    if cluster_def.get(TaskType.PS, 0) > 0:
        configs[TaskType.PS] = build_task_configs(
            TaskType.PS, ps_session_configs, environment.default_ps_config)
    return configs, True
def test_set_metrics(self):
    """set_metrics attaches one metrics record to the experiment."""
    spec = Specification.read(experiment_spec_content)
    experiment = ExperimentFactory(config=spec.parsed_data)
    assert experiment.metrics.count() == 0
    now = datetime.utcnow()
    set_metrics(experiment_uuid=experiment.uuid.hex,
                created_at=now,
                metrics={'accuracy': 0.9, 'precision': 0.9})
    assert experiment.metrics.count() == 1
def run(polyaxonfile):
    """Run every experiment in the matrix space, one at a time.

    Polls the shared `current_run` state every 10s until the master of the
    current experiment reports it finished, then resets the state for the
    next experiment.
    """
    plx_file = Specification.read(polyaxonfile)
    for xp in range(plx_file.matrix_space):
        run_experiment(plx_file, xp)
        # Block until this experiment's master finishes.
        while not current_run['finished']:
            check_master_process()
            time.sleep(10)
        current_run['finished'] = False
        current_run['master'] = None
def __init__(self,
             project_name,
             experiment_name,
             project_uuid,
             experiment_uuid,
             spec_config,
             experiment_group_uuid=None,
             experiment_group_name=None,
             k8s_config=None,
             namespace='default',
             in_cluster=False,
             job_container_name=None,
             job_docker_image=None,
             sidecar_container_name=None,
             sidecar_docker_image=None,
             role_label=None,
             type_label=None,
             ports=None,
             use_sidecar=False,
             sidecar_config=None,
             sidecar_args_fn=None,
             persist=False):
    """Initialize the spawner: parse the spec, record identity fields,
    build the PodManager, and delegate k8s connection setup to the base class.
    """
    # Parse the experiment specification up front.
    self.specification = Specification.read(spec_config)
    # Project / experiment identity.
    self.project_name = project_name
    self.experiment_group_name = experiment_group_name
    self.experiment_name = experiment_name
    self.project_uuid = project_uuid
    self.experiment_group_uuid = experiment_group_uuid
    self.experiment_uuid = experiment_uuid
    # Pod manager carries all naming/labeling/sidecar details for created pods.
    self.pod_manager = pods.PodManager(
        namespace=namespace,
        project_name=self.project_name,
        experiment_group_name=self.experiment_group_name,
        experiment_name=self.experiment_name,
        project_uuid=self.project_uuid,
        experiment_group_uuid=self.experiment_group_uuid,
        experiment_uuid=experiment_uuid,
        job_container_name=job_container_name,
        job_docker_image=job_docker_image,
        sidecar_container_name=sidecar_container_name,
        sidecar_docker_image=sidecar_docker_image,
        role_label=role_label,
        type_label=type_label,
        ports=ports,
        use_sidecar=use_sidecar,
        sidecar_config=sidecar_config)
    # Fall back to the default sidecar args builder when none is given.
    self.sidecar_args_fn = sidecar_args_fn or constants.SIDECAR_ARGS_FN
    self.persist = persist
    super(K8SSpawner, self).__init__(k8s_config=k8s_config,
                                     namespace=namespace,
                                     in_cluster=in_cluster)
def test_independent_experiment_creation_triggers_experiment_scheduling(self):
    """Creating an independent experiment records CREATED then SCHEDULED statuses."""
    spec = Specification.read(experiment_spec_content)
    experiment = ExperimentFactory(config=spec.parsed_data)
    assert experiment.is_independent is True
    assert ExperimentStatus.objects.filter(experiment=experiment).count() == 2
    recorded_statuses = list(
        ExperimentStatus.objects.filter(experiment=experiment).values_list(
            'status', flat=True))
    assert recorded_statuses == [ExperimentLifeCycle.CREATED,
                                 ExperimentLifeCycle.SCHEDULED]
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.SCHEDULED
    # Assert also that experiment is monitored
    assert experiment.last_status == ExperimentLifeCycle.SCHEDULED
def prepare_experiment_run(spec_config, experiment_idx,
                           task_type=TaskType.MASTER, task_id=0):
    """Build an Experiment for one task of the given specification.

    Validates (task_type, task_id) against the cluster definition, resolves
    the output dir and log level (local vs remote), constructs run configs,
    input fns and estimator, and returns the assembled Experiment.

    Raises:
        ValueError: if (task_type, task_id) is not declared by the cluster.
    """
    spec = Specification.read(spec_config)
    cluster, _ = spec.cluster_def
    invalid_task = (task_type not in cluster
                    or not isinstance(cluster[task_type], int)
                    or task_id >= cluster[task_type])
    if invalid_task:
        raise ValueError('task_type, task_id `{}, {}` is not supported by '
                         'the specification file passed.'.format(
                             task_type, task_id))
    environment = spec.environment
    # Local runs write to the project path; remote runs use env-provided values.
    if spec.is_local:
        output_dir = spec.project_path
        log_level = LOGGING_LEVEL[spec.settings.logging.level]
    else:
        output_dir = get_outputs_path()
        log_level = get_log_level()
    if not environment:
        # No environment section: default single-master setup.
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False
    else:
        tf.logging.set_verbosity(log_level)
        configs, _ = _get_run_configs(spec, experiment_idx)
        delay_workers_by_global_step = environment.delay_workers_by_global_step
    train_input_fn, train_steps, train_hooks = _get_train(spec.train)
    (eval_input_fn, eval_steps, eval_hooks, eval_delay_secs,
     continuous_eval_throttle_secs) = _get_eval(spec.eval)
    estimator = getters.get_estimator(spec.model,
                                      configs[task_type][task_id],
                                      output_dir=output_dir)
    return Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=train_steps,
        eval_steps=eval_steps,
        train_hooks=train_hooks,
        eval_hooks=eval_hooks,
        eval_delay_secs=eval_delay_secs,
        continuous_eval_throttle_secs=continuous_eval_throttle_secs,
        delay_workers_by_global_step=delay_workers_by_global_step,
        export_strategies=spec.settings.export_strategies)
def run_all(polyaxonfile):
    """Run every experiment in the matrix space with one process per run.

    The first run of each experiment gets the 'train_and_evaluate' schedule;
    the rest get 'train'. Spawned processes are tracked in the shared `jobs`
    list and joined before returning.
    """
    plx_file = Specification.read(polyaxonfile)
    for xp in range(plx_file.matrix_space):
        xp_runs = prepare_all_experiment_runs(polyaxonfile, xp)
        for idx, xp_run in enumerate(xp_runs):
            schedule = 'train_and_evaluate' if idx == 0 else 'train'
            worker = Process(target=getattr(xp_run, schedule))
            worker.start()
            jobs.append(worker)
    # Wait for all spawned run processes to finish.
    for job in jobs:
        job.join()
def prepare_all_experiment_runs(polyaxonfile, experiment_id):
    """Build Experiments for every task of one experiment (master first).

    Returns a list with the master Experiment, followed by worker and PS
    Experiments when the run is distributed.
    """
    plx_file = Specification.read(polyaxonfile)
    is_distributed = False
    if not plx_file.get_environment_at(experiment_id):
        # No environment section: default single-master setup.
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False
    else:
        tf.logging.set_verbosity(
            LOGGING_LEVEL[plx_file.settings.logging.level])
        # NOTE(review): sibling code passes the plx_file itself to
        # _get_run_configs; here the environment object is passed instead —
        # confirm which argument _get_run_configs expects.
        configs, is_distributed = _get_run_configs(
            plx_file.settings.environment, experiment_id)
        delay_workers_by_global_step = (
            plx_file.settings.environment.delay_workers_by_global_step)
    train_input_fn, train_steps, train_hooks = _get_train(
        plx_file.get_train_at(experiment_id))
    (eval_input_fn, eval_steps, eval_hooks, eval_delay_secs,
     continuous_eval_throttle_secs) = _get_eval(
        plx_file.get_eval_at(experiment_id))

    def get_experiment(config):
        # Assemble one Experiment around an estimator built for `config`.
        estimator = getters.get_estimator(plx_file.model,
                                          config,
                                          output_dir=plx_file.project_path)
        return Experiment(
            estimator=estimator,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            train_steps=train_steps,
            eval_steps=eval_steps,
            train_hooks=train_hooks,
            eval_hooks=eval_hooks,
            eval_delay_secs=eval_delay_secs,
            continuous_eval_throttle_secs=continuous_eval_throttle_secs,
            delay_workers_by_global_step=delay_workers_by_global_step,
            export_strategies=plx_file.settings.export_strategies)

    xps = [get_experiment(configs[TaskType.MASTER][0])]
    if not is_distributed:
        return xps
    # Distributed: add one Experiment per worker and parameter server.
    for worker_config in configs.get(TaskType.WORKER, []):
        xps.append(get_experiment(worker_config))
    for ps_config in configs.get(TaskType.PS, []):
        xps.append(get_experiment(ps_config))
    return xps
def prepare_experiment_run(polyaxonfile, experiment_id,
                           task_type=TaskType.MASTER, task_id=0):
    """Build an Experiment for one task of one experiment in the polyaxonfile.

    Validates (task_type, task_id) against the cluster definition, resolves
    run configs, input fns and the estimator, and returns the assembled
    Experiment.

    Raises:
        ValueError: if (task_type, task_id) is not declared by the cluster.
    """
    plx_file = Specification.read(polyaxonfile)
    cluster, _ = plx_file.get_cluster_def_at(experiment_id)
    invalid_task = (task_type not in cluster
                    or not isinstance(cluster[task_type], int)
                    or task_id >= cluster[task_type])
    if invalid_task:
        raise ValueError('task_type, task_id `{}, {}` is not supported by '
                         'the specification file passed.'.format(task_type,
                                                                 task_id))
    environment = plx_file.get_environment_at(experiment_id)
    if not environment:
        # No environment section: default single-master setup.
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False
    else:
        tf.logging.set_verbosity(
            LOGGING_LEVEL[plx_file.settings.logging.level])
        configs, _ = _get_run_configs(plx_file, experiment_id)
        delay_workers_by_global_step = environment.delay_workers_by_global_step
    train_input_fn, train_steps, train_hooks = _get_train(
        plx_file.get_train_at(experiment_id))
    (eval_input_fn, eval_steps, eval_hooks, eval_delay_secs,
     continuous_eval_throttle_secs) = _get_eval(
        plx_file.get_eval_at(experiment_id))
    estimator = getters.get_estimator(
        plx_file.get_model_at(experiment_id),
        configs[task_type][task_id],
        output_dir=plx_file.get_project_path_at(experiment_id))
    return Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=train_steps,
        eval_steps=eval_steps,
        train_hooks=train_hooks,
        eval_hooks=eval_hooks,
        eval_delay_secs=eval_delay_secs,
        continuous_eval_throttle_secs=continuous_eval_throttle_secs,
        delay_workers_by_global_step=delay_workers_by_global_step,
        export_strategies=plx_file.settings.export_strategies)
- Flatten: - Dense: units: 10 activation: softmax train: data_pipeline: TFRecordImagePipeline: batch_size: 64 num_epochs: 1 shuffle: true dynamic_pad: false data_files: ["../data/mnist/mnist_train.tfrecord"] meta_data_file: "../data/mnist/meta_data.json" """ experiment_spec_parsed_content = Specification.read(experiment_spec_content) exec_experiment_spec_content = """--- version: 1 project: name: project1 run: image: my_image cmd: video_prediction_train --model=DNA --num_masks=1 """ exec_experiment_spec_parsed_content = Specification.read( exec_experiment_spec_content)
def start_experiment_run(polyaxonfile, experiment_id, task_type, task_id, schedule):
    """Prepare the experiment for (task_type, task_id) and invoke the requested schedule.

    `experiment_id` and `task_id` may arrive as strings (e.g. from env vars)
    and are coerced to int before use.
    """
    plx_file = Specification.read(polyaxonfile)
    experiment = prepare_experiment_run(
        plx_file, int(experiment_id), task_type, int(task_id))
    # Resolve the schedule by name and run it.
    return getattr(experiment, schedule)()
def create_experiment(self, content):
    """Create an experiment for this project, with the docker build mocked out."""
    spec = Specification.read(content)
    with patch('repos.dockerize.build_experiment') as _:
        return ExperimentFactory(config=spec.parsed_data,
                                 project=self.project)
def compiled_spec(self):
    """Build a Specification from this experiment's uuid and stored config values."""
    spec = Specification(experiment=self.uuid, values=self.config)
    return spec