Code Example #1
File: manager.py Project: gridl/polyaxon-lib
def start_experiment_run(spec_config, experiment_idx, task_type, task_id,
                         schedule):
    spec = Specification.read(spec_config)
    experiment = prepare_experiment_run(spec, experiment_idx, task_type,
                                        int(task_id))
    task = getattr(experiment, schedule)
    return task()
Code Example #2
def run_experiment(polyaxonfile, xp):
    plx_file = Specification.read(polyaxonfile)
    logging.info("running Experiment n: {}".format(xp))
    cluster, is_distributed = plx_file.get_cluster_def_at(xp)
    if not is_distributed:
        start_experiment_run(plx_file, xp, TaskType.MASTER, 0,
                             'continuous_train_and_eval')
        current_run['finished'] = True
    else:
        env = {
            'polyaxonfile': json.dumps(plx_file.get_parsed_data_at(xp)),
            'task_type': TaskType.MASTER,
            'experiment_id': xp,
            'task_id': 0,
            'schedule': 'train_and_evaluate'
        }

        create_process(env)

        for i in xrange(cluster.get(TaskType.WORKER, 0)):
            env['task_id'] = i
            env['task_type'] = TaskType.WORKER
            env['schedule'] = 'train'
            create_process(env)

        for i in xrange(cluster.get(TaskType.PS, 0)):
            env['task_id'] = i
            env['task_type'] = TaskType.PS
            env['schedule'] = 'run_std_server'
            create_process(env)

        for job in jobs:
            job.join()
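`create_process` and the module-level `jobs` list are not part of this excerpt. A minimal sketch, assuming each task is launched in its own local process and handed off to `start_experiment_run` (Code Example #17); the helper below is hypothetical, not the project's actual implementation:

from multiprocessing import Process

jobs = []  # processes spawned for the current experiment

def create_process(env):
    # Hypothetical helper: run one task of the experiment in a separate process.
    p = Process(
        target=start_experiment_run,
        args=(env['polyaxonfile'], env['experiment_id'],
              env['task_type'], env['task_id'], env['schedule']),
    )
    p.start()
    jobs.append(p)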
Code Example #3
    def parse(cls, data):
        cls.validate_version(data)
        sections = Specification.sections()
        for key in (set(six.iterkeys(data)) - sections):
            raise PolyaxonfileError(
                "Unexpected section `{}` in Polyaxonfile version `{}`."
                "Please check the Polyaxonfile specification "
                "for this version.".format(key, 'v1'))

        parsed_data = {
            'version': data['version'],
        }

        if 'declarations' in data:
            parsed_data['declarations'] = cls.parse_expression(
                data['declarations'], data['declarations'])

        if 'matrix' in data:
            parsed_data['matrix'] = cls.parse_expression(
                data['matrix'], parsed_data.get('declarations'))

        for section in Specification.SECTIONS:
            if section in data:
                parsed_data[section] = cls.parse_expression(
                    data[section], parsed_data.get('declarations', {}))

        for section in Specification.GRAPH_SECTIONS:
            if section in data:
                parsed_data[section] = cls.parse_expression(
                    data[section], parsed_data.get('declarations', {}), True,
                    True)

        return parsed_data
Code Example #4
File: test_models.py Project: www3838438/polyaxon
    def test_create_experiment_with_resources_spec(self, spawner_mock):
        content = Specification.read(exec_experiment_resources_content)

        mock_instance = spawner_mock.return_value
        mock_instance.start_experiment.return_value = start_experiment_value
        mock_instance.spec = content

        experiment = ExperimentFactory(config=content.parsed_data)
        assert experiment.is_independent is True

        assert ExperimentStatus.objects.filter(
            experiment=experiment).count() == 3
        assert list(
            ExperimentStatus.objects.filter(experiment=experiment).values_list(
                'status', flat=True)) == [
                    ExperimentLifeCycle.CREATED, ExperimentLifeCycle.SCHEDULED,
                    ExperimentLifeCycle.STARTING
                ]
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.STARTING

        # Assert 3 jobs were created with resources
        assert ExperimentJob.objects.filter(experiment=experiment).count() == 3
        assert JobResources.objects.count() == 3
        jobs_statuses = ExperimentJob.objects.values_list('statuses__status',
                                                          flat=True)
        assert set(jobs_statuses) == {
            JobLifeCycle.CREATED,
        }
        jobs = ExperimentJob.objects.filter(experiment=experiment)
        assert experiment.calculated_status == ExperimentLifeCycle.STARTING

        for job in jobs:
            # Assert each job's status is CREATED
            assert job.last_status == JobLifeCycle.CREATED
Code Example #5
    def test_independent_experiment_creation_with_run_triggers_experiment_building_scheduling(
            self):
        content = Specification.read(exec_experiment_spec_content)
        # Create a repo for the project
        repo = RepoFactory()

        with patch('dockerizer.builders.experiments.build_experiment'
                   ) as mock_docker_build:
            experiment = ExperimentFactory(config=content.parsed_data,
                                           project=repo.project)

        assert mock_docker_build.call_count == 1
        assert experiment.project.repo is not None
        assert experiment.is_independent is True

        assert ExperimentStatus.objects.filter(
            experiment=experiment).count() == 3
        assert list(
            ExperimentStatus.objects.filter(experiment=experiment).values_list(
                'status', flat=True)) == [
                    ExperimentLifeCycle.CREATED, ExperimentLifeCycle.BUILDING,
                    ExperimentLifeCycle.SCHEDULED
                ]
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.SCHEDULED
Code Example #6
File: manager.py Project: gridl/polyaxon-lib
def _get_run_configs(spec_config, experiment_idx):
    spec = Specification.read(spec_config)
    environment = spec.environment
    cluster_def, is_distributed = spec.cluster_def

    def get_master_config(config, task_type=None, task_id=None):
        config = RunConfig.from_config(config)
        if task_type is None and task_id is None:
            return config
        return config.replace(task_type=task_type, task_id=task_id)

    config = environment.run_config or RunConfig.CONFIG()

    if not is_distributed:
        return {TaskType.MASTER: [get_master_config(config)]}, False

    if spec.is_local:
        config.cluster = spec.get_local_cluster()
    else:
        # Get value from env
        cluster_dict = get_cluster_def()
        config.cluster = ClusterConfig.from_dict(cluster_dict)

    configs = {
        TaskType.MASTER: [get_master_config(config, TaskType.MASTER, 0)]
    }

    if cluster_def.get(TaskType.WORKER, 0) > 0:
        configs[TaskType.WORKER] = []

    if cluster_def.get(TaskType.PS, 0) > 0:
        configs[TaskType.PS] = []

    worker_session_configs = spec.worker_configs
    ps_session_configs = spec.ps_configs

    for i in range(cluster_def.get(TaskType.WORKER, 0)):
        w_config = get_master_config(config,
                                     task_type=TaskType.WORKER,
                                     task_id=i)
        session_config = worker_session_configs.get(i)
        if session_config:
            session_config = RunConfig.get_session_config(session_config)
            w_config = w_config.replace(session_config=session_config)

        configs[TaskType.WORKER].append(w_config)

    for i in range(cluster_def.get(TaskType.PS, 0)):
        ps_config = get_master_config(config, task_type=TaskType.PS, task_id=i)
        session_config = ps_session_configs.get(i)
        if session_config:
            session_config = RunConfig.get_session_config(session_config)
            ps_config = ps_config.replace(session_config=session_config)

        configs[TaskType.PS].append(ps_config)

    return configs, True
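The mapping returned above is keyed by task type and indexed by task id, which is how `prepare_experiment_run` (Code Example #12) selects `configs[task_type][task_id]`. A minimal usage sketch, assuming `spec_config` holds a Polyaxonfile path or dict defining a distributed cluster with at least one worker:

configs, is_distributed = _get_run_configs(spec_config, 0)

master_config = configs[TaskType.MASTER][0]
if is_distributed and configs.get(TaskType.WORKER):
    first_worker_config = configs[TaskType.WORKER][0]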
Code Example #7
def _get_run_configs(polyaxonfile, experiment_id):
    plx_file = Specification.read(polyaxonfile)
    environment = plx_file.get_environment_at(experiment_id)
    cluster_def, is_distributed = plx_file.get_cluster_def_at(experiment_id)

    def get_master_config(config, task_type=None, task_id=None):
        config = RunConfig.from_config(config)
        if task_type is None and task_id is None:
            return config
        return config.replace(task_type=task_type, task_id=task_id)

    config = environment.run_config or RunConfig.CONFIG()

    if not is_distributed:
        return {TaskType.MASTER: [get_master_config(config)]}, False

    config.cluster = plx_file.get_cluster(experiment=experiment_id)

    configs = {TaskType.MASTER: [get_master_config(config, TaskType.MASTER, 0)]}

    if cluster_def.get(TaskType.WORKER, 0) > 0:
        configs[TaskType.WORKER] = []

    if cluster_def.get(TaskType.PS, 0) > 0:
        configs[TaskType.PS] = []

    # TODO: Replace with plxfile.get_worker_configs_at
    worker_session_configs = {}
    for session_config in environment.worker_configs or []:
        worker_session_configs[session_config.index] = session_config

    ps_session_configs = {}
    for session_config in environment.ps_configs or []:
        ps_session_configs[session_config.index] = session_config

    default_worker_config = environment.default_worker_config
    for i in range(cluster_def.get(TaskType.WORKER, 0)):
        w_config = get_master_config(config, task_type=TaskType.WORKER, task_id=i)
        session_config = worker_session_configs.get(i, default_worker_config)
        if session_config:
            session_config = RunConfig.get_session_config(session_config)
            w_config = w_config.replace(session_config=session_config)

        configs[TaskType.WORKER].append(w_config)

    default_ps_config = environment.default_ps_config
    for i in range(cluster_def.get(TaskType.PS, 0)):
        ps_config = get_master_config(config, task_type=TaskType.PS, task_id=i)
        session_config = ps_session_configs.get(i, default_ps_config)
        if session_config:
            session_config = RunConfig.get_session_config(session_config)
            ps_config = ps_config.replace(session_config=session_config)

        configs[TaskType.PS].append(ps_config)

    return configs, True
Code Example #8
    def test_set_metrics(self):
        content = Specification.read(experiment_spec_content)
        experiment = ExperimentFactory(config=content.parsed_data)
        assert experiment.metrics.count() == 0

        create_at = datetime.utcnow()
        set_metrics(experiment_uuid=experiment.uuid.hex,
                    created_at=create_at,
                    metrics={'accuracy': 0.9, 'precision': 0.9})

        assert experiment.metrics.count() == 1
Code Example #9
def run(polyaxonfile):
    plx_file = Specification.read(polyaxonfile)
    for xp in range(plx_file.matrix_space):
        run_experiment(plx_file, xp)

        while not current_run['finished']:
            check_master_process()
            time.sleep(10)

        current_run['finished'] = False
        current_run['master'] = None
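The loop above relies on a module-level `current_run` dict and a `check_master_process` helper that are not shown in this excerpt. A minimal sketch of such a monitor, under the assumption that `current_run['master']` holds the `multiprocessing.Process` running the master task:

def check_master_process():
    # Hypothetical monitor: once the master process has exited,
    # flag the run as finished so run() can move on to the next experiment.
    master = current_run.get('master')
    if master is not None and not master.is_alive():
        current_run['finished'] = True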
Code Example #10
    def __init__(self,
                 project_name,
                 experiment_name,
                 project_uuid,
                 experiment_uuid,
                 spec_config,
                 experiment_group_uuid=None,
                 experiment_group_name=None,
                 k8s_config=None,
                 namespace='default',
                 in_cluster=False,
                 job_container_name=None,
                 job_docker_image=None,
                 sidecar_container_name=None,
                 sidecar_docker_image=None,
                 role_label=None,
                 type_label=None,
                 ports=None,
                 use_sidecar=False,
                 sidecar_config=None,
                 sidecar_args_fn=None,
                 persist=False):
        self.specification = Specification.read(spec_config)
        self.project_name = project_name
        self.experiment_group_name = experiment_group_name
        self.experiment_name = experiment_name
        self.project_uuid = project_uuid
        self.experiment_group_uuid = experiment_group_uuid
        self.experiment_uuid = experiment_uuid
        self.pod_manager = pods.PodManager(
            namespace=namespace,
            project_name=self.project_name,
            experiment_group_name=self.experiment_group_name,
            experiment_name=self.experiment_name,
            project_uuid=self.project_uuid,
            experiment_group_uuid=self.experiment_group_uuid,
            experiment_uuid=experiment_uuid,
            job_container_name=job_container_name,
            job_docker_image=job_docker_image,
            sidecar_container_name=sidecar_container_name,
            sidecar_docker_image=sidecar_docker_image,
            role_label=role_label,
            type_label=type_label,
            ports=ports,
            use_sidecar=use_sidecar,
            sidecar_config=sidecar_config)
        self.sidecar_args_fn = sidecar_args_fn or constants.SIDECAR_ARGS_FN
        self.persist = persist

        super(K8SSpawner, self).__init__(k8s_config=k8s_config,
                                         namespace=namespace,
                                         in_cluster=in_cluster)
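A hypothetical instantiation of this spawner; every value below is a placeholder rather than something taken from the project, and `start_experiment` is the spawner method mocked in Code Example #4:

spawner = K8SSpawner(
    project_name='mnist',            # placeholder values
    experiment_name='mnist.1',
    project_uuid='a1b2c3',
    experiment_uuid='d4e5f6',
    spec_config=spec_config,         # a Polyaxonfile path, string, or dict
    namespace='polyaxon',
    in_cluster=True,
    use_sidecar=True,
)
spawner.start_experiment()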
Code Example #11
    def test_independent_experiment_creation_triggers_experiment_scheduling(self):
        content = Specification.read(experiment_spec_content)
        experiment = ExperimentFactory(config=content.parsed_data)
        assert experiment.is_independent is True

        assert ExperimentStatus.objects.filter(experiment=experiment).count() == 2
        assert list(ExperimentStatus.objects.filter(experiment=experiment).values_list(
            'status', flat=True)) == [ExperimentLifeCycle.CREATED, ExperimentLifeCycle.SCHEDULED]
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.SCHEDULED

        # Assert also that experiment is monitored
        assert experiment.last_status == ExperimentLifeCycle.SCHEDULED
Code Example #12
File: manager.py Project: gridl/polyaxon-lib
def prepare_experiment_run(spec_config,
                           experiment_idx,
                           task_type=TaskType.MASTER,
                           task_id=0):
    spec = Specification.read(spec_config)
    cluster, _ = spec.cluster_def

    if (task_type not in cluster or not isinstance(cluster[task_type], int)
            or task_id >= cluster[task_type]):
        raise ValueError('task_type, task_id `{}, {}` is not supported by '
                         'the specification file passed.'.format(
                             task_type, task_id))

    env = spec.environment

    if spec.is_local:
        output_dir = spec.project_path
        log_level = LOGGING_LEVEL[spec.settings.logging.level]
    else:
        output_dir = get_outputs_path()
        log_level = get_log_level()

    if not env:
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False
    else:
        tf.logging.set_verbosity(log_level)
        configs, _ = _get_run_configs(spec, experiment_idx)
        delay_workers_by_global_step = env.delay_workers_by_global_step

    train_input_fn, train_steps, train_hooks = _get_train(spec.train)
    (eval_input_fn, eval_steps, eval_hooks, eval_delay_secs,
     continuous_eval_throttle_secs) = _get_eval(spec.eval)

    estimator = getters.get_estimator(spec.model,
                                      configs[task_type][task_id],
                                      output_dir=output_dir)

    return Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=train_steps,
        eval_steps=eval_steps,
        train_hooks=train_hooks,
        eval_hooks=eval_hooks,
        eval_delay_secs=eval_delay_secs,
        continuous_eval_throttle_secs=continuous_eval_throttle_secs,
        delay_workers_by_global_step=delay_workers_by_global_step,
        export_strategies=spec.settings.export_strategies)
Code Example #13
def run_all(polyaxonfile):
    plx_file = Specification.read(polyaxonfile)
    for xp in range(plx_file.matrix_space):
        xp_runs = prepare_all_experiment_runs(polyaxonfile, xp)
        for i, xp_run in enumerate(xp_runs):
            if i == 0:
                schedule = 'train_and_evaluate'
            else:
                schedule = 'train'
            p = Process(target=getattr(xp_run, schedule))
            p.start()
            jobs.append(p)

        for job in jobs:
            job.join()
Code Example #14
def prepare_all_experiment_runs(polyaxonfile, experiment_id):
    plx_file = Specification.read(polyaxonfile)
    is_distributed = False

    if not plx_file.get_environment_at(experiment_id):
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False
    else:
        tf.logging.set_verbosity(LOGGING_LEVEL[plx_file.settings.logging.level])
        configs, is_distributed = _get_run_configs(plx_file.settings.environment, experiment_id)
        delay_workers_by_global_step = plx_file.settings.environment.delay_workers_by_global_step

    train_input_fn, train_steps, train_hooks = _get_train(plx_file.get_train_at(experiment_id))
    (eval_input_fn, eval_steps, eval_hooks, eval_delay_secs,
     continuous_eval_throttle_secs) = _get_eval(plx_file.get_eval_at(experiment_id))

    def get_experiment(config):
        estimator = getters.get_estimator(plx_file.model,
                                          config,
                                          output_dir=plx_file.project_path)

        return Experiment(
            estimator=estimator,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            train_steps=train_steps,
            eval_steps=eval_steps,
            train_hooks=train_hooks,
            eval_hooks=eval_hooks,
            eval_delay_secs=eval_delay_secs,
            continuous_eval_throttle_secs=continuous_eval_throttle_secs,
            delay_workers_by_global_step=delay_workers_by_global_step,
            export_strategies=plx_file.settings.export_strategies)

    xps = [get_experiment(configs[TaskType.MASTER][0])]
    if not is_distributed:
        return xps

    for i_config in configs.get(TaskType.WORKER, []):
        xps.append(get_experiment(i_config))

    for i_config in configs.get(TaskType.PS, []):
        xps.append(get_experiment(i_config))

    return xps
Code Example #15
def prepare_experiment_run(polyaxonfile, experiment_id, task_type=TaskType.MASTER, task_id=0):
    plx_file = Specification.read(polyaxonfile)
    cluster, _ = plx_file.get_cluster_def_at(experiment_id)

    if (task_type not in cluster or
            not isinstance(cluster[task_type], int) or
            task_id >= cluster[task_type]):
        raise ValueError('task_type, task_id `{}, {}` is not supported by '
                         'the specification file passed.'.format(task_type, task_id))

    env = plx_file.get_environment_at(experiment_id)
    if not env:
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False
    else:
        tf.logging.set_verbosity(LOGGING_LEVEL[plx_file.settings.logging.level])
        configs, _ = _get_run_configs(plx_file, experiment_id)
        delay_workers_by_global_step = env.delay_workers_by_global_step

    train_input_fn, train_steps, train_hooks = _get_train(plx_file.get_train_at(experiment_id))
    (eval_input_fn, eval_steps, eval_hooks, eval_delay_secs,
     continuous_eval_throttle_secs) = _get_eval(plx_file.get_eval_at(experiment_id))

    estimator = getters.get_estimator(plx_file.get_model_at(experiment_id),
                                      configs[task_type][task_id],
                                      output_dir=plx_file.get_project_path_at(experiment_id))

    return Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=train_steps,
        eval_steps=eval_steps,
        train_hooks=train_hooks,
        eval_hooks=eval_hooks,
        eval_delay_secs=eval_delay_secs,
        continuous_eval_throttle_secs=continuous_eval_throttle_secs,
        delay_workers_by_global_step=delay_workers_by_global_step,
        export_strategies=plx_file.settings.export_strategies)
Code Example #16
          - Flatten:
          - Dense:
              units: 10
              activation: softmax

    train:
      data_pipeline:
        TFRecordImagePipeline:
          batch_size: 64
          num_epochs: 1
          shuffle: true
          dynamic_pad: false
          data_files: ["../data/mnist/mnist_train.tfrecord"]
          meta_data_file: "../data/mnist/meta_data.json"
"""
experiment_spec_parsed_content = Specification.read(experiment_spec_content)

exec_experiment_spec_content = """---
    version: 1

    project:
      name: project1

    run:
      image: my_image
      cmd: video_prediction_train --model=DNA --num_masks=1
"""

exec_experiment_spec_parsed_content = Specification.read(
    exec_experiment_spec_content)
Code Example #17
def start_experiment_run(polyaxonfile, experiment_id, task_type, task_id, schedule):
    plx_file = Specification.read(polyaxonfile)
    experiment = prepare_experiment_run(plx_file, int(experiment_id), task_type, int(task_id))
    task = getattr(experiment, schedule)
    return task()
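A hypothetical invocation of this entry point for a single, non-distributed task; the file name is a placeholder and the schedule matches the one used in Code Example #2:

start_experiment_run('polyaxonfile.yml',
                     experiment_id=0,
                     task_type=TaskType.MASTER,
                     task_id=0,
                     schedule='continuous_train_and_eval')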
Code Example #18
File: test_models.py Project: www3838438/polyaxon
    def create_experiment(self, content):
        config = Specification.read(content)
        with patch('repos.dockerize.build_experiment') as _:
            return ExperimentFactory(config=config.parsed_data,
                                     project=self.project)
Code Example #19
File: models.py Project: uber-haupt/polyaxon
    def compiled_spec(self):
        return Specification(experiment=self.uuid, values=self.config)