コード例 #1
0
ファイル: local_runner.py プロジェクト: saadmahboob/polyaxon
def start_experiment_run(polyaxonfile, experiment_id, task_type, task_id,
                         schedule):
    """Prepare the experiment for one task and invoke the requested schedule.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`.
        experiment_id: index of the experiment to prepare.
        task_type: task type forwarded to `prepare_experiment_run`.
        task_id: task index forwarded to `prepare_experiment_run`.
        schedule: name of the attribute looked up on the prepared experiment;
            it is called with no arguments.

    Returns:
        Whatever the scheduled callable returns.
    """
    spec = PolyaxonFile.read(polyaxonfile)
    prepared = prepare_experiment_run(spec, experiment_id, task_type, task_id)
    # The schedule is resolved by name and invoked immediately.
    return getattr(prepared, schedule)()
コード例 #2
0
ファイル: manager.py プロジェクト: saadmahboob/polyaxon
def _get_run_configs(polyaxonfile, experiment_id):
    """Build the per-task run configs for one experiment.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`.
        experiment_id: index of the experiment whose environment and cluster
            definition are used.

    Returns:
        A `(configs, is_distributed)` tuple. `configs` maps a task type to a
        list of run configs, one per task index. In the non-distributed case
        only `TaskType.MASTER` is present, holding a single-element list.
    """
    plx_file = PolyaxonFile.read(polyaxonfile)
    environment = plx_file.get_environment_at(experiment_id)
    cluster_def, is_distributed = plx_file.get_cluster_def_at(experiment_id)

    def get_master_config(config, task_type=None, task_id=None):
        # Derive a fresh RunConfig from the base config; the task identity is
        # only stamped on when at least one of task_type / task_id is given.
        config = RunConfig.from_config(config)
        if task_type is None and task_id is None:
            return config
        return config.replace(task_type=task_type, task_id=task_id)

    def get_task_configs(task_type, indexed_session_configs,
                         default_session_config):
        # One config per task index of `task_type`; a session config declared
        # for a specific index wins over the default one.
        session_by_index = {}
        for session_config in indexed_session_configs or []:
            session_by_index[session_config.index] = session_config

        task_configs = []
        for i in range(cluster_def.get(task_type, 0)):
            task_config = get_master_config(config,
                                            task_type=task_type,
                                            task_id=i)
            session_config = session_by_index.get(i, default_session_config)
            if session_config:
                session_config = RunConfig.get_session_config(session_config)
                task_config = task_config.replace(
                    session_config=session_config)
            task_configs.append(task_config)
        return task_configs

    config = environment.run_config or RunConfig.CONFIG()

    if not is_distributed:
        # Wrapped in a list so the result has the same shape as the
        # distributed branch: callers index it as configs[task_type][task_id].
        return {TaskType.MASTER: [get_master_config(config)]}, False

    config.cluster = plx_file.get_cluster(experiment=experiment_id)

    configs = {
        TaskType.MASTER: [get_master_config(config, TaskType.MASTER, 0)]
    }

    # A task type only gets a key when the cluster definition declares at
    # least one task of that type.
    worker_configs = get_task_configs(TaskType.WORKER,
                                      environment.worker_configs,
                                      environment.default_worker_config)
    if worker_configs:
        configs[TaskType.WORKER] = worker_configs

    ps_configs = get_task_configs(TaskType.PS,
                                  environment.ps_configs,
                                  environment.default_ps_config)
    if ps_configs:
        configs[TaskType.PS] = ps_configs

    return configs, True
コード例 #3
0
def run(polyaxonfile):
    """Run every experiment of the polyaxonfile, one after the other.

    For each index in the file's matrix space the experiment is launched with
    `run_experiment`, then the function polls until `current_run['finished']`
    is truthy, calling `check_master_process` and sleeping 10 seconds between
    checks. The `current_run` entries are reset before the next experiment.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`.
    """
    spec = PolyaxonFile.read(polyaxonfile)
    for xp_index in range(spec.matrix_space):
        run_experiment(spec.experiment_specs[xp_index], xp_index)

        # Poll until the shared run state is flagged as finished.
        while True:
            if current_run['finished']:
                break
            check_master_process()
            time.sleep(10)

        # Reset the shared state for the next experiment.
        current_run['finished'] = False
        current_run['master'] = None
コード例 #4
0
def init(project, model, run):
    """Init a new polyaxonfile specification."""
    # NOTE(review): the docstring above is kept verbatim in case it is
    # surfaced as CLI help text by a decorator outside this view -- confirm.
    #
    # Flow:
    #   1. Resolve the project and fetch its config (exit 1 on HTTP error).
    #   2. Require at least one of `model` / `run` (exit 1 otherwise); when
    #      both are given, `model` takes precedence.
    #   3. Create the init file. On success store the project config,
    #      initialise the ignore config and exit 0.
    #   4. Otherwise the file either could not be written (exit 1) or was
    #      already there; in the latter case it is checked against the
    #      project before the configs are re-initialised.
    user, project_name = get_project_or_local(project)
    try:
        project_config = PolyaxonClients().project.get_project(user, project_name)
    except PolyaxonHTTPError:
        Printer.print_error('Make sure you have a project with this name `{}`'.format(project))
        Printer.print_error('You can a new project with this command: '
                            'polyaxon project create --name={} --description=...'.format(project))
        sys.exit(1)

    if not (model or run):
        Printer.print_error("You must specify which an init option, "
                            "possible values: `--model` or `--run`.")
        sys.exit(1)

    result = False
    if model:
        result = create_init_file(constants.INIT_FILE_MODEL, project)

    elif run:
        result = create_init_file(constants.INIT_FILE_RUN, project)

    if result:
        ProjectManager.set_config(project_config, init=True)
        IgnoreManager.init_config()
        Printer.print_success(
            "Polyaxonfile was created successfully `{}`".format(constants.INIT_FILE))
        # Success path: terminate with a zero status so shells and scripts do
        # not treat a successful init as a failure.
        sys.exit(0)

    # if we are here the file was not created
    if not os.path.isfile(constants.INIT_FILE):
        Printer.print_error(
            "Something went wrong, init command did not create a file.\n"
            "Possible reasons: you don't have the write to create the file.")
        sys.exit(1)

    # file was already there, let's check if the project passed correspond to this file
    spec = PolyaxonFile.read(constants.INIT_FILE)
    # NOTE(review): this compares the project's `api_url` with the name stored
    # in the file -- confirm that `api_url` is the intended attribute.
    if project_config.api_url != spec.project.name:
        Printer.print_error(
            "Something went wrong, init command did not create a file.\n"
            "Anothor file already exist with different "
            "project name `{}`.".format(spec.project.name))
        sys.exit(1)

    # At this point we check if we need to re init configurations
    ProjectManager.set_config(project_config, init=True)
    IgnoreManager.init_config()
    Printer.print_success(
        "Polyaxonfile was created successfully `{}`".format(constants.INIT_FILE))
コード例 #5
0
ファイル: manager.py プロジェクト: saadmahboob/polyaxon
def prepare_all_experiment_runs(polyaxonfile, experiment_id):
    """Build one `Experiment` per task of the given experiment.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`.
        experiment_id: index of the experiment to prepare.

    Returns:
        A list of `Experiment` objects. The first one is built from the first
        master config; when the run is distributed, one experiment per worker
        config and then one per PS config follow.
    """
    plx_file = PolyaxonFile.read(polyaxonfile)
    is_distributed = False

    if not plx_file.get_environment_at(experiment_id):
        # No environment section: default logging and a default run config.
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False
    else:
        tf.logging.set_verbosity(
            LOGGING_LEVEL[plx_file.settings.logging.level])
        # _get_run_configs parses its first argument with PolyaxonFile.read,
        # so it must receive the polyaxonfile itself rather than the
        # environment section.
        configs, is_distributed = _get_run_configs(polyaxonfile, experiment_id)
        delay_workers_by_global_step = plx_file.settings.environment.delay_workers_by_global_step

    train_input_fn, train_steps, train_hooks = _get_train(
        plx_file.get_train_at(experiment_id))
    (eval_input_fn, eval_steps, eval_hooks, eval_delay_secs,
     continuous_eval_throttle_secs) = _get_eval(
         plx_file.get_eval_at(experiment_id))

    def get_experiment(config):
        # Every task gets its own estimator built from its own run config;
        # the train/eval settings are shared by all tasks.
        estimator = getters.get_estimator(plx_file.model,
                                          config,
                                          output_dir=plx_file.project_path)

        return Experiment(
            estimator=estimator,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            train_steps=train_steps,
            eval_steps=eval_steps,
            train_hooks=train_hooks,
            eval_hooks=eval_hooks,
            eval_delay_secs=eval_delay_secs,
            continuous_eval_throttle_secs=continuous_eval_throttle_secs,
            delay_workers_by_global_step=delay_workers_by_global_step,
            export_strategies=plx_file.settings.export_strategies)

    xps = [get_experiment(configs[TaskType.MASTER][0])]
    if not is_distributed:
        return xps

    # Workers first, then parameter servers; callers rely on the master
    # experiment being at position 0.
    for task_type in (TaskType.WORKER, TaskType.PS):
        for i_config in configs.get(task_type, []):
            xps.append(get_experiment(i_config))

    return xps
コード例 #6
0
ファイル: local_runner.py プロジェクト: saadmahboob/polyaxon
def run_all(polyaxonfile):
    """Start every prepared run of every experiment in its own process.

    For each index in the file's matrix space, the runs returned by
    `prepare_all_experiment_runs` are started as `Process` objects: the first
    run uses its `train_and_evaluate` attribute as target, all others use
    `train`. Each process is appended to `jobs` (defined outside this block),
    and every entry of `jobs` is joined before moving to the next experiment.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read` and to
            `prepare_all_experiment_runs`.
    """
    spec = PolyaxonFile.read(polyaxonfile)
    for xp_index in range(spec.matrix_space):
        prepared_runs = prepare_all_experiment_runs(polyaxonfile, xp_index)
        for position, prepared in enumerate(prepared_runs):
            # Only the first run also evaluates.
            schedule = 'train_and_evaluate' if position == 0 else 'train'
            worker = Process(target=getattr(prepared, schedule))
            worker.start()
            jobs.append(worker)

        for running in jobs:
            running.join()
0
def check_polyaxonfile(file):
    """Validate the given polyaxonfile(s) and return the parsed result.

    Args:
        file: a path or collection of paths; normalised with `to_list`.

    Returns:
        The object returned by `PolyaxonFile.read` when parsing succeeds.

    Exits the process with status 1 when none of the paths is an existing
    file, or when `PolyaxonFile.read` raises.
    """
    file = to_list(file)

    if not any(os.path.isfile(f) for f in file):
        Printer.print_error('Polyaxonfile is not present, '
                            'please run {}'.format(constants.INIT_COMMAND))
        sys.exit(1)

    # Only the parsing call is guarded, so a failure elsewhere is not
    # misreported as an invalid file.
    try:
        plx_file = PolyaxonFile.read(file)
    except Exception as e:
        Printer.print_error("Polyaxonfile is not valid")
        # Surface the underlying reason instead of discarding it.
        Printer.print_error('Error message `{}`.'.format(e))
        sys.exit(1)

    Printer.print_success("Polyaxonfile valid")
    return plx_file
コード例 #8
0
ファイル: manager.py プロジェクト: saadmahboob/polyaxon
def prepare_experiment_run(polyaxonfile,
                           experiment_id,
                           task_type=TaskType.MASTER,
                           task_id=0):
    """Build the `Experiment` for a single task of the given experiment.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`.
        experiment_id: index of the experiment to prepare.
        task_type: task type to build the experiment for.
        task_id: index of the task within its type.

    Returns:
        An `Experiment` whose estimator uses the run config selected by
        `task_type` and `task_id`.

    Raises:
        ValueError: if `task_type` is missing from the cluster definition,
            its entry is not an int, or `task_id` is not below that entry.
    """
    spec = PolyaxonFile.read(polyaxonfile)
    cluster_def, _ = spec.get_cluster_def_at(experiment_id)

    # A missing task type is treated like a non-integer entry: both reject.
    replicas = cluster_def[task_type] if task_type in cluster_def else None
    if not isinstance(replicas, int) or task_id >= replicas:
        message = ('task_type, task_id `{}, {}` is not supported by '
                   'the specification file passed.')
        raise ValueError(message.format(task_type, task_id))

    environment = spec.get_environment_at(experiment_id)
    if environment:
        tf.logging.set_verbosity(LOGGING_LEVEL[spec.settings.logging.level])
        run_configs, _ = _get_run_configs(spec, experiment_id)
        delay_workers = environment.delay_workers_by_global_step
    else:
        # No environment section: default logging and a default run config.
        tf.logging.set_verbosity(tf.logging.INFO)
        run_configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers = False

    train_input_fn, train_steps, train_hooks = _get_train(
        spec.get_train_at(experiment_id))
    eval_settings = _get_eval(spec.get_eval_at(experiment_id))
    (eval_input_fn, eval_steps, eval_hooks, eval_delay_secs,
     continuous_eval_throttle_secs) = eval_settings

    task_config = run_configs[task_type][task_id]
    estimator = getters.get_estimator(
        spec.get_model_at(experiment_id),
        task_config,
        output_dir=spec.get_project_path_at(experiment_id))

    return Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=train_steps,
        eval_steps=eval_steps,
        train_hooks=train_hooks,
        eval_hooks=eval_hooks,
        eval_delay_secs=eval_delay_secs,
        continuous_eval_throttle_secs=continuous_eval_throttle_secs,
        delay_workers_by_global_step=delay_workers,
        export_strategies=spec.settings.export_strategies)
コード例 #9
0
ファイル: local_runner.py プロジェクト: saadmahboob/polyaxon
def run(polyaxonfile):
    """Run every experiment of the polyaxonfile, locally or as processes.

    For each index in the file's matrix space:

    * non-distributed: the experiment is run in-process through
      `start_experiment_run` with the `continuous_train_and_eval` schedule,
      and `current_run['finished']` is set to True afterwards;
    * distributed: `create_process` is called once for the master, once per
      worker (`train` schedule) and once per parameter server
      (`run_std_server` schedule), then every entry of `jobs` is joined.

    The function then waits, sleeping 30 seconds at a time, until
    `current_run['finished']` is truthy, and resets it before the next
    experiment.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`; also forwarded to
            the spawned processes through the `env` dict.
    """
    plx_file = PolyaxonFile.read(polyaxonfile)
    for xp in range(plx_file.matrix_space):
        logging.info("running Experiment n: {}".format(xp))
        cluster, is_distributed = plx_file.get_cluster_def_at(xp)
        if not is_distributed:
            start_experiment_run(plx_file, xp, TaskType.MASTER, 0,
                                 'continuous_train_and_eval')
            current_run['finished'] = True
        else:
            env = {
                'polyaxonfile': polyaxonfile,
                'task_type': TaskType.MASTER,
                'task_id': 0,
                'schedule': 'train_and_evaluate'
            }

            create_process(env)

            # The same dict is updated in place and reused for each task.
            # `range` keeps these loops working on both Python 2 and 3.
            for i in range(cluster.get(TaskType.WORKER, 0)):
                env['task_id'] = i
                env['task_type'] = TaskType.WORKER
                env['schedule'] = 'train'
                create_process(env)

            for i in range(cluster.get(TaskType.PS, 0)):
                env['task_id'] = i
                env['task_type'] = TaskType.PS
                env['schedule'] = 'run_std_server'
                create_process(env)

            for job in jobs:
                job.join()

        # NOTE(review): in the distributed branch nothing in this function
        # sets the flag; presumably it is flipped elsewhere -- confirm.
        while not current_run['finished']:
            time.sleep(30)

        current_run['finished'] = False