def start_experiment_run(polyaxonfile, experiment_id, task_type, task_id, schedule):
    """Prepare the experiment for one task and execute the requested schedule.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`.
        experiment_id: experiment identifier forwarded to
            `prepare_experiment_run`.
        task_type: task type forwarded to `prepare_experiment_run`.
        task_id: task index forwarded to `prepare_experiment_run`.
        schedule: name of the attribute of the prepared experiment to call.

    Returns:
        Whatever the scheduled callable returns.
    """
    spec = PolyaxonFile.read(polyaxonfile)
    prepared = prepare_experiment_run(spec, experiment_id, task_type, task_id)
    # The schedule is resolved by name and invoked without arguments.
    return getattr(prepared, schedule)()
def _get_run_configs(polyaxonfile, experiment_id):
    """Build the run configs of every task of one experiment.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`.
        experiment_id: experiment identifier used to look up the environment,
            the cluster definition and the cluster.

    Returns:
        A `(configs, is_distributed)` tuple. `configs` maps a task type to a
        list of run configs indexed by task id. The master entry is always a
        one-element list, including for non-distributed runs, so callers can
        uniformly index `configs[task_type][task_id]`. Worker and ps entries
        are only present when the cluster definition declares at least one
        such task.
    """
    plx_file = PolyaxonFile.read(polyaxonfile)
    environment = plx_file.get_environment_at(experiment_id)
    cluster_def, is_distributed = plx_file.get_cluster_def_at(experiment_id)

    def get_task_config(config, task_type=None, task_id=None):
        """Copy `config`, optionally bound to a given task type and id."""
        config = RunConfig.from_config(config)
        if task_type is None and task_id is None:
            return config
        return config.replace(task_type=task_type, task_id=task_id)

    config = environment.run_config or RunConfig.CONFIG()

    if not is_distributed:
        # Wrapped in a list to match the distributed shape: callers index
        # the master entry by task id.
        return {TaskType.MASTER: [get_task_config(config)]}, False

    config.cluster = plx_file.get_cluster(experiment=experiment_id)

    def get_task_configs(task_type, session_configs, default_session_config):
        """Build one run config per declared task of `task_type`."""
        # Per-task session configs are keyed by their `index`; tasks without
        # an explicit entry fall back to the default session config.
        indexed_session_configs = {
            session_config.index: session_config
            for session_config in session_configs or []
        }
        task_configs = []
        for i in range(cluster_def.get(task_type, 0)):
            task_config = get_task_config(config, task_type=task_type, task_id=i)
            session_config = indexed_session_configs.get(i, default_session_config)
            if session_config:
                session_config = RunConfig.get_session_config(session_config)
                task_config = task_config.replace(session_config=session_config)
            task_configs.append(task_config)
        return task_configs

    configs = {
        TaskType.MASTER: [get_task_config(config, TaskType.MASTER, 0)]
    }

    worker_configs = get_task_configs(
        TaskType.WORKER,
        environment.worker_configs,
        environment.default_worker_config)
    if worker_configs:
        configs[TaskType.WORKER] = worker_configs

    ps_configs = get_task_configs(
        TaskType.PS,
        environment.ps_configs,
        environment.default_ps_config)
    if ps_configs:
        configs[TaskType.PS] = ps_configs

    return configs, True
def run(polyaxonfile):
    """Run every experiment of the polyaxonfile, one after the other.

    Each experiment is started with `run_experiment`, then the function polls
    the shared `current_run` state every 10 seconds, calling
    `check_master_process` on each round, until the run is flagged finished.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`.
    """
    spec_file = PolyaxonFile.read(polyaxonfile)
    for xp_index in range(spec_file.matrix_space):
        run_experiment(spec_file.experiment_specs[xp_index], xp_index)

        # Poll until the shared run state reports completion.
        while True:
            if current_run['finished']:
                break
            check_master_process()
            time.sleep(10)

        # Reset the shared state before moving on to the next experiment.
        current_run['finished'] = False
        current_run['master'] = None
def init(project, model, run):
    """Init a new polyaxonfile specification."""
    # NOTE: the docstring above is deliberately left as-is; it may be
    # surfaced to users as command help.
    #
    # Flow:
    #   1. resolve the project and fetch its config (exit 1 on HTTP error);
    #   2. require at least one of `model` / `run` (`model` wins if both);
    #   3. try to create the init file; on success store the project config,
    #      initialise the ignore file and exit 0;
    #   4. otherwise check an existing init file against the project and
    #      re-initialise the local configuration.
    user, project_name = get_project_or_local(project)
    try:
        project_config = PolyaxonClients().project.get_project(user, project_name)
    except PolyaxonHTTPError:
        Printer.print_error('Make sure you have a project with this name `{}`'.format(project))
        Printer.print_error('You can a new project with this command: '
                            'polyaxon project create --name={} --description=...'.format(project))
        sys.exit(1)

    # At least one option is required.
    if not (model or run):
        Printer.print_error("You must specify which an init option, "
                            "possible values: `--model` or `--run`.")
        sys.exit(1)

    init_file_type = constants.INIT_FILE_MODEL if model else constants.INIT_FILE_RUN
    created = create_init_file(init_file_type, project)

    if created:
        ProjectManager.set_config(project_config, init=True)
        IgnoreManager.init_config()
        Printer.print_success(
            "Polyaxonfile was created successfully `{}`".format(constants.INIT_FILE))
        # Success must be reported with a zero exit status.
        sys.exit(0)

    # if we are here the file was not created
    if not os.path.isfile(constants.INIT_FILE):
        Printer.print_error(
            "Something went wrong, init command did not create a file.\n"
            "Possible reasons: you don't have the write to create the file.")
        sys.exit(1)

    # file was already there, let's check if the project passed correspond to this file
    spec = PolyaxonFile.read(constants.INIT_FILE)
    # NOTE(review): this compares the project's `api_url` with the name found
    # in the existing file - confirm both values use the same format.
    if project_config.api_url != spec.project.name:
        Printer.print_error(
            "Something went wrong, init command did not create a file.\n"
            "Anothor file already exist with different "
            "project name `{}`.".format(spec.project.name))
        sys.exit(1)

    # At this point we check if we need to re init configurations
    ProjectManager.set_config(project_config, init=True)
    IgnoreManager.init_config()
    Printer.print_success(
        "Polyaxonfile was created successfully `{}`".format(constants.INIT_FILE))
def prepare_all_experiment_runs(polyaxonfile, experiment_id):
    """Build one `Experiment` per task of the given experiment.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`.
        experiment_id: experiment identifier used for every per-experiment
            lookup (environment, train/eval sections, model, project path).

    Returns:
        A list of `Experiment` objects. The first one is built from the
        master config; for distributed runs it is followed by one experiment
        per worker config and then one per ps config.
    """
    plx_file = PolyaxonFile.read(polyaxonfile)
    is_distributed = False

    env = plx_file.get_environment_at(experiment_id)
    if not env:
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False
    else:
        tf.logging.set_verbosity(
            LOGGING_LEVEL[plx_file.settings.logging.level])
        # `_get_run_configs` reads the polyaxonfile itself and looks up the
        # experiment's environment, so it must receive the file, not the
        # environment section.
        configs, is_distributed = _get_run_configs(plx_file, experiment_id)
        delay_workers_by_global_step = env.delay_workers_by_global_step

    train_input_fn, train_steps, train_hooks = _get_train(
        plx_file.get_train_at(experiment_id))
    (eval_input_fn, eval_steps, eval_hooks, eval_delay_secs,
     continuous_eval_throttle_secs) = _get_eval(
        plx_file.get_eval_at(experiment_id))

    def get_experiment(config):
        """Create the experiment running under the given run config."""
        estimator = getters.get_estimator(
            plx_file.get_model_at(experiment_id),
            config,
            output_dir=plx_file.get_project_path_at(experiment_id))

        return Experiment(
            estimator=estimator,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            train_steps=train_steps,
            eval_steps=eval_steps,
            train_hooks=train_hooks,
            eval_hooks=eval_hooks,
            eval_delay_secs=eval_delay_secs,
            continuous_eval_throttle_secs=continuous_eval_throttle_secs,
            delay_workers_by_global_step=delay_workers_by_global_step,
            export_strategies=plx_file.settings.export_strategies)

    # Accept either a list of configs or a single config for the master entry.
    master_config = configs[TaskType.MASTER]
    if isinstance(master_config, (list, tuple)):
        master_config = master_config[0]

    xps = [get_experiment(master_config)]
    if not is_distributed:
        return xps

    for task_type in (TaskType.WORKER, TaskType.PS):
        xps.extend(get_experiment(i_config) for i_config in configs.get(task_type, []))

    return xps
def run_all(polyaxonfile):
    """Run all tasks of every experiment in separate processes.

    For each experiment, one process is started per prepared experiment run:
    the first run uses the `train_and_evaluate` schedule, every other run
    uses `train`. Started processes are appended to the shared `jobs` list,
    and every process in that list is joined before the next experiment is
    started.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read` and to
            `prepare_all_experiment_runs`.
    """
    spec_file = PolyaxonFile.read(polyaxonfile)
    for xp_index in range(spec_file.matrix_space):
        experiment_runs = prepare_all_experiment_runs(polyaxonfile, xp_index)
        for position, experiment_run in enumerate(experiment_runs):
            # Only the first run evaluates; the remaining ones just train.
            schedule = 'train' if position else 'train_and_evaluate'
            process = Process(target=getattr(experiment_run, schedule))
            process.start()
            jobs.append(process)

        for process in jobs:
            process.join()
def check_polyaxonfile(file):
    """Validate the given polyaxonfile(s) and return the parsed specification.

    Args:
        file: a path or a collection of paths; normalised with `to_list`.

    Returns:
        The object returned by `PolyaxonFile.read` when parsing succeeds.

    Exits the process with status 1 when none of the paths is an existing
    file, or when reading the polyaxonfile raises.
    """
    file = to_list(file)

    if not any(os.path.isfile(f) for f in file):
        Printer.print_error('Polyaxonfile is not present, '
                            'please run {}'.format(constants.INIT_COMMAND))
        sys.exit(1)

    # Only the read itself is guarded, so unrelated failures are not
    # misreported as an invalid polyaxonfile.
    try:
        plx_file = PolyaxonFile.read(file)
    except Exception as e:
        Printer.print_error("Polyaxonfile is not valid")
        # Surface the underlying reason instead of silently dropping it.
        Printer.print_error('Error message `{}`.'.format(e))
        sys.exit(1)

    Printer.print_success("Polyaxonfile valid")
    return plx_file
def prepare_experiment_run(polyaxonfile, experiment_id, task_type=TaskType.MASTER, task_id=0):
    """Build the `Experiment` for a single task of one experiment.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read`.
        experiment_id: experiment identifier used for every per-experiment
            lookup.
        task_type: key of the task in the cluster definition.
        task_id: index of the task; must be lower than the number of tasks
            the cluster definition declares for `task_type`.

    Returns:
        The `Experiment` built with the run config of the requested task.

    Raises:
        ValueError: if `task_type` is missing from the cluster definition,
            its task count is not an int, or `task_id` is not below it.
    """
    spec = PolyaxonFile.read(polyaxonfile)
    cluster_def, _ = spec.get_cluster_def_at(experiment_id)

    task_is_supported = (
        task_type in cluster_def
        and isinstance(cluster_def[task_type], int)
        and task_id < cluster_def[task_type]
    )
    if not task_is_supported:
        raise ValueError('task_type, task_id `{}, {}` is not supported by '
                         'the specification file passed.'.format(
                             task_type, task_id))

    environment = spec.get_environment_at(experiment_id)
    if environment:
        tf.logging.set_verbosity(
            LOGGING_LEVEL[spec.settings.logging.level])
        configs, _ = _get_run_configs(spec, experiment_id)
        delay_workers_by_global_step = environment.delay_workers_by_global_step
    else:
        # No environment section: default logging level and a single
        # default master config.
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False

    train_section = spec.get_train_at(experiment_id)
    train_input_fn, train_steps, train_hooks = _get_train(train_section)

    eval_section = spec.get_eval_at(experiment_id)
    (eval_input_fn,
     eval_steps,
     eval_hooks,
     eval_delay_secs,
     continuous_eval_throttle_secs) = _get_eval(eval_section)

    estimator = getters.get_estimator(
        spec.get_model_at(experiment_id),
        configs[task_type][task_id],
        output_dir=spec.get_project_path_at(experiment_id))

    return Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=train_steps,
        eval_steps=eval_steps,
        train_hooks=train_hooks,
        eval_hooks=eval_hooks,
        eval_delay_secs=eval_delay_secs,
        continuous_eval_throttle_secs=continuous_eval_throttle_secs,
        delay_workers_by_global_step=delay_workers_by_global_step,
        export_strategies=spec.settings.export_strategies)
def run(polyaxonfile):
    """Run every experiment of the polyaxonfile, one after the other.

    A non-distributed experiment is run in-process with the
    `continuous_train_and_eval` schedule. A distributed experiment gets one
    process per task through `create_process`: the master
    (`train_and_evaluate`), then the workers (`train`), then the parameter
    servers (`run_std_server`); every process in the shared `jobs` list is
    then joined. In both cases the function waits for the shared
    `current_run` state to be flagged finished before resetting it.

    Args:
        polyaxonfile: value handed to `PolyaxonFile.read` and passed to each
            spawned task.
    """
    plx_file = PolyaxonFile.read(polyaxonfile)
    for xp in range(plx_file.matrix_space):
        logging.info("running Experiment n: {}".format(xp))
        cluster, is_distributed = plx_file.get_cluster_def_at(xp)
        if not is_distributed:
            start_experiment_run(
                plx_file, xp, TaskType.MASTER, 0, 'continuous_train_and_eval')
            current_run['finished'] = True
        else:
            # (task type, schedule, number of tasks), in launch order.
            launch_plan = (
                (TaskType.MASTER, 'train_and_evaluate', 1),
                (TaskType.WORKER, 'train', cluster.get(TaskType.WORKER, 0)),
                (TaskType.PS, 'run_std_server', cluster.get(TaskType.PS, 0)),
            )
            for task_type, schedule, task_count in launch_plan:
                # `range` works on both Python 2 and 3, unlike `xrange`.
                for task_id in range(task_count):
                    # A fresh dict per task: nothing handed to
                    # `create_process` is mutated afterwards.
                    create_process({
                        'polyaxonfile': polyaxonfile,
                        'task_type': task_type,
                        'task_id': task_id,
                        'schedule': schedule,
                    })

            for job in jobs:
                job.join()

        # Wait until the run is flagged as finished.
        while not current_run['finished']:
            time.sleep(30)

        # Reset the shared state for the next experiment.
        current_run['finished'] = False