def _submit_job(image_name, client_args, container_args):
    """Submit an ElasticDL master job to Kubernetes, or dump its YAML.

    Builds a ``bash -c`` command line that launches the ElasticDL master
    (``elasticdl.python.master.main``) inside the given image, then either
    writes the generated master-pod YAML to ``client_args.yaml`` (when set)
    or creates the master pod directly through the k8s client.

    Args:
        image_name: Docker image to run the master in.
        client_args: Parsed client CLI arguments (namespace, job name,
            cluster spec, resources, yaml path, etc.).
        container_args: Extra CLI arguments forwarded to the master process.
    """
    client = k8s.Client(
        image_name=image_name,
        namespace=client_args.namespace,
        job_name=client_args.job_name,
        cluster_spec=client_args.cluster_spec,
        cluster_spec_json=client_args.cluster_spec_json,
        force_use_kube_config_file=client_args.force_use_kube_config_file,
    )
    # Quote each python arg so it survives the `bash -c "<...>"` wrapping.
    container_args = wrap_python_args_with_string(container_args)
    master_client_command = (
        BashCommandTemplate.SET_PIPEFAIL
        + " python -m elasticdl.python.master.main"
    )
    container_args.insert(0, master_client_command)
    if client_args.log_file_path:
        # Redirect the master's output into the requested log file.
        container_args.append(
            BashCommandTemplate.REDIRECTION.format(client_args.log_file_path)
        )
    python_command = " ".join(container_args)
    container_args = ["-c", python_command]

    if client_args.yaml:
        # Dry-run mode: emit the master-pod spec as YAML instead of
        # creating it in the cluster.
        client.dump_master_yaml(
            resource_requests=client_args.master_resource_request,
            resource_limits=client_args.master_resource_limit,
            args=container_args,
            pod_priority=client_args.master_pod_priority,
            image_pull_policy=client_args.image_pull_policy,
            restart_policy=client_args.restart_policy,
            volume=client_args.volume,
            envs=parse_envs(client_args.envs),
            yaml=client_args.yaml,
        )
        # Lazy %-args: the message is only formatted if this level is on.
        logger.info(
            "ElasticDL job %s YAML has been dumped into file %s.",
            client_args.job_name,
            client_args.yaml,
        )
    else:
        client.create_master(
            resource_requests=client_args.master_resource_request,
            resource_limits=client_args.master_resource_limit,
            args=container_args,
            pod_priority=client_args.master_pod_priority,
            image_pull_policy=client_args.image_pull_policy,
            restart_policy=client_args.restart_policy,
            volume=client_args.volume,
            envs=parse_envs(client_args.envs),
        )
        logger.info(
            "ElasticDL job %s was successfully submitted. "
            "The master pod is: %s.",
            client_args.job_name,
            client.get_master_pod_name(),
        )
def __init__(self, args):
    """Initialize training state from the parsed CLI arguments.

    Loads the model spec from the model zoo, instantiates the optimizer,
    and builds the data reader used to generate training tasks.
    """
    env_dict = parse_envs(args.envs)
    self._init_environment(env_dict)

    # Resolve every user-pluggable component from the model zoo in one call.
    # NOTE(review): prediction_outputs_processor is passed as "" rather than
    # taken from args — presumably the master does not need it; confirm.
    model_spec = get_model_spec(
        model_zoo=args.model_zoo,
        model_def=args.model_def,
        dataset_fn=args.dataset_fn,
        loss=args.loss,
        optimizer=args.optimizer,
        eval_metrics_fn=args.eval_metrics_fn,
        model_params=args.model_params,
        prediction_outputs_processor="",
        custom_data_reader=args.custom_data_reader,
        callbacks=args.callbacks,
    )
    (
        self.model_inst,
        self.dataset_fn,
        self.loss_fn,
        self.opt_fn,
        self.eval_metrics_fn,
        self.prediction_outputs_processor,
        self.custom_data_reader,
        self.callback_list,
    ) = model_spec

    self.opt = self.opt_fn()
    self.epoch = args.num_epochs
    self.evaluation_steps = args.evaluation_steps
    self.batch_size = args.minibatch_size

    # Each task covers a fixed number of minibatches worth of records.
    self.records_per_task = (
        args.minibatch_size * args.num_minibatches_per_task
    )
    self.data_reader_params = get_dict_from_params_str(
        args.data_reader_params
    )

    # Prefer a user-supplied reader factory; fall back to the default.
    if self.custom_data_reader is None:
        reader_factory = create_data_reader
    else:
        reader_factory = self.custom_data_reader
    self.data_reader = reader_factory(
        data_origin=args.training_data,
        records_per_task=self.records_per_task,
        **self.data_reader_params,
    )

    self.training_data = args.training_data
    self.validation_data = args.validation_data
    self.save_model_dir = args.output
def _create_instance_manager(self, args):
    """Create the InstanceManager that launches worker and PS pods.

    Assembles ``bash -c`` command lines for the worker processes and for
    the parameter servers (either the Go ``elasticdl_ps`` binary or the
    Python PS, depending on ``args.use_go_ps``).

    Args:
        args: Parsed master CLI arguments.

    Returns:
        An ``InstanceManager`` when ``args.num_workers`` is truthy,
        otherwise ``None``.
    """
    instance_manager = None
    # Worker and PS pods both run their payload through `bash -c`.
    container_command = ["/bin/bash"]
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        # Worker command: pipefail wrapper around the Python worker entry
        # point, forwarding all parsed CLI arguments except `envs`.
        worker_client_command = (
            BashCommandTemplate.SET_PIPEFAIL
            + " python -m elasticdl.python.worker.main"
        )
        worker_args = [
            "--master_addr",
            self.master_addr,
            "--job_type",
            self.job_type,
        ]
        worker_args.extend(
            build_arguments_from_parsed_result(args, filter_args=["envs"])
        )
        worker_args = wrap_python_args_with_string(worker_args)
        worker_args.insert(0, worker_client_command)

        if args.use_go_ps:
            # Go-based parameter server: single-dash `-flag=value` form.
            opt_type, opt_args = get_optimizer_info(self.optimizer)
            ps_command = "elasticdl_ps"
            ps_command_args = [
                "-job_name=" + args.job_name,
                "-namespace=" + args.namespace,
                "-master_addr=" + self.master_addr,
                "-port=2222",
                "-use_async=" + ("true" if args.use_async else "false"),
                "-grads_to_wait=" + str(args.grads_to_wait),
                "-lr_staleness_modulation="
                + ("true" if args.lr_staleness_modulation else "false"),
                "-sync_version_tolerance="
                + str(args.sync_version_tolerance),
                "-evaluation_steps=" + str(args.evaluation_steps),
                "-num_ps_pods=" + str(args.num_ps_pods),
                "-num_workers=" + str(args.num_workers),
                "-checkpoint_dir=" + str(args.checkpoint_dir),
                "-checkpoint_steps=" + str(args.checkpoint_steps),
                "-keep_checkpoint_max=" + str(args.keep_checkpoint_max),
                "-checkpoint_dir_for_init="
                + str(args.checkpoint_dir_for_init),
                "-opt_type=" + opt_type,
                "-opt_args=" + opt_args,
            ]
            ps_command_args = wrap_go_args_with_string(ps_command_args)
            # Source /root/.bashrc_elasticdl so the directory containing
            # the `elasticdl_ps` binary is on PATH inside the container.
            ps_args = [
                "source",
                "/root/.bashrc_elasticdl",
                "&&",
                ps_command,
            ]
            ps_args.extend(ps_command_args)
        else:
            # Python-based parameter server.
            ps_command = (
                BashCommandTemplate.SET_PIPEFAIL
                + " python -m elasticdl.python.ps.main"
            )
            ps_command_args = [
                "--grads_to_wait",
                str(args.grads_to_wait),
                "--lr_staleness_modulation",
                str(args.lr_staleness_modulation),
                "--sync_version_tolerance",
                str(args.sync_version_tolerance),
                "--use_async",
                str(args.use_async),
                "--model_zoo",
                args.model_zoo,
                "--model_def",
                args.model_def,
                "--job_name",
                args.job_name,
                "--port",
                "2222",
                "--master_addr",
                self.master_addr,
                "--namespace",
                args.namespace,
                "--evaluation_steps",
                str(args.evaluation_steps),
                "--checkpoint_dir",
                str(args.checkpoint_dir),
                "--checkpoint_steps",
                str(args.checkpoint_steps),
                "--keep_checkpoint_max",
                str(args.keep_checkpoint_max),
                "--num_ps_pods",
                str(args.num_ps_pods),
                "--checkpoint_dir_for_init",
                str(args.checkpoint_dir_for_init),
                "--num_workers",
                str(args.num_workers),
                "--log_level",
                str(args.log_level),
                "--minibatch_size",
                str(args.minibatch_size),
                "--num_minibatches_per_task",
                str(args.num_minibatches_per_task),
            ]
            ps_args = wrap_python_args_with_string(ps_command_args)
            ps_args.insert(0, ps_command)

        # Collapse each argument list into a single `bash -c` payload.
        worker_args = ["-c", " ".join(worker_args)]
        ps_args = ["-c", " ".join(ps_args)]

        env_dict = parse_envs(args.envs)
        # Idiomatic comprehension instead of a manual append loop.
        env = [
            V1EnvVar(name=key, value=value)
            for key, value in env_dict.items()
        ]

        kwargs = get_dict_from_params_str(args.aux_params)
        disable_relaunch = kwargs.get("disable_relaunch", False)
        cluster_spec = self._get_image_cluster_spec(args.cluster_spec)

        instance_manager = InstanceManager(
            self.task_d,
            rendezvous_server=self.rendezvous_server,
            job_name=args.job_name,
            image_name=args.worker_image,
            worker_command=container_command,
            worker_args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            worker_pod_priority=args.worker_pod_priority,
            num_ps=args.num_ps_pods,
            ps_command=container_command,
            ps_args=ps_args,
            ps_resource_request=args.ps_resource_request,
            ps_resource_limit=args.ps_resource_limit,
            ps_pod_priority=args.ps_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=cluster_spec,
            envs=env,
            disable_relaunch=disable_relaunch,
            log_file_path=args.log_file_path,
        )
    return instance_manager