# Example #1
def _submit_job(image_name, client_args, container_args):
    """Submit an ElasticDL master job, or dump its pod spec to YAML.

    Builds a Kubernetes client from ``client_args``, assembles the master
    launch command into a single ``bash -c`` style argument list, then
    either writes the master pod YAML to ``client_args.yaml`` (when set)
    or creates the master pod directly and logs its name.
    """
    k8s_client = k8s.Client(
        image_name=image_name,
        namespace=client_args.namespace,
        job_name=client_args.job_name,
        cluster_spec=client_args.cluster_spec,
        cluster_spec_json=client_args.cluster_spec_json,
        force_use_kube_config_file=client_args.force_use_kube_config_file,
    )

    wrapped_args = wrap_python_args_with_string(container_args)

    # Prepend the master entry point; SET_PIPEFAIL makes the shell
    # propagate failures through the command pipeline.
    launch_command = (
        BashCommandTemplate.SET_PIPEFAIL
        + " python -m elasticdl.python.master.main"
    )
    wrapped_args.insert(0, launch_command)
    if client_args.log_file_path:
        wrapped_args.append(
            BashCommandTemplate.REDIRECTION.format(client_args.log_file_path)
        )

    # Collapse everything into one "-c <command line>" invocation.
    shell_args = ["-c", " ".join(wrapped_args)]

    # Keyword arguments shared by both the dump and the create paths.
    master_kwargs = dict(
        resource_requests=client_args.master_resource_request,
        resource_limits=client_args.master_resource_limit,
        args=shell_args,
        pod_priority=client_args.master_pod_priority,
        image_pull_policy=client_args.image_pull_policy,
        restart_policy=client_args.restart_policy,
        volume=client_args.volume,
        envs=parse_envs(client_args.envs),
    )

    if client_args.yaml:
        k8s_client.dump_master_yaml(yaml=client_args.yaml, **master_kwargs)
        logger.info(
            "ElasticDL job %s YAML has been dumped into file %s."
            % (client_args.job_name, client_args.yaml)
        )
    else:
        k8s_client.create_master(**master_kwargs)
        logger.info(
            "ElasticDL job %s was successfully submitted. "
            "The master pod is: %s."
            % (client_args.job_name, k8s_client.get_master_pod_name())
        )
    def __init__(self, args):
        """Initialize model components, data reader, and training
        hyper-parameters from the parsed command-line ``args``.
        """
        self._init_environment(parse_envs(args.envs))

        # Resolve every user-pluggable component from the model zoo.
        (
            self.model_inst,
            self.dataset_fn,
            self.loss_fn,
            self.opt_fn,
            self.eval_metrics_fn,
            self.prediction_outputs_processor,
            self.custom_data_reader,
            self.callback_list,
        ) = get_model_spec(
            model_zoo=args.model_zoo,
            model_def=args.model_def,
            dataset_fn=args.dataset_fn,
            loss=args.loss,
            optimizer=args.optimizer,
            eval_metrics_fn=args.eval_metrics_fn,
            model_params=args.model_params,
            # NOTE(review): an empty string is passed here rather than
            # args.prediction_outputs_processor — confirm this is intentional.
            prediction_outputs_processor="",
            custom_data_reader=args.custom_data_reader,
            callbacks=args.callbacks,
        )

        self.opt = self.opt_fn()

        # Training hyper-parameters.
        self.epoch = args.num_epochs
        self.evaluation_steps = args.evaluation_steps
        self.batch_size = args.minibatch_size
        self.data_reader_params = get_dict_from_params_str(
            args.data_reader_params
        )
        self.records_per_task = (
            args.minibatch_size * args.num_minibatches_per_task
        )

        # Prefer a user-supplied reader factory when one was provided.
        reader_factory = (
            self.custom_data_reader
            if self.custom_data_reader is not None
            else create_data_reader
        )
        self.data_reader = reader_factory(
            data_origin=args.training_data,
            records_per_task=self.records_per_task,
            **self.data_reader_params,
        )

        self.training_data = args.training_data
        self.validation_data = args.validation_data
        self.save_model_dir = args.output
# Example #3
    def _create_instance_manager(self, args):
        """Build the ``InstanceManager`` that launches worker and PS pods.

        Returns ``None`` when ``args.num_workers`` is falsy. Otherwise it
        assembles the ``bash -c`` command lines for the worker pods and for
        the parameter-server pods — a Go binary (``elasticdl_ps``) when
        ``args.use_go_ps`` is set, the Python PS module otherwise — and
        passes them to a new ``InstanceManager``.
        """
        instance_manager = None

        # Both workers and PS pods are started through a bash shell; the
        # actual command is supplied below as the "-c <line>" argument.
        container_command = ["/bin/bash"]
        if args.num_workers:
            assert args.worker_image, "Worker image cannot be empty"

            # Worker entry point; SET_PIPEFAIL makes the shell propagate
            # failures through the command pipeline.
            worker_client_command = (
                BashCommandTemplate.SET_PIPEFAIL
                + " python -m elasticdl.python.worker.main"
            )
            worker_args = [
                "--master_addr",
                self.master_addr,
                "--job_type",
                self.job_type,
            ]
            # Forward the remaining parsed CLI flags; "envs" is excluded
            # because env vars are injected as V1EnvVar objects below.
            worker_args.extend(
                build_arguments_from_parsed_result(args, filter_args=["envs"])
            )
            worker_args = wrap_python_args_with_string(worker_args)
            worker_args.insert(0, worker_client_command)

            if args.use_go_ps:
                # Go-based PS: single-dash "-flag=value" style arguments for
                # the `elasticdl_ps` binary.
                opt_type, opt_args = get_optimizer_info(self.optimizer)
                ps_command = "elasticdl_ps"
                ps_command_args = [
                    "-job_name=" + args.job_name,
                    "-namespace=" + args.namespace,
                    "-master_addr=" + self.master_addr,
                    "-port=2222",
                    "-use_async=" + ("true" if args.use_async else "false"),
                    "-grads_to_wait=" + str(args.grads_to_wait),
                    "-lr_staleness_modulation="
                    + ("true" if args.lr_staleness_modulation else "false"),
                    "-sync_version_tolerance="
                    + str(args.sync_version_tolerance),
                    "-evaluation_steps=" + str(args.evaluation_steps),
                    "-num_ps_pods=" + str(args.num_ps_pods),
                    "-num_workers=" + str(args.num_workers),
                    "-checkpoint_dir=" + str(args.checkpoint_dir),
                    "-checkpoint_steps=" + str(args.checkpoint_steps),
                    "-keep_checkpoint_max=" + str(args.keep_checkpoint_max),
                    "-checkpoint_dir_for_init="
                    + str(args.checkpoint_dir_for_init),
                    "-opt_type=" + opt_type,
                    "-opt_args=" + opt_args,
                ]
                ps_command_args = wrap_go_args_with_string(ps_command_args)
                # Execute source /root/.bashrc to add the file path
                # of `elasticdl_ps` into the PATH environment variable.
                ps_args = [
                    "source",
                    "/root/.bashrc_elasticdl",
                    "&&",
                    ps_command,
                ]
                ps_args.extend(ps_command_args)
            else:
                # Python-based PS: double-dash argparse-style flags for the
                # elasticdl.python.ps.main module.
                ps_command = (
                    BashCommandTemplate.SET_PIPEFAIL
                    + " python -m elasticdl.python.ps.main"
                )
                ps_command_args = [
                    "--grads_to_wait",
                    str(args.grads_to_wait),
                    "--lr_staleness_modulation",
                    str(args.lr_staleness_modulation),
                    "--sync_version_tolerance",
                    str(args.sync_version_tolerance),
                    "--use_async",
                    str(args.use_async),
                    "--model_zoo",
                    args.model_zoo,
                    "--model_def",
                    args.model_def,
                    "--job_name",
                    args.job_name,
                    "--port",
                    "2222",
                    "--master_addr",
                    self.master_addr,
                    "--namespace",
                    args.namespace,
                    "--evaluation_steps",
                    str(args.evaluation_steps),
                    "--checkpoint_dir",
                    str(args.checkpoint_dir),
                    "--checkpoint_steps",
                    str(args.checkpoint_steps),
                    "--keep_checkpoint_max",
                    str(args.keep_checkpoint_max),
                    "--num_ps_pods",
                    str(args.num_ps_pods),
                    "--checkpoint_dir_for_init",
                    str(args.checkpoint_dir_for_init),
                    "--num_workers",
                    str(args.num_workers),
                    "--log_level",
                    str(args.log_level),
                    "--minibatch_size",
                    str(args.minibatch_size),
                    "--num_minibatches_per_task",
                    str(args.num_minibatches_per_task),
                ]
                ps_args = wrap_python_args_with_string(ps_command_args)
                ps_args.insert(0, ps_command)

            # Collapse each command list into a single "bash -c" invocation.
            worker_args = ["-c", " ".join(worker_args)]
            ps_args = ["-c", " ".join(ps_args)]

            # Turn the "KEY=VALUE" env string into Kubernetes V1EnvVar objects.
            env_dict = parse_envs(args.envs)
            env = []
            for key in env_dict:
                env.append(V1EnvVar(name=key, value=env_dict[key]))

            kwargs = get_dict_from_params_str(args.aux_params)
            disable_relaunch = kwargs.get("disable_relaunch", False)
            cluster_spec = self._get_image_cluster_spec(args.cluster_spec)

            instance_manager = InstanceManager(
                self.task_d,
                rendezvous_server=self.rendezvous_server,
                job_name=args.job_name,
                image_name=args.worker_image,
                worker_command=container_command,
                worker_args=worker_args,
                namespace=args.namespace,
                num_workers=args.num_workers,
                worker_resource_request=args.worker_resource_request,
                worker_resource_limit=args.worker_resource_limit,
                worker_pod_priority=args.worker_pod_priority,
                num_ps=args.num_ps_pods,
                ps_command=container_command,
                ps_args=ps_args,
                ps_resource_request=args.ps_resource_request,
                ps_resource_limit=args.ps_resource_limit,
                ps_pod_priority=args.ps_pod_priority,
                volume=args.volume,
                image_pull_policy=args.image_pull_policy,
                restart_policy=args.restart_policy,
                cluster_spec=cluster_spec,
                envs=env,
                disable_relaunch=disable_relaunch,
                log_file_path=args.log_file_path,
            )

        return instance_manager