def predict(args):
    """Submit a prediction job described by the parsed CLI ``args``.

    Obtains the worker image name from ``build_and_push_docker_image``,
    assembles the argument list for ``elasticdl.python.master.main`` and
    passes the image name, ``args`` and that list to ``_submit_job``.

    Args:
        args: Parsed command-line namespace. The attributes read directly
            here are ``model_zoo``, ``image_base``, ``docker_image_prefix``,
            ``extra_pypi_index``, ``cluster_spec``, ``docker_base_url``,
            ``docker_tlscert`` and ``docker_tlskey``.
    """
    worker_image = build_and_push_docker_image(
        model_zoo=args.model_zoo,
        base_image=args.image_base,
        docker_image_prefix=args.docker_image_prefix,
        extra_pypi=args.extra_pypi_index,
        cluster_spec=args.cluster_spec,
        docker_base_url=args.docker_base_url,
        docker_tlscert=args.docker_tlscert,
        docker_tlskey=args.docker_tlskey,
    )

    # These options are spelled out explicitly below.
    # NOTE(review): ``filter_args`` presumably tells the helper to leave
    # them out of the forwarded arguments -- confirm against the helper.
    explicit_options = ["model_zoo", "cluster_spec", "worker_image"]

    master_args = [
        "-m",
        "elasticdl.python.master.main",
        "--worker_image",
        worker_image,
        "--model_zoo",
        _model_zoo_in_docker(args.model_zoo),
        "--cluster_spec",
        _cluster_spec_def_in_docker(args.cluster_spec),
    ]
    master_args += build_arguments_from_parsed_result(
        args, filter_args=explicit_options
    )

    _submit_job(worker_image, args, master_args)
def train(args):
    """Run a training job locally or submit it, depending on ``args``.

    When ``args.distribution_strategy`` equals ``DistributionStrategy.LOCAL``
    a ``LocalExecutor`` is created from ``args`` and run, and nothing else
    happens. Otherwise the worker image name is obtained from
    ``build_and_push_docker_image``, the argument list for
    ``elasticdl.python.master.main`` is assembled, and both are passed to
    ``_submit_job``.

    Args:
        args: Parsed command-line namespace.
    """
    zoo_path = os.path.normpath(args.model_zoo)

    # Local strategy: run the executor and stop here.
    if args.distribution_strategy == DistributionStrategy.LOCAL:
        LocalExecutor(args).run()
        return

    worker_image = build_and_push_docker_image(
        model_zoo=zoo_path,
        base_image=args.image_base,
        docker_image_repository=args.docker_image_repository,
        extra_pypi=args.extra_pypi_index,
        cluster_spec=args.cluster_spec,
        docker_base_url=args.docker_base_url,
        docker_tlscert=args.docker_tlscert,
        docker_tlskey=args.docker_tlskey,
    )

    master_args = [
        "-m",
        "elasticdl.python.master.main",
        "--worker_image",
        worker_image,
        "--model_zoo",
        _model_zoo_in_docker(zoo_path),
        "--cluster_spec",
        _cluster_spec_def_in_docker(args.cluster_spec),
    ]
    # The three options set explicitly above are named in ``filter_args``.
    forwarded = build_arguments_from_parsed_result(
        args,
        filter_args=["model_zoo", "cluster_spec", "worker_image"],
    )
    master_args.extend(forwarded)

    _submit_job(worker_image, args, master_args)
def evaluate(args):
    """Submit an evaluation job described by the parsed CLI ``args``.

    Obtains the worker image name from ``build_and_push_docker_image``,
    spells out every master option as a flag followed by its value, and
    passes the image name, ``args`` and the resulting flat list to
    ``_submit_job``.

    Args:
        args: Parsed command-line namespace providing the image-build
            settings and every option value listed below.
    """
    worker_image = build_and_push_docker_image(
        model_zoo=args.model_zoo,
        base_image=args.image_base,
        docker_image_prefix=args.docker_image_prefix,
        extra_pypi=args.extra_pypi_index,
        cluster_spec=args.cluster_spec,
        docker_base_url=args.docker_base_url,
        docker_tlscert=args.docker_tlscert,
        docker_tlskey=args.docker_tlskey,
    )

    # Each entry is (flag, value); numeric settings are converted with str().
    option_pairs = [
        ("-m", "elasticdl.python.master.main"),
        ("--job_name", args.job_name),
        ("--worker_image", worker_image),
        ("--model_zoo", _model_zoo_in_docker(args.model_zoo)),
        ("--cluster_spec", _cluster_spec_def_in_docker(args.cluster_spec)),
        ("--num_workers", str(args.num_workers)),
        ("--worker_resource_request", args.worker_resource_request),
        ("--worker_resource_limit", args.worker_resource_limit),
        ("--envs", args.envs),
        ("--namespace", args.namespace),
        ("--records_per_task", str(args.records_per_task)),
        ("--minibatch_size", str(args.minibatch_size)),
        ("--evaluation_data_dir", args.evaluation_data_dir),
        (
            "--checkpoint_filename_for_init",
            args.checkpoint_filename_for_init,
        ),
        ("--dataset_fn", args.dataset_fn),
        ("--eval_metrics_fn", args.eval_metrics_fn),
        ("--model_def", args.model_def),
        ("--model_params", args.model_params),
        ("--image_pull_policy", args.image_pull_policy),
        ("--restart_policy", args.restart_policy),
        ("--volume", args.volume),
    ]

    # Flatten the pairs into the token list, keeping the order above.
    master_args = [token for pair in option_pairs for token in pair]

    _submit_job(worker_image, args, master_args)
def train(args):
    """Run a training job locally or submit it, depending on ``args``.

    When ``args.distribution_strategy`` equals ``DistributionStrategy.LOCAL``
    a ``LocalExecutor`` is created from ``args`` and run, and the function
    returns. Otherwise a worker image name is chosen: ``args.image_name``
    when it is truthy, else the result of ``build_and_push_docker_image``.
    The master arguments are then assembled and passed to ``_submit_job``.

    Args:
        args: Parsed command-line namespace.
    """
    zoo_path = os.path.normpath(args.model_zoo)

    if args.distribution_strategy == DistributionStrategy.LOCAL:
        LocalExecutor(args).run()
        return

    # A truthy ``image_name`` is used as given and the build call is skipped.
    image_pre_built = bool(args.image_name)
    if image_pre_built:
        worker_image = args.image_name
    else:
        worker_image = build_and_push_docker_image(
            model_zoo=zoo_path,
            base_image=args.image_base,
            docker_image_repository=args.docker_image_repository,
            extra_pypi=args.extra_pypi_index,
            cluster_spec=args.cluster_spec,
            docker_base_url=args.docker_base_url,
            docker_tlscert=args.docker_tlscert,
            docker_tlskey=args.docker_tlskey,
        )

    master_args = [
        "--worker_image",
        worker_image,
        "--model_zoo",
        _model_zoo_in_docker(zoo_path, image_pre_built),
        "--cluster_spec",
        _cluster_spec_def_in_docker(args.cluster_spec),
    ]

    # Option names handed to the helper as ``filter_args``.
    filtered_names = [
        "model_zoo",
        "cluster_spec",
        "worker_image",
        "force_use_kube_config_file",
        "func",
    ]
    master_args += build_arguments_from_parsed_result(
        args, filter_args=filtered_names
    )

    _submit_job(worker_image, args, master_args)
def evaluate(args):
    """Submit an evaluation job described by the parsed CLI ``args``.

    The worker image name is ``args.image_name`` when that is truthy;
    otherwise it is the result of ``build_and_push_docker_image``. The
    argument list for ``elasticdl.python.master.main`` is then assembled
    and passed, with the image name and ``args``, to ``_submit_job``.

    Args:
        args: Parsed command-line namespace.
    """
    zoo_path = os.path.normpath(args.model_zoo)

    # A truthy ``image_name`` is used as given and the build call is skipped.
    image_pre_built = bool(args.image_name)
    if image_pre_built:
        worker_image = args.image_name
    else:
        worker_image = build_and_push_docker_image(
            model_zoo=zoo_path,
            base_image=args.image_base,
            docker_image_repository=args.docker_image_repository,
            extra_pypi=args.extra_pypi_index,
            cluster_spec=args.cluster_spec,
            docker_base_url=args.docker_base_url,
            docker_tlscert=args.docker_tlscert,
            docker_tlskey=args.docker_tlskey,
        )

    master_args = [
        "-m",
        "elasticdl.python.master.main",
        "--worker_image",
        worker_image,
        "--model_zoo",
        _model_zoo_in_docker(zoo_path, image_pre_built),
        "--cluster_spec",
        _cluster_spec_def_in_docker(args.cluster_spec),
    ]

    forwarded = build_arguments_from_parsed_result(
        args,
        filter_args=[
            "model_zoo",
            "cluster_spec",
            "worker_image",
            "force_use_kube_config_file",
        ],
    )
    master_args.extend(forwarded)

    _submit_job(worker_image, args, master_args)
def train(args):
    """Submit a training job described by the parsed CLI ``args``.

    Obtains the worker image name from ``build_and_push_docker_image``,
    spells out every master option as a flag followed by its value, and
    passes the image name, ``args`` and the resulting flat list to
    ``_submit_job``.

    Args:
        args: Parsed command-line namespace providing the image-build
            settings and every option value listed below.
    """
    worker_image = build_and_push_docker_image(
        model_zoo=args.model_zoo,
        base_image=args.image_base,
        docker_image_prefix=args.docker_image_prefix,
        extra_pypi=args.extra_pypi_index,
        cluster_spec=args.cluster_spec,
    )

    # Each entry is (flag, value); numeric settings are converted with str().
    option_pairs = [
        ("-m", "elasticdl.python.master.main"),
        ("--job_name", args.job_name),
        ("--worker_image", worker_image),
        ("--model_zoo", _model_zoo_in_docker(args.model_zoo)),
        ("--cluster_spec", _cluster_spec_def_in_docker(args.cluster_spec)),
        ("--num_workers", str(args.num_workers)),
        ("--master_resource_request", args.master_resource_request),
        ("--master_resource_limit", args.master_resource_limit),
        ("--worker_resource_request", args.worker_resource_request),
        ("--worker_resource_limit", args.worker_resource_limit),
        ("--envs", args.envs),
        ("--namespace", args.namespace),
        ("--tensorboard_log_dir", args.tensorboard_log_dir),
        ("--records_per_task", str(args.records_per_task)),
        ("--num_epochs", str(args.num_epochs)),
        ("--grads_to_wait", str(args.grads_to_wait)),
        ("--minibatch_size", str(args.minibatch_size)),
        ("--training_data_dir", args.training_data_dir),
        ("--evaluation_data_dir", args.evaluation_data_dir),
        ("--checkpoint_steps", str(args.checkpoint_steps)),
        ("--checkpoint_dir", args.checkpoint_dir),
        ("--keep_checkpoint_max", str(args.keep_checkpoint_max)),
        ("--evaluation_steps", str(args.evaluation_steps)),
        (
            "--evaluation_start_delay_secs",
            str(args.evaluation_start_delay_secs),
        ),
        ("--evaluation_throttle_secs", str(args.evaluation_throttle_secs)),
        ("--dataset_fn", args.dataset_fn),
        ("--loss", args.loss),
        ("--optimizer", args.optimizer),
        ("--eval_metrics_fn", args.eval_metrics_fn),
        ("--model_def", args.model_def),
        ("--model_params", args.model_params),
        ("--image_pull_policy", args.image_pull_policy),
        ("--restart_policy", args.restart_policy),
        ("--volume", args.volume),
    ]

    # Flatten the pairs into the token list, keeping the order above.
    master_args = [token for pair in option_pairs for token in pair]

    _submit_job(worker_image, args, master_args)