def run(self):
    """Run the job: submit it to YARN when configured, otherwise run locally."""
    if not self.run_on_yarn:
        LOGGER.info("Not running on yarn.")
        job = from_config(self.job)
        job.run()
        return

    # Serialize the job config to a local json file so the yarn_launcher
    # can re-instantiate it on the cluster side.
    config_path = f"job-{self.config.name}.json"
    with open(config_path, "w") as handle:
        json.dump(self.job, handle, indent=4)

    # Upload the environment, then submit the job to YARN.
    pex_path = self.config.upload_pex_cpu()
    with skein.Client() as client:
        LOGGER.info(f"Submitting job {self.config.name}")
        app_id = submit(
            skein_client=client,
            module_name="deepr.cli.main",
            additional_files=[config_path],
            archive_hdfs=pex_path,
            args=["from_config", config_path, "-", "run"],
            env_vars=self.config.get_env_vars(),
            hadoop_file_systems=self.config.hadoop_file_systems,
            memory=self.config.memory,
            name=self.config.name,
            num_cores=self.config.num_cores,
        )
        report = client.application_report(app_id)
        LOGGER.info(f"TRACKING_URL: {report.tracking_url}")
        mlflow.clear_run()
def _from_config(config: str):
    """Instantiate an object from a parsed config.

    Parameters
    ----------
    config : str
        Path to json file or json string
    """
    parsed = load_json(config)
    return from_config(parsed)
def run(self):
    """Run the training, distributed on YARN when configured, else locally."""
    if not self.train_on_yarn:
        LOGGER.info("Not training on yarn.")
        trainer = from_config(self.trainer)
        trainer.run()
        return

    # Upload environment(s) to HDFS (CPU and / or GPU environments)
    pyenv_zip_path = {tf_yarn.NodeLabel.CPU: self.config.upload_pex_cpu()}
    if self.config.tf_yarn == "gpu":
        pyenv_zip_path[tf_yarn.NodeLabel.GPU] = self.config.upload_pex_gpu()

    def _experiment_fn():
        # Remove auto-termination of active MLFlow runs from
        # inside the chief / evaluator
        atexit.unregister(mlflow.end_run)
        return from_config(self.trainer).create_experiment()

    tf_yarn.run_on_yarn(
        acls=skein.model.ACLs(enable=True, ui_users=["*"], view_users=["*"]),
        env=self.config.get_env_vars(),
        experiment_fn=_experiment_fn,
        files=get_editable_requirements(),
        name=self.config.name,
        nb_retries=self.config.nb_retries,
        pre_script_hook=self.config.pre_script_hook,
        pyenv_zip_path=pyenv_zip_path,
        queue=self.config.queue,
        task_specs=self.config.get_task_specs(),
    )

    # After distributed training: run exporters and the final evaluation
    # on the client side.
    trainer = from_config(self.trainer)
    experiment = trainer.create_experiment()
    for exporter in trainer.exporters:
        exporter(experiment.estimator)
    trainer.run_final_evaluation()
def from_config_and_macros(config: str, macros: str = None):
    """Instantiate object from config and macros.

    Parameters
    ----------
    config : str
        Path to json file or json string
    macros : str, optional
        Path to json file or json string

    Returns
    -------
    Instance
        Defined by config
    """
    macros_dict = load_json(macros) if macros else None
    parsed = parse_config(load_json(config), macros_dict)
    return from_config(parsed)
def run(self):
    """Run the job once for each set of parameters drawn from the sampler."""
    samples = list(self.sampler)
    num_samples = len(samples)
    for index, params in enumerate(samples):
        LOGGER.info(f"Launching job with params: {params}")

        # Update macro params with sampled values
        macros = deepcopy(self.macros)
        macros["params"] = {**macros["params"], **params}
        assert_no_macros(macros["params"])

        # Parse config and run job
        parsed = parse_config(self.job, macros)
        job = from_config(parsed)
        if not isinstance(job, base.Job):
            raise TypeError(f"Expected type Job but got {type(job)}")
        job.run()
        mlflow.clear_run()

        # New parameters based on time need to be different
        if index + 1 < num_samples:
            LOGGER.info("Sleeping 2 seconds before next experiment\n")
            time.sleep(2)
def __post_init__(self):
    """Validate that the ``job`` config instantiates to a ``base.Job``.

    Raises
    ------
    TypeError
        If the object instantiated from ``self.job`` is not a ``base.Job``.
    """
    job = from_config(self.job)
    if not isinstance(job, base.Job):
        # Report the offending type rather than the instance repr, for a
        # readable message consistent with the other __post_init__ checks
        # in this file (which all use type(...)).
        raise TypeError(f"Expected type {base.Job} but got {type(job)}")
def _experiment_fn():
    """Instantiate the trainer and build its experiment (runs in-container)."""
    # Remove auto-termination of active MLFlow runs from
    # inside the chief / evaluator
    atexit.unregister(mlflow.end_run)
    trainer = from_config(self.trainer)
    return trainer.create_experiment()
def __post_init__(self):
    """Validate that the ``trainer`` config instantiates to a ``Trainer``."""
    trainer = from_config(self.trainer)
    if isinstance(trainer, Trainer):
        return
    raise TypeError(f"Expected job of type {Trainer} but got {type(trainer)}")