Example #1
    def run(self):
        if self.run_on_yarn:
            # Dump the job config as a local json file for yarn_launcher
            job_name = f"job-{self.config.name}.json"
            with open(job_name, "w") as file:
                json.dump(self.job, file, indent=4)

            # Launch job on yarn
            pex_path = self.config.upload_pex_cpu()
            with skein.Client() as skein_client:
                LOGGER.info(f"Submitting job {self.config.name}")
                app_id = submit(
                    skein_client=skein_client,
                    module_name="deepr.cli.main",
                    additional_files=[job_name],
                    archive_hdfs=pex_path,
                    args=["from_config", job_name, "-", "run"],
                    env_vars=self.config.get_env_vars(),
                    hadoop_file_systems=self.config.hadoop_file_systems,
                    memory=self.config.memory,
                    name=self.config.name,
                    num_cores=self.config.num_cores,
                )
                report = skein_client.application_report(app_id)
                LOGGER.info(f"TRACKING_URL: {report.tracking_url}")
            mlflow.clear_run()
        else:
            LOGGER.info("Not running on yarn.")
            job = from_config(self.job)
            job.run()
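
For orientation: the args list mirrors what the submitted container executes, namely re-instantiating the dumped job and running it. A minimal sketch of the equivalent local steps, using the from_config and load_json helpers shown in the other examples (the file name is illustrative):

job = from_config(load_json("job-myjob.json"))  # re-instantiate the dumped Job
job.run()                                       # equivalent of the trailing "run" argument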
Example #2
def _from_config(config: str):
    """Instantiate object from parsed config.

    Parameters
    ----------
    config : str
        Path to json file or json string
    """
    return from_config(load_json(config))
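
The config passed here is a JSON object whose "type" key names the import path of the class to instantiate, with the remaining keys forwarded to the constructor (deepr's config convention). A minimal hypothetical usage, with a made-up job class and parameter:

# Hypothetical: "mypackage.jobs.HelloWorld" and "greeting" are illustrative,
# not part of deepr. A path to a .json file would work the same way.
config = '{"type": "mypackage.jobs.HelloWorld", "greeting": "hi"}'
job = _from_config(config)
job.run()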
Example #3
    def run(self):
        if self.train_on_yarn:
            # Upload environment(s) to HDFS (CPU and / or GPU environments)
            pyenv_zip_path = {
                tf_yarn.NodeLabel.CPU: self.config.upload_pex_cpu()
            }
            if self.config.tf_yarn == "gpu":
                pyenv_zip_path[tf_yarn.NodeLabel.GPU] = self.config.upload_pex_gpu()

            def _experiment_fn():
                # Remove auto-termination of active MLFlow runs from
                # inside the chief / evaluator
                atexit.unregister(mlflow.end_run)
                return from_config(self.trainer).create_experiment()

            tf_yarn.run_on_yarn(
                acls=skein.model.ACLs(enable=True,
                                      ui_users=["*"],
                                      view_users=["*"]),
                env=self.config.get_env_vars(),
                experiment_fn=_experiment_fn,
                files=get_editable_requirements(),
                name=self.config.name,
                nb_retries=self.config.nb_retries,
                pre_script_hook=self.config.pre_script_hook,
                pyenv_zip_path=pyenv_zip_path,
                queue=self.config.queue,
                task_specs=self.config.get_task_specs(),
            )

            # Run exporters and final evaluation
            trainer = from_config(self.trainer)
            experiment = trainer.create_experiment()
            for exporter in trainer.exporters:
                exporter(experiment.estimator)
            trainer.run_final_evaluation()
        else:
            LOGGER.info("Not training on yarn.")
            trainer = from_config(self.trainer)
            trainer.run()
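
A hedged sketch of what get_task_specs() might return, following the TaskSpec usage documented by tf_yarn; the memory and vcore values are illustrative assumptions, not taken from this code:

import tf_yarn

# Assumed shape only: one spec per TensorFlow task type.
task_specs = {
    "chief": tf_yarn.TaskSpec(memory="4 GiB", vcores=4),
    "evaluator": tf_yarn.TaskSpec(memory="4 GiB", vcores=1),
}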
Example #4
def from_config_and_macros(config: str, macros: str = None):
    """Instantiate object from config and macros.

    Parameters
    ----------
    config : str
        Path to json file or json string
    macros : str, optional
        Path to json file or json string

    Returns
    -------
    Instance
        Defined by config
    """
    parsed = parse_config(load_json(config), load_json(macros) if macros else None)
    return from_config(parsed)
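
Macros are substituted into the config before instantiation, so shared parameters can live in their own JSON. A hedged usage sketch, assuming deepr's "$macro:param" reference syntax; the job type and parameter are made up for illustration:

# Hypothetical: "$params:learning_rate" is replaced by the value from the
# "params" macro before the object is built.
config = '{"type": "mypackage.jobs.TrainJob", "learning_rate": "$params:learning_rate"}'
macros = '{"params": {"learning_rate": 0.01}}'
job = from_config_and_macros(config, macros)
job.run()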
Example #5
    def run(self):
        sampled = list(self.sampler)
        for idx, params in enumerate(sampled):
            LOGGER.info(f"Launching job with params: {params}")

            # Update macro params with sampled values
            macros = deepcopy(self.macros)
            macros["params"] = {**macros["params"], **params}
            assert_no_macros(macros["params"])

            # Parse config and run job
            parsed = parse_config(self.job, macros)
            job = from_config(parsed)
            if not isinstance(job, base.Job):
                raise TypeError(f"Expected type Job but got {type(job)}")
            job.run()
            mlflow.clear_run()

            # Parameters derived from the current time must differ between runs
            if idx + 1 < len(sampled):
                LOGGER.info("Sleeping 2 seconds before next experiment\n")
                time.sleep(2)
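
The sampler only needs to be an iterable of parameter dicts; each dict is merged into the "params" macro for one run. A minimal hypothetical sampler, e.g. a small grid:

# Hypothetical grid: each entry overrides "params" for one job run.
sampler = [
    {"learning_rate": 0.1},
    {"learning_rate": 0.01},
    {"learning_rate": 0.001},
]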
Example #6
    def __post_init__(self):
        job = from_config(self.job)
        if not isinstance(job, base.Job):
            raise TypeError(f"Expected type {base.Job} but got {type(job)}")
Example #7
    def __post_init__(self):
        trainer = from_config(self.trainer)
        if not isinstance(trainer, Trainer):
            raise TypeError(
                f"Expected job of type {Trainer} but got {type(trainer)}")