def __init__(self, src_dir, dst_dir, config, **build_opts):
    src_dir = local.path(src_dir)
    dst_dir = local.path(dst_dir)
    skip_s3 = build_opts.pop("skip_s3", False)

    self._tasks = {}
    task_list = task_list_from_config(config)
    for task_name, task_klass, task_info in task_list:
        self._translate_s3_references(task_info, skip_s3)
        self._tasks[task_name] = (task_klass, task_info, {})

    n_fields_limit = build_opts.pop("n_fields_limit", None)
    if n_fields_limit is not None:
        important(f"Limiting to only {n_fields_limit} fields")

        # TASK: Convert these from named tasks to ANY task of that type
        for task_name in ("ims_import", "sigproc_v1", "sigproc_v2"):
            if task_name in self._tasks:
                self._tasks[task_name][1]["parameters"][
                    "n_fields_limit"
                ] = n_fields_limit

    super().__init__(src_dir, dst_dir, self._tasks, **build_opts)
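# A sketch addressing the TASK note in __init__ above: rather than naming each
# task, propagate n_fields_limit to ANY task whose class opts in.
# "accepts_n_fields_limit" is a hypothetical class attribute assumed for
# illustration only; the real task classes may expose this capability differently.
def _propagate_n_fields_limit(self, n_fields_limit):
    for task_name, (task_klass, task_info, _) in self._tasks.items():
        if getattr(task_klass, "accepts_n_fields_limit", False):
            task_info["parameters"]["n_fields_limit"] = n_fields_limit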
def run_nbstripout(self):
    """Strip all notebooks of output to save space in commits"""
    important("Stripping Notebooks...")
    result = (
        local["find"][
            ".",
            "-type", "f",
            "-not", "-path", r"*/\.*",
            "-name", "*.ipynb",
            "-print",
        ]
        | local["xargs"]["nbstripout"]
    ) & TF(FG=True)
    if not result:
        raise CommandError
def print_local_job_folders(self):
    important("Local job folders:")
    root = local.path("./jobs_folder")
    self._print_job_folders(
        [
            Munch(
                folder=(p - root)[0],
                name=p.name,
                size=int(p.stat().st_size),
                mtime=int(p.stat().st_mtime),
            )
            for p in root.walk()
        ]
    )
def run_ipynb(self, ipynb_path):
    # Note: the timeout has been set to 8 hours (28800 seconds) to
    # accommodate reports for huge jobs (e.g. 100+ runs).
    important(f"Executing report notebook {ipynb_path}")
    local["jupyter"](
        "nbconvert",
        "--to", "html",
        "--execute", ipynb_path,
        "--ExecutePreprocessor.timeout=28800",
    )
def run_docker_build(self, docker_tag, quiet=False):
    important(f"Building docker tag {docker_tag}")
    with local.env(LANG="en_US.UTF-8"):
        args = [
            "build",
            "-t", f"erisyon:{docker_tag}",
            "-f", "./scripts/main_env.docker",
        ]
        if quiet:
            args += ["--quiet"]
        args += ["."]
        local["docker"][args] & FG
def train_rf(train_rf_params, sim_result, progress=None):
    X = sim_result.flat_train_radmat()
    y = sim_result.train_true_pep_iz()

    if train_rf_params.n_subsample is not None:
        X, y = _subsample(train_rf_params.n_subsample, X, y)
    elif sim_result.params.n_samples_train > 1000:
        important(
            "Warning: RF does not memory-scale well when the n_samples_train is > 1000."
        )

    del train_rf_params["n_subsample"]
    classifier = SciKitLearnRandomForestClassifier(**train_rf_params)
    classifier.train(X, y, progress)
    return TrainRFResult(params=train_rf_params, classifier=classifier)
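# A minimal sketch of the _subsample helper referenced above -- its real
# implementation is not shown here, so this is an assumption: it draws at most
# n_subsample rows per class so the Random Forest fits in memory.
# Hypothetical implementation, for illustration only.
import numpy as np

def _subsample(n_subsample, X, y):
    keep = []
    for klass in np.unique(y):
        iz = np.argwhere(y == klass).flatten()
        keep += list(
            np.random.choice(iz, size=min(n_subsample, len(iz)), replace=False)
        )
    keep = np.sort(keep)
    return X[keep], y[keep]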
def validate_job_name_and_folder(self):
    """
    Validate the job name and compute the job_folder path.
    Optionally delete the job_folder if it exists.

    Returns:
        job_folder path
    """
    if self.job is None:
        raise ValidationError("job not specified.")

    self.job = self.job.lower()
    if not utils.is_symbol(self.job):
        raise ValidationError(
            "job should be a symbol; only a-z, 0-9, and _ are allowed."
        )

    job_folder = local.path(self.jobs_folder) / self.job

    delete_job = False
    if self.overwrite:
        # Overwrite means: reuse the existing folder in place, never delete it
        delete_job = False
    elif self.force:
        delete_job = True
    elif job_folder.exists():
        delete_job = confirm_yn(
            (
                colors.red & colors.bold
                | "Do you really want to remove ALL contents of "
            )
            + (
                colors.yellow
                | f"'{job_folder}'?\nIf no, then the job may be left in an inconsistent state.\n"
            ),
            "y",
        )

    if delete_job:
        important(f"Deleting all of {job_folder}.")
        job_folder.delete()

    return job_folder
def run_zests(self, **kwargs):
    coverage = kwargs.pop("coverage", False)
    important(f"Running zests{' (with coverage)' if coverage else ''}...")

    if coverage:
        # The coverage path is not yet implemented; the code below the raise
        # is kept for reference but is currently unreachable.
        raise NotImplementedError
        ret = local["coverage"]["run", "./gen_main.py", "zest"] & RETCODE(FG=True)
        if ret == 0:
            local["coverage"]["html"] & FG
            local["xdg-open"]("./.coverage_html/index.html")
    else:
        from zest.zest_runner import ZestRunner

        try:
            runner = ZestRunner(include_dirs="./gen:./run:./tools", **kwargs)
            if runner.retcode != 0:
                raise CommandError
            return 0
        except Exception as e:
            colorful_exception(e)
            return 1
def _request_field_from_user(self, field_name, type_, default):
    """Mock point"""
    headless = ValueError(f"Attempt to request field {field_name} in headless mode")
    while True:
        resp = input_request(
            f"Enter {field_name} ({type_.__name__} default={default}): ",
            default_when_headless=headless,
        )
        try:
            if resp == "":
                resp = default
            if resp is None:
                val = None
            else:
                val = type_(resp)
        except Exception:
            important(f"Unable to convert '{resp}' to {type_}. Try again.")
        else:
            break
    return val
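# A minimal sketch of the input_request contract assumed above -- the real
# helper lives elsewhere. The assumption: it prompts on stdin, and when running
# headless (no TTY) it raises the default_when_headless value if that value is
# an exception, rather than blocking forever. Hypothetical, for illustration only.
import sys

def input_request(prompt, default_when_headless=None):
    if not sys.stdin.isatty():
        if isinstance(default_when_headless, BaseException):
            raise default_when_headless
        return default_when_headless
    return input(prompt)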
def _translate_s3_references(self, task, skip_s3):
    """
    Any "inputs" block may contain S3 references, in which case plaster
    s3-syncs that folder to a local cache and then substitutes the local
    path so that Pipeline is always working with local files.
    """
    for input_name, src_path in dict(task.inputs or {}).items():
        if not input_name.startswith("_") and src_path.startswith("s3:"):
            if not skip_s3:
                found_cache, dst_path = tmp.cache_path("plaster_s3", src_path)
                if not found_cache:
                    important(f"Syncing from {src_path} to {dst_path}")
                    local["aws"]["s3", "sync", src_path, dst_path] & FG

                # COPY the old src_path by prefixing it with an underscore
                task.inputs["_" + input_name] = str(src_path)

                # RESET the input to the new dst_path
                task.inputs[input_name] = str(dst_path)
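# A minimal sketch of the tmp.cache_path contract assumed above -- the real
# helper is defined elsewhere. Assumption: it maps a source URL to a stable
# local cache folder and reports whether that folder already exists, so
# repeated builds can skip the s3 sync. Hypothetical, for illustration only.
import hashlib
from plumbum import local

def cache_path(namespace, src_path):
    digest = hashlib.md5(src_path.encode()).hexdigest()
    dst_path = local.path("/tmp") / namespace / digest
    found_cache = dst_path.exists()
    if not found_cache:
        dst_path.mkdir()
    return found_cache, dst_path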
def _save_np(self, arr, name):
    if self.save_as is not None:
        save_as = local.path(self.save_as) + f"_{name}"
        np.save(save_as, arr)
        important(f"Wrote synth image to {save_as}.npy")
def _convert_message(target_dim, new_dim):
    """Mock-point"""
    important(f"Converting from dim {target_dim} to {new_dim}")
def main(self, job_folder=None):
    switches = utils.plumbum_switches(self)

    if job_folder is None:
        error("No job_folder was specified")
        return 1

    important(
        f"Plaster run {job_folder} limit={self.limit} started at {arrow.utcnow().format()}"
    )

    job_folder = assets.validate_job_folder_return_path(
        job_folder, allow_run_folders=True
    )
    if not job_folder.exists():
        error(f"Unable to find the path {job_folder}")
        return 1

    # Find all the plaster_run.yaml files. They might be in run subfolders.
    found = list(job_folder.walk(filter=lambda p: p.name == "plaster_run.yaml"))
    run_dirs = [p.dirname for p in found]

    if len(run_dirs) == 0:
        error("Plaster: Nothing to do because no run_dirs have plaster_run.yaml files")
        return 1

    # A normal run where everything happens in this process
    failure_count = 0
    for run_dir_i, run_dir in enumerate(sorted(run_dirs)):
        metrics(
            _type="plaster_start",
            run_dir=run_dir,
            run_dir_i=run_dir_i,
            run_dir_n=len(run_dirs),
            **switches,
        )
        important(
            f"Starting run subdirectory {run_dir}. {run_dir_i + 1} of {len(run_dirs)}"
        )

        try:
            with zap.Context(cpu_limit=self.cpu_limit, debug_mode=self.debug_mode):
                run = RunExecutor(run_dir).load()
                if "_erisyon" in run.config:
                    metrics(_type="erisyon_block", **run.config._erisyon)

                failure_count += run.execute(
                    force=self.force,
                    limit=self.limit.split(",") if self.limit else None,
                    clean=self.clean,
                    n_fields_limit=self.n_fields_limit,
                    skip_s3=self.skip_s3,
                )
        except Exception as e:
            failure_count += 1
            if not self.continue_on_error:
                raise e

    if (
        failure_count == 0
        and self.limit is None
        and not self.clean
        and not self.skip_reports
    ):
        # RUN reports
        report_src_path = job_folder / "report.ipynb"
        report_dst_path = job_folder / "report.html"
        if self.force or (
            report_src_path.exists()
            and utils.out_of_date(report_src_path, report_dst_path)
        ):
            self.run_ipynb(report_src_path)
        return 0

    return failure_count
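# A minimal sketch of the utils.out_of_date helper assumed above -- the real
# implementation lives elsewhere. Assumption: a make-style mtime comparison,
# true when the destination is missing or older than the source.
# Hypothetical, for illustration only.
import os

def out_of_date(src_path, dst_path):
    if not os.path.exists(dst_path):
        return True
    return os.path.getmtime(src_path) > os.path.getmtime(dst_path)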