def is_blocked(self, state):
    if not self.status_path or not self.status_path.exists():
        log_message("error", f"Status file for {self.dataset_id} cannot be found")
        sys.exit(1)

    # reload the status file in case something has changed
    self.load_dataset_status_file()

    status_attrs = state.split(":")
    blocked = False
    if status_attrs[0] in self.stat["WAREHOUSE"].keys():
        state_messages = sorted(self.stat["WAREHOUSE"][status_attrs[0]])
        for ts, message in state_messages:
            if "Blocked" not in message and "Unblocked" not in message:
                continue
            message_items = message.split(":")
            if len(message_items) < 2:
                continue
            if message_items[0] not in state:
                continue
            # check "Unblocked" first, since "Blocked" is a substring of "Unblocked"
            if "Unblocked" in message_items[1]:
                blocked = False
            elif "Blocked" in message_items[1]:
                blocked = True
    return blocked
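# A minimal, standalone sketch of the Blocked/Unblocked scan above. The
# (timestamp, message) tuple layout and the "WORKFLOW:Blocked:" /
# "WORKFLOW:Unblocked:" message format are assumptions inferred from
# is_blocked(), not a documented interface.
def _sketch_is_blocked(state_messages, state):
    blocked = False
    for _ts, message in sorted(state_messages):
        items = message.split(":")
        if len(items) < 2 or items[0] not in state:
            continue
        # check "Unblocked" first, since "Blocked" is a substring of "Unblocked"
        if "Unblocked" in items[1]:
            blocked = False
        elif "Blocked" in items[1]:
            blocked = True
    return blocked

# Example (hypothetical timestamps and states): the later "Unblocked" entry
# wins because the messages are scanned in time order.
# _sketch_is_blocked(
#     [("20240101_000000_000000", "MPASANALYSIS:Blocked:"),
#      ("20240102_000000_000000", "MPASANALYSIS:Unblocked:")],
#     state="MPASANALYSIS:VALIDATE:Ready:")   # -> False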
@property
def latest_warehouse_dir(self):
    if self.warehouse_path is None or (not self.warehouse_path and not self.warehouse_path.exists()):
        log_message(
            "error",
            f"The dataset {self.dataset_id} does not have a warehouse path")
        sys.exit(1)

    if self.project != "CMIP6" and not self.warehouse_path.exists():
        self.warehouse_path.mkdir(parents=True, exist_ok=True)
    if self.project == "CMIP6" and not self.warehouse_path.exists():
        return None

    # we assume that the warehouse directory contains only directories named "v0.#" or "v#"
    try:
        latest_version = sorted([
            float(str(x.name)[1:]) for x in self.warehouse_path.iterdir()
            if x.is_dir() and any(x.iterdir()) and "tmp" not in x.name
        ]).pop()
    except IndexError:
        latest_version = 0

    if not isinstance(latest_version, int) and latest_version.is_integer():
        latest_version = int(latest_version)
    if latest_version < 0.1:
        latest_version = 0

    path_to_latest = Path(self.warehouse_path, f"v{latest_version}").resolve()
    if "CMIP6" not in self.dataset_id and not path_to_latest.exists():
        path_to_latest.mkdir(parents=True)
    return str(path_to_latest)
def check_climos(self, files):
    """
    Given a list of climo files, find any that are missing
    """
    missing = []
    pattern = r"_\d{6}_\d{6}_climo.nc"
    files = sorted(files)

    idx = re.search(pattern=pattern, string=files[0])
    if not idx:
        log_message("error", f"Unexpected file format: {files[0]}")
        sys.exit(1)
    prefix = files[0][:idx.start() - 2]

    for month in range(1, 13):
        name = f"{prefix}{month:02d}_{self.start_year:04d}{month:02d}_{self.end_year:04d}{month:02d}_climo.nc"
        if name not in files:
            missing.append(name)

    for season in SEASONS:
        name = f'{prefix}{season["name"]}_{self.start_year:04d}{season["start"]}_{self.end_year:04d}{season["end"]}_climo.nc'
        if name not in files:
            missing.append(name)

    return missing
def __init__(self, *args, **kwargs):
    super().__init__(**kwargs)
    self.name = NAME.upper()
    parallel = self.params.get('parallel')
    self.serial = not parallel
    self.metadata_path = None
    log_message('info', f'initializing workflow {self.name}')
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = NAME.upper()
    self.pub_path = None
    log_message(
        'info',
        f'WF_pub_init Publication_init: initializing workflow {self.name}')
def load_children(self):
    my_path = Path(inspect.getfile(self.__class__)).parent.absolute()
    workflows = {}
    for d in os.scandir(my_path):
        if not d.is_dir() or d.name == "jobs" or d.name == "__pycache__":
            continue
        module_path = Path(my_path, d.name, '__init__.py')
        if not module_path.exists():
            log_message(
                'error',
                f"{module_path} does not exist; this does not look like a well-formed workflow")
            sys.exit(1)

        workflows_string = f"warehouse{os.sep}workflows"
        idx = str(my_path.resolve()).find(workflows_string)
        if self.name == NAME:
            module_name = f'warehouse.workflows.{d.name}'
        else:
            module_name = f'warehouse.workflows{str(my_path)[idx + len(workflows_string):].replace(os.sep, ".")}.{d.name}'
        self.print_debug(f"loading workflow module {module_name}")

        module = importlib.import_module(module_name)
        workflow_class = getattr(module, module.NAME)
        workflow_instance = workflow_class(
            parent=self, slurm_scripts=self.slurm_scripts)
        workflow_instance.load_children()
        workflow_instance.load_transitions()
        workflows[module.NAME.upper()] = workflow_instance
    self.children = workflows
def find_e3sm_source_dataset(self, job):
    """
    Given a job with a CMIP6 dataset that needs to be run, find the matching
    raw E3SM dataset it needs as input.

    Parameters:
        job (WorkflowJob): the CMIP6 job that needs to have its requirements met
    Returns:
        Dataset: the E3SM dataset that matches the input requirements for the job
        if found, else None
    """
    for x in self.collect_e3sm_datasets():
        dataset = Dataset(
            dataset_id=x,
            status_path=os.path.join(self.status_path, f"{x}.status"),
            pub_base=self.publication_path,
            warehouse_base=self.warehouse_path,
            archive_base=self.archive_path,
            no_status_file=True)
        if job.matches_requirement(dataset):
            dataset.initialize_status_file()
            msg = f"matching dataset found: {dataset.dataset_id}"
            log_message("debug", msg, self.debug)
            return dataset
    return None
@status.setter
def status(self, status):
    """
    Write out to the dataset's status file and update its record of the latest state.

    Because this is a @property setter, any parameters have to be passed in
    along with the status as a tuple: (status, params).
    """
    self.load_dataset_status_file()
    latest, _ = self.get_latest_status()
    if status is None or status == self._status or latest == status:
        log_message(
            "info",
            f"DBG: DS: status.setter: Return pre-set with input status = {status}")
        return

    params = None
    if isinstance(status, tuple):
        status, params = status

    self._status = status
    with open(self.status_path, "a") as outstream:
        tstamp = UTC.localize(datetime.utcnow()).strftime("%Y%m%d_%H%M%S_%f")
        msg = f'STAT:{tstamp}:{status}'
        if params is not None:
            items = [f"{k}={v}".replace(":", "^") for k, v in params.items()]
            msg += ",".join(items)
        outstream.write(msg + "\n")
    log_message("info", f"DBG: DS: status.setter: Wrote STAT message: {msg}")
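# A hedged illustration of the line the setter above appends to the status
# file, without the pytz localization used there. The status string and params
# dict below are hypothetical examples of the "STAT:<timestamp>:<status>"
# layout; colons inside parameter values are rewritten to "^" so the line can
# still be split on ":" when it is read back.
from datetime import datetime

def _sketch_stat_line(status, params=None):
    tstamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
    msg = f"STAT:{tstamp}:{status}"
    if params is not None:
        msg += ",".join(f"{k}={v}".replace(":", "^") for k, v in params.items())
    return msg

# _sketch_stat_line("WAREHOUSE:VALIDATION:Engaged:", {"slurm_id": 12345})
# -> "STAT:20240101_120000_000000:WAREHOUSE:VALIDATION:Engaged:slurm_id=12345"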
def check_monthly(self, files):
    """
    Given a list of monthly files, find any that are missing
    """
    missing = []
    files = sorted(files)
    pattern = r"\d{4}-\d{2}.*nc"
    try:
        idx = re.search(pattern=pattern, string=files[0])
    except Exception:
        log_message(
            "error",
            f"file {files[0]} does not match expected pattern for monthly files")
        sys.exit(1)
    if not idx:
        log_message("error", f"Unexpected file format: {files[0]}")
        sys.exit(1)

    prefix = files[0][:idx.start()]
    suffix = files[0][idx.start() + 7:]

    for year in range(self.start_year, self.end_year + 1):
        for month in range(1, 13):
            name = f"{prefix}{year:04d}-{month:02d}{suffix}"
            if name not in files:
                missing.append(name)
    return missing
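# A small self-contained check of the prefix/suffix split used by
# check_monthly(); the filename below is hypothetical but follows the
# "<prefix>YYYY-MM<suffix>" convention the regex expects.
import re

def _sketch_expected_monthly(first_file, start_year, end_year):
    idx = re.search(r"\d{4}-\d{2}.*nc", first_file)
    prefix = first_file[:idx.start()]          # e.g. "case.eam.h0."
    suffix = first_file[idx.start() + 7:]      # e.g. ".nc"
    return [f"{prefix}{year:04d}-{month:02d}{suffix}"
            for year in range(start_year, end_year + 1)
            for month in range(1, 13)]

# _sketch_expected_monthly("case.eam.h0.1850-01.nc", 1850, 1850)[:2]
# -> ["case.eam.h0.1850-01.nc", "case.eam.h0.1850-02.nc"]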
def __call__(self):
    from warehouse.warehouse import AutoWarehouse

    dataset_id = self.params['dataset_id']
    log_message("info", f'Starting with datasets {dataset_id}')

    if (metadata_path := self.params.get('metadata_path')):
        self.metadata_path = Path(metadata_path)
def check_time_series(self, files):
    """
    Given a list of time-series files, find any missing year ranges per variable
    """
    missing = []
    files = [x.split("/")[-1] for x in sorted(files)]

    if not self.datavars:
        log_message(
            "error",
            f"dataset.py: check_time_series: dataset {self.dataset_id} is trying "
            "to validate time-series files, but has no datavars")
        sys.exit(1)

    for var in self.datavars:
        # depending on the mapping file used to regrid the time-series
        # they may have different names, so we start by finding
        # all the files for each variable
        v_files = list()
        for x in files:
            idx = -36 if "cmip6_180x360_aave" in x else -17
            if var in x and x[:idx] == var:
                v_files.append(x)

        if not v_files:
            missing.append(
                f"{self.dataset_id}-{var}-{self.start_year:04d}-{self.end_year:04d}")
            continue

        v_files = sorted(v_files)
        v_start, v_end = self.get_ts_start_end(v_files[0])
        if self.start_year != v_start:
            missing.append(
                f"{self.dataset_id}-{var}-{self.start_year:04d}-{v_start:04d}")

        prev_end = self.start_year
        for file in v_files:
            file_start, file_end = self.get_ts_start_end(file)
            if file_start == self.start_year:
                prev_end = file_end
                continue
            if file_start == prev_end + 1:
                prev_end = file_end
            else:
                missing.append(
                    f"{self.dataset_id}-{var}-{prev_end:04d}-{file_start:04d}")

        # check that the last file for this variable reaches the end year
        file_start, file_end = self.get_ts_start_end(v_files[-1])
        if file_end != self.end_year:
            missing.append(
                f"{self.dataset_id}-{var}-{file_start:04d}-{self.end_year:04d}")

    return missing
def arg_checker(args):
    if not os.path.exists(args.path):
        log_message('error', f"The given path {args.path} does not exist")
        return False, COMMAND
    if not os.path.exists(args.zstash):
        log_message('error', f"The given path {args.zstash} does not exist")
        return False, COMMAND
    return True, COMMAND
def get_ts_start_end(filename):
    # time-series filenames end in "_YYYYMM_YYYYMM.nc"; the years are the
    # first four digits of each six-digit block after the matched underscore
    p = re.compile(r"_\d{6}_\d{6}.*nc")
    idx = p.search(filename)
    if not idx:
        log_message("error", f"Unexpected file format: {filename}")
        sys.exit(1)
    start = int(filename[idx.start() + 1:idx.start() + 5])
    end = int(filename[idx.start() + 8:idx.start() + 12])
    return start, end
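# A quick, self-contained check of the slice arithmetic in get_ts_start_end(),
# using a hypothetical time-series filename that follows the "_YYYYMM_YYYYMM"
# convention matched by the regex above.
import re

_name = "PRECT_185001_186012.nc"
_match = re.compile(r"_\d{6}_\d{6}.*nc").search(_name)
# characters [start+1 : start+5] hold the first year, [start+8 : start+12] the last
assert int(_name[_match.start() + 1:_match.start() + 5]) == 1850
assert int(_name[_match.start() + 8:_match.start() + 12]) == 1860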
def load_transitions(self):
    transition_path = Path(
        Path(inspect.getfile(self.__class__)).parents[0], 'transitions.yaml')
    with open(transition_path, 'r') as instream:
        self.transitions = yaml.load(instream, Loader=yaml.SafeLoader)
    log_message("info", f"WF_init: {self.name} loads transitions")
    log_message(
        "debug",
        f"WF_init: {self.name} loads transitions {self.transitions}")
def check_submonthly(self, files):
    missing = list()
    files = sorted(files)
    first = files[0]

    pattern = re.compile(r"\d{4}-\d{2}.*nc")
    if not (idx := pattern.search(first)):
        log_message("error", f"Unexpected file format: {first}")
        sys.exit(1)
def start_listener(self):
    """
    Starts a file change listener for the status file
    for each of the datasets.
    """
    self.listener = []
    for _, dataset in self.datasets.items():
        log_message("info", f"starting listener for {dataset.status_path}")
        listener = Listener(warehouse=self, file_path=dataset.status_path)
        listener.start()
        self.listener.append(listener)
    log_message("info", "Listener setup complete")
def __call__(self, slurm):
    if not self.meets_requirements():
        log_message("error", f"Job does not meet requirements! {self.requires}")
        return None

    msg = f"Starting job: {str(self)} with reqs {[x.dataset_id for x in self.requires.values()]}"
    log_message('debug', msg)

    self.resolve_cmd()

    working_dir = self.dataset.latest_warehouse_dir
    if self.dataset.is_locked(working_dir):
        log_message(
            'warning',
            f"Cannot start job, working dir is locked: {working_dir}")
        return None
    else:
        self.dataset.lock(working_dir)

    self._outname = self.get_slurm_output_script_name()
    output_option = (
        '-o', f'{Path(self._slurm_out, self._outname).resolve()}')
    self._slurm_opts.extend(
        [output_option, ('-N', 1), ('-c', self._job_workers)])

    script_name = self.get_slurm_run_script_name()
    script_path = Path(self._slurm_out, script_name)
    script_path.touch(mode=0o664)

    message_file = NamedTemporaryFile(dir=self.tmpdir, delete=False)
    Path(message_file.name).touch()
    self._cmd = f"export message_file={message_file.name}\n" + self._cmd
    self.add_cmd_suffix()

    log_message(
        "info",
        f"WF_jobs_init:render_script: self.cmd={self.cmd}, script_path={str(script_path)}")
    slurm.render_script(self.cmd, str(script_path), self._slurm_opts)
    self._job_id = slurm.sbatch(str(script_path))

    log_message(
        "info",
        f"WF_jobs_init: __call__: setting status to {self._parent}:{self.name}:Engaged: for {self.dataset.dataset_id}")
    self.dataset.status = (
        f"{self._parent}:{self.name}:Engaged:", {"slurm_id": self.job_id})
    return self._job_id
def meets_requirements(self):
    """
    Check if all the requirements for the job are met
    """
    for req in self._requires:
        obtained = self._requires.get(req)
        log_message(
            "debug",
            f"WF_jobs_init: job.meets_requirements(): self._requires.get(req) yields {obtained}")
        if not obtained:
            log_message(
                "info",
                "WF_jobs_init: job.meets_requirements(): returning False")
            return False
    return True
def __call__(self, *args, **kwargs):
    from warehouse.warehouse import AutoWarehouse

    dataset_id = self.params['dataset_id']
    tmpdir = self.params['tmp']
    log_message(
        'info',
        f'WF_pub_init Publication_call: starting workflow {self.name} for datasets {dataset_id}')

    if (pub_base := self.params.get('publication_path')):
        self.pub_path = Path(pub_base)
        if not self.pub_path.exists():
            log_message(
                "info",
                f"WF_pub_init Publication_call: create pub dir {self.pub_path.resolve()}")
            os.makedirs(self.pub_path.resolve())
def check_done(self):
    """
    Checks all the datasets to see if they're in the Pass or Fail state.
    If ALL datasets are in either Pass or Fail, the filesystem listeners are
    shut down, the 'should_exit' variable is set, and sys.exit(0) is called.
    """
    all_done = True
    for dataset in self.datasets.values():
        if (f"{self.workflow.name.upper()}:Pass:" not in dataset.status
                and f"{self.workflow.name.upper()}:Fail:" not in dataset.status):
            all_done = False
    if all_done:
        for listener in self.listener:
            listener.observer.stop()
        self.should_exit = True
        log_message("info", "All datasets complete, exiting")
        sys.exit(0)
    return
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = NAME
    log_message("info", f"JOB PublishEsgf: processing {self.dataset}")

    mapfile_path = sorted(
        [x for x in self.dataset.publication_path.glob('*.map')]).pop()

    optional_facets = {}
    if self.dataset.project == 'E3SM':
        dataset_attrs = self.dataset.dataset_id.split('.')
        model_version = dataset_attrs[1]
        experiment_name = dataset_attrs[2]
        experiment_info = self._spec['project']['E3SM'][model_version][experiment_name]
        if (campaign := experiment_info.get('campaign')):
            optional_facets['campaign'] = campaign
        if (science_driver := experiment_info.get('science_driver')):
            optional_facets['science_driver'] = science_driver
def __call__(self, *args, **kwargs):
    from warehouse.warehouse import AutoWarehouse

    log_message('info', f'starting workflow {self.name}')

    dataset_ids = self.params['dataset_id']
    warehouse_path = self.params['warehouse_path']
    publication_path = self.params['publication_path']
    archive_path = self.params['archive_path']
    status_path = self.params.get('status_path')

    if (data_path := self.params.get('data_path')):
        warehouse = AutoWarehouse(
            workflow=self,
            dataset_id=dataset_ids,
            warehouse_path=data_path,
            publication_path=publication_path,
            archive_path=archive_path,
            status_path=status_path,
            serial=True,
            job_worker=self.job_workers,
            debug=self.debug)
def status_was_updated(self, path):
    """
    This should be called whenever a dataset's status file is updated

    Parameters:
        path (str): the path to the status file
    """
    dataset_id = None
    with open(path, "r") as instream:
        for line in instream.readlines():
            if "DATASETID" in line:
                dataset_id = line.split("=")[-1].strip()
    if dataset_id is None:
        log_message("error", "Unable to find dataset ID in status file")

    dataset = self.datasets[dataset_id]
    dataset.update_from_status_file()
    dataset.unlock(dataset.latest_warehouse_dir)

    # check to see if there's a slurm ID in the second-to-last status entry,
    # and if there is, and the latest is either Pass or Fail, then
    # remove the job from the job_pool
    latest, second_latest = dataset.get_latest_status()
    log_message("info", f"dataset: {dataset_id} updated to state {latest}")

    if second_latest is not None:
        latest_attrs = latest.split(":")
        second_latest_attrs = second_latest.split(":")
        if "slurm_id" in second_latest_attrs[-1]:
            job_id = int(
                second_latest_attrs[-1][second_latest_attrs[-1].index("=") + 1:])
            # if the job names are the same
            if second_latest_attrs[-3] == latest_attrs[-3]:
                if "Pass" in latest_attrs[-2] or "Fail" in latest_attrs[-2]:
                    for job in self.job_pool:
                        if job.job_id == job_id:
                            self.job_pool.remove(job)
                            break

    # start the transition change for the dataset
    self.start_datasets({dataset_id: dataset})
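# A hedged sketch of how the slurm id is recovered from the second-to-latest
# status entry above; the status string in the example is a hypothetical
# instance of the "WORKFLOW:JOB:STATE:key=value" layout this method expects.
def _sketch_extract_slurm_id(status_entry):
    last_field = status_entry.split(":")[-1]
    if "slurm_id" in last_field:
        return int(last_field[last_field.index("=") + 1:])
    return None

# _sketch_extract_slurm_id("WAREHOUSE:VALIDATION:Engaged:slurm_id=12345")  # -> 12345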
def setup_datasets(self, check_esgf=True):
    log_message("info", "WH: setup_datasets: Initializing the warehouse")
    log_message("info", f"WH: self.warehouse_path = {self.warehouse_path}")
    log_message("info", f"WH: self.publication_path = {self.publication_path}")

    cmip6_ids = [x for x in self.collect_cmip_datasets()]
    e3sm_ids = [x for x in self.collect_e3sm_datasets()]
    all_dataset_ids = cmip6_ids + e3sm_ids

    # if the user gave us a wild card, filter out anything
    # that doesn't match their pattern
    if self.dataset_ids:
        dataset_ids = []
        for dataset_pattern in self.dataset_ids:
            if new_ids := fnmatch.filter(all_dataset_ids, dataset_pattern):
                dataset_ids.extend(new_ids)
        self.dataset_ids = dataset_ids
def print_missing(self):
    found_missing = False
    for x in self.datasets.values():
        if x.missing:
            found_missing = True
            for m in x.missing:
                print(f"{m}")
        elif x.status == DatasetStatus.UNITITIALIZED.value:
            found_missing = True
            msg = f"No files in dataset {x.dataset_id}"
            log_message("error", msg)
        elif x.status != DatasetStatus.SUCCESS.value:
            found_missing = True
            msg = f"Dataset {x.dataset_id} status is {x.status}"
            log_message("error", msg)
    if not found_missing:
        log_message("info", "No missing files in datasets")
def get_esgf_status(self):
    """
    Check ESGF to see if the dataset has already been published;
    if it exists, check that the dataset is complete
    """
    # TODO: fix this at some point
    if "CMIP6" in self.dataset_id:
        project = "CMIP6"
    else:
        project = "e3sm"

    facets = {"master_id": self.dataset_id, "type": "Dataset"}
    docs = search_esgf(project, facets)

    if not docs or int(docs[0]["number_of_files"]) == 0:
        if not docs:
            log_message(
                "info",
                "dataset.py get_esgf_status: search facets for Dataset returned empty docs")
        else:
            log_message(
                "info",
                f"dataset.py get_esgf_status: dataset query returned file_count = {int(docs[0]['number_of_files'])}")
        return DatasetStatus.UNITITIALIZED.value

    facets = {"dataset_id": docs[0]["id"], "type": "File"}
    docs = search_esgf(project, facets)
    if not docs or len(docs) == 0:
        log_message(
            "info",
            "dataset.py get_esgf_status: search facets for File returned empty docs")
        return DatasetStatus.UNITITIALIZED.value

    files = [x["title"] for x in docs]
    if self.check_dataset_is_complete(files):
        return DatasetStatus.PUBLISHED.value
    else:
        return DatasetStatus.PARTIAL_PUBLISHED.value
def arg_checker(args, command=NAME):
    if args.data_path and not args.dataset_id:
        log_message(
            'error',
            "\nIf the data_path is given, please also give a dataset ID for the data at the path\n")
        return False, command
    if not args.dataset_id and not args.data_path:
        log_message(
            'error',
            "\nError: please specify either the dataset-ids to process, or the data-path to find datasets\n")
        return False, command
    if isinstance(args.dataset_id, list) and len(args.dataset_id) > 1 and args.data_path:
        log_message(
            'error',
            "\nMultiple datasets were given along with the --data-path. For multiple datasets you must "
            "use the --warehouse-path and the E3SM publication directory structure\n")
        return False, command
    return True, command
def start_datasets(self, datasets=None):
    """
    Resolve next steps for datasets and create job objects for them

    Parameters:
        datasets (dict): string dataset_ids mapped to dataset objects
    Returns:
        list of new job objects
    """
    log_message(
        "info", "WH: start_datasets: Generate job objects for each dataset")
    log_message("debug", f"WH: start_datasets: datasets={datasets}")

    new_jobs = []
    ready_states = [
        DatasetStatus.NOT_IN_PUBLICATION.value,
        DatasetStatus.NOT_IN_WAREHOUSE.value,
        DatasetStatus.PARTIAL_PUBLISHED.value,
        DatasetStatus.UNITITIALIZED.value,
    ]

    if datasets is None:
        datasets = self.datasets

    for dataset_id, dataset in datasets.items():
        log_message(
            "debug",
            f"WH: start_datasets: working dataset_id {dataset_id} from datasets.items()")
        log_message(
            "debug", f"WH: start_datasets: dataset.status = {dataset.status}")

        if "Engaged" in dataset.status:
            log_message(
                "debug",
                "WH: start_datasets: 'Engaged' in dataset.status: continue")
            continue

        # for all the datasets, if they're not yet published or in the warehouse
        # then mark them as ready to start
        if dataset.status in ready_states:
            log_message(
                'debug',
                f"WH: start_datasets: Dataset {dataset.dataset_id} is transitioning "
                f"from {dataset.status} to {DatasetStatus.READY.value}")
            dataset.status = DatasetStatus.READY.value
            continue

        # we keep a reference to the workflow instance, so when
        # we make a job we can reconstruct the parent workflow name
        # for the status file
        log_message(
            "debug", "WH: start_datasets: To reconstruct parent workflow name:")
        params = {}
        if parameters := dataset.status.split(":")[-1].strip():
            for item in parameters.split(","):
                key, value = item.split("=")
                params[key] = value.replace("^", ":")
                log_message(
                    "debug",
                    f"WH: start_datasets: params[{key}] = {params[key]}")

        state = dataset.status
        workflow = self.workflow
        log_message("debug", f"WH: start_datasets: state = {state}")
        log_message("debug", f"WH: start_datasets: workflow = {self.workflow}")

        if state == DatasetStatus.UNITITIALIZED.value:
            state = DatasetStatusMessage.WAREHOUSE_READY.value

        # check that the dataset isn't blocked by some other process that's acting
        # on it, and that the workflow hasn't either failed or succeeded
        if dataset.is_blocked(state):
            msg = f"Dataset {dataset.dataset_id} at state {state} is marked as Blocked"
            log_message("error", msg)
            continue
        elif f"{self.workflow.name.upper()}:Pass:" == state:
            self.workflow_success(dataset)
            self.check_done()
            continue
        elif f"{self.workflow.name.upper()}:Fail:" == state:
            self.workflow_error(dataset)
            self.check_done()
            continue

        # there may be several transitions out of this state and
        # we need to collect them all
        engaged_states = []
        for item in self.workflow.next_state(dataset, state, params):
            new_state, workflow, params = item
            # if we have a new state with the "Engaged" keyword
            # we know it's a leaf node that needs to be executed
            if "Engaged" in new_state:
                engaged_states.append((new_state, workflow, params))
            # otherwise the new state and its parameters need to be
            # written to the dataset status file
            else:
                msg = (f"warehouse: start_datasets: Dataset {dataset.dataset_id} "
                       f"transitioning to state {new_state}")
                if params:
                    msg += f" with params {params}"
                log_message("info", msg)
                log_message("debug", msg, self.debug)
                dataset.status = (new_state, params)

        if not engaged_states:
            continue

        for state, workflow, params in engaged_states:
            newjob = self.workflow.get_job(
                dataset,
                state,
                params,
                self.scripts_path,
                self.slurm_path,
                workflow=workflow,
                job_workers=self.job_workers,
                spec=self.dataset_spec,
                debug=self.debug,
                config=warehouse_conf,
                other_datasets=list(self.datasets.values()),
                serial=self.serial,
                tmpdir=self.tmpdir,
            )
            if newjob is None:
                continue
            # check if the new job is a duplicate
            if (matching_job := self.find_matching_job(newjob)) is None:
                log_message(
                    "debug",
                    f"Created job from {state} for dataset {dataset_id}")
                new_jobs.append(newjob)
            else:
                matching_job.setup_requisites(newjob.dataset)
def __init__(self, *args, **kwargs):
    super().__init__()
    self.warehouse_path = Path(
        kwargs.get("warehouse_path", DEFAULT_WAREHOUSE_PATH))
    self.publication_path = Path(
        kwargs.get("publication_path", DEFAULT_PUBLICATION_PATH))
    self.archive_path = Path(
        kwargs.get("archive_path", DEFAULT_ARCHIVE_PATH))
    self.status_path = Path(kwargs.get("status_path", DEFAULT_STATUS_PATH))
    self.spec_path = Path(kwargs.get("spec_path", DEFAULT_SPEC_PATH))
    self.num_workers = kwargs.get("num", 8)
    self.serial = kwargs.get("serial", False)
    self.testing = kwargs.get("testing", False)
    self.dataset_ids = kwargs.get("dataset_id")
    if self.dataset_ids is not None and not isinstance(self.dataset_ids, list):
        self.dataset_ids = [self.dataset_ids]
    self.slurm_path = kwargs.get("slurm", "slurm_scripts")
    self.report_missing = kwargs.get("report_missing")
    self.job_workers = kwargs.get("job_workers", 8)
    self.datasets = None
    self.datasets_from_path = kwargs.get("datasets_from_path", False)
    os.makedirs(self.slurm_path, exist_ok=True)
    self.should_exit = False
    self.debug = "DEBUG" if kwargs.get("debug") else "INFO"
    self.ask = kwargs.get("ask")
    self.tmpdir = kwargs.get("tmp", os.environ.get("TMPDIR", "/tmp"))
    self.scripts_path = Path(
        Path(inspect.getfile(self.__class__)).parent.absolute(),
        "scripts").resolve()

    # not sure where to put this - Tony
    setup_logging("debug", f"{self.slurm_path}/warehouse.log")

    if not self.report_missing:
        self.workflow = kwargs.get(
            "workflow",
            Workflow(
                slurm_scripts=self.slurm_path,
                debug=self.debug,
                job_workers=self.job_workers))
        self.workflow.load_children()
        self.workflow.load_transitions()

    # this is a list of WorkflowJob objects
    self.job_pool = []

    # create the local Slurm object
    self.slurm = Slurm()

    # don't set up the listener until after we've gathered the datasets
    self.listener = None

    if self.serial is True:
        log_message("info", "Running warehouse in serial mode")
    else:
        log_message(
            "info",
            f"Running warehouse in parallel mode with {self.num_workers} workers")

    with open(self.spec_path, "r") as instream:
        self.dataset_spec = yaml.load(instream, Loader=yaml.SafeLoader)
def workflow_success(self, dataset):
    log_message(
        "info",
        f"Dataset {dataset.dataset_id} SUCCEEDED from {dataset.status}")