def run(self):
    """
    Override of Custodian.run() to include instructions to copy the
    temp_dir to the scratch partition on slave compute nodes if requested.

    Returns:
        self.run_log as populated while running the jobs.

    Raises:
        RuntimeError: if a CustodianError whose ``raises`` flag is set is
            caught while running the jobs.
    """
    cwd = os.getcwd()

    with ScratchDir(self.scratch_dir, create_symbolic_link=True,
                    copy_to_current_on_exit=True,
                    copy_from_current_on_enter=True) as temp_dir:
        self._manage_node_scratch(temp_dir_path=temp_dir,
                                  job_start=True)
        self.total_errors = 0
        start = datetime.datetime.now()
        logger.info("Run started at {} in {}.".format(
            start, temp_dir))
        v = sys.version.replace("\n", " ")
        logger.info("Custodian running on Python version {}".format(v))

        try:
            # skip jobs until the restart
            for job_n, job in islice(enumerate(self.jobs, 1),
                                     self.restart, None):
                self._run_job(job_n, job, temp_dir)
                # Checkpoint after each job so that we can recover from
                # last point and remove old checkpoints
                if self.checkpoint:
                    super(SSHCustodian, self)._save_checkpoint(cwd, job_n)
        except CustodianError as ex:
            logger.error(ex.message)
            if ex.raises:
                raise RuntimeError("{} errors reached: {}. Exited..."
                                   .format(self.total_errors, ex))
        finally:
            # Log the corrections to a json file.
            logger.info("Logging to {}...".format(
                super(SSHCustodian, self).LOG_FILE))
            dumpfn(self.run_log, super(SSHCustodian, self).LOG_FILE,
                   cls=MontyEncoder, indent=4)
            end = datetime.datetime.now()
            logger.info("Run ended at {}.".format(end))
            run_time = end - start
            logger.info("Run completed. Total time taken = {}."
                        .format(run_time))
            # Remove duplicate copy of log file, provided it ends with
            # ".log". The pattern is anchored at the end so that names which
            # merely contain ".log" (e.g. "out.log_old") are left alone.
            # The listing is materialised before anything is removed.
            log_files = [x for x in os.listdir(temp_dir)
                         if re.match(r'\w*\.log$', x)]
            for x in log_files:
                os.remove(os.path.join(temp_dir, x))
            self._manage_node_scratch(temp_dir_path=temp_dir,
                                      job_start=False)
            if self.gzipped_output:
                gzip_dir(".")

        # Cleanup checkpoint files (if any) if run is successful.
        super(SSHCustodian, self)._delete_checkpoints(cwd)

    return self.run_log
def run(self):
    """
    Run the configured jobs one after another inside a scratch directory.

    Jobs before ``self.restart`` are skipped. When checkpointing is enabled
    a checkpoint is saved after every job, and checkpoints are deleted once
    the job loop finishes without an exception escaping.

    Returns:
        self.run_log. Per the original documentation: all errors
        encountered as a list of list.
        [[error_dicts for job 1], [error_dicts for job 2], ....]

    Raises:
        RuntimeError: if a CustodianError whose ``raises`` flag is set is
            caught while running the jobs.
    """
    launch_dir = os.getcwd()

    with ScratchDir(
        self.scratch_dir,
        create_symbolic_link=True,
        copy_to_current_on_exit=True,
        copy_from_current_on_enter=True,
    ) as temp_dir:
        self.total_errors = 0
        started_at = datetime.datetime.now()
        logger.info("Run started at {} in {}.".format(started_at, temp_dir))
        py_version = sys.version.replace("\n", " ")
        logger.info(
            "Custodian running on Python version {}".format(py_version))
        host_info = get_execution_host_info()
        logger.info("Hostname: {}, Cluster: {}".format(*host_info))

        try:
            # Jobs are numbered from 1; everything before the restart index
            # is skipped.
            remaining = islice(enumerate(self.jobs, 1), self.restart, None)
            for job_n, job in remaining:
                self._run_job(job_n, job)
                if not self.checkpoint:
                    continue
                # Checkpoint after each job so the run can be resumed from
                # the last completed job.
                self.restart = job_n
                Custodian._save_checkpoint(launch_dir, job_n)
        except CustodianError as err:
            logger.error(err.message)
            if err.raises:
                summary = "{} errors reached: {}. Exited...".format(
                    self.total_errors, err)
                raise RuntimeError(summary)
        finally:
            # The corrections are always written to the json log file.
            logger.info("Logging to {}...".format(Custodian.LOG_FILE))
            dumpfn(self.run_log, Custodian.LOG_FILE, cls=MontyEncoder,
                   indent=4)
            finished_at = datetime.datetime.now()
            logger.info("Run ended at {}.".format(finished_at))
            elapsed = finished_at - started_at
            logger.info(
                "Run completed. Total time taken = {}.".format(elapsed))
            if self.gzipped_output:
                gzip_dir(".")

        # Cleanup checkpoint files (if any) if run is successful.
        Custodian._delete_checkpoints(launch_dir)

    return self.run_log
def run(self):
    """
    Runs all the jobs, starting from the restart index.

    The work happens inside a ScratchDir context. The run log is written to
    Custodian.LOG_FILE whether or not the jobs succeed.

    Returns:
        self.run_log. Per the original documentation: all errors
        encountered as a list of list.
        [[error_dicts for job 1], [error_dicts for job 2], ....]

    Raises:
        RuntimeError: if a CustodianError whose ``raises`` flag is set is
            caught while running the jobs.
    """
    cwd = os.getcwd()
    scratch_options = dict(
        create_symbolic_link=True,
        copy_to_current_on_exit=True,
        copy_from_current_on_enter=True,
    )

    with ScratchDir(self.scratch_dir, **scratch_options) as temp_dir:
        self.total_errors = 0
        t_begin = datetime.datetime.now()
        logger.info("Run started at {} in {}.".format(t_begin, temp_dir))
        version_line = sys.version.replace("\n", " ")
        logger.info(
            "Custodian running on Python version {}".format(version_line))
        logger.info(
            "Hostname: {}, Cluster: {}".format(*get_execution_host_info()))

        try:
            numbered_jobs = enumerate(self.jobs, 1)
            # skip jobs until the restart
            for job_n, job in islice(numbered_jobs, self.restart, None):
                self._run_job(job_n, job)
                # Checkpoint after each job so that we can recover from
                # last point and remove old checkpoints
                if self.checkpoint:
                    self.restart = job_n
                    Custodian._save_checkpoint(cwd, job_n)
        except CustodianError as custodian_err:
            logger.error(custodian_err.message)
            if custodian_err.raises:
                raise RuntimeError(
                    "{} errors reached: {}. Exited...".format(
                        self.total_errors, custodian_err))
        finally:
            # Log the corrections to a json file.
            log_path = Custodian.LOG_FILE
            logger.info("Logging to {}...".format(log_path))
            dumpfn(self.run_log, Custodian.LOG_FILE, cls=MontyEncoder,
                   indent=4)
            t_end = datetime.datetime.now()
            logger.info("Run ended at {}.".format(t_end))
            logger.info(
                "Run completed. Total time taken = {}.".format(
                    t_end - t_begin))
            if self.gzipped_output:
                gzip_dir(".")

        # Cleanup checkpoint files (if any) if run is successful.
        Custodian._delete_checkpoints(cwd)

    return self.run_log
def test_gzip(self):
    """
    gzip_dir on the "gzip_dir" fixture directory should leave
    "tempfile.gz" in place of "tempfile", with the expected content and a
    modification time matching self.mtime.
    """
    fixture_dir = os.path.join(test_dir, "gzip_dir")
    full_f = os.path.join(test_dir, "gzip_dir", "tempfile")
    gzip_dir(fixture_dir)

    gz_path = "{}.gz".format(full_f)
    self.assertTrue(os.path.exists(gz_path))
    self.assertFalse(os.path.exists(full_f))

    with GzipFile(gz_path) as g:
        first_line = g.readline().decode("utf-8")
        self.assertEqual(first_line, "what")

    # Compared to 4 decimal places against self.mtime.
    self.assertAlmostEqual(os.path.getmtime(gz_path), self.mtime, 4)
def test_gzip(self):
    """
    Check that gzip_dir compresses "tempfile" in the "gzip_dir" fixture
    directory: the ".gz" file exists, the uncompressed file is gone, the
    content reads back as "what", and the modification time matches
    self.mtime.
    """
    full_f = os.path.join(test_dir, "gzip_dir", "tempfile")
    gzip_dir(os.path.join(test_dir, "gzip_dir"))

    self.assertTrue(os.path.exists("{}.gz".format(full_f)))
    self.assertFalse(os.path.exists(full_f))

    with GzipFile("{}.gz".format(full_f)) as g:
        # assertEquals is a deprecated alias that was removed in
        # Python 3.12; assertEqual is the supported spelling.
        self.assertEqual(g.readline().decode("utf-8"), "what")

    self.assertAlmostEqual(os.path.getmtime("{}.gz".format(full_f)),
                           self.mtime, 4)
def test_handle_sub_dirs(self):
    """
    gzip_dir should also compress a file that lives in a sub-directory of
    the directory it is given.
    """
    top_dir = os.path.join(test_dir, "gzip_dir")
    sub_dir = os.path.join(test_dir, "gzip_dir", "sub_dir")
    sub_file = os.path.join(sub_dir, "new_tempfile")

    # Create the nested file that gzip_dir is expected to pick up.
    os.mkdir(sub_dir)
    with open(sub_file, "w") as handle:
        handle.write("anotherwhat")

    gzip_dir(top_dir)

    compressed = "{}.gz".format(sub_file)
    self.assertTrue(os.path.exists(compressed))
    self.assertFalse(os.path.exists(sub_file))

    with GzipFile(compressed) as g:
        content = g.readline().decode("utf-8")
        self.assertEqual(content, "anotherwhat")
def run_task(self, fw_spec=None):
    """
    Call gzip_dir on the current working directory.

    Args:
        fw_spec: not used by this task.
    """
    gzip_dir(os.getcwd())
def run_interrupted(self):
    """
    Runs custodian in an interrupted mode, which sets up and validates jobs
    but doesn't run the executable.

    On the first call (empty run log) the first job is set up. On later
    calls the most recent job is checked with the error handlers and
    validators, postprocessed, and the next job (if any) is set up.

    Returns:
        number of remaining jobs

    Raises:
        ValidationError: if a job fails validation
        ReturnCodeError: if the process has a return code different from 0
        NonRecoverableError: if an unrecoverable error occurs
        MaxCorrectionsPerJobError: if max_errors_per_job is reached
        MaxCorrectionsError: if max_errors is reached
        MaxCorrectionsPerHandlerError: if max_errors_per_handler is reached
    """
    start = datetime.datetime.now()
    try:
        cwd = os.getcwd()
        py_version = sys.version.replace("\n", " ")
        logger.info(
            "Custodian started in singleshot mode at {} in {}.".format(
                start, cwd))
        logger.info(
            "Custodian running on Python version {}".format(py_version))

        # load run log
        if os.path.exists(Custodian.LOG_FILE):
            self.run_log = loadfn(Custodian.LOG_FILE, cls=MontyDecoder)

        if len(self.run_log) == 0:
            # starting up an initial job - setup input and quit
            job_n = 0
            job = self.jobs[job_n]
            logger.info("Setting up job no. 1 ({}) ".format(job.name))
            job.setup()
            first_entry = {
                "job": job.as_dict(),
                "corrections": [],
                'job_n': job_n,
            }
            self.run_log.append(first_entry)
            return len(self.jobs)

        # Continuing after running calculation
        job_n = self.run_log[-1]['job_n']
        job = self.jobs[job_n]

        # Corrections recorded for this job mean it was rerun after a fix.
        if len(self.run_log[-1]['corrections']) > 0:
            logger.info(
                "Reran {}.run due to fixable errors".format(job.name))

        # check error handlers
        logger.info("Checking error handlers for {}.run".format(job.name))
        if self._do_check(self.handlers):
            logger.info("Failed validation based on error handlers")
            # raise an error for an unrecoverable error
            for correction in self.run_log[-1]["corrections"]:
                if correction["actions"]:
                    continue
                if not correction["handler"].raises_runtime_error:
                    continue
                self.run_log[-1]["handler"] = correction["handler"]
                message = "Unrecoverable error for handler: {}. " \
                          "Raising RuntimeError".format(
                              correction["handler"])
                raise NonRecoverableError(
                    message, True, correction["handler"])
            logger.info("Corrected input based on error handlers")
            # Return with more jobs to run if recoverable error caught
            # and corrected for
            return len(self.jobs) - job_n

        # check validators
        logger.info("Checking validator for {}.run".format(job.name))
        for validator in self.validators:
            if not validator.check():
                continue
            self.run_log[-1]["validator"] = validator
            logger.info("Failed validation based on validator")
            message = "Validation failed: {}".format(validator)
            raise ValidationError(message, True, validator)

        logger.info("Postprocessing for {}.run".format(job.name))
        job.postprocess()

        # The job just processed was the last one: mark the run finished.
        if len(self.jobs) == (job_n + 1):
            self.finished = True
            return 0

        # Setup next job_n
        job_n += 1
        job = self.jobs[job_n]
        next_entry = {
            "job": job.as_dict(),
            "corrections": [],
            'job_n': job_n,
        }
        self.run_log.append(next_entry)
        job.setup()
        return len(self.jobs) - job_n

    except CustodianError as ex:
        logger.error(ex.message)
        if ex.raises:
            raise

    finally:
        # Log the corrections to a json file.
        logger.info("Logging to {}...".format(Custodian.LOG_FILE))
        dumpfn(self.run_log, Custodian.LOG_FILE, cls=MontyEncoder,
               indent=4)
        end = datetime.datetime.now()
        logger.info("Run ended at {}.".format(end))
        elapsed = end - start
        logger.info(
            "Run completed. Total time taken = {}.".format(elapsed))
        if self.finished and self.gzipped_output:
            gzip_dir(".")
def run_interrupted(self):
    """
    Runs custodian in an interrupted mode, which sets up and validates jobs
    but doesn't run the executable.

    On the first call (empty run log) the first job is set up. On later
    calls the most recent job is checked with the error handlers and
    validators, postprocessed, and the next job (if any) is set up.

    Returns:
        number of remaining jobs

    Raises:
        RuntimeError: when a CustodianError whose ``raises`` flag is set is
            caught; CustodianError is raised internally on unrecoverable
            errors and on jobs that fail validation.
    """
    # Taken before the try block so that the finally clause can always
    # compute the run time, even if an early statement inside the try
    # (e.g. os.getcwd()) raises.
    start = datetime.datetime.now()
    try:
        cwd = os.getcwd()
        v = sys.version.replace("\n", " ")
        logger.info("Custodian started in singleshot mode at {} in {}."
                    .format(start, cwd))
        logger.info("Custodian running on Python version {}".format(v))

        # load run log
        if os.path.exists(Custodian.LOG_FILE):
            self.run_log = loadfn(Custodian.LOG_FILE, cls=MontyDecoder)

        if len(self.run_log) == 0:
            # starting up an initial job - setup input and quit
            job_n = 0
            job = self.jobs[job_n]
            logger.info("Setting up job no. 1 ({}) ".format(job.name))
            job.setup()
            self.run_log.append({"job": job.as_dict(), "corrections": [],
                                 'job_n': job_n})
            return len(self.jobs)
        else:
            # Continuing after running calculation
            job_n = self.run_log[-1]['job_n']
            job = self.jobs[job_n]

            # Corrections recorded for this job mean it was rerun after a
            # fix.
            if len(self.run_log[-1]['corrections']) > 0:
                logger.info("Reran {}.run due to fixable errors".format(
                    job.name))

            # check error handlers
            logger.info("Checking error handlers for {}.run".format(
                job.name))
            if self._do_check(self.handlers):
                logger.info("Failed validation based on error handlers")
                # raise an error for an unrecoverable error
                for x in self.run_log[-1]["corrections"]:
                    if not x["actions"] and \
                            x["handler"].raises_runtime_error:
                        s = "Unrecoverable error for handler: {}. " \
                            "Raising RuntimeError".format(x["handler"])
                        raise CustodianError(s, True, x["handler"])
                logger.info("Corrected input based on error handlers")
                # Return with more jobs to run if recoverable error caught
                # and corrected for
                return len(self.jobs) - job_n

            # check validators
            logger.info("Checking validator for {}.run".format(job.name))
            for v in self.validators:
                if v.check():
                    logger.info("Failed validation based on validator")
                    s = "Validation failed: {}".format(v)
                    raise CustodianError(s, True, v)

            logger.info("Postprocessing for {}.run".format(job.name))
            job.postprocess()

            # IF DONE WITH ALL JOBS - DELETE ALL CHECKPOINTS AND RETURN
            # VALIDATED
            if len(self.jobs) == (job_n + 1):
                self.finished = True
                return 0

            # Setup next job_n
            job_n += 1
            job = self.jobs[job_n]
            self.run_log.append({"job": job.as_dict(), "corrections": [],
                                 'job_n': job_n})
            job.setup()
            return len(self.jobs) - job_n

    except CustodianError as ex:
        logger.error(ex.message)
        if ex.raises:
            raise RuntimeError("{} errors reached: {}. Exited..."
                               .format(self.total_errors, ex))

    finally:
        # Log the corrections to a json file.
        logger.info("Logging to {}...".format(Custodian.LOG_FILE))
        dumpfn(self.run_log, Custodian.LOG_FILE, cls=MontyEncoder,
               indent=4)
        end = datetime.datetime.now()
        logger.info("Run ended at {}.".format(end))
        run_time = end - start
        logger.info("Run completed. Total time taken = {}."
                    .format(run_time))
        if self.finished and self.gzipped_output:
            gzip_dir(".")
def postprocess(self):
    """
    Gzip the current directory when ``self.gzipped`` is set; otherwise do
    nothing.
    """
    if not self.gzipped:
        return
    gzip_dir(".")
def run_task(self, fw_spec):
    """
    Run the requested structural analyses on stored ionic steps and write
    the results as gzipped json files.

    Every file under ``data_dir`` whose name contains "ionic_steps" is
    loaded, and the 'structure' entry of each step is collected. The
    enabled analyses run on those structures, and each result is dumped to
    "<cwd>/structural_analysis/<key>.json" before the directory is passed
    to gzip_dir.

    Task parameters (read via ``self.get``):
        data_dir: directory that is walked for "ionic_steps" files.
        calc_rdf (default True): radial distribution functions and
            coordination numbers.
        calc_bad (default True): bond angle distribution functions.
        calc_voronoi (default False): voronoi analysis.
        calc_cage (default True): cage correlation function.
        calc_pmf (default False): potential of mean force (only logged;
            not implemented).
        calc_connectivity (default False): connectivity motifs (only
            logged; not implemented).

    Args:
        fw_spec: not used by this task.

    Returns:
        An empty FWAction.
    """
    data_dir = self.get('data_dir')
    calc_rdf = self.get('calc_rdf', True)
    calc_bad = self.get('calc_bad', True)
    calc_voronoi = self.get('calc_voronoi', False)
    calc_cage = self.get('calc_cage', True)
    calc_pmf = self.get('calc_pmf', False)
    calc_connectivity = self.get('calc_connectivity', False)

    ionic_steps = []
    for root, _dirs, files in os.walk(data_dir):
        for f in files:
            if 'ionic_steps' not in f:
                continue
            # os.walk yields bare file names, so they have to be joined
            # with the directory they were found in before being opened.
            path = os.path.join(root, f)
            ext = os.path.splitext(f)[1]
            if ext in ('.gz', '.GZ', '.Z'):
                with gzip.open(path, "rb") as gzipped:
                    d = json.loads(gzipped.read().decode("ascii"))
            else:
                d = loadfn(path)
            ionic_steps.extend(d)

    structures = [step['structure'] for step in ionic_steps]

    data_dict = {}
    if calc_rdf:
        logger.info("LOGGER: Calculating radial distribution functions...")
        rdf = RadialDistributionFunction(structures=structures)
        rdf.get_radial_distribution_functions(nproc=4)
        cns = rdf.get_coordination_numbers()
        fs = rdf.first_coordination_shell_radius
        data_dict.update({'radial_distribution_functions': rdf.as_dict()})
        data_dict.update({'coordination_numbers': cns})

        # The cage correlation function takes the first-shell radius `fs`
        # obtained from the RDF above, so it only runs when the RDF was
        # calculated.
        if calc_cage:
            logger.info("LOGGER: Calculating cage correlation function...")
            ccf = CageCorrelationFunction(structures, fs)
            ccf.get_cage_correlation_function()
            # TODO: Make sure the CCFs work

        if calc_pmf:
            logger.info(
                "LOGGER: Calculating the potential of mean force...")
            # TODO: Need to include the implementation of PMF here

    if calc_bad:
        logger.info(
            "LOGGER: Calculating bond angle distribution functions...")
        bad = BondAngleDistribution(structures=structures)
        bad.get_bond_angle_distribution(nproc=4)
        data_dict.update(
            {'bond_angle_distribution_functions': bad.as_dict()})

    if calc_voronoi:
        logger.info("LOGGER: Performing voronoi analysis...")
        va = VoronoiAnalyzer(structures)
        try:
            poly = va.analyze_structures()
            data_dict.update({'voronoi_polyhedra': poly})
        except MemoryError:
            # Best effort: the remaining results are still written out.
            logger.info(
                "ERROR: Voronoi analysis failed due to insufficient memory..."
            )

    if calc_connectivity:
        logger.info(
            "LOGGER: Getting the connectivity motif distribution functions..."
        )
        # TODO: Implement after writing connectivity function

    # write structural analysis results to json file and then zip it
    write_dir = os.path.join(os.getcwd(), 'structural_analysis')
    # Tolerate a directory left over from a previous run of this task.
    if not os.path.isdir(write_dir):
        os.mkdir(write_dir)
    for k, v in data_dict.items():
        # Only the file name is a template; formatting the whole joined
        # path would break on directories containing "{" or "}".
        dumpfn(v, os.path.join(write_dir, '{}.json'.format(k)))
    gzip_dir(write_dir)
    return FWAction()
def run(self):
    """
    Run every job from the restart index onwards inside a scratch directory.

    The run log is dumped to Custodian.LOG_FILE after each job and once
    more when the run ends, whether or not it succeeded.

    Returns:
        self.run_log. Per the original documentation: all errors
        encountered as a list of list.
        [[error_dicts for job 1], [error_dicts for job 2], ....]

    Raises:
        ValidationError: if a job fails validation
        ReturnCodeError: if the process has a return code different from 0
        NonRecoverableError: if an unrecoverable error occurs
        MaxCorrectionsPerJobError: if max_errors_per_job is reached
        MaxCorrectionsError: if max_errors is reached
        MaxCorrectionsPerHandlerError: if max_errors_per_handler is reached
    """

    def write_run_log():
        # Single place that serialises the run log to the log file.
        dumpfn(self.run_log, Custodian.LOG_FILE, cls=MontyEncoder, indent=4)

    origin_dir = os.getcwd()

    with ScratchDir(
        self.scratch_dir,
        create_symbolic_link=True,
        copy_to_current_on_exit=True,
        copy_from_current_on_enter=True,
    ) as temp_dir:
        self.total_errors = 0
        start = datetime.datetime.now()
        logger.info(f"Run started at {start} in {temp_dir}.")
        v = sys.version.replace("\n", " ")
        logger.info(f"Custodian running on Python version {v}")
        host, cluster = get_execution_host_info()
        logger.info(f"Hostname: {host}, Cluster: {cluster}")

        try:
            # Jobs are numbered from 1; those before the restart index are
            # skipped.
            numbered_jobs = enumerate(self.jobs, 1)
            for job_n, job in islice(numbered_jobs, self.restart, None):
                self._run_job(job_n, job)
                # We do a dump of the run log after each job.
                write_run_log()
                if not self.checkpoint:
                    continue
                # Checkpoint after each job so that we can recover from
                # last point and remove old checkpoints
                self.restart = job_n
                Custodian._save_checkpoint(origin_dir, job_n)
        except CustodianError as ex:
            logger.error(ex.message)
            if ex.raises:
                raise
        finally:
            # Log the corrections to a json file.
            logger.info(f"Logging to {Custodian.LOG_FILE}...")
            write_run_log()
            end = datetime.datetime.now()
            logger.info(f"Run ended at {end}.")
            run_time = end - start
            logger.info(f"Run completed. Total time taken = {run_time}.")
            if self.gzipped_output:
                gzip_dir(".")

        # Cleanup checkpoint files (if any) if run is successful.
        Custodian._delete_checkpoints(origin_dir)

    return self.run_log