Example #1
    def run(self):
        """
        Override of Custodian.run() to include instructions to copy the
        temp_dir to the scratch partition on slave compute nodes if requested.
        """
        cwd = os.getcwd()

        with ScratchDir(self.scratch_dir, create_symbolic_link=True,
                        copy_to_current_on_exit=True,
                        copy_from_current_on_enter=True) as temp_dir:
            self._manage_node_scratch(temp_dir_path=temp_dir,
                                      job_start=True)
            self.total_errors = 0
            start = datetime.datetime.now()
            logger.info("Run started at {} in {}.".format(
                start, temp_dir))
            v = sys.version.replace("\n", " ")
            logger.info("Custodian running on Python version {}".format(v))

            try:
                # skip jobs until the restart
                for job_n, job in islice(enumerate(self.jobs, 1),
                                         self.restart, None):
                    self._run_job(job_n, job, temp_dir)
                    # Checkpoint after each job so that we can recover from
                    # last point and remove old checkpoints
                    if self.checkpoint:
                        super(SSHCustodian, self)._save_checkpoint(cwd, job_n)
            except CustodianError as ex:
                logger.error(ex.message)
                if ex.raises:
                    raise RuntimeError("{} errors reached: {}. Exited..."
                                       .format(self.total_errors, ex))
            finally:
                # Log the corrections to a json file.
                logger.info("Logging to {}...".format(super(SSHCustodian,
                                                            self).LOG_FILE))
                dumpfn(self.run_log, super(SSHCustodian, self).LOG_FILE,
                       cls=MontyEncoder, indent=4)
                end = datetime.datetime.now()
                logger.info("Run ended at {}.".format(end))
                run_time = end - start
                logger.info("Run completed. Total time taken = {}."
                            .format(run_time))
                # Remove duplicate copy of log file, provided it ends with
                # ".log"
                for x in ([x for x in os.listdir(temp_dir)
                           if re.match(r'\w*\.log', x)]):
                    os.remove(os.path.join(temp_dir, x))
                self._manage_node_scratch(temp_dir_path=temp_dir,
                                          job_start=False)
                if self.gzipped_output:
                    gzip_dir(".")

            # Cleanup checkpoint files (if any) if run is successful.
            super(SSHCustodian, self)._delete_checkpoints(cwd)

        return self.run_log
Example #2
    def run(self):
        """
        Runs all the jobs.

        Returns:
            All errors encountered as a list of lists.
            [[error_dicts for job 1], [error_dicts for job 2], ....]
        """
        cwd = os.getcwd()

        with ScratchDir(self.scratch_dir,
                        create_symbolic_link=True,
                        copy_to_current_on_exit=True,
                        copy_from_current_on_enter=True) as temp_dir:
            self.total_errors = 0
            start = datetime.datetime.now()
            logger.info("Run started at {} in {}.".format(start, temp_dir))
            v = sys.version.replace("\n", " ")
            logger.info("Custodian running on Python version {}".format(v))
            logger.info(
                "Hostname: {}, Cluster: {}".format(*get_execution_host_info()))

            try:
                # skip jobs until the restart
                for job_n, job in islice(enumerate(self.jobs, 1), self.restart,
                                         None):
                    self._run_job(job_n, job)
                    # Checkpoint after each job so that we can recover from last
                    # point and remove old checkpoints
                    if self.checkpoint:
                        self.restart = job_n
                        Custodian._save_checkpoint(cwd, job_n)
            except CustodianError as ex:
                logger.error(ex.message)
                if ex.raises:
                    raise RuntimeError(
                        "{} errors reached: {}. Exited...".format(
                            self.total_errors, ex))
            finally:
                # Log the corrections to a json file.
                logger.info("Logging to {}...".format(Custodian.LOG_FILE))
                dumpfn(self.run_log,
                       Custodian.LOG_FILE,
                       cls=MontyEncoder,
                       indent=4)
                end = datetime.datetime.now()
                logger.info("Run ended at {}.".format(end))
                run_time = end - start
                logger.info(
                    "Run completed. Total time taken = {}.".format(run_time))
                if self.gzipped_output:
                    gzip_dir(".")

            # Cleanup checkpoint files (if any) if run is successful.
            Custodian._delete_checkpoints(cwd)

        return self.run_log
Example #3
    def run(self):
        """
        Runs all the jobs.

        Returns:
            All errors encountered as a list of lists.
            [[error_dicts for job 1], [error_dicts for job 2], ....]
        """
        cwd = os.getcwd()

        with ScratchDir(self.scratch_dir, create_symbolic_link=True,
                        copy_to_current_on_exit=True,
                        copy_from_current_on_enter=True) as temp_dir:
            self.total_errors = 0
            start = datetime.datetime.now()
            logger.info("Run started at {} in {}.".format(
                start, temp_dir))
            v = sys.version.replace("\n", " ")
            logger.info("Custodian running on Python version {}".format(v))
            logger.info("Hostname: {}, Cluster: {}".format(
                *get_execution_host_info()))

            try:
                # skip jobs until the restart
                for job_n, job in islice(enumerate(self.jobs, 1),
                                         self.restart, None):
                    self._run_job(job_n, job)
                    # Checkpoint after each job so that we can recover from last
                    # point and remove old checkpoints
                    if self.checkpoint:
                        self.restart = job_n
                        Custodian._save_checkpoint(cwd, job_n)
            except CustodianError as ex:
                logger.error(ex.message)
                if ex.raises:
                    raise RuntimeError("{} errors reached: {}. Exited..."
                                       .format(self.total_errors, ex))
            finally:
                # Log the corrections to a json file.
                logger.info("Logging to {}...".format(Custodian.LOG_FILE))
                dumpfn(self.run_log, Custodian.LOG_FILE, cls=MontyEncoder,
                       indent=4)
                end = datetime.datetime.now()
                logger.info("Run ended at {}.".format(end))
                run_time = end - start
                logger.info("Run completed. Total time taken = {}."
                            .format(run_time))
                if self.gzipped_output:
                    gzip_dir(".")

            # Cleanup checkpoint files (if any) if run is successful.
            Custodian._delete_checkpoints(cwd)

        return self.run_log
Example #4
    def test_gzip(self):
        full_f = os.path.join(test_dir, "gzip_dir", "tempfile")
        gzip_dir(os.path.join(test_dir, "gzip_dir"))

        self.assertTrue(os.path.exists("{}.gz".format(full_f)))
        self.assertFalse(os.path.exists(full_f))

        with GzipFile("{}.gz".format(full_f)) as g:
            self.assertEqual(g.readline().decode("utf-8"), "what")

        self.assertAlmostEqual(os.path.getmtime("{}.gz".format(full_f)), self.mtime, 4)
Example #5
    def test_gzip(self):
        full_f = os.path.join(test_dir, "gzip_dir", "tempfile")
        gzip_dir(os.path.join(test_dir, "gzip_dir"))

        self.assertTrue(os.path.exists("{}.gz".format(full_f)))
        self.assertFalse(os.path.exists(full_f))

        with GzipFile("{}.gz".format(full_f)) as g:
            self.assertEqual(g.readline().decode("utf-8"), "what")

        self.assertAlmostEqual(os.path.getmtime("{}.gz".format(full_f)),
                               self.mtime, 4)
Example #6
    def test_handle_sub_dirs(self):
        sub_dir = os.path.join(test_dir, "gzip_dir", "sub_dir")
        sub_file = os.path.join(sub_dir, "new_tempfile")
        os.mkdir(sub_dir)
        with open(sub_file, "w") as f:
            f.write("anotherwhat")

        gzip_dir(os.path.join(test_dir, "gzip_dir"))

        self.assertTrue(os.path.exists("{}.gz".format(sub_file)))
        self.assertFalse(os.path.exists(sub_file))

        with GzipFile("{}.gz".format(sub_file)) as g:
            self.assertEqual(g.readline().decode("utf-8"), "anotherwhat")
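
The three tests above exercise monty's gzip_dir(): every file under the target directory, including files in sub-directories, is replaced by a gzipped copy, and modification times are preserved. A minimal usage sketch, assuming only that monty is installed; the temporary directory and file contents are purely illustrative:

    import os
    import tempfile
    from gzip import GzipFile

    from monty.shutil import gzip_dir

    tmp = tempfile.mkdtemp()
    with open(os.path.join(tmp, "tempfile"), "w") as f:
        f.write("what")

    gzip_dir(tmp)  # tmp/tempfile is replaced by tmp/tempfile.gz

    with GzipFile(os.path.join(tmp, "tempfile.gz")) as g:
        print(g.read().decode("utf-8"))  # -> "what"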
Example #7
    def run_task(self, fw_spec=None):
        cwd = os.getcwd()
        gzip_dir(cwd)
Example #8
    def run_interrupted(self):
        """
        Runs custodian in an interrupted mode, which sets up and
        validates jobs but doesn't run the executable.

        Returns:
            number of remaining jobs

        Raises:
            ValidationError: if a job fails validation
            ReturnCodeError: if the process has a return code different from 0
            NonRecoverableError: if an unrecoverable error occurs
            MaxCorrectionsPerJobError: if max_errors_per_job is reached
            MaxCorrectionsError: if max_errors is reached
            MaxCorrectionsPerHandlerError: if max_errors_per_handler is reached
        """
        start = datetime.datetime.now()
        try:
            cwd = os.getcwd()
            v = sys.version.replace("\n", " ")
            logger.info(
                "Custodian started in singleshot mode at {} in {}.".format(
                    start, cwd))
            logger.info("Custodian running on Python version {}".format(v))

            # load run log
            if os.path.exists(Custodian.LOG_FILE):
                self.run_log = loadfn(Custodian.LOG_FILE, cls=MontyDecoder)

            if len(self.run_log) == 0:
                # starting up an initial job - setup input and quit
                job_n = 0
                job = self.jobs[job_n]
                logger.info("Setting up job no. 1 ({}) ".format(job.name))
                job.setup()
                self.run_log.append({
                    "job": job.as_dict(),
                    "corrections": [],
                    'job_n': job_n
                })
                return len(self.jobs)
            else:
                # Continuing after running calculation
                job_n = self.run_log[-1]['job_n']
                job = self.jobs[job_n]

                # If we had to fix errors from a previous run, insert clean log
                # dict
                if len(self.run_log[-1]['corrections']) > 0:
                    logger.info("Reran {}.run due to fixable errors".format(
                        job.name))

                # check error handlers
                logger.info("Checking error handlers for {}.run".format(
                    job.name))
                if self._do_check(self.handlers):
                    logger.info("Failed validation based on error handlers")
                    # raise an error for an unrecoverable error
                    for x in self.run_log[-1]["corrections"]:
                        if not x["actions"] and x[
                                "handler"].raises_runtime_error:
                            self.run_log[-1]["handler"] = x["handler"]
                            s = "Unrecoverable error for handler: {}. " \
                                "Raising RuntimeError".format(x["handler"])
                            raise NonRecoverableError(s, True, x["handler"])
                    logger.info("Corrected input based on error handlers")
                    # Return with more jobs to run if recoverable error caught
                    # and corrected for
                    return len(self.jobs) - job_n

                # check validators
                logger.info("Checking validator for {}.run".format(job.name))
                for v in self.validators:
                    if v.check():
                        self.run_log[-1]["validator"] = v
                        logger.info("Failed validation based on validator")
                        s = "Validation failed: {}".format(v)
                        raise ValidationError(s, True, v)

                logger.info("Postprocessing for {}.run".format(job.name))
                job.postprocess()

                # IF DONE WITH ALL JOBS - DELETE ALL CHECKPOINTS AND RETURN
                # VALIDATED
                if len(self.jobs) == (job_n + 1):
                    self.finished = True
                    return 0

                # Setup next job_n
                job_n += 1
                job = self.jobs[job_n]
                self.run_log.append({
                    "job": job.as_dict(),
                    "corrections": [],
                    'job_n': job_n
                })
                job.setup()
                return len(self.jobs) - job_n

        except CustodianError as ex:
            logger.error(ex.message)
            if ex.raises:
                raise

        finally:
            # Log the corrections to a json file.
            logger.info("Logging to {}...".format(Custodian.LOG_FILE))
            dumpfn(self.run_log,
                   Custodian.LOG_FILE,
                   cls=MontyEncoder,
                   indent=4)
            end = datetime.datetime.now()
            logger.info("Run ended at {}.".format(end))
            run_time = end - start
            logger.info(
                "Run completed. Total time taken = {}.".format(run_time))
            if self.finished and self.gzipped_output:
                gzip_dir(".")
Example #9
    def run_interrupted(self):
        """
        Runs custodian in an interrupted mode, which sets up and
        validates jobs but doesn't run the executable.

        Returns:
            number of remaining jobs

        Raises:
            CustodianError: on unrecoverable errors and jobs that fail
            validation.
        """

        try:
            cwd = os.getcwd()
            start = datetime.datetime.now()
            v = sys.version.replace("\n", " ")
            logger.info("Custodian started in singleshot mode at {} in {}."
                        .format(start, cwd))
            logger.info("Custodian running on Python version {}".format(v))

            # load run log
            if os.path.exists(Custodian.LOG_FILE):
                self.run_log = loadfn(Custodian.LOG_FILE, cls=MontyDecoder)

            if len(self.run_log) == 0:
                # starting up an initial job - setup input and quit
                job_n = 0
                job = self.jobs[job_n]
                logger.info("Setting up job no. 1 ({}) ".format(job.name))
                job.setup()
                self.run_log.append({"job": job.as_dict(), "corrections": [], 'job_n': job_n})
                return len(self.jobs)
            else:
                # Continuing after running calculation
                job_n = self.run_log[-1]['job_n']
                job = self.jobs[job_n]

                # If we had to fix errors from a previous run, insert clean log
                # dict
                if len(self.run_log[-1]['corrections']) > 0:
                    logger.info("Reran {}.run due to fixable errors".format(job.name))

                # check error handlers
                logger.info("Checking error handlers for {}.run".format(job.name))
                if self._do_check(self.handlers):
                    logger.info("Failed validation based on error handlers")
                    # raise an error for an unrecoverable error
                    for x in self.run_log[-1]["corrections"]:
                        if not x["actions"] and x["handler"].raises_runtime_error:
                            s = "Unrecoverable error for handler: {}. " \
                                "Raising RuntimeError".format(x["handler"])
                            raise CustodianError(s, True, x["handler"])
                    logger.info("Corrected input based on error handlers")
                    # Return with more jobs to run if recoverable error caught
                    # and corrected for
                    return len(self.jobs) - job_n

                # check validators
                logger.info("Checking validator for {}.run".format(job.name))
                for v in self.validators:
                    if v.check():
                        logger.info("Failed validation based on validator")
                        s = "Validation failed: {}".format(v)
                        raise CustodianError(s, True, v)

                logger.info("Postprocessing for {}.run".format(job.name))
                job.postprocess()

                # IF DONE WITH ALL JOBS - DELETE ALL CHECKPOINTS AND RETURN
                # VALIDATED
                if len(self.jobs) == (job_n + 1):
                    self.finished = True
                    return 0

                # Setup next job_n
                job_n += 1
                job = self.jobs[job_n]
                self.run_log.append({"job": job.as_dict(), "corrections": [],
                                     'job_n': job_n})
                job.setup()
                return len(self.jobs) - job_n

        except CustodianError as ex:
            logger.error(ex.message)
            if ex.raises:
                raise RuntimeError("{} errors reached: {}. Exited..."
                                   .format(self.total_errors, ex))

        finally:
            # Log the corrections to a json file.
            logger.info("Logging to {}...".format(Custodian.LOG_FILE))
            dumpfn(self.run_log, Custodian.LOG_FILE, cls=MontyEncoder,
                   indent=4)
            end = datetime.datetime.now()
            logger.info("Run ended at {}.".format(end))
            run_time = end - start
            logger.info("Run completed. Total time taken = {}."
                        .format(run_time))
            if self.finished and self.gzipped_output:
                gzip_dir(".")
Example #10
    def postprocess(self):
        """
        Renaming or gzipping as needed.
        """
        if self.gzipped:
            gzip_dir(".")
Example #11
    def run_task(self, fw_spec):

        # Get the task parameters
        data_dir = self.get('data_dir')
        calc_rdf = self.get('calc_rdf', True)
        calc_bad = self.get('calc_bad', True)
        calc_voronoi = self.get('calc_voronoi', False)
        calc_cage = self.get('calc_cage', True)
        calc_pmf = self.get('calc_pmf', False)
        calc_connectivity = self.get('calc_connectivity', False)

        ionic_steps = []
        for root, dirs, files in os.walk(data_dir):
            for f in files:
                if 'ionic_steps' in f:
                    # os.walk yields bare file names; join with the directory
                    # root to get a usable path.
                    fpath = os.path.join(root, f)
                    name, ext = os.path.splitext(f)
                    if ext in ('.gz', '.GZ', '.Z'):
                        with gzip.open(fpath, "rb") as gzipped:
                            d = json.loads(gzipped.read().decode("ascii"))
                    else:
                        d = loadfn(fpath)
                    ionic_steps.extend(d)

        structures = [step['structure'] for step in ionic_steps]

        data_dict = {}

        if calc_rdf:
            logger.info("LOGGER: Calculating radial distribution functions...")
            rdf = RadialDistributionFunction(structures=structures)
            rdf.get_radial_distribution_functions(nproc=4)
            cns = rdf.get_coordination_numbers()
            fs = rdf.first_coordination_shell_radius
            data_dict.update({'radial_distribution_functions': rdf.as_dict()})
            data_dict.update({'coordination_numbers': cns})

            if calc_cage:
                logger.info("LOGGER: Calculating cage correlation function...")
                ccf = CageCorrelationFunction(structures, fs)
                ccf.get_cage_correlation_function()
                # TODO: Make sure the CCFs work

            if calc_pmf:
                logger.info(
                    "LOGGER: Calculating the potential of mean force...")
                # TODO: Need to include the implementation of PMF here

        if calc_bad:
            logger.info(
                "LOGGER: Calculating bond angle distribution functions...")
            bad = BondAngleDistribution(structures=structures)
            bad.get_bond_angle_distribution(nproc=4)
            data_dict.update(
                {'bond_angle_distribution_functions': bad.as_dict()})

        if calc_voronoi:
            logger.info("LOGGER: Performing voronoi analysis...")
            va = VoronoiAnalyzer(structures)
            try:
                poly = va.analyze_structures()
                data_dict.update({'voronoi_polyhedra': poly})
            except MemoryError:
                logger.info(
                    "ERROR: Voronoi analysis failed due to insufficient memory..."
                )

        if calc_connectivity:
            logger.info(
                "LOGGER: Getting the connectivity motif distribution functions..."
            )
            # TODO: Implement after writing connectivity function

        # write structural analysis results to json file and then zip it
        write_dir = os.path.join(os.getcwd(), 'structural_analysis')
        os.mkdir(write_dir)
        for k, v in data_dict.items():
            dumpfn(v, os.path.join(write_dir, '{}.json'.format(k)))
        gzip_dir(write_dir)

        return FWAction()
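
Illustrative only: each entry of data_dict above is dumped to structural_analysis/<key>.json and the directory is then passed to gzip_dir(), so the files end up as <key>.json.gz. A hedged sketch of reading one of them back; the key name used here ('coordination_numbers') is just one of the keys populated by this task:

    import gzip
    import json
    import os

    path = os.path.join("structural_analysis", "coordination_numbers.json.gz")
    with gzip.open(path, "rt") as f:
        coordination_numbers = json.load(f)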
Example #12
    def run(self):
        """
        Runs all jobs.

        Returns:
            All errors encountered as a list of lists.
            [[error_dicts for job 1], [error_dicts for job 2], ....]

        Raises:
            ValidationError: if a job fails validation
            ReturnCodeError: if the process has a return code different from 0
            NonRecoverableError: if an unrecoverable error occurs
            MaxCorrectionsPerJobError: if max_errors_per_job is reached
            MaxCorrectionsError: if max_errors is reached
            MaxCorrectionsPerHandlerError: if max_errors_per_handler is reached
        """
        cwd = os.getcwd()

        with ScratchDir(
                self.scratch_dir,
                create_symbolic_link=True,
                copy_to_current_on_exit=True,
                copy_from_current_on_enter=True,
        ) as temp_dir:
            self.total_errors = 0
            start = datetime.datetime.now()
            logger.info(f"Run started at {start} in {temp_dir}.")
            v = sys.version.replace("\n", " ")
            logger.info(f"Custodian running on Python version {v}")
            host, cluster = get_execution_host_info()
            logger.info(f"Hostname: {host}, Cluster: {cluster}")

            try:
                # skip jobs until the restart
                for job_n, job in islice(enumerate(self.jobs, 1), self.restart,
                                         None):
                    self._run_job(job_n, job)
                    # We do a dump of the run log after each job.
                    dumpfn(self.run_log,
                           Custodian.LOG_FILE,
                           cls=MontyEncoder,
                           indent=4)
                    # Checkpoint after each job so that we can recover from last
                    # point and remove old checkpoints
                    if self.checkpoint:
                        self.restart = job_n
                        Custodian._save_checkpoint(cwd, job_n)
            except CustodianError as ex:
                logger.error(ex.message)
                if ex.raises:
                    raise
            finally:
                # Log the corrections to a json file.
                logger.info(f"Logging to {Custodian.LOG_FILE}...")
                dumpfn(self.run_log,
                       Custodian.LOG_FILE,
                       cls=MontyEncoder,
                       indent=4)
                end = datetime.datetime.now()
                logger.info(f"Run ended at {end}.")
                run_time = end - start
                logger.info(f"Run completed. Total time taken = {run_time}.")
                if self.gzipped_output:
                    gzip_dir(".")

            # Cleanup checkpoint files (if any) if run is successful.
            Custodian._delete_checkpoints(cwd)

        return self.run_log
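
For context, a minimal sketch of driving a run() like the one above with gzipped_output enabled. EchoJob is a hypothetical Job subclass invented here purely for illustration; real workflows would supply the jobs, handlers and validators appropriate to their code:

    import subprocess

    from custodian.custodian import Custodian, Job


    class EchoJob(Job):
        """Trivial illustrative job: runs a shell echo and nothing else."""

        def setup(self):
            pass

        def run(self):
            # Custodian polls the returned Popen until the process exits.
            return subprocess.Popen("echo hello", shell=True)

        def postprocess(self):
            pass


    c = Custodian(handlers=[], jobs=[EchoJob()], gzipped_output=True)
    run_log = c.run()  # gzip_dir(".") runs in the finally block once the job is done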