Example #1
    def test_extract_id(self):
        """Check extract_id on bare names, full paths and prefixed names."""
        expectations = (
            # (input, expected id)
            ("/home/moltherm/", ""),  # directory only, no file name
            ("1234.mol", "1234"),  # file name without a path
            ("44511", "44511"),  # no ".mol" extension
            ("/data/moltherm/112358.mol", "112358"),
            ("/global/homes/m/moltherm/1_90210.mol", "90210"),  # underscores
        )

        for raw_name, expected_id in expectations:
            self.assertEqual(extract_id(raw_name), expected_id)
Example #2
    def map_reactants_to_reactions(self):
        """
        Construct a dict showing which directories share each reactant.

        This is useful for analysis of common reactants, and to identify the
        "source" of a given reactant (in which directory the calculation
        actually took place).

        Only subdirectories of ``self.base_dir`` whose name does not start
        with "block" are scanned, and only ``.mol`` files whose name starts
        with ``self.reactant_pre`` are considered.

        :return: dict {mol_id: [directory, ...]}, where each list holds every
            directory that contains a reactant file with that id.
        """

        mapping = {}
        dirs = [
            d for d in listdir(self.base_dir)
            if isdir(join(self.base_dir, d)) and not d.startswith("block")
        ]

        for d in dirs:
            molfiles = [
                f for f in listdir(join(self.base_dir, d))
                if f.endswith(".mol") and f.startswith(self.reactant_pre)
            ]
            for file in molfiles:
                # Create the list on first sight of an id, extend it afterwards.
                mapping.setdefault(extract_id(file), []).append(d)

        return mapping
Example #3
    def get_completed_molecules(self, dirs=None, extra=False):
        """
        Returns the set of molecules that have a completed sp output.

        A molecule is reported when at least one of its output files whose
        name contains "sp" has a truthy "completion" entry in its parsed data.
        Subdirectories whose name starts with "block" are never searched.

        :param dirs: List of directories to search for completed molecules.
            Default None, meaning that all subdirectories are searched.
        :param extra: If True, include directory of completed reaction and name
            of molfile along with mol_id, i.e. each entry is a tuple
            (mol_id, directory, molfile) instead of a bare mol_id.
        :return: set of completed molecules
        """

        completed = set()

        all_dirs = [
            d for d in listdir(self.base_dir)
            if isdir(join(self.base_dir, d)) and not d.startswith("block")
        ]

        if dirs is not None:
            all_dirs = [d for d in all_dirs if d in dirs]

        for d in all_dirs:
            path = join(self.base_dir, d)
            mapping = associate_qchem_to_mol(self.base_dir, d)

            for molfile, qcfiles in mapping.items():
                mol_id = extract_id(molfile)

                for outfile in qcfiles["out"]:
                    if "sp" not in outfile:
                        continue

                    spfile = QCOutput(join(path, outfile))

                    # Currently will catch iefpcm or smd
                    if spfile.data.get("completion", False):
                        if extra:
                            completed.add((mol_id, d, molfile))
                        else:
                            completed.add(mol_id)

        return completed
Example #4
    def get_completed_reactions(self):
        """
        Returns the directories (reactions) where all molecules are completed.

        A molecule counts as completed when its id appears as "mol_id" of an
        entry in the "molecules" collection of the database. Subdirectories
        whose name starts with "block" are ignored.

        :return: set of directories with complete information.
        :raises RuntimeError: if ``self.db`` is None.
        """

        if self.db is None:
            raise RuntimeError("Could not connect to database. Check db_file "
                               "and try again later.")

        collection = self.db.db["molecules"]

        # A set keeps the per-molecule membership test below constant-time.
        completed_molecules = {x["mol_id"] for x in collection.find()}

        completed_reactions = set()

        dirs = [
            d for d in listdir(self.base_dir)
            if isdir(join(self.base_dir, d)) and not d.startswith("block")
        ]

        for d in dirs:
            path = join(self.base_dir, d)

            mols = [
                extract_id(f) for f in listdir(path)
                if isfile(join(path, f)) and f.endswith(".mol")
            ]

            # all() over an empty sequence is True, so a directory without any
            # .mol file is reported as completed as well.
            if all(m in completed_molecules for m in mols):
                completed_reactions.add(d)

        return completed_reactions
Example #5
    def get_unfinished_jobs(self,
        # Purpose (from the visible body): walk the subdirectories of
        # self.base_dir, find molecules whose "freq" output reports completion
        # while no "sp" output does, and return a Workflow built from `fws`.
        #
        # NOTE(review): this listing is incomplete. The parameter list stops
        # after `self,`; the descriptive text below has lost its docstring
        # delimiters; several closing brackets and statement bodies are
        # missing. The body uses `name_pre` and `dirs`, and the text also
        # documents `sp_params` and `max_cores`, but their order and defaults
        # cannot be recovered from here -- restore from version control.
        Look for jobs where optimization and frequency calculations have
        successfully completed, but single-point has not. Then, for these cases,
        construct a workflow which will only run the sp job.

        :param sp_params: dict containing input parameters for single-point job
        :param name_pre: str representing prefix for all jobs.
        :param dirs: list of subdirectories to check for unfinished jobs.
            Default None, meaning that all subdirectories will be checked.
        :param max_cores: max_cores (int): Maximum number of cores to
            parallelize over. Defaults to 24.

        # NOTE(review): the message below names get_reaction_set_workflow()
        # although it is raised from get_unfinished_jobs, and the adjacent
        # string literals are joined without separating spaces.
        if not self.subdirs:
            raise RuntimeError("Cannot run get_reaction_set_workflow();"
                               "Need reactions components to be isolated in"
                               "different subdirectories.")

        fws = []

        # NOTE(review): the closing `]` of this list is missing in the listing.
        all_dirs = [
            d for d in listdir(self.base_dir) if isdir(join(self.base_dir, d))

        # NOTE(review): nothing in the visible code ever adds to this list.
        molecules_cleared = []

        appropriate_dirs = all_dirs

        # Restrict the search to the caller-supplied directories, if any.
        if dirs is not None:
            appropriate_dirs = [d for d in appropriate_dirs if d in dirs]

        for d in appropriate_dirs:
            path = join(self.base_dir, d)
            file_map = associate_qchem_to_mol(self.base_dir, d)

            for key, values in file_map.items():
                mol_id = extract_id(key)

                # NOTE(review): this `if` has no body here; presumably a
                # `continue` was lost -- TODO confirm.
                if mol_id in molecules_cleared:

                freq_complete = False
                sp_complete = False

                in_files = values["in"]
                out_files = values["out"]

                # Check if this molecule has finished freq, sp
                # If there is no sp output file, or if the sp output file did
                # not complete, then we may proceed
                for out_file in out_files:
                    if "freq" in out_file:
                        freq_out = QCOutput(join(path, out_file))

                        if freq_out.data.get("completion", []):
                            freq_complete = True
                    elif "sp" in out_file:
                        sp_out = QCOutput(join(path, out_file))

                        if sp_out.data.get("completion", []):
                            sp_complete = True

                if freq_complete and not sp_complete:
                    # Check if there is already an sp input file
                    freq_in_file = None

                    # Keeps the last input file whose name contains "freq".
                    for in_file in in_files:
                        if "freq" in in_file:
                            freq_in_file = in_file

                    # NOTE(review): as listed, everything below runs only when
                    # freq_in_file is None, which would make the
                    # join(path, freq_in_file) call fail. Presumably the body
                    # of this branch and a following `else:` were lost --
                    # TODO confirm against the original.
                    if freq_in_file is None:
                        # We could parse output files to get previous input
                        # information, but we should try to keep all input
                        # files in the same directory
                        infile = join(path, key.replace(".mol", "") + ".in")
                        outfile = join(path, key.replace(".mol", "") + ".out")
                        qclogfile = join(path,
                                         key.replace(".mol", "") + ".qclog")

                        freq_in_file = QCInput.from_file(
                            join(path, freq_in_file))
                        mol = freq_in_file.molecule

                        # NOTE(review): this call is cut off after qchem_cmd;
                        # infile, outfile and qclogfile are computed above but
                        # not passed in the visible text, and `fw` is never
                        # appended to `fws`.
                        fw = SinglePointFW(molecule=mol,
                                           name="{}: {}/{}".format(
                                               name_pre, d, mol_id),
                                           qchem_cmd="qchem -slurm",


        return Workflow(fws)
Example #6
    def get_reaction_set_workflow(self,
        # Purpose (from the visible body): for each directory accepted by
        # self.check_appropriate_dirs, create one OptFreqSPFW per reactant and
        # per product .mol file, and return a Workflow built from `fws`.
        #
        # NOTE(review): this listing is incomplete and does not parse as shown:
        #   * the parameter list stops after `self,`; the docstring documents
        #     name_pre, max_cores, qchem_input_params and sp_params, but their
        #     order and defaults cannot be recovered from here;
        #   * the docstring opened below is never closed;
        #   * the list literals assigned to `dirs`, `molecules_registered` and
        #     `files` have no closing bracket;
        #   * `molecules_registered = []` directly follows the list built from
        #     the database and would discard it; presumably an `else:` was
        #     lost -- TODO confirm;
        #   * both `if mol_id in molecules_registered:` tests have no body
        #     (presumably `continue`);
        #   * both OptFreqSPFW(...) calls are cut off: only a ".qclog" path
        #     fragment remains, `infile`/`outfile` are computed but not passed
        #     in the visible text, and `fw` is never appended to `fws`.
        # NOTE(review): `rct.rstrip(".mol")` strips any trailing '.', 'm', 'o'
        # or 'l' characters rather than the ".mol" suffix; other methods of
        # this class obtain the id through extract_id() instead.
        # NOTE(review): the adjacent RuntimeError string literals are joined
        # without separating spaces.
        """Generates a Fireworks Workflow to find the structures and energies of
        the reactants and products of a single reaction.

        Note: as written now, this function will only work if self.subdirs is
        True; that is, only if each reaction is in a separate subdirectory.
        Later additions could allow for some other means of specifying the
        separate reactions within a single directory.

        :param name_pre: str indicating the prefix which should be used for all
        Firework names
        :param max_cores: int specifying number of processes/threads that can
        be used for this workflow.
        :param qchem_input_params: dict
        :param sp_params: For OptFreqSPFW, single-point calculations can be
        treated differently from Opt and Freq. In this case, another dict
        for sp must be used.

        :return: Workflow

        if not self.subdirs:
            raise RuntimeError("Cannot run get_reaction_set_workflow();"
                               "Need reactions components to be isolated in"
                               "different subdirectories.")

        fws = []

        dirs = [
            d for d in listdir(self.base_dir) if isdir(join(self.base_dir, d))

        # Only set up a workflow if it is worthwhile (the reaction actually
        # proceeds as written, and all atoms add up)
        appropriate_dirs = self.check_appropriate_dirs(dirs)

        if self.db is not None:
            all_fws = self.db.collection.find()

            # Keep track of which molecules have already been run as jobs before
            molecules_registered = [
                extract_id(fw["task_label"]) for fw in all_fws
            molecules_registered = []

        for d in appropriate_dirs:
            path = join(self.base_dir, d)
            files = [
                f for f in listdir(path)
                if isfile(join(path, f)) and f.endswith(".mol")
            rcts = [f for f in files if f.startswith(self.reactant_pre)]
            pros = [f for f in files if f.startswith(self.product_pre)]

            for i, rct in enumerate(rcts):
                mol_id = rct.rstrip(".mol").split("_")[-1]

                if mol_id in molecules_registered:

                mol = get_molecule(join(self.base_dir, d, rct))

                infile = join(path, self.reactant_pre + str(i) + ".in")
                outfile = join(path, self.reactant_pre + str(i) + ".out")

                fw = OptFreqSPFW(molecule=mol,
                                 name="{}: {}/{}".format(name_pre, d, rct),
                                 qchem_cmd="qchem -slurm",
                                     self.reactant_pre + str(i) + ".qclog"),


            for i, pro in enumerate(pros):
                mol_id = pro.rstrip(".mol").split("_")[-1]

                if mol_id in molecules_registered:

                mol = get_molecule(join(self.base_dir, d, pro))

                infile = join(path, self.product_pre + str(i) + ".in")
                outfile = join(path, self.product_pre + str(i) + ".out")

                fw = OptFreqSPFW(molecule=mol,
                                 name="{}: {}/{}".format(name_pre, d, pro),
                                 qchem_cmd="qchem -slurm",
                                     self.product_pre + str(i) + ".qclog"),


        return Workflow(fws)
Example #7
    def get_reaction_data(self, directory=None, mol_ids=None):
        """
        Compile all useful data for a set of molecules associated with a
        particular reaction. This data will be compiled on a reaction basis
        (difference between reactants and products) as well as an individual
        molecule basis.

        The component with the largest "molecule" entry (by ``len``) is taken
        to be the product; all other components are treated as reactants.

        :param directory: Subdirectory where molecule data is located. When
            given, the molecule ids are read from the .mol files in that
            directory (any ``mol_ids`` argument is replaced) and the reaction
            thermo is taken from ``extract_reaction_thermo_db``.
        :param mol_ids: List of unique IDs for molecules associated with the
            reaction. Only used when ``directory`` is None; the reaction thermo
            is then computed from the per-molecule data.
        :return: dict of relevant reaction data, with keys "thermo",
            "dir_name", "mol_ids", "product" and "reactants".
        :raises ValueError: if neither ``directory`` nor ``mol_ids`` is given.
        """

        reaction_data = {"thermo": None}

        if directory is not None:
            mol_ids = [
                extract_id(f) for f in listdir(join(self.base_dir, directory))
                if f.endswith(".mol")
            ]

            component_data = [self.get_molecule_data(m) for m in mol_ids]

            reaction_data["thermo"] = self.extract_reaction_thermo_db(
                directory)["thermo"]

        elif mol_ids is not None:
            component_data = [self.get_molecule_data(m) for m in mol_ids]

        else:
            raise ValueError(
                "get_reaction_data requires either a directory or "
                "a set of molecule ids.")

        # Largest molecule last: it is treated as the (single) product.
        component_data = sorted(component_data,
                                key=lambda x: len(x["molecule"]))

        reaction_data["dir_name"] = directory
        reaction_data["mol_ids"] = mol_ids
        reaction_data["product"] = component_data[-1]
        reaction_data["reactants"] = component_data[:-1]

        if reaction_data["thermo"] is None:
            # No database-level thermo available; derive it from the
            # individual components instead.
            thermo = {}
            product = reaction_data["product"]

            pro_h = product["enthalpy"] + product["energy"]
            rct_h = sum(r["enthalpy"] + r["energy"]
                        for r in reaction_data["reactants"])
            thermo["enthalpy"] = pro_h - rct_h

            pro_s = product["entropy"]
            rct_s = sum(r["entropy"] for r in reaction_data["reactants"])
            thermo["entropy"] = pro_s - rct_s

            try:
                thermo["t_star"] = thermo["enthalpy"] / thermo["entropy"]
            except ZeroDivisionError:
                thermo["t_star"] = 0

            reaction_data["thermo"] = thermo

        return reaction_data
Example #8
    def extract_reaction_thermo_files(self, path):
        """
        Naively scrape thermo data from QChem output files.

        For every reactant and product .mol file in the directory, the
        matching output files are parsed: enthalpy and entropy are read from
        outputs whose name contains "freq", and the energy from an "sp" output
        when it yields a non-zero value, otherwise from an "opt" output.

        :param path: Path to a subdirectory (relative to ``self.base_dir``).

        :return: dict with keys "thermo" ({prop: value}, where properties are
            enthalpy, entropy, t_critical and has_sp), "directory",
            "reactant_ids" and "product_ids". "t_critical" is None when the
            reaction entropy is zero.
        """

        base_path = join(self.base_dir, path)
        # List the directory once instead of once per molecule.
        dir_files = listdir(base_path)

        rct_ids = [
            extract_id(f) for f in dir_files
            if f.endswith(".mol") and f.startswith(self.reactant_pre)
        ]

        pro_ids = [
            extract_id(f) for f in dir_files
            if f.endswith(".mol") and f.startswith(self.product_pre)
        ]

        rct_map = {
            m: [
                f for f in dir_files
                if f.startswith(self.reactant_pre) and m in f and ".out" in f
                and not f.endswith("_copy")
            ]
            for m in rct_ids
        }
        # NOTE(review): unlike the reactant map, "_copy" files are not
        # excluded here; kept as found -- confirm whether that is intended.
        pro_map = {
            m: [
                f for f in dir_files
                if f.startswith(self.product_pre) and m in f and ".out" in f
            ]
            for m in pro_ids
        }

        def total_thermo(prefix, out_map):
            """Sum enthalpy, entropy and energy over the molecules in out_map."""
            totals = {"enthalpy": 0, "entropy": 0, "energy": 0, "has_sp": {}}

            for mol, outs in out_map.items():
                enthalpy = 0
                entropy = 0
                energy_opt = 0
                energy_sp = 0

                for out in outs:
                    qcout = QCOutput(join(base_path, out))

                    # Catch potential for Nonetype entries
                    if "freq" in out:
                        enthalpy = qcout.data.get("enthalpy", 0) or 0
                        entropy = qcout.data.get("entropy", 0) or 0
                    elif "opt" in out:
                        energy_opt = qcout.data.get("final_energy", 0) or 0
                    elif "sp" in out:
                        energy_sp = qcout.data.get("final_energy_sp", 0) or 0

                # Enthalpy calculation should actually be enthalpy - energy_sp
                # But currently, not all calculations have sp
                # Compare the value itself: truncating with int() would treat
                # any energy of magnitude below 1 as a missing sp result.
                has_sp = energy_sp != 0
                totals["energy"] += energy_sp if has_sp else energy_opt
                totals["has_sp"][prefix + str(mol)] = has_sp

                totals["enthalpy"] += enthalpy
                totals["entropy"] += entropy
                print(path, mol, enthalpy, energy_sp)

            return totals

        rct_thermo = total_thermo(self.reactant_pre, rct_map)
        pro_thermo = total_thermo(self.product_pre, pro_map)

        thermo_data = {}

        # Generate totals as ∆H = H_pro - H_rct, ∆S = S_pro - S_rct
        # Also ensures that units are appropriate (Joules/mol,
        # rather than cal/mol or kcal/mol, or hartree for energy)
        energy = (pro_thermo["energy"] - rct_thermo["energy"]) * 627.509
        enthalpy = (pro_thermo["enthalpy"] - rct_thermo["enthalpy"])
        print(path, energy, enthalpy)
        thermo_data["enthalpy"] = (energy + enthalpy) * 1000 * 4.184
        thermo_data["entropy"] = (pro_thermo["entropy"] -
                                  rct_thermo["entropy"]) * 4.184
        try:
            thermo_data["t_critical"] = (thermo_data["enthalpy"] /
                                         thermo_data["entropy"])
        except ZeroDivisionError:
            thermo_data["t_critical"] = None
        # Combine dicts from pro_thermo and rct_thermo
        has_sp = dict(pro_thermo["has_sp"])
        has_sp.update(rct_thermo["has_sp"])
        thermo_data["has_sp"] = has_sp

        result = {
            "thermo": thermo_data,
            "directory": path,
            "reactant_ids": rct_ids,
            "product_ids": pro_ids
        }

        return result
Example #9
    def copy_outputs_across_directories(self):
        """
        Copy output files between subdirectories to ensure that all reaction
        directories that need outputs of a given molecule will have them.

        For every .mol file without a matching output in its own directory,
        the other directories are searched in turn for a molecule with the
        same id; matching output files are copied over with a "_copy" suffix,
        and the search for that molecule stops at the first directory that
        supplied any file.

        Note: This function should not be used unless necessary. It was written
        because for each directory, only a single database entry was being made
        (because db entries were being overwritten by default).

        :return: None. Progress totals are printed.
        """

        files_copied = 0

        dirs = [
            d for d in listdir(self.base_dir)
            if isdir(join(self.base_dir, d)) and not d.startswith("block")
        ]
        print("Number of directories: {}".format(len(dirs)))

        for start_d in dirs:
            start_p = join(self.base_dir, start_d)
            mol_files = [
                f for f in listdir(start_p)
                if isfile(join(start_p, f)) and f.endswith(".mol")
            ]
            out_files = [
                f for f in listdir(start_p)
                if isfile(join(start_p, f)) and ".out" in f
            ]

            for mf in mol_files:
                is_covered = False
                mol_id = extract_id(mf)

                mol_obj = get_molecule(join(start_p, mf))

                for out in out_files:
                    qcout = QCOutput(join(start_p, out))
                    if sorted(
                            qcout.data["initial_molecule"].species) == sorted(
                                mol_obj.species):
                        # If there is already output, do not copy any files
                        is_covered = True
                        break

                if is_covered:
                    continue

                for other_d in dirs:
                    if other_d == start_d:
                        continue

                    other_p = join(self.base_dir, other_d)
                    # Check if this id is present
                    other_mol_files = [
                        f for f in listdir(other_p) if isfile(join(other_p, f))
                        and f.endswith(".mol") and mol_id in f
                    ]
                    other_out_files = [
                        f for f in listdir(other_p)
                        if isfile(join(other_p, f)) and ".out" in f
                    ]
                    to_copy = []
                    for other_mol in other_mol_files:
                        if other_mol.startswith(self.product_pre):
                            to_copy = [
                                f for f in other_out_files
                                if f.startswith(self.product_pre)
                            ]
                        elif other_mol.startswith(self.reactant_pre):
                            to_check = [
                                f for f in other_out_files
                                if f.startswith(self.reactant_pre)
                            ]
                            # Several reactants may share the prefix; keep only
                            # outputs computed for this very molecule.
                            to_copy = []
                            for file in to_check:
                                qcout = QCOutput(join(other_p, file))
                                if qcout.data[
                                        "initial_molecule"].species == mol_obj.species:
                                    to_copy.append(file)
                        else:
                            to_copy = []
                    for file in to_copy:
                        shutil.copyfile(join(other_p, file),
                                        join(start_p, file + "_copy"))
                        files_copied += 1

                    # Decide coverage from what was copied for THIS molecule;
                    # the running total also counts earlier molecules.
                    if to_copy:
                        break
        print("Number of files copied: {}".format(files_copied))
Example #10
    def extract_reaction_thermo_db(self, directory, opt=None, freq=None,
                                   sp=None):
        """
        Gathers all relevant reaction parameters, including references to
        each job performed.

        :param directory: Directory name where the reaction is stored. Right
            now, this is the easiest way to identify the reaction. In the
            future, more sophisticated searching should be used. A path that
            is not absolute is resolved against ``self.base_dir``.
        :param opt: dict containing information about the optimization jobs. By
            default, this is None, and that information will be obtained by
            querying the "molecules" collection of ``self.db``.
        :param freq: dict containing information about the frequency jobs. By
            default, this is None, and that information will be obtained by
            querying the "molecules" collection of ``self.db``.
        :param sp: dict containing information about the single-point jobs. By
            default, this is None, and that information will be obtained by
            querying the "molecules" collection of ``self.db``.

        :return: dict with keys "dir_name", "opt", "freq", "sp",
            "reactant_ids", "product_ids" and "thermo" (enthalpy, entropy and
            t_star; t_star is 0 when the reaction entropy is zero).
        :raises RuntimeError: if ``self.db`` is None.
        """

        if self.db is None:
            raise RuntimeError("Could not connect to database. Check db_file "
                               "and try again later.")

        # To extract enthalpy and entropy from calculation results
        # Note: After all sp jobs are finished, it should be unnecessary to use
        # energy_opt
        def get_thermo(job):
            enthalpy = None
            entropy = None
            energy_sp = None

            for calc in job["calcs_reversed"]:
                if calc["task"]["type"] in ("freq", "frequency") \
                        and (enthalpy is None or entropy is None):
                    enthalpy = calc["enthalpy"]
                    entropy = calc["entropy"]
                if calc["task"]["type"] == "sp" and energy_sp is None:
                    energy_sp = calc["final_energy_sp"]

            return {
                "enthalpy": 0.0 if enthalpy is None else enthalpy,
                "entropy": 0.0 if entropy is None else entropy,
                "energy": 0.0 if energy_sp is None else energy_sp
            }

        # Summarise method/basis/solvent of the calculations of the given task
        # types. Returns None when the record has no such calculation; when it
        # has several, the last one listed in "calcs_reversed" wins.
        def get_settings(record, task_types):
            settings = None

            for calc in record["calcs_reversed"]:
                if calc["task"]["type"] not in task_types:
                    continue

                rem = calc["input"]["rem"]
                method = rem["method"]
                basis = rem["basis"]
                solvent_method = rem.get("solvent_method", None)

                if solvent_method == "smd":
                    if calc["input"]["smx"] is None:
                        solvent = None
                    else:
                        solvent = calc["input"]["smx"]["solvent"]
                elif solvent_method == "pcm":
                    solvent = calc["input"]["solvent"]
                else:
                    solvent = None

                settings = {
                    "method": method,
                    "basis": basis,
                    "solvent_method": solvent_method,
                    "solvent": solvent
                }

            return settings

        if abspath(directory) != directory:
            directory = join(self.base_dir, directory)

        mol_files = [f for f in listdir(directory) if f.endswith(".mol")]

        dir_ids = [extract_id(f) for f in mol_files]

        collection = self.db.db["molecules"]

        # One record per .mol file, in the same order, so that records and
        # file names can be paired below.
        # NOTE(review): presumably find_one yields None for an id that is not
        # in the collection, which would fail further down -- confirm whether
        # missing records need explicit handling.
        records = [
            collection.find_one({"mol_id": str(mol_id)}) for mol_id in dir_ids
        ]

        # Sort files for if they are reactants or products
        reactants = []
        products = []
        for filename, record in zip(mol_files, records):
            # Job settings are taken from the first record that provides them.
            if opt is None:
                opt = get_settings(record, ("opt", "optimization"))
            if freq is None:
                freq = get_settings(record, ("freq", "frequency"))
            if sp is None:
                sp = get_settings(record, ("sp",))

            if filename.startswith(self.reactant_pre):
                reactants.append(record)
            elif filename.startswith(self.product_pre):
                products.append(record)
            else:
                print("Skipping {} because it cannot be determined if it is "
                      "reactant or product.".format(filename))

        # Get ids
        reactant_ids = [r["mol_id"] for r in reactants]
        product_ids = [p["mol_id"] for p in products]

        # Get thermo data
        rct_thermo = [get_thermo(r) for r in reactants]
        pro_thermo = [get_thermo(p) for p in products]

        # Compile reaction thermo from reactant and product thermos
        delta_e = sum(p["energy"]
                      for p in pro_thermo) - sum(r["energy"]
                                                 for r in rct_thermo)
        delta_e *= 627.509
        delta_h = sum(p["enthalpy"]
                      for p in pro_thermo) - sum(r["enthalpy"]
                                                 for r in rct_thermo) + delta_e
        delta_h *= 1000 * 4.184
        delta_s = sum(p["entropy"]
                      for p in pro_thermo) - sum(r["entropy"]
                                                 for r in rct_thermo)
        delta_s *= 4.184
        thermo = {"enthalpy": delta_h, "entropy": delta_s}

        try:
            thermo["t_star"] = delta_h / delta_s
        except ZeroDivisionError:
            thermo["t_star"] = 0

        result = {
            "dir_name": directory,
            "opt": opt,
            "freq": freq,
            "sp": sp,
            "reactant_ids": reactant_ids,
            "product_ids": product_ids,
            "thermo": thermo
        }

        return result