def test_extract_id(self):
    """
    Check extract_id against the supported input shapes: a bare directory
    (no filename), a bare filename, a bare ID with no extension, a full
    path, and a filename whose ID follows an underscore-separated prefix.
    """
    # (input string, expected ID), checked in the listed order
    cases = (
        ("/home/moltherm/", ""),
        ("1234.mol", "1234"),
        ("44511", "44511"),
        ("/data/moltherm/112358.mol", "112358"),
        ("/global/homes/m/moltherm/1_90210.mol", "90210"),
    )

    for raw, expected in cases:
        self.assertEqual(extract_id(raw), expected)
def map_reactants_to_reactions(self):
    """
    Construct a dict showing which directories share each reactant.

    This is useful for analysis of common reactants, and to identify the
    "source" of a given reactant (in which directory the calculation
    actually took place).

    Only subdirectories of self.base_dir whose names do not start with
    "block" are scanned, and only files that end in ".mol" and start with
    self.reactant_pre are counted as reactants.

    :return: dict {mol_id: [directory names containing that reactant]},
        where mol_id is whatever extract_id returns for the molfile name.
    """
    mapping = {}

    for d in listdir(self.base_dir):
        dir_path = join(self.base_dir, d)
        # Each entry is filtered exactly once; there is no need to re-check
        # the same conditions again inside the loop body.
        if d.startswith("block") or not isdir(dir_path):
            continue

        for molfile in listdir(dir_path):
            if molfile.endswith(".mol") \
                    and molfile.startswith(self.reactant_pre):
                mapping.setdefault(extract_id(molfile), []).append(d)

    return mapping
def get_completed_molecules(self, dirs=None, extra=False):
    """
    Collect the molecules whose single-point output reports completion.

    Only output files with "sp" in their name are parsed; a molecule is
    counted as soon as any such file has a truthy "completion" entry.

    :param dirs: List of directories to search for completed molecules.
        Default None, meaning every subdirectory of self.base_dir that
        does not start with "block".
    :param extra: If True, include directory of completed reaction and
        name of molfile along with mol_id.
    :return: set of mol_ids, or set of (mol_id, directory, molfile) tuples
        when extra is True.
    """
    candidates = []
    for entry in listdir(self.base_dir):
        if not isdir(join(self.base_dir, entry)) \
                or entry.startswith("block"):
            continue
        if dirs is not None and entry not in dirs:
            continue
        candidates.append(entry)

    finished = set()

    for subdir in candidates:
        subdir_path = join(self.base_dir, subdir)
        file_map = associate_qchem_to_mol(self.base_dir, subdir)

        for molfile, qcfiles in file_map.items():
            mol_id = extract_id(molfile)
            record = (mol_id, subdir, molfile) if extra else mol_id

            for outfile in qcfiles["out"]:
                if "sp" not in outfile:
                    continue

                # Currently will catch iefpcm or smd
                sp_output = QCOutput(join(subdir_path, outfile))
                if sp_output.data.get("completion", False):
                    finished.add(record)

    return finished
def get_completed_reactions(self):
    """
    Returns the directories (reactions) where all molecules are completed.

    A molecule counts as completed when its ID (as returned by extract_id
    for its ".mol" file) appears as a "mol_id" in the "molecules"
    collection of the database. Directories whose names start with "block"
    are ignored.

    Note: a directory that contains no ".mol" files at all is reported as
    completed, because there is nothing left to check in it.

    :return: set of directory names with complete information.
    :raises RuntimeError: if no database connection is available.
    """
    if self.db is None:
        raise RuntimeError("Could not connect to database. Check db_file "
                           "and try again later.")

    collection = self.db.db["molecules"]
    # A set gives constant-time membership tests in the loop below.
    completed_molecules = {x["mol_id"] for x in collection.find()}

    completed_reactions = set()

    for d in listdir(self.base_dir):
        path = join(self.base_dir, d)
        if d.startswith("block") or not isdir(path):
            continue

        mols = [
            extract_id(f) for f in listdir(path)
            if isfile(join(path, f)) and f.endswith(".mol")
        ]

        if all(m in completed_molecules for m in mols):
            completed_reactions.add(d)

    return completed_reactions
def get_unfinished_jobs(self, sp_params, name_pre="single_point",
                        dirs=None, max_cores=32):
    """
    Look for jobs where optimization and frequency calculations have
    successfully completed, but single-point has not. Then, for these
    cases, construct a workflow which will only run the sp job.

    Each molecule ID is scheduled at most once, even if it appears in
    several directories.

    :param sp_params: dict containing input parameters for single-point
        job
    :param name_pre: str representing prefix for all jobs.
    :param dirs: list of subdirectories to check for unfinished jobs.
        Default None, meaning that all subdirectories will be checked.
    :param max_cores: max_cores (int): Maximum number of cores to
        parallelize over. Defaults to 32.
    :return: Workflow containing one SinglePointFW per unfinished molecule.
    :raises RuntimeError: if self.subdirs is falsy.
    """
    if not self.subdirs:
        raise RuntimeError("Cannot run get_unfinished_jobs(); "
                           "Need reactions components to be isolated in "
                           "different subdirectories.")

    fws = []

    appropriate_dirs = [
        d for d in listdir(self.base_dir) if isdir(join(self.base_dir, d))
    ]
    if dirs is not None:
        appropriate_dirs = [d for d in appropriate_dirs if d in dirs]

    # IDs that already have a single-point Firework queued in this call.
    molecules_cleared = set()

    for d in appropriate_dirs:
        path = join(self.base_dir, d)
        file_map = associate_qchem_to_mol(self.base_dir, d)

        for key, values in file_map.items():
            mol_id = extract_id(key)

            if mol_id in molecules_cleared:
                continue

            in_files = values["in"]
            out_files = values["out"]

            freq_complete = False
            sp_complete = False

            # Check if this molecule has finished freq, sp
            # If there is no sp output file, or if the sp output file did
            # not complete, then we may proceed
            for out_file in out_files:
                if "freq" in out_file:
                    freq_out = QCOutput(join(path, out_file))
                    if freq_out.data.get("completion", False):
                        freq_complete = True
                elif "sp" in out_file:
                    sp_out = QCOutput(join(path, out_file))
                    if sp_out.data.get("completion", False):
                        sp_complete = True

            if not freq_complete or sp_complete:
                continue

            # The molecule for the sp job is read from the frequency input
            # file; if several match, the last one listed is used.
            freq_in_file = None
            for in_file in in_files:
                if "freq" in in_file:
                    freq_in_file = in_file

            if freq_in_file is None:
                # We could parse output files to get previous input
                # information, but we should try to keep all input
                # files in the same directory
                continue

            stem = join(path, key.replace(".mol", ""))

            freq_input = QCInput.from_file(join(path, freq_in_file))

            fw = SinglePointFW(molecule=freq_input.molecule,
                               name="{}: {}/{}".format(name_pre, d, mol_id),
                               qchem_cmd="qchem -slurm",
                               multimode="openmp",
                               input_file=stem + ".in",
                               output_file=stem + ".out",
                               qclog_file=stem + ".qclog",
                               max_cores=max_cores,
                               sp_params=sp_params)

            fws.append(fw)
            molecules_cleared.add(mol_id)

    return Workflow(fws)
def get_reaction_set_workflow(self, name_pre="opt_freq_sp", max_cores=32,
                              qchem_input_params=None, sp_params=None):
    """Generates a Fireworks Workflow to find the structures and energies of
    the reactants and products of a single reaction.

    Note: as written now, this function will only work if self.subdirs is
    True; that is, only if each reaction is in a separate subdirectory.
    Later additions could allow for some other means of specifying the
    separate reactions within a single directory.

    Each molecule ID gets at most one Firework: IDs already present in the
    database (when self.db is set) or already scheduled earlier in this
    call are skipped.

    :param name_pre: str indicating the prefix which should be used for all
        Firework names
    :param max_cores: int specifying number of processes/threads that can
        be used for this workflow.
    :param qchem_input_params: dict
    :param sp_params: For OptFreqSPFW, single-point calculations can be
        treated differently from Opt and Freq. In this case, another dict
        for sp must be used.
    :return: Workflow
    :raises RuntimeError: if self.subdirs is falsy.
    """
    if not self.subdirs:
        raise RuntimeError("Cannot run get_reaction_set_workflow(); "
                           "Need reactions components to be isolated in "
                           "different subdirectories.")

    fws = []

    dirs = [
        d for d in listdir(self.base_dir) if isdir(join(self.base_dir, d))
    ]

    # Only set up a workflow if it is worthwhile (the reaction actually
    # proceeds as written, and all atoms add up)
    appropriate_dirs = self.check_appropriate_dirs(dirs)

    # Keep track of which molecules have already been run as jobs before
    if self.db is not None:
        molecules_registered = {
            extract_id(fw["task_label"])
            for fw in self.db.collection.find()
        }
    else:
        molecules_registered = set()

    suffix = ".mol"

    for d in appropriate_dirs:
        path = join(self.base_dir, d)
        files = [
            f for f in listdir(path)
            if isfile(join(path, f)) and f.endswith(suffix)
        ]
        rcts = [f for f in files if f.startswith(self.reactant_pre)]
        pros = [f for f in files if f.startswith(self.product_pre)]

        # Reactants are scheduled before products; the index in each
        # group's own list names the input/output/log files.
        for prefix, molfiles in ((self.reactant_pre, rcts),
                                 (self.product_pre, pros)):
            for i, molfile in enumerate(molfiles):
                # Slice the suffix off rather than using str.rstrip, which
                # strips a *set of characters* and would also eat trailing
                # "m", "o", "l" or "." characters belonging to the ID.
                mol_id = molfile[:-len(suffix)].split("_")[-1]

                if mol_id in molecules_registered:
                    continue
                molecules_registered.add(mol_id)

                mol = get_molecule(join(self.base_dir, d, molfile))
                stem = join(path, prefix + str(i))

                fw = OptFreqSPFW(molecule=mol,
                                 name="{}: {}/{}".format(name_pre, d,
                                                         molfile),
                                 qchem_cmd="qchem -slurm",
                                 input_file=stem + ".in",
                                 output_file=stem + ".out",
                                 qclog_file=stem + ".qclog",
                                 max_cores=max_cores,
                                 qchem_input_params=qchem_input_params,
                                 sp_params=sp_params,
                                 db_file=self.db_file)

                fws.append(fw)

    return Workflow(fws)
def get_reaction_data(self, directory=None, mol_ids=None):
    """
    Gather the data describing one reaction, both per molecule and as a
    reactants-versus-product difference.

    When a directory is given, the molecule IDs are read from its ".mol"
    files (any mol_ids argument is replaced) and the reaction thermo is
    taken from extract_reaction_thermo_db. Otherwise the thermo is derived
    from the per-molecule data. The molecule with the largest "molecule"
    entry (by len) is treated as the product; all others are reactants.

    :param directory: Subdirectory where molecule data is located.
    :param mol_ids: List of unique IDs for molecules associated with the
        reaction
    :return: dict of relevant reaction data, with keys "thermo",
        "dir_name", "mol_ids", "product" and "reactants".
    :raises ValueError: if neither directory nor mol_ids is supplied.
    """
    if directory is None and mol_ids is None:
        raise ValueError(
            "get_reaction_data requires either a directory or "
            "a set of molecule ids.")

    thermo = None

    if directory is not None:
        mol_ids = [
            extract_id(name)
            for name in listdir(join(self.base_dir, directory))
            if name.endswith(".mol")
        ]
        molecules = [self.get_molecule_data(m) for m in mol_ids]
        thermo = self.extract_reaction_thermo_db(directory)["thermo"]
    else:
        molecules = [self.get_molecule_data(m) for m in mol_ids]

    # Smallest molecules first, so the product ends up last.
    molecules = sorted(molecules, key=lambda entry: len(entry["molecule"]))
    product = molecules[-1]
    reactants = molecules[:-1]

    reaction_data = {
        "thermo": thermo,
        "dir_name": directory,
        "mol_ids": mol_ids,
        "product": product,
        "reactants": reactants,
    }

    if thermo is None:
        product_h = product["enthalpy"] + product["energy"]
        reactant_h = sum(r["enthalpy"] + r["energy"] for r in reactants)
        delta_h = product_h - reactant_h

        product_s = product["entropy"]
        reactant_s = sum(r["entropy"] for r in reactants)
        delta_s = product_s - reactant_s

        try:
            t_star = delta_h / delta_s
        except ZeroDivisionError:
            t_star = 0

        reaction_data["thermo"] = {
            "enthalpy": delta_h,
            "entropy": delta_s,
            "t_star": t_star,
        }

    return reaction_data
def extract_reaction_thermo_files(self, path):
    """
    Naively scrape thermo data from QChem output files.

    For every reactant and product ".mol" file in the directory, the
    matching output files (same prefix, molecule ID contained in the file
    name, ".out" in the name) are parsed. Enthalpy and entropy come from
    files with "freq" in their name; the electronic energy comes from an
    "sp" file when one yields a non-zero energy, otherwise from an "opt"
    file.

    :param path: Path to a subdirectory, relative to self.base_dir.
    :return: dict with keys "thermo" ({"enthalpy", "entropy",
        "t_critical", "has_sp"}), "directory", "reactant_ids" and
        "product_ids". "t_critical" is None when the entropy change is
        zero.
    """
    base_path = join(self.base_dir, path)
    # List the directory once and reuse the listing for every filter.
    dir_files = listdir(base_path)

    def find_outputs(prefix, skip_copies):
        """Return (ids, {id: [output files]}) for molecules with prefix."""
        ids = [
            extract_id(f) for f in dir_files
            if f.endswith(".mol") and f.startswith(prefix)
        ]
        out_map = {
            m: [
                f for f in dir_files
                if f.startswith(prefix) and m in f and ".out" in f
                and not (skip_copies and f.endswith("_copy"))
            ]
            for m in ids
        }
        return ids, out_map

    def accumulate(prefix, out_map):
        """Sum enthalpy, entropy and energy over all molecules in out_map."""
        totals = {"enthalpy": 0, "entropy": 0, "energy": 0, "has_sp": {}}

        for mol, outs in out_map.items():
            enthalpy = 0
            entropy = 0
            energy_opt = 0
            energy_sp = 0

            for out in outs:
                qcout = QCOutput(join(base_path, out))

                # Catch potential for Nonetype entries
                if "freq" in out:
                    enthalpy = qcout.data.get("enthalpy", 0) or 0
                    entropy = qcout.data.get("entropy", 0) or 0
                elif "opt" in out:
                    energy_opt = qcout.data.get("final_energy", 0) or 0
                elif "sp" in out:
                    energy_sp = qcout.data.get("final_energy_sp", 0) or 0

            # Enthalpy calculation should actually be enthalpy - energy_sp
            # But currently, not all calculations have sp.
            # Compare the value itself against zero: truncating with int()
            # would discard any genuine energy between -1 and 1.
            has_sp = energy_sp != 0
            totals["energy"] += energy_sp if has_sp else energy_opt
            totals["has_sp"][prefix + str(mol)] = has_sp
            totals["enthalpy"] += enthalpy
            totals["entropy"] += entropy

            print(path, mol, enthalpy, energy_sp)

        return totals

    # Files ending in "_copy" are ignored for reactants only.
    # NOTE(review): products do not get the same exclusion - confirm
    # whether that asymmetry is intended.
    rct_ids, rct_map = find_outputs(self.reactant_pre, skip_copies=True)
    pro_ids, pro_map = find_outputs(self.product_pre, skip_copies=False)

    rct_thermo = accumulate(self.reactant_pre, rct_map)
    pro_thermo = accumulate(self.product_pre, pro_map)

    thermo_data = {}

    # Generate totals as ∆H = H_pro - H_rct, ∆S = S_pro - S_rct
    # Also ensures that units are appropriate (Joules/mol,
    # rather than cal/mol or kcal/mol, or hartree for energy)
    energy = (pro_thermo["energy"] - rct_thermo["energy"]) * 627.509
    enthalpy = (pro_thermo["enthalpy"] - rct_thermo["enthalpy"])
    print(path, energy, enthalpy)
    thermo_data["enthalpy"] = (energy + enthalpy) * 1000 * 4.184
    thermo_data["entropy"] = (pro_thermo["entropy"] -
                              rct_thermo["entropy"]) * 4.184
    try:
        thermo_data["t_critical"] = (thermo_data["enthalpy"] /
                                     thermo_data["entropy"])
    except ZeroDivisionError:
        thermo_data["t_critical"] = None

    # Combine dicts from pro_thermo and rct_thermo
    thermo_data["has_sp"] = {
        **pro_thermo["has_sp"],
        **rct_thermo["has_sp"]
    }

    result = {
        "thermo": thermo_data,
        "directory": path,
        "reactant_ids": rct_ids,
        "product_ids": pro_ids
    }

    return result
def copy_outputs_across_directories(self):
    """
    Copy output files between subdirectories to ensure that all reaction
    directories that need outputs of a given molecule will have them.

    For every ".mol" file that has no output with a matching set of
    species in its own directory, the other directories are searched in
    order for a molfile with the same ID. The first directory that yields
    output files supplies them; they are copied next to the molfile with
    "_copy" appended to the file name.

    Note: This function should not be used unless necessary. It was
    written because for each directory, only a single database entry was
    being made (because db entries were being overwritten by default).

    :return: None. Progress counts are printed to stdout.
    """
    files_copied = 0

    dirs = [
        d for d in listdir(self.base_dir)
        if isdir(join(self.base_dir, d)) and not d.startswith("block")
    ]
    print("Number of directories: {}".format(len(dirs)))

    for start_d in dirs:
        start_p = join(self.base_dir, start_d)
        start_files = [
            f for f in listdir(start_p) if isfile(join(start_p, f))
        ]
        mol_files = [f for f in start_files if f.endswith(".mol")]
        out_files = [f for f in start_files if ".out" in f]

        for mf in mol_files:
            mol_id = extract_id(mf)
            mol_obj = get_molecule(join(start_p, mf))
            target_species = sorted(mol_obj.species)

            # If there is already output, do not copy any files
            if any(
                    sorted(QCOutput(join(start_p, out))
                           .data["initial_molecule"].species)
                    == target_species
                    for out in out_files):
                continue

            for other_d in dirs:
                if other_d == start_d:
                    continue

                other_p = join(self.base_dir, other_d)
                other_files = [
                    f for f in listdir(other_p)
                    if isfile(join(other_p, f))
                ]
                # Check if this id is present
                other_mol_files = [
                    f for f in other_files
                    if f.endswith(".mol") and mol_id in f
                ]
                other_out_files = [f for f in other_files if ".out" in f]

                # If several molfiles match, the last one decides.
                to_copy = []
                for other_mol in other_mol_files:
                    if other_mol.startswith(self.product_pre):
                        to_copy = [
                            f for f in other_out_files
                            if f.startswith(self.product_pre)
                        ]
                    elif other_mol.startswith(self.reactant_pre):
                        # Several reactants may share the prefix, so only
                        # take outputs whose molecule matches this one.
                        to_copy = [
                            f for f in other_out_files
                            if f.startswith(self.reactant_pre)
                            and QCOutput(join(other_p, f))
                            .data["initial_molecule"].species
                            == mol_obj.species
                        ]
                    else:
                        to_copy = []

                for file in to_copy:
                    shutil.copyfile(join(other_p, file),
                                    join(start_p, file + "_copy"))
                    files_copied += 1

                # Stop searching only once *this* molecule received files.
                # The running total files_copied must not be used here: it
                # stays positive after the first copy anywhere and would
                # end the search early for every later molecule.
                if to_copy:
                    break

    print("Number of files copied: {}".format(files_copied))
def extract_reaction_thermo_db(self, directory, opt=None, freq=None,
                               sp=None):
    """
    Gathers all relevant reaction parameters, including references to
    each job performed.

    :param directory: Directory name where the reaction is stored. Right
        now, this is the easiest way to identify the reaction. In the
        future, more sophisticated searching should be used. A value that
        is not already an absolute path is joined onto self.base_dir.
    :param opt: dict containing information about the optimization jobs. By
        default, this is None, and that information will be obtained by
        querying the database records of the molecules in the directory.
    :param freq: dict containing information about the frequency jobs. By
        default, this is None, and that information will be obtained by
        querying the database records of the molecules in the directory.
    :param sp: dict containing information about the single-point jobs. By
        default, this is None, and that information will be obtained by
        querying the database records of the molecules in the directory.
    :return: dict with keys "dir_name", "opt", "freq", "sp",
        "reactant_ids", "product_ids" and "thermo" (itself a dict with
        "enthalpy", "entropy" and "t_star"; "t_star" is 0 when the entropy
        change is zero).
    :raises RuntimeError: if no database connection is available.
    """
    if self.db is None:
        raise RuntimeError("Could not connect to database. Check db_file "
                           "and try again later.")

    # To extract enthalpy and entropy from calculation results
    # Note: After all sp jobs are finished, it should be unnecessary to use
    # energy_opt
    def get_thermo(job):
        """Return the first usable enthalpy, entropy and sp energy of job."""
        enthalpy = None
        entropy = None
        energy_sp = None

        for calc in job["calcs_reversed"]:
            task_type = calc["task"]["type"]
            if task_type in ("freq", "frequency") \
                    and (enthalpy is None or entropy is None):
                enthalpy = calc["enthalpy"]
                entropy = calc["entropy"]
            if task_type == "sp" and energy_sp is None:
                energy_sp = calc["final_energy_sp"]

        # Missing values contribute nothing to the reaction totals.
        if enthalpy is None:
            enthalpy = 0.0
        if entropy is None:
            entropy = 0.0
        if energy_sp is None:
            energy_sp = 0.0

        return {
            "enthalpy": enthalpy,
            "entropy": entropy,
            "energy": energy_sp
        }

    def get_settings(record, task_types):
        """Return method/basis/solvent of the first calc of a given type.

        Returns None when the record holds no calculation of that type.
        """
        for calc in record["calcs_reversed"]:
            if calc["task"]["type"] not in task_types:
                continue

            rem = calc["input"]["rem"]
            method = rem["method"]
            basis = rem["basis"]
            solvent_method = rem.get("solvent_method", None)

            if solvent_method == "smd":
                smx = calc["input"]["smx"]
                solvent = None if smx is None else smx["solvent"]
            elif solvent_method == "pcm":
                solvent = calc["input"]["solvent"]
            else:
                solvent = None

            return {
                "method": method,
                "basis": basis,
                "solvent_method": solvent_method,
                "solvent": solvent
            }

        return None

    if abspath(directory) != directory:
        directory = join(self.base_dir, directory)

    mol_files = [f for f in listdir(directory) if f.endswith(".mol")]
    dir_ids = [extract_id(f) for f in mol_files]

    collection = self.db.db["molecules"]
    records = [
        collection.find_one({"mol_id": str(mol_id)}) for mol_id in dir_ids
    ]

    # Sort files for if they are reactants or products
    reactants = []
    products = []
    for filename, record in zip(mol_files, records):
        # Job settings are taken from the first record that provides them;
        # values passed in by the caller are never overwritten.
        if opt is None:
            opt = get_settings(record, ("opt", "optimization"))
        if freq is None:
            freq = get_settings(record, ("freq", "frequency"))
        if sp is None:
            sp = get_settings(record, ("sp",))

        if filename.startswith(self.reactant_pre):
            reactants.append(record)
        elif filename.startswith(self.product_pre):
            products.append(record)
        else:
            print("Skipping {} because it cannot be determined if it is "
                  "reactant or product.".format(filename))

    # Get ids
    reactant_ids = [r["mol_id"] for r in reactants]
    product_ids = [p["mol_id"] for p in products]

    # Get thermo data
    rct_thermo = [get_thermo(r) for r in reactants]
    pro_thermo = [get_thermo(p) for p in products]

    # Compile reaction thermo from reactant and product thermos.
    # The factors below mirror the file-based extraction: 627.509 scales
    # the energy difference, 1000 * 4.184 the enthalpy and 4.184 the
    # entropy (presumably hartree -> kcal/mol and (k)cal -> J; confirm).
    delta_e = sum(p["energy"] for p in pro_thermo) - \
        sum(r["energy"] for r in rct_thermo)
    delta_e *= 627.509
    delta_h = sum(p["enthalpy"] for p in pro_thermo) - \
        sum(r["enthalpy"] for r in rct_thermo) + delta_e
    delta_h *= 1000 * 4.184
    delta_s = sum(p["entropy"] for p in pro_thermo) - \
        sum(r["entropy"] for r in rct_thermo)
    delta_s *= 4.184

    thermo = {"enthalpy": delta_h, "entropy": delta_s}

    try:
        thermo["t_star"] = delta_h / delta_s
    except ZeroDivisionError:
        thermo["t_star"] = 0

    result = {
        "dir_name": directory,
        "opt": opt,
        "freq": freq,
        "sp": sp,
        "reactant_ids": reactant_ids,
        "product_ids": product_ids,
        "thermo": thermo
    }

    return result