def check_appropriate_dirs(self, dirs):
    """
    Select the reaction subdirectories whose atoms balance.

    A subdirectory is kept when the total number of atoms over all of its
    reactant ``.mol`` files equals the total over all of its product
    ``.mol`` files. Reactant and product files are recognised by the
    ``self.reactant_pre`` / ``self.product_pre`` filename prefixes. This is
    a deliberately crude sanity check (atom counts only, no element
    bookkeeping).

    :param dirs: Iterable of subdirectory names, relative to self.base_dir.
    :return: List of the names from ``dirs`` that passed the check, in
        their original order.
    """
    balanced = []

    for subdir in dirs:
        subdir_path = join(self.base_dir, subdir)
        entries = [name for name in listdir(subdir_path)
                   if isfile(join(subdir_path, name))]

        def count_atoms(prefix):
            # Sum the sizes of every molecule whose file carries `prefix`.
            total = 0
            for name in entries:
                if name.startswith(prefix) and name.endswith(".mol"):
                    total += len(
                        get_molecule(join(self.base_dir, subdir, name)))
            return total

        reactant_atoms = count_atoms(self.reactant_pre)
        product_atoms = count_atoms(self.product_pre)

        if product_atoms == reactant_atoms:
            balanced.append(subdir)

    return balanced
def test_get_molecule(self):
    """
    Check get_molecule against the benzene test file.

    Verifies that the heavy atoms match what pymatgen reads directly from
    the file, that implicit hydrogens are added, that repeated calls give
    equal molecules, and that the resulting coordinates match the stored
    reference geometry.
    """
    mol_path = join(files_dir, "benzene.mol")
    reference = Molecule.from_file(mol_path)
    processed = get_molecule(mol_path)

    # Ignoring hydrogens, the species must match the raw file contents
    heavy_atoms = []
    for element in processed.species:
        if str(element).upper() != "H":
            heavy_atoms.append(element)
    self.assertEqual(heavy_atoms, reference.species)

    # Test that implicit hydrogens are added appropriately
    symbols = [str(element) for element in processed.species]
    symbols.sort()
    self.assertEqual(symbols, ["C"] * 6 + ["H"] * 6)

    # Ensure that get_molecule is deterministic, always produces the same
    # molecule
    self.assertEqual(processed, get_molecule(mol_path))

    expected_coords = np.array([
        [-7.59858151e-01, 1.16908119e+00, -1.61105859e-03],
        [-1.39065495e+00, -7.49583582e-02, -9.63095317e-04],
        [-6.28683825e-01, -1.24326334e+00, 6.60526465e-04],
        [7.64084196e-01, -1.16752892e+00, 1.63614012e-03],
        [1.39488100e+00, 7.65106237e-02, 9.87135623e-04],
        [6.32909871e-01, 1.24481561e+00, -6.36441101e-04],
        [-1.35352141e+00, 2.07932532e+00, -2.87509442e-03],
        [-2.47578162e+00, -1.33964201e-01, -1.72330284e-03],
        [-1.12014718e+00, -2.21251339e+00, 1.16530208e-03],
        [1.35774746e+00, -2.07777305e+00, 2.90204589e-03],
        [2.48000766e+00, 1.35516465e-01, 1.74638272e-03],
        [1.12437322e+00, 2.21406566e+00, -1.14215271e-03],
    ])
    self.assertTrue(np.allclose(processed.cart_coords, expected_coords))
def get_modified_molecule_workflow(self, directory, reactant, index,
                                   func_group, qchem_input_params,
                                   sp_params, bond_order=1, do_rct=True,
                                   new_dir=None):
    """
    Modify a reactant molecule, mimic that change in the product, and then
    create a workflow with the modified molecules.

    The reactant is located inside the product by an element-aware
    subgraph isomorphism; the atom of the product that corresponds to
    ``index`` in the reactant receives the same functional group. The
    modified molecules are written as ``.mol`` files and one OptFreqSPFW
    is created for the product (and, optionally, the reactant).

    Note: the reactant graph must be found as a subgraph of the product
    graph; otherwise the substitution cannot be mirrored and a
    RuntimeError is raised. If several matches exist, the last one
    produced by the matcher is used.

    :param directory: Subdirectory (of self.base_dir) where the reaction
        files are.
    :param reactant: File name of the reactant to be modified. It MUST be
        a reactant, and cannot be the product molecule.
    :param index: Index (in the reactant molecule) where the functional
        group is to be substituted.
    :param func_group: Either a string representing a functional group
        (from pymatgen.structure.core.FunctionalGroups), or a Molecule
        with a dummy atom X. Its string form is also used in the names of
        the generated files.
    :param qchem_input_params: dict of Q-Chem input parameters, passed
        through to OptFreqSPFW.
    :param sp_params: dict of parameters for the single-point step,
        passed through to OptFreqSPFW.
    :param bond_order: Order of the bond between the functional group and
        the base molecule. Default 1, for single bond.
    :param do_rct: If True (default), calculate both modified reactant
        and modified product; if False, only calculate for the product.
    :param new_dir: Name for new directory (under self.base_dir) to store
        modified molecules. Default is None, in which case the modified
        molecules are written next to the originals.
    :return: Workflow
    :raises IndexError: if the reactant file or a product file cannot be
        found in ``directory``.
    :raises RuntimeError: if the reactant is not a subgraph of the
        product.
    """
    base_path = join(self.base_dir, directory)
    mol_files = [
        f for f in listdir(base_path)
        if isfile(join(base_path, f)) and f.endswith(".mol")
    ]

    # For this workflow, assume a single product
    rct_file = [f for f in mol_files if f == reactant][0]
    pro_file = [f for f in mol_files if f.startswith(self.product_pre)][0]

    # Set up - strategy to extract bond orders
    # Node match for isomorphism check
    strat = OpenBabelNN()
    nm = iso.categorical_node_match("specie", "C")

    # Set up molecule graphs, including node attributes
    rct_mg = MoleculeGraph.with_local_env_strategy(
        get_molecule(join(base_path, rct_file)), strat,
        reorder=False, extend_structure=False)
    rct_mg.set_node_attributes()
    rct_graph = rct_mg.graph.to_undirected()

    pro_mg = MoleculeGraph.with_local_env_strategy(
        get_molecule(join(base_path, pro_file)), strat,
        reorder=False, extend_structure=False)
    pro_mg.set_node_attributes()
    pro_graph = pro_mg.graph.to_undirected()

    # To determine the subgraph of pro_mg that is derived from the reactant
    matcher = iso.GraphMatcher(pro_graph, rct_graph, node_match=nm)
    if not matcher.subgraph_is_isomorphic():
        raise RuntimeError("Cannot find reactant molecule within product "
                           "molecule.")

    # Keep the last isomorphism found (product index -> reactant index)
    for mm in matcher.subgraph_isomorphisms_iter():
        mapping = mm

    # Reverse mapping: reactant index -> product index
    mapping = {mapping[i]: i for i in mapping.keys()}

    new_path = None
    if new_dir is not None:
        try:
            os.mkdir(join(self.base_dir, new_dir))
        except FileExistsError:
            print("New directory {} already exists in {}".format(
                new_dir, self.base_dir))
        new_path = join(self.base_dir, new_dir)

    rct_mg.substitute_group(index, func_group, OpenBabelNN,
                            bond_order=bond_order,
                            extend_structure=False)
    pro_mg.substitute_group(mapping[index], func_group, OpenBabelNN,
                            bond_order=bond_order,
                            extend_structure=False)

    rct_name = rct_file.replace(".mol", "{}{}".format(func_group, index))
    pro_name = pro_file.replace(".mol", "{}{}".format(func_group, index))

    if new_path is None:
        new_path = base_path

    rct_mg.molecule.to(fmt="mol",
                       filename=join(new_path, rct_name + ".mol"))
    pro_mg.molecule.to(fmt="mol",
                       filename=join(new_path, pro_name + ".mol"))

    # Bring the untouched molecules along so the new directory holds the
    # whole reaction. When writing in place there is nothing to copy, and
    # shutil.copyfile would raise SameFileError on src == dst.
    if os.path.abspath(new_path) != os.path.abspath(base_path):
        for mol_file in mol_files:
            if mol_file != pro_file and mol_file != rct_file:
                shutil.copyfile(join(base_path, mol_file),
                                join(new_path, mol_file))

    fws = []

    fws.append(
        OptFreqSPFW(molecule=pro_mg.molecule,
                    name="Modification: {}/{}".format(new_path, pro_name),
                    qchem_cmd="qchem -slurm",
                    input_file=join(new_path, pro_name + ".in"),
                    output_file=join(new_path, pro_name + ".out"),
                    qclog_file=join(new_path, pro_name + ".qclog"),
                    max_cores=32,
                    qchem_input_params=qchem_input_params,
                    sp_params=sp_params,
                    db_file=self.db_file))

    if do_rct:
        fws.append(
            OptFreqSPFW(molecule=rct_mg.molecule,
                        name="Modification: {}/{}".format(
                            new_path, rct_name),
                        qchem_cmd="qchem -slurm",
                        input_file=join(new_path, rct_name + ".in"),
                        output_file=join(new_path, rct_name + ".out"),
                        qclog_file=join(new_path, rct_name + ".qclog"),
                        max_cores=32,
                        qchem_input_params=qchem_input_params,
                        sp_params=sp_params,
                        db_file=self.db_file))

    return Workflow(fws)
def get_molecule_workflow(self, path, mol_id, name_pre="molecule_opt_freq",
                          qchem_cmd="qchem -slurm", max_cores=32,
                          qchem_input_params=None, modify_mol=True,
                          max_iterations=3, max_perturb_scale=0.3):
    """
    Generates a Fireworks Workflow to optimize a molecular geometry and
    perform a vibrational analysis (frequency calculation) in Q-Chem.

    Molecule files are looked up in ``<self.base_dir>/<path>/<mol_id>``
    and must start with ``mol_id`` and end with ``.mol``.

    - If several files match, one Firework is created per file and each
      one runs in its own ``<filename>_<i>`` subdirectory so that the
      calculations do not overwrite one another.
    - If exactly one file matches, the molecule database is consulted
      first; a stored optimized (or, failing that, initial) geometry is
      preferred over the file on disk.

    :param path: Required (sub)path, relative to self.base_dir, that
        contains the ``mol_id`` directory.
    :param mol_id: str representing the unique molecule identifier
    :param name_pre: str indicating the prefix which should be used for
        all Firework names
    :param qchem_cmd: str indicating how the Q-Chem code should be
        called. Default is "qchem -slurm", for a SLURM-based system.
    :param max_cores: int specifying how many cores the workflow should
        be split over. Default is 32.
    :param qchem_input_params: dict listing all parameters differing
        from default values.
    :param modify_mol: If True (default), use utility get_molecule to
        read the file; if False, use Molecule.from_file. Only consulted
        when several files match; the single-file branch always uses
        get_molecule when no database entry exists.
    :param max_iterations (int): Number of perturbation -> optimization
        -> frequency iterations to perform. Defaults to 3.
    :param max_perturb_scale (float): The maximum scaled perturbation
        that can be applied to the molecule. Defaults to 0.3.
    :return: Workflow
    :raises RuntimeError: if no matching ``.mol`` file is found.
    """
    fws = []

    base_path = join(self.base_dir, path, mol_id)
    files = [
        f for f in listdir(base_path)
        if isfile(join(base_path, f)) and f.startswith(mol_id)
        and f.endswith(".mol")
    ]

    if not files:
        raise RuntimeError("No valid files found.")

    if len(files) > 1:
        print("Multiple valid molecule files found.")
        print("Generating workflows for all valid files found.")
        for i, mol_file in enumerate(files):
            if modify_mol:
                mol = get_molecule(join(base_path, mol_file))
            else:
                mol = Molecule.from_file(join(base_path, mol_file))

            filename = mol_file.split(".")[0]
            dir_name = join(base_path, "{}_{}".format(filename, i))
            try:
                mkdir(dir_name)
            except FileExistsError:
                print("Subdirectory {} already exists".format(dir_name))

            # Run in the per-file subdirectory created above; running
            # every Firework in base_path would make them clobber each
            # other's input and output files.
            fw = FrequencyFlatteningOptimizeFW(
                molecule=mol,
                name=name_pre + "_{}".format(mol_id),
                qchem_cmd=qchem_cmd,
                qchem_input_params=qchem_input_params,
                multimode="openmp",
                max_cores=max_cores,
                directory=dir_name,
                max_iterations=max_iterations,
                max_molecule_perturb_scale=max_perturb_scale,
                db_file=self.db_file)
            fws.append(fw)
    else:
        mol_file = files[0]

        # Prefer a geometry from a previous calculation, if one exists
        entry = self.molecules.find_one({"mol_id": mol_id})
        if entry is None:
            mol = get_molecule(join(base_path, mol_file))
        else:
            geometry = entry["output"].get(
                'optimized_molecule',
                entry["output"].get('initial_molecule'))
            mol = Molecule.from_dict(geometry)

        # Honour the caller's iteration and perturbation settings here
        # as well, rather than a hard-coded iteration count.
        fw = FrequencyFlatteningOptimizeFW(
            molecule=mol,
            name=name_pre + "_{}".format(mol_id),
            qchem_cmd=qchem_cmd,
            qchem_input_params=qchem_input_params,
            multimode="openmp",
            max_cores=max_cores,
            directory=base_path,
            max_iterations=max_iterations,
            max_molecule_perturb_scale=max_perturb_scale,
            db_file=self.db_file)
        fws.append(fw)

    return Workflow(fws)
def get_reaction_set_workflow(self, name_pre="opt_freq_sp", max_cores=32,
                              qchem_input_params=None, sp_params=None):
    """
    Generates a Fireworks Workflow to find the structures and energies of
    the reactants and products of every reaction under self.base_dir.

    Only subdirectories accepted by self.check_appropriate_dirs are
    used. Each molecule is identified by the last ``_``-separated token
    of its file name (without the ``.mol`` extension); a molecule whose
    ID was already seen - either in the database or earlier in this
    call - is skipped so that it is only calculated once.

    Note: as written now, this function will only work if self.subdirs
    is True; that is, only if each reaction is in a separate
    subdirectory. Later additions could allow for some other means of
    specifying the separate reactions within a single directory.

    :param name_pre: str indicating the prefix which should be used for
        all Firework names
    :param max_cores: int specifying number of processes/threads that
        can be used for this workflow.
    :param qchem_input_params: dict
    :param sp_params: For OptFreqSPFW, single-point calculations can be
        treated differently from Opt and Freq. In this case, another
        dict for sp must be used.
    :return: Workflow
    :raises RuntimeError: if self.subdirs is False.
    """
    if not self.subdirs:
        raise RuntimeError("Cannot run get_reaction_set_workflow(); "
                           "Need reaction components to be isolated in "
                           "different subdirectories.")

    fws = []

    dirs = [
        d for d in listdir(self.base_dir) if isdir(join(self.base_dir, d))
    ]

    # Only set up a workflow if it is worthwhile (the reaction actually
    # proceeds as written, and all atoms add up)
    appropriate_dirs = self.check_appropriate_dirs(dirs)

    # Keep track of which molecules have already been run as jobs before
    if self.db is not None:
        molecules_registered = {
            extract_id(fw["task_label"])
            for fw in self.db.collection.find()
        }
    else:
        molecules_registered = set()

    extension = ".mol"

    for d in appropriate_dirs:
        path = join(self.base_dir, d)
        files = [
            f for f in listdir(path)
            if isfile(join(path, f)) and f.endswith(extension)
        ]
        rcts = [f for f in files if f.startswith(self.reactant_pre)]
        pros = [f for f in files if f.startswith(self.product_pre)]

        # Reactants first, then products; both are handled identically
        # apart from the prefix used for the Q-Chem file names.
        for prefix, group in ((self.reactant_pre, rcts),
                              (self.product_pre, pros)):
            for i, mol_file in enumerate(group):
                # Drop the extension by slicing: str.rstrip(".mol")
                # strips a *character set* and would also eat trailing
                # 'm', 'o', 'l' or '.' characters of the ID itself.
                mol_id = mol_file[:-len(extension)].split("_")[-1]
                if mol_id in molecules_registered:
                    continue
                molecules_registered.add(mol_id)

                mol = get_molecule(join(self.base_dir, d, mol_file))
                stem = prefix + str(i)

                fw = OptFreqSPFW(
                    molecule=mol,
                    name="{}: {}/{}".format(name_pre, d, mol_file),
                    qchem_cmd="qchem -slurm",
                    input_file=join(path, stem + ".in"),
                    output_file=join(path, stem + ".out"),
                    qclog_file=join(path, stem + ".qclog"),
                    max_cores=max_cores,
                    qchem_input_params=qchem_input_params,
                    sp_params=sp_params,
                    db_file=self.db_file)
                fws.append(fw)

    return Workflow(fws)
def get_single_reaction_workflow(self, name_pre="opt_freq_sp", path=None,
                                 filenames=None, max_cores=32,
                                 qchem_input_params=None, sp_params=None):
    """
    Generates a Fireworks Workflow to find the structures and energies of
    the reactants and products of a single reaction.

    One OptFreqSPFW is created per reactant file and per product file.
    Reactants and products are recognised by the self.reactant_pre and
    self.product_pre file name prefixes; only ``.mol`` files are used.

    :param name_pre: str indicating the prefix which should be used for
        all Firework names
    :param path: Specified (sub)path in which to run the reaction. By
        default, this is None, and the Fireworks will run in
        self.base_dir. It is also ignored when self.subdirs is False.
    :param filenames: Specified files within the path (if self.base_dir
        or a subdirectory) that should be considered a part of this
        reaction. If None (or empty), assume all files in the directory
        are to be involved.
    :param max_cores: int specifying number of processes/threads that
        can be used for this workflow.
    :param qchem_input_params: dict
    :param sp_params: For OptFreqSPFW, single-point calculations can be
        treated differently from Opt and Freq. In this case, another
        dict for sp must be used.
    :return: Workflow
    """
    fws = []

    # Fall back to base_dir when no subpath was given, as documented,
    # instead of failing inside join() on a None component.
    if self.subdirs and path is not None:
        base_path = join(self.base_dir, path)
    else:
        base_path = self.base_dir

    if filenames:
        candidates = [f for f in filenames if f.endswith(".mol")]
    else:
        # Assume that every file in the directory is part of the reaction
        candidates = [
            f for f in listdir(base_path)
            if isfile(join(base_path, f)) and f.endswith(".mol")
        ]

    rcts = [f for f in candidates if f.startswith(self.reactant_pre)]
    pros = [f for f in candidates if f.startswith(self.product_pre)]

    # Reactants first, then products; both are handled identically apart
    # from the prefix used for the Q-Chem file names.
    for prefix, group in ((self.reactant_pre, rcts),
                          (self.product_pre, pros)):
        for i, mol_file in enumerate(group):
            mol = get_molecule(join(base_path, mol_file))
            stem = prefix + str(i)

            fw = OptFreqSPFW(
                molecule=mol,
                name="{}: {}/{}".format(name_pre, path, mol_file),
                qchem_cmd="qchem -slurm",
                input_file=join(base_path, stem + ".in"),
                output_file=join(base_path, stem + ".out"),
                qclog_file=join(base_path, stem + ".qclog"),
                max_cores=max_cores,
                qchem_input_params=qchem_input_params,
                sp_params=sp_params,
                db_file=self.db_file)
            fws.append(fw)

    return Workflow(fws)
def get_single_molecule_workflow(self, mol_id, name_pre="opt_freq_sp",
                                 path=None, max_cores=32,
                                 max_iterations=1, max_perturb_scale=0.3,
                                 qchem_input_params=None, sp_params=None):
    """
    Generates a Fireworks Workflow to find the structure and energy of a
    single molecule.

    The first ``.mol`` file in the working directory whose name starts
    with ``mol_id`` is read with get_molecule and submitted as one
    OptFreqSPFW. Q-Chem input, output and log files are named
    ``<mol_id>.qin``, ``<mol_id>.qout`` and ``<mol_id>.qclog``.

    :param mol_id: ID string for molecule to be analyzed
    :param name_pre: str indicating the prefix which should be used for
        all Firework names
    :param path: str indicating subdirectory where calculation should
        take place. If None (default), or if self.subdirs is False, the
        calculation takes place in self.base_dir.
    :param max_cores: int specifying number of processes/threads that
        can be used for this workflow.
    :param max_iterations: For opt-freq-sp workflow, multiple iterations
        can be performed in case of negative frequencies. By default, no
        such "frequency flattening" is allowed (max_iterations=1); in
        general, 3 is recommended.
    :param max_perturb_scale (float): The maximum scaled perturbation
        that can be applied to the molecule. Defaults to 0.3.
    :param qchem_input_params: dict
    :param sp_params: For OptFreqSPFW, single-point calculations can be
        treated differently from Opt and Freq. In this case, another
        dict for sp must be used.
    :return: Workflow
    :raises IndexError: if no matching ``.mol`` file exists.
    """
    fws = []

    # Fall back to base_dir when no subpath was given instead of failing
    # inside join() on a None component.
    if self.subdirs and path is not None:
        base_path = join(self.base_dir, path)
    else:
        base_path = self.base_dir

    matches = [
        f for f in listdir(base_path)
        if isfile(join(base_path, f)) and f.startswith(mol_id)
        and f.endswith(".mol")
    ]
    if not matches:
        # IndexError is kept for backward compatibility with the former
        # bare "[0]" lookup; only the message is new.
        raise IndexError("No .mol file starting with {} found in {}".format(
            mol_id, base_path))
    file = matches[0]

    mol = get_molecule(join(base_path, file))

    infile = join(base_path, mol_id + ".qin")
    outfile = join(base_path, mol_id + ".qout")
    qclogfile = join(base_path, mol_id + ".qclog")

    fw = OptFreqSPFW(molecule=mol,
                     name="{}: {}/{}".format(name_pre, path, file),
                     qchem_cmd="qchem -slurm",
                     input_file=infile,
                     output_file=outfile,
                     qclog_file=qclogfile,
                     max_cores=max_cores,
                     max_iterations=max_iterations,
                     max_molecule_perturb_scale=max_perturb_scale,
                     qchem_input_params=qchem_input_params,
                     sp_params=sp_params,
                     db_file=self.db_file)
    fws.append(fw)

    return Workflow(fws)
def get_reaction_workflow(self, rxn_id, mol_dir=None,
                          name_pre="reaction_opt_freq",
                          qchem_cmd="qchem -slurm", max_cores=32,
                          qchem_input_params=None, max_iterations=3,
                          max_perturb_scale=0.3):
    """
    Generates a Fireworks Workflow to perform geometry optimizations and
    vibrational analyses on all of the molecules involved in a chemical
    reaction.

    The reaction is looked up by ``rxn_id`` in self.reactions, and one
    FrequencyFlatteningOptimizeFW is created per entry in its
    ``pro_ids`` and ``rct_ids`` lists (products first). Each molecule
    gets its own directory named after its ID. The starting geometry
    comes from self.molecules when an entry exists, and otherwise from a
    ``.mol`` file in the molecule's directory.

    :param rxn_id: str representing unique reaction identifier.
    :param mol_dir: str indicating a subdirectory (from self.base_dir)
        where molecule calculations should be stored. Default is None,
        indicating that all calculations should be done within
        self.base_dir.
    :param name_pre: str indicating the prefix which should be used for
        all Firework names
    :param qchem_cmd: str indicating how the Q-Chem code should be
        called. Default is "qchem -slurm", for a SLURM-based system.
    :param max_cores: int specifying how many cores the workflow should
        be split over. Default is 32.
    :param qchem_input_params: dict listing all parameters differing
        from default values.
    :param max_iterations (int): Number of perturbation -> optimization
        -> frequency iterations to perform. Defaults to 3.
    :param max_perturb_scale (float): The maximum scaled perturbation
        that can be applied to the molecule. Defaults to 0.3.
    :return: Workflow
    :raises RuntimeError: if the reaction is not in the database, or if
        a molecule is found neither in the database nor on disk.
    """
    fws = []

    if mol_dir is not None:
        base_path = join(self.base_dir, mol_dir)
    else:
        base_path = self.base_dir

    # Existing molecule directories; anything with "atomate" in its name
    # is not treated as a molecule directory.
    mol_dirs = [
        d for d in listdir(base_path)
        if isdir(join(base_path, d)) and not ("atomate" in d)
    ]

    rxn = self.reactions.find_one({"rxn_id": rxn_id})
    if rxn is None:
        raise RuntimeError(
            "No reaction with id {} found in database.".format(rxn_id))

    # IDs are stringified because they double as directory names
    mol_ids = [str(i) for i in rxn["pro_ids"] + rxn["rct_ids"]]

    for mol_id in mol_ids:
        mol_path = join(base_path, mol_id)
        if mol_id not in mol_dirs:
            os.mkdir(mol_path)
        # NOTE(review): this changes the working directory of the whole
        # process and is never undone. The flattened source does not
        # show whether it belongs inside the "if" above or at loop
        # level - confirm against the original layout.
        os.chdir(mol_path)

        # Search for molecule in previous calculations
        result = self.molecules.find_one({"mol_id": mol_id})
        if result is None:
            # Nothing stored yet: fall back to a .mol file on disk
            mol_files = [
                f for f in listdir(mol_path)
                if isfile(join(mol_path, f)) and f.endswith(".mol")
            ]
            if len(mol_files) == 0:
                raise RuntimeError("Molecule not found in database or file"
                                   " system.")
            elif len(mol_files) > 1:
                print("More than one valid *.mol file available.")
                print("Selecting one for analysis.")
            # The first file in listdir order is used
            mol = get_molecule(join(mol_path, mol_files[0]))
        else:
            # Prefer the optimized geometry; otherwise the initial one
            entry = result["output"].get(
                'optimized_molecule',
                result["output"].get('initial_molecule'))
            mol = Molecule.from_dict(entry)

        fw = FrequencyFlatteningOptimizeFW(
            molecule=mol,
            name=name_pre + "_{}".format(mol_id),
            qchem_cmd=qchem_cmd,
            qchem_input_params=qchem_input_params,
            multimode="openmp",
            max_cores=max_cores,
            directory=mol_path,
            max_iterations=max_iterations,
            max_molecule_perturb_scale=max_perturb_scale,
            db_file=self.db_file)
        fws.append(fw)

    return Workflow(fws)
def copy_outputs_across_directories(self):
    """
    Copy output files between subdirectories to ensure that all reaction
    directories that need outputs of a given molecule will have them.

    For every ``.mol`` file in every subdirectory of self.base_dir
    (directories starting with "block" are ignored):

    1. If the directory already holds an output file whose initial
       molecule has the same species, nothing is done.
    2. Otherwise the other directories are searched, in listing order,
       for ``.mol`` files containing the same molecule ID. Matching
       output files are copied into the current directory with a
       ``_copy`` suffix, and the search stops at the first directory
       that supplied at least one file.

    Progress is reported with two print statements (directory count and
    total number of files copied).

    Note: This function should not be used unless necessary. It was
    written because for each directory, only a single database entry was
    being made (because db entries were being overwritten by default).

    :return: None
    """
    files_copied = 0

    dirs = [
        d for d in listdir(self.base_dir)
        if isdir(join(self.base_dir, d)) and not d.startswith("block")
    ]
    print("Number of directories: {}".format(len(dirs)))

    for start_d in dirs:
        start_p = join(self.base_dir, start_d)
        start_files = [
            f for f in listdir(start_p) if isfile(join(start_p, f))
        ]
        mol_files = [f for f in start_files if f.endswith(".mol")]
        out_files = [f for f in start_files if ".out" in f]

        for mf in mol_files:
            mol_id = extract_id(mf)
            mol_obj = get_molecule(join(start_p, mf))

            # If there is already output, do not copy any files
            is_covered = False
            for out in out_files:
                qcout = QCOutput(join(start_p, out))
                if sorted(qcout.data["initial_molecule"].species) == \
                        sorted(mol_obj.species):
                    is_covered = True
                    break
            if is_covered:
                continue

            for other_d in dirs:
                if other_d == start_d:
                    continue

                other_p = join(self.base_dir, other_d)
                other_files = [
                    f for f in listdir(other_p)
                    if isfile(join(other_p, f))
                ]
                # Check if this id is present
                other_mol_files = [
                    f for f in other_files
                    if f.endswith(".mol") and mol_id in f
                ]
                other_out_files = [f for f in other_files if ".out" in f]

                to_copy = []
                for other_mol in other_mol_files:
                    if other_mol.startswith(self.product_pre):
                        # Single product assumed: take every product
                        # output without inspecting it
                        to_copy = [
                            f for f in other_out_files
                            if f.startswith(self.product_pre)
                        ]
                    elif other_mol.startswith(self.reactant_pre):
                        # Several reactants may exist, so only take the
                        # outputs that belong to this molecule
                        to_check = [
                            f for f in other_out_files
                            if f.startswith(self.reactant_pre)
                        ]
                        to_copy = []
                        for file in to_check:
                            qcout = QCOutput(join(other_p, file))
                            if qcout.data["initial_molecule"].species == \
                                    mol_obj.species:
                                to_copy.append(file)
                    else:
                        to_copy = []

                copied_here = 0
                for file in to_copy:
                    shutil.copyfile(join(other_p, file),
                                    join(start_p, file + "_copy"))
                    copied_here += 1
                files_copied += copied_here

                # Stop once THIS molecule received output. Testing the
                # running grand total instead would mark every later
                # molecule as covered after a single directory.
                if copied_here > 0:
                    break

    print("Number of files copied: {}".format(files_copied))