def generate_freq_input(qoutfile, qinfile, basis_set="6-311++G*",
                        pcm_dielectric=None, overwrite_inputs=None):
    """
    Parse a QChem output file for a structure and write a QChem input file
    for a frequency calculation (to determine enthalpy and entropy).

    The optimized geometry is preferred; when the output holds none, the
    last geometry is used instead. Nothing is returned: the generated input
    is written to ``qinfile``.

    :param qoutfile: Absolute path to the QChem output file (.out)
    :param qinfile: Absolute path to the QChem input file (.in) to write
    :param basis_set: Basis set handed to FreqSet.
    :param pcm_dielectric: Handed to FreqSet unchanged.
    :param overwrite_inputs: Handed to FreqSet unchanged.
    :return: None
    :raises RuntimeError: If the output provides neither an optimized nor a
        last geometry.
    """
    data = QCOutput(qoutfile).data

    # Guard against the key being absent, being None, or holding an empty
    # molecule -- calling len() on a None entry would raise TypeError.
    mol = data.get("molecule_from_optimized_geometry")
    if mol is None or len(mol) == 0:
        mol = data.get("molecule_from_last_geometry")
        if mol is None:
            raise RuntimeError("No molecule to use as input")

    qcinput = FreqSet(mol,
                      basis_set=basis_set,
                      pcm_dielectric=pcm_dielectric,
                      overwrite_inputs=overwrite_inputs)
    qcinput.write_file(qinfile)
def process_qchem_multirun(self, dir_name, input_files, output_files):
    """
    Process a QChem run which is known to include multiple calculations in
    a single input/output pair.

    :param dir_name: Directory containing the input and output files.
    :param input_files: Mapping of task type -> input file name. Must hold
        exactly one entry.
    :param output_files: Mapping of task type -> output file name, looked
        up with the same key as input_files.
    :return: List with one parsed-output dict per calculation, each
        extended with "input" and "task" entries.
    :raises ValueError: If input_files does not hold exactly one entry.
    """
    if len(input_files) != 1:
        raise ValueError(
            "ERROR: The drone can only process a directory containing "
            "a single input/output pair when each include multiple "
            "calculations."
        )

    # Exactly one entry exists, so its key doubles as the task type.
    task_type = next(iter(input_files))
    input_path = os.path.join(dir_name, input_files.get(task_type))
    output_path = os.path.join(dir_name, output_files.get(task_type))

    parsed_outputs = QCOutput.multiple_outputs_from_file(
        QCOutput, output_path, keep_sub_files=False)
    parsed_inputs = QCInput.from_multi_jobs_file(input_path)

    results = []
    for index, parsed_output in enumerate(parsed_outputs):
        doc = parsed_output.data
        doc["input"] = {}
        # Inputs and outputs are paired purely by position in the file.
        job_input = parsed_inputs[index]
        for section in ("molecule", "rem", "opt", "pcm", "solvent", "smx"):
            doc["input"][section] = getattr(job_input, section)
        doc["task"] = {"type": task_type, "name": "calc" + str(index)}
        results.append(doc)
    return results
def generate_single_point_input(qoutfile, qinfile, basis_set="6-311++G*",
                                pcm_dielectric=None, overwrite_inputs=None):
    """
    Parse a QChem output file for a structure and write a QChem input file
    for a single-point calculation.

    The optimized geometry is preferred; when the output holds none, the
    last geometry is used instead. Nothing is returned: the generated input
    is written to ``qinfile``.

    :param qoutfile: Path to the QChem output file to parse.
    :param qinfile: Path of the QChem input file to write.
    :param basis_set: Basis set handed to SinglePointSet.
    :param pcm_dielectric: Handed to SinglePointSet unchanged.
    :param overwrite_inputs: Handed to SinglePointSet unchanged.
    :return: None
    :raises RuntimeError: If the output provides neither an optimized nor a
        last geometry.
    """
    data = QCOutput(qoutfile).data

    # Guard against the key being absent, being None, or holding an empty
    # molecule -- calling len() on a None entry would raise TypeError.
    mol = data.get("molecule_from_optimized_geometry")
    if mol is None or len(mol) == 0:
        mol = data.get("molecule_from_last_geometry")
        if mol is None:
            raise RuntimeError("No molecule to use as input")

    qcinput = SinglePointSet(mol,
                             basis_set=basis_set,
                             pcm_dielectric=pcm_dielectric,
                             overwrite_inputs=overwrite_inputs)
    qcinput.write_file(qinfile)
def associate_qchem_to_mol(base_dir, directory):
    """
    Assign all .in and .out files in a directory to one of the .mol files
    in that directory, based on the non-H atoms in those molecules.

    A QChem file is attached to the first .mol file (in directory-listing
    order) whose ordered list of non-hydrogen species equals its own; files
    matching no .mol file are left out of the mapping. Files whose name
    starts with "atomate" are ignored.

    :param base_dir: Directory that contains ``directory``.
    :param directory: Name of the subdirectory (relative to base_dir) to
        scan.
    :return: dict {mol_file: {"in": [input files], "out": [output files]}}
    """
    base_path = join(base_dir, directory)

    # List the directory once and derive all three file groups from it.
    entries = [f for f in listdir(base_path) if isfile(join(base_path, f))]
    mol_files = [f for f in entries if f.endswith(".mol")]

    # Note: This will catch .in and .out files for incomplete computations
    # TODO: What's the best way to filter these out?
    in_files = [f for f in entries
                if ".in" in f and not f.startswith("atomate")]
    out_files = [f for f in entries
                 if ".out" in f and not f.startswith("atomate")]

    mapping = {mol: {"in": [], "out": []} for mol in mol_files}

    def heavy_species(molecule):
        # Remove H because mol files may not begin with H included
        return [str(s) for s in molecule.species if str(s) != "H"]

    # Each .mol file is parsed at most once, and only when first needed,
    # instead of once per QChem file it is compared against.
    species_by_mol_file = {}

    def mol_file_species(mol_file):
        if mol_file not in species_by_mol_file:
            parsed = Molecule.from_file(join(base_path, mol_file))
            species_by_mol_file[mol_file] = heavy_species(parsed)
        return species_by_mol_file[mol_file]

    def assign(qc_file, qc_molecule, kind):
        target = heavy_species(qc_molecule)
        for mol_file in mol_files:
            # Preserve initial order because that gives a better guarantee
            # that the two are actually associated
            if mol_file_species(mol_file) == target:
                mapping[mol_file][kind].append(qc_file)
                break

    for in_file in in_files:
        qcin = QCInput.from_file(join(base_path, in_file))
        assign(in_file, qcin.molecule, "in")

    for out_file in out_files:
        qcout = QCOutput(join(base_path, out_file))
        assign(out_file, qcout.data["initial_molecule"], "out")

    return mapping
def process_qchemrun(self, dir_name, taskname, input_file, output_file):
    """
    Process a QChem calculation, aka an input/output pair.

    :param dir_name: Directory containing both files.
    :param taskname: Stored as both the "type" and the "name" of the task.
    :param input_file: Name of the QChem input file inside dir_name.
    :param output_file: Name of the QChem output file inside dir_name.
    :return: The parsed output data dict, extended with an "input" entry
        (sections of the parsed input) and a "task" entry.
    """
    input_path = os.path.join(dir_name, input_file)
    output_path = os.path.join(dir_name, output_file)

    doc = QCOutput(output_path).data
    parsed_input = QCInput.from_file(input_path)

    # Copy the input sections into the document under "input".
    doc["input"] = {}
    for section in ("molecule", "rem", "opt", "pcm", "solvent", "smx"):
        doc["input"][section] = getattr(parsed_input, section)

    doc["task"] = {"type": taskname, "name": taskname}
    return doc
def get_completed_molecules(self, dirs=None, extra=False):
    """
    Collect the molecules that have a completed single-point output file.

    Every subdirectory of self.base_dir whose name does not start with
    "block" is scanned. A molecule counts as completed when one of its
    associated output files has "sp" in its name and its parsed data
    reports a truthy "completion" entry.

    :param dirs: Optional collection of directory names; when given, only
        subdirectories named in it are searched.
    :param extra: If True, entries are (mol_id, directory, molfile) tuples
        instead of bare mol_ids.
    :return: set of completed molecules
    """
    completed = set()

    candidate_dirs = []
    for entry in listdir(self.base_dir):
        if isdir(join(self.base_dir, entry)) and not entry.startswith("block"):
            candidate_dirs.append(entry)

    if dirs is not None:
        candidate_dirs = [entry for entry in candidate_dirs if entry in dirs]

    for subdir in candidate_dirs:
        subdir_path = join(self.base_dir, subdir)
        file_map = associate_qchem_to_mol(self.base_dir, subdir)

        for molfile, qcfiles in file_map.items():
            mol_id = extract_id(molfile)
            record = (mol_id, subdir, molfile) if extra else mol_id

            for outfile in qcfiles["out"]:
                if "sp" not in outfile:
                    continue
                sp_data = QCOutput(join(subdir_path, outfile)).data
                # Currently will catch iefpcm or smd
                if sp_data.get("completion", False):
                    completed.add(record)

    return completed
def opt_with_freq_sp(cls,
                     qchem_command,
                     multimode="openmp",
                     input_file="mol.qin",
                     output_file="mol.qout",
                     qclog_file="mol.qclog",
                     sp_params=None,
                     max_cores=64,
                     scratch_dir="/dev/shm/qcscratch/",
                     save_scratch=False,
                     save_name="default_save_name",
                     **QCJob_kwargs):
    """
    Optimize a structure, perform a frequency calculation to determine
    vibrational modes and thermodynamics, and then perform a single-point
    calculation to correct for errors in earlier calculations.

    This is a generator: it yields three QCJob objects (suffixes ".opt",
    ".freq" and ".sp"). After each of the first two jobs it parses that
    job's output and rewrites ``input_file`` for the next job, so each
    yielded job must have run before the generator is advanced.

    :param qchem_command: String describing how to call qchem.
    :param multimode: How to perform multiprocessing. Can be "openmp" or
        "mpi"
    :param input_file: String describing location of QChem input file
    :param output_file: String describing location of QChem output file
    :param qclog_file: String describing location of log file
    :param sp_params: Dict describing the input parameters for
        single-point calculations ("rem", "opt", "pcm", "solvent", "smx").
        If None, the same parameters from opt and freq will be used, with
        job_type set to "sp".
    :param max_cores: For passthrough to QCJob (all three jobs).
    :param scratch_dir: For passthrough to QCJob.
    :param save_scratch: For passthrough to QCJob.
    :param save_name: For passthrough to QCJob.
    :param QCJob_kwargs: Passthrough kwargs to QCJob. See
        :class:`custodian.qchem.new_jobs.QCJob`.
    :return: Generator of QCJob objects.
    :raises AssertionError: If the frequency output reports any errors.
    """

    def make_job(suffix):
        # All three jobs share every setting except the output suffix.
        return QCJob(qchem_command=qchem_command,
                     multimode=multimode,
                     input_file=input_file,
                     output_file=output_file,
                     max_cores=max_cores,
                     qclog_file=qclog_file,
                     suffix=suffix,
                     scratch_dir=scratch_dir,
                     save_scratch=save_scratch,
                     save_name=save_name,
                     **QCJob_kwargs)

    orig_opt_input = QCInput.from_file(input_file)
    orig_freq_rem = copy.deepcopy(orig_opt_input.rem)
    orig_freq_rem["job_type"] = "freq"

    yield make_job(".opt")

    opt_outdata = QCOutput(output_file + ".opt").data
    optimized_molecule = opt_outdata.get("molecule_from_optimized_geometry")

    # The frequency job reuses the original settings on the new geometry.
    freq_input = QCInput(molecule=optimized_molecule,
                         rem=orig_freq_rem,
                         opt=orig_opt_input.opt,
                         pcm=orig_opt_input.pcm,
                         solvent=orig_opt_input.solvent,
                         smx=orig_opt_input.smx)
    freq_input.write_file(input_file)

    yield make_job(".freq")

    outdata = QCOutput(output_file + ".freq").data
    # Truthiness check: a missing/None "errors" entry counts as no errors
    # instead of crashing in len().
    if outdata.get("errors"):
        raise AssertionError(
            'No errors should be encountered while flattening frequencies!'
        )

    if sp_params is not None:
        sp_input = QCInput(molecule=optimized_molecule,
                           rem=sp_params.get("rem", {
                               "method": "m06-2x",
                               "basis": "6-311++g(d,p)"
                           }),
                           opt=sp_params.get("opt", None),
                           pcm=sp_params.get("pcm", None),
                           solvent=sp_params.get("solvent", None),
                           smx=sp_params.get("smx", None))
    else:
        orig_sp_rem = copy.deepcopy(orig_opt_input.rem)
        orig_sp_rem["job_type"] = "sp"
        sp_input = QCInput(molecule=optimized_molecule,
                           rem=orig_sp_rem,
                           opt=orig_opt_input.opt,
                           pcm=orig_opt_input.pcm,
                           solvent=orig_opt_input.solvent,
                           smx=orig_opt_input.smx)
    sp_input.write_file(input_file)

    yield make_job(".sp")
def opt_with_frequency_flattener(cls,
                                 qchem_command,
                                 multimode="openmp",
                                 input_file="mol.qin",
                                 output_file="mol.qout",
                                 qclog_file="mol.qclog",
                                 sp_params=None,
                                 max_iterations=10,
                                 max_molecule_perturb_scale=0.3,
                                 reversed_direction=False,
                                 ignore_connectivity=False,
                                 **QCJob_kwargs):
    """
    Optimize a structure and calculate vibrational frequencies to check if
    the structure is in a true minima. If a frequency is negative,
    iteratively perturb the geometry, optimize, and recalculate frequencies
    until all are positive, aka a true minima has been found. Afterwards a
    single-point job is run on the last optimized geometry.

    This is a generator yielding QCJob objects; it parses each job's output
    and rewrites ``input_file`` between yields, so each yielded job must
    have run before the generator is advanced.

    Args:
        qchem_command (str): Command to run QChem.
        multimode (str): Parallelization scheme, either openmp or mpi.
        input_file (str): Name of the QChem input file.
        output_file (str): Name of the QChem output file
        qclog_file (str): Name of the QChem log file.
        sp_params (dict): Input sections ("rem", "opt", "pcm", "solvent",
            "smx") for the final single-point job. If None, the original
            optimization settings are reused with job_type set to "sp".
        max_iterations (int): Number of perturbation -> optimization ->
            frequency iterations to perform. Defaults to 10.
        max_molecule_perturb_scale (float): The maximum scaled
            perturbation that can be applied to the molecule. Defaults to
            0.3.
        reversed_direction (bool): Whether to reverse the direction of the
            vibrational frequency vectors. Defaults to False.
        ignore_connectivity (bool): Whether to ignore differences in
            connectivity introduced by structural perturbation. Defaults
            to False.
        **QCJob_kwargs: Passthrough kwargs to QCJob. See
            :class:`custodian.qchem.new_jobs.QCJob`.

    Raises:
        AssertionError: If the input file is missing, or a frequency
            output reports errors.
        Exception: If no tried perturbation scale keeps the bonding
            structure intact (and ignore_connectivity is False).
    """

    def make_job(suffix):
        # Every job shares all settings except the output suffix.
        return QCJob(qchem_command=qchem_command,
                     multimode=multimode,
                     input_file=input_file,
                     output_file=output_file,
                     qclog_file=qclog_file,
                     suffix=suffix,
                     **QCJob_kwargs)

    min_molecule_perturb_scale = 0.1
    scale_grid = 10
    perturb_scale_grid = (
        max_molecule_perturb_scale - min_molecule_perturb_scale) / scale_grid
    msc = MoleculeStructureComparator()

    if not os.path.exists(input_file):
        raise AssertionError('Input file must be present!')
    orig_opt_input = QCInput.from_file(input_file)
    orig_opt_rem = copy.deepcopy(orig_opt_input.rem)
    orig_freq_rem = copy.deepcopy(orig_opt_input.rem)
    orig_freq_rem["job_type"] = "freq"

    for ii in range(max_iterations):
        yield make_job(".opt_" + str(ii))

        opt_outdata = QCOutput(output_file + ".opt_" + str(ii)).data
        freq_QCInput = QCInput(
            molecule=opt_outdata.get("molecule_from_optimized_geometry"),
            rem=orig_freq_rem,
            opt=orig_opt_input.opt,
            pcm=orig_opt_input.pcm,
            solvent=orig_opt_input.solvent,
            smx=orig_opt_input.smx)
        freq_QCInput.write_file(input_file)

        yield make_job(".freq_" + str(ii))

        outdata = QCOutput(output_file + ".freq_" + str(ii)).data
        # Truthiness check: a missing/None "errors" entry counts as no
        # errors instead of crashing in len().
        if outdata.get("errors"):
            raise AssertionError(
                'No errors should be encountered while flattening frequencies!'
            )

        # Only the first frequency is inspected -- presumably the parser
        # lists frequencies in ascending order (TODO confirm).
        if outdata.get('frequencies')[0] > 0.0:
            print("All frequencies positive!")
            break

        negative_freq_vecs = outdata.get("frequency_mode_vectors")[0]
        old_coords = outdata.get("initial_geometry")
        old_molecule = outdata.get("initial_molecule")
        structure_successfully_perturbed = False

        # Try the largest perturbation first, then progressively smaller
        # ones, until one leaves the connectivity unchanged.
        for molecule_perturb_scale in np.arange(
                max_molecule_perturb_scale, min_molecule_perturb_scale,
                -perturb_scale_grid):
            new_coords = perturb_coordinates(
                old_coords=old_coords,
                negative_freq_vecs=negative_freq_vecs,
                molecule_perturb_scale=molecule_perturb_scale,
                reversed_direction=reversed_direction)
            new_molecule = Molecule(
                species=outdata.get('species'),
                coords=new_coords,
                charge=outdata.get('charge'),
                spin_multiplicity=outdata.get('multiplicity'))
            if msc.are_equal(old_molecule,
                             new_molecule) or ignore_connectivity:
                structure_successfully_perturbed = True
                break

        if not structure_successfully_perturbed:
            raise Exception(
                "Unable to perturb coordinates to remove negative frequency without changing the bonding structure"
            )

        # smx is carried over as well, so a re-optimization uses the same
        # solvation settings as the first optimization.
        new_opt_QCInput = QCInput(molecule=new_molecule,
                                  rem=orig_opt_rem,
                                  opt=orig_opt_input.opt,
                                  pcm=orig_opt_input.pcm,
                                  solvent=orig_opt_input.solvent,
                                  smx=orig_opt_input.smx)
        new_opt_QCInput.write_file(input_file)

    # NOTE(review): if max_iterations runs out before the first frequency
    # turns positive, execution still reaches this point and the
    # single-point input replaces the re-optimization input written above.
    if sp_params is not None:
        sp_input = QCInput(
            molecule=opt_outdata.get("molecule_from_optimized_geometry"),
            rem=sp_params.get("rem", {
                "method": "wb97x-d",
                "basis": "6-311++g(d,p)"
            }),
            opt=sp_params.get("opt", None),
            pcm=sp_params.get("pcm", None),
            solvent=sp_params.get("solvent", None),
            smx=sp_params.get("smx", None))
    else:
        orig_sp_rem = copy.deepcopy(orig_opt_input.rem)
        orig_sp_rem["job_type"] = "sp"
        sp_input = QCInput(
            molecule=opt_outdata.get("molecule_from_optimized_geometry"),
            rem=orig_sp_rem,
            opt=orig_opt_input.opt,
            pcm=orig_opt_input.pcm,
            solvent=orig_opt_input.solvent,
            smx=orig_opt_input.smx)
    sp_input.write_file(input_file)

    yield make_job(".sp")
def extract_reaction_thermo_files(self, path):
    """
    Naively scrape thermo data from QChem output files.

    Molecules are discovered from the .mol files in the subdirectory whose
    names start with self.reactant_pre / self.product_pre. For each
    molecule, output files are classified by substring in their name
    ("freq", then "opt", then "sp"). The single-point energy is used when
    it is non-zero; otherwise the optimization energy is used.

    :param path: Path to a subdirectory (relative to self.base_dir).
    :return: dict with keys "thermo" (enthalpy, entropy, t_critical,
        has_sp), "directory", "reactant_ids" and "product_ids".
        t_critical is None when the entropy change is zero.
    """
    base_path = join(self.base_dir, path)
    # List the directory once instead of once per id per comprehension.
    filenames = listdir(base_path)

    rct_ids = [
        extract_id(f) for f in filenames
        if f.endswith(".mol") and f.startswith(self.reactant_pre)
    ]
    pro_ids = [
        extract_id(f) for f in filenames
        if f.endswith(".mol") and f.startswith(self.product_pre)
    ]

    # NOTE(review): "_copy" outputs are excluded for reactants only, not
    # for products -- confirm whether that asymmetry is intentional.
    rct_map = {
        m: [
            f for f in filenames
            if f.startswith(self.reactant_pre) and m in f and ".out" in f
            and not f.endswith("_copy")
        ]
        for m in rct_ids
    }
    pro_map = {
        m: [
            f for f in filenames
            if f.startswith(self.product_pre) and m in f and ".out" in f
        ]
        for m in pro_ids
    }

    def sum_thermo(file_map, prefix):
        # Accumulate enthalpy, entropy and energy over one side of the
        # reaction.
        totals = {"enthalpy": 0, "entropy": 0, "energy": 0, "has_sp": {}}
        for mol, out_names in file_map.items():
            enthalpy = 0
            entropy = 0
            energy_opt = 0
            energy_sp = 0

            for out in out_names:
                qcout = QCOutput(join(base_path, out))
                # "or 0" catches potential for Nonetype entries
                if "freq" in out:
                    enthalpy = qcout.data.get("enthalpy", 0) or 0
                    entropy = qcout.data.get("entropy", 0) or 0
                elif "opt" in out:
                    energy_opt = qcout.data.get("final_energy", 0) or 0
                elif "sp" in out:
                    energy_sp = qcout.data.get("final_energy_sp", 0) or 0

            # Enthalpy calculation should actually be enthalpy - energy_sp
            # But currently, not all calculations have sp.
            # Compare the value itself: truncating with int() would treat
            # any energy with magnitude below 1 as "no single point".
            has_sp = energy_sp != 0
            totals["energy"] += energy_sp if has_sp else energy_opt
            totals["has_sp"][prefix + str(mol)] = has_sp
            totals["enthalpy"] += enthalpy
            totals["entropy"] += entropy
            print(path, mol, enthalpy, energy_sp)
        return totals

    rct_thermo = sum_thermo(rct_map, self.reactant_pre)
    pro_thermo = sum_thermo(pro_map, self.product_pre)

    thermo_data = {}

    # Generate totals as ∆H = H_pro - H_rct, ∆S = S_pro - S_rct
    # Also ensures that units are appropriate (Joules/mol,
    # rather than cal/mol or kcal/mol, or hartree for energy)
    energy = (pro_thermo["energy"] - rct_thermo["energy"]) * 627.509
    enthalpy = (pro_thermo["enthalpy"] - rct_thermo["enthalpy"])
    print(path, energy, enthalpy)
    thermo_data["enthalpy"] = (energy + enthalpy) * 1000 * 4.184
    thermo_data["entropy"] = (
        pro_thermo["entropy"] - rct_thermo["entropy"]) * 4.184
    try:
        thermo_data["t_critical"] = (
            thermo_data["enthalpy"] / thermo_data["entropy"])
    except ZeroDivisionError:
        thermo_data["t_critical"] = None

    # Combine dicts from pro_thermo and rct_thermo
    thermo_data["has_sp"] = {
        **pro_thermo["has_sp"],
        **rct_thermo["has_sp"]
    }

    return {
        "thermo": thermo_data,
        "directory": path,
        "reactant_ids": rct_ids,
        "product_ids": pro_ids
    }
def copy_outputs_across_directories(self):
    """
    Copy output files between subdirectories to ensure that all reaction
    directories that need outputs of a given molecule will have them.

    For every .mol file that has no output file with a matching set of
    species in its own directory, the other directories are searched in
    turn for a .mol file carrying the same id; the outputs selected there
    are copied over with a "_copy" suffix. The search for a molecule stops
    at the first directory that yields at least one copied file.

    Note: This function should not be used unless necessary. It was
    written because for each directory, only a single database entry was
    being made (because db entries were being overwritten by default).

    :return: None
    """
    files_copied = 0

    dirs = [
        d for d in listdir(self.base_dir)
        if isdir(join(self.base_dir, d)) and not d.startswith("block")
    ]
    print("Number of directories: {}".format(len(dirs)))

    for start_d in dirs:
        start_p = join(self.base_dir, start_d)
        mol_files = [
            f for f in listdir(start_p)
            if isfile(join(start_p, f)) and f.endswith(".mol")
        ]
        out_files = [
            f for f in listdir(start_p)
            if isfile(join(start_p, f)) and ".out" in f
        ]

        for mf in mol_files:
            mol_id = extract_id(mf)
            mol_obj = get_molecule(join(start_p, mf))

            # If there is already output, do not copy any files
            is_covered = False
            for out in out_files:
                qcout = QCOutput(join(start_p, out))
                if sorted(qcout.data["initial_molecule"].species) == sorted(
                        mol_obj.species):
                    is_covered = True
                    break
            if is_covered:
                continue

            for other_d in dirs:
                if other_d == start_d:
                    continue

                other_p = join(self.base_dir, other_d)
                # Check if this id is present
                other_mol_files = [
                    f for f in listdir(other_p)
                    if isfile(join(other_p, f)) and f.endswith(".mol")
                    and mol_id in f
                ]
                other_out_files = [
                    f for f in listdir(other_p)
                    if isfile(join(other_p, f)) and ".out" in f
                ]

                to_copy = []
                for other_mol in other_mol_files:
                    if other_mol.startswith(self.product_pre):
                        to_copy = [
                            f for f in other_out_files
                            if f.startswith(self.product_pre)
                        ]
                    elif other_mol.startswith(self.reactant_pre):
                        # Several reactants may share a directory, so only
                        # outputs with the same species are taken.
                        to_check = [
                            f for f in other_out_files
                            if f.startswith(self.reactant_pre)
                        ]
                        to_copy = []
                        for candidate in to_check:
                            qcout = QCOutput(join(other_p, candidate))
                            if qcout.data[
                                    "initial_molecule"].species == mol_obj.species:
                                to_copy.append(candidate)
                    else:
                        to_copy = []

                for name in to_copy:
                    shutil.copyfile(join(other_p, name),
                                    join(start_p, name + "_copy"))
                    files_copied += 1

                # Decide coverage from what was copied for THIS molecule.
                # The running total must not be used here: it stays
                # positive after the first copy and would cut short the
                # search for every later molecule.
                if to_copy:
                    break

    print("Number of files copied: {}".format(files_copied))
def get_unfinished_jobs(self, sp_params, name_pre="single_point",
                        dirs=None, max_cores=24, qchem_cmd="qchem -slurm"):
    """
    Look for jobs where optimization and frequency calculations have
    successfully completed, but single-point has not. Then, for these
    cases, construct a workflow which will only run the sp job.

    A molecule qualifies when one of its "freq" output files reports
    completion, none of its "sp" output files does, and a "freq" input
    file exists from which the molecule can be read. Each molecule id gets
    at most one job, even if it appears in several directories.

    :param sp_params: dict containing input parameters for single-point
        job
    :param name_pre: str representing prefix for all jobs.
    :param dirs: list of subdirectories to check for unfinished jobs.
        Default None, meaning that all subdirectories will be checked.
    :param max_cores: max_cores (int): Maximum number of cores to
        parallelize over. Defaults to 24.
    :param qchem_cmd: Command handed to each SinglePointFW. Defaults to
        "qchem -slurm".
    :return: Workflow holding one SinglePointFW per unfinished molecule.
    :raises RuntimeError: If self.subdirs is falsy.
    """
    if not self.subdirs:
        raise RuntimeError("Cannot run get_unfinished_jobs(); "
                           "Need reactions components to be isolated in "
                           "different subdirectories.")

    fws = []
    # A set gives constant-time "already handled" checks.
    molecules_cleared = set()

    appropriate_dirs = [
        d for d in listdir(self.base_dir) if isdir(join(self.base_dir, d))
    ]
    if dirs is not None:
        appropriate_dirs = [d for d in appropriate_dirs if d in dirs]

    for d in appropriate_dirs:
        path = join(self.base_dir, d)
        file_map = associate_qchem_to_mol(self.base_dir, d)

        for key, values in file_map.items():
            mol_id = extract_id(key)
            if mol_id in molecules_cleared:
                continue

            # Check if this molecule has finished freq, sp
            # If there is no sp output file, or if the sp output file did
            # not complete, then we may proceed
            freq_complete = False
            sp_complete = False
            for out_file in values["out"]:
                if "freq" in out_file:
                    freq_out = QCOutput(join(path, out_file))
                    if freq_out.data.get("completion", []):
                        freq_complete = True
                elif "sp" in out_file:
                    sp_out = QCOutput(join(path, out_file))
                    if sp_out.data.get("completion", []):
                        sp_complete = True

            if not freq_complete or sp_complete:
                continue

            # The last "freq" input file found provides the molecule.
            freq_in_name = None
            for in_file in values["in"]:
                if "freq" in in_file:
                    freq_in_name = in_file

            if freq_in_name is None:
                # We could parse output files to get previous input
                # information, but we should try to keep all input
                # files in the same directory
                continue

            stem = key.replace(".mol", "")
            infile = join(path, stem + ".in")
            outfile = join(path, stem + ".out")
            qclogfile = join(path, stem + ".qclog")

            freq_input = QCInput.from_file(join(path, freq_in_name))
            mol = freq_input.molecule

            fw = SinglePointFW(molecule=mol,
                               name="{}: {}/{}".format(name_pre, d, mol_id),
                               qchem_cmd=qchem_cmd,
                               multimode="openmp",
                               input_file=infile,
                               output_file=outfile,
                               qclog_file=qclogfile,
                               max_cores=max_cores,
                               sp_params=sp_params)
            fws.append(fw)
            molecules_cleared.add(mol_id)

    return Workflow(fws)