def main(argv=None):
    """
    Run the plot_steps script: read the input list file and save a plot built from its rows.

    :param argv: command-line arguments, handed unchanged to parse_cmdline
    :return: the code returned by parse_cmdline when it is not GOOD_RET (or when no args were
             produced); IO_ERROR or INVALID_DATA when the matching exception is caught;
             otherwise GOOD_RET
    """
    print(f"Running GaussianWrangler script plot_steps version {__version__}")

    # Read input
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    try:
        # Split every line of the list file on whitespace; lines that yield no fields are dropped
        with open(args.list) as list_file:
            split_rows = (raw_row.strip().split() for raw_row in list_file)
            row_list = [fields for fields in split_rows if fields]

        # An explicitly requested output name takes precedence over the list file's own name
        name_source = args.output_fname if args.output_fname else args.list
        plot_fname = create_out_fname(name_source, base_dir=args.out_dir, ext='.png')

        plot_delta_g(plot_fname, args.temp, row_list, args.conv, args.fig_width, args.fig_height,
                     args.y_axis_label)
        print("Wrote file: {}".format(plot_fname))
    except IOError as e:
        warning("Problems reading file:", e)
        return IO_ERROR
    except InvalidDataError as e:
        warning("Problems reading data:", e)
        return INVALID_DATA

    return GOOD_RET  # success
def create_dynamics_plots(add_rate_str, bond_types, cfg, num_monos, num_oligs, sg_ratio):
    """
    Save two png plots versus time step: monomer/oligomer counts, and bond-type counts.

    When cfg[NUM_REPEATS] > 1, every series is cut to the length of the shortest entry in
    num_monos, and the mean and standard deviation across repeats are plotted. Otherwise the first
    (only) run is plotted and no standard deviations are supplied.

    :param add_rate_str: str, used in the plot title and in both file names
    :param bond_types: dict keyed by the entries of BOND_TYPE_LIST
    :param cfg: dict; NUM_REPEATS and OUT_DIR are read here
    :param num_monos: list with one monomer-count sequence per repeat
    :param num_oligs: list with one oligomer-count sequence per repeat
    :param sg_ratio: number, used in the plot title and (formatted to 3 significant figures)
                     in both file names
    :return: n/a, plots are saved by plot_bond_error_bars
    """

    def _mean_and_std(runs, length):
        # stack the runs (each cut to the common length), then reduce across the repeat axis
        stacked = np.asarray([np.array(run[:length]) for run in runs])
        return np.mean(stacked, axis=0), np.std(stacked, axis=0)

    # Starting with num mon & olig vs timestep:
    len_y_val_key_list = [MONOMERS, OLIGOMERS]
    min_len = len(num_monos[0])
    avg_bond_types = {}
    std_bond_types = {}

    if cfg[NUM_REPEATS] > 1:
        # Repeats may have run for different numbers of steps, so use the shortest one
        min_len = min([min_len] + [len(mono_list) for mono_list in num_monos[1:]])

        av_num_monos, std_num_monos = _mean_and_std(num_monos, min_len)
        av_num_oligs, std_num_oligs = _mean_and_std(num_oligs, min_len)
        len_y_axis_val_dicts = {MONOMERS: av_num_monos, OLIGOMERS: av_num_oligs}
        len_y_axis_std_dev_dicts = {MONOMERS: std_num_monos, OLIGOMERS: std_num_oligs}

        for bond_type in BOND_TYPE_LIST:
            bond_avg, bond_std = _mean_and_std(bond_types[bond_type], min_len)
            avg_bond_types[bond_type] = bond_avg
            std_bond_types[bond_type] = bond_std
    else:
        len_y_axis_val_dicts = {MONOMERS: num_monos[0], OLIGOMERS: num_oligs[0]}
        len_y_axis_std_dev_dicts = {MONOMERS: None, OLIGOMERS: None}
        for bond_type in BOND_TYPE_LIST:
            avg_bond_types[bond_type] = bond_types[bond_type]
            std_bond_types[bond_type] = None

    timesteps = list(range(min_len))
    title = f"S:G Ratio {sg_ratio}, Add rate {add_rate_str} monomer/s"
    sg_str = f'{sg_ratio:.{3}g}'.replace("+", "").replace(".", "-")
    x_axis_label = 'Time step'

    mono_olig_fname = create_out_fname(f'mono_olig_v_step_{sg_str}_{add_rate_str}', base_dir=cfg[OUT_DIR],
                                       ext='.png')
    plot_bond_error_bars(timesteps, len_y_axis_val_dicts, len_y_axis_std_dev_dicts, len_y_val_key_list,
                         x_axis_label, 'Number', title, mono_olig_fname)

    bond_dist_fname = create_out_fname(f'bond_dist_v_step_{sg_str}_{add_rate_str}', base_dir=cfg[OUT_DIR],
                                       ext='.png')
    plot_bond_error_bars(timesteps, avg_bond_types, std_bond_types, BOND_TYPE_LIST,
                         x_axis_label, 'Number of Bonds', title, bond_dist_fname)
def create_convergence_plots(out_fname, step_list):
    """
    Save a png with one stacked subplot per convergence quantity, each plotted against step number.

    :param out_fname: name of the base csv file; the png name is derived from it
    :param step_list: list of dicts; each must hold STEP_NUM plus every plotted quantity
    :return: n/a, saves a file and prints its relative path
    """
    png_out = create_out_fname(out_fname, prefix='', ext='.png')
    png_titles = [CONVERG, ENERGY, MAX_FORCE, RMS_FORCE, MAX_DISPL, RMS_DISPL]

    # Gather the x values once, then one y series per plotted quantity
    steps = [s_dict[STEP_NUM] for s_dict in step_list]
    png_lists = [[s_dict[quantity] for s_dict in step_list] for quantity in png_titles]

    _fig, axs = plt.subplots(len(png_titles), figsize=(7, 11.5))
    for ax, quantity, values in zip(axs, png_titles, png_lists):
        ax.plot(steps, values)
        ax.set_title(quantity)

    plt.subplots_adjust(hspace=0.4)
    plt.xlabel("Step number")
    plt.savefig(png_out, transparent=True, bbox_inches='tight')
    plt.close()
    print(f"Wrote file: {os.path.relpath(png_out)}")
def process_gausscom_files(cfg, pdb_tpl_content):
    """
    Call process_gausslog_file for every file in cfg[GAUSSLOG_FILES], choosing the output pdb name.

    With cfg[COMBINE_LOGS] set, a single output name is built from cfg[OUTFILE_NAME] and any
    existing file of that name is removed first. Otherwise each input gets its own output name,
    built from cfg[OUTFILE_NAME] when that is set and from the input file name when it is not.

    :param cfg: dict of configuration values
    :param pdb_tpl_content: dict; when no pdb template file is configured, its SEC_HEAD and
                            SEC_TAIL entries are overwritten for each input file
    :return: n/a
    """
    f_name = ''
    if cfg[COMBINE_LOGS]:
        f_name = create_out_fname(cfg[OUTFILE_NAME], ext='.pdb', base_dir=cfg[OUT_BASE_DIR])
        silent_remove(f_name)

    for gausslog_file in cfg[GAUSSLOG_FILES]:
        # Without a template there is no head/tail to reuse, so make minimal ones
        if not cfg[PDB_TPL_FILE]:
            pdb_tpl_content[SEC_HEAD] = ["TITLE {}".format(gausslog_file)]
            pdb_tpl_content[SEC_TAIL] = ["END"]

        if not cfg[COMBINE_LOGS]:
            out_name = cfg[OUTFILE_NAME] or gausslog_file
            f_name = create_out_fname(out_name, ext='.pdb', base_dir=cfg[OUT_BASE_DIR])

        process_gausslog_file(cfg, gausslog_file, pdb_tpl_content, f_name)
def process_pdb_file(cfg, gau_tpl_content, pdb_file):
    """
    Read a pdb file and write one '.com' file each time an END record is reached.

    Atom records collected since the previous END are placed between the template's SEC_HEAD and
    SEC_TAIL. MODEL records are counted; when at least one has been seen, the count is added to
    the output name as a suffix.

    :param cfg: dict; REMOVE_H (drop the last collected atom) and NUM (stop after this many
                models) are read here
    :param gau_tpl_content: dict with SEC_HEAD and SEC_TAIL lists
    :param pdb_file: str, location of the pdb file to read
    :return: n/a, writes files
    """
    atom_sections = ('ATOM ', 'HETATM')
    with open(pdb_file) as pdb_in:
        mol_num = 0
        pdb_atom_line = []
        for line in pdb_in:
            pdb_section = line[:PDB_LINE_TYPE_LAST_CHAR]

            if pdb_section == 'MODEL ':
                mol_num += 1
                continue

            if pdb_section in atom_sections:
                element = line[PDB_BEFORE_ELE_LAST_CHAR:PDB_ELE_LAST_CHAR].strip()
                if not element:
                    # no element column content; fall back to the atom type column
                    element = line[PDB_ATOM_NUM_LAST_CHAR:PDB_ATOM_TYPE_LAST_CHAR].strip()
                pdb_xyz = line[PDB_MOL_NUM_LAST_CHAR:PDB_Z_LAST_CHAR]
                pdb_atom_line.append(["{:6}".format(element), pdb_xyz])
                continue

            if pdb_section == 'END\n':
                mol_id = '' if mol_num == 0 else '_' + str(mol_num)
                d_out = create_out_fname(pdb_file, suffix=mol_id, ext='.com')
                if cfg[REMOVE_H]:
                    del pdb_atom_line[-1]
                list_to_file(gau_tpl_content[SEC_HEAD] + pdb_atom_line + gau_tpl_content[SEC_TAIL], d_out)
                if cfg[NUM] and mol_num >= cfg[NUM]:
                    return
                # start collecting atoms for the next molecule
                pdb_atom_line = []
def plot_mz_v_intensity(fname, data_array_dict, num_decimals_ms_accuracy, out_dir):
    """
    Plot m/z versus intensity for every entry in data_array_dict and save the plot as a png.

    :param fname: str, name of the file where the data originated; also the basis for the plot name
    :param data_array_dict: dict, label: ndarray whose column 0 is read as the x values (M/Z) and
                            column 1 as the y values (intensity)
    :param num_decimals_ms_accuracy: int, passed on to make_vlines_plot
    :param out_dir: None or str, passed to create_out_fname as the base directory
    :return: n/a, saves a file through make_vlines_plot
    """
    labels = list(data_array_dict.keys())
    first_label = labels[0]
    lower_fname = fname.lower()

    if isinstance(first_label, str):
        level = first_label if "ms" in first_label else f"ms{first_label}"
    else:
        # assumes numeric if the MS2, and the level is the ionization energy; only include the level if single level
        level = "ms2"
        if len(labels) == 1:
            ion_energy = f"hcd{first_label}"
            if ion_energy not in lower_fname:
                if level in lower_fname:
                    level = ion_energy
                else:
                    level += "_" + ion_energy

    title = f"M/Z versus Intensity from {level.upper()} Data"

    # Only add the level to the file name when it is not already part of it
    suffix = "_mz_v_int"
    if level not in lower_fname:
        suffix = "_" + level + suffix
    plot_fname = create_out_fname(fname, suffix=suffix, ext='png', base_dir=out_dir)

    default_x_max = 1000.
    data_max_x = 0.
    data_max_y = 0.
    for data_array in data_array_dict.values():
        data_max_x = max(data_max_x, np.max(data_array[:, 0]))
        data_max_y = max(data_max_y, np.max(data_array[:, 1]))

    if data_max_x > default_x_max:
        x_value_warning(data_max_x, default_x_max)
    y_max = find_pos_plot_limit(data_max_y)

    make_vlines_plot(title, "M/Z Values", "Intensity (unscaled)", data_array_dict, plot_fname,
                     num_decimals_ms_accuracy, default_x_max, y_max)
def create_coms_from_mol_list(conformer_list, gau_tpl_content, base_out_name, max_num_coms, print_original):
    """
    From a list of RDKit mol objects, write Gaussian '.com' files in order of increasing energy,
    optionally for only the specified number of objects.

    Conformers whose energy is close (np.isclose) to the last one written are still written after
    max_num_coms has been reached, so ties are not cut off arbitrarily.

    :param conformer_list: sequence of RDKit mol objects; the first entry is treated as the
                           original conformation
    :param gau_tpl_content: template content handed to create_com_from_pdb_str
    :param base_out_name: str, base for the output names; '_<number>' and '.com' are added
    :param max_num_coms: int or infinity, the number of files to write (not counting energy ties)
    :param print_original: Boolean, whether to also consider the initial conformation
    :return: n/a, writes files and prints a summary
    """
    # Only these molecules are scored, so only these may be paired with the energies below.
    # Zipping the energies against the full conformer_list shifted every pairing by one
    # whenever the original conformation was skipped.
    candidate_mols = conformer_list if print_original else conformer_list[1:]

    RDLogger.DisableLog('rdApp.*')
    energy_list = []
    for current_mol in candidate_mols:
        opt_results = MMFFOptimizeMoleculeConfs(current_mol, maxIters=0)
        energy_list.append(opt_results[0][1])

    # Sort on energy only; mol objects are never compared
    zipped_sorted = sorted(zip(energy_list, candidate_mols), key=itemgetter(0))

    mol_num = 0
    last_energy = np.nan
    print_note = False
    com_fname = None
    for energy, current_mol in zipped_sorted:
        if mol_num >= max_num_coms:
            # past the requested number: keep going only while the energy ties the last one written
            if np.isclose(energy, last_energy):
                print_note = True
            else:
                break
        mol_num += 1
        last_energy = energy
        com_fname = create_out_fname(base_out_name, suffix=f"_{mol_num}", ext=".com", rel_path=True)
        pdb_str = MolToPDBBlock(current_mol)
        create_com_from_pdb_str(pdb_str, gau_tpl_content, com_fname)
        print(f"{int(energy):12,} {com_fname}")

    if com_fname:
        print(f"Wrote {mol_num} files, ending with: {os.path.relpath(com_fname)}")
    else:
        print("No output created from rotating dihedrals.")
    if print_note:
        print(f"More than {max_num_coms} conformations were output to ties calculated energies.")
def create_bond_v_sg_plots(add_rate_str, cfg, sg_adjs):
    """
    Save a png of bond-type yield (with error bars) versus SG ratio.

    :param add_rate_str: str, used in the plot title and the file name
    :param cfg: dict; SG_RATIOS, NUM_REPEATS, BREAK_CO, and OUT_DIR are read here
    :param sg_adjs: passed unchanged to get_avg_percent_bonds
    :return: n/a, the plot is saved by plot_bond_error_bars
    """
    sg_ratios = cfg[SG_RATIOS]
    all_avg_bonds, all_std_bonds = get_avg_percent_bonds(BOND_TYPE_LIST, len(sg_ratios), sg_adjs,
                                                         cfg[NUM_REPEATS], cfg[BREAK_CO])

    plot_fname = create_out_fname(f'bond_dist_v_sg_{add_rate_str}', base_dir=cfg[OUT_DIR], ext='.png')
    plot_bond_error_bars(cfg[SG_RATIOS], all_avg_bonds, all_std_bonds, BOND_TYPE_LIST,
                         'SG Ratio', 'Bond Type Yield (%)', f"Add rate {add_rate_str} monomer/s",
                         plot_fname)
def setup_and_submit(cfg, current_job_list, tpl_dict, testing_mode, chk_warn):
    """
    Create the ini and slurm files for a job and, unless cfg[NO_SUBMIT] is set, submit it.

    :param cfg: dict of configuration values
    :param current_job_list: list of str job names; a leading '' entry adds nothing to the
                             job description
    :param tpl_dict: dict; JOB_NAME is read and JOB_DESCRIP is set here
    :param testing_mode: when truthy, 'sbatch' is not called; an explanatory message is echoed instead
    :param chk_warn: passed to create_sbatch_dict as ignore_chk_warning
    :return: n/a
    """
    if current_job_list[0] == '':
        named_jobs = current_job_list[1:]
    else:
        named_jobs = current_job_list
    suffix = '_' + '_'.join(named_jobs) if named_jobs else ''

    tpl_dict[JOB_DESCRIP] = tpl_dict[JOB_NAME] + suffix
    new_ini_fname = create_out_fname(tpl_dict[JOB_DESCRIP], ext='.ini', base_dir=cfg[OUT_DIR])
    new_sbatch_fname = create_out_fname(tpl_dict[JOB_DESCRIP], ext='.slurm', base_dir=cfg[OUT_DIR])

    sbatch_dict = create_sbatch_dict(cfg, tpl_dict, os.path.relpath(new_ini_fname), current_job_list,
                                     start_from_job_name_chk=cfg[START_FROM_SAME_CHK],
                                     ignore_chk_warning=chk_warn)
    tpl_str = read_tpl(cfg[SBATCH_TPL])
    fill_save_tpl(tpl_str, sbatch_dict, cfg[SBATCH_TPL], new_sbatch_fname)

    # read ini_tpl and check if it has fields for submitting spawned jobs, if needed
    create_ini_with_req_keys(current_job_list, cfg[TPL_DICT], cfg, new_ini_fname)

    if cfg[NO_SUBMIT]:
        return

    # Do not want to actually (attempt to) submit a job during testing; this way, do not have to
    # specify both testing mode and NO_SUBMIT
    if testing_mode:
        command = ["echo", "Running in testing mode: " "'sbatch' not called"]
    else:
        # Will not be covered in testing mode, as is not part of written code to be tested
        command = ["sbatch", new_sbatch_fname]
    sbatch_result = subprocess.check_output(command).decode("utf-8").strip()
    print(sbatch_result)
def process_smiles(gau_tpl_fname, smi_list, max_num_confs, out_dir):
    """
    Write Gaussian input files for conformers generated from each SMILES string provided
    https://www.rdkit.org/docs/GettingStartedInPython.html

    :param gau_tpl_fname: str, the location of the template file used to create the input files
    :param smi_list: list of SMILES strings; strings RDKit cannot parse are skipped with a warning
    :param max_num_confs: int, passed to gen_conformers as num_confs
    :param out_dir: str or None, passed to create_out_fname as the base directory
    :return: N/A, writes files and prints notes on files created
    :raises InvalidDataError: if the template does not contain REQ_STR
    """
    gau_tpl_str = read_tpl(gau_tpl_fname)
    if REQ_STR not in gau_tpl_str:
        raise InvalidDataError(f"Did not find the required string '{REQ_STR}' in the provided Gaussian input "
                               f"template file.")

    for smi in smi_list:
        parsed_mol = Chem.MolFromSmiles(smi)
        if parsed_mol is None:
            warning(f"Skipping SMILES input string '{smi}' due to error\n")
            continue

        Chem.Kekulize(parsed_mol)
        mol_with_h = AddHs(parsed_mol)
        confs = gen_conformers(mol_with_h, num_confs=max_num_confs)
        base_fname = create_out_fname(get_mol_name(smi), ext='com', base_dir=out_dir, rel_path=True)

        # stays at -1 when no conformers are returned, so the count reported below is 0
        last_conf_id = -1
        for last_conf_id in confs:
            com_fname = create_out_fname(base_fname, suffix=f'_{last_conf_id}')
            coord_list = get_pdb_coord_list(MolToPDBBlock(mol_with_h, confId=last_conf_id))
            fill_save_tpl(gau_tpl_str, {ATOMS: "\n".join(coord_list)}, gau_tpl_fname, com_fname,
                          print_info=False)
        print(f"Wrote {last_conf_id + 1} files with base name '{base_fname}'")
def produce_output(adj_matrix, mono_list, cfg):
    """
    Write or print the requested representations of the molecule described by adj_matrix and mono_list.

    Unless SMILES output is suppressed and none of JSON/PNG/SVG is requested, an RDKit molecule is
    built and its SMILES string is either saved (cfg[SAVE_SMI]) or printed. Each other format whose
    cfg flag is set is then written to a file named from cfg[BASENAME].

    :param adj_matrix: passed to generate_mol and gen_tcl
    :param mono_list: passed to generate_mol and gen_tcl
    :param cfg: dict of configuration values, including the SAVE_* format flags
    :return: n/a, writes files and/or prints
    :raises InvalidDataError: if RDKit cannot produce a SMILES string for the molecule
    """
    if cfg[SUPPRESS_SMI] and not (cfg[SAVE_JSON] or cfg[SAVE_PNG] or cfg[SAVE_SVG]):
        format_list = [SAVE_TCL]
        mol = None  # Make IDE happy
    else:
        # Default out is SMILES, which requires getting an rdKit molecule object; also required for
        # everything except the TCL format
        format_list = [SAVE_TCL, SAVE_JSON, SAVE_PNG, SAVE_SVG]
        block = generate_mol(adj_matrix, mono_list)
        mol = MolFromMolBlock(block)
        try:
            smiles = MolToSmiles(mol)
        except Exception as e:
            # Catch Exception rather than everything, so KeyboardInterrupt/SystemExit still
            # propagate, and keep the underlying error attached as the cause
            raise InvalidDataError("Error in producing SMILES string.") from e

        # if SMI is to be saved, don't output to stdout
        if cfg[SAVE_SMI]:
            fname = create_out_fname(cfg[BASENAME], base_dir=cfg[OUT_DIR], ext=SAVE_SMI)
            str_to_file(smiles + '\n', fname, print_info=True)
        else:
            print("\nSMILES representation: \n", smiles, "\n")

        if cfg[SAVE_PNG] or cfg[SAVE_SVG] or cfg[SAVE_JSON]:
            # PNG and SVG make 2D images and thus need coordinates
            # JSON will save coordinates--zero's if not computed; might as well compute and save non-zero values
            Compute2DCoords(mol)

    for save_format in format_list:
        if not cfg[save_format]:
            continue
        fname = create_out_fname(cfg[BASENAME], base_dir=cfg[OUT_DIR], ext=save_format)
        if save_format == SAVE_TCL:
            gen_tcl(adj_matrix, mono_list, tcl_fname=fname, chain_id=cfg[CHAIN_ID],
                    psf_fname=cfg[PSF_FNAME], toppar_dir=cfg[TOPPAR_DIR], out_dir=cfg[OUT_DIR])
        if save_format == SAVE_JSON:
            json_str = MolToJSON(mol)
            str_to_file(json_str + '\n', fname)
        elif save_format == SAVE_PNG or save_format == SAVE_SVG:
            MolToFile(mol, fname, size=cfg[IMAGE_SIZE])
        print(f"Wrote file: {fname}")
def save_mol_files(smi_list, out_dir):
    """
    Write one mol file (kekulized, 2D coordinates, no stereo information) per SMILES string.

    :param smi_list: iterable of SMILES strings; each string is also used as the base of its
                     output file name
    :param out_dir: None or str, passed to create_out_fname as the base directory
    :return: n/a, saves a mol file for each SMILES string
    """
    for smiles in smi_list:
        mol_fname = create_out_fname(smiles, ext='mol', base_dir=out_dir)
        molecule = Chem.MolFromSmiles(smiles)
        # 2D coords without H
        Chem.Kekulize(molecule)
        Compute2DCoords(molecule)
        MolToMolFile(molecule, mol_fname, includeStereo=False)
def make_image_grid(file_label, smi_list, labels=None, out_dir=PNG_DIR, mol_img_size=(400, 300), write_output=True):
    """
    Given a molecular formula (or other label) and the set of SMI, make an image grid of all smiles within
    https://www.rdkit.org/docs/GettingStartedInPython.html

    :param file_label: str, such as chemical formula that corresponds to all smiles in SMILES set
    :param smi_list: list or set of SMILES strings; used to generate images
    :param labels: if None (or empty), the SMILES strings are used as labels; otherwise a sequence of
                   labels in the same order as the molecules
    :param out_dir: directory where the file should be saved
    :param mol_img_size: tuple of ints to determine size of individual molecules
    :param write_output: boolean to determine whether to write to screen that a file was created
    :return: N/A, save a file
    """
    # Materialize once: a set cannot be indexed (needed below for the single-molecule label and by
    # the grid legends), and a single pass keeps molecules and default labels in the same order
    smiles = list(smi_list)

    mols = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        Compute2DCoords(mol)
        mols.append(mol)

    img_labels = list(labels) if labels else smiles

    single_mol = len(mols) == 1
    if single_mol:
        # didn't see a way for RDKit to add a label to an image with a single molecule (grid image
        # does not work for one image), so add to file name
        file_label += '_' + img_labels[0]

    fname = create_out_fname(file_label, ext='png', base_dir=out_dir)
    if single_mol:
        MolToFile(mols[0], fname, size=mol_img_size)
    else:
        img_grid = MolsToGridImage(mols, molsPerRow=3, subImgSize=mol_img_size, legends=img_labels)
        img_grid.save(fname)

    if write_output:
        print(f"Wrote file: {os.path.relpath(fname)}")
def print_clean_csv(fname, fname_lower, ms_level, data_array, comment, direct_injection, omit_csv_headers,
                    numpy_save_fmt, out_dir):
    """
    Save data_array as a csv whose name is derived from fname plus descriptive suffixes.

    :param fname: str, name of the originating file
    :param fname_lower: str, used for the "ms<level>" and "direct" checks
    :param ms_level: str, MS level; added to the name unless already present in fname_lower
    :param data_array: 2D ndarray; only the first two columns are written when headers are omitted
    :param comment: str, prepended to the header line when headers are written
    :param direct_injection: when truthy, "_unlabeled" or "_direct" may be added to the name
    :param omit_csv_headers: when truthy, no header is written and only two columns are saved
    :param numpy_save_fmt: format passed to np.savetxt
    :param out_dir: None or str, passed to create_out_fname as the base directory
    :return: n/a, writes a file and prints its relative path
    """
    suffix_parts = []
    if "ms" + ms_level not in fname_lower:
        suffix_parts.append(f"_ms{ms_level}")
    if "clean" not in fname:
        suffix_parts.append("_clean")
    if direct_injection:
        if omit_csv_headers:
            suffix_parts.append("_unlabeled")
        elif "direct" not in fname_lower:
            suffix_parts.append("_direct")
    suffix = "".join(suffix_parts)

    # data_array will already be properly sorted; not rounded but okay because printing takes care of this
    f_out = create_out_fname(fname, suffix=suffix, ext='csv', base_dir=out_dir)

    # noinspection PyTypeChecker
    if omit_csv_headers:
        np.savetxt(f_out, data_array[:, :2], fmt=numpy_save_fmt, delimiter=',')
    else:
        header_line = comment + quote('","'.join(CSV_RET_HEADER))
        np.savetxt(f_out, data_array, fmt=numpy_save_fmt, delimiter=',', header=header_line, comments='')
    print(f"Wrote file: {os.path.relpath(f_out)}")
def make_dbe_mw_graphs(fkey, ion_energies_dict, out_dir=None):
    """
    Save a four-panel png of DBE and M/Z statistics versus fragmentation energy.

    Panels, top to bottom: weighted average DBE with error bars; DBE variance, skew, and kurtosis;
    weighted average M/Z with error bars; M/Z variance, skew, and kurtosis. The file name is fkey
    plus GRAPH_SUFFIX, with a png extension.

    :param fkey: str, used to designate sets of MS2 data
    :param ion_energies_dict: dict keyed by energy level; each value holds AVG_DBE and AVG_MZ
                              entries that unpack to five statistics
    :param out_dir: None if default output location is to be used
    :return: nothing
    """
    energy_levels = sorted(ion_energies_dict)

    def _stat_columns(stat_key):
        # one list per statistic: weighted average, std dev, variation, skew, kurtosis
        columns = ([], [], [], [], [])
        for energy_level in energy_levels:
            avg, dev, var, skew, kurt = ion_energies_dict[energy_level][stat_key]
            for column, value in zip(columns, (avg, dev, var, skew, kurt)):
                column.append(value)
        return columns

    def _draw_mean_panel(ax, means, devs, fmt, panel_title, y_label):
        ax.errorbar(energy_levels, means, yerr=devs, fmt=fmt)
        ax.set_title(panel_title)
        ax.set_xlabel('Fragmentation Energy')
        ax.set_ylabel(y_label)

    def _draw_stats_panel(ax, variances, skews, kurts, panel_title):
        ax.plot(energy_levels, variances, 'b', label="variance")
        ax.plot(energy_levels, skews, 'g', label="skew")
        ax.plot(energy_levels, kurts, 'r', label="kurtosis")
        ax.legend(loc=0)
        ax.set_title(panel_title)
        ax.set_xlabel('Fragmentation Energy')
        ax.set_ylabel('Property value')

    dbe_list, dbe_dev, dbe_var, dbe_skew, dbe_kurt = _stat_columns(AVG_DBE)
    mz_list, mz_dev, mz_var, mz_skew, mz_kurt = _stat_columns(AVG_MZ)

    out_filename = create_out_fname(fkey, suffix=GRAPH_SUFFIX, base_dir=out_dir, ext="png")

    fig = plt.figure(figsize=(9, 12))
    # The add_subplot sometimes throws a warning that we want to ignore
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        dbe_mean_ax = fig.add_subplot(411)
        mz_mean_ax = fig.add_subplot(413)
        dbe_stats_ax = fig.add_subplot(412)
        mz_stats_ax = fig.add_subplot(414)

    _draw_mean_panel(dbe_mean_ax, dbe_list, dbe_dev, 'or-', 'DBE vs. Fragmentation Energy',
                     'Double Bond Equivalent')
    _draw_stats_panel(dbe_stats_ax, dbe_var, dbe_skew, dbe_kurt, 'DBE Statistics vs. Fragmentation Energy')

    # now mw
    _draw_mean_panel(mz_mean_ax, mz_list, mz_dev, 'ob-', 'Weighted Average M/Z vs. Fragmentation Energy',
                     'Weighted Average M/Z')
    _draw_stats_panel(mz_stats_ax, mz_var, mz_skew, mz_kurt, 'M/Z Statistics vs. Fragmentation Energy')

    fig.tight_layout()
    fig.savefig(out_filename)
    print(f"Wrote file: {os.path.relpath(out_filename)}")
    plt.close()
def process_gausscom_file(cfg, gausscom_file, pdb_tpl_content):
    """
    Read the atom coordinates from a Gaussian input file and write them to a pdb file.

    The header (through the charge/multiplicity line) is skipped; atom lines are then read until
    the first blank line. With a pdb template (cfg[PDB_TPL_FILE]), a copy of the template's atom
    entries is updated with the new coordinates; otherwise a new entry is built for each atom.

    NOTE(review): the header is advanced with bare next(d) calls, so a file that ends inside the
    header raises StopIteration from here; nothing in this function catches it.

    :param cfg: dict; PDB_TPL_FILE and OUT_BASE_DIR are read here
    :param gausscom_file: str, location of the Gaussian input file to read
    :param pdb_tpl_content: dict with SEC_HEAD and SEC_TAIL, plus SEC_ATOMS and NUM_ATOMS when a
                            template is used
    :return: n/a, writes a pdb file
    :raises InvalidDataError: when a template is used and the file's atom count differs from it
    """
    with open(gausscom_file) as d:
        if cfg[PDB_TPL_FILE]:
            # deep copy so the template's own atom entries are not modified
            pdb_data_section = copy.deepcopy(pdb_tpl_content[SEC_ATOMS])
        else:
            pdb_data_section = []
        section = SEC_HEAD
        atom_id = 0

        for line in d:
            line = line.strip()
            # not currently keeping anything from the header; just check num atoms
            if section == SEC_HEAD:
                # there may be some instructions (which start with %, and can have some blank lines) before the
                # "route card lines" (which start with #)
                while not GAU_HEADER_PAT.match(line):
                    line = next(d).strip()
                # skip first line of route card
                line = next(d).strip()
                # for "route card" and then description, there may be more than one header line; look for blank line
                for i in range(2):
                    while len(line) > 0:
                        line = next(d).strip()
                    # now move past the blank line, and get the content of the following line
                    line = next(d).strip()
                # now on charge, multiplicity line, which we also skip with the "continue"
                section = SEC_ATOMS
                continue

            elif section == SEC_ATOMS:
                if len(line) == 0:
                    # Since the tail will come only from the template, nothing more is needed after reading atoms
                    break
                split_line = line.split()

                atom_type = split_line[0]
                # if working from a template, check atom type
                if cfg[PDB_TPL_FILE]:
                    try:
                        # the last space-separated piece of the entry's final field is compared
                        # with the file's atom type
                        pdb_atom_type = pdb_data_section[atom_id][8].split(' ')[-1]
                    except IndexError:
                        raise InvalidDataError(
                            'Gausscom file: {}\n has more atoms than the expected {} atoms in '
                            'the template file: {}'.format(gausscom_file, pdb_tpl_content[NUM_ATOMS],
                                                           cfg[PDB_TPL_FILE]))
                    if atom_type != pdb_atom_type:
                        # a mismatch only warns; the coordinates are still stored below
                        warning(
                            "Atom types do not match for atom number {}; pdb atom type is {} while gausscom type "
                            "is {}".format(atom_id, pdb_atom_type, atom_type))
                else:
                    # reserve the slot, then replace it with the new atom entry
                    pdb_data_section.append(atom_id)
                    pdb_data_section[atom_id] = [
                        'HETATM', '{:5d}'.format(atom_id + 1), ' {:4} '.format(atom_type), 'UNL ', 1,
                        0.0, 0.0, 0.0, ' 1.00 0.00 {:>2}'.format(atom_type)
                    ]
                # positions 5-7 of the entry hold x, y, z
                pdb_data_section[atom_id][5:8] = map(float, split_line[1:4])
                atom_id += 1

    # Now that finished reading the file, first make sure didn't exit before reaching the desired number of atoms
    if cfg[PDB_TPL_FILE]:
        if atom_id != pdb_tpl_content[NUM_ATOMS]:
            raise InvalidDataError(
                'In gausscom file: {}\n found {} atoms, while the pdb template has {} atoms'
                .format(gausscom_file, atom_id, pdb_tpl_content[NUM_ATOMS]))
    f_name = create_out_fname(gausscom_file, ext='.pdb', base_dir=cfg[OUT_BASE_DIR])
    list_to_file(pdb_tpl_content[SEC_HEAD] + pdb_data_section + pdb_tpl_content[SEC_TAIL],
                 f_name, list_format=PDB_FORMAT)
def write_output(fname, ms_level, num_matches, short_output_list, long_output_list, matched_formulas,
                 combined_out_fname, omit_mol_ion_flag, deprot_flag, prot_flag, write_mode, out_dir):
    """
    Print a summary and write the output files from matching M/Z values to the lignin molecule library

    :param fname: location of input file processed; basis for the output file names
    :param ms_level: type of MS output, added to the output names unless "_ms<level>" is already in fname
    :param num_matches: the number of matches made; when 0, the summary files and images are not written
    :param short_output_list: list of dicts of summary matching data
    :param long_output_list: list of dicts of extended matching data
    :param matched_formulas: iterable of matched formula names; an image grid is made for each
    :param combined_out_fname: None or string; when given, the extended output file is named from it
    :param omit_mol_ion_flag: when truthy, "molecular ion" is left out of the printed summary
    :param deprot_flag: when truthy, "deprotonated ion" is included in the printed summary
    :param prot_flag: when truthy, "protonated ion" is included in the printed summary
    :param write_mode: mode used to write the extended csv; when 'a', the text summary gets no header
    :param out_dir: location of output directory, or None if the current directory is the output directory
    :return: n/a; several output files created
    """
    # prepare string for txt output file
    txt_header = '' if write_mode == 'a' else MATCH_STR_HEADER
    match_lines = [
        MATCH_STR_FMT.format(MZ_STR_FMT.format(mz_dict[M_Z], mz_dict[INTENSITY], mz_dict[RET_TIME]),
                             mz_dict[REL_INTENSITY], mz_dict[CALC_MW], mz_dict[PPM_ERR],
                             mz_dict[PARENT_FORMULA], mz_dict[DBE], mz_dict[MATCH_TYPE])
        for mz_dict in short_output_list
    ]
    short_txt_output_str = txt_header + ''.join(match_lines)

    # only add the MS level to the names if the input name does not already have it
    ms_str = f"_ms{ms_level}"
    level_prefix = "" if ms_str in fname else ms_str
    suffix = level_prefix + DEF_SUFFIX
    ext_suffix = level_prefix + DEF_LONG_SUFFIX

    f_out_txt = create_out_fname(fname, suffix=suffix, base_dir=out_dir, ext="txt")
    f_out_csv = create_out_fname(fname, suffix=suffix, base_dir=out_dir, ext="csv")
    if combined_out_fname:
        f_out_long = create_out_fname(combined_out_fname, suffix="_ext", base_dir=out_dir, ext="csv")
    else:
        f_out_long = create_out_fname(fname, suffix=ext_suffix, base_dir=out_dir, ext="csv")

    # Print quick summary; first note which types of matches were investigated
    match_str_list = [] if omit_mol_ion_flag else ["molecular ion"]
    if deprot_flag:
        match_str_list.append("deprotonated ion")
    if prot_flag:
        match_str_list.append("protonated ion")
    print(f" {num_matches} of these matched a MW in our dictionaries for a {' or a '.join(match_str_list)}")

    # save output to files
    short_write_mode = 'w'
    if num_matches == 0:
        warning(f"No MW to MZ matches (within specified ppm error) found for file: {os.path.basename(fname)}\n "
                f"Summary output will not be printed.")
    else:
        str_to_file(short_txt_output_str, os.path.relpath(f_out_txt), print_info=True, mode=short_write_mode)
        write_csv(short_output_list, os.path.relpath(f_out_csv), SHORT_OUTPUT_HEADERS, extrasaction="ignore",
                  mode=short_write_mode)

        struct_dir = os.path.join(out_dir, STRUCT_DIR) if out_dir else STRUCT_DIR
        make_dir(struct_dir)
        for formula in matched_formulas:
            # the lookup key is the formula with any "*" removed
            my_formula = formula.replace("*", "")
            make_image_grid(formula, list(FORMULA_SMI_DICT[my_formula]), out_dir=struct_dir, write_output=False)

    # print long output even if no matches
    write_csv(long_output_list, os.path.relpath(f_out_long), OUTPUT_HEADERS, extrasaction="ignore",
              mode=write_mode)
def process_gausscom_file(gausscom_file, tpl_com_content, read_new_charge, out_dir):
    """
    Combine the atom coordinates from a Gaussian input file with a template's head and tail, and
    write the result as a new '.com' file.

    The file's header is skipped (optionally keeping its charge/multiplicity line), then atom lines
    are read until the first blank line. When the template lists atoms, each atom type read is
    checked against the template and the template's atom text is used in the output.

    :param gausscom_file: str, location of the Gaussian input file to read
    :param tpl_com_content: dict with SEC_HEAD, SEC_ATOMS, ATOM_TYPES, and SEC_TAIL; the last
                            SEC_HEAD entry is replaced in place when read_new_charge is truthy
    :param read_new_charge: when truthy, the file's charge/multiplicity line is validated and used
    :param out_dir: passed to create_out_fname as the base directory
    :return: n/a, writes a file
    :raises InvalidDataError: for an invalid charge/multiplicity line, an atom type that does not
                              match the template, or a file that cannot be decoded
    """
    # to make the later part easier to read
    tpl_atoms = tpl_com_content[SEC_ATOMS]
    tpl_atom_types = tpl_com_content[ATOM_TYPES]
    tpl_atom_num = len(tpl_atom_types)

    with open(gausscom_file) as d:
        section = SEC_HEAD
        atom_id = 0
        atom_content = []

        try:
            for line in d:
                line = line.strip()
                # not currently keeping anything from the header; just check num atoms
                if section == SEC_HEAD:
                    # there may be some instructions (which start with %, and can have some blank lines) before the
                    # "route card lines" (which start with #)
                    while not GAU_HEADER_PAT.match(line):
                        line = next(d).strip()
                    # skip first line of route card
                    line = next(d).strip()
                    # for "route card" and then description, there may be more than one header line; look for
                    # blank line
                    for i in range(2):
                        while len(line) > 0:
                            line = next(d).strip()
                        # now move past the blank line, and get the content of the following line
                        line = next(d).strip()
                    # now on charge, multiplicity line, which we also skip unless we use its charge/mult
                    if read_new_charge:
                        # make sure reading a valid charge/mult line, with at least 2 integers
                        try:
                            charge_mult = line.split()
                            int(charge_mult[0])
                            int(charge_mult[1])
                            # an odd number of entries means an unpaired value, which is rejected
                            if len(charge_mult) % 2 != 0:
                                raise IndexError
                        except (IndexError, ValueError):
                            raise InvalidDataError("Problem while reading file: {}\nOption to read charge and "
                                                   "multiplicity from template not chosen, but found invalid data on "
                                                   "the expected line: {}".format(os.path.basename(gausscom_file),
                                                                                  line))
                        tpl_com_content[SEC_HEAD][-1] = line
                    section = SEC_ATOMS
                    continue

                elif section == SEC_ATOMS:
                    # stay in atom section until a blank line is reached
                    while len(line) > 0:
                        split_line = line.split()
                        # if there is a freeze/no freeze col, will be 5 columns (split by ' '); Keep atom info
                        # together
                        if len(split_line) == 5:
                            atom_info = "{:2}{:>8}".format(split_line[0], split_line[1])
                        else:
                            atom_info = split_line[0]
                        # if template has atoms, check atom type
                        if tpl_atom_num > 0:
                            atom_type = atom_info.split()[0].split('(')[0]
                            if atom_type != tpl_atom_types[atom_id]:
                                # atom_type was read from the file, so it belongs in the "file has type"
                                # slot and the template's type in the "tpl has type" slot
                                raise InvalidDataError("Problem while reading file: {}\nAtom types do not match for "
                                                       "atom number {}: file has type {} while tpl has type "
                                                       "{}".format(os.path.basename(gausscom_file), atom_id + 1,
                                                                   atom_type, tpl_atom_types[atom_id]))
                            atom_info = tpl_atoms[atom_id]
                        atom_xyz = ["{:>12}".format(x) for x in split_line[-3:]]
                        atom_content.append('{:18}'.format(atom_info) + ' '.join(atom_xyz))
                        atom_id += 1
                        line = next(d).strip()
                    # Don't need to read the tail, because we won't use it
                    break
        except StopIteration:
            # the file ended while advancing with next(); carry on with whatever was read
            pass
        except UnicodeDecodeError:
            raise InvalidDataError(f"Error in reading file: {gausscom_file}\n Exiting program.")

    # now loop is done; check atom number if atoms are in the tpl file
    check_num_atoms(atom_id, gausscom_file, tpl_atom_num)

    f_name = create_out_fname(gausscom_file, ext='.com', base_dir=out_dir)
    list_to_file(tpl_com_content[SEC_HEAD] + atom_content + tpl_com_content[SEC_TAIL], f_name)
def plot_total_intensity_v_ret_time(fname, ms_level, data_array, num_decimals_ret_time_accuracy, out_dir):
    """
    Sum the intensities for each unique (rounded) retention time, plot the totals, and return them.

    Note that the retention-time column of data_array is rounded in place.

    :param fname: str, name of the file where the data originated
    :param ms_level: str, added to the plot file name when "_ms" is not already in fname
    :param data_array: ndarray (n x 3) with M/Z, intensity, and retention times
    :param num_decimals_ret_time_accuracy: number of decimal points in retention time accuracy, for rounding
    :param out_dir: None or str, provides location where new file should be saved (None for current directory)
    :return: ndarray, (m x 2), where m is the number of unique retention times, in first column. Second column
             is total intensity for that retention time.
    """
    default_x_max = 16.
    intensity_col = 1
    ret_time_col = 2

    # in case not already rounded and sorted...
    data_array[:, ret_time_col] = np.around(data_array[:, ret_time_col], num_decimals_ret_time_accuracy)
    # the intensity and mz order does not matter, only ret time
    sorted_data = data_array[data_array[:, ret_time_col].argsort()]

    unique_ret_times = np.unique(sorted_data[:, ret_time_col])
    total_intensities = np.array(
        [np.sum(sorted_data[sorted_data[:, ret_time_col] == ret_time][:, intensity_col])
         for ret_time in unique_ret_times],
        dtype=float)

    data_max_x = np.max(unique_ret_times)
    if data_max_x > default_x_max:
        x_value_warning(data_max_x, default_x_max)
    y_max = find_pos_plot_limit(np.max(total_intensities))

    title = "Total Intensity Plot"
    x_label = "Retention time (min)"
    y_label = "Total intensity (unscaled)"

    suffix = "_tot_int"
    if "_ms" not in fname.lower():
        suffix = f"_ms{ms_level}" + suffix
    plot_fname = create_out_fname(fname, suffix=suffix, ext='png', base_dir=out_dir)

    ret_time_tot_intensity_array = np.column_stack((unique_ret_times, total_intensities))
    make_vlines_plot(title, x_label, y_label, {"total_intensities": ret_time_tot_intensity_array},
                     plot_fname, num_decimals_ret_time_accuracy, default_x_max, y_max, loc="upper left")
    return ret_time_tot_intensity_array
def process_gausslog_file(gausslog_file, com_tpl_content, charge_from_log_flag, find_low_energy, step_num,
                          base_dir, out_fname):
    """
    Read coordinates from a Gaussian log file and write them, between the template's head and tail sections,
    to a new file.

    Which coordinates are kept depends on the options: the lowest-energy structure (find_low_energy), the
    structure read before the requested step number is reached (step_num), or otherwise the last structure
    followed by an energy line. The template head is modified in place: its third-from-last line gets a
    comment naming the source file, and its last line is replaced with the charge/multiplicity read from the
    log when that is requested.

    :param gausslog_file: str, location of the Gaussian log file to read
    :param com_tpl_content: dict with template content; the keys SEC_HEAD, SEC_ATOMS, SEC_TAIL, NUM_ATOMS,
        and BASE_NAME are read here
    :param charge_from_log_flag: flag; the charge and multiplicity are read from the log file when it is falsy
    :param find_low_energy: flag to keep the lowest-energy structure instead of the last one
    :param step_num: None, or the step number (int or str of an int) at which to stop reading
    :param base_dir: passed to create_out_fname as the location for the new file
    :param out_fname: None or str; if provided, the name for the new file. Otherwise the name is built from
        the log file name, the template's base name, and the extension '.com'
    :return: nothing; the new file is written with list_to_file
    :raises InvalidDataError: if an atomic number is not in ATOM_NUM_DICT, an atom type or the number of atoms
        does not match the template, or no coordinates were found
    """
    with open(gausslog_file) as d:
        rel_path_fname = os.path.relpath(gausslog_file)
        # The header may be more than 5 lines long--counting from end makes sure the comment goes in the correct line
        if find_low_energy:
            com_tpl_content[SEC_HEAD][-3] = "Low energy conformation from file {}".format(rel_path_fname)
        elif step_num:
            step_num = int(step_num)
            com_tpl_content[SEC_HEAD][-3] = "Conformation from step number {} in file {}".format(step_num,
                                                                                                 rel_path_fname)
        else:
            com_tpl_content[SEC_HEAD][-3] = "Last conformation from file {}".format(rel_path_fname)

        # starts at 0.0, so only structures with a negative energy can be selected as the lowest
        lowest_energy_found = 0.0
        current_step_num = None
        final_atoms_section = []
        atoms_section = []
        atom_type_list = []
        section = SEC_HEAD
        atom_id = 0
        # use a local flag (the passed flag is not changed), so if there is another log file it will also be
        # checked
        # NOTE(review): the charge is read from the log when charge_from_log_flag is falsy, which looks inverted
        # relative to the parameter name -- confirm against the caller
        find_charge = not charge_from_log_flag

        for line in d:
            line = line.strip()
            if len(line) == 0:
                continue
            # not currently keeping anything from the header
            if section == SEC_HEAD:
                if find_charge:
                    if GAU_CHARGE_PAT.match(line):
                        charge_mult = []
                        while find_charge:
                            split_line = line.split('=')
                            charge_mult.append('{} {}'.format(int(split_line[1].split()[0]),
                                                              int(split_line[2].split()[0])))
                            line = next(d).strip()
                            if not GAU_CHARGE_PAT.match(line):
                                if len(charge_mult) > 1:
                                    section = SEC_INITIAL_COORDINATES
                                    final_atoms_section = []
                                    # already reading the next section, so grab the needed info
                                    atom_type_list.append(line.split()[0])
                                com_tpl_content[SEC_HEAD][-1] = ' '.join(charge_mult)
                                find_charge = False
                        continue
                if step_num and GAU_STEP_PAT.match(line):
                    split_line = line.split()
                    current_step_num = int(split_line[2])
                    if current_step_num == step_num:
                        break
                if GAU_COORD_PAT.match(line):
                    atoms_section = []
                    # skip the two lines that follow the coordinate-section header
                    next(d)
                    next(d)
                    section = SEC_ATOMS
                    continue

            elif section == SEC_INITIAL_COORDINATES:
                while len(line) > 0:
                    # originally just added whole line to final. Then found that this section prints fewer
                    # sig figs than the coordinate section, so taking those instead
                    atom_type_list.append(line.split()[0])
                    line = next(d).strip()
                while not GAU_COORD_PAT.match(line):
                    line = next(d).strip()
                next(d)
                next(d)
                line = next(d).strip()
                while not GAU_SEP_PAT.match(line):
                    split_line = line.split()
                    atom_xyz = ["{:>12}".format(x) for x in split_line[3:6]]
                    final_atoms_section.append('{:16}'.format(atom_type_list[atom_id]) + ' '.join(atom_xyz))
                    atom_id += 1
                    line = next(d).strip()
                # the first coordinate section is all that is read in this case
                break

            elif section == SEC_ATOMS:
                if GAU_SEP_PAT.match(line):
                    section = SEC_TAIL
                    continue

                split_line = line.split()
                try:
                    atom_type = ATOM_NUM_DICT[int(split_line[1])]
                except KeyError:
                    raise InvalidDataError("Currently, this code only expects atom numbers up to 36 (Kr), and the "
                                           "atomic number read was {}. Update the code to use this with your current "
                                           "output.".format(split_line[1]))
                if com_tpl_content[NUM_ATOMS]:
                    com_atom_type = re.split('[ (]', com_tpl_content[SEC_ATOMS][atom_id])[0].strip()
                    if com_atom_type != atom_type:
                        # the template may list the atomic number instead of the element symbol
                        try:
                            if ATOM_NUM_DICT[int(com_atom_type)] != atom_type:
                                raise ValueError
                        except ValueError:
                            raise InvalidDataError("For atom number {}, {} has atom type '{}', while the template has "
                                                   "atom type '{}'".format(atom_id + 1, gausslog_file, atom_type,
                                                                           com_atom_type))
                    atom_type = com_tpl_content[SEC_ATOMS][atom_id]  # This keeps the "fragment" number if there
                atom_type = '{:16}'.format(atom_type)
                atom_xyz = ["{:>12}".format(x) for x in split_line[3:6]]
                atoms_section.append(atom_type + ''.join(atom_xyz))
                atom_id += 1

            elif section == SEC_TAIL:
                if com_tpl_content[NUM_ATOMS] and atom_id != com_tpl_content[NUM_ATOMS]:
                    raise InvalidDataError('In gausslog file: {}\n found {} atoms, but the tpl expects '
                                           '{} atoms'.format(gausslog_file, atom_id, com_tpl_content[NUM_ATOMS]))
                if GAU_E_PAT.match(line):
                    if find_low_energy:
                        split_line = line.split()
                        energy = float(split_line[4])
                        if energy < lowest_energy_found:
                            # remember the new minimum; otherwise any later structure with a negative energy
                            # would replace a lower-energy one found earlier
                            lowest_energy_found = energy
                            final_atoms_section = atoms_section[:]
                    else:
                        final_atoms_section = atoms_section[:]
                    section = SEC_HEAD
                    atom_id = 0

        if len(final_atoms_section) == 0:
            raise InvalidDataError("Check that the following log file has coordinates to use and/or specified step "
                                   "number: {}".format(gausslog_file))

    if out_fname:
        f_name = create_out_fname(out_fname, base_dir=base_dir)
    else:
        f_name = create_out_fname(gausslog_file, suffix='_' + com_tpl_content[BASE_NAME], ext='.com',
                                  base_dir=base_dir)
    list_to_file(com_tpl_content[SEC_HEAD] + final_atoms_section + com_tpl_content[SEC_TAIL], f_name)
def plot_select_mz_intensity_v_ret_time(fname, ms_level, mz_list_to_plot, data_array, num_decimals_ms_accuracy,
                                        num_decimals_ret_time_accuracy, out_dir):
    """
    Plot intensity versus retention time for up to five selected M/Z values on one plot.

    Note that the M/Z column of the array passed in is rounded in place.

    :param fname: str, name of the file where the data originated; used to build the plot file name
    :param ms_level: str, added to the plot file name (unless the name already contains "_ms") so different
        MS output of the same input file does not overwrite each other
    :param mz_list_to_plot: list, with one to five M/Z values to plot versus time on the same plot
    :param data_array: ndarray (n x 3) with M/Z, intensity, and retention times
    :param num_decimals_ms_accuracy: int, number of decimal points in MS accuracy, for rounding
    :param num_decimals_ret_time_accuracy: number of decimal points in retention time accuracy; passed on to
        the plotting function
    :param out_dir: None or str, location where the new file should be saved (None for current directory)
    :return: dict with one entry for each M/Z value that has data: the key is the M/Z value formatted with
        num_decimals_ms_accuracy decimals, and the value is an ndarray (k x 2) of retention time and intensity.
        None is returned (after a warning, without plotting) if zero or more than five M/Z values are given.
    """
    x_axis_max = 16.
    intensity_col = 1
    time_col = 2
    num_mz_vals = len(mz_list_to_plot)

    if num_mz_vals > 5:
        warning("Error while attempting to plot select M/Z values versus retention times.\n This "
                "method expects at most 5 M/Z values to display on one plot. This plot will not be produced.")
        return
    if num_mz_vals == 0:
        warning("Error while attempting to plot select M/Z values versus retention times.\n No "
                "M/Z values provided. This plot will not be produced.")
        return

    if num_mz_vals == 1:
        title = f"Intensity versus Retention Time for M/Z={mz_list_to_plot[0]}"
    else:
        title = "Intensity versus Retention Time for Selected M/Z Values"

    # neither the requested values nor the data are guaranteed to be rounded already
    mz_list_to_plot = np.around(mz_list_to_plot, num_decimals_ms_accuracy)
    data_array[:, 0] = np.around(data_array[:, 0], num_decimals_ms_accuracy)

    # the y-axis limit is based on all the data (not only the chosen M/Z values) for more consistent y-axis
    # ranges; the largest retention time is instead found only from the chosen M/Z values, below
    y_max = find_pos_plot_limit(np.max(data_array[:, intensity_col]))

    inten_time_dict = {}
    largest_ret_time = 0.
    for mz_val in mz_list_to_plot:
        mz_rows = data_array[data_array[:, 0] == mz_val]
        if len(mz_rows) < 1:
            warning(f"No retention time data found for M/Z value {mz_val} from {os.path.relpath(fname)}.\n This "
                    f"M/Z will be omitted from the plot.")
            continue
        curve_label = f"{mz_val:.{num_decimals_ms_accuracy}f}"
        # stored as x, y: retention time, then intensity
        inten_time_dict[curve_label] = np.column_stack((mz_rows[:, time_col], mz_rows[:, intensity_col]))
        largest_ret_time = max(largest_ret_time, np.max(mz_rows[:, time_col]))

    if largest_ret_time > x_axis_max:
        warning(f"The default maximum x-axis value ({x_axis_max}) is less than the maximum x-axis value in the "
                f"data ({largest_ret_time}). Not all data will be shown.")

    # only add the MS level to the file name when the name does not already carry one
    suffix = "_int_v_time" if "_ms" in fname.lower() else f"_ms{ms_level}" + "_int_v_time"
    plot_fname = create_out_fname(fname, suffix=suffix, ext='png', base_dir=out_dir)

    make_vlines_plot(title, "Retention time (min)", "Intensity (unscaled)", inten_time_dict, plot_fname,
                     num_decimals_ret_time_accuracy, x_axis_max, y_max, loc="upper left")
    return inten_time_dict
def parse_cmdline(argv):
    """
    Build the command-line parser, parse the arguments, and fill in derived option values.

    After parsing, the output directory (default: current working directory) is created if it does not exist,
    the output file name is resolved, the plot labels are split on commas into a list, and the vibrational
    scaling factor (if given) is converted to a float.

    :param argv: a list of arguments, or `None` for ``sys.argv[1:]``
    :return: tuple of (args, return code), where args is the (options, remaining arguments) pair returned by
        `parse_known_args`, or None if parsing did not complete
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description='Calculates A and Ea from Gaussian output files using GoodVibes. '
                                                 'List files to be analyzed, reactant(s) first and ending with the '
                                                 'transition structure. These can be listed on the command line or in '
                                                 'a file (each line listing a set of reactant(s) and transition '
                                                 'structure).')
    parser.add_argument("-d", "--out_dir", default=None,
                        help="A directory where output files should be saved. The default location "
                             "is the current working directory.")
    parser.add_argument("-f", dest="freq_cutoff", default="0",
                        help="Cut-off frequency for both entropy and enthalpy (wavenumbers) "
                             "(default = 0)")
    parser.add_argument("-l", "--list", default=None,
                        help="The location of the list of Gaussian output files. "
                             "The default file name.")
    parser.add_argument("-q", "--quasiharmonic", action='store_true',
                        help="Use the '-q' option in GoodVibes, which turns on turns on "
                             "quasi-harmonic corrections to both entropy and enthalpy in the "
                             "Gibbs free energy (qh-G(T)) output from GoodVibes. ")
    parser.add_argument("--temp", default=None,
                        help="Temperature in K for calculating \u0394G. The default is the first "
                             "temperature in 'temp_range' (if specified). If a value is given, the program "
                             "will use the temperature closest to it in the temp_range.")
    parser.add_argument("-ti", "--temp_range", default="300,600,30",
                        help="Initial temp, final temp, (and optionally) step size (K) for "
                             "thermochemistry calculations. The default range is 300,600,30")
    parser.add_argument("-v", "--vib_scale", default=None,
                        help="Scaling factor to be used for vibrational frequencies. If not "
                             "provided, the GoodVibes default value will be used.")
    parser.add_argument("-p", "--plot", action='store_true',
                        help="Make a \u0394G plot at the specified temp. The default is False.")
    parser.add_argument("-pl", "--plot_labels", default=None,
                        help="Optional labels for \u0394G plot. Enter as a list.")
    parser.add_argument("-c", "--vibes_check", action='store_true',
                        help="In addition to standard checks always run (matching solvent, "
                             "level of theory, stoichiometry, charge, multiplicity, and "
                             "Gaussian versions), run files through GoodVibes '--check' before "
                             "performing calculations. The default is False.")
    parser.add_argument("-o", "--output_fname", default=None,
                        help="The name of the output file to be created. The default is the "
                             "list name with the extension '.csv', or '{}' if no list name "
                             "provided.".format(DEF_OUT_FILE_NAME))
    parser.add_argument("-s", "--save_vibes", action='store_true',
                        help="Save the output from running GoodVibes in separate files, "
                             "named with the Gaussian log file prefix and '.dat'. "
                             "The default is False.")
    parser.add_argument("-t", "--tog_vibes", action='store_true',
                        help="Save the output from running GoodVibes in one file, "
                             "renamed with the output file prefix and '.dat'. "
                             "The default is False.")

    args = None
    try:
        args = parser.parse_known_args(argv)
        options = args[0]

        options.out_dir = options.out_dir or os.getcwd()
        # user can define a new directory as the output directory
        if not os.path.exists(options.out_dir):
            os.makedirs(options.out_dir)

        if options.output_fname:
            options.output_fname = os.path.abspath(os.path.join(options.out_dir, options.output_fname))
        else:
            # name the output after the list file when there is one, otherwise use the default name
            name_source = options.list if options.list else DEF_OUT_FILE_NAME
            options.output_fname = create_out_fname(name_source, ext='.csv', base_dir=options.out_dir)

        options.plot_labels = options.plot_labels.split(',') if options.plot_labels else ['']

        if options.vib_scale:
            options.vib_scale = float(options.vib_scale)
    except (SystemExit, ValueError) as e:
        # an exit code of 0 (as when help is requested) is not an input error
        if getattr(e, 'code', None) == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR

    return args, GOOD_RET
def run_job(job, job_name_perhaps_with_dir, tpl_dict, cfg, testing_mode):
    """
    Create a job script from the job-run template, make it executable, and (unless in testing mode) run it
    and check the resulting log file for normal termination.

    :param job: str; an empty string selects the first-job setup. Otherwise, it is appended to the current
        job name and used as the key in cfg[TPL_DICT] to look up the input file
    :param job_name_perhaps_with_dir: str, combined with cfg[GAUSS_IN_EXT] to give the input file of a first job
    :param tpl_dict: dict of values used to fill the template; updated in place (including JOB_NAME)
    :param cfg: dict of configuration values
    :param testing_mode: flag; if true, the job script is created but not run, and the flag is passed on to
        the helpers that collect node information
    :return: nothing
    :raises KeyError: if the template needs a key that is in neither tpl_dict nor cfg
    :raises InvalidDataError: if the last line of the job's log file does not match GAU_GOOD_PAT
    """
    # Determine if it will run fresh or from an old checkpoint
    if job == '':
        new_job_name = tpl_dict[JOB_NAME]
        tpl_dict[INPUT_FILE] = job_name_perhaps_with_dir + cfg[GAUSS_IN_EXT]
        if cfg[FIRST_JOB_CHK]:
            tpl_dict[OLD_CHECK_ECHO] = cfg[OLD_CHECK_ECHO].format(
                cfg[FIRST_JOB_CHK])
        else:
            tpl_dict[OLD_CHECK_ECHO] = ''
    else:
        new_job_name = tpl_dict[JOB_NAME] + '_' + job
        # the job name used so far becomes the "old" job name
        tpl_dict[OLD_JOB_NAME] = tpl_dict[JOB_NAME]
        tpl_dict[OLD_CHECK_ECHO] = cfg[OLD_CHECK_ECHO].format(
            tpl_dict[OLD_JOB_NAME])
        tpl_dict[INPUT_FILE] = cfg[TPL_DICT][job]

    tpl_file = cfg[JOB_RUN_TPL]
    job_runner_fname = create_out_fname(new_job_name, ext=".sh", base_dir=cfg[OUT_DIR])
    print("Running {}".format(new_job_name))

    tpl_dict[JOB_NAME] = new_job_name
    # copy over these values only if they are in the configuration
    for key_name in [
            USER,
            MEM,
            PROC_LIST,
    ]:
        if key_name in cfg:
            tpl_dict[key_name] = cfg[key_name]

    tpl_str = read_tpl(tpl_file)
    # if either MEM or PROC_LIST is the default (Nonetype), and is used to run the job, get info from the node before
    # creating the job script
    mem_required = '{' + MEM + '}' in tpl_str
    get_mem = mem_required and not tpl_dict[MEM]
    proc_required = '{' + PROC_LIST + '}' in tpl_str
    get_proc = proc_required and not tpl_dict[PROC_LIST]
    default_gauss_required = '{' + DEF_ROUTE + '}' in tpl_str

    num_procs = 1  # to make IDE happy
    proc_list = '0'  # to make IDE happy

    if get_mem or get_proc or default_gauss_required:
        # explicitly check each possible required info flag, because any or all can be requested
        if testing_mode:
            # a fixed name stands in for the host name in testing mode
            hostname = subprocess.check_output(["echo", "r1i7n35"
                                                ]).decode("utf-8").strip()
        else:
            # Will not be covered in testing mode, as is not part of written code to be tested
            hostname = subprocess.check_output(["hostname"
                                                ]).decode("utf-8").strip()
        print(
            "Obtaining available memory and/or number of processors on node {}.\n "
            "Note: this program assumes the whole node will be allocated to Gaussian.\n"
            .format(hostname))

        if get_mem:
            tpl_dict[MEM] = get_node_mem(testing_mode)

        max_cache = 1024 * 1024  # to make IDE happy; Gaussian default (conservative) is 1024 * 1024
        if get_proc or default_gauss_required:
            num_procs, proc_list, max_cache = get_proc_info(testing_mode)

        if get_proc:
            tpl_dict[PROC_LIST] = proc_list
            print(
                " Found {} processors. Will allow use of cpus {}.\n".format(
                    num_procs, proc_list))

        if get_mem or get_proc:
            print(
                " The user may override these values by specifying the '{}' and/or '{}' keywords in the "
                "configuration file.\n Be sure to use the formatting Gaussian expects.\n"
                .format(MEM, PROC_LIST))

        if default_gauss_required:
            max_disk = get_max_disk(testing_mode)
            max_cache = int(max_cache)
            print(
                "Since '{}' found in the {}, read machine specs to determine CacheSize={} and "
                "MaxDisk={}".format(DEF_ROUTE, JOB_RUN_TPL, max_cache,
                                    max_disk))
            # write the machine-specific settings to a 'Default.Route' file in the scratch directory
            default_route_list = [
                "-#- CacheSize={}".format(max_cache),
                "-#- MaxDisk={}".format(max_disk)
            ]
            fname = create_out_fname('Default.Route',
                                     base_dir=cfg[SCRATCH_DIR])
            list_to_file(default_route_list, fname)
            tpl_dict[
                DEF_ROUTE] = ''  # there is an action triggered, not a value needed, so replaced with blank space

    # keep trying to fill the template, adding any missing key that can be found in cfg, until it succeeds;
    # a key that is not in cfg either ends the loop by re-raising the KeyError
    move_on = False
    while not move_on:
        try:
            fill_save_tpl(tpl_str, tpl_dict, tpl_file, job_runner_fname)
            move_on = True
        except KeyError as e:
            # the missing key's name is taken from between the first pair of single quotes in the error message
            missing_key = e.args[0].split("\'")[1]
            if missing_key in cfg:
                tpl_dict[missing_key] = cfg[missing_key]
            else:
                raise e
    subprocess.call(["chmod", "+x", job_runner_fname])
    if testing_mode:
        print(
            "Testing mode; did not run job script or check Gaussian output for normal termination.\n"
        )
    else:
        # do not want this tested, as actually running Gaussian would take too long, and not what should be tested
        p1 = subprocess.Popen(job_runner_fname)
        p1.wait()
        out_file = tpl_dict[JOB_NAME] + ".log"
        # only the last line of the log file is checked for normal termination
        last_line = subprocess.check_output(["tail", "-1", out_file]).strip().decode("utf-8")
        if GAU_GOOD_PAT.match(last_line):
            print("Successfully completed {}\n".format(out_file))
            # the job script is only removed after a successful run
            os.remove(job_runner_fname)
        else:
            raise InvalidDataError('Job failed: {}'.format(out_file))
def find_good_fit(x_vals, y_vals, x_fit, png_fname=None):
    """
    Find a good functional fit for scan data, trying several forms of the CHARMM dihedral equation and
    polynomials of order 1 through 11, and printing the root mean squared error (RMSE) of each fit.

    A CHARMM form whose fit raises an OptimizeWarning yields no usable parameters for that form, so it is
    reported and skipped (rather than continuing with undefined parameters, or those of the previous form).

    :param x_vals: np array, x values for fitting
    :param y_vals: np array, y values for fitting
    :param x_fit: np array, x values to use for creating curve
    :param png_fname: str, path to save plot, if desired; used as the base for two plot names, with the
        suffixes "_charmm" and "_poly"
    :return: np array, the y values at x_fit from the fit with the smallest RMSE (None if no fit had an RMSE
        below infinity)
    """
    smallest_resid = np.inf
    best_y_fit = None
    print("Residuals from curve fitting:")

    # each array selects (1) or drops (0) one of the terms of the CHARMM dihedral equation
    charmm_n_multipliers = [
        np.ones(5, dtype=int),
        np.asarray([0, 1, 1, 1, 1]),
        np.asarray([1, 1, 1, 1, 0]),
        np.asarray([1, 1, 1, 0, 0]),
        np.asarray([1, 1, 0, 0, 0]),
        np.asarray([1, 0, 0, 0, 0]),
        np.asarray([0, 1, 0, 1, 1]),
        np.asarray([1, 0, 1, 0, 0])
    ]

    if png_fname:
        plt.plot(x_vals, y_vals, '.', label='data')
    for idx, multipliers in enumerate(charmm_n_multipliers):
        n_vals = multipliers * N_DIHE
        n_label = ",".join([str(x) for x in n_vals[n_vals != 0]]) + ":"
        # fit curve
        ini_vals = np.ones(len(N_DIHE) * 2)
        with warnings.catch_warnings():
            # turn the warning into an exception so that an unsuccessful fit can be caught
            warnings.simplefilter("error", OptimizeWarning)
            try:
                popt, _ = curve_fit(f=lambda x, *params: charmm_dihedral(x, *params, *multipliers),
                                    xdata=x_vals, ydata=y_vals, p0=ini_vals)
            except OptimizeWarning:
                print(f' CHARMM dihedral eq with n = {n_label:10} fit raised an OptimizeWarning; skipped')
                continue
        y_fit = charmm_dihedral(x_fit, *popt, *multipliers)
        if png_fname:
            plt.plot(x_fit, y_fit, '-', color=assign_color(idx), label=f'fit: {multipliers}')
        y_from_fit = charmm_dihedral(x_vals, *popt, *multipliers)
        resid = np.sqrt(np.mean(np.square(y_from_fit - y_vals)))  # Root Mean Squared Error
        print(f' CHARMM dihedral eq with n = {n_label:10} {resid:5.2f}')
        if resid < smallest_resid:
            smallest_resid = resid
            best_y_fit = y_fit

    if png_fname:
        # plt.legend()
        charmm_fname = create_out_fname(png_fname, suffix="_charmm")
        plt.savefig(charmm_fname, transparent=True, bbox_inches='tight', )
        plt.close()
        print(f"Saved: {charmm_fname}")

    if png_fname:
        plt.plot(x_vals, y_vals, '.', label='data')
    for idx, order in enumerate(range(1, 12)):
        # only the coefficients are needed from the full output
        p, *_ = np.polyfit(x_vals, y_vals, order, full=True)
        y_fit = np.polyval(p, x_fit)
        if png_fname:
            plt.plot(x_fit, y_fit, '-', color=COLOR_SEQUENCE[idx], label=f'fit: poly order {order}')
        y_from_fit = np.polyval(p, x_vals)
        resid = np.sqrt(np.mean(np.square(y_from_fit - y_vals)))
        print(f' Polynomial order {order:2}: {resid:5.2f}')
        if resid < smallest_resid:
            smallest_resid = resid
            best_y_fit = y_fit

    if png_fname:
        # plt.legend()
        poly_fname = create_out_fname(png_fname, suffix="_poly")
        plt.savefig(poly_fname, transparent=True, bbox_inches='tight', )
        plt.close()
        print(f"Saved: {poly_fname}")

    return best_y_fit
def check_convergence(check_file_list, step_converg, last_step, best_conv, all_steps_to_stdout):
    """
    Reads Gaussian output files to check convergence, and reports it for each file.

    When step_converg is true, the first of these output modes that applies is used for each file:
    steps sorted by convergence are printed (if last_step or best_conv); all steps are printed in the order
    read (if all_steps_to_stdout); or all steps are saved to a csv file and plotted. A file without any step
    convergence data is reported and skipped in all of these modes.

    :param check_file_list: list of file names
    :param step_converg: boolean; if True, capture convergence of each step. If false, only the final convergence.
    :param last_step: None or int; if int, the last step number to check for convergence. It is also used as
        the maximum number of sorted steps to print
    :param best_conv: Boolean; if true, print (up to) ten steps with the best convergence
    :param all_steps_to_stdout: Boolean to print convergence to standard out
    :return: nothing: either saves a file or prints to stdout
    """
    conv_str_length = 11
    # the file name column is at least 36 characters wide, and wider if needed to fit the longest base name
    fname_str_length = max([36] + [len(os.path.basename(fname)) for fname in check_file_list])
    print(f"{F_NAME:{fname_str_length}} {CONVERG:{conv_str_length}} {CONVERG_ERR}")

    headers = STEP_CONVERG_HEADERS if step_converg else FINAL_CONVERG_HEADERS
    step_keys = (ENERGY, MAX_FORCE, RMS_FORCE, MAX_DISPL, RMS_DISPL, CONVERG, CONVERG_ERR)

    for fname in check_file_list:
        log_content = process_gausslog_file(fname, find_converg=True, find_step_converg=step_converg,
                                            last_step_to_read=last_step)
        log_content[F_NAME] = os.path.basename(fname)

        if not step_converg:
            # this is the printing for final termination step only (not step_converg)
            print(f"{log_content[headers[0]]:{fname_str_length}} {log_content[headers[1]]:{conv_str_length}.4f} "
                  f"{log_content[headers[2]]}")
            continue

        # all_steps_to_stdout doesn't need an out_fname, but doesn't hurt either
        if last_step:
            out_fname = sys.stdout
        else:
            out_fname = create_out_fname(fname, prefix='', suffix='_conv_steps', ext='.csv')

        # create list of dicts for each step, for all step_converg options
        step_list = []
        for step_num, step_vals in log_content[CONVERG_STEP_DICT].items():
            step_dict = {F_NAME: log_content[F_NAME], STEP_NUM: step_num}
            step_dict.update({key: step_vals[key] for key in step_keys})
            step_list.append(step_dict)

        # checked before any of the output options, since saving all steps reads the last entry of the list
        if len(step_list) == 0:
            print("No convergence data found for file: {}".format(log_content[F_NAME]))
            continue

        # different output depending on which step_converg option
        if last_step or best_conv:
            sorted_by_converg = sorted(step_list, key=itemgetter(CONVERG))
            if last_step:
                print("Steps sorted by convergence to step number {} for file: {}".format(last_step,
                                                                                          log_content[F_NAME]))
                stop_step = last_step
            else:
                print("Best (up to 10) steps sorted by convergence for file: {}".format(log_content[F_NAME]))
                stop_step = 10
            print(" StepNum Convergence")
            for print_num, step_dict in enumerate(sorted_by_converg):
                if print_num == stop_step:
                    # break this for, and go to next file if there is one
                    break
                print(" {:7} {:10.3f}".format(step_dict[STEP_NUM], step_dict[CONVERG]))
        elif all_steps_to_stdout:
            # print all steps to stdout, not sorted by convergence
            print("Convergence of all steps for file: {}".format(log_content[F_NAME]))
            print(" StepNum Convergence")
            for step_dict in step_list:
                print(" {:7} {:10.3f}".format(step_dict[STEP_NUM], step_dict[CONVERG]))
        else:
            # save all steps, not sorted by convergence; the summary line shows the last step read
            print(f"{log_content[F_NAME]:{fname_str_length}} {step_list[-1][CONVERG]:{conv_str_length}.4f} "
                  f"{step_list[-1][CONVERG_ERR]}")
            write_csv(step_list, out_fname, headers, extrasaction="ignore", round_digits=6)
            # also make plots of step versus convergence
            create_convergence_plots(out_fname, step_list)
def get_thermochem(file_set, results_dict, save_vibes, out_dir, tog_output_fname, qh_h_opt, write_mode):
    """
    Collects thermochemistry values at a range of temps from stored GoodVibes output, optionally also saving
    that output to file(s)

    :param file_set: list of reactant file(s), TS file (or separator), and optionally products
    :param results_dict: dictionary of results from running hartree and goodvibes; the GoodVibes output for
        each file is looked up with the file's base name
    :param save_vibes: boolean to determine whether to save each GoodVibes output separately
    :param out_dir: directory to save GoodVibes output files (if requested)
    :param tog_output_fname: None or string (file name) if saving each GoodVibes output together
    :param qh_h_opt: boolean to use the '-q' option in GoodVibes (corrections to both entropy and enthalpy);
        if false, no quasi-harmonic enthalpy values are read
    :param write_mode: str, the mode used to write the all-together GoodVibes output file; after a write with
        'w', 'a' is used for the remaining files in the set
    :return: tuple of (temps, h, qh_h, gt, qh_gt). temps is an array of the temperatures read from the output
        for the first item in file_set. The others are lists with one entry for each item in file_set: the
        values read at each temperature multiplied by EHPART_TO_KCAL_MOL (qh_h only if qh_h_opt), or an
        array of NaN values for the separator
    """
    h, qh_h, gt, qh_gt = [], [], [], []
    temps = []

    for index, file in enumerate(file_set):
        base_name = os.path.basename(file)
        if file == REACT_PROD_SEP:
            # placeholder (one NaN for each temperature read so far) keeps the lists aligned with file_set
            for value_list in (h, qh_h, gt, qh_gt):
                value_list.append(np.full([len(temps)], np.nan))
            continue

        vibes_out = results_dict[base_name][GOODVIBES_OUT]
        for value_list in (h, qh_h, gt, qh_gt):
            value_list.append([])

        found_structure = False
        skip_line = True
        # the first 10 lines and the last 2 lines are not searched
        for line in vibes_out[10:-2]:
            if GOODVIBES_ERROR_PAT.match(line):
                raise InvalidDataError("See GoodVibes output: {}".format(vibes_out))
            if not found_structure:
                # nothing is read until the line that starts the data has been found
                if GOODVIBES_DATA_PAT.match(line):
                    found_structure = True
                continue
            if skip_line:
                # one more line is passed over after the data-start line
                skip_line = False
                continue
            vals = line.split()
            if index == 0:
                temps.append(float(vals[1]))
            h[index].append(float(vals[2]))
            if qh_h_opt:
                qh_h[index].append(float(vals[3]))
            gt[index].append(float(vals[-2]))
            qh_gt[index].append(float(vals[-1]))

        if save_vibes:
            vibes_out_fname = os.path.relpath(create_out_fname(file, suffix='_vibes', base_dir=out_dir, ext='.dat'))
            list_to_file(vibes_out, vibes_out_fname, print_message=False)
            print('Saved GoodVibes output as: {}'.format(vibes_out_fname))
        if tog_output_fname:
            list_to_file(vibes_out, tog_output_fname, mode=write_mode, print_message=False)
            if write_mode == 'w':
                print("Adding all GoodVibes output to: {}".format(tog_output_fname))
                write_mode = "a"

    temps = np.asarray(temps)
    # for each molecule, multiply the array to convert to kcal/mol
    h = [np.asarray(vals) * EHPART_TO_KCAL_MOL for vals in h]
    if qh_h_opt:
        qh_h = [np.asarray(vals) * EHPART_TO_KCAL_MOL for vals in qh_h]
    gt = [np.asarray(vals) * EHPART_TO_KCAL_MOL for vals in gt]
    qh_gt = [np.asarray(vals) * EHPART_TO_KCAL_MOL for vals in qh_gt]

    return temps, h, qh_h, gt, qh_gt
def main(argv=None):
    """
    Entry point for goodvibes_helper: gathers the sets of files to analyze (from the list file and/or the
    command line), checks that the files exist, processes and prints results for each set, and optionally
    makes plots.

    :param argv: a list of arguments, or None (passed on to parse_cmdline)
    :return: GOOD_RET on success; the return code from parse_cmdline if parsing did not succeed; IO_ERROR
        or INVALID_DATA if the corresponding error was raised while processing
    """
    print(f"Running GaussianWrangler script goodvibes_helper version {__version__}")

    def rounded_at(values, index):
        # the value at the chosen temperature index, rounded for output
        return round_sig_figs(values[index])

    # Read input
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    try:
        # Make a list of lists; each inner list a set of reactant file(s) with TS
        # Include anything in the "list" file as well as entered on the command line
        options = args[0]
        row_list = []
        if options.list:
            with open(options.list) as f:
                all_rows = [row.split() for row in f]
            # drop the rows without any entries
            row_list = [row for row in all_rows if row]
        if args[1]:
            row_list.append(args[1])
        if not row_list:
            raise InvalidDataError("No files or list of files found")

        # now a quick first check that all files exist, and get unique names
        missing_files = set()
        unique_fnames = set()
        for file_set in row_list:
            for file in file_set:
                if file == REACT_PROD_SEP:
                    continue
                if os.path.isfile(file):
                    unique_fnames.add(file)
                else:
                    missing_files.add(file)
        if missing_files:
            raise IOError(missing_files)

        # Initialization to make IDE happy; used for plotting
        g_ts_list, g_rxn_list, qh_g_ts_list, qh_g_rxn_list = [], [], [], []
        g_temp = None
        h_ts_list, h_rxn_list, qh_h_ts_list, qh_h_rxn_list = [], [], [], []

        # now the calculations and printing
        print_mode = 'w'  # for the AEa output, so only prints header once, and then appends to file
        print_message = True
        tog_fname = None
        if options.tog_vibes:
            tog_fname = os.path.relpath(create_out_fname(options.output_fname, suffix='_vibes', ext='.dat'))

        results_dict = get_gauss_results(options, unique_fnames)
        for file_set in row_list:
            # the called method returns values needed for printing and plotting
            (temps, a, ea, kt, delta_h_ts, delta_h_rxn, delta_gibbs_ts, delta_gibbs_rxn,
             qh_a, qh_ea, qh_kt, qh_delta_h_ts, qh_delta_h_rxn, qh_delta_gibbs_ts,
             qh_delta_gibbs_rxn) = process_file_set(file_set, options, print_mode, results_dict, tog_fname)
            temp_index = get_temp_index(options.temp, temps)

            if REACT_PROD_SEP in file_set:
                # the rate columns are left blank for a set that includes the separator
                k_temp = qh_k_temp = ""
            else:
                k_temp = rounded_at(kt, temp_index)
                qh_k_temp = rounded_at(qh_kt, temp_index)
            g_temp = temps[temp_index]
            g_ts = rounded_at(delta_gibbs_ts, temp_index)
            g_rxn = rounded_at(delta_gibbs_rxn, temp_index)
            qh_g_ts = rounded_at(qh_delta_gibbs_ts, temp_index)
            qh_g_rxn = rounded_at(qh_delta_gibbs_rxn, temp_index)
            h_ts = rounded_at(delta_h_ts, temp_index)
            h_rxn = rounded_at(delta_h_rxn, temp_index)
            if options.quasiharmonic:
                qh_h_ts = rounded_at(qh_delta_h_ts, temp_index)
                qh_h_rxn = rounded_at(qh_delta_h_rxn, temp_index)
            else:
                qh_h_ts, qh_h_rxn = 0, 0  # So don't use an undefined variable below

            print_results(a, ea, qh_a, qh_ea, g_temp, k_temp, g_ts, g_rxn, qh_k_temp, qh_g_ts, qh_g_rxn,
                          file_set, options.output_fname, print_mode, print_message=print_message)

            if options.plot:
                g_ts_list.append(g_ts)
                g_rxn_list.append(g_rxn)
                qh_g_ts_list.append(qh_g_ts)
                qh_g_rxn_list.append(qh_g_rxn)
                h_ts_list.append(h_ts)
                h_rxn_list.append(h_rxn)
                if options.quasiharmonic:
                    qh_h_ts_list.append(qh_h_ts)
                    qh_h_rxn_list.append(qh_h_rxn)
            print_mode = 'a'
            print_message = False

        if options.plot:
            # file name suffix, values to plot, and any extra keyword arguments for each plot, in order
            plot_specs = [
                ('_g', g_ts_list, g_rxn_list, {}),
                ('_g_qh', qh_g_ts_list, qh_g_rxn_list, {}),
                ('_h', h_ts_list, h_rxn_list, {'var': 'H'}),
            ]
            if options.quasiharmonic:
                plot_specs.append(('_h_qh', qh_h_ts_list, qh_h_rxn_list, {'var': 'H'}))
            for suffix, ts_vals, rxn_vals, extra_kwargs in plot_specs:
                plot_fname = create_out_fname(options.output_fname, suffix=suffix, ext='.png')
                plot_delta(plot_fname, g_temp, ts_vals, rxn_vals, options.plot_labels, **extra_kwargs)

    except IOError as e:
        warning("Problems reading file:", e)
        return IO_ERROR
    except InvalidDataError as e:
        warning("Problems reading data:", e)
        return INVALID_DATA

    return GOOD_RET  # success