def read_cfg(f_loc, cfg_proc=process_cfg):
    """
    Load the configuration file and resolve a template file for every listed job.

    The job list entries are split on ';' into threads and on ',' into the jobs of each thread.
    For every non-empty job name, the template is the value stored under that job name in the
    configuration (if present), otherwise '<job>.tpl'.

    :param f_loc: The location of the file to read.
    :param cfg_proc: The processor to use for the raw configuration values of the main section.
    :return: A dict of the processed configuration data, including the configuration file
        location, the parsed job lists (lists of lists of str), and a job-to-template dict.
    :raises IOError: if the configuration file could not be read
    :raises InvalidDataError: if a job template or the submit template is not an existing file
    """
    parser = ConfigParser()
    if not parser.read(f_loc):
        raise IOError('Could not read file {}'.format(f_loc))

    cfg = cfg_proc(dict(parser.items(MAIN_SEC)), DEF_CFG_VALS, REQ_KEYS, int_list=False,
                   store_extra_keys=True)
    cfg[CONFIG_FILE] = f_loc
    cfg[TPL_DICT] = {}

    job_names = []
    for list_key in (JOB_LIST, FOLLOW_JOBS_LIST):
        raw_list = cfg[list_key]
        parsed_threads = []
        # a missing entry (None) is different from an empty string, which still yields [['']]
        if raw_list is not None:
            for raw_thread in raw_list.split(';'):
                thread_jobs = [name.strip() for name in raw_thread.split(',')]
                parsed_threads.append(thread_jobs)
                job_names.extend(thread_jobs)
        cfg[list_key] = parsed_threads

    for job_name in job_names:
        if job_name == '':
            continue
        # a key matching the job name lets the user point to a specific template
        tpl_path = cfg[job_name] if job_name in cfg else job_name + '.tpl'
        if not os.path.isfile(tpl_path):
            raise InvalidDataError(
                "For job '{}', could not find a template file '{}'\n"
                "You may specify the template to use (including path) using {} as a key in the "
                "config file.".format(job_name, tpl_path, job_name))
        cfg[TPL_DICT][job_name] = tpl_path

    submit_tpl = cfg[JOB_RUN_TPL]
    if not os.path.isfile(submit_tpl):
        raise InvalidDataError("Could not find the submit template '{}'".format(submit_tpl))

    return cfg
def collect_output_scan_steps(check_file_list):
    """
    Gather scan values from one or two Gaussian output files.

    A single scan is returned as read. Two scans are stitched together only when they begin at
    the same scan value (within 0.002) and proceed in opposite directions; the shared starting
    point is kept once.

    :param check_file_list: iterable of file names to search for scan data
    :return: a 2D numpy array with the scan values (column 0) and the energies relative to the
        lowest energy found (column 1), the latter scaled by EHPART_TO_KCAL_MOL
    :raises InvalidDataError: if no scan data is found, if more than two files contain scan
        data, or if two scans cannot be combined
    """
    scans = []
    for log_fname in check_file_list:
        scan_dict = process_gausslog_file(log_fname, collect_scan_steps=True)[SCAN_DICT]
        if len(scan_dict) > 0:
            scans.append(np.array(list(scan_dict.items()), dtype=float))

    num_scans = len(scans)
    if num_scans == 0:
        raise InvalidDataError("No scan information found.")
    if num_scans > 2:
        raise InvalidDataError(
            "The program can't yet handle this number of files. Please open an issue."
        )

    if num_scans == 1:
        combined = scans[0]
    else:
        scan_a, scan_b = scans
        direction_a = process_scan_array(scan_a)
        direction_b = process_scan_array(scan_b)
        # scans in two directions share their first entry
        if not abs(scan_a[0][0] - scan_b[0][0]) < 0.002:
            raise InvalidDataError(
                "The program cannot currently handle these files. Check input, and if correct, "
                "please open an issue on github.")
        if direction_a < 0 < direction_b:
            scan_a = np.flip(scan_a, 0)
            combined = np.vstack((scan_a[:-1, :], scan_b))
        elif direction_a > 0 > direction_b:
            scan_b = np.flip(scan_b, 0)
            combined = np.vstack((scan_b[:-1, :], scan_a))
        else:
            raise InvalidDataError("Check how the scans are to be combined.")

    # shift so the lowest energy is zero, then convert units
    lowest_energy = np.min(combined[:, 1])
    combined[:, 1] = (combined[:, 1] - lowest_energy) * EHPART_TO_KCAL_MOL
    return combined
def validate_decimal_input(input_accuracy, option_str):
    """
    Make sure that the provided input is within the allowable range, that 1 is a whole multiple
    of it, and determine its number of decimal places.

    :param input_accuracy: str or float
    :param option_str: the option corresponding to the input, to allow a more helpful error message
    :return: input_accuracy as float, num_decimals_accuracy as int
    :raises InvalidDataError: if the value cannot be converted to a float, is outside
        [1e-9, 1], or does not evenly divide 1
    """
    from decimal import Decimal

    max_val = 1.
    min_val = 0.000000001
    tolerance = 1e-6
    try:
        input_accuracy = float(input_accuracy)
        if input_accuracy < min_val or input_accuracy > max_val:
            raise ValueError
        # 1 must be a whole multiple of the value. Testing the reciprocal avoids the float
        # artifacts of '1. % x' (e.g. 1. % 0.2 == 0.19999999999999996); round() also raises
        # ValueError for NaN, which is then reported like any other invalid value.
        num_steps = 1. / input_accuracy
        if abs(num_steps - round(num_steps)) >= tolerance:
            raise ValueError
        # Count decimals from the decimal exponent rather than by searching for '.', because
        # small floats are printed in scientific notation (str(0.00001) == '1e-05').
        decimal_exponent = Decimal(repr(input_accuracy)).as_tuple().exponent
        num_decimals_accuracy = max(-decimal_exponent, 0)
    except (TypeError, ValueError):
        raise InvalidDataError(f"Read '{input_accuracy}' for the {option_str} option.\n"
                               f" This tolerance must be a non-negative fraction of 1 (1 must be a multiple of the "
                               f"tolerance), between {min_val} and {max_val}.")
    return input_accuracy, num_decimals_accuracy
def read_cfg(f_loc, cfg_proc=process_cfg):
    """
    Load the configuration file and assemble the list of Gaussian input files to process.

    File names come from each line of the list file (when that file exists), followed by the
    single file entry (when it is not None).

    :param f_loc: The location of the file to read.
    :param cfg_proc: The processor to use for the raw configuration values of the main section.
    :return: A dict of the processed configuration file's data, with the collected file names
        stored under GAUSSCOM_FILES.
    :raises IOError: if the configuration file could not be read
    :raises InvalidDataError: if no file names were collected
    """
    parser = ConfigParser()
    if not parser.read(f_loc):
        raise IOError('Could not read file {}'.format(f_loc))
    cfg = cfg_proc(dict(parser.items(MAIN_SEC)), DEF_CFG_VALS, REQ_KEYS)

    # To fix; have this as default!
    com_files = []
    cfg[GAUSSCOM_FILES] = com_files

    list_fname = cfg[GAUSSCOM_FILES_FILE]
    if os.path.isfile(list_fname):
        with open(list_fname) as list_file:
            com_files.extend(entry.strip() for entry in list_file)

    single_file = cfg[GAUSSCOM_FILE]
    if single_file is not None:
        com_files.append(single_file)

    if not com_files:
        raise InvalidDataError("No files to process: no '{}' specified and "
                               "no list of files found for: {}".format(GAUSSCOM_FILE,
                                                                       cfg[GAUSSCOM_FILES_FILE]))
    return cfg
def rotate_dihes_pdb_files(cfg, gau_tpl_content, pdb_files):
    """
    If dihedral data is specified, use RDKit to rotate specified dihedrals

    For each dihedral entry, every conformation collected so far is rotated in steps of the
    given increment until a full turn would be completed. Rotations are applied to copies, so
    the starting conformations stay unchanged and remain in the list.

    :param cfg: dict of configuration input; each cfg[DIH_DATA] entry holds four atom ids
        followed by the rotation increment in degrees. The ids are used directly as RDKit atom
        indices (zero-based).
    :param gau_tpl_content: dict of data to create gaussian input files
    :param pdb_files: list of pdb files with dihedral angles to be rotated
    :return: n/a, saves new Gaussian input files
    :raises InvalidDataError: if a PDB file cannot be read, an atom id exceeds the number of
        atoms, or a rotation increment is zero
    """
    for pdb_file in pdb_files:
        mol_orig = MolFromPDBFile(pdb_file, removeHs=False)
        if mol_orig is None:
            raise InvalidDataError(f"Could not read a molecule from the PDB file "
                                   f"{os.path.relpath(pdb_file)}")
        all_confs = [mol_orig]
        num_atoms = mol_orig.GetNumAtoms()
        for dih in cfg[DIH_DATA]:
            atom_ids = dih[:4]
            rot_deg = dih[4]
            max_id = max(atom_ids)
            # ids are zero-based here, so the largest valid id is num_atoms - 1; report the
            # one-based id, since that is the convention used in the error message to the user
            if max_id >= num_atoms:
                raise InvalidDataError(
                    f"Dihedral rotation specifies an atom id of {max_id + 1}, while only {num_atoms} "
                    f"atoms were found in the PBD file {os.path.relpath(pdb_file)}"
                )
            if rot_deg == 0:
                raise InvalidDataError("The dihedral rotation increment must be non-zero.")
            # the starting angle is already present, so one fewer than a full turn of steps
            num_rotations = int(round(360. / rot_deg, 0) - 1)
            new_confs = []
            for current_mol in all_confs:
                start_deg = GetDihedralDeg(current_mol.GetConformer(0), *atom_ids)
                for step in range(1, num_rotations + 1):
                    # rotate a copy: changing current_mol itself would discard its starting
                    # angle and leave it identical to the last rotated copy
                    rotated_mol = current_mol.__copy__()
                    SetDihedralDeg(rotated_mol.GetConformer(0), *atom_ids, start_deg + step * rot_deg)
                    new_confs.append(rotated_mol)
            all_confs.extend(new_confs)
        create_coms_from_mol_list(all_confs, gau_tpl_content, pdb_file, cfg[MAX_CONF], cfg[ORIGINAL])
def __init__(self, unit_type, i):
    """
    Constructor for the Monomer class, which sets the properties depending on what monolignol
    is being represented.

    Only the unit type and the unique integer identifier are supplied. The active position
    starts at 0 (not yet oxidized). The open positions are {4, 5, 8} for G and C units and
    {4, 8} for S units. The set of connected monomers starts out holding only this monomer's
    own identity.

    Example calls are below:
        mon = Monomer(G, 0)  # G unit with ID = 0
        mon = Monomer(S, 0)  # S unit with ID = 0 (not recommended to repeat IDs)
        mon = Monomer(C, n)  # C unit with ID = n

    :param unit_type: str, monomer type
    :param i: int, unique identifier for the monomer
    :raises InvalidDataError: if unit_type is not one of G, S, or C
    """
    self.identity = i
    self.type = unit_type
    # Active position: 0 means the monomer is not yet activated (not yet oxidized);
    # -1 means it is inactive and cannot become active (cannot be oxidized)
    self.active = 0
    if unit_type == S:
        self.open = {4, 8}
    elif unit_type in (G, C):
        self.open = {4, 5, 8}
    else:
        raise InvalidDataError(f"Encountered unit type {unit_type}, but only the following types are "
                               f"currently available: 'G' ({G}), 'S' ({S}), 'C' ({C})")
    self.connectedTo = {i}
def process_gausscom_file(gausscom_file):
    """
    Read a Gaussian input file into a dict.

    The returned dict has the keys:
        SEC_HEAD: header lines (everything through the charge and multiplicity line)
        CHARGE: overall charge (only) as int
        MULT: overall multiplicity (only) as int
        SEC_ATOMS: atoms as a dict of dicts, with atom_id (starting at 1) as key to a dict with
            ATOM_TYPE: atom_type (str), ATOM_COORDS: (np array)
        SEC_TAIL: everything including and after the blank line following SEC_ATOMS
        BASE_NAME: the result of get_fname_root for the file name

    :param gausscom_file: str, location of the Gaussian input file
    :return: dict as described above
    :raises InvalidDataError: if the line expected to hold charge and multiplicity does not
        start with two integers (including when the file ends before that line)
    """
    with open(gausscom_file) as d:
        gausscom_content = {
            SEC_HEAD: [],
            SEC_ATOMS: {},
            SEC_TAIL: [],
            BASE_NAME: get_fname_root(gausscom_file)
        }
        section = SEC_HEAD
        atom_id = 1
        blank_header_lines = 0

        for line in d:
            line = line.strip()
            if section == SEC_HEAD:
                gausscom_content[SEC_HEAD].append(line)
                if GAU_HEADER_PAT.match(line):
                    continue
                elif len(line) == 0:
                    blank_header_lines += 1
                    # the charge/multiplicity line directly follows the second blank line
                    if blank_header_lines == 2:
                        section = SEC_ATOMS
                        # default to '' so a truncated file is reported as invalid data below
                        # rather than leaking StopIteration out of this function
                        line = next(d, "").strip()
                        gausscom_content[SEC_HEAD].append(line)
                        split_line = line.split()
                        try:
                            gausscom_content[CHARGE] = int(split_line[0])
                            gausscom_content[MULT] = int(split_line[1])
                        except (IndexError, ValueError):
                            raise InvalidDataError(
                                "Error in reading file {}\n as a Gaussian input file. On the line "
                                "where charge and multiplicity are expected, "
                                "found: '{}'".format(gausscom_file, line))

            elif section == SEC_ATOMS:
                # a blank line ends the atom section and is kept as the first tail line
                if len(line) == 0:
                    section = SEC_TAIL
                    gausscom_content[SEC_TAIL].append(line)
                    continue
                split_line = line.split()
                atom_type = split_line[0]
                atom_xyz = np.array(list(map(float, split_line[1:4])))
                gausscom_content[SEC_ATOMS][atom_id] = {
                    ATOM_TYPE: atom_type,
                    ATOM_COORDS: atom_xyz
                }
                atom_id += 1

            elif section == SEC_TAIL:
                gausscom_content[SEC_TAIL].append(line)

    return gausscom_content
def check_input(args, cfg):
    """
    Apply command-line overrides to the configuration, check option combinations, and make
    sure the output directory exists.

    :param args: parsed command-line arguments; reads file, list, tpl, out_dir, out_fname,
        only_first, and only_final
    :param cfg: dict of configuration values, updated in place
    :return: n/a
    :raises InvalidDataError: if logs are to be combined but no output file name is set
    """
    # override config entries if command-line options used
    value_overrides = (
        (GAUSSLOG_FILE, args.file),
        (GAUSSLOG_FILES_FILE, args.list),
        (PDB_TPL_FILE, args.tpl),
        (OUT_BASE_DIR, args.out_dir),
        (OUTFILE_NAME, args.out_fname),
    )
    for cfg_key, arg_value in value_overrides:
        if arg_value:
            cfg[cfg_key] = arg_value
    if args.only_first:
        cfg[ONLY_FIRST] = True
    if args.only_final:
        cfg[ONLY_FINAL] = True

    # checking
    if cfg[COMBINE_LOGS] and not cfg[OUTFILE_NAME]:
        raise InvalidDataError("When combining outputs from multiple log files into one pdb, specify the output "
                               "file name")
    if cfg[COMBINE_LOGS] and not cfg[ONLY_FINAL]:
        warning("When combining outputs from multiple log files into one pdb, only the last coordinates of each "
                "log file will be kept.")
        cfg[ONLY_FINAL] = True

    if cfg[OUT_BASE_DIR]:
        # exist_ok avoids the gap between checking for the directory and creating it
        os.makedirs(cfg[OUT_BASE_DIR], exist_ok=True)
def main(argv=None):
    """
    Entry point for gausslog2com: create Gaussian input files from Gaussian output files and
    a template.

    :param argv: list of command-line arguments, passed on to parse_cmdline
    :return: the return code from parsing if parsing did not succeed; otherwise GOOD_RET on
        success, IO_ERROR for file problems, or INVALID_DATA for data problems
    """
    print(
        f"Running GaussianWrangler script gausslog2com version {__version__}")

    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    exit_code = GOOD_RET  # success unless a handler below says otherwise
    try:
        # Make sure there are files to process
        gausslog_files = check_for_files(args.file, args.list)

        # and a template file to process
        if not args.tpl:
            raise InvalidDataError("No template file ('-t' option) specified")
        if not os.path.isfile(args.tpl):
            raise IOError(args.tpl)

        # Read template and data files
        com_tpl_content = process_gausscom_tpl(args.tpl, args.charge_from_tpl)
        process_gausslog_files(gausslog_files, com_tpl_content, args.charge_from_tpl, args.low_energy,
                               args.step_num, args.out_dir, args.output_fname)
    except IOError as e:
        warning("Problems reading file:", e)
        exit_code = IO_ERROR
    except (InvalidDataError, UnicodeDecodeError) as e:
        warning("Problems reading data:", e)
        exit_code = INVALID_DATA

    return exit_code
def subtract_blank_data(blank_data, ms_data, ms_tol, ret_type=2):
    """
    Subtract blank intensities from the intensities of matching peaks, walking both data sets
    in step. Since looking at differences (not making a unique list) this method does not need
    mz's to be rounded first.

    The walk only advances forward, so it presumably expects both inputs sorted by increasing
    M/Z -- TODO confirm against callers.

    :param blank_data: indexable rows with M/Z in position 0 and intensity in position 1
    :param ms_data: indexable rows with M/Z in position 0 and intensity in position 1; the
        intensities are modified in place
    :param ms_tol: float, maximum M/Z difference for two peaks to be considered matching
    :param ret_type: int, 0 for matched MZ in retention time; 2 for matched retention time but
        no matched MZ
    :return: ms_data (the same object that was passed in) and ret_type, which is set to 0 if
        any peak matched
    :raises InvalidDataError: for an IndexError whose message does not contain "out of bounds"
    """
    blank_counter = 0
    ms_counter = 0
    num_blank = len(blank_data)
    num_ms = len(ms_data)
    # add a little buffer so machine precision doesn't make it not match when it should
    match_tol = ms_tol * (1 + ms_tol)
    try:
        while ms_counter < num_ms and blank_counter < num_blank:
            blank_mz = blank_data[blank_counter][0]
            ms_mz = ms_data[ms_counter][0]
            if abs(blank_mz - ms_mz) <= match_tol:
                ms_data[ms_counter][1] = ms_data[ms_counter][1] - blank_data[blank_counter][1]
                ret_type = 0
                blank_counter += 1
            elif ms_mz < blank_mz:
                ms_counter += 1
            else:
                blank_counter += 1
                # Fast-forward past blank peaks too small to match the current peak. The
                # cutoff uses the same buffered tolerance as the match test above, so a peak
                # that would match is never skipped, and the explicit bound means running
                # off the end of blank_data no longer depends on catching an IndexError.
                min_blank_val = ms_mz - match_tol
                while blank_counter < num_blank and blank_data[blank_counter][0] < min_blank_val:
                    blank_counter += 1
        return ms_data, ret_type
    except IndexError as e:
        # kept for backward compatibility with the original handling of index errors
        if "out of bounds" in e.args[0]:
            return ms_data, ret_type
        else:
            raise InvalidDataError("Error in subtract_blank_data method")
def smi_to_formula(smi_str):
    """
    Given a SMILES string in arbitrary format, produce the RDKit (kekulized) SMILES string and
    data derived from the molecular formula, using only the first listed isotope mass for each
    element in LIGNIN_ISOTOPE_DICT.

    :param smi_str: str, standard SMILES format
    :return: tuple of: the SMILES string as produced by RDKit, the molecular formula, the
        molecular mass, the mass minus one hydrogen, the mass plus one hydrogen (each mass
        rounded to MAX_SIG_FIGS decimal places), and the result of calc_dbe for the formula
    :raises InvalidDataError: if RDKit cannot interpret the SMILES string
    """
    # Use RDKit to make a SMILES from a SMILES so that we get a unique string for any given SMILES entry
    rd_mol = Chem.MolFromSmiles(smi_str)
    if mol_missing := rd_mol is None:
        raise InvalidDataError(
            f"The input SMILES string '{smi_str}' could not be recognized by RDKit"
        )
    Chem.Kekulize(rd_mol)
    rd_smi = Chem.MolToSmiles(rd_mol, kekuleSmiles=True)

    formula = CalcMolFormula(rd_mol)
    stoich = parse_stoich(formula)
    dbe = calc_dbe(stoich)

    total_mass = 0
    for element, count in stoich.items():
        total_mass += LIGNIN_ISOTOPE_DICT[element][MASS][0] * count

    hydrogen_mass = LIGNIN_ISOTOPE_DICT[HYDROG][MASS][0]
    mw_deprot = round(total_mass - hydrogen_mass, MAX_SIG_FIGS)
    mw_prot = round(total_mass + hydrogen_mass, MAX_SIG_FIGS)
    rounded_mass = round(total_mass, MAX_SIG_FIGS)
    return rd_smi, formula, rounded_mass, mw_deprot, mw_prot, dbe
def check_and_print(cfg, atom_id, pdb_tpl_content, gausslog_file, pdb_data_section, f_name, mode, message):
    """
    Verify the atom count against the PDB template (when one is configured), then write the
    template head, the data section, and the template tail to file.

    :param cfg: dict of configuration values; only cfg[PDB_TPL_FILE] is read
    :param atom_id: int, the number of atoms found in the gausslog file
    :param pdb_tpl_content: dict with the template's NUM_ATOMS, SEC_HEAD, and SEC_TAIL entries
    :param gausslog_file: str, name of the gausslog file, used in the error message
    :param pdb_data_section: list of data rows placed between the template head and tail
    :param f_name: str, the name of the file to write
    :param mode: the file mode passed on to list_to_file
    :param message: passed on to list_to_file as print_message
    :return: n/a
    :raises InvalidDataError: if a template is configured and its atom count differs from atom_id
    """
    # Check Num atoms and print
    if cfg[PDB_TPL_FILE]:
        if atom_id != pdb_tpl_content[NUM_ATOMS]:
            # the message previously ended in "atomsatoms" because of a stray adjacent literal
            raise InvalidDataError('In gausslog file: {}\nfound {} atoms, while the pdb template has {} '
                                   'atoms'.format(gausslog_file, atom_id, pdb_tpl_content[NUM_ATOMS]))
    list_to_file(pdb_tpl_content[SEC_HEAD] + pdb_data_section + pdb_tpl_content[SEC_TAIL],
                 f_name, list_format=PDB_FORMAT, mode=mode, print_message=message)
def find_common_ret_times(blank_data_array, ms_data_array, ret_time_accuracy, num_decimals_ms_accuracy, ppm_threshold): """ Facilitates comparing two arrays including ret time by making them into dicts with the ret times (as strings) as keys. Keeps all the ms run data in the dict, but only the data for common ret times from the blank data :param blank_data_array: ndarray with m/z, intensity, and retention time :param ms_data_array: ndarray with m/z, intensity, and retention time :param ret_time_accuracy: float, accuracy used to determine if retention times are significantly different :param num_decimals_ms_accuracy: int, number of decimal points in MS accuracy, for rounding :param ppm_threshold: str, the tolerance (in ppm) to consider two M/Z values identical :return: lists of retention times and dicts with the retention times as keys and ndarrays of peak and intensity data only for that retention time """ # Already rounded before getting here, so not rounding again here blank_ret_times = np.unique(blank_data_array[:, 2]) ms_ret_times = np.unique(ms_data_array[:, 2]) # because of accuracy tolerance, not just using np.unique, but iterating to find those within tolerance common_ret_times = [] ms_ret_time_dict = {} blank_ret_time_dict = {} blank_ret_time_counter = 0 # want to keep all the ms run data (in a dict to make it easier to match up to blank data), but we don't # need to keep all the blank data float_error_factor = 1.000001 for ret_time in ms_ret_times: ret_time_str = str(ret_time) # grabbing sub array ms_ret_time_dict[ret_time_str] = ms_data_array[ms_data_array[:, 2] == ret_time] if blank_ret_time_counter < len(blank_ret_times): try: while ret_time > blank_ret_times[blank_ret_time_counter] * float_error_factor: blank_ret_time_counter += 1 diff = abs(ret_time - blank_ret_times[blank_ret_time_counter]) # the multiplication below is needed because of machine precision error in storing floats if diff <= ret_time_accuracy: common_ret_times.append(ret_time_str) 
blank_ret_time = blank_ret_times[blank_ret_time_counter] # grabbing sub array sub_array = blank_data_array[blank_data_array[:, 2] == blank_ret_time] if ret_time_str in blank_ret_time_dict: # this is very unlikely to happen, but I don't mind checking for edge cases blank_ret_time_dict[ret_time_str] = np.concatenate((blank_ret_time_dict[ret_time_str], sub_array), axis=0) blank_ret_time_dict[ret_time_str][:, 2] = np.nan blank_ret_time_dict[ret_time_str] = \ trim_close_mz_vals(blank_ret_time_dict[ret_time_str], num_decimals_ms_accuracy, ppm_threshold, ret_time_accuracy, len(blank_ret_time_dict[ret_time_str])) else: blank_ret_time_dict[ret_time_str] = sub_array except IndexError as e: if "out of bounds" in e.args[0]: continue else: raise InvalidDataError(e.args[0]) return common_ret_times, blank_ret_time_dict, ms_ret_time_dict
def main(argv=None):
    """
    Entry point for gausslog2pdb: create PDB files from Gaussian output files.

    :param argv: list of command-line arguments, passed on to parse_cmdline
    :return: the return code from parsing if parsing did not succeed; otherwise GOOD_RET on
        success, IO_ERROR for file problems, or INVALID_DATA for data problems
    """
    print(f"Running GaussianWrangler script gausslog2pdb version {__version__}")

    # Read input
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    cfg = args.config
    exit_code = GOOD_RET  # success unless a handler below says otherwise

    # Read template and data files
    try:
        check_input(args, cfg)

        # set up list of files to process
        log_files = []
        cfg[GAUSSLOG_FILES] = log_files
        if os.path.isfile(cfg[GAUSSLOG_FILES_FILE]):
            with open(cfg[GAUSSLOG_FILES_FILE]) as list_file:
                log_files.extend(entry.strip() for entry in list_file)
        if cfg[GAUSSLOG_FILE] is not None:
            log_files.append(cfg[GAUSSLOG_FILE])
        if not log_files:
            raise InvalidDataError("No files to process: no '{}' specified and "
                                   "no list of files found for: {}".format(GAUSSLOG_FILE,
                                                                           cfg[GAUSSLOG_FILES_FILE]))
        if cfg[ONLY_FIRST] and cfg[ONLY_FINAL]:
            raise InvalidDataError("Cannot specify both '{}' and '{}'".format(ONLY_FIRST, ONLY_FINAL))

        # now start the actual work
        pdb_tpl_content = process_pdb_file(cfg[PDB_TPL_FILE]) if cfg[PDB_TPL_FILE] else {}
        process_gausscom_files(cfg, pdb_tpl_content)
    except (IOError, UnicodeDecodeError) as e:
        warning("Problems reading file:", e)
        exit_code = IO_ERROR
    except InvalidDataError as e:
        warning("Problems reading data:", e)
        exit_code = INVALID_DATA

    return exit_code
def process_input_file(input_fname, mw_formula_dict, mw_deprot_formula_dict, mw_prot_formula_dict,
                       form_smi_dict, form_dbe_dict, smi_name_dict, smi_source_dict):
    """
    Read the file line by line and hand each entry to add_smi_to_dicts, which updates the
    dictionaries.

    Each non-blank line is split on SEP_KEY into up to three entries: a SMILES string
    (required), molecule name(s), and a data source description. Lines without a SMILES
    string are skipped with a warning.

    :param input_fname: str, location of the file to read
    :param mw_formula_dict: dict passed on to add_smi_to_dicts
    :param mw_deprot_formula_dict: dict passed on to add_smi_to_dicts
    :param mw_prot_formula_dict: dict passed on to add_smi_to_dicts
    :param form_smi_dict: dict passed on to add_smi_to_dicts
    :param form_dbe_dict: dict passed on to add_smi_to_dicts
    :param smi_name_dict: dict passed on to add_smi_to_dicts
    :param smi_source_dict: dict passed on to add_smi_to_dicts
    :return: the number of entries that were added to the dictionaries
    :raises InvalidDataError: if a line has more than three entries
    """
    rel_path_name = os.path.relpath(input_fname)
    num_added = 0
    with open(input_fname) as in_file:
        for raw_line in in_file:
            stripped_line = raw_line.strip()
            if not stripped_line:
                continue
            entries = [field.strip() for field in stripped_line.split(SEP_KEY)]

            # if there is no SMILES str, there is no way to properly add any data to the library
            if not entries[0]:
                warning(
                    f"In reading file: {rel_path_name}\n Line: '{stripped_line}'\n does not "
                    f"provide a SMILES string as the first '|'-separated entry. This line will be skipped."
                )
                continue

            if len(entries) > 3:
                rel_path = rel_path_name
                raise InvalidDataError(
                    f"Error while reading: {rel_path}\n line: '{stripped_line}'\n"
                    f" Expected no more than 3 comma-separated values: \n SMILES "
                    f"string (only one per line),\n molecule name(s) (separate "
                    f"multiple names with semicolons),\n string description of the "
                    f"data source (with no commas or semicolons)")

            # the name and source entries are optional; pad with blank strings when absent
            entries += [""] * (3 - len(entries))
            smi_str, mol_name, mol_source = entries

            # being explicit in separating out the entries; do not change global variables
            was_added = add_smi_to_dicts(mw_formula_dict, mw_deprot_formula_dict, mw_prot_formula_dict,
                                         form_smi_dict, form_dbe_dict, smi_name_dict, smi_source_dict,
                                         smi_str, mol_name=mol_name, mol_source=mol_source)
            if was_added:
                num_added += 1

    new_entries = num_added
    print(
        f"Completed reading file: {rel_path_name}\n Added {new_entries} entries to the dictionaries\n"
    )
    return new_entries
def validate_input(args):
    """
    Checks for valid command-line input and performs any required casting. The converted
    values and the derived settings are stored back on args.

    :param args: command-line input and default values for program options
    :return: n/a
    :raises InvalidDataError: if the threshold, an accuracy value, or the minimum relative
        intensity is not valid
    """
    # '-d', '-e', '-f', and 'l' skipped: they are already the required type (str) and validation performed as
    #     part of the function that looks for files to process
    # '-s' skipped: Boolean will be returned by argparse, and an error if the user tries to give it a value

    # threshold (ppm): stored as float before the range check so the message shows the float
    try:
        # if already a float, no problem
        args.threshold = float(args.threshold)
        if args.threshold < 0 or args.threshold > 1000:
            raise ValueError
    except ValueError:
        raise InvalidDataError(f"Read '{args.threshold}' for the threshold value (in ppm; '-t' option) for matching "
                               f"M/Z to MW. \n This must be a non-negative number, no greater than 1000.")

    ms_result = validate_decimal_input(args.ms_accuracy, "'-a'/'--ms_accuracy'")
    args.ms_accuracy, args.num_decimals_ms_accuracy = ms_result
    ret_time_result = validate_decimal_input(args.ret_time_accuracy, "'-r'/'--ret_time_accuracy'")
    args.ret_time_accuracy, args.num_decimals_ret_time_accuracy = ret_time_result

    # output columns: M/Z, intensity, and (unless the CSVs are unlabeled) retention time
    fmt_fields = [f'%.{args.num_decimals_ms_accuracy}f', '%.0f']
    if args.unlabeled_csvs:
        # When unlabeled CSVS are chosen, peaks should be combined as with direct injection
        args.direct_injection = True
    else:
        fmt_fields.append(f'%.{args.num_decimals_ret_time_accuracy}f')
    args.numpy_save_fmt = ",".join(fmt_fields)

    try:
        args.min_rel_intensity = float(args.min_rel_intensity)
        if args.min_rel_intensity < 0 or args.min_rel_intensity > 100:
            raise ValueError
    except ValueError:
        raise InvalidDataError(f"Read {args.min_rel_intensity}% for the minimum relative intensity (percent of "
                               f"the maximum intensity required\n for peak to be analyzed; "
                               f"'-m' option). This must be a non-negative number, no greater than 100.")
def read_cfg(floc, cfg_proc=process_cfg):
    """
    Load the configuration file, falling back to the default values when the file cannot be
    read, and parse the dihedral rotation entry.

    :param floc: The location of the file to read.
    :param cfg_proc: The processor to use for the raw configuration values of the main section.
    :return: A dict of the processed configuration data. DIH_DATA holds one list per dihedral:
        four zero-based integer atom ids followed by the rotation increment as a float.
    :raises InvalidDataError: if a dihedral entry does not have five values of the expected types
    """
    parser = ConfigParser()
    if parser.read(floc):
        cfg = cfg_proc(dict(parser.items(MAIN_SEC)), def_cfg_vals=DEF_CFG_VALS, req_keys=REQ_KEYS)
        if cfg[NUM]:
            cfg[NUM] = int(cfg[NUM])
    else:
        # no readable file: start from these two entries, then lay the defaults over them
        cfg = {GAU_TPL_FILE: None, CONFIG_NAME: floc}
        cfg.update(DEF_CFG_VALS.items())

    cfg[DIH_DATA] = []
    raw_dihedrals = cfg[DIH_ROT]
    if raw_dihedrals is not None:
        try:
            for entry in raw_dihedrals.split(";"):
                fields = entry.split(",")
                if len(fields) != 5:
                    raise IndexError
                # note: RDKit is zero-based with atom indices, thus subtracting one from each number
                atom_ids = [int(field) - 1 for field in fields[:4]]
                rot_increment = float(fields[4])
                cfg[DIH_DATA].append(atom_ids + [rot_increment])
        except (ValueError, IndexError):
            raise InvalidDataError(
                "Error in parsing dihedral entry. Enter multiple dihedrals by separating data "
                "with a semicolon (';'). Each dihedral should be specified with 5 values, were the "
                "first four are one-based integer atom ids, and the last value is the rotation "
                "increment in degrees. ")

    if cfg[MAX_CONF]:
        cfg[MAX_CONF] = int(cfg[MAX_CONF])

    return cfg
def process_pdb_files(cfg, gau_tpl_content):
    """
    Collect the PDB files named in the configuration and process each of them.

    The single PDB file entry (if set) must exist. Non-blank lines of the list file (if that
    file exists) are added after it. When dihedral data is configured, all files are handed to
    rotate_dihes_pdb_files; otherwise each file is handed to process_pdb_file.

    :param cfg: dict of configuration input
    :param gau_tpl_content: dict of data to create gaussian input files
    :return: n/a
    :raises IOError: if the single PDB file entry is set but is not an existing file
    :raises InvalidDataError: if no PDB files were collected
    """
    collected = []

    single_pdb = cfg[PDB_FILE]
    if single_pdb:
        if not os.path.isfile(single_pdb):
            raise IOError(single_pdb)
        collected.append(single_pdb)

    if os.path.isfile(cfg[PDB_LIST_FILE]):
        with open(cfg[PDB_LIST_FILE]) as list_file:
            stripped_names = [entry.strip() for entry in list_file.readlines()]
        collected.extend(name for name in stripped_names if len(name) > 0)

    if not collected:
        raise InvalidDataError("No pdb files found to process.")

    if cfg[DIH_DATA]:
        rotate_dihes_pdb_files(cfg, gau_tpl_content, collected)
        return
    for pdb_fname in collected:
        process_pdb_file(cfg, gau_tpl_content, pdb_fname)
def check_input_csv_header(fname):
    """
    Checks the first non-comment line of the specified file for the expected header.

    Lines starting with '#' at the top of the file are counted as header lines. If the first
    two entries of the first non-comment line convert to floats, the file is assumed to have
    no header.

    :param fname: str, the location of the file to check the header
    :return: num_header_lines, int: 1 by default, plus one for each leading comment line;
        0 if it appears that the header is missing
    :raises InvalidDataError: if the file appears to be blank or contains only comment lines
    """
    num_header_lines = 1
    potential_header = read_csv_header(fname)
    base_fname = os.path.relpath(fname)
    # also treat an empty header list as blank, rather than failing on the index below
    if not potential_header:
        raise InvalidDataError(f"Input file may be blank: {base_fname}")
    # A single pass is enough: re-checking in a loop could never make progress, and looped
    # forever for a file containing only comment lines.
    if potential_header[0].startswith("#"):
        with open(fname) as f:
            for row in f:
                if row.startswith("#"):
                    num_header_lines += 1
                else:
                    potential_header = [dequote(x) for x in row.strip().split(",")]
                    break
            else:
                raise InvalidDataError(f"Found only comment lines in file: {base_fname}")
    if potential_header != TYPICAL_CSV_HEADER and potential_header != CSV_RET_HEADER:
        try:
            # Still move on to reading values, but first check if there may not be a header
            if len(potential_header) > 1:
                # if right into values (that is, no trouble converting to float), continue to reading values
                float(potential_header[0])
                float(potential_header[1])
                num_header_lines = 0
                warning(f"No header found in file: {base_fname}\n Will attempt to read data as M/Z and intensity.")
            else:
                raise ValueError
        except ValueError:
            # check that the difference is not a trivial difference in case
            if (len(potential_header) in [2, 3]) and (potential_header[0].lower() == TYPICAL_CSV_HEADER[0].lower()) \
                    and (potential_header[1].lower() == TYPICAL_CSV_HEADER[1].lower()):
                pass
            else:
                warning(f"While reading file: {base_fname}\n Did not find the expected headers "
                        f"'{TYPICAL_CSV_HEADER}', but '{potential_header}'\n Will attempt to read data as M/Z, "
                        f"intensity, and, if there is a third column, retention time (in min).")
    return num_header_lines
def check_if_files_to_be_saved(cfg):
    """
    Evaluate input for requests to save output and check for valid specified locations

    Output formats are gathered from cfg[OUT_FORMAT_LIST] (separated by periods, commas, and/or
    spaces) and from the extension of cfg[BASENAME], if it has one. Any directory in
    cfg[BASENAME] is moved into cfg[OUT_DIR], leaving only the base name.

    :param cfg: dict of configuration values, updated in place
    :return: n/a; for each requested format, cfg[format] and cfg[SAVE_FILES] are set to True,
        and the output directory is created when files will be saved
    :raises InvalidDataError: if a requested format is not in OUT_TYPE_LIST
    """
    if cfg[OUT_FORMAT_LIST]:
        # remove any periods to aid comparison; might as well also change comma to space and then split on just space
        out_format_list = cfg[OUT_FORMAT_LIST].replace(".", " ").replace(",", " ")
        format_set = set(out_format_list.split())
    else:
        format_set = set()

    if cfg[BASENAME] and (cfg[BASENAME] != DEF_BASENAME):
        # If cfg[BASENAME] is not just the base name, make it so, saving a dir or ext in their spots
        out_path, base_name = os.path.split(cfg[BASENAME])
        if out_path and cfg[OUT_DIR]:
            cfg[OUT_DIR] = os.path.join(cfg[OUT_DIR], out_path)
        elif out_path:
            cfg[OUT_DIR] = out_path
        base, ext = os.path.splitext(base_name)
        cfg[BASENAME] = base
        # a base name without an extension must not register '' as a requested format,
        # which would be reported below as an invalid extension
        ext_type = ext.replace(".", "")
        if ext_type:
            format_set.add(ext_type)

    for format_type in format_set:
        if format_type in OUT_TYPE_LIST:
            cfg[SAVE_FILES] = True
            cfg[format_type] = True
        else:
            raise InvalidDataError(f"Invalid extension provided: '{format_type}'. The currently supported types "
                                   f"are: '{OUT_TYPE_STR}'")

    if cfg[PLOT_BONDS]:
        cfg[SAVE_FILES] = True

    # hand the output directory to make_dir only if we will actually need it
    if cfg[SAVE_FILES] and cfg[OUT_DIR]:
        make_dir(cfg[OUT_DIR])
def process_smiles(gau_tpl_fname, smi_list, max_num_confs, out_dir):
    """
    Write Gaussian input files for the conformers generated from each SMILES string
    https://www.rdkit.org/docs/GettingStartedInPython.html

    SMILES strings that RDKit cannot interpret are skipped with a warning.

    :param gau_tpl_fname: str, the location of the template file to use to create input files
    :param smi_list: list of SMILES strings
    :param max_num_confs: int, the maximum number of conformations to generate
    :param out_dir: str, directory where files are to be saved (if None, saves to working directory)
    :return: N/A, writes files and prints notes on files created
    :raises InvalidDataError: if the template does not contain REQ_STR
    """
    tpl_text = read_tpl(gau_tpl_fname)
    if REQ_STR not in tpl_text:
        raise InvalidDataError(
            f"Did not find the required string '{REQ_STR}' in the provided Gaussian input "
            f"template file.")

    for smi in smi_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            warning(f"Skipping SMILES input string '{smi}' due to error\n")
            continue
        Chem.Kekulize(mol)
        mol = AddHs(mol)
        conf_ids = gen_conformers(mol, num_confs=max_num_confs)
        base_fname = create_out_fname(get_mol_name(smi), ext='com', base_dir=out_dir, rel_path=True)

        # stays at -1 when no conformers were generated, so the count printed below is 0
        last_conf_id = -1
        for last_conf_id in conf_ids:
            com_fname = create_out_fname(base_fname, suffix=f'_{last_conf_id}')
            coord_list = get_pdb_coord_list(MolToPDBBlock(mol, confId=last_conf_id))
            tpl_vals = {ATOMS: "\n".join(coord_list)}
            fill_save_tpl(tpl_text, tpl_vals, gau_tpl_fname, com_fname, print_info=False)
        conf_id = last_conf_id
        print(f"Wrote {conf_id + 1} files with base name '{base_fname}'")
def produce_output(adj_matrix, mono_list, cfg):
    """
    Report the SMILES string and save the output files requested in the configuration.

    Unless SMILES output is suppressed and no JSON, PNG, or SVG file is requested, an RDKit
    molecule is built and its SMILES string is either saved (cfg[SAVE_SMI]) or printed.
    Afterwards, a file is written for each requested format (TCL, JSON, PNG, SVG).

    :param adj_matrix: passed on to generate_mol and gen_tcl
    :param mono_list: passed on to generate_mol and gen_tcl
    :param cfg: dict of configuration values
    :return: n/a; prints and/or writes files
    :raises InvalidDataError: if a SMILES string cannot be produced
    """
    if cfg[SUPPRESS_SMI] and not (cfg[SAVE_JSON] or cfg[SAVE_PNG] or cfg[SAVE_SVG]):
        format_list = [SAVE_TCL]
        mol = None  # Make IDE happy
    else:
        # Default out is SMILES, which requires getting an rdKit molecule object; also required for everything
        #    except the TCL format
        format_list = [SAVE_TCL, SAVE_JSON, SAVE_PNG, SAVE_SVG]
        block = generate_mol(adj_matrix, mono_list)
        mol = MolFromMolBlock(block)
        try:
            smiles = MolToSmiles(mol)
        except Exception as e:
            # 'Exception' rather than a bare except, so that KeyboardInterrupt and SystemExit
            # are not reported as data errors
            raise InvalidDataError("Error in producing SMILES string.") from e
        # if SMI is to be saved, don't output to stdout
        if cfg[SAVE_SMI]:
            fname = create_out_fname(cfg[BASENAME], base_dir=cfg[OUT_DIR], ext=SAVE_SMI)
            str_to_file(smiles + '\n', fname, print_info=True)
        else:
            print("\nSMILES representation: \n", smiles, "\n")
        if cfg[SAVE_PNG] or cfg[SAVE_SVG] or cfg[SAVE_JSON]:
            # PNG and SVG make 2D images and thus need coordinates
            # JSON will save coordinates--zero's if not computed; might as well compute and save non-zero values
            Compute2DCoords(mol)

    for save_format in format_list:
        if cfg[save_format]:
            fname = create_out_fname(cfg[BASENAME], base_dir=cfg[OUT_DIR], ext=save_format)
            if save_format == SAVE_TCL:
                gen_tcl(adj_matrix, mono_list, tcl_fname=fname, chain_id=cfg[CHAIN_ID],
                        psf_fname=cfg[PSF_FNAME], toppar_dir=cfg[TOPPAR_DIR], out_dir=cfg[OUT_DIR])
            elif save_format == SAVE_JSON:
                json_str = MolToJSON(mol)
                str_to_file(json_str + '\n', fname)
            elif save_format == SAVE_PNG or save_format == SAVE_SVG:
                MolToFile(mol, fname, size=cfg[IMAGE_SIZE])
            print(f"Wrote file: {fname}")
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.
    `argv` is a list of arguments, or `None` for ``sys.argv[1:]``.

    Besides parsing, this converts the '-t' value to an int, turns on step-convergence reporting when any of
    '-a', '-b', or '-t' is chosen, rejects combining step-convergence options with '-z', and, when the output
    directory is left at its default, makes it a subdirectory of the directory being searched.

    :param argv: list of command-line arguments, or None to use ``sys.argv[1:]``
    :return: tuple of (args, return code); the code is GOOD_RET on success (or when the parser exits with
        code 0) and INPUT_ERROR otherwise. args is None if parsing itself did not complete.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(
        description='Checks for normal termination of Gaussian output files in a '
                    'specified directory, and moves them to a new location.')
    parser.add_argument("-a", "--all",
                        help="Check convergence of all steps and print to standard out.",
                        action="store_true", default=False)
    parser.add_argument("-b", "--best",
                        help="Check convergence of each step and list the convergence of the best 10 "
                             "steps, sorted by convergence.",
                        action="store_true", default=False)
    parser.add_argument("-d", "--directory",
                        help="The directory where to look for Gaussian output files to check for "
                             "normal termination, without checking in subdirectories.",
                        metavar="path", default=None)
    parser.add_argument("-ds", "--dir_subdirs",
                        help="The directory where to look for Gaussian output files to check "
                             "for normal termination, including checking in subdirectories.",
                        metavar="path", default=None)
    parser.add_argument("-e", "--extension",
                        help="The extension of the Gaussian output file(s) to look for when "
                             "searching a directory for output files. The default is '{}'."
                             "".format(DEF_EXT),
                        metavar="ext", default=DEF_EXT)
    parser.add_argument("-f", "--file_name",
                        help="A file name (with path, if not the current directory) to check for "
                             "either normal termination or convergence. If used, this option "
                             "overrides the '-d' option, and no searching for files is "
                             "performed.",
                        metavar="path", default=None)
    # The help text below previously had a duplicated fragment spliced into the middle of the sentence
    parser.add_argument("-l", "--file_list",
                        help="A file name (with path, if not the current directory) with a "
                             "list of files (also with path, if not the current directory) "
                             "to check for either normal termination or convergence. If used, this "
                             "option overrides the '-d' option, and no searching for files is "
                             "performed.",
                        metavar="path", default=None)
    parser.add_argument("-o", "--output_directory",
                        help="The directory where to put Gaussian output files that have "
                             "terminated normally. The default is '{}'."
                             "".format(DEF_COMPLETE_DIR),
                        metavar="path", default=DEF_COMPLETE_DIR)
    parser.add_argument("-s", "--step_converg",
                        help="Report the convergence for each step value for the files in the "
                             "directory or those specified with the '-f' or '-l' options. When "
                             "this option is chosen, the check for normal termination is "
                             "skipped. The default is False.",
                        action="store_true", default=False)
    parser.add_argument("-t", "--to_step",
                        help="Check convergence of each step only to provided step number, and "
                             "before printing to standard out, sort by convergence.",
                        default=False)
    parser.add_argument("-z", "--final_converg",
                        help="Report the final convergence value for the files in the "
                             "directory or those specified with the '-f' or '-l' options. "
                             "When this option is chosen, the check for normal termination "
                             "is skipped. The default is False.",
                        action="store_true", default=False)
    parser.add_argument("--scan",
                        help="Read output file(s) from a scan and writes the converged energies from each "
                             "point of the scan to a csv file and creates a plot saved as the given file "
                             "name.",
                        metavar="path", default=None)

    args = None
    try:
        args = parser.parse_args(argv)
        # any of the step-by-step options implies reporting step convergence
        if args.to_step or args.best or args.all:
            args.step_converg = True
        if args.to_step:
            try:
                args.to_step = int(args.to_step)
            except ValueError:
                raise InvalidDataError("When the '-t' option is used, an integer must be provided.")
        if args.step_converg and args.final_converg:
            raise InvalidDataError("Choose either the '-a', '-b', '-s', '-t', or '-z' option.")
        # make the default output directory a subdirectory of the directory to search
        if args.output_directory == DEF_COMPLETE_DIR:
            if args.dir_subdirs:
                args.output_directory = os.path.relpath(os.path.join(args.dir_subdirs, DEF_COMPLETE_DIR))
            # if both are given, '-d' takes precedence because it is applied last
            if args.directory:
                args.output_directory = os.path.relpath(os.path.join(args.directory, DEF_COMPLETE_DIR))
    except (KeyError, InvalidDataError, MissingSectionHeaderError, SystemExit) as e:
        # a SystemExit with code 0 (e.g. after printing help) is not an error
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR
    return args, GOOD_RET
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.
    `argv` is a list of arguments, or `None` for ``sys.argv[1:]``.

    At least one of the '-f' (file_name) or '-i' (image_library) options must be chosen.

    :param argv: list of command-line arguments, or None to use ``sys.argv[1:]``
    :return: tuple of (args, return code); the code is GOOD_RET on success (or when the parser exits with
        code 0) and INPUT_ERROR otherwise. args is None if parsing itself did not complete.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(
        description="This script has two modes, chosen by selecting '-f' or '-i': "
                    "1) The '-f' option: reads a file to add entries to "
                    "dictionaries of lignin decomposition molecules that may be "
                    "observed in mass spectrometry of lignin-derived compounds. Given "
                    "SMILES strings, and optionally/ideally molecular names and/or source "
                    "of the SMILES (e.g. observed in analysis of model compounds), the "
                    "dictionaries are expanded to include additional potentially "
                    "observed molecular weights and isomers. Note: it does not change "
                    "the original libraries within this package, but instead outputs "
                    "new libraries, which could be used to update the library in this "
                    "package. 2) The '-i' option: creates an image library of all "
                    "SMILES structures currently in the compound library (further details "
                    "provided under the '-i' option description).")
    parser.add_argument("-d", "--out_dir",
                        help="A directory where output files should be saved. The default location "
                             "is the current working directory.",
                        default=None)
    parser.add_argument("-f", "--file_name",
                        help=f"File name of values separated by '{SEP_KEY}' (to avoid conflicts "
                             f"with IUPAC molecule names) with up to 3 values per line: SMILES "
                             f"string (required), molecule name(s) (optional; split multiple "
                             f"names with a semicolon), source (e.g. model compound analysis)",
                        default=None)
    # The output directory option is '-d' (see above); the help previously pointed to a non-existent '-o'
    parser.add_argument("-i", "--image_library",
                        help=f"Flag to request that the program create a 2D image library of "
                             f"the SMILES strings in the library. One file will be created "
                             f"per exact molecular weight (calculated only from the most "
                             f"abundant isotope). If there are multiple SMILES matches for a "
                             f"molecular formula, the name of the file is '{{molecular "
                             f"weight (with a '-' instead of a '.')}}_{{molecular formula}}"
                             f".png', and the images of each structure within the file will "
                             f"be labeled with its SMILES string. If there is only one "
                             f"structure in the library for a molecular formula, the SMILES "
                             f"string will be appended to the name. These files will be "
                             f"saved in the current directory, unless a different directory "
                             f"is specified with the '-d' option.",
                        action='store_true')
    parser.add_argument("-m", "--mw_list",
                        help="A list of molecular weight keys for making an image library.",
                        default=None)

    args = None
    try:
        args = parser.parse_args(argv)
        if not args.image_library and not args.file_name:
            raise InvalidDataError(
                "Please choose to either provide a file_name ('-f') to read new dictionary "
                "entries, or the image_library flag ('-i') to request 2D image library.")
    except (KeyError, InvalidDataError, IOError, SystemExit) as e:
        # a SystemExit with code 0 (e.g. after printing help) is not an error
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR
    return args, GOOD_RET
def process_gausscom_file(cfg, gausscom_file, pdb_tpl_content):
    """
    Reads atom types and coordinates from a Gaussian input file and writes them out in PDB format.

    The header (any '%' instruction lines, the route section, the description, and the charge/multiplicity
    line) is skipped. Each following non-blank line is read as an atom type followed by three coordinates,
    until a blank line is reached. When cfg[PDB_TPL_FILE] is set, the atom rows come from a deep copy of the
    template's atom section and only the coordinates are replaced; otherwise a new HETATM row is created for
    each atom.

    :param cfg: dict of configuration values; reads the PDB_TPL_FILE and OUT_BASE_DIR keys
    :param gausscom_file: str, location of the Gaussian input file to read
    :param pdb_tpl_content: dict read with the keys SEC_HEAD, SEC_ATOMS, SEC_TAIL, and NUM_ATOMS
    :return: N/A, writes a file with the '.pdb' extension
    :raises InvalidDataError: if the file ends before the atom section is reached, or if the number of
        atoms differs from the number in the template
    """
    with open(gausscom_file) as d:
        if cfg[PDB_TPL_FILE]:
            pdb_data_section = copy.deepcopy(pdb_tpl_content[SEC_ATOMS])
        else:
            pdb_data_section = []
        section = SEC_HEAD
        atom_id = 0

        for line in d:
            line = line.strip()
            # not currently keeping anything from the header; just check num atoms
            if section == SEC_HEAD:
                try:
                    # there may be some instructions (which start with %, and can have some blank lines)
                    # before the "route card lines" (which start with #)
                    while not GAU_HEADER_PAT.match(line):
                        line = next(d).strip()
                    # skip first line of route card
                    line = next(d).strip()
                    # for "route card" and then description, there may be more than one header line;
                    # look for blank line
                    for _ in range(2):
                        while len(line) > 0:
                            line = next(d).strip()
                        # now move past the blank line, and get the content of the following line
                        line = next(d).strip()
                except StopIteration:
                    # the file ran out while skipping the header; report it as bad data rather than
                    # letting a bare StopIteration escape to the caller
                    raise InvalidDataError(
                        'Gausscom file: {}\n   ended before its atom section was reached; check that it '
                        'has a route section, description, and charge/multiplicity '
                        'line'.format(gausscom_file)) from None
                # now on charge, multiplicity line, which we also skip with the "continue"
                section = SEC_ATOMS
                continue

            elif section == SEC_ATOMS:
                if len(line) == 0:
                    # Since the tail will come only from the template, nothing more is needed after
                    # reading atoms
                    break
                split_line = line.split()

                atom_type = split_line[0]
                # if working from a template, check atom type
                if cfg[PDB_TPL_FILE]:
                    try:
                        pdb_atom_type = pdb_data_section[atom_id][8].split(' ')[-1]
                    except IndexError:
                        raise InvalidDataError(
                            'Gausscom file: {}\n has more atoms than the expected {} atoms in '
                            'the template file: {}'.format(gausscom_file, pdb_tpl_content[NUM_ATOMS],
                                                           cfg[PDB_TPL_FILE]))
                    if atom_type != pdb_atom_type:
                        warning("Atom types do not match for atom number {}; pdb atom type is {} while "
                                "gausscom type "
                                "is {}".format(atom_id, pdb_atom_type, atom_type))
                else:
                    # no template: the list started empty, so the new row lands at index atom_id
                    pdb_data_section.append([
                        'HETATM', '{:5d}'.format(atom_id + 1), ' {:4} '.format(atom_type), 'UNL ', 1,
                        0.0, 0.0, 0.0, ' 1.00 0.00 {:>2}'.format(atom_type)
                    ])
                # replace the x, y, z entries of this atom's row with the coordinates just read
                pdb_data_section[atom_id][5:8] = map(float, split_line[1:4])
                atom_id += 1

    # Now that finished reading the file, first make sure didn't exit before reaching the desired number
    # of atoms
    if cfg[PDB_TPL_FILE]:
        if atom_id != pdb_tpl_content[NUM_ATOMS]:
            raise InvalidDataError(
                'In gausscom file: {}\n found {} atoms, while the pdb template has {} atoms'
                .format(gausscom_file, atom_id, pdb_tpl_content[NUM_ATOMS]))
    f_name = create_out_fname(gausscom_file, ext='.pdb', base_dir=cfg[OUT_BASE_DIR])
    list_to_file(pdb_tpl_content[SEC_HEAD] + pdb_data_section + pdb_tpl_content[SEC_TAIL],
                 f_name, list_format=PDB_FORMAT)
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.
    `argv` is a list of arguments, or `None` for ``sys.argv[1:]``.

    The configuration file is read while parsing (read_cfg is the argument's type), so args.config is the
    processed configuration dict. The '-s' and '-l' options are mutually exclusive. When neither is chosen,
    only one job thread is allowed, and args.config[JOB_LIST] is flattened to that single thread.

    :param argv: list of command-line arguments, or None to use ``sys.argv[1:]``
    :return: tuple of (args, return code); the code is GOOD_RET on success (or when the parser exits with
        code 0), IO_ERROR if a needed file could not be read, and INPUT_ERROR for other invalid input.
        args is None if parsing itself did not complete.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(
        description='Sets up and runs series of Gaussian jobs, checking between jobs '
                    'for normal termination.')
    parser.add_argument("job_name",
                        help="The job name to run. If the first job to run is '', a Gaussian input file "
                             "(with extension '{}' or specified with '{}' argument in the config file) is "
                             "needed. Otherwise, a checkpoint file (with extension '.chk') is "
                             "needed.".format(DEF_GAUSS_IN_EXT, GAUSS_IN_EXT))
    parser.add_argument("-c", "--config",
                        help="The location of the configuration file in ini format. "
                             "The default file name is {}, located in the base directory "
                             "where the program is run.".format(DEF_CFG_FILE),
                        default=DEF_CFG_FILE, type=read_cfg)
    parser.add_argument("-i", "--ignore_chk_warning",
                        help="Ignore warning that a chk file cannot be found in the "
                             "current directory for a job that will attempt to read it. "
                             "Default is False.",
                        action="store_true", default=False)
    parser.add_argument("-l", "--list_of_jobs",
                        help="The input in the position of 'job_name' will be read as a file "
                             "name with a list of jobs to set up and submit. Each job name "
                             "should be on a separate line. Any extension, or none, can follow "
                             "the job name. If a 'setup_submit' or 'list_of_jobs' are not "
                             "specified, the script will instead attempt to run the 'job_name'."
                             " The default is False.",
                        action="store_true", default=False)
    parser.add_argument("-n", "--no_submit",
                        help="Set up jobs without submitting them. This flag only affects the "
                             "'-s' and '-l' options.",
                        action="store_true", default=False)
    parser.add_argument("-o", "--old_chk_fname",
                        help="The name of the checkpoint file (will use base name plus "
                             "'.chk' whether or not an extension of any type is provided) "
                             "to be used for the first job (optional).",
                        default=None)
    # The help below previously named a non-existent 'single_job' option; the option is 'setup_submit'
    parser.add_argument("-s", "--setup_submit",
                        help="The script will setup and submit, rather than run, the provided "
                             "'job_name'. Any extension, or none, can be included in the job "
                             "name. If a 'setup_submit' or 'list_of_jobs' are not specified, "
                             "the script will instead attempt to run the 'job_name'. The "
                             "default is False.",
                        action="store_true", default=False)
    parser.add_argument("-t", "--testing",
                        help="Run in testing mode, which will not check for normal Gaussian "
                             "termination before continuing. Default is False.",
                        action="store_true", default=False)

    args = None
    try:
        args = parser.parse_args(argv)
        if args.setup_submit and args.list_of_jobs:
            raise InvalidDataError("Cannot choose both 'setup_submit' and 'list_of_jobs' options")
        if args.list_of_jobs:
            if not os.path.isfile(args.job_name):
                raise IOError(
                    "When using the 'list_of_jobs' option, the first positional argument \n ('job_name') "
                    "must be the name of the file with the list of jobs. "
                    "Could not read: {}".format(args.job_name))
        if not (args.list_of_jobs or args.setup_submit):
            # running (not just setting up) a job: only a single thread of jobs is supported
            if len(args.config[JOB_LIST]) > 1:
                raise InvalidDataError(
                    "Found ';' in the '{}'. This option (setting up multiple job threads) is "
                    "currently only supported for setting up (and optionally submitting) jobs "
                    "(using the '-s' or '-l' options).".format(JOB_LIST))
            elif len(args.config[JOB_LIST]) == 1:
                args.config[JOB_LIST] = args.config[JOB_LIST][0]
    except IOError as e:
        warning("Problems reading file:", e)
        parser.print_help()
        return args, IO_ERROR
    except (KeyError, InvalidDataError, MissingSectionHeaderError, SystemExit) as e:
        # a SystemExit with code 0 (e.g. after printing help) is not an error
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR
    return args, GOOD_RET
def run_job(job, job_name_perhaps_with_dir, tpl_dict, cfg, testing_mode):
    """
    Creates the job-run script for one job from the template cfg[JOB_RUN_TPL], makes it executable, and,
    unless in testing mode, runs it and checks the last line of the resulting log file.

    tpl_dict is updated in place: JOB_NAME, INPUT_FILE, and OLD_CHECK_ECHO are always set; OLD_JOB_NAME is
    set when `job` is not ''; USER, MEM, and PROC_LIST are copied from cfg when present there; MEM,
    PROC_LIST, and DEF_ROUTE may be filled in from values read from the node.

    :param job: str, the job type; '' means the first job, which uses the Gaussian input file named from
        `job_name_perhaps_with_dir`, while any other value uses cfg[TPL_DICT][job] as the input file
    :param job_name_perhaps_with_dir: str, combined with cfg[GAUSS_IN_EXT] to name the first job's input
    :param tpl_dict: dict of values used to fill the job-run template (modified by this function)
    :param cfg: dict of configuration values
    :param testing_mode: bool, if True the script is created but not run, and it is also passed to the
        helpers that read node information
    :return: N/A
    :raises InvalidDataError: if the last line of the job's log file does not match GAU_GOOD_PAT
    :raises KeyError: if the template needs a key that is in neither tpl_dict nor cfg
    """
    # Determine if it will run fresh or from an old checkpoint
    if job == '':
        new_job_name = tpl_dict[JOB_NAME]
        tpl_dict[INPUT_FILE] = job_name_perhaps_with_dir + cfg[GAUSS_IN_EXT]
        if cfg[FIRST_JOB_CHK]:
            tpl_dict[OLD_CHECK_ECHO] = cfg[OLD_CHECK_ECHO].format(
                cfg[FIRST_JOB_CHK])
        else:
            tpl_dict[OLD_CHECK_ECHO] = ''
    else:
        # a follow-on job: its name extends the previous job's name, which is kept as the old job name
        new_job_name = tpl_dict[JOB_NAME] + '_' + job
        tpl_dict[OLD_JOB_NAME] = tpl_dict[JOB_NAME]
        tpl_dict[OLD_CHECK_ECHO] = cfg[OLD_CHECK_ECHO].format(
            tpl_dict[OLD_JOB_NAME])
        tpl_dict[INPUT_FILE] = cfg[TPL_DICT][job]

    tpl_file = cfg[JOB_RUN_TPL]
    job_runner_fname = create_out_fname(new_job_name,
                                        ext=".sh",
                                        base_dir=cfg[OUT_DIR])
    print("Running {}".format(new_job_name))

    tpl_dict[JOB_NAME] = new_job_name
    # copy over any of these settings that the configuration provides
    for key_name in [
            USER,
            MEM,
            PROC_LIST,
    ]:
        if key_name in cfg:
            tpl_dict[key_name] = cfg[key_name]

    tpl_str = read_tpl(tpl_file)
    # if either MEM or PROC_LIST is the default (Nonetype), and is used to run the job, get info from the node before
    #    creating the job script
    # NOTE(review): tpl_dict[MEM] / tpl_dict[PROC_LIST] are read directly, so this assumes those keys are
    #    present whenever the template uses them -- confirm cfg always supplies them
    mem_required = '{' + MEM + '}' in tpl_str
    get_mem = mem_required and not tpl_dict[MEM]
    proc_required = '{' + PROC_LIST + '}' in tpl_str
    get_proc = proc_required and not tpl_dict[PROC_LIST]
    default_gauss_required = '{' + DEF_ROUTE + '}' in tpl_str

    num_procs = 1  # to make IDE happy
    proc_list = '0'  # to make IDE happy

    if get_mem or get_proc or default_gauss_required:
        # explicitly check each possible required info flag, because any or all can be requested
        if testing_mode:
            # in testing mode a fixed host name is echoed instead of querying the machine
            hostname = subprocess.check_output(["echo", "r1i7n35"
                                                ]).decode("utf-8").strip()
        else:
            # Will not be covered in testing mode, as is not part of written code to be tested
            hostname = subprocess.check_output(["hostname"
                                                ]).decode("utf-8").strip()
        print(
            "Obtaining available memory and/or number of processors on node {}.\n "
            "Note: this program assumes the whole node will be allocated to Gaussian.\n"
            .format(hostname))
        if get_mem:
            tpl_dict[MEM] = get_node_mem(testing_mode)

        max_cache = 1024 * 1024  # to make IDE happy; Gaussian default (conservative) is 1024 * 1024
        if get_proc or default_gauss_required:
            num_procs, proc_list, max_cache = get_proc_info(testing_mode)
        if get_proc:
            tpl_dict[PROC_LIST] = proc_list
            print(
                " Found {} processors. Will allow use of cpus {}.\n".format(
                    num_procs, proc_list))
        if get_mem or get_proc:
            print(
                " The user may override these values by specifying the '{}' and/or '{}' keywords in the "
                "configuration file.\n Be sure to use the formatting Gaussian expects.\n"
                .format(MEM, PROC_LIST))
        if default_gauss_required:
            max_disk = get_max_disk(testing_mode)
            max_cache = int(max_cache)
            print(
                "Since '{}' found in the {}, read machine specs to determine CacheSize={} and "
                "MaxDisk={}".format(DEF_ROUTE, JOB_RUN_TPL, max_cache,
                                    max_disk))
            default_route_list = [
                "-#- CacheSize={}".format(max_cache),
                "-#- MaxDisk={}".format(max_disk)
            ]
            # the two settings are written to a 'Default.Route' file in the scratch directory
            fname = create_out_fname('Default.Route',
                                     base_dir=cfg[SCRATCH_DIR])
            list_to_file(default_route_list, fname)
            tpl_dict[
                DEF_ROUTE] = ''  # there is an action triggered, not a value needed, so replaced with blank space

    # Retry saving the filled template, pulling each missing key from cfg, until no key is missing;
    #    a key that is not in cfg either ends the loop by re-raising the KeyError
    move_on = False
    while not move_on:
        try:
            fill_save_tpl(tpl_str, tpl_dict, tpl_file, job_runner_fname)
            move_on = True
        except KeyError as e:
            # presumably the helper's message quotes the missing key name in single quotes; the name is
            #    taken from between the first pair of quotes -- verify against fill_save_tpl
            missing_key = e.args[0].split("\'")[1]
            if missing_key in cfg:
                tpl_dict[missing_key] = cfg[missing_key]
            else:
                raise e
    subprocess.call(["chmod", "+x", job_runner_fname])
    if testing_mode:
        print(
            "Testing mode; did not run job script or check Gaussian output for normal termination.\n"
        )
    else:
        # do not want this tested, as actually running Gaussian would take too long, and not what should be tested
        p1 = subprocess.Popen(job_runner_fname)
        p1.wait()
        # the job is judged by the last line of its log file, named from the job name
        out_file = tpl_dict[JOB_NAME] + ".log"
        last_line = subprocess.check_output(["tail", "-1",
                                             out_file]).strip().decode("utf-8")
        if GAU_GOOD_PAT.match(last_line):
            print("Successfully completed {}\n".format(out_file))
            # the run script is only removed after a successful job
            os.remove(job_runner_fname)
        else:
            raise InvalidDataError('Job failed: {}'.format(out_file))
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.
    `argv` is a list of arguments, or `None` for ``sys.argv[1:]``.

    The configuration file is read while parsing (read_cfg is the argument's type), so args.config is the
    processed configuration dict. Values given on the command line ('-t', '-n', '-r', '-f', '-l') are then
    copied into args.config; the template file from '-t' is only used when the configuration did not
    provide one.

    :param argv: list of command-line arguments, or None to use ``sys.argv[1:]``
    :return: tuple of (args, return code); the code is GOOD_RET on success (or when the parser exits with
        code 0) and INPUT_ERROR otherwise. args is None if parsing itself did not complete.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(
        description='Creates Gaussian input files from pdb files, given a template input '
                    'file. The required input file provides the name/location of the '
                    'template file and a file with a list of pdb files to convert.')
    # The flags named in this help now match the options defined below ('-l' and '-n'); it previously
    #    cited '-1' and '-a', which do not exist
    parser.add_argument("-c", "--config",
                        help="Optional: the location of the configuration file. The default file "
                             "name is '{}', located in the base directory where the program is run. "
                             "If a config file is not provided, use the command-line options to "
                             "specify the '{}' (-t) and '{}' (-l) or '{}' (-f). The command lines "
                             "for the '{}' flag (-r) or only the first entry in the pdb ('{}', -n) "
                             "may also be specified.".format(DEF_CFG_FILE, GAU_TPL_FILE, PDB_LIST_FILE,
                                                             PDB_FILE, REMOVE_H, NUM),
                        default=DEF_CFG_FILE, type=read_cfg)
    parser.add_argument("-t", "--tpl_file",
                        help="Specifies the '{}'".format(GAU_TPL_FILE),
                        default=None)
    parser.add_argument("-l", "--pdb_list_file",
                        help="Option to specify a file with a list of pdbs ('{}') to convert "
                             "(one file per line on the list).".format(PDB_LIST_FILE),
                        default=None)
    parser.add_argument("-f", "--file",
                        help="Option to specify a pdb file ('{}') to convert.".format(PDB_FILE),
                        default=None)
    parser.add_argument("-n", "--num",
                        help="Only read if a config file is not provided. This command can be used to "
                             "specify only using the first '-n'/'--num' set(s) of coordinates in a pdb "
                             "file to create gausscom file(s). The default is to use all coordinates, "
                             "making as many input files as there are molecules/conformations in the "
                             "pdb.",
                        default=None, type=int)
    parser.add_argument("-r", "--remove_final_h",
                        help="Option to specify removing the last H atom from the PDB "
                             "file(s) when creating the gausscom files. The default is "
                             "False.",
                        action='store_true')

    args = None
    try:
        args = parser.parse_args(argv)
        if args.config[GAU_TPL_FILE] is None:
            if args.tpl_file is None:
                raise InvalidDataError(
                    "Could not read config file: {}\n and did not specify a 'tpl_file' "
                    "('-t' option). A tpl_file is needed to run this "
                    "script.".format(args.config[CONFIG_NAME]))
            else:
                args.config[GAU_TPL_FILE] = args.tpl_file
        # command-line values, when given, are stored in the configuration dict
        if args.num:
            args.config[NUM] = args.num
        if args.remove_final_h:
            args.config[REMOVE_H] = True
        if args.file:
            args.config[PDB_FILE] = args.file
        if args.pdb_list_file:
            args.config[PDB_LIST_FILE] = args.pdb_list_file
    except (IOError, KeyError, InvalidDataError, MissingSectionHeaderError, SystemExit) as e:
        # a SystemExit with code 0 (e.g. after printing help) is not an error
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR
    return args, GOOD_RET
def main(argv=None):
    """
    Entry point for the gausslog_unique script: compares the Gaussian log files named in a list file and
    prints the results of the comparison.

    :param argv: list of command-line arguments, or None to use ``sys.argv[1:]``
    :return: GOOD_RET on success; otherwise the code from parse_cmdline, IO_ERROR when a file cannot be
        found or read, or INVALID_DATA when the input data cannot be processed
    """
    print(f"Running GaussianWrangler script gausslog_unique version {__version__}")

    # Read input
    args, ret = parse_cmdline(argv)
    if args is None or ret != GOOD_RET:
        return ret

    # Read template and data files
    try:
        # check input
        if args.max_diff:
            args.max_diff = float(args.max_diff)
        if not (args.energy or args.gibbs):
            args.enthalpy = True

        # sort the listed (non-blank) file names into those that exist and those that do not
        found_files = []
        not_found = []
        with open(args.list) as list_file:
            for raw_line in list_file:
                fname = raw_line.strip()
                if not fname:
                    continue
                target = found_files if os.path.isfile(fname) else not_found
                target.append(fname)

        if not_found:
            raise IOError("Could not find the following file(s) listed in '{}':\n "
                          "{}".format(args.list, '\n '.join(sorted(set(not_found)))))
        if len(found_files) < 2:
            raise InvalidDataError("This program expects at least two files to compare to determine if they "
                                   "have the same conformation. Check input.")

        # get the data from the files, keyed by file name without the path
        log_info = {
            os.path.basename(gausslog_file): process_gausslog_file(gausslog_file, find_dih=True,
                                                                   find_converg=True)
            for gausslog_file in found_files
        }

        # process data from files
        list_of_conf_lists = compare_gausslog_info(log_info, args.tol)
        _, warn_files_str = print_results(log_info, list_of_conf_lists, args.enthalpy, args.energy,
                                          args.max_diff, args.out_fname)
        if warn_files_str:
            warning("Check convergence of file(s):" + warn_files_str)

    except IOError as e:
        warning("Problems reading file:", e)
        return IO_ERROR
    except (InvalidDataError, UnicodeDecodeError) as e:
        warning("Problems reading data:", e)
        return INVALID_DATA
    except ValueError as e:
        warning(e.args[0])
        return INVALID_DATA

    return GOOD_RET  # success
def create_sbatch_dict(cfg, tpl_dict, new_ini_fname, current_job_list,
                       start_from_job_name_chk=True, ignore_chk_warning=False):
    """
    Builds the dict of values used to fill the sbatch template for a job.

    :param cfg: dict of configuration values; reads PARTITION, RUN_TIME, ACCOUNT, QOS, FIRST_JOB_CHK,
        CHECK_FOR_CHK, and EMAIL
    :param tpl_dict: dict read for JOB_NAME, JOB_DESCRIP, and (when the route is checked) INPUT_FILE
    :param new_ini_fname: value stored under RUN_GAUSS_INI
    :param current_job_list: list of jobs; only its first entry is examined
    :param start_from_job_name_chk: bool, when True and no first-job checkpoint is configured, the
        checkpoint file named for the job must exist
    :param ignore_chk_warning: bool, when True a route section that reads from a checkpoint is allowed
        even though no checkpoint file is being used
    :return: dict with the values for the sbatch template
    :raises InvalidInputError: if the configured first-job checkpoint file does not exist
    :raises InvalidDataError: if the job-name checkpoint file does not exist, if the input file's route
        section would read from a checkpoint that is not available, or if the input file has no
        complete route section
    """
    sbatch_dict = {
        PARTITION: cfg[PARTITION],
        RUN_TIME: cfg[RUN_TIME],
        ACCOUNT: cfg[ACCOUNT],
        JOB_NAME: tpl_dict[JOB_NAME],
        RUN_GAUSS_INI: new_ini_fname,
        QOS: cfg[QOS],
        JOB_DESCRIP: tpl_dict[JOB_DESCRIP],
    }

    if cfg[FIRST_JOB_CHK]:
        chk_fname = cfg[FIRST_JOB_CHK] + CHK_EXT
        if not os.path.isfile(chk_fname):
            raise InvalidInputError("Could not find specified '{}': {}".format(FIRST_JOB_CHK, chk_fname))
        sbatch_dict[OLD_CHECK_ECHO] = '-o ' + cfg[FIRST_JOB_CHK]
    elif start_from_job_name_chk:
        chk_fname = tpl_dict[JOB_NAME] + CHK_EXT
        if not os.path.isfile(chk_fname):
            raise InvalidDataError("Could not find required checkpoint file: {}".format(chk_fname))
        sbatch_dict[OLD_CHECK_ECHO] = '-o ' + tpl_dict[JOB_NAME]
    else:
        sbatch_dict[OLD_CHECK_ECHO] = ''
        if current_job_list[0] == '' and cfg[CHECK_FOR_CHK]:
            # With no old checkpoint file, the first input file must not ask Gaussian to read from one.
            # An IOError from opening the file is left for the caller to handle.
            with open(tpl_dict[INPUT_FILE]) as f:
                try:
                    found_route = False
                    for raw_line in f:
                        route_line = raw_line.strip()
                        if not GAU_HEADER_PAT.match(route_line):
                            continue
                        found_route = True
                        # the route can span several lines; examine each until a blank line is reached
                        while route_line != '':
                            reads_chk = GUESS_READ_OR_GEOM_CHK_PAT.match(route_line)
                            if reads_chk and not ignore_chk_warning:
                                raise InvalidDataError(
                                    "Did not find an old checkpoint file to read, but the "
                                    "Gaussian input header indicates that Gaussian will attempt "
                                    "and fail to read from a checkpoint:\n file: {}\n"
                                    " route: {} ".format(tpl_dict[INPUT_FILE], route_line))
                            route_line = next(f).strip()
                    if not found_route:
                        # handled the same way as a file that ends in the middle of its route section
                        raise StopIteration
                except StopIteration:
                    raise InvalidDataError('The specified input file does not appear valid: {}'
                                           ''.format(tpl_dict[INPUT_FILE]))

    email = cfg[EMAIL]
    if email:
        email_lines = '#SBATCH --mail-type=FAIL\n#SBATCH --mail-type=END\n#SBATCH --mail-user={}'
        sbatch_dict[EMAIL] = email_lines.format(email)
    else:
        sbatch_dict[EMAIL] = ''

    return sbatch_dict