def read_cfg(f_loc, cfg_proc=process_cfg):
    """
    Reads the given configuration file, returning a dict with the converted values supplemented by default values.

    :param f_loc: The location of the file to read.
    :param cfg_proc: The processor to use for the raw configuration values.  Uses default values when the raw
        value is missing.
    :return: A dict of the processed configuration file's data.
    """
    config = ConfigParser()
    good_files = config.read(f_loc)

    if not good_files:
        raise IOError('Could not read file {}'.format(f_loc))
    main_proc = cfg_proc(dict(config.items(MAIN_SEC)),
                         DEF_CFG_VALS,
                         REQ_KEYS,
                         int_list=False,
                         store_extra_keys=True)

    main_proc[CONFIG_FILE] = f_loc
    main_proc[TPL_DICT] = {}

    all_job_types = []
    for job_list_key in [JOB_LIST, FOLLOW_JOBS_LIST]:
        job_list = main_proc[job_list_key]
        jobs_list = []
        # need to distinguish between NoneType and ''
        if job_list is not None:
            threads = [thread.split(',') for thread in job_list.split(';')]
            for thread in threads:
                thread = [job.strip() for job in thread]
                jobs_list.append(thread)
                all_job_types += thread
            main_proc[job_list_key] = jobs_list
        else:
            main_proc[job_list_key] = []

    for job in all_job_types:
        if job == '':
            continue
        if job in main_proc:
            tpl_name = main_proc[job]
        else:
            tpl_name = job + '.tpl'
        if os.path.isfile(tpl_name):
            main_proc[TPL_DICT][job] = tpl_name
        else:
            raise InvalidDataError(
                "For job '{}', could not find a template file '{}'\n"
                "You may specify the template to use (including path) using {} as a key in the "
                "config file.".format(job, tpl_name, job))

    if not os.path.isfile(main_proc[JOB_RUN_TPL]):
        raise InvalidDataError(
            "Could not find the submit template '{}'".format(
                main_proc[JOB_RUN_TPL]))

    return main_proc
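A quick, hypothetical illustration of the job-list parsing above: ';' separates independent job threads and ',' separates the jobs within a thread (the value itself is illustrative).

job_list = "opt,freq; stable"
threads = [[job.strip() for job in thread.split(',')] for thread in job_list.split(';')]
# threads == [['opt', 'freq'], ['stable']]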
def collect_output_scan_steps(check_file_list):
    """
    Looks for scan values in one or more files.
    Current functionality: returns one scan, or combines two scans if they proceed in opposite directions
    :param check_file_list: list of locations of output files to search for scan data
    :return: a 2D numpy array with the scan values and energy differences in kcal/mol
    """
    scan_arrays = []
    for fname in check_file_list:
        log_content = process_gausslog_file(fname, collect_scan_steps=True)
        if len(log_content[SCAN_DICT]) > 0:
            scan_arrays.append(
                np.array(list(log_content[SCAN_DICT].items()), dtype=float))
    num_arrays = len(scan_arrays)
    # if only one scan file, return it
    if num_arrays == 1:
        return_array = scan_arrays[0]
    elif num_arrays == 0:
        raise InvalidDataError("No scan information found.")
    elif num_arrays == 2:
        first_array = scan_arrays[0]
        second_array = scan_arrays[1]
        first_diff = process_scan_array(first_array)
        second_diff = process_scan_array(second_array)
        # check if the first entry is shared, as when a scan has been run in two directions
        if abs(first_array[0][0] - second_array[0][0]) < 0.002:
            if first_diff < 0 < second_diff:
                first_array = np.flip(first_array, 0)
                return_array = np.vstack((first_array[:-1, :], second_array))
            elif first_diff > 0 > second_diff:
                second_array = np.flip(second_array, 0)
                return_array = np.vstack((second_array[:-1, :], first_array))
            else:
                raise InvalidDataError(
                    "Check how the scans are to be combined.")
        else:
            raise InvalidDataError(
                "The program cannot currently handle these files. Check input, and if correct, "
                "please open an issue on github.")
    else:
        raise InvalidDataError(
            "The program can't yet handle this number of files. Please open an issue."
        )
    # find lowest energy and convert to differences in kcal/mol
    min_e = np.min(return_array[:, 1])
    return_array[:, 1] = (return_array[:, 1] - min_e) * EHPART_TO_KCAL_MOL
    return return_array
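To make the direction-combining branch above concrete, here is a sketch with toy numbers, assuming process_scan_array returns a value whose sign gives the scan direction: the decreasing scan is flipped and stacked onto the increasing one, keeping the shared starting point only once.

import numpy as np

first = np.array([[10.0, -5.0], [9.0, -5.1], [8.0, -5.3]])     # coordinate decreases (diff < 0)
second = np.array([[10.0, -5.0], [11.0, -5.2], [12.0, -5.4]])  # coordinate increases (diff > 0)
combined = np.vstack((np.flip(first, 0)[:-1, :], second))
# combined spans 8.0 through 12.0, with the common 10.0 point appearing once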
Example #3
def validate_decimal_input(input_accuracy, option_str):
    """
    Make sure that the provided input is within the allowable range, and determine the number of significant decimals
    :param input_accuracy: str or float
    :param option_str: the option corresponding to the input, to allow a more helpful error message
    :return: input_accuracy as float, num_decimals_accuracy as int
    """
    max_val = 1.
    min_val = 0.000000001
    tolerance = 1e-6
    try:
        input_accuracy = float(input_accuracy)
        if input_accuracy < min_val or input_accuracy > max_val:
            raise ValueError
        base_10_log = np.log10(input_accuracy)
        if abs(base_10_log - round(base_10_log, 0)) > tolerance:
            # not a power of ten, so 1 must be an (approximate) integer multiple of the value;
            # float error can leave the remainder near 0 or near input_accuracy, so check both ends
            remainder = 1. % input_accuracy
            if min(remainder, input_accuracy - remainder) > tolerance:
                raise ValueError
        num_decimals_accuracy = int(len(str(input_accuracy)) - str(input_accuracy).index('.') - 1)

    except ValueError:
        raise InvalidDataError(f"Read '{input_accuracy}' for the {option_str} option.\n"
                               f"    This tolerance must be a non-negative fraction of 1 (1 must be a multiple of the "
                               f"tolerance), between {min_val} and {max_val}.")
    return input_accuracy, num_decimals_accuracy
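A hedged usage sketch of validate_decimal_input (assuming InvalidDataError is importable from the surrounding package): 0.05 divides 1 evenly, so it passes and two decimal places are detected, while 0.3 does not divide 1 and is rejected.

accuracy, n_decimals = validate_decimal_input("0.05", "'-a'/'--ms_accuracy'")
# accuracy == 0.05, n_decimals == 2
# validate_decimal_input("0.3", "'-a'/'--ms_accuracy'") raises InvalidDataError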
def read_cfg(f_loc, cfg_proc=process_cfg):
    """
    Reads the given configuration file, returning a dict with the converted values supplemented by default values.

    :param f_loc: The location of the file to read.
    :param cfg_proc: The processor to use for the raw configuration values.  Uses default values when the raw
        value is missing.
    :return: A dict of the processed configuration file's data.
    """
    config = ConfigParser()
    good_files = config.read(f_loc)

    if not good_files:
        raise IOError('Could not read file {}'.format(f_loc))
    main_proc = cfg_proc(dict(config.items(MAIN_SEC)), DEF_CFG_VALS, REQ_KEYS)

    # TODO: make an empty list the default value for this key rather than assigning it here
    main_proc[GAUSSCOM_FILES] = []
    if os.path.isfile(main_proc[GAUSSCOM_FILES_FILE]):
        with open(main_proc[GAUSSCOM_FILES_FILE]) as f:
            for data_file in f:
                main_proc[GAUSSCOM_FILES].append(data_file.strip())
    if main_proc[GAUSSCOM_FILE] is not None:
        main_proc[GAUSSCOM_FILES].append(main_proc[GAUSSCOM_FILE])
    if len(main_proc[GAUSSCOM_FILES]) == 0:
        raise InvalidDataError("No files to process: no '{}' specified and "
                               "no list of files found for: {}".format(
                                   GAUSSCOM_FILE,
                                   main_proc[GAUSSCOM_FILES_FILE]))

    return main_proc
Example #5
def rotate_dihes_pdb_files(cfg, gau_tpl_content, pdb_files):
    """
    If dihedral data is specified, use RDkit to rotate specified dihedrals
    :param cfg: dict of configuration input
    :param gau_tpl_content: dict of data to create gaussian input files
    :param pdb_files: list of pdb files with dihedral angles to be rotated
    :return: n/a, saves new Gaussian input files
    """
    for pdb_file in pdb_files:
        mol_orig = MolFromPDBFile(pdb_file, removeHs=False)
        all_confs = [mol_orig]
        num_atoms = mol_orig.GetNumAtoms()
        for dih in cfg[DIH_DATA]:
            max_id = max(dih[:4])
            rot_deg = dih[4]
            if max_id > num_atoms:
                raise InvalidDataError(
                    f"Dihedral rotation specifies an atom id of {max_id}, while only {num_atoms} "
                    f"atoms were found in the PBD file {os.path.relpath(pdb_file)}"
                )
            new_confs = []
            for mol_id, current_mol in enumerate(all_confs):
                dih_deg = GetDihedralDeg(current_mol.GetConformer(0), *dih[:4])
                for _ in range(int(round(360. / rot_deg, 0) - 1)):
                    dih_deg = dih_deg + rot_deg
                    SetDihedralDeg(current_mol.GetConformer(0), *dih[:4],
                                   dih_deg)
                    new_confs.append(current_mol.__copy__())
            all_confs.extend(new_confs)
        create_coms_from_mol_list(all_confs, gau_tpl_content, pdb_file,
                                  cfg[MAX_CONF], cfg[ORIGINAL])
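A self-contained sketch of the dihedral-enumeration idea above, using butane and atom ids 0-3 purely as stand-ins for the ids and increment read from cfg[DIH_DATA]:

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdMolTransforms import GetDihedralDeg, SetDihedralDeg

mol = Chem.AddHs(Chem.MolFromSmiles("CCCC"))   # butane
AllChem.EmbedMolecule(mol, randomSeed=42)      # generate 3D coordinates
conf = mol.GetConformer(0)
rot_deg = 120.0
dih_deg = GetDihedralDeg(conf, 0, 1, 2, 3)
rotamers = []
for _ in range(int(round(360. / rot_deg)) - 1):  # two additional 120-degree rotamers
    dih_deg += rot_deg
    SetDihedralDeg(conf, 0, 1, 2, 3, dih_deg)
    rotamers.append(Chem.Mol(mol))               # snapshot a copy at this dihedral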
    def __init__(self, unit_type, i):
        """
        Constructor for the Monomer class, which sets the properties depending on which monolignol is being
        represented. The only attributes that need to be set are the species [0-2] and the unique integer identifier.
        Everything else is computed from these values. The active site is initially set to 0, indicating that the
        monomer is not yet oxidized but may be oxidized later. The open positions are either {4, 5, 8} or {4, 8},
        depending on whether a 5-methoxy group is present. The set of connected monomers initially contains only the
        monomer's own identity.

        Example calls are below:
            mon = Monomer(G, 0)  # Makes a guaiacol unit monomer with ID = 0
            mon = Monomer(S, 0)  # Makes a syringol unit monomer with ID = 0 (not recommended to repeat IDs)
            mon = Monomer(C, 0)  # Makes a caffeoyl unit monomer with ID = 0
            mon = Monomer(S, n)  # Makes a sinapyl alcohol with ID = n

        :param unit_type: str, monomer type
        :param i: int, unique identifier for the monomer
        Outputs:
            New instance of a monomer object with the desired attributes
        """

        self.identity = i
        self.type = unit_type

        # The active attribute is the active position: if 0 the monomer is not yet activated (not yet oxidized), and
        #     -1 means it is inactive and cannot be active (cannot be oxidized)
        self.active = 0
        if unit_type == G or unit_type == C:
            self.open = {4, 5, 8}
        elif unit_type == S:
            self.open = {4, 8}
        else:
            raise InvalidDataError(f"Encountered unit type {unit_type},  but only the following types are "
                                   f"currently available: 'G' ({G}), 'S' ({S}), 'C' ({C})")
        self.connectedTo = {i}
def process_gausscom_file(gausscom_file):
    """
    Reads a Gaussian input file and stores its contents in gausscom_content, a dict with the keys:
        SEC_HEAD: header (route section, blank lines, comments, and the full charge and multiplicity line)
        CHARGE: overall charge (only) as int
        MULT: overall multiplicity (only) as int
        SEC_ATOMS: atoms as a dict of dicts, with atom_id as key to a dict with
            ATOM_TYPE: atom_type (str), ATOM_COORDS: (np array)
        SEC_TAIL: everything including and after the blank line following SEC_ATOMS
    """
    with open(gausscom_file) as d:
        gausscom_content = {
            SEC_HEAD: [],
            SEC_ATOMS: {},
            SEC_TAIL: [],
            BASE_NAME: get_fname_root(gausscom_file)
        }
        section = SEC_HEAD
        atom_id = 1
        blank_header_lines = 0

        for line in d:
            line = line.strip()

            if section == SEC_HEAD:
                gausscom_content[SEC_HEAD].append(line)
                if GAU_HEADER_PAT.match(line):
                    continue
                elif len(line) == 0:
                    blank_header_lines += 1
                    if blank_header_lines == 2:
                        section = SEC_ATOMS
                        line = next(d).strip()
                        gausscom_content[SEC_HEAD].append(line)
                        split_line = line.split()
                        try:
                            gausscom_content[CHARGE] = int(split_line[0])
                            gausscom_content[MULT] = int(split_line[1])
                        except (IndexError, ValueError):
                            raise InvalidDataError(
                                "Error in reading file {}\n  as a Gaussian input file. On the line "
                                "where charge and multiplicity are expected, "
                                "found: '{}'".format(gausscom_file, line))

            elif section == SEC_ATOMS:
                if len(line) == 0:
                    section = SEC_TAIL
                    gausscom_content[SEC_TAIL].append(line)
                    continue
                split_line = line.split()
                atom_type = split_line[0]
                atom_xyz = np.array(list(map(float, split_line[1:4])))
                gausscom_content[SEC_ATOMS][atom_id] = {
                    ATOM_TYPE: atom_type,
                    ATOM_COORDS: atom_xyz
                }
                atom_id += 1

            elif section == SEC_TAIL:
                gausscom_content[SEC_TAIL].append(line)

    return gausscom_content
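For reference, a minimal Gaussian input of the shape this parser expects (illustrative; the route line is what GAU_HEADER_PAT is assumed to match, and the second blank header line triggers the switch to the atoms section):

# %chk=water.chk
# # opt b3lyp/6-31g(d)
# <blank line 1>
# water optimization
# <blank line 2>
# 0 1
# O    0.000    0.000    0.117
# H    0.000    0.757   -0.470
# H    0.000   -0.757   -0.470
# <blank line; this and anything after it becomes SEC_TAIL>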
def check_input(args, cfg):
    # override config entries if command-line options used
    if args.file:
        cfg[GAUSSLOG_FILE] = args.file
    if args.list:
        cfg[GAUSSLOG_FILES_FILE] = args.list
    if args.tpl:
        cfg[PDB_TPL_FILE] = args.tpl
    if args.out_dir:
        cfg[OUT_BASE_DIR] = args.out_dir
    if args.only_first:
        cfg[ONLY_FIRST] = True
    if args.only_final:
        cfg[ONLY_FINAL] = True
    if args.out_fname:
        cfg[OUTFILE_NAME] = args.out_fname

    # checking
    if cfg[COMBINE_LOGS] and not cfg[OUTFILE_NAME]:
        raise InvalidDataError("When combining outputs from multiple log files into one pdb, specify the output "
                               "file name")
    if cfg[COMBINE_LOGS] and not cfg[ONLY_FINAL]:
        warning("When combining outputs from multiple log files into one pdb, only the last coordinates of each "
                "log file will be kept.")
        cfg[ONLY_FINAL] = True

    if cfg[OUT_BASE_DIR]:
        if not os.path.exists(cfg[OUT_BASE_DIR]):
            os.makedirs(cfg[OUT_BASE_DIR])
def main(argv=None):
    print(
        f"Running GaussianWrangler script gausslog2com version {__version__}")
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    try:
        # Make sure there are files to process
        gausslog_files = check_for_files(args.file, args.list)

        # and a template file to process
        if not args.tpl:
            raise InvalidDataError("No template file ('-t' option) specified")
        if not os.path.isfile(args.tpl):
            raise IOError("Could not find the specified template file: {}".format(args.tpl))

        # Read template and data files
        com_tpl_content = process_gausscom_tpl(args.tpl, args.charge_from_tpl)
        process_gausslog_files(gausslog_files, com_tpl_content,
                               args.charge_from_tpl, args.low_energy,
                               args.step_num, args.out_dir, args.output_fname)
    except IOError as e:
        warning("Problems reading file:", e)
        return IO_ERROR
    except (InvalidDataError, UnicodeDecodeError) as e:
        warning("Problems reading data:", e)
        return INVALID_DATA

    return GOOD_RET  # success
def subtract_blank_data(blank_data, ms_data, ms_tol, ret_type=2):
    """
    Since this method compares differences (rather than building a unique list), the m/z values do not need to be
        rounded first
    :param blank_data: array-like of [m/z, intensity] rows from the blank run, sorted by m/z
    :param ms_data: array-like of [m/z, intensity] rows from the sample run, sorted by m/z
    :param ms_tol: float, tolerance for considering two m/z values a match
    :param ret_type: int, 0 for matched MZ in retention time; 2 for matched retention time but no matched MZ
    :return: ms_data with blank intensities subtracted from matched peaks, and the (possibly updated) ret_type
    """
    blank_counter = 0
    ms_counter = 0
    try:
        while ms_counter < len(ms_data) and blank_counter < len(blank_data):
            diff = abs(blank_data[blank_counter][0] - ms_data[ms_counter][0])
            # add a small buffer so machine precision doesn't prevent a match that should occur
            if diff <= ms_tol * (1 + ms_tol):
                ms_data[ms_counter][1] = ms_data[ms_counter][1] - blank_data[blank_counter][1]
                ret_type = 0
                blank_counter += 1
            elif ms_data[ms_counter][0] < blank_data[blank_counter][0]:
                ms_counter += 1
            else:
                blank_counter += 1
                min_blank_val = ms_data[ms_counter][0] - ms_tol * (1. - ms_tol)
                while blank_data[blank_counter][0] < min_blank_val:
                    blank_counter += 1
        return ms_data, ret_type
    except IndexError as e:
        if "out of bounds" in e.args[0]:
            return ms_data, ret_type
        else:
            raise InvalidDataError("Error in subtract_blank_data method")
Example #11
def smi_to_formula(smi_str):
    """
    Given a smiles string in arbitrary format, return the smiles string as produced by RDKit,
        the molecular formula, and the molecular weight using only the most abundant isotopes
    :param smi_str: str, standard SMILES format
    :return: tuple: the canonical (kekulized) SMILES str, the molecular formula str, the molecular mass,
        the deprotonated and protonated molecular weights, and the double bond equivalent
    """
    # Use RDKit to make a SMILES from a SMILES so that we get a unique string for any given SMILES entry
    mol = Chem.MolFromSmiles(smi_str)
    if mol is None:
        raise InvalidDataError(
            f"The input SMILES string '{smi_str}' could not be recognized by RDKit"
        )
    Chem.Kekulize(mol)
    rd_smi = Chem.MolToSmiles(mol, kekuleSmiles=True)
    mol_formula = CalcMolFormula(mol)
    stoich_dict = parse_stoich(mol_formula)
    dbe = calc_dbe(stoich_dict)
    mol_mass = 0
    for atom_type, num_atoms in stoich_dict.items():
        mass_most_abundant_isotope = LIGNIN_ISOTOPE_DICT[atom_type][MASS][0]
        mol_mass += mass_most_abundant_isotope * num_atoms

    mw_deprot = round(mol_mass - LIGNIN_ISOTOPE_DICT[HYDROG][MASS][0],
                      MAX_SIG_FIGS)
    mw_prot = round(mol_mass + LIGNIN_ISOTOPE_DICT[HYDROG][MASS][0],
                    MAX_SIG_FIGS)

    return rd_smi, mol_formula, round(mol_mass,
                                      MAX_SIG_FIGS), mw_deprot, mw_prot, dbe
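A hedged sketch of just the RDKit portion of smi_to_formula (parse_stoich, calc_dbe, and the isotope-mass lookup are package-specific and omitted here):

from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

mol = Chem.MolFromSmiles("c1ccccc1O")            # phenol
Chem.Kekulize(mol)
print(Chem.MolToSmiles(mol, kekuleSmiles=True))  # canonical Kekule SMILES
print(CalcMolFormula(mol))                       # 'C6H6O'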
def check_and_print(cfg, atom_id, pdb_tpl_content, gausslog_file, pdb_data_section, f_name, mode, message):
    # Check Num atoms and print
    if cfg[PDB_TPL_FILE]:
        if atom_id != pdb_tpl_content[NUM_ATOMS]:
            raise InvalidDataError('In gausslog file: {}\nfound {} atoms, while the pdb template has {} '
                                   'atoms'.format(gausslog_file, atom_id, pdb_tpl_content[NUM_ATOMS]))
    list_to_file(pdb_tpl_content[SEC_HEAD] + pdb_data_section + pdb_tpl_content[SEC_TAIL],
                 f_name, list_format=PDB_FORMAT, mode=mode, print_message=message)
def find_common_ret_times(blank_data_array, ms_data_array, ret_time_accuracy, num_decimals_ms_accuracy, ppm_threshold):
    """
    Facilitates comparing two arrays including ret time by making them into dicts with the ret times (as strings)
    as keys. Keeps all the ms run data in the dict, but only the data for common ret times from the blank data
    :param blank_data_array: ndarray with m/z, intensity, and retention time
    :param ms_data_array: ndarray with m/z, intensity, and retention time
    :param ret_time_accuracy: float, accuracy used to determine if retention times are significantly different
    :param num_decimals_ms_accuracy: int, number of decimal points in MS accuracy, for rounding
    :param ppm_threshold: float, the tolerance (in ppm) to consider two M/Z values identical
    :return: lists of retention times and dicts with the retention times as keys and ndarrays of peak and intensity
        data only for that retention time
    """
    # Already rounded before getting here, so not rounding again here
    blank_ret_times = np.unique(blank_data_array[:, 2])
    ms_ret_times = np.unique(ms_data_array[:, 2])

    # because of accuracy tolerance, not just using np.unique, but iterating to find those within tolerance
    common_ret_times = []
    ms_ret_time_dict = {}
    blank_ret_time_dict = {}

    blank_ret_time_counter = 0
    # want to keep all the ms run data (in a dict to make it easier to match up to blank data), but we don't
    #     need to keep all the blank data
    float_error_factor = 1.000001
    for ret_time in ms_ret_times:
        ret_time_str = str(ret_time)
        # grabbing sub array
        ms_ret_time_dict[ret_time_str] = ms_data_array[ms_data_array[:, 2] == ret_time]
        if blank_ret_time_counter < len(blank_ret_times):
            try:
                # multiplying by float_error_factor guards against machine precision error in storing floats
                while ret_time > blank_ret_times[blank_ret_time_counter] * float_error_factor:
                    blank_ret_time_counter += 1
                diff = abs(ret_time - blank_ret_times[blank_ret_time_counter])
                if diff <= ret_time_accuracy:
                    common_ret_times.append(ret_time_str)
                    blank_ret_time = blank_ret_times[blank_ret_time_counter]
                    # grabbing sub array
                    sub_array = blank_data_array[blank_data_array[:, 2] == blank_ret_time]
                    if ret_time_str in blank_ret_time_dict:
                        # this is very unlikely to happen, but I don't mind checking for edge cases
                        blank_ret_time_dict[ret_time_str] = np.concatenate((blank_ret_time_dict[ret_time_str],
                                                                           sub_array), axis=0)
                        blank_ret_time_dict[ret_time_str][:, 2] = np.nan
                        blank_ret_time_dict[ret_time_str] = \
                            trim_close_mz_vals(blank_ret_time_dict[ret_time_str], num_decimals_ms_accuracy,
                                               ppm_threshold, ret_time_accuracy, len(blank_ret_time_dict[ret_time_str]))
                    else:
                        blank_ret_time_dict[ret_time_str] = sub_array
            except IndexError as e:
                if "out of bounds" in e.args[0]:
                    continue
                else:
                    raise InvalidDataError(e.args[0])
    return common_ret_times, blank_ret_time_dict, ms_ret_time_dict
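The "grabbing sub array" steps above rely on numpy boolean indexing by retention time; a minimal illustration with made-up peaks:

import numpy as np

data = np.array([[100.0, 50., 1.5],
                 [101.0, 30., 1.5],
                 [102.0, 20., 2.0]])   # columns: m/z, intensity, retention time
sub = data[data[:, 2] == 1.5]          # all peaks recorded at retention time 1.5
# sub contains only the first two rows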
def main(argv=None):
    print(f"Running GaussianWrangler script gausslog2pdb version {__version__}")

    # Read input
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    cfg = args.config

    # Read template and data files
    try:
        check_input(args, cfg)

        # set up list of files to process
        cfg[GAUSSLOG_FILES] = []
        if os.path.isfile(cfg[GAUSSLOG_FILES_FILE]):
            with open(cfg[GAUSSLOG_FILES_FILE]) as f:
                for data_file in f:
                    cfg[GAUSSLOG_FILES].append(data_file.strip())
        if cfg[GAUSSLOG_FILE] is not None:
            cfg[GAUSSLOG_FILES].append(cfg[GAUSSLOG_FILE])
        if len(cfg[GAUSSLOG_FILES]) == 0:
            raise InvalidDataError("No files to process: no '{}' specified and "
                                   "no list of files found for: {}".format(GAUSSLOG_FILE, cfg[GAUSSLOG_FILES_FILE]))
        if cfg[ONLY_FIRST] and cfg[ONLY_FINAL]:
            raise InvalidDataError("Cannot specify both '{}' and '{}'".format(ONLY_FIRST, ONLY_FINAL))

        # now start the actual work
        if cfg[PDB_TPL_FILE]:
            pdb_tpl_content = process_pdb_file(cfg[PDB_TPL_FILE])
        else:
            pdb_tpl_content = {}
        process_gausscom_files(cfg, pdb_tpl_content)
    except (IOError, UnicodeDecodeError) as e:
        warning("Problems reading file:", e)
        return IO_ERROR
    except InvalidDataError as e:
        warning("Problems reading data:", e)
        return INVALID_DATA

    return GOOD_RET  # success
Example #15
def process_input_file(input_fname, mw_formula_dict, mw_deprot_formula_dict,
                       mw_prot_formula_dict, form_smi_dict, form_dbe_dict,
                       smi_name_dict, smi_source_dict):
    """
    Reads the file and uses the data to update the dictionaries
    :return: the number of entries that were added to the dictionaries
    """
    rel_path_name = os.path.relpath(input_fname)
    new_entries = 0
    with open(input_fname) as f:
        for line in f:
            stripped_line = line.strip()
            if len(stripped_line) == 0:
                continue
            line_list = [
                entry.strip() for entry in stripped_line.split(SEP_KEY)
            ]
            # if there is no SMILES str, there is no way to properly add any data to the library
            if not line_list[0]:
                warning(
                    f"In reading file: {rel_path_name}\n    Line: '{stripped_line}'\n        does not "
                    f"provide a SMILES string as the first '|'-separated entry. This line will be skipped."
                )
                continue
            # if there aren't 3 entries, pad with blank strings, since the 2nd and 3rd entries are optional
            while len(line_list) < 3:
                line_list.append("")
            if len(line_list) > 3:
                rel_path = os.path.relpath(input_fname)
                raise InvalidDataError(
                    f"Error while reading: {rel_path}\n    line: '{stripped_line}'\n"
                    f"    Expected no more than 3 '|'-separated values: \n        SMILES "
                    f"string (only one per line),\n        molecule name(s) (separate "
                    f"multiple names with semicolons),\n        string description of the "
                    f"data source (with no '|' characters or semicolons)")

            # being explicit in separating out line_list entries; do not change global variables
            new_entry_flag = add_smi_to_dicts(mw_formula_dict,
                                              mw_deprot_formula_dict,
                                              mw_prot_formula_dict,
                                              form_smi_dict,
                                              form_dbe_dict,
                                              smi_name_dict,
                                              smi_source_dict,
                                              line_list[0],
                                              mol_name=line_list[1],
                                              mol_source=line_list[2])
            if new_entry_flag:
                new_entries += 1
    print(
        f"Completed reading file: {rel_path_name}\n    Added {new_entries} entries to the dictionaries\n"
    )
    return new_entries
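A hypothetical example of the input format parsed above, assuming SEP_KEY is '|' (as the warning message indicates):

line = "COc1cc(C=O)ccc1O | vanillin; 4-hydroxy-3-methoxybenzaldehyde | model compound analysis"
line_list = [entry.strip() for entry in line.split("|")]
# line_list[0] is the SMILES string, line_list[1] the semicolon-separated
# name(s), and line_list[2] the data source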
Example #16
def validate_input(args):
    """
    Checks for valid command-line input and performs any required casting
    :param args: command-line input and default values for program options
    """
    # '-d', '-e', '-f', and '-l' skipped: they are already the required type (str) and validation is performed as
    #     part of the function that looks for files to process
    # '-s' skipped: Boolean will be returned by argparse, and an error if the user tries to give it a value

    try:
        # if already a float, no problem
        args.threshold = float(args.threshold)
        if args.threshold < 0 or args.threshold > 1000:
            raise ValueError
    except ValueError:
        raise InvalidDataError(f"Read '{args.threshold}' for the threshold value (in ppm; '-t' option) for matching "
                               f"M/Z to MW. \n    This must be a non-negative number, no greater than 1000.")

    args.ms_accuracy, args.num_decimals_ms_accuracy = validate_decimal_input(args.ms_accuracy, "'-a'/'--ms_accuracy'")
    args.ret_time_accuracy, args.num_decimals_ret_time_accuracy = validate_decimal_input(args.ret_time_accuracy,
                                                                                         "'-r'/'--ret_time_accuracy'")
    if args.unlabeled_csvs:
        args.direct_injection = True
        # When unlabeled CSVS are chosen, peaks should be combined as with direct injection
        args.numpy_save_fmt = f'%.{args.num_decimals_ms_accuracy}f,%.0f'
    else:
        args.numpy_save_fmt = f'%.{args.num_decimals_ms_accuracy}f,%.0f,%.{args.num_decimals_ret_time_accuracy}f'

    try:
        args.min_rel_intensity = float(args.min_rel_intensity)
        if args.min_rel_intensity < 0 or args.min_rel_intensity > 100:
            raise ValueError
    except ValueError:
        raise InvalidDataError(f"Read {args.min_rel_intensity}% for the minimum relative intensity (percent of "
                               f"the maximum intensity required\n    for peak to be analyzed; "
                               f"'-m' option). This must be a non-negative number, no greater than 100.")
Example #17
def read_cfg(floc, cfg_proc=process_cfg):
    """
    Reads the given configuration file, returning a dict with the converted values supplemented by default values.

    :param floc: The location of the file to read.
    :param cfg_proc: The processor to use for the raw configuration values.  Uses default values when the raw
        value is missing.
    :return: A dict of the processed configuration file's data.
    """
    config = ConfigParser()
    good_files = config.read(floc)
    if good_files:
        main_proc = cfg_proc(dict(config.items(MAIN_SEC)),
                             def_cfg_vals=DEF_CFG_VALS,
                             req_keys=REQ_KEYS)
        if main_proc[NUM]:
            main_proc[NUM] = int(main_proc[NUM])
    else:
        main_proc = {GAU_TPL_FILE: None, CONFIG_NAME: floc}
        for key, def_val in DEF_CFG_VALS.items():
            main_proc[key] = def_val

    main_proc[DIH_DATA] = []
    if main_proc[DIH_ROT] is not None:
        try:
            dih_list = main_proc[DIH_ROT].split(";")
            for dih in dih_list:
                dih_data = dih.split(",")
                if len(dih_data) != 5:
                    raise IndexError
                # note: RDKit is zero-based with atom indices, thus subtracting one from each number
                dih_data[:4] = [int(x) - 1 for x in dih_data[:4]]
                # noinspection PyTypeChecker
                dih_data[4] = float(dih_data[4])
                main_proc[DIH_DATA].append(dih_data)
        except (ValueError, IndexError):
            raise InvalidDataError(
                "Error in parsing dihedral entry. Enter multiple dihedrals by separating data "
                "with a semicolon (';'). Each dihedral should be specified with 5 values, were the "
                "first four are one-based integer atom ids, and the last value is the rotation "
                "increment in degrees. ")

    if main_proc[MAX_CONF]:
        main_proc[MAX_CONF] = int(main_proc[MAX_CONF])
    return main_proc
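An illustration of the DIH_ROT parsing above with a made-up config value; note the one-based atom ids becoming RDKit's zero-based ids:

dih_rot = "1,2,3,4,120; 5,6,7,8,90"
dih_data_list = []
for dih in dih_rot.split(";"):
    dih_data = dih.split(",")
    dih_data[:4] = [int(x) - 1 for x in dih_data[:4]]
    dih_data[4] = float(dih_data[4])
    dih_data_list.append(dih_data)
# dih_data_list == [[0, 1, 2, 3, 120.0], [4, 5, 6, 7, 90.0]]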
Example #18
def process_pdb_files(cfg, gau_tpl_content):
    pdb_files = []
    if cfg[PDB_FILE]:
        if os.path.isfile(cfg[PDB_FILE]):
            pdb_files.append(cfg[PDB_FILE])
        else:
            raise IOError(cfg[PDB_FILE])
    if os.path.isfile(cfg[PDB_LIST_FILE]):
        with open(cfg[PDB_LIST_FILE]) as f:
            for pdb_file in f.readlines():
                pdb_file = pdb_file.strip()
                if len(pdb_file) > 0:
                    pdb_files.append(pdb_file)
    if len(pdb_files) == 0:
        raise InvalidDataError("No pdb files found to process.")
    if cfg[DIH_DATA]:
        rotate_dihes_pdb_files(cfg, gau_tpl_content, pdb_files)
    else:
        for pdb_file in pdb_files:
            process_pdb_file(cfg, gau_tpl_content, pdb_file)
def check_input_csv_header(fname):
    """
    Checks the first line of the specified file for the expected header
    :param fname: str, the location of the file to check the header
    :return: num_header_lines, int: 1 by default; 0 if it appears that the header is missing
    """
    num_header_lines = 1
    potential_header = read_csv_header(fname)
    base_fname = os.path.relpath(fname)
    if potential_header is None:
        raise InvalidDataError(f"Input file may be blank: {base_fname}")
    while potential_header[0].startswith("#"):
        with open(fname) as f:
            for row in f:
                if row.startswith("#"):
                    num_header_lines += 1
                else:
                    potential_header = row.strip().split(",")
                    potential_header = [dequote(x) for x in potential_header]
                    break
            else:
                # guard against a file of only comment lines, which would otherwise loop forever
                raise InvalidDataError(f"No data found in file: {base_fname}")
    if potential_header != TYPICAL_CSV_HEADER and potential_header != CSV_RET_HEADER:
        try:
            # Still move on to reading values, but first check if there may not be a header
            if len(potential_header) > 1:
                # if right into values (that is, no trouble converting to float), continue to reading values
                float(potential_header[0])
                float(potential_header[1])
                num_header_lines = 0
                warning(f"No header found in file: {base_fname}\n    Will attempt to read data as M/Z and intensity.")
            else:
                raise ValueError
        except ValueError:
            # check that the difference is not a trivial difference in case
            if (len(potential_header) in [2, 3]) and (potential_header[0].lower() == TYPICAL_CSV_HEADER[0].lower()) \
                    and (potential_header[1].lower() == TYPICAL_CSV_HEADER[1].lower()):
                pass
            else:
                warning(f"While reading file: {base_fname}\n    Did not find the expected headers "
                        f"'{TYPICAL_CSV_HEADER}', but '{potential_header}'\n Will attempt to read data as M/Z, "
                        f"intensity, and, if there is a third column, retention time (in min).")
    return num_header_lines
Example #20
def check_if_files_to_be_saved(cfg):
    """
    Evaluate input for requests to save output and check for valid specified locations
    :param cfg: dict of configuration values
    :return: if the cfg indicates that files should be created, returns an updated cfg dict; raises errors if
              invalid data is encountered
    """
    if cfg[OUT_FORMAT_LIST]:
        # remove any periods to aid comparison; might as well also change comma to space and then split on just space
        out_format_list = cfg[OUT_FORMAT_LIST].replace(".", " ").replace(",", " ")
        format_set = set(out_format_list.split())
    else:
        format_set = set()

    if cfg[BASENAME] and (cfg[BASENAME] != DEF_BASENAME):
        # If cfg[BASENAME] includes a directory and/or extension, split those off and store them separately
        out_path, base_name = os.path.split(cfg[BASENAME])
        if out_path and cfg[OUT_DIR]:
            cfg[OUT_DIR] = os.path.join(cfg[OUT_DIR], out_path)
        elif out_path:
            cfg[OUT_DIR] = out_path
        base, ext = os.path.splitext(base_name)
        cfg[BASENAME] = base
        format_set.add(ext.replace(".", ""))

    if len(format_set) > 0:
        for format_type in format_set:
            if format_type in OUT_TYPE_LIST:
                cfg[SAVE_FILES] = True
                cfg[format_type] = True
            else:
                raise InvalidDataError(f"Invalid extension provided: '{format_type}'. The currently supported types "
                                       f"are: '{OUT_TYPE_STR}'")
    if cfg[PLOT_BONDS]:
        cfg[SAVE_FILES] = True

    # create out_dir if it does not already exist, but only if we will actually need it
    if cfg[SAVE_FILES] and cfg[OUT_DIR]:
        make_dir(cfg[OUT_DIR])
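A quick illustration of the extension parsing above with a hypothetical OUT_FORMAT_LIST value; periods and commas are normalized to spaces before splitting:

out_format_list = "png, .svg json"
format_set = set(out_format_list.replace(".", " ").replace(",", " ").split())
# format_set == {'png', 'svg', 'json'}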
Example #21
def process_smiles(gau_tpl_fname, smi_list, max_num_confs, out_dir):
    """
    Creates Gaussian input files for each SMILES string provided
    https://www.rdkit.org/docs/GettingStartedInPython.html
    :param smi_list: list of SMILES strings
    :param gau_tpl_fname: str, the location of the template file to use to create input files
    :param max_num_confs: int, the maximum number of conformations to generate
    :param out_dir: str, directory where files are to be saved (if None, saves to working directory)
    :return: N/A, writes files and prints notes on files created
    """
    gau_tpl_str = read_tpl(gau_tpl_fname)
    if REQ_STR not in gau_tpl_str:
        raise InvalidDataError(
            f"Did not find the required string '{REQ_STR}' in the provided Gaussian input "
            f"template file.")
    for smi in smi_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            warning(f"Skipping SMILES input string '{smi}' due to error\n")
            continue
        Chem.Kekulize(mol)
        mol = AddHs(mol)
        confs = gen_conformers(mol, num_confs=max_num_confs)
        mol_name = get_mol_name(smi)
        base_fname = create_out_fname(mol_name,
                                      ext='com',
                                      base_dir=out_dir,
                                      rel_path=True)
        conf_id = -1  # make IDE happy
        for conf_id in confs:
            com_fname = create_out_fname(base_fname, suffix=f'_{conf_id}')
            pdb_str = MolToPDBBlock(mol, confId=conf_id)
            coord_list = get_pdb_coord_list(pdb_str)
            fill_save_tpl(gau_tpl_str, {ATOMS: "\n".join(coord_list)},
                          gau_tpl_fname,
                          com_fname,
                          print_info=False)
        print(f"Wrote {conf_id + 1} files with base name '{base_fname}'")
Example #22
def produce_output(adj_matrix, mono_list, cfg):
    if cfg[SUPPRESS_SMI] and not (cfg[SAVE_JSON] or cfg[SAVE_PNG] or cfg[SAVE_SVG]):
        format_list = [SAVE_TCL]
        mol = None  # Make IDE happy
    else:
        # Default output is SMILES, which requires an RDKit molecule object; the molecule object is also
        #    required for everything except the TCL format
        format_list = [SAVE_TCL, SAVE_JSON, SAVE_PNG, SAVE_SVG]
        block = generate_mol(adj_matrix, mono_list)
        mol = MolFromMolBlock(block)
        try:
            smi_str = MolToSmiles(mol) + '\n'
        except Exception:
            raise InvalidDataError("Error in producing SMILES string.")
        # if SMI is to be saved, don't output to stdout
        if cfg[SAVE_SMI]:
            fname = create_out_fname(cfg[BASENAME], base_dir=cfg[OUT_DIR], ext=SAVE_SMI)
            str_to_file(smi_str, fname, print_info=True)
        else:
            print("\nSMILES representation: \n", MolToSmiles(mol), "\n")
        if cfg[SAVE_PNG] or cfg[SAVE_SVG] or cfg[SAVE_JSON]:
            # PNG and SVG make 2D images and thus need coordinates
            # JSON will save coordinates--zero's if not computed; might as well compute and save non-zero values
            Compute2DCoords(mol)

    for save_format in format_list:
        if cfg[save_format]:
            fname = create_out_fname(cfg[BASENAME], base_dir=cfg[OUT_DIR], ext=save_format)
            if save_format == SAVE_TCL:
                gen_tcl(adj_matrix, mono_list, tcl_fname=fname, chain_id=cfg[CHAIN_ID],
                        psf_fname=cfg[PSF_FNAME], toppar_dir=cfg[TOPPAR_DIR], out_dir=cfg[OUT_DIR])
            elif save_format == SAVE_JSON:
                json_str = MolToJSON(mol)
                str_to_file(json_str + '\n', fname)
            elif save_format == SAVE_PNG or save_format == SAVE_SVG:
                MolToFile(mol, fname, size=cfg[IMAGE_SIZE])
            print(f"Wrote file: {fname}")
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.
    `argv` is a list of arguments, or `None` for ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(
        description=
        'Checks for normal termination of Gaussian output files in a '
        'specified directory, and moves them to a new location.')
    parser.add_argument(
        "-a",
        "--all",
        help="Check convergence of all steps and print to standard out.",
        action="store_true",
        default=False)
    parser.add_argument(
        "-b",
        "--best",
        help=
        "Check convergence of each step and list the convergence of the best 10 "
        "steps, sorted by convergence.",
        action="store_true",
        default=False)
    parser.add_argument(
        "-d",
        "--directory",
        help=
        "The directory where to look for Gaussian output files to check for "
        "normal termination, without checking in subdirectories.",
        metavar="path",
        default=None)
    parser.add_argument(
        "-ds",
        "--dir_subdirs",
        help="The directory where to look for Gaussian output files to check "
        "for normal termination, including checking in subdirectories.",
        metavar="path",
        default=None)
    parser.add_argument(
        "-e",
        "--extension",
        help="The extension of the Gaussian output file(s) to look for when "
        "searching a directory for output files. The default is '{}'."
        "".format(DEF_EXT),
        metavar="ext",
        default=DEF_EXT)
    parser.add_argument(
        "-f",
        "--file_name",
        help=
        "A file name (with path, if not the current directory) to check for "
        "either normal termination or convergence. If used, this option "
        "overrides the '-d' option, and no searching for files is "
        "performed.",
        metavar="path",
        default=None)
    parser.add_argument(
        "-l",
        "--file_list",
        help="A file name (with path, if not the current directory) with a "
        "list of files (also with path, if not the current directory)  "
        "overrides the '-d' option, and no searching for files is to check "
        "for either normal termination or convergence. If used, this "
        "option overrides the '-d' option, and no searching for files is "
        "performed.",
        metavar="path",
        default=None)
    parser.add_argument(
        "-o",
        "--output_directory",
        help="The directory where to put Gaussian output files that have "
        "terminated normally. The default is '{}'."
        "".format(DEF_COMPLETE_DIR),
        metavar="path",
        default=DEF_COMPLETE_DIR)
    parser.add_argument(
        "-s",
        "--step_converg",
        help="Report the convergence for each step value for the files in the "
        "directory or those specified with the '-f' or '-l' options. When "
        "this option is chosen, the check for normal termination is "
        "skipped. The default is False.",
        action="store_true",
        default=False)
    parser.add_argument(
        "-t",
        "--to_step",
        help="Check convergence of each step only to provided step number, and "
        "before printing to standard out, sort by convergence.",
        default=False)
    parser.add_argument(
        "-z",
        "--final_converg",
        help="Report the final convergence value for the files in the "
        "directory or those specified with the '-f' or '-l' options. "
        "When this option is chosen, the check for normal termination "
        "is skipped. The default is False.",
        action="store_true",
        default=False)
    parser.add_argument(
        "--scan",
        help=
        "Reads output file(s) from a scan, writes the converged energies from each "
        "point of the scan to a csv file, and creates a plot saved as the given file "
        "name.",
        metavar="path",
        default=None)
    args = None
    try:
        args = parser.parse_args(argv)
        if args.to_step or args.best or args.all:
            args.step_converg = True
        if args.to_step:
            try:
                args.to_step = int(args.to_step)
            except ValueError:
                raise InvalidDataError(
                    "When the '-t' option is used, an integer must be provided."
                )
        if args.step_converg and args.final_converg:
            raise InvalidDataError(
                "Choose either the '-a', '-b', '-s', '-t', or '-z' option.")
        # make the default output directory a subdirectory of the directory to search
        if args.output_directory == DEF_COMPLETE_DIR:
            if args.dir_subdirs:
                args.output_directory = os.path.relpath(
                    os.path.join(args.dir_subdirs, DEF_COMPLETE_DIR))
            if args.directory:
                args.output_directory = os.path.relpath(
                    os.path.join(args.directory, DEF_COMPLETE_DIR))

    except (KeyError, InvalidDataError, MissingSectionHeaderError,
            SystemExit) as e:
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR

    return args, GOOD_RET
Example #24
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.
    `argv` is a list of arguments, or `None` for ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(
        description=
        "This script has two modes, chosen by selected '-f' or '-i': "
        "1) The '-f' option: reads a file to add entries to "
        "dictionaries of lignin decomposition molecules that may be "
        "observed in mass spectrometry of lignin-derived compounds. Given "
        "SMILES strings, and optionally/ideally molecular names and/or source "
        "of the SMILES (e.g. observed in analysis of model compounds), the "
        "dictionaries are expanded to include additional potentially "
        "observed molecular weights and isomers. Note: it does not change "
        "the original libraries within this package, but instead outputs "
        "new libraries, which could be used to update the library in this "
        "package. 2) The '-i' option: creates an image library of all "
        "SMILES structures currently in the compound library (further details "
        "provided under the '-i' option description).")
    parser.add_argument(
        "-d",
        "--out_dir",
        help=
        "A directory where output files should be saved. The default location "
        "is the current working directory.",
        default=None)
    parser.add_argument(
        "-f",
        "--file_name",
        help=f"File name of values separated by '{SEP_KEY}' (to avoid conflicts "
        f"with IUPAC molecule names) with up to 3 values per line: SMILES "
        f"string (required), molecule name(s) (optional; split multiple "
        f"names with a semicolon), source (e.g. model compound analysis)",
        default=None)
    parser.add_argument(
        "-i",
        "--image_library",
        help=f"Flag to request that the program create a 2D image library of "
        f"the SMILES strings in the library. One file will be created "
        f"per exact molecular weight (calculated only from the most "
        f"abundant isotope). If there are multiple SMILES matches for a "
        f"molecular formula, the name of the file is '{{molecular "
        f"weight (with a '-' instead of a '.')}}_{{molecular formula}}"
        f".png', and the images of each structure within the file will "
        f"be labeled with its SMILES string. If there is only one "
        f"structure in the library for a molecular formula, the SMILES "
        f"string will be appended to the name. These files will be "
        f"saved in the current directory, unless a different directory "
        f"is specified with the '-o' option.",
        action='store_true')
    parser.add_argument(
        "-m",
        "--mw_list",
        help="A list of molecular weight keys for making an image library.",
        default=None)

    args = None
    try:
        args = parser.parse_args(argv)
        if not args.image_library and not args.file_name:
            raise InvalidDataError(
                "Please choose to either provide a file_name ('-f') to read new dictionary "
                "entries, or the image_library flag ('-i') to request 2D image library."
            )
    except (KeyError, InvalidDataError, IOError, SystemExit) as e:
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR
    return args, GOOD_RET
def process_gausscom_file(cfg, gausscom_file, pdb_tpl_content):
    with open(gausscom_file) as d:
        if cfg[PDB_TPL_FILE]:
            pdb_data_section = copy.deepcopy(pdb_tpl_content[SEC_ATOMS])
        else:
            pdb_data_section = []
        section = SEC_HEAD
        atom_id = 0

        for line in d:
            line = line.strip()
            # not currently keeping anything from the header; just check num atoms
            if section == SEC_HEAD:
                # there may be some instructions (which start with %, and can have some blank lines) before the
                #    "route card lines" (which start with #)
                while not GAU_HEADER_PAT.match(line):
                    line = next(d).strip()
                # skip first line of route card
                line = next(d).strip()
                # for "route card" and then description, there may be more than one header line; look for blank line
                for i in range(2):
                    while len(line) > 0:
                        line = next(d).strip()
                    # now move past the blank line, and get the content of the following line
                    line = next(d).strip()
                # now on charge, multiplicity line, which we also skip with the "continue"
                section = SEC_ATOMS
                continue

            elif section == SEC_ATOMS:
                if len(line) == 0:
                    # Since the tail will come only from the template, nothing more is needed after reading atoms
                    break
                split_line = line.split()

                atom_type = split_line[0]
                # if working from a template, check atom type
                if cfg[PDB_TPL_FILE]:
                    try:
                        pdb_atom_type = pdb_data_section[atom_id][8].split(
                            ' ')[-1]
                    except IndexError:
                        raise InvalidDataError(
                            'Gausscom file: {}\n   has more atoms than the expected {} atoms in '
                            'the template file: {}'.format(
                                gausscom_file, pdb_tpl_content[NUM_ATOMS],
                                cfg[PDB_TPL_FILE]))
                    if atom_type != pdb_atom_type:
                        warning(
                            "Atom types do not match for atom number {}; pdb atom type is {} while gausscom type "
                            "is {}".format(atom_id, pdb_atom_type, atom_type))
                else:
                    pdb_data_section.append(atom_id)
                    pdb_data_section[atom_id] = [
                        'HETATM', '{:5d}'.format(atom_id + 1),
                        ' {:4} '.format(atom_type), 'UNL  ', 1, 0.0, 0.0, 0.0,
                        '  1.00  0.00          {:>2}'.format(atom_type)
                    ]
                pdb_data_section[atom_id][5:8] = map(float, split_line[1:4])
                atom_id += 1

    # Now that the file has been read, make sure the loop did not exit before reaching the expected number of atoms
    if cfg[PDB_TPL_FILE]:
        if atom_id != pdb_tpl_content[NUM_ATOMS]:
            raise InvalidDataError(
                'In gausscom file: {}\n  found {} atoms, while the pdb template has {} atoms'
                .format(gausscom_file, atom_id, pdb_tpl_content[NUM_ATOMS]))
    f_name = create_out_fname(gausscom_file,
                              ext='.pdb',
                              base_dir=cfg[OUT_BASE_DIR])
    list_to_file(pdb_tpl_content[SEC_HEAD] + pdb_data_section +
                 pdb_tpl_content[SEC_TAIL],
                 f_name,
                 list_format=PDB_FORMAT)
Example #26
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.
    `argv` is a list of arguments, or `None` for ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(
        description=
        'Sets up and runs series of Gaussian jobs, checking between jobs '
        'for normal termination.')
    parser.add_argument(
        "job_name",
        help=
        "The job name to run. If the first job to run is '', a Gaussian input file "
        "(with extension '{}' or specified with '{}' argument in the config file) is "
        "needed. Otherwise, a checkpoint file (with extension '.chk') is "
        "needed.".format(DEF_GAUSS_IN_EXT, GAUSS_IN_EXT))
    parser.add_argument(
        "-c",
        "--config",
        help="The location of the configuration file in ini format. "
        "The default file name is {}, located in the base directory "
        "where the program as run.".format(DEF_CFG_FILE),
        default=DEF_CFG_FILE,
        type=read_cfg)
    parser.add_argument(
        "-i",
        "--ignore_chk_warning",
        help="Ignore warning that a chk file cannot be found in the "
        "current directory for a job that will attempt to read it. "
        "Default is False.",
        action="store_true",
        default=False)
    parser.add_argument(
        "-l",
        "--list_of_jobs",
        help="The input in the position of 'job_name' will be read as a file "
        "name with a list of jobs to set up and submit. Each job name "
        "should be on a separate line. Any extension, or none, can follow "
        "the job name. If a 'setup_submit' or 'list_of_jobs' are not "
        "specified, the script will instead attempt to run the 'job_name'."
        " The default is False.",
        action="store_true",
        default=False)
    parser.add_argument(
        "-n",
        "--no_submit",
        help="Set up jobs without submitting them. This flag only effects the "
        "'-s' and '-l' options.",
        action="store_true",
        default=False)
    parser.add_argument(
        "-o",
        "--old_chk_fname",
        help="The name of the checkpoint file (will use base name plus "
        "'.chk' whether or not an extension of any type is provided) "
        "to be used for the first job (optional).",
        default=None)
    parser.add_argument(
        "-s",
        "--setup_submit",
        help="The script will setup and submit, rather than run, the provided "
        "'job_name'. Any extension, or none, can be included in the job "
        "name. If a 'single_job' or 'list_of_jobs' are not specified, "
        "the script will instead attempt to run the 'job_name'. The "
        "default is False.",
        action="store_true",
        default=False)
    parser.add_argument(
        "-t",
        "--testing",
        help="Run in testing mode, which will not check for normal Gaussian "
        "termination before continuing. Default is False.",
        action="store_true",
        default=False)

    args = None
    try:
        args = parser.parse_args(argv)
        if args.setup_submit and args.list_of_jobs:
            raise InvalidDataError(
                "Cannot choose both 'setup_submit' and 'list_of_jobs' options")
        if args.list_of_jobs:
            if not os.path.isfile(args.job_name):
                raise IOError(
                    "When using the 'list_of_jobs' option, the first positional argument \n    ('job_name') "
                    "must be the name of the file with the list of jobs. "
                    "Could not read: {}".format(args.job_name))

        if not (args.list_of_jobs or args.setup_submit):
            if len(args.config[JOB_LIST]) > 1:
                raise InvalidDataError(
                    "Found ';' in the '{}'. This option (setting up multiple job threads) is "
                    "currently only supported for setting up (and optionally submitting) jobs "
                    "(using the '-s' or '-l' options).".format(JOB_LIST))
            elif len(args.config[JOB_LIST]) == 1:
                args.config[JOB_LIST] = args.config[JOB_LIST][0]

    except IOError as e:
        warning("Problems reading file:", e)
        parser.print_help()
        return args, IO_ERROR
    except (KeyError, InvalidDataError, MissingSectionHeaderError,
            SystemExit) as e:
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR

    return args, GOOD_RET
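
A minimal, self-contained sketch of the job-list convention the ';' check above
enforces (the helper name is illustrative, not from the original module): ';'
separates independent job threads and ',' separates jobs within a thread, so a
plain run without '-s' or '-l' requires at most one thread after parsing.

def parse_job_list(raw):
    # None means the key was absent; '' is still a valid (single, empty) job entry
    if raw is None:
        return []
    return [[job.strip() for job in thread.split(',')]
            for thread in raw.split(';')]

assert parse_job_list('opt,freq;ts') == [['opt', 'freq'], ['ts']]  # two threads
assert parse_job_list('opt,freq') == [['opt', 'freq']]             # one thread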
Example #27
def run_job(job, job_name_perhaps_with_dir, tpl_dict, cfg, testing_mode):
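    """
    Fills the job-run template for the given job, writes an executable run script,
    and (unless in testing mode) runs it and verifies normal Gaussian termination.
    """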
    # Determine if it will run fresh or from an old checkpoint
    if job == '':
        new_job_name = tpl_dict[JOB_NAME]
        tpl_dict[INPUT_FILE] = job_name_perhaps_with_dir + cfg[GAUSS_IN_EXT]
        if cfg[FIRST_JOB_CHK]:
            tpl_dict[OLD_CHECK_ECHO] = cfg[OLD_CHECK_ECHO].format(
                cfg[FIRST_JOB_CHK])
        else:
            tpl_dict[OLD_CHECK_ECHO] = ''
    else:
        new_job_name = tpl_dict[JOB_NAME] + '_' + job
        tpl_dict[OLD_JOB_NAME] = tpl_dict[JOB_NAME]
        tpl_dict[OLD_CHECK_ECHO] = cfg[OLD_CHECK_ECHO].format(
            tpl_dict[OLD_JOB_NAME])
        tpl_dict[INPUT_FILE] = cfg[TPL_DICT][job]

    tpl_file = cfg[JOB_RUN_TPL]
    job_runner_fname = create_out_fname(new_job_name,
                                        ext=".sh",
                                        base_dir=cfg[OUT_DIR])
    print("Running {}".format(new_job_name))

    tpl_dict[JOB_NAME] = new_job_name
    for key_name in [
            USER,
            MEM,
            PROC_LIST,
    ]:
        if key_name in cfg:
            tpl_dict[key_name] = cfg[key_name]

    tpl_str = read_tpl(tpl_file)
    # if either MEM or PROC_LIST is still the default (None) and appears in the job template, query the node
    #    for that info before creating the job script
    mem_required = '{' + MEM + '}' in tpl_str
    get_mem = mem_required and not tpl_dict[MEM]

    proc_required = '{' + PROC_LIST + '}' in tpl_str
    get_proc = proc_required and not tpl_dict[PROC_LIST]

    default_gauss_required = '{' + DEF_ROUTE + '}' in tpl_str

    num_procs = 1  # to make IDE happy
    proc_list = '0'  # to make IDE happy

    if get_mem or get_proc or default_gauss_required:
        # explicitly check each possible required info flag, because any or all can be requested
        if testing_mode:
            hostname = subprocess.check_output(["echo", "r1i7n35"
                                                ]).decode("utf-8").strip()
        else:
            # not exercised in testing mode, since the output depends on the execution host
            hostname = subprocess.check_output(["hostname"
                                                ]).decode("utf-8").strip()
        print(
            "Obtaining available memory and/or number of processors on node {}.\n    "
            "Note: this program assumes the whole node will be allocated to Gaussian.\n"
            .format(hostname))
        if get_mem:
            tpl_dict[MEM] = get_node_mem(testing_mode)

        max_cache = 1024 * 1024  # placeholder; Gaussian's conservative default CacheSize is 1024 * 1024
        if get_proc or default_gauss_required:
            num_procs, proc_list, max_cache = get_proc_info(testing_mode)
        if get_proc:
            tpl_dict[PROC_LIST] = proc_list
            print(
                "    Found {} processors. Will allow use of cpus {}.\n".format(
                    num_procs, proc_list))

        if get_mem or get_proc:
            print(
                "    The user may override these values by specifying the '{}' and/or '{}' keywords in the "
                "configuration file.\n    Be sure to use the formatting Gaussian expects.\n"
                .format(MEM, PROC_LIST))

        if default_gauss_required:
            max_disk = get_max_disk(testing_mode)
            max_cache = int(max_cache)
            print(
                "Since '{}' found in the {}, read machine specs to determine CacheSize={} and "
                "MaxDisk={}".format(DEF_ROUTE, JOB_RUN_TPL, max_cache,
                                    max_disk))
            default_route_list = [
                "-#- CacheSize={}".format(max_cache),
                "-#- MaxDisk={}".format(max_disk)
            ]
            fname = create_out_fname('Default.Route',
                                     base_dir=cfg[SCRATCH_DIR])
            list_to_file(default_route_list, fname)
            # the key triggers an action rather than supplying a value, so replace it with an empty string
            tpl_dict[DEF_ROUTE] = ''

    move_on = False
    while not move_on:
        try:
            fill_save_tpl(tpl_str, tpl_dict, tpl_file, job_runner_fname)
            move_on = True
        except KeyError as e:
            missing_key = e.args[0].split("\'")[1]
            if missing_key in cfg:
                tpl_dict[missing_key] = cfg[missing_key]
            else:
                raise e
    subprocess.call(["chmod", "+x", job_runner_fname])
    if testing_mode:
        print(
            "Testing mode; did not run job script or check Gaussian output for normal termination.\n"
        )
    else:
        # not exercised in tests: actually running Gaussian would take too long and is not the logic under test
        p1 = subprocess.Popen(job_runner_fname)
        p1.wait()
        out_file = tpl_dict[JOB_NAME] + ".log"
        last_line = subprocess.check_output(["tail", "-1",
                                             out_file]).strip().decode("utf-8")
        if GAU_GOOD_PAT.match(last_line):
            print("Successfully completed {}\n".format(out_file))
            os.remove(job_runner_fname)
        else:
            raise InvalidDataError('Job failed: {}'.format(out_file))
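
The while-loop above is a general backfill pattern: attempt the substitution
and, on a KeyError, pull the missing key from the config before retrying. A
minimal sketch using plain str.format (the original fill_save_tpl also writes
the rendered result to a file):

def fill_template(tpl_str, tpl_dict, cfg):
    while True:
        try:
            return tpl_str.format(**tpl_dict)
        except KeyError as e:
            missing_key = e.args[0]
            if missing_key in cfg:
                tpl_dict[missing_key] = cfg[missing_key]  # backfill and retry
            else:
                raise

print(fill_template("run {job} on {queue}", {"job": "opt"}, {"queue": "short"}))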
Example #28
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.
    `argv` is a list of arguments, or `None` for ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(
        description=
        'Creates Gaussian input files from pdb files, given a template input '
        'file. The required input file provides the name/location of the '
        'template file and a file with a list of pdb files to convert.')
    parser.add_argument(
        "-c",
        "--config",
        help=
        "Optional: the location of the configuration file. The default file "
        "name is '{}', located in the base directory where the program was run. "
        "If a config file is not provided, use the command-line options to "
        "specify the '{}' (-t) and '{}' (-l) or '{}' (-f). The command-line "
        "options for the '{}' flag (-r) and for using only the first '{}' "
        "set(s) of coordinates in the pdb (-n) may also be "
        "specified.".format(DEF_CFG_FILE, GAU_TPL_FILE, PDB_LIST_FILE,
                            PDB_FILE, REMOVE_H, NUM),
        default=DEF_CFG_FILE,
        type=read_cfg)
    parser.add_argument("-t",
                        "--tpl_file",
                        help="Specifies the '{}'".format(GAU_TPL_FILE),
                        default=None)
    parser.add_argument(
        "-l",
        "--pdb_list_file",
        help="Option to specify a file with a list of pdbs ('{}') to convert "
        "(one file per line on the list).".format(PDB_LIST_FILE),
        default=None)
    parser.add_argument(
        "-f",
        "--file",
        help="Option to specify a pdb file ('{}') to convert.".format(
            PDB_FILE),
        default=None)
    parser.add_argument(
        "-n",
        "--num",
        help=
        "Only read if a config file is not provided. This command can be used to "
        "specify only using the first '-n'/'--num' set(s) of coordinates in a pdb "
        "file to create gausscom file(s). The default is to use all coordinates, "
        "making as many input files as there are molecules/conformations in the "
        "pdb.",
        default=None,
        type=int)
    parser.add_argument(
        "-r",
        "--remove_final_h",
        help="Option to specify removing the last H atom from the PDB "
        "file(s) when creating the gausscom files. The default is "
        "False.",
        action='store_true')
    args = None
    try:
        args = parser.parse_args(argv)
        if args.config[GAU_TPL_FILE] is None:
            if args.tpl_file is None:
                raise InvalidDataError(
                    "Could not read config file: {}\n    and did not specify a 'tpl_file' "
                    "('-t' option). A tpl_file is needed to run this "
                    "script.".format(args.config[CONFIG_NAME]))
            else:
                args.config[GAU_TPL_FILE] = args.tpl_file
                if args.num:
                    args.config[NUM] = args.num
                if args.remove_final_h:
                    args.config[REMOVE_H] = True
                if args.file:
                    args.config[PDB_FILE] = args.file
                if args.pdb_list_file:
                    args.config[PDB_LIST_FILE] = args.pdb_list_file
    except (IOError, KeyError, InvalidDataError, MissingSectionHeaderError,
            SystemExit) as e:
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR

    return args, GOOD_RET
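
A simplified sketch of the fallback logic above, with stand-in keys for the
module's constants: the config file is authoritative, and command-line values
are only consulted when it did not supply a template file.

def merge_cli_into_config(config, tpl_file=None, num=None):
    if config.get('gau_tpl_file') is None:
        if tpl_file is None:
            raise ValueError("a template file is required, from the config or '-t'")
        config['gau_tpl_file'] = tpl_file
        if num is not None:  # other CLI options layer on only in this branch
            config['num'] = num
    return config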
Example #29
def main(argv=None):
    print(
        f"Running GaussianWrangler script gausslog_unique version {__version__}"
    )
    # Read input
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    # Read template and data files
    try:
        gausslog_files = []
        missing_files = []
        log_info = {}

        # check input
        if args.max_diff:
            args.max_diff = float(args.max_diff)
            if not args.energy and not args.gibbs:
                args.enthalpy = True

        # check that we have files
        with open(args.list) as f:
            for line in f:
                fname = line.strip()
                if len(fname) == 0:
                    continue
                # check that each log file can be found
                if os.path.isfile(fname):
                    gausslog_files.append(fname)
                else:
                    missing_files.append(fname)
            if len(missing_files) > 0:
                raise IOError(
                    "Could not find the following file(s) listed in '{}':\n    "
                    "{}".format(args.list,
                                '\n    '.join(sorted(set(missing_files)))))
            if len(gausslog_files) < 2:
                raise InvalidDataError(
                    "This program expects at least two files to compare to determine if they "
                    "have the same conformation. Check input.")

        # get the data from the files
        for gausslog_file in gausslog_files:
            gausslog_content = process_gausslog_file(gausslog_file,
                                                     find_dih=True,
                                                     find_converg=True)
            log_info[os.path.basename(gausslog_file)] = gausslog_content

        # process data from files
        list_of_conf_lists = compare_gausslog_info(log_info, args.tol)
        winner_str, warn_files_str = print_results(log_info,
                                                   list_of_conf_lists,
                                                   args.enthalpy, args.energy,
                                                   args.max_diff,
                                                   args.out_fname)
        if len(warn_files_str) > 0:
            warning("Check convergence of file(s):" + warn_files_str)

    except IOError as e:
        warning("Problems reading file:", e)
        return IO_ERROR
    except (InvalidDataError, UnicodeDecodeError) as e:
        warning("Problems reading data:", e)
        return INVALID_DATA
    except ValueError as e:
        warning(e.args[0])
        return INVALID_DATA
    return GOOD_RET  # success
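
The list-file validation above collects every missing file before raising, so a
single run reports all problems at once. A minimal, self-contained sketch of
that pattern (the function name is illustrative):

import os

def read_file_list(list_fname):
    found, missing = [], []
    with open(list_fname) as f:
        for line in f:
            fname = line.strip()
            if fname:  # skip blank lines
                (found if os.path.isfile(fname) else missing).append(fname)
    if missing:
        raise IOError("Could not find: {}".format(', '.join(sorted(set(missing)))))
    return found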
Example #30
def create_sbatch_dict(cfg,
                       tpl_dict,
                       new_ini_fname,
                       current_job_list,
                       start_from_job_name_chk=True,
                       ignore_chk_warning=False):
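    """
    Builds the dict used to fill the sbatch submit-script template, including the
    echo string for an old checkpoint file (if any) and optional email directives.
    """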
    sbatch_dict = {
        PARTITION: cfg[PARTITION],
        RUN_TIME: cfg[RUN_TIME],
        ACCOUNT: cfg[ACCOUNT],
        JOB_NAME: tpl_dict[JOB_NAME],
        RUN_GAUSS_INI: new_ini_fname,
        QOS: cfg[QOS],
        JOB_DESCRIP: tpl_dict[JOB_DESCRIP],
    }

    if cfg[FIRST_JOB_CHK]:
        if not os.path.isfile(cfg[FIRST_JOB_CHK] + CHK_EXT):
            raise InvalidInputError("Could not find specified '{}': {}".format(
                FIRST_JOB_CHK, cfg[FIRST_JOB_CHK] + CHK_EXT))
        sbatch_dict[OLD_CHECK_ECHO] = '-o ' + cfg[FIRST_JOB_CHK]
    elif start_from_job_name_chk:
        fname_to_check = tpl_dict[JOB_NAME] + CHK_EXT
        if not os.path.isfile(fname_to_check):
            raise InvalidDataError(
                "Could not find required checkpoint file: {}".format(
                    fname_to_check))
        sbatch_dict[OLD_CHECK_ECHO] = '-o ' + tpl_dict[JOB_NAME]
    else:
        sbatch_dict[OLD_CHECK_ECHO] = ''
        if current_job_list[0] == '' and cfg[CHECK_FOR_CHK]:
            # when there is no old checkpoint file, make sure the first input file does not try to read from a chk file
            # IOError is already caught elsewhere, so no try block is needed here
            with open(tpl_dict[INPUT_FILE]) as f:
                try:
                    read_route = False
                    for line in f:
                        line = line.strip()
                        # the route can span multiple lines, so first find its start, then continue until a blank line is reached
                        if GAU_HEADER_PAT.match(line):
                            read_route = True
                            while line != '':
                                if GUESS_READ_OR_GEOM_CHK_PAT.match(
                                        line) and not ignore_chk_warning:
                                    raise InvalidDataError(
                                        "Did not find an old checkpoint file to read, but the "
                                        "Gaussian input header indicates that Gaussian will attempt "
                                        "and fail to read from a checkpoint:\n   file:  {}\n"
                                        "  route:  {} ".format(
                                            tpl_dict[INPUT_FILE], line))
                                line = next(f).strip()
                    if not read_route:
                        raise StopIteration
                except StopIteration:
                    raise InvalidDataError(
                        'The specified input file does not appear valid: {}'
                        ''.format(tpl_dict[INPUT_FILE]))

    if cfg[EMAIL]:
        sbatch_dict[EMAIL] = '#SBATCH --mail-type=FAIL\n#SBATCH --mail-type=END\n' \
                             '#SBATCH --mail-user={}'.format(cfg[EMAIL])
    else:
        sbatch_dict[EMAIL] = ''

    return sbatch_dict
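
The route scan above depends on two patterns defined elsewhere in the module. A
self-contained sketch of the same check, where the regexes are assumptions
standing in for GAU_HEADER_PAT and GUESS_READ_OR_GEOM_CHK_PAT:

import re

ROUTE_START = re.compile(r'^#')  # assumed: a Gaussian route section starts with '#'
CHK_READ = re.compile(r'guess\s*=\s*read|geom\s*=\s*check', re.IGNORECASE)

def route_reads_chk(input_fname):
    in_route = False
    with open(input_fname) as f:
        for line in f:
            line = line.strip()
            if ROUTE_START.match(line):
                in_route = True
            if in_route:
                if not line:
                    break  # a blank line ends the route section
                if CHK_READ.search(line):
                    return True  # Gaussian would attempt to read a checkpoint
    return False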