Example #1
def main(argv=None):
    """ Runs the main program.

    :param argv: The command line arguments.
    :return: The return code for the program's termination.
    """
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    kbt = calc_kbt(args.temp)

    if args.src_file is not None:
        proc_data = to_zero_point(calc_rad(args.src_file, kbt))
        write_csv(proc_data, create_out_fname(args.src_file, prefix=OUT_PFX),
                  RAD_KEY_SEQ)
    else:
        found_files = find_files_by_dir(args.base_dir, args.pattern)
        logger.debug("Found '{}' dirs with files to process".format(
            len(found_files)))
        # noinspection PyCompatibility
        for f_dir, files in found_files.items():
            if not files:
                logger.warn("No files found for dir '{}'".format(f_dir))
                continue
            for pmf_path in ([os.path.join(f_dir, tgt) for tgt in files]):
                proc_data = to_zero_point(calc_rad(pmf_path, kbt))
                f_name = create_out_fname(pmf_path, prefix=OUT_PFX)
                if allow_write(f_name, overwrite=args.overwrite):
                    write_csv(proc_data, f_name, RAD_KEY_SEQ)
    return GOOD_RET  # success
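
Every example on this page funnels its output path through create_out_fname. The library's actual implementation is not shown here, but the calls above (keyword arguments prefix, suffix, ext, and base_dir) suggest the behavior they rely on; the helper below is a minimal sketch under that assumption, not the real function.

import os


def create_out_fname_sketch(src_file, prefix='', suffix='', ext=None, base_dir=None):
    """Illustrative stand-in: derive an output name from a source path.

    Assumed behavior: keep the source directory unless base_dir is given,
    keep the extension unless ext is given, and wrap the base name with
    the optional prefix and suffix.
    """
    dir_name, base_name = os.path.split(src_file)
    root, old_ext = os.path.splitext(base_name)
    return os.path.join(base_dir if base_dir is not None else dir_name,
                        prefix + root + suffix + (ext if ext is not None else old_ext))


print(create_out_fname_sketch('/data/pmf.txt', prefix='rad_'))  # /data/rad_pmf.txt

This matches the expectation in the testOutFname example further down, where the result must end with os.sep + OUT_PFX + the original file name.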
Example #2
def print_content(atom_id_dict, cfg, content, data_file, highlight_content, section_order, type_dict):
    data_content = content[SEC_HEAD]
    select_data_content = []
    for section in section_order:
        # empty list will become an empty line
        data_content += [''] + [section, '']
        select_data_content += [section]
        sec_format = SEC_FORMAT_DICT[section][0]
        comment_col = SEC_FORMAT_DICT[section][1]
        for line in content[section]:
            data_content.append(sec_format.format(*line[:comment_col]) + " ".join(line[comment_col:]))
        for line in highlight_content[section]:
            select_data_content.append(sec_format.format(*line[:comment_col]) + " ".join(line[comment_col:]))

    # Only print a "new" data file if something is changed
    dict_lens = len(atom_id_dict)
    for name, t_dict in type_dict.items():
        dict_lens += len(t_dict)
    if dict_lens > 0 or cfg[SORT_ME]:
        f_name = create_out_fname(data_file, suffix='_new', ext='.data')
        list_to_file(data_content, f_name)
        print('Completed writing {}'.format(f_name))
    if (len(cfg[PRINT_DATA_ATOMS]) + len(cfg[PRINT_OWN_ATOMS])) > 0:
        f_name = create_out_fname(data_file, suffix='_selected', ext='.txt')
        list_to_file(select_data_content, f_name)
        print('Completed writing {}'.format(f_name))
Example #3
def main(argv=None):
    """ Runs the main program.

    :param argv: The command line arguments.
    :return: The return code for the program's termination.
    """
    args, ret = parse_cmdline(argv)
    if ret != 0:
        return ret

    if args.src_file is not None:
        proc_data = calc_for_wham(args.src_file)
        write_csv(proc_data, create_out_fname(args.src_file, prefix=OUT_PFX),
                  COLVAR_WHAM_KEY_SEQ)
    else:
        found_files = find_files_by_dir(args.base_dir, args.pattern)
        logger.debug("Found '%d' dirs with files to process", len(found_files))
        # noinspection PyCompatibility
        for f_dir, files in found_files.items():
            if not files:
                logger.warning("No files found for dir '%s'", f_dir)
                continue
            for colvar_path in [os.path.join(f_dir, tgt) for tgt in files]:
                proc_data = calc_for_wham(colvar_path)
                f_name = create_out_fname(colvar_path, prefix=OUT_PFX)
                if allow_write(f_name, overwrite=args.overwrite):
                    list_to_file([str(d['r']) for d in proc_data if 'r' in d],
                                 f_name)
                    # write_csv(proc_data, f_name, COLVAR_WHAM_KEY_SEQ, extrasaction="ignore")
    return 0  # success
Example #7
def adjust_atom_xyz(cfg, data_tpl_content):
    """
    If this option is selected, adjusts the xyz coordinates as specified
    @param cfg: configuration for the run
    @param data_tpl_content: processed data from the template
    @return: will print new data files or raise InvalidDataError
    """
    if cfg[ADJUST_ATOM] > data_tpl_content[NUM_ATOMS]:
        raise InvalidDataError(
            "Keyword '{}' specified atom index {} to have its XYZ coordinates adjusted, "
            "but found only "
            "{} atoms in the data template file: {}".format(
                ADJUST_ATOM, cfg[ADJUST_ATOM], data_tpl_content[NUM_ATOMS],
                cfg[DATA_TPL_FILE]))
    diff_vector = np.asarray((np.subtract(cfg[XYZ2], cfg[XYZ1])))
    inc_vector = np.divide(diff_vector, cfg[XYZ_STEPS])
    head_content = data_tpl_content[HEAD_CONTENT]
    atoms_content = data_tpl_content[ATOMS_CONTENT]
    tail_content = data_tpl_content[TAIL_CONTENT]
    # since python is zero-based, must subtract 1
    adjust_atom_num = cfg[ADJUST_ATOM] - 1
    for multiplier in range(-cfg[XYZ_STEPS_EXTEND],
                            cfg[XYZ_STEPS] + cfg[XYZ_STEPS_EXTEND]):
        f_name = create_out_fname(cfg[DATA_TPL_FILE],
                                  suffix='_' + str(multiplier),
                                  ext='.data')
        atoms_content[adjust_atom_num][4:7] = np.round(
            multiplier * inc_vector + cfg[XYZ1], 6)
        list_to_file(head_content + atoms_content + tail_content, f_name)
Example #8
def fill_save_tpl(cfg,
                  tpl_str,
                  tpl_vals_dict,
                  tpl_name,
                  filled_tpl_name,
                  print_info=True):
    """
    Use the dictionary to make the file name and the filled template, then save the file.
    @param cfg: configuration for run
    @param tpl_str: the string to be filled to make the filled tpl file
    @param tpl_vals_dict: dictionary of tpl keys and vals
    @param tpl_name: the cfg key for the template file name
    @param filled_tpl_name: the cfg key for the filled template file name
    @param print_info: print to standard out when a file is printed
    """
    try:
        filled_tpl_str = tpl_str.format(**tpl_vals_dict)
    except KeyError as e:
        raise KeyError(
            "Key '{}' not found in the configuration but required for template file: {}"
            "".format(e.args[0], tpl_name))

    try:
        filled_fname_str = filled_tpl_name.format(**tpl_vals_dict)
    except KeyError as e:
        raise KeyError(
            "Key '{}' not found in the configuration but required for filled template file name: {}"
            "".format(e.args[0], filled_tpl_name))

    tpl_vals_dict[NEW_FNAME] = create_out_fname(filled_fname_str,
                                                base_dir=cfg[OUT_DIR])
    str_to_file(filled_tpl_str,
                tpl_vals_dict[NEW_FNAME],
                print_info=print_info)
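
At its core, fill_save_tpl is str.format with a value dictionary, wrapped so that a missing key yields a readable error instead of a bare KeyError. A self-contained sketch of that pattern (the template and keys here are invented for illustration):

tpl_str = "run {job_name} at {temp} K"
tpl_vals_dict = {"job_name": "test1", "temp": 310}

try:
    filled_tpl_str = tpl_str.format(**tpl_vals_dict)
except KeyError as e:
    raise KeyError("Key {} not found in the configuration but required "
                   "for the template".format(e))

print(filled_tpl_str)  # run test1 at 310 K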
Example #9
def process_cp2k_file(cp2k_file, data_tpl_content, data_template_fname):
    new_atoms_section = None
    qmmm_energy = None
    with open(cp2k_file) as f:
        data_tpl_content[HEAD_CONTENT][0] = "Created on {} by {} version {} from template file {} and " \
                                            "cp2k output file {}".format(datetime.now(), __name__, __version__,
                                                                         data_template_fname, cp2k_file
                                                                         )
        for line in f:
            line = line.strip()
            if ENERGY_PAT.match(line):
                qmmm_energy = line.split()[-1]
            if COORD_PAT.match(line):
                # Now advance to first line of coordinates
                for _ in range(3):
                    next(f)
                new_atoms_section = process_coords(f, data_tpl_content)

    # If we successfully returned the new_atoms_section, make new file
    if new_atoms_section is None:
        raise InvalidDataError(
            "Did not find atom coordinates in file: {}".format(cp2k_file))
    print("{} energy: {}".format(cp2k_file, qmmm_energy))
    f_name = create_out_fname(cp2k_file, ext='.data')
    list_to_file(data_tpl_content[HEAD_CONTENT] + new_atoms_section +
                 data_tpl_content[TAIL_CONTENT],
                 f_name,
                 print_message=False)
Example #10
def process_file(f_list, new_f_name):

    value_dict = {}

    with open(f_list) as f:
        for f_name in f.readlines():
            f_name = f_name.strip()
            with open(f_name) as d:
                for line in d.readlines():
                    line = line.strip()
                    split_line = line.split()
                    entries = len(split_line)
                    # For this purpose, subtract 1 (hydronium) and use integer division by 3
                    water_mol_number = (entries - 1) // 3
                    if water_mol_number in value_dict:
                        value_dict[water_mol_number] += 1
                    else:
                        value_dict[water_mol_number] = 1

    if new_f_name is None:
        new_f_name = create_out_fname(f_list, suffix='_count')

    with open(new_f_name, 'w') as w_file:
        for key in value_dict:
            w_file.write(str(key) + "," + str(value_dict[key]) + "\n")
            print(key, value_dict[key])
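
The counting step assumes each line lists one hydronium ion followed by three columns per water molecule, so a 10-column line maps to (10 - 1) // 3 = 3 waters. The column layout below is invented, but it checks that arithmetic:

line = "OH3 HW1 HW2 OW1 HW3 HW4 OW2 HW5 HW6 OW3"
entries = len(line.split())            # 10 columns
water_mol_number = (entries - 1) // 3  # drop the hydronium, 3 columns per water
assert water_mol_number == 3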
Example #11
def copy_par_result_file(cfg, tpl_vals_dict, print_info=False):
    """
    To keep a copy of a par file, make the new file name and copy the previously created par file
    @param cfg: configuration for run
    @param tpl_vals_dict: dictionary to fill strings
    @param print_info: boolean to determine if to print to standard out that a copy was made
    @raise KeyError: if a required variable is not defined
    """
    if cfg[TRIAL_NAME] is not None:
        try:
            tpl_vals_dict[TRIAL_NAME] = cfg[TRIAL_NAME].format(**tpl_vals_dict)
        except KeyError as e:
            raise KeyError(
                "Missing key name {} required for '{}': '{}'. Program will terminate."
                "".format(e, TRIAL_NAME, cfg[TRIAL_NAME]))

    for copy_name in [PAR_COPY_NAME, RESULT_COPY]:
        if cfg[copy_name] is not None:
            try:
                base_name = cfg[copy_name].format(**tpl_vals_dict)
            except KeyError as e:
                raise KeyError(
                    "Missing key name {} required for '{}': '{}'. File will not be copied."
                    "".format(e, copy_name, cfg[copy_name]))
            new_fname = create_out_fname(base_name, base_dir=cfg[COPY_DIR])
            if copy_name == PAR_COPY_NAME:
                shutil.copyfile(tpl_vals_dict[NEW_FNAME], new_fname)
            else:
                # if os.path.isfile(tpl_vals_dict[RESULT_FILE]):
                shutil.copyfile(cfg[RESULT_FILE], new_fname)

            if print_info:
                print(" Copied to: {}".format(new_fname))
Example #13
def process_file(data_file, mcfg, delimiter=','):
    list_vectors, headers = read_csv_to_list(data_file,
                                             delimiter=delimiter,
                                             header=True)

    col_index_dict = {}
    for section in SUB_SECTIONS:
        col_index_dict[section] = {}
        for key, val in mcfg[section].items():
            if key in headers:
                # The parser already ensured that entries are unique
                col_index_dict[section][headers.index(key)] = val
            else:
                raise InvalidDataError(
                    "Key '{}' found in configuration file but not in data file: "
                    "{}".format(key, data_file))

    edited_vectors = []
    for row in list_vectors:
        for col, max_val in col_index_dict[MAX_SEC].items():
            if row[col] > max_val:
                row[col] = max_val
        for col, min_val in col_index_dict[MIN_SEC].items():
            if row[col] < min_val:
                row[col] = min_val
        edited_vectors.append(row)

    f_name = create_out_fname(data_file, ext='.csv')
    list_to_csv([headers] + edited_vectors, f_name, delimiter=',')
Example #14
    def testOutFname(self):
        """
        Check for prefix addition.
        """
        self.assertTrue(
            create_out_fname(ORIG_WHAM_PATH,
                             prefix=OUT_PFX).endswith(os.sep + OUT_PFX +
                                                      ORIG_WHAM_FNAME))
Example #15
def process_pdb_tpl(cfg):
    tpl_loc = cfg[PDB_TPL_FILE]
    tpl_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}

    atom_id = 0

    with open(tpl_loc) as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]]
            # head_content to contain Everything before 'Atoms' section
            # also capture the number of atoms
            # match 5 letters so don't need to set up regex for the ones that have numbers following the letters
            # noinspection SpellCheckingInspection
            if line_head[:-1] in ['HEADE', 'TITLE', 'REMAR', 'CRYST', 'MODEL', 'COMPN',
                                  'NUMMD', 'ORIGX', 'SCALE', 'SOURC', 'AUTHO', 'CAVEA',
                                  'EXPDT', 'MDLTY', 'KEYWD', 'OBSLT', 'SPLIT', 'SPRSD',
                                  'REVDA', 'JRNL ', 'DBREF', 'SEQRE', 'HET  ', 'HETNA',
                                  'HETSY', 'FORMU', 'HELIX', 'SHEET', 'SSBON', 'LINK ',
                                  'CISPE', 'SITE ', ]:
                tpl_data[HEAD_CONTENT].append(line)

            # atoms_content to contain everything but the xyz
            elif line_head == 'ATOM  ':

                # By renumbering, handles the case when a PDB template has ***** after atom_id 99999.
                # For renumbering, making sure prints in the correct format, including num of characters:
                atom_id += 1
                if atom_id > 99999:
                    atom_num = format(atom_id, 'x')
                else:
                    atom_num = '{:5d}'.format(atom_id)
                # Alternately, use this:
                # atom_num = line[cfg[PDB_LINE_TYPE_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]]

                atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]]
                res_type = line[cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]]
                # There is already a try when calling the subroutine, so maybe I don't need to?
                mol_num = int(line[cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                last_cols = line[cfg[PDB_Z_LAST_CHAR]:]

                line_struct = [line_head, atom_num, atom_type, res_type, mol_num, pdb_x, pdb_y, pdb_z, last_cols]
                tpl_data[ATOMS_CONTENT].append(line_struct)

            # tail_content to contain everything after the 'Atoms' section
            else:
                tpl_data[TAIL_CONTENT].append(line)

    if logger.isEnabledFor(logging.DEBUG):
        f_name = create_out_fname('reproduced_tpl', ext='.pdb', base_dir=cfg[OUT_BASE_DIR])
        list_to_file(tpl_data[HEAD_CONTENT] + tpl_data[ATOMS_CONTENT] + tpl_data[TAIL_CONTENT],
                     f_name, list_format=cfg[PDB_FORMAT])
    return tpl_data
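
process_pdb_tpl slices each line at column boundaries taken from the run configuration. With the standard PDB fixed columns (an assumption; the cfg values may differ), the xyz fields of an ATOM record come out like this:

atom_line = ("ATOM      1  N   ALA A   1      11.104   6.134  -6.504"
             "  1.00  0.00           N")
# Standard PDB column slices (0-based); cfg[PDB_MOL_NUM_LAST_CHAR] etc. play this role above.
pdb_x = float(atom_line[30:38])
pdb_y = float(atom_line[38:46])
pdb_z = float(atom_line[46:54])
print(pdb_x, pdb_y, pdb_z)  # 11.104 6.134 -6.504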
Example #16
def comp_files(cfg, atom_id_dict, type_dicts):
    """
    Compares each section of data files
    @param cfg: configuration information for current run
    @param atom_id_dict: dictionary for changing the atom id
    @param type_dicts: dictionary for changing atom and interaction types
    @return:
    """
    first_content, first_section_order = proc_data_file(
        cfg,
        cfg[DATA_FILE],
        atom_id_dict,
        type_dicts,
    )
    second_content, second_section_order = proc_data_file(
        cfg,
        cfg[DATA_COMP],
        atom_id_dict,
        type_dicts,
    )

    for section in second_section_order:
        if section not in first_section_order:
            warning("Skipping section '{}'; section found in the file: {}\n"
                    "   but not in file: {}".format(section, cfg[DATA_COMP],
                                                    cfg[DATA_FILE]))

    diffs = ["Differences in head section:"]
    compare_heads(first_content[SEC_HEAD], second_content[SEC_HEAD], diffs)

    for section in first_section_order:
        if section not in second_section_order:
            warning("Skipping section '{}'; section found in the file: {}\n"
                    "   but not in file: {}".format(section, cfg[DATA_FILE],
                                                    cfg[DATA_COMP]))
        elif section in [SEC_VELOS]:
            diffs.append("\nSkipping section '{}'".format(section))
        elif section in COMP_ORD_SEC_COL_DICT:
            diffs.append("\nDifferences in section '{}':".format(section))
            num_col_to_compare = COMP_ORD_SEC_COL_DICT[section]
            compare_lists(first_content[section], second_content[section], 0,
                          num_col_to_compare, diffs,
                          SEC_FORMAT_DICT[section][0],
                          SEC_FORMAT_DICT[section][1])
        elif section in NUM_SEC_DICT:
            diffs.append("\nDifferences in section '{}':".format(section))
            num_col_to_compare = NUM_SEC_DICT[section][1]
            compare_lists(first_content[section], second_content[section], 1,
                          num_col_to_compare, diffs,
                          SEC_FORMAT_DICT[section][0],
                          SEC_FORMAT_DICT[section][1])
        else:
            print("Encountered unexpected section '{}'".format(section))

    f_name = create_out_fname(cfg[DATA_COMP], prefix='diffs_', ext='.txt')
    list_to_file(diffs, f_name)
    print('Completed writing {}'.format(f_name))
Example #17
def process_file(f_name, b_str, e_str, new_f_name):

    if new_f_name is None:
        new_f_name = create_out_fname(f_name, suffix='_amend')

    # open the old file first; then, if there is a problem with it, no new file will be created
    with open(f_name) as f:
        with open(new_f_name, 'w') as w_file:
            for line in f:
                line = line.strip()
                w_file.write(b_str + line + e_str + "\n")
    print("Wrote file: {}".format(new_f_name))
Example #19
def write_result(result, src_file, overwrite=False, basedir=None):
    """Writes the result to a file named for the given source file.

    :param result: The result to write.
    :param src_file: The original source file name.
    :param overwrite: Whether to overwrite an existing file name.
    :param basedir: The base directory to target (uses the source file's base directory
        if not specified)
    """
    f_name = create_out_fname(src_file, prefix=OUT_PFX, base_dir=basedir)
    if allow_write(f_name, overwrite=overwrite):
        write_csv(result, f_name, OUT_KEY_SEQ)
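
allow_write is not defined on this page, but the way write_result uses it implies it returns False when the target exists and overwrite was not requested. A hedged sketch of that guard (the real helper may also log a warning):

import os


def allow_write_sketch(f_name, overwrite=False):
    """Illustrative guard: permit the write if the file is absent or overwrite is set."""
    if os.path.exists(f_name) and not overwrite:
        print("Not overwriting existing file: {}".format(f_name))
        return False
    return True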
Example #20
def print_per_frame(dump_file, cfg, data_to_print, out_fieldnames, write_mode):
    f_out = create_out_fname(dump_file,
                             suffix='_sum',
                             ext='.csv',
                             base_dir=cfg[OUT_BASE_DIR])
    write_csv(data_to_print,
              f_out,
              out_fieldnames,
              extrasaction="ignore",
              mode=write_mode,
              round_digits=ROUND_DIGITS,
              print_message=cfg[PRINT_PROGRESS])
Example #21
def process_cv_file(cv_file, time_col, cv_col, row_index, time_conv):
    data_to_print = []
    with open(cv_file) as f:
        for line in f:
            if row_index == 0:
                row_index = 1
            else:
                data = [x.strip() for x in line.split()]
                try:
                    timestep = int(float(data[time_col]) * time_conv)
                    cv = float(data[cv_col])
                    data_to_print.append([timestep, cv])
                except ValueError as e:
                    warning("Excepted a number for the time_column ({}) and cv column({}). Found {} and {}."
                            "".format(time_col, cv_col, data[time_col], data[cv_col]), e)
                    return INVALID_DATA
    d_out = create_out_fname(cv_file, suffix='_converted', ext='.txt')
    list_to_file(data_to_print, d_out)
    print('Wrote file: {}'.format(d_out))

    d_out = create_out_fname(cv_file, suffix='_converted', ext='.csv')
    list_to_file(data_to_print, d_out, delimiter=',')
    print('Wrote file: {}'.format(d_out))
Example #22
def print_gofr(cfg, gofr_data):
    g_dr = cfg[GOFR_DR]
    dr_array = gofr_data[GOFR_BINS][1:] - g_dr / 2
    gofr_out_fieldnames = [GOFR_R]
    gofr_output = dr_array
    if cfg[CALC_HO_GOFR]:
        normal_fac = np.square(
            dr_array) * gofr_data[HO_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_ho = np.divide(gofr_data[HO_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_HO)
        gofr_output = np.column_stack((gofr_output, gofr_ho))
    if cfg[CALC_OO_GOFR]:
        normal_fac = np.square(
            dr_array) * gofr_data[OO_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_oo = np.divide(gofr_data[OO_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_OO)
        gofr_output = np.column_stack((gofr_output, gofr_oo))
    if cfg[CALC_HH_GOFR]:
        normal_fac = np.square(
            dr_array) * gofr_data[HH_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_hh = np.divide(gofr_data[HH_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_HH)
        gofr_output = np.column_stack((gofr_output, gofr_hh))
    if cfg[CALC_OH_GOFR]:
        normal_fac = np.square(
            dr_array) * gofr_data[OH_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_oh = np.divide(gofr_data[OH_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_OH)
        gofr_output = np.column_stack((gofr_output, gofr_oh))
    if cfg[CALC_TYPE_GOFR]:
        if gofr_data[TYPE_STEPS_COUNTED] > 0:
            normal_fac = np.square(
                dr_array) * gofr_data[TYPE_STEPS_COUNTED] * 4 * np.pi * g_dr
            gofr_type = np.divide(gofr_data[TYPE_BIN_COUNT], normal_fac)
            gofr_out_fieldnames.append(GOFR_TYPE)
            gofr_output = np.column_stack((gofr_output, gofr_type))
        else:
            warning("Did not find any timesteps with the pairs in {}. "
                    "This output will not be printed.".format(CALC_TYPE_GOFR))

    f_out = create_out_fname(cfg[DUMP_FILE_LIST],
                             suffix='_gofrs',
                             ext='.csv',
                             base_dir=cfg[OUT_BASE_DIR])
    # am not using the dict writer because the gofr output is a np.array
    list_to_csv([gofr_out_fieldnames] + gofr_output.tolist(),
                f_out,
                print_message=cfg[PRINT_PROGRESS],
                round_digits=ROUND_DIGITS)
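
Each branch of print_gofr applies the same shell normalization: raw pair counts are divided by 4 * pi * r^2 * dr times the number of timesteps counted, the volume-and-sampling factor for a radial distribution function. The step in isolation, with invented bin counts:

import numpy as np

g_dr = 0.1                                        # bin width
dr_array = np.arange(g_dr, 1.0, g_dr) - g_dr / 2  # bin centers
bin_count = np.arange(1.0, 10.0)                  # invented raw pair counts
steps_counted = 50
normal_fac = np.square(dr_array) * steps_counted * 4 * np.pi * g_dr
gofr = np.divide(bin_count, normal_fac)
print(gofr.round(3))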
Example #23
def proc_file(file_name):
    with open(file_name) as d:
        nodups_lines = ['']
        for line in d:
            line = line.strip()
            if len(line) == 0:
                continue
            elif line == nodups_lines[-1]:
                continue
            else:
                nodups_lines.append(line)
    print('Completed reading {}.\n'.format(file_name))
    f_out_name = create_out_fname(file_name, suffix='_nodups')
    list_to_file(nodups_lines[1:], f_out_name)
    print('Wrote {}.\n'.format(f_out_name))
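
proc_file drops consecutive duplicate lines by comparing each stripped line against the last one kept. itertools.groupby gives the same result more compactly; an alternative sketch:

from itertools import groupby

lines = ["a", "a", "b", "b", "b", "a", ""]
nodups_lines = [key for key, _ in groupby(line.strip() for line in lines if line.strip())]
print(nodups_lines)  # ['a', 'b', 'a']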
Example #25
def main(argv=None):
    """
    Runs the main program.

    :param argv: The command line arguments.
    :return: The return code for the program's termination.
    """
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET:
        return ret

    deduped = compress_dups(read_csv(args.file, all_conv=float), args.column)
    write_csv(deduped, create_out_fname(args.file, prefix=PREFIX),
              read_csv_header(args.file))

    return GOOD_RET  # success
Example #27
    def testWriteCsv(self):
        tmp_dir = None
        data = csv_data()
        try:
            tmp_dir = tempfile.mkdtemp()
            tgt_fname = create_out_fname(SHORT_WHAM_PATH, prefix=OUT_PFX, base_dir=tmp_dir)

            write_csv(data, tgt_fname, RAD_KEY_SEQ)
            csv_result = read_csv(tgt_fname,
                                  data_conv={FREE_KEY: str_to_bool,
                                             CORR_KEY: float,
                                             COORD_KEY: str, })
            self.assertEqual(len(data), len(csv_result))
            for i, csv_row in enumerate(csv_result):
                self.assertDictEqual(data[i], csv_row)
        finally:
            shutil.rmtree(tmp_dir)
Example #28
def read_file_list(file_list, out_dir):
    """
    @param file_list: the list of files to be read
    @param out_dir: user-specified output directory
    """
    summary_header = ['num_atoms', 'sum_x', 'sum_y', 'sum_z', 'total']
    summary_array = None

    with open(file_list) as f:
        for f_file in f:
            f_file = f_file.strip()
            if len(f_file) == 0:
                continue
            elif os.path.isfile(f_file):
                summary = process_cp2k_force_file(f_file, out_dir)
                if summary is not None:
                    if summary_array is None:
                        summary_array = summary
                    else:
                        summary_array = np.vstack((summary, summary_array))
            else:
                warning('Could not read file {} in file list {}. '
                        'Continuing to the next line in file list.'.format(f_file, file_list))
    # print(np.amax(summary_array, axis=1))
    if summary_array is None:
        warning("No valid cp2k force output files were read.")
    elif summary_array.size == 5:
        print('For the one CP2K force file read:')
        print(' ' + '      '.join(summary_header))
        print(' '.join(['%10.0f' % summary_array[0]] + ['%10.3f' % F for F in summary_array[1:]]))
    else:
        f_out = create_out_fname(file_list, prefix='force_sums_', base_dir=out_dir, ext='.csv')
        # write directly so the header row lands before the data rows
        with open(f_out, 'w') as logfile:
            logfile.write(','.join(summary_header) + "\n")
            # noinspection PyTypeChecker
            for line in summary_array:
                logfile.write(','.join(['%d' % line[0]] + ['%f' % F for F in line[1:]]) + "\n")
        print('Finished reading all cp2k force files. Printed each atomic force sum to: {}'.format(f_out))

        min_vals = np.amin(summary_array, axis=0)
        max_vals = np.amax(summary_array, axis=0)

        print('           ' + '      '.join(summary_header))
        print('min_vals: ' + ' '.join(['%10.0f' % min_vals[0]] + ['%10.3f' % F for F in min_vals[1:]]))
        print('max_vals: ' + ' '.join(['%10.0f' % max_vals[0]] + ['%10.3f' % F for F in max_vals[1:]]))
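
read_file_list accumulates per-file summary rows by seeding summary_array with the first result and stacking later ones with np.vstack, which is why a run over a single file leaves a flat 5-element array (the size == 5 branch above). The accumulation pattern on its own, with invented rows:

import numpy as np

summary_array = None
for summary in (np.array([3, 0.1, 0.2, 0.3, 0.6]),
                np.array([4, 0.0, 0.1, 0.2, 0.3])):
    if summary_array is None:
        summary_array = summary
    else:
        summary_array = np.vstack((summary, summary_array))

print(summary_array.shape)  # (2, 5) once more than one file is read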
Example #29
def main(argv=None):
    """
    Runs the main program.

    :param argv: The command line arguments.
    :return: The return code for the program's termination.
    """
    args, ret = parse_cmdline(argv)
    if ret != 0:
        return ret

    pairs = parse_pairs(args.pair_files)
    dists = atom_distances(args.file, pairs)
    write_results(create_out_fname(args.file, prefix='pairs_', ext='.csv'),
                  dists, pairs)

    return 0  # success
Example #31
def create_hist_plot(hist_dict, header, out_dir, data_file):
    """
    See https://stanford.edu/~mwaskom/software/seaborn/examples/horizontal_barplot.html
    @param hist_dict: dict of label, count
    @param header: name of dictionary
    @param out_dir: str, name of directory where files are to be saved
    @param data_file: name of data file
    @return: a list of lists (label, count)
    """
    # remove spaces in name
    header = "".join(header.split())

    # convert dict to list for creating bar chart
    bar_data = [[key, val] for key, val in hist_dict.items()]
    bar_data.sort(key=itemgetter(0))
    bar_data.sort(key=itemgetter(1), reverse=True)

    # bar chart background style
    sns.set(style="whitegrid", font='Arial')
    # color options include pastel
    sns.set_color_codes("deep")
    # Initialize the matplotlib figure
    f, ax = plt.subplots(figsize=(6, 6))
    # Create pandas dataframe
    new_df = pd.DataFrame(bar_data, columns=["key", "count"])
    # Plot
    sns.barplot(x="count", y="key", data=new_df,
                label="Total", color="b")
    # other options: xlim=(0, 24)
    ax.set(xlabel="Count", ylabel="")
    ax.set_title(header)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        plt.tight_layout()

    f_name = create_out_fname(data_file, suffix=header, base_dir=out_dir, ext=".png")
    plt.savefig(f_name, dpi=300)
    print("Wrote file: {}".format(f_name))

    # quote strings for printing so csv properly read, and add header
    count_to_print = [[header + "_key", header + "_count"]]
    for row in bar_data:
        count_to_print.append([row[0], row[1]])

    return count_to_print
Example #33
def main(argv=None):
    """
    Runs the main program.

    :param argv: The command line arguments.
    :return: The return code for the program's termination.
    """
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    try:
        if args.list_file is None:
            file_list = []
            base_file_name = args.file
        else:
            file_list = file_rows_to_list(args.list_file)
            base_file_name = args.list_file
        if args.file is not None:
            file_list.append(args.file)

        dists = OrderedDict()
        pairs = parse_pairs(args.pair_files)
        write_mode = 'w'
        for l_file in file_list:
            dists.update(atom_distances(l_file, pairs))
            if len(dists) > 0:
                write_results(create_out_fname(base_file_name,
                                               prefix='pairs_',
                                               ext='.csv'),
                              dists,
                              pairs,
                              write_mode=write_mode)
                write_mode = 'a'
    except IOError as e:
        warning("Problems reading file: {}".format(e))
        return IO_ERROR
    except InvalidDataError as e:
        warning("Invalid Data Error: {}".format(e))
        return INVALID_DATA

    return GOOD_RET  # success
Example #35
def process_log_files(source_name, log_file_list):
    """
    Loops through all files and prints output
    @param source_name: the source name to use as the base for creating an outfile name
    @param log_file_list: list of file names to read and process
    """

    result_list = []
    out_fname = create_out_fname(source_name, suffix='_sum', ext=".csv")

    for log_file in log_file_list:
        result_list += process_log(log_file)

    if len(result_list) == 0:
        warning(
            "Found no lammps log data to process from: {}".format(source_name))
    else:
        write_csv(result_list,
                  out_fname,
                  LOG_FIELDNAMES,
                  extrasaction="ignore")
Example #36
def plot_corr(f_name):
    """
    Given a csv, plot it as a heat map
    @param f_name: name of the csv file with the correlation data
    @return:
    """
    corr_data = pd.read_csv(f_name, index_col=0)
    i_name = create_out_fname(f_name, ext='.png')

    # Generate a mask for the upper triangle
    plot_mask = np.zeros_like(corr_data, dtype=bool)  # np.bool is deprecated; use bool
    plot_mask[np.triu_indices_from(plot_mask)] = True

    # Set up the matplotlib figure
    sns.set(style="white")
    # f, ax = plt.subplots(figsize=(11, 9))
    plt.subplots(figsize=(11, 9))
    # Draw the heatmap with the plot_mask and correct aspect ratio

    sns.heatmap(
        corr_data,
        mask=plot_mask,
        vmin=0.0,
        vmax=100.0,
        square=True,
        # xticklabels=2,
        # yticklabels=2,
        linewidths=.5,
        cbar_kws={
            "shrink": .5,
        },
    )

    plt.xticks(rotation='vertical')
    plt.yticks(rotation='horizontal')

    # print output

    plt.savefig(i_name)
    print("Wrote file: {}".format(i_name))
Example #38
def adjust_atom_dist(cfg, data_tpl_content):
    """
    If this option is selected, adjusts the xyz coordinates to specified distances
    @param cfg: configuration for the run
    @param data_tpl_content: processed data from the template
    @return: will print new data files or raise InvalidDataError
    """
    for atom_num in cfg[ATOMS_DIST]:
        if atom_num > data_tpl_content[NUM_ATOMS]:
            raise InvalidDataError(
                "Keyword '{}' specified atom indexes {} but found only "
                "{} atoms in the data template file: {}".format(
                    ATOMS_DIST, cfg[ATOMS_DIST], data_tpl_content[NUM_ATOMS],
                    cfg[DATA_TPL_FILE]))
    # since python is zero-based, must subtract 1
    pivot_atom_num = cfg[ATOMS_DIST][0] - 1
    pivot_atom = data_tpl_content[ATOMS_CONTENT][pivot_atom_num]
    pivot_xyz = np.array(pivot_atom[4:7])

    moving_atom_num = cfg[ATOMS_DIST][1] - 1
    moving_atom = data_tpl_content[ATOMS_CONTENT][moving_atom_num]
    moving_xyz = np.array(moving_atom[4:7])

    diff_vector = pbc_calc_vector(moving_xyz, pivot_xyz,
                                  data_tpl_content[BOX_SIZE])
    base_dist = np.linalg.norm(diff_vector)

    head_content = data_tpl_content[HEAD_CONTENT]
    atoms_content = data_tpl_content[ATOMS_CONTENT]
    tail_content = data_tpl_content[TAIL_CONTENT]

    for new_dist in cfg[NEW_DIST_LIST]:
        multiplier = new_dist / base_dist
        f_name = create_out_fname(cfg[DATA_TPL_FILE],
                                  suffix='_' + str(new_dist),
                                  ext='.data')
        atoms_content[moving_atom_num][4:7] = np.round(
            multiplier * diff_vector + pivot_xyz, 6)
        list_to_file(head_content + atoms_content + tail_content, f_name)
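
adjust_atom_dist rescales the pivot-to-moving vector so its length equals each requested distance: new_xyz = (new_dist / base_dist) * diff_vector + pivot_xyz. A minimal check of that geometry, ignoring the periodic boundaries that pbc_calc_vector handles in the real code:

import numpy as np

pivot_xyz = np.array([0.0, 0.0, 0.0])
moving_xyz = np.array([3.0, 4.0, 0.0])
diff_vector = moving_xyz - pivot_xyz
base_dist = np.linalg.norm(diff_vector)  # 5.0

new_dist = 2.5
new_xyz = (new_dist / base_dist) * diff_vector + pivot_xyz
assert np.isclose(np.linalg.norm(new_xyz - pivot_xyz), new_dist)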
Example #39
def find_rel_e(extracted_data, cfg, ref_e_dict):
    """
    calculate relative energy, if data found
    @param extracted_data: dictionary of data found from chk file
    @param cfg: configuration for run
    @param ref_e_dict: reference energies, if available
    @return:
    """

    tot_resid = 0
    num_resid = 0

    for data_dict in extracted_data:
        this_group = data_dict[REL_E_GROUP]
        rel_ene_ref = np.nan
        if this_group:
            rel_ene_ref = cfg[REL_E_SEC][this_group][REL_E_REF]
        if this_group is None or np.isnan(rel_ene_ref):
            data_dict[REL_E] = np.nan
        else:
            rel_e = data_dict[ENV_ENE] - rel_ene_ref
            data_dict[REL_E] = rel_e
            file_name = data_dict[FILE_NAME]
            if file_name in ref_e_dict:
                ref_e = ref_e_dict[file_name]
                resid = np.round(np.sqrt((ref_e - rel_e)**2), 6)

                data_dict[REF_E] = ref_e
                data_dict[E_RESID] = resid
                tot_resid += resid
                num_resid += 1

    f_out = create_out_fname(cfg[CHK_FILE_LIST],
                             suffix='_sum',
                             ext='.csv',
                             base_dir=cfg[OUT_BASE_DIR])
    write_csv(extracted_data, f_out, ENE_FIELD_NAMES, extrasaction="ignore")
    if len(ref_e_dict) > 1:
        print("Calculated total energy residual from {} files: {}".format(
            num_resid, tot_resid))
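
The residual in find_rel_e, np.sqrt((ref_e - rel_e)**2), is simply the absolute difference, rounded; summing it over files gives the total reported at the end. In one line:

import numpy as np

ref_e, rel_e = -12.345678, -12.340000
resid = np.round(np.sqrt((ref_e - rel_e) ** 2), 6)
assert resid == np.round(abs(ref_e - rel_e), 6)  # 0.005678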
Example #40
def process_file(base_file, data_file):
    # TODO: add in reading vectors
    base_dict = read_csv(base_file, quote_style=csv.QUOTE_NONNUMERIC)[0]
    data_dict_list = read_csv(data_file, quote_style=csv.QUOTE_NONNUMERIC)

    data_headers = [INDEX, RMSD] + read_csv_header(data_file)

    num_vals = len(base_dict.values())
    for data_id, data_dict in enumerate(data_dict_list):
        rmsd = 0.0
        for key, val in base_dict.items():
            try:
                rmsd += (data_dict[key] - val)**2
            except KeyError:
                raise InvalidDataError(
                    "Could not find key '{}' from base file in compared data file."
                    .format(key))

        data_dict[INDEX] = data_id
        data_dict[RMSD] = round((rmsd / num_vals)**0.5, 2)

    out_name = create_out_fname(data_file, prefix=RMSD + '_')
    write_csv(data_dict_list, out_name, data_headers)
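
The RMSD in this example is the root of the mean squared difference over the shared keys, so two rows differing by 3.0 and 4.0 in two columns give sqrt((9 + 16) / 2), about 3.54. The arithmetic in isolation:

base_dict = {"a": 1.0, "b": 2.0}
data_dict = {"a": 4.0, "b": 6.0}

rmsd = sum((data_dict[key] - val) ** 2 for key, val in base_dict.items())
rmsd = round((rmsd / len(base_dict)) ** 0.5, 2)
print(rmsd)  # 3.54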
Example #41
def process_pdb_files(cfg, data_tpl_content):
    # # For printing a dictionary
    # new_atom_type_dict = {}
    with open(cfg[PDBS_FILE]) as f:
        for pdb_file in f.readlines():
            pdb_atom_line = []
            pdb_file = pdb_file.strip()
            with open(pdb_file) as d:
                atom_num = 0
                for line in d.readlines():
                    pdb_section = line[:cfg[PDB_SECTION_LAST_CHAR]]
                    if pdb_section == 'ATOM  ':
                        # atom_nums = line[cfg[PDB_SECTION_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]]
                        # atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_INFO_LAST_CHAR]]
                        # There is already a try when calling the subroutine, so maybe I don't need to?
                        # mol_num = int(line[cfg[PDB_ATOM_INFO_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                        pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                        pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                        pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                        # last_cols = line[cfg[PDB_Z_LAST_CHAR]:]
                        # if data_tpl_content[ATOMS_CONTENT][atom_num][2] !=data_tpl_content[ATOM_TYPE_DICT][atom_type]:
                        #     print(atom_num,atom_type, data_tpl_content[ATOMS_CONTENT][atom_num][2],
                        # data_tpl_content[ATOM_TYPE_DICT][atom_type])
                        # # For printing a dictionary
                        # new_atom_type_dict[atom_type] = data_tpl_content[ATOMS_CONTENT][atom_num][2]
                        pdb_atom_line.append(data_tpl_content[ATOMS_CONTENT][atom_num][:4] +
                                             [pdb_x, pdb_y, pdb_z] + data_tpl_content[ATOMS_CONTENT][atom_num][4:])
                        atom_num += 1
            if atom_num != data_tpl_content[NUM_ATOMS]:
                raise InvalidDataError('The length of the "Atoms" section ({}) in the pdb does not equal '
                                       'the number of atoms in the data template file ({}).'
                                       ''.format(atom_num, data_tpl_content[NUM_ATOMS]))
            d_out = create_out_fname(pdb_file, suffix='_from_py', ext='.data')
            list_to_file(data_tpl_content[HEAD_CONTENT] + pdb_atom_line + data_tpl_content[TAIL_CONTENT],
                         d_out)
            print('Wrote file: {}'.format(d_out))
Example #42
def create_hists(data_file, header_row, hist_data, out_dir):
    counts_to_print = []
    if len(hist_data) > 0:
        for col in hist_data:
            count_to_print = create_hist_plot(hist_data[col], header_row[col], out_dir, data_file)

            if len(counts_to_print) == 0:
                counts_to_print = count_to_print
            else:
                len1 = len(counts_to_print)
                len2 = len(count_to_print)
                width1 = len(counts_to_print[0])
                width2 = len(count_to_print[0])
                combined_list = []
                for row in range(min(len1, len2)):
                    combined_list.append(counts_to_print[row] + count_to_print[row])
                for row in range(len2, len1):
                    combined_list.append(counts_to_print[row] + [""] * width2)
                for row in range(len1, len2):
                    # noinspection PyTypeChecker
                    combined_list.append([""] * width1 + count_to_print[row])
                counts_to_print = copy.deepcopy(combined_list)
    f_name = create_out_fname(data_file, prefix='counts_', ext='.csv', base_dir=out_dir)
    list_to_csv(counts_to_print, f_name, delimiter=',')
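
create_hists pads the shorter of the two column blocks with empty strings so the blocks can sit side by side in one CSV. itertools.zip_longest produces the same padded merge more compactly; a sketch with invented rows:

from itertools import zip_longest

counts_to_print = [["k1", 5], ["k2", 3], ["k3", 1]]
count_to_print = [["x", 9]]
width1, width2 = len(counts_to_print[0]), len(count_to_print[0])

combined_list = [(row1 or [""] * width1) + (row2 or [""] * width2)
                 for row1, row2 in zip_longest(counts_to_print, count_to_print)]
print(combined_list)  # [['k1', 5, 'x', 9], ['k2', 3, '', ''], ['k3', 1, '', '']]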
Example #44
def process_file(data_file, out_dir, len_buffer, delimiter, min_max_dict, header=False, make_hist=False):
    try:
        dim_vectors, header_row, hist_data = np_float_array_from_file(
            data_file, delimiter=delimiter, header=header, gather_hist=make_hist
        )

    except InvalidDataError as e:
        raise InvalidDataError(
            "{}\n"
            "Run program with '-h' to see options, such as specifying header row (-n) "
            "and/or delimiter (-d)".format(e)
        )

    if header:
        to_print = [[""] + header_row]
    else:
        to_print = []

    max_vector = dim_vectors.max(axis=0)
    min_vector = dim_vectors.min(axis=0)
    avg_vector = dim_vectors.mean(axis=0)
    med_vector = np.percentile(dim_vectors, 50, axis=0)

    # noinspection PyTypeChecker
    to_print += [
        ["Min values:"] + min_vector.tolist(),
        ["Max values:"] + max_vector.tolist(),
        ["Avg values:"] + avg_vector.tolist(),
        ["Std dev:"] + dim_vectors.std(axis=0, ddof=1).tolist(),
        ["5% percentile:"] + np.percentile(dim_vectors, 4.55, axis=0).tolist(),
        ["32% percentile:"] + np.percentile(dim_vectors, 31.73, axis=0).tolist(),
        ["50% percentile:"] + med_vector.tolist(),
        ["68% percentile:"] + np.percentile(dim_vectors, 68.27, axis=0).tolist(),
        ["95% percentile:"] + np.percentile(dim_vectors, 95.45, axis=0).tolist(),
    ]
    if len_buffer is not None:
        to_print.append(["Max plus {} buffer:".format(len_buffer)] + (max_vector + len_buffer).tolist())

    if min_max_dict is not None:
        nan_list = [np.nan] * len(header_row)
        avg_ini_diff = ["Avg % Diff:"] + nan_list
        med_ini_diff = ["Med % Diff:"] + nan_list
        med_is_min = ["Median is Min:"] + nan_list
        med_is_max = ["Median is Max:"] + nan_list
        # use col_header (not header) to avoid shadowing the function's header parameter
        for col_num, col_header in enumerate(to_print[0]):
            if col_header in min_max_dict[0]:
                ini_val = min_max_dict[0][col_header]
                low_val = min_max_dict[1][col_header]
                upp_val = min_max_dict[2][col_header]
                avg_val = avg_vector[col_num - 1]
                med_val = med_vector[col_num - 1]
                min_val = min_vector[col_num - 1]
                max_val = max_vector[col_num - 1]
                min_tol = max(TOL * max(abs(min_val), abs(low_val)), TOL)
                med_tol = max(TOL * abs(med_val), TOL)
                max_tol = max(TOL * max(abs(max_val), abs(upp_val)), TOL)
                if (low_val - min_val) > min_tol:
                    warning(
                        "Minimum value found for header '{}' ({}) is less than lower bound ({})"
                        "".format(col_header, min_val, low_val)
                    )
                if (max_val - upp_val) > max_tol:
                    warning(
                        "Maximum value found for header '{}' ({}) is greater than upper bound ({})"
                        "".format(col_header, max_val, upp_val)
                    )
                avg_ini_diff[col_num] = (avg_val - ini_val) / ini_val * 100
                med_ini_diff[col_num] = (med_val - ini_val) / ini_val * 100
                if abs(med_val - low_val) > med_tol:
                    med_is_min[col_num] = 0
                else:
                    med_is_min[col_num] = 1
                if abs(med_val - upp_val) > med_tol:
                    med_is_max[col_num] = 0
                else:
                    med_is_max[col_num] = 1
            # else:
            #     for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            #         min_max_list.append(np.nan)
        for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            to_print.append(min_max_list)

    # Printing to standard out: formatted manually (not with the csv writer), so strings print without quotes
    # print("Number of dimensions ({}) based on first line of file: {}".format(len(dim_vectors[0]), data_file))
    if len(dim_vectors[0]) < 12:
        for index, row in enumerate(to_print):
            # formatting for header
            if index == 0 and header:
                print("{:>20s} {}".format(row[0], " ".join(["{:>16s}".format(x.strip()) for x in row[1:]])))
            # formatting for vals
            else:
                print("{:>20s} {}".format(row[0], " ".join(["{:16.6f}".format(x) for x in row[1:]])))

    f_name = create_out_fname(data_file, prefix="stats_", ext=".csv", base_dir=out_dir)
    list_to_csv(to_print, f_name)
    # list_to_file(to_print, f_name, delimiter=',')

    if make_hist:
        create_hists(data_file, header_row, hist_data, out_dir)
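A minimal sketch of the statistics block above on synthetic data. The labels and the 4.55/95.45 percentiles (which bracket roughly two standard deviations for normally distributed data) mirror process_file, but the input array is made up:

import numpy as np

dim_vectors = np.random.default_rng(0).normal(size=(100, 3))  # made-up data
stats = [
    ["Min values:"] + dim_vectors.min(axis=0).tolist(),
    ["Max values:"] + dim_vectors.max(axis=0).tolist(),
    ["Avg values:"] + dim_vectors.mean(axis=0).tolist(),
    ["Std dev:"] + dim_vectors.std(axis=0, ddof=1).tolist(),
    # 4.55 / 95.45 bracket roughly +/- 2 sigma for normally distributed data
    ["5% percentile:"] + np.percentile(dim_vectors, 4.55, axis=0).tolist(),
    ["95% percentile:"] + np.percentile(dim_vectors, 95.45, axis=0).tolist(),
]
for row in stats:
    print("{:>20s} {}".format(row[0], " ".join("{:16.6f}".format(x) for x in row[1:])))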
Exemple #45
0
def process_dump_file(cfg, dump_file, atom_num_dict, atom_type_dict, mol_num_dict):
    section = None
    box = np.zeros((3,))
    counter = 1
    num_atoms = 0
    head_content = []
    steps_count = 0
    step_stop = cfg[MAX_STEPS] * cfg[OUT_FREQ]
    timestep = None
    atom_data = []  # guard against a dump file that does not begin with a TIMESTEP section
    with open(dump_file) as d:
        d_out = create_out_fname(dump_file, suffix='_reorder', base_dir=cfg[OUT_BASE_DIR])
        write_mode = 'w'
        for line in d:
            line = line.strip()

            if section == SEC_ATOMS:
                split_line = line.split()
                # If there is an incomplete line in a dump file, move on to the next file
                if len(split_line) < 7:
                    break

                atom_num = int(split_line[0])
                if atom_num in atom_num_dict:
                    atom_num = atom_num_dict[atom_num]

                mol_num = int(split_line[1])
                if mol_num in mol_num_dict:
                    mol_num = mol_num_dict[mol_num]
                # Default RENUM_START_MOL is -1; if still negative, the user did not request renumbering
                if 0 <= cfg[RENUM_START_MOL] <= mol_num:
                    mol_num += cfg[RENUM_SHIFT]

                atom_type = int(split_line[2])
                if atom_type in atom_type_dict:
                    atom_type = atom_type_dict[atom_type]

                charge = float(split_line[3])
                x, y, z = map(float, split_line[4:7])
                atom_struct = [atom_num, mol_num, atom_type, charge, x, y, z]
                atom_data.append(atom_struct)
                if counter == num_atoms:
                    if len(atom_num_dict) > 0:
                        atom_data = sorted(atom_data, key=lambda atom: atom[0])
                    steps_count += 1
                    if steps_count % cfg[OUT_FREQ] == 0:
                        print_to_dump_file(head_content, atom_data, d_out, mode=write_mode)
                        if write_mode == 'w':
                            write_mode = 'a'
                    if steps_count == step_stop:
                        print("Reached the maximum number of steps ({})".format(cfg[MAX_STEPS]))
                        counter = 1
                        break
                    # reset for next timestep
                    head_content = []
                    counter = 0
                    section = None
                counter += 1

            else:
                head_content.append(line)
                if section is None:
                    section = find_dump_section_state(line)
                    if section is None:
                        raise InvalidDataError('Unexpected line in file {}: {}'.format(dump_file, line))
                elif section == SEC_TIMESTEP:
                    timestep = line
                    # Reset variables
                    atom_data = []
                    section = None
                elif section == SEC_NUM_ATOMS:
                    num_atoms = int(line)
                    section = None
                elif section == SEC_BOX_SIZE:
                    split_line = line.split()
                    diff = float(split_line[1]) - float(split_line[0])
                    box[counter - 1] = diff
                    if counter == 3:
                        counter = 0
                        section = None
                    counter += 1
    if counter == 1:
        print("Completed reading: {}".format(dump_file))
    else:
        warning("Dump file {} step {} did not have the full list of atom numbers. "
                "Continuing program.".format(dump_file, timestep))
Exemple #46
0
def process_data_file(cfg, chk_atom_type, data_dict, data_file,
                      data_tpl_content):
    with open(data_file) as d:
        pdb_data_section = copy.deepcopy(data_tpl_content[ATOMS_CONTENT])
        pdb_atom_num = len(pdb_data_section)
        section = SEC_HEAD
        atom_id = 0
        num_atoms = None
        atom_types = []

        for line in d:
            line = line.strip()
            # not currently keeping anything from the header; just check num atoms
            if section == SEC_HEAD:
                if ATOMS_PAT.match(line):
                    section = SEC_ATOMS
                elif num_atoms is None:
                    atoms_match = NUM_ATOMS_PAT.match(line)
                    if atoms_match:
                        # regex is 1-based
                        num_atoms = int(atoms_match.group(1))
                        if num_atoms != pdb_atom_num:
                            raise InvalidDataError(
                                "Mismatched numbers of atoms: \n"
                                "  Found {} atoms in file: {}\n"
                                "    and {} atoms in file: {}\n"
                                "".format(pdb_atom_num, cfg[PDB_TPL_FILE],
                                          num_atoms, data_file))

            # atoms_content to contain only xyz; also perform some checking
            elif section == SEC_ATOMS:
                if len(line) == 0:
                    continue
                split_line = line.split()

                # Not currently checking molecule number
                # If we decide to check it, we should count from 1 as the PDB is read; the PDB does not
                # have to start from 1, but the data file counts molecules from 1. For now, checking
                # the atom type is considered a sufficient check
                # mol_num = int(split_line[1])

                # Keep as string; json saves as string, which helps comparison
                atom_types.append(split_line[2])
                pdb_data_section[atom_id][5:8] = map(float, split_line[4:7])
                atom_id += 1
                # Check after increment because the counter started at 0
                if atom_id == num_atoms:
                    # Since the tail will come only from the template, nothing more is needed.
                    break

    # Now that finished reading the file...
    if atom_id != num_atoms:
        raise InvalidDataError(
            'In data file: {}\n'
            '  header section lists {} atoms, but found {} atoms'.format(
                data_file, num_atoms, atom_id))
    if chk_atom_type:
        for data_type, atom in zip(atom_types, pdb_data_section):
            try:
                pdb_type = atom[2] + atom[3]
                if pdb_type not in data_dict[data_type]:
                    warning(
                        'Did not find type {} in dictionary of values for atom_type {}: ({})'
                        ''.format(pdb_type, data_type, data_dict[data_type]))
                    # print("atom", atom_type, data_dict[atom_type])
            except KeyError:
                warning(
                    'Did not find data file atom type {} in the atom type dictionary {}'
                    ''.format(data_type, cfg[ATOM_TYPE_DICT_FILE]))
    f_name = create_out_fname(data_file,
                              ext='.pdb',
                              base_dir=cfg[OUT_BASE_DIR])
    list_to_file(data_tpl_content[HEAD_CONTENT] + pdb_data_section +
                 data_tpl_content[TAIL_CONTENT],
                 f_name,
                 list_format=cfg[PDB_FORMAT])
Exemple #47
0
def process_pdb_tpl(cfg):
    tpl_loc = cfg[PDB_TPL_FILE]
    tpl_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}

    atom_id = 0

    with open(tpl_loc) as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]]
            # head_content to contain everything before the 'Atoms' section
            # also capture the number of atoms
            # match the first 5 letters so no regex is needed for record types with numbers after the letters
            # noinspection SpellCheckingInspection
            if line_head[:-1] in [
                    'HEADE',
                    'TITLE',
                    'REMAR',
                    'CRYST',
                    'MODEL',
                    'COMPN',
                    'NUMMD',
                    'ORIGX',
                    'SCALE',
                    'SOURC',
                    'AUTHO',
                    'CAVEA',
                    'EXPDT',
                    'MDLTY',
                    'KEYWD',
                    'OBSLT',
                    'SPLIT',
                    'SPRSD',
                    'REVDA',
                    'JRNL ',
                    'DBREF',
                    'SEQRE',
                    'HET  ',
                    'HETNA',
                    'HETSY',
                    'FORMU',
                    'HELIX',
                    'SHEET',
                    'SSBON',
                    'LINK ',
                    'CISPE',
                    'SITE ',
            ]:
                tpl_data[HEAD_CONTENT].append(line)

            # atoms_content to contain everything but the xyz
            elif line_head == 'ATOM  ':

                # Renumbering handles the case when a PDB template has ***** after atom_id 99999.
                # When renumbering, make sure to print in the correct format, including the number of characters:
                atom_id += 1
                if atom_id > 99999:
                    atom_num = format(atom_id, 'x')
                else:
                    atom_num = '{:5d}'.format(atom_id)
                # Alternately, use this:
                # atom_num = line[cfg[PDB_LINE_TYPE_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]]

                atom_type = line[
                    cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]]
                res_type = line[
                    cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]]
                # The calling routine already wraps this in a try block, so no additional handling is needed here
                mol_num = int(line[
                    cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                pdb_x = float(
                    line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                last_cols = line[cfg[PDB_Z_LAST_CHAR]:]

                line_struct = [
                    line_head, atom_num, atom_type, res_type, mol_num, pdb_x,
                    pdb_y, pdb_z, last_cols
                ]
                tpl_data[ATOMS_CONTENT].append(line_struct)

            # tail_content to contain everything after the 'Atoms' section
            else:
                tpl_data[TAIL_CONTENT].append(line)

    if logger.isEnabledFor(logging.DEBUG):
        f_name = create_out_fname('reproduced_tpl',
                                  ext='.pdb',
                                  base_dir=cfg[OUT_BASE_DIR])
        list_to_file(tpl_data[HEAD_CONTENT] + tpl_data[ATOMS_CONTENT] +
                     tpl_data[TAIL_CONTENT],
                     f_name,
                     list_format=cfg[PDB_FORMAT])
    return tpl_data
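The fixed-column slicing above depends on column boundaries stored in cfg. A minimal sketch using standard PDB column positions as assumed values for those cfg[...] entries, on a sample ATOM record:

# Standard PDB column boundaries, used here as assumed values for the cfg[...] entries.
PDB_LINE_TYPE_LAST_CHAR = 6
PDB_ATOM_NUM_LAST_CHAR = 11
PDB_ATOM_TYPE_LAST_CHAR = 17
PDB_RES_TYPE_LAST_CHAR = 22
PDB_MOL_NUM_LAST_CHAR = 26
PDB_X_LAST_CHAR = 38
PDB_Y_LAST_CHAR = 46
PDB_Z_LAST_CHAR = 54

line = "ATOM      1  N   ALA A   1      11.104   6.134  -6.504  1.00  0.00           N"
atom_type = line[PDB_ATOM_NUM_LAST_CHAR:PDB_ATOM_TYPE_LAST_CHAR]
res_type = line[PDB_ATOM_TYPE_LAST_CHAR:PDB_RES_TYPE_LAST_CHAR]
mol_num = int(line[PDB_RES_TYPE_LAST_CHAR:PDB_MOL_NUM_LAST_CHAR])
pdb_x = float(line[PDB_MOL_NUM_LAST_CHAR:PDB_X_LAST_CHAR])
pdb_y = float(line[PDB_X_LAST_CHAR:PDB_Y_LAST_CHAR])
pdb_z = float(line[PDB_Y_LAST_CHAR:PDB_Z_LAST_CHAR])
print(atom_type, res_type, mol_num, pdb_x, pdb_y, pdb_z)  # parsed fields from the fixed columns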
Exemple #48
0
def process_evb_files(cfg):
    """
    Want to grab the timestep and highest prot ci^2, highest wat ci^2, and print them
    @param cfg: configuration data read from ini file
    @return: @raise InvalidDataError:
    """
    first_file_flag = True
    evb_file_list = []

    if cfg[EVB_FILE] is not None:
        evb_file_list.append(cfg[EVB_FILE])

    # Use a separate try block here because the program should continue rather than exit;
    # it exits below only if there are no files to process
    try:
        with open(cfg[EVB_FILES]) as f:
            for evb_file in f:
                evb_file_list.append(evb_file.strip())
    except IOError as e:
        warning("Problems reading file:", e)

    if len(evb_file_list) == 0:
        raise InvalidDataError("Found no evb file names to read. Specify one file with the keyword '{}' or \n"
                               "a file containing a list of evb files with the keyword '{}'.".format(EVB_FILE,
                                                                                                     EVB_FILES))

    for evb_file in evb_file_list:
        data_to_print, subset_to_print, wat_mol_data_to_print = process_evb_file(evb_file, cfg)
        no_print = []
        if cfg[PRINT_PER_FILE] is True:
            if cfg[PRINT_KEY_PROPS]:
                if len(data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_evb_info', ext='.csv',
                                             base_dir=cfg[OUT_BASE_DIR])
                    write_csv(data_to_print, f_out, KEY_PROPS_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_KEY_PROPS)
            if cfg[PRINT_CI_SUBSET]:
                if len(subset_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_ci_sq_ts', ext='.csv',
                                             base_dir=cfg[OUT_BASE_DIR])
                    write_csv(subset_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_CI_SUBSET)
            if cfg[PRINT_CI_SQ]:
                if len(data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_ci_sq', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                    write_csv(data_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_CI_SQ)
            if cfg[PRINT_CEC]:
                if len(data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_cec', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                    write_csv(data_to_print, f_out, CEC_COORD_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_CEC)
            if cfg[PRINT_WAT_MOL]:
                if len(wat_mol_data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_wat_mols', ext='.csv',
                                             base_dir=cfg[OUT_BASE_DIR])
                    write_csv(wat_mol_data_to_print, f_out, PROT_WAT_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_WAT_MOL)
        if len(no_print) > 0:
            warning("{} set to true, but found no data from: {} \n"
                    "No output will be printed for this file.".format(",".join(map(single_quote, no_print)), evb_file))
        if cfg[PRINT_PER_LIST]:
            if first_file_flag:
                print_mode = 'w'
                first_file_flag = False
            else:
                print_mode = 'a'
            if cfg[PRINT_CI_SQ]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_ci_sq', ext='.csv',
                                         base_dir=cfg[OUT_BASE_DIR])
                write_csv(data_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_CI_SUBSET]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_ci_sq_ts', ext='.csv',
                                         base_dir=cfg[OUT_BASE_DIR])
                write_csv(subset_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_WAT_MOL]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_wat_mols', ext='.csv',
                                         base_dir=cfg[OUT_BASE_DIR])
                write_csv(wat_mol_data_to_print, f_out, PROT_WAT_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_CEC]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_cec', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                write_csv(data_to_print, f_out, CEC_COORD_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_KEY_PROPS]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_evb_info', ext='.csv',
                                         base_dir=cfg[OUT_BASE_DIR])
                write_csv(data_to_print, f_out, KEY_PROPS_FIELDNAMES, extrasaction="ignore", mode=print_mode)
Exemple #49
0
def process_data_tpl(cfg):
    tpl_loc = cfg[DATA_TPL_FILE]
    tpl_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: [], PROT_RES_MOL: [], H3O_MOL: [],
                WATER_MOLS: defaultdict(list), FIRST_H3O_H_INDEX: None}
    section = SEC_HEAD
    num_atoms_pat = re.compile(r"(\d+).*atoms$")
    atoms_pat = re.compile(r"^Atoms.*")
    # put in dummy x y z
    x = 0.0
    y = 0.0
    z = 0.0

    total_charge = 0.0

    # For debugging total charge
    calc_charge_atom_nums = {}
    for name in CALC_CHARGE_NAMES:
        calc_charge_atom_nums[cfg[name]] = name

    with open(tpl_loc) as f:
        for line in f:
            line = line.strip()
            # head_content to contain everything before the 'Atoms' section
            # also capture the number of atoms
            if section == SEC_HEAD:
                tpl_data[HEAD_CONTENT].append(line)
                if NUM_ATOMS not in tpl_data:
                    atoms_match = num_atoms_pat.match(line)
                    if atoms_match:
                        # regex is 1-based
                        tpl_data[NUM_ATOMS] = int(atoms_match.group(1))
                if atoms_pat.match(line):
                    section = SEC_ATOMS
                    tpl_data[HEAD_CONTENT].append('')
            # atoms_content to contain everything but the xyz: atom_num, mol_num, atom_type, charge, description
            elif section == SEC_ATOMS:
                if len(line) == 0:
                    continue
                split_line = line.split()
                atom_num = int(split_line[0])
                mol_num = int(split_line[1])
                atom_type = int(split_line[2])
                charge = float(split_line[3])
                description = ' '.join(split_line[7:])
                atom_struct = [atom_num, mol_num, atom_type, charge, x, y, z, description]
                tpl_data[ATOMS_CONTENT].append(atom_struct)
                total_charge += charge

                if atom_type == cfg[H3O_O_TYPE]:
                    tpl_data[H3O_MOL].append(atom_struct)
                    tpl_data[H3O_O_CHARGE] = charge
                elif atom_type == cfg[H3O_H_TYPE]:
                    if tpl_data[FIRST_H3O_H_INDEX] is None:
                        tpl_data[FIRST_H3O_H_INDEX] = len(tpl_data[H3O_MOL])
                    tpl_data[H3O_MOL].append(atom_struct)
                    tpl_data[H3O_H_CHARGE] = charge
                elif mol_num == cfg[PROT_RES_MOL_ID]:
                    tpl_data[PROT_RES_MOL].append(atom_struct)
                elif atom_type == cfg[WAT_O_TYPE] or atom_type == cfg[WAT_H_TYPE]:
                    tpl_data[WATER_MOLS][mol_num].append(atom_struct)
                if atom_num == tpl_data[NUM_ATOMS]:
                    section = SEC_TAIL
                    # Perform checks total charge
                    if abs(total_charge) < TOL:
                        print('The data file system is neutral (total charge {:.2e})'.format(total_charge))
                    else:
                        warning('The data file system is not neutral. Total charge {0:.6f}'.format(total_charge))
                    if len(tpl_data[PROT_RES_MOL]) == 0:
                        raise InvalidDataError('Did not find the input {} ({}).'.format(PROT_RES_MOL_ID,
                                                                                        cfg[PROT_RES_MOL_ID]))
                    for mol_list in [H3O_MOL, WATER_MOLS]:
                        if len(tpl_data[mol_list]) == 0:
                            raise InvalidDataError('In reading the data file, found no {}. Check the data file and '
                                                   'the input atom types: \n{} = {}\n{} = {}\n{} = {}\n'
                                                   '{} = {}\n{} = {}.'
                                                   ''.format(mol_list,
                                                             PROT_H_TYPE, cfg[PROT_H_TYPE],
                                                             H3O_O_TYPE, cfg[H3O_O_TYPE],
                                                             H3O_H_TYPE, cfg[H3O_H_TYPE],
                                                             WAT_O_TYPE, cfg[WAT_O_TYPE],
                                                             WAT_H_TYPE, cfg[WAT_H_TYPE]))

                elif atom_num in calc_charge_atom_nums:
                    print('After atom {0} ({1}), the total charge is: {2:.3f}'.format(atom_num,
                                                                                      calc_charge_atom_nums[atom_num],
                                                                                      total_charge))

            # tail_content to contain everything after the 'Atoms' section
            elif section == SEC_TAIL:
                tpl_data[TAIL_CONTENT].append(line)

    # Validate data section
    if len(tpl_data[ATOMS_CONTENT]) != tpl_data[NUM_ATOMS]:
        raise InvalidDataError('In the file {}, the length of the "Atoms" section ({}) does not equal '
                               'the number of atoms ({}).'.format(tpl_loc,
                                                                  len(tpl_data[ATOMS_CONTENT]),
                                                                  tpl_data[NUM_ATOMS]))

    if cfg[REPROD_TPL]:
        f_out = create_out_fname('reproduced_tpl', base_dir=cfg[OUT_BASE_DIR], ext='.data')
        list_to_file(tpl_data[HEAD_CONTENT] + tpl_data[ATOMS_CONTENT][:] + tpl_data[TAIL_CONTENT],
                     f_out)

    return tpl_data
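A minimal sketch of the running-charge neutrality check above; TOL and the per-atom charges are assumed values chosen only for illustration:

TOL = 1e-5  # assumed tolerance; the actual TOL is defined elsewhere
# hypothetical per-atom charges: two water-like triplets
charges = [-0.8, 0.4, 0.4, -0.834, 0.417, 0.417]
total_charge = 0.0
for charge in charges:
    total_charge += charge
if abs(total_charge) < TOL:
    print('The data file system is neutral (total charge {:.2e})'.format(total_charge))
else:
    print('Warning: the data file system is not neutral. Total charge {0:.6f}'.format(total_charge))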
Exemple #50
0
def process_dump_file(cfg, data_tpl_content, dump_file):
    section = None
    box = np.zeros((3,))
    counter = 1
    atom_list_order = [PRE_RES, PROT_RES, POST_RES, HYD_MOL, WAT_MOL, POST_WAT]
    dump_atom_data = []
    atom_lists = {PRE_RES: [],
                  PROT_RES: [],
                  POST_RES: [],
                  HYD_MOL: [],
                  WAT_MOL: [],
                  POST_WAT: []
                  }

    with open(dump_file) as d:
        for line in d:
            line = line.strip()
            if section is None:
                section = find_dump_section_state(line)
                if section is None:
                    raise InvalidDataError('Unexpected line in file {}: {}'.format(dump_file, line))
            elif section == SEC_TIMESTEP:
                timestep = line
                # Reset variables
                water_dict = defaultdict(list)
                dump_atom_data = []
                excess_proton = None
                hydronium = []
                for a_list in atom_lists:
                    atom_lists[a_list] = []
                section = None
            elif section == SEC_NUM_ATOMS:
                if data_tpl_content[NUM_ATOMS] != int(line):
                    raise InvalidDataError('At timestep {} in file {}, the listed number of atoms ({}) does '
                                           'not equal the number of atoms in the template data file '
                                           '({}).'.format(timestep, dump_file, line, data_tpl_content[NUM_ATOMS]))
                section = None
            elif section == SEC_BOX_SIZE:
                split_line = line.split()
                diff = float(split_line[1]) - float(split_line[0])
                box[counter - 1] = diff
                if counter == 3:
                    counter = 0
                    section = None
                counter += 1
            elif section == SEC_ATOMS:
                split_line = line.split()
                # If there is an incomplete line in a dump file, skip it
                if len(split_line) < 7:
                    continue
                atom_num = int(split_line[0])
                mol_num = int(split_line[1])
                atom_type = int(split_line[2])
                charge = float(split_line[3])
                x, y, z = map(float, split_line[4:7])
                description = ''
                atom_struct = [atom_num, mol_num, atom_type, charge, x, y, z, description]

                # Keep track of separate portions of the system to allow sorting and processing
                if mol_num == cfg[PROT_RES_MOL_ID]:
                    if atom_type == cfg[PROT_H_TYPE] and atom_num not in cfg[PROT_H_IGNORE]:
                        excess_proton = atom_struct
                    else:
                        atom_lists[PROT_RES].append(atom_struct)
                elif atom_type == cfg[H3O_O_TYPE] or atom_type == cfg[H3O_H_TYPE]:
                    hydronium.append(atom_struct)
                elif atom_type == cfg[WAT_O_TYPE] or atom_type == cfg[WAT_H_TYPE]:
                    water_dict[mol_num].append(atom_struct)
                # Save everything else in three chunks for recombining sections post-processing
                elif len(atom_lists[PROT_RES]) == 0:
                    atom_lists[PRE_RES].append(atom_struct)
                elif len(water_dict) == 0:
                    atom_lists[POST_RES].append(atom_struct)
                else:
                    atom_lists[POST_WAT].append(atom_struct)

                if counter == data_tpl_content[NUM_ATOMS]:
                    counter = 0
                    section = None

                    # Now that finished reading all atom lines...
                    # Check and process!
                    if len(water_dict) == 0:
                        raise InvalidDataError('Found no water molecules. Check that the input types {} = {} '
                                               'and {} = {} are in the dump '
                                               'file.'.format(WAT_O_TYPE, cfg[WAT_O_TYPE],
                                                              WAT_H_TYPE, cfg[WAT_H_TYPE]))
                    if excess_proton is None:
                        if len(hydronium) != 4:
                            raise InvalidDataError('Did not find an excess proton or one hydronium ion. Check dump '
                                                   'file and input types: {} = {}; {} = {}; {} = {}'
                                                   .format(PROT_H_TYPE, cfg[PROT_H_TYPE],
                                                           H3O_O_TYPE, cfg[H3O_O_TYPE],
                                                           H3O_H_TYPE, cfg[H3O_H_TYPE]))
                    else:
                        if len(hydronium) != 0:
                            raise InvalidDataError('Found both an excess proton and hydronium atoms. Check dump file '
                                                   'and input types: {} = {}; {} = {}; {} = {}'
                                                   .format(PROT_H_TYPE, cfg[PROT_H_TYPE],
                                                           H3O_O_TYPE, cfg[H3O_O_TYPE],
                                                           H3O_H_TYPE, cfg[H3O_H_TYPE]))
                        deprotonate(cfg, atom_lists[PROT_RES], excess_proton, hydronium,
                                    water_dict, box, data_tpl_content)

                    # Ensure in correct order for printing
                    atom_lists[HYD_MOL] = assign_hyd_mol(cfg, hydronium)
                    atom_lists[WAT_MOL] = sort_wat_mols(cfg, water_dict)

                    for a_list in atom_list_order:
                        dump_atom_data += atom_lists[a_list]

                    # overwrite atom_num, mol_num, atom_type, charge, then description
                    for index in range(len(dump_atom_data)):
                        if dump_atom_data[index][3] == data_tpl_content[ATOMS_CONTENT][index][3] or \
                                dump_atom_data[index][0] in cfg[PROT_TYPE_IGNORE_ATOMS]:
                            dump_atom_data[index][0:4] = data_tpl_content[ATOMS_CONTENT][index][0:4]
                            dump_atom_data[index][7] = ' '.join(data_tpl_content[ATOMS_CONTENT][index][7:])
                        else:
                            raise InvalidDataError("In reading file: {}\n found atom index {} with charge {} which "
                                                   "does not match the charge in the data template ({}). \n"
                                                   "To ignore this mis-match, list "
                                                   "the atom's index number in the keyword '{}' in the ini file."
                                                   "".format(dump_file,
                                                             dump_atom_data[index][0], dump_atom_data[index][3],
                                                             data_tpl_content[ATOMS_CONTENT][index][3],
                                                             PROT_TYPE_IGNORE_ATOMS))

                    d_out = create_out_fname(dump_file, suffix='_' + str(timestep),
                                             ext='.data', base_dir=cfg[OUT_BASE_DIR])
                    data_tpl_content[HEAD_CONTENT][0] = "Created by evbdump2data from {} " \
                                                        "timestep {}".format(dump_file, timestep)
                    list_to_file(data_tpl_content[HEAD_CONTENT] + dump_atom_data + data_tpl_content[TAIL_CONTENT],
                                 d_out)
                counter += 1
    if counter == 1:
        print("Completed reading dumpfile {}".format(dump_file))
    else:
        warning("Dump file {} step {} did not have the full list of atom numbers. "
                "Continuing program.".format(dump_file, timestep))
Exemple #51
0
def print_per_frame(dump_file, cfg, data_to_print, out_fieldnames, write_mode):
    f_out = create_out_fname(dump_file, suffix="_sum", ext=".csv", base_dir=cfg[OUT_BASE_DIR])
    write_csv(data_to_print, f_out, out_fieldnames, extrasaction="ignore", mode=write_mode)
Exemple #52
0
def process_data_file(cfg, chk_atom_type, data_dict, data_file, data_tpl_content):
    with open(data_file) as d:
        pdb_data_section = copy.deepcopy(data_tpl_content[ATOMS_CONTENT])
        pdb_atom_num = len(pdb_data_section)
        section = SEC_HEAD
        atom_id = 0
        num_atoms = None
        atom_types = []

        for line in d:
            line = line.strip()
            # not currently keeping anything from the header; just check num atoms
            if section == SEC_HEAD:
                if ATOMS_PAT.match(line):
                    section = SEC_ATOMS
                elif num_atoms is None:
                    atoms_match = NUM_ATOMS_PAT.match(line)
                    if atoms_match:
                        # regex is 1-based
                        num_atoms = int(atoms_match.group(1))
                        if num_atoms != pdb_atom_num:
                            raise InvalidDataError("Mismatched numbers of atoms: \n"
                                                   "  Found {} atoms in file: {}\n"
                                                   "    and {} atoms in file: {}\n"
                                                   "".format(pdb_atom_num, cfg[PDB_TPL_FILE],
                                                             num_atoms, data_file))

            # atoms_content to contain only xyz; also perform some checking
            elif section == SEC_ATOMS:
                if len(line) == 0:
                    continue
                split_line = line.split()

                # Not currently checking molecule number
                # If we decide to check it, we should count from 1 as the PDB is read; the PDB does not
                # have to start from 1, but the data file counts molecules from 1. For now, checking
                # the atom type is considered a sufficient check
                # mol_num = int(split_line[1])

                # Keep as string; json saves as string, which helps comparison
                atom_types.append(split_line[2])
                pdb_data_section[atom_id][5:8] = map(float, split_line[4:7])
                atom_id += 1
                # Check after increment because the counter started at 0
                if atom_id == num_atoms:
                    # Since the tail will come only from the template, nothing more is needed.
                    break

    # Now that finished reading the file...
    if atom_id != num_atoms:
        raise InvalidDataError('In data file: {}\n'
                               '  header section lists {} atoms, but found {} atoms'.format(data_file,
                                                                                            num_atoms, atom_id))
    if chk_atom_type:
        for data_type, atom in zip(atom_types, pdb_data_section):
            try:
                pdb_type = atom[2] + atom[3]
                if pdb_type not in data_dict[data_type]:
                    warning('Did not find type {} in dictionary of values for atom_type {}: ({})'
                            ''.format(pdb_type, data_type, data_dict[data_type]))
                    # print("atom", atom_type, data_dict[atom_type])
            except KeyError:
                warning('Did not find data file atom type {} in the atom type dictionary {}'
                        ''.format(data_type, cfg[ATOM_TYPE_DICT_FILE]))
    f_name = create_out_fname(data_file, ext='.pdb', base_dir=cfg[OUT_BASE_DIR])
    list_to_file(data_tpl_content[HEAD_CONTENT] + pdb_data_section + data_tpl_content[TAIL_CONTENT],
                 f_name,
                 list_format=cfg[PDB_FORMAT])
Exemple #53
0
def make_summary(output_file, summary_file, cfg):
    low, high, headers = get_param_info(cfg)
    latest_output = np.loadtxt(output_file, dtype=np.float64)

    # append last best resid
    low = np.append(low, np.nan)
    high = np.append(high, np.nan)
    headers.append('resid')
    base_dir = os.path.dirname(output_file)
    latest_output = np.append(latest_output, get_resid(base_dir))

    if os.path.isfile(summary_file):
        last_row = None
        percent_diffs = []
        previous_output = np.loadtxt(summary_file, dtype=np.float64)
        all_output = np.vstack((previous_output, latest_output))
        for row in all_output:
            if last_row is not None:
                diff = row - last_row
                percent_diff = {}
                # Check data for small values, hitting upper or lower bound, and calc % diff
                for index, val in enumerate(np.nditer(row)):
                    if abs(val) < TOL:
                        warning("Small value ({}) encountered for parameter {} (col {})"
                                "".format(val, headers[index], index))
                    if abs(diff[index]) > TOL:
                        if abs(last_row[index]) > TOL:
                            percent_diff[headers[index]] = "%8.2f" % (diff[index] / last_row[index] * 100)
                        else:
                            percent_diff[headers[index]] = '        '
                        if abs(val-low[index]) < TOL:
                            warning("Value ({}) near lower bound ({}) encountered for parameter {} (col {})."
                                    "".format(val, low[index], headers[index], index))
                        if abs(val-high[index]) < TOL:
                            warning("Value ({}) near upper bound ({}) encountered for parameter {} (col {})."
                                    "".format(val, high[index], headers[index], index))
                    else:
                        percent_diff[headers[index]] = '        '
                percent_diffs.append(percent_diff)
            last_row = row

        # format for gnuplot and np.loadtxt
        f_out = create_out_fname(summary_file, suffix='_perc_diff', ext='.csv', base_dir=cfg[MAIN_SEC][OUT_BASE_DIR])
        write_csv(percent_diffs, f_out, headers, extrasaction="ignore")
        print('Wrote file: {}'.format(f_out))

        f_out = create_out_fname(summary_file, ext='.csv', base_dir=cfg[MAIN_SEC][OUT_BASE_DIR])
        with open(f_out, 'w') as s_file:
            s_file.write(','.join(headers)+'\n')
            np.savetxt(s_file, all_output, fmt='%8.6f', delimiter=',')
        print('Wrote file: {}'.format(f_out))

        # in addition to csv (above), print format for gnuplot and np.loadtxt
        with open(summary_file, 'w') as s_file:
            np.savetxt(s_file, all_output, fmt='%12.6f')
        print("Wrote summary file {}".format(summary_file))
    else:
        # Separate statement because a 1D array is handled differently than a 2D array (newline=' ')
        with open(summary_file, 'w') as s_file:
            np.savetxt(s_file, latest_output, fmt='%12.6f', newline=' ')
        print("Wrote results from {} to new summary file {}".format(output_file, summary_file))
Exemple #54
0
def process_pdb(cfg, atom_num_dict, mol_num_dict, element_dict):
    pdb_loc = cfg[PDB_FILE]
    pdb_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    # to allow warning to be printed once and only once
    missing_types = []
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []

    with open(pdb_loc) as f:
        wat_count = 0
        atom_count = 0
        mol_count = 1

        current_mol = None
        last_mol_num = None
        atoms_content = []

        for line in f:
            line = line.strip()
            line_len = len(line)
            if line_len == 0:
                continue
            line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]]
            # head_content to contain everything before the 'Atoms' section
            # also capture the number of atoms
            if line_head == 'REMARK' or line_head == 'CRYST1':
                pdb_data[HEAD_CONTENT].append(line)

            # atoms_content to contain everything but the xyz
            elif line_head == 'ATOM  ':

                # My template PDB has ***** after atom_count 99999. Thus, I'm renumbering. Otherwise, use this:
                # atom_num = line[cfg[PDB_LINE_TYPE_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]]
                # For renumbering, making sure prints in the correct format, including num of characters:
                atom_count += 1

                # For reordering atoms
                if atom_count in atom_num_dict:
                    atom_id = atom_num_dict[atom_count]
                else:
                    atom_id = atom_count

                if atom_id > 99999:
                    atom_num = format(atom_id, 'x')
                    if len(atom_num) > 5:
                        warning("Hex representation of {} is {}, which is greater than 5 characters. This"
                                "will affect the PDB output formatting.".format(atom_id, atom_num))
                else:
                    atom_num = '{:5d}'.format(atom_id)

                atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]]
                res_type = line[cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]]
                mol_num = int(line[cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                occ_t = line[cfg[PDB_Z_LAST_CHAR]:cfg[PDB_LAST_T_CHAR]]
                element = line[cfg[PDB_LAST_T_CHAR]:cfg[PDB_LAST_ELEM_CHAR]]
                last_cols = line[cfg[PDB_LAST_ELEM_CHAR]:]

                # For user-specified changing of molecule number
                if mol_num in mol_num_dict:
                    mol_num = mol_num_dict[mol_num]

                # If doing water molecule checking...
                if cfg[FIRST_WAT_ID] <= atom_count <= cfg[LAST_WAT_ID]:
                    if (wat_count % 3) == 0:
                        current_mol = mol_num
                        if atom_type != '  OH2 ':
                            warning('Expected an OH2 atom to be the first atom of a water molecule. '
                                    'Check line: {}'.format(line))
                        # last_cols = '  0.00  0.00      S2   O'
                    else:
                        if current_mol != mol_num:
                            warning('Water not in order on line:', line)
                        if (wat_count % 3) == 1:
                            if atom_type != '  H1  ':
                                warning('Expected an H1 atom to be the second atom of a water molecule. '
                                        'Check line: {}'.format(line))
                        else:
                            if atom_type != '  H2  ':
                                warning('Expected an H2 atom to be the third atom of a water molecule. '
                                        'Check line: {}'.format(line))
                    wat_count += 1

                if mol_num in cfg[RESID_QMMM] and atom_type not in SKIP_ATOM_TYPES:
                    if atom_type == C_ALPHA:
                        ca_res_atom_id_dict[mol_num] = atom_id
                    else:
                        if atom_type == C_BETA:
                            cb_res_atom_id_dict[mol_num] = atom_id
                        if atom_type in element_dict:
                            element = element_dict[atom_type]
                        else:
                            raise InvalidDataError("Did not find atom type '{}' in the element dictionary. Please "
                                                   "provide a new atom type, element dictionary (using keyword {} "
                                                   "in the configuration file) that includes all atom types in the "
                                                   "residues identified with the '{}' key."
                                                   "".format(atom_type, ELEMENT_DICT_FILE, RESID_QMMM))
                        if element in qmmm_elem_id_dict:
                            qmmm_elem_id_dict[element].append(atom_id)
                        else:
                            qmmm_elem_id_dict[element] = [atom_id]
                        atoms_for_vmd.append(atom_id - 1)

                if cfg[ADD_ELEMENTS] and atom_count <= cfg[LAST_ADD_ELEM]:
                    if atom_type in element_dict:
                        element = element_dict[atom_type]
                    else:
                        if atom_type not in missing_types:
                            warning("Please add atom type '{}' to dictionary of elements. Will not write/overwrite "
                                    "element type in the pdb output.".format(atom_type))
                            missing_types.append(atom_type)

                # For numbering molecules from 1 to end
                if cfg[RENUM_MOL]:
                    if last_mol_num is None:
                        last_mol_num = mol_num

                    if mol_num != last_mol_num:
                        last_mol_num = mol_num
                        mol_count += 1
                        if mol_count == 10000:
                            warning("Molecule numbers greater than 9999 will be printed in hex")

                    # Due to PDB format constraints, molecule numbers above 9999 must be printed in hex.
                    if mol_count > 9999:
                        mol_num = format(mol_count, 'x')
                        if len(mol_num) > 4:
                            warning("Hex representation of {} is {}, which is greater than 4 characters. This "
                                    "will affect the PDB output formatting.".format(mol_count, mol_num))
                    else:
                        mol_num = '{:4d}'.format(mol_count)

                line_struct = [line_head, atom_num, atom_type, res_type, mol_num, pdb_x, pdb_y, pdb_z,
                               occ_t, element, last_cols]
                atoms_content.append(line_struct)

            # tail_content to contain everything after the 'Atoms' section
            else:
                pdb_data[TAIL_CONTENT].append(line)

    # Only sort if there is renumbering
    if len(atom_num_dict) > 0:
        pdb_data[ATOMS_CONTENT] = sorted(atoms_content, key=lambda entry: entry[1])
    else:
        pdb_data[ATOMS_CONTENT] = atoms_content

    if cfg[PDB_NEW_FILE] is None:
        f_name = create_out_fname(cfg[PDB_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR])
    else:
        f_name = create_out_fname(cfg[PDB_NEW_FILE], base_dir=cfg[OUT_BASE_DIR])
    print_pdb(pdb_data[HEAD_CONTENT], pdb_data[ATOMS_CONTENT], pdb_data[TAIL_CONTENT],
              f_name, cfg[PDB_FORMAT])

    if len(cfg[RESID_QMMM]) > 0:
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"
        for elem in qmmm_elem_id_dict:
            print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode)
        f_name = create_out_fname('vmd_protein_atoms.dat', base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
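A minimal sketch of the hex fallback used above when an atom id no longer fits the 5-character PDB field:

def format_atom_num(atom_id):
    """Right-justify in 5 characters, falling back to hex when the id is too large."""
    if atom_id > 99999:
        return format(atom_id, 'x')  # e.g. 100000 -> '186a0', still 5 characters
    return '{:5d}'.format(atom_id)


print(format_atom_num(42))      # '   42'
print(format_atom_num(100000))  # '186a0'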
Exemple #55
0
def process_data_file(atom_type_dict, data_file, data_tpl_content, new_data_section):
    with open(data_file) as d:
        section = SEC_HEAD
        atom_id = 0
        num_atoms = None
        for line in d.readlines():
            line = line.strip()
            # not keeping anything from the header
            if section == SEC_HEAD:
                if ATOMS_PAT.match(line):
                    section = SEC_ATOMS
                elif num_atoms is None:
                    atoms_match = NUM_ATOMS_PAT.match(line)
                    if atoms_match:
                        # regex is 1-based
                        num_atoms = int(atoms_match.group(1))
                        if num_atoms != len(data_tpl_content[ATOMS_CONTENT]):
                            raise InvalidDataError('The number of atoms in the template file ({}) does '
                                                   'not equal the number of atoms ({}) in the data file: {}.'
                                                   ''.format(data_tpl_content[NUM_ATOMS], num_atoms, data_file))
            # atoms_content to grab xyz and pbc rep; also perform some checking
            elif section == SEC_ATOMS:
                if len(line) == 0:
                    continue
                split_line = line.split()

                # Not currently checking molecule number; the number may be wrong and the data still correct,
                # because of the reordering I did to match the template ordering.
                # Thus, I don't need:
                # mol_num = int(split_line[1])

                # Perform checking that the atom type in the corresponding line of the template file matches
                # the current file
                try:
                    old_atom_type = int(split_line[2])
                    # Add in the xyz coordinates
                    new_data_section[atom_id][4:7] = map(float, split_line[4:7])
                except (IndexError, ValueError):
                    raise InvalidDataError("In attempting to read {} atoms from file: {}\n  "
                                           "expected, but did not find, three ints followed by four floats on"
                                           "line: {}\n  "
                                           "Check input".format(data_tpl_content[NUM_ATOMS], data_file, line))

                # If there is an atom_type_dict, and the read atom type is in it....
                if old_atom_type in atom_type_dict:
                    new_atom_type = data_tpl_content[ATOMS_CONTENT][atom_id][2]
                    matching_new_atom_type = atom_type_dict[old_atom_type]

                    if new_atom_type != matching_new_atom_type:
                        print('Data mismatch on atom_id {:3d}, line: {}\n  Expected type {} but found type {}'
                              ''.format(atom_id + 1, line, matching_new_atom_type, new_atom_type))

                # and pbc ids, if they are there, before comments
                try:
                    # validate as ints, then join as strings (joining ints raises a TypeError)
                    pbc_ids = [str(int(pbc)) for pbc in split_line[8:10]]
                    new_data_section[atom_id][7] = ' '.join(pbc_ids + [new_data_section[atom_id][7]])
                except (ValueError, IndexError):
                    # if there is no pbc id info and/or comment info, no problem. Keep on.
                    pass
                atom_id += 1
                # Check after increment because the counter started at 0
                if atom_id == num_atoms:
                    # Since the tail will come only from the template, nothing more is needed.
                    break

    # Now that finished reading the file...
    # Check total length
    # (will be wrong if got to tail before reaching num_atoms)
    if atom_id != num_atoms:
        raise InvalidDataError('The number of atoms read from the file {} ({}) does not equal '
                               'the listed number of atoms ({}).'.format(data_file, atom_id, num_atoms))
    # Now make the new file
    f_name = create_out_fname(data_file, suffix='_new', ext='.data')
    list_to_file(data_tpl_content[HEAD_CONTENT] + new_data_section + data_tpl_content[TAIL_CONTENT],
                 f_name)
    print('Completed writing {}'.format(f_name))
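A minimal sketch of the tolerant pbc-image parsing above, on hypothetical dump-style token lists; append_pbc_ids is an illustrative helper, not part of the original code:

def append_pbc_ids(description, split_line):
    """Prepend integer pbc image flags (tokens 8-9), if present, to the description."""
    try:
        pbc_ids = [str(int(pbc)) for pbc in split_line[8:10]]
        return ' '.join(pbc_ids + [description])
    except (ValueError, IndexError):
        # no pbc image info (e.g. a comment in that position); keep the description as-is
        return description


print(append_pbc_ids('water O', '1 1 1 -0.834 0.0 0.0 0.0 0 0 0'.split()))   # '0 0 water O'
print(append_pbc_ids('water O', '1 1 1 -0.834 0.0 0.0 0.0 # note'.split()))  # 'water O'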
Exemple #56
0
def make_summary(cfg):
    """
    If the option is specified, add the last best fit output file to the list of outputs and evaluate changes
    @param cfg: configuration for the run
    @return:
    """
    best_file = cfg[MAIN_SEC][BEST_FILE]
    summary_file = cfg[MAIN_SEC][SUMMARY_FILE]

    low, high, headers = get_param_info(cfg)
    latest_output = np.loadtxt(best_file, dtype=np.float64)

    if os.path.isfile(summary_file):
        last_row = None
        percent_diffs = []
        previous_output = np.loadtxt(summary_file, dtype=np.float64)
        all_output = np.vstack((previous_output, latest_output))
        for row in all_output:
            if last_row is not None:
                diff = row - last_row
                percent_diff = {}
                # Check data for small values, hitting upper or lower bound, and calc % diff
                for index, val in enumerate(np.nditer(row)):
                    if abs(val) < TOL:
                        warning(
                            "Small value ({}) encountered for parameter {} (col {})"
                            "".format(val, headers[index], index))
                    if abs(diff[index]) > TOL:
                        if abs(last_row[index]) > TOL:
                            percent_diff[headers[index]] = round(
                                diff[index] / last_row[index] * 100, 2)
                        else:
                            # previous value is ~0 but the change is non-negligible
                            percent_diff[headers[index]] = np.inf
                        if abs(val - low[index]) < TOL:
                            warning(
                                "Value ({}) near lower bound ({}) encountered for parameter {} (col {})."
                                "".format(val, low[index], headers[index],
                                          index))
                        if abs(val - high[index]) < TOL:
                            warning(
                                "Value ({}) near upper bound ({}) encountered for parameter {} (col {})."
                                "".format(val, high[index], headers[index],
                                          index))
                    else:
                        percent_diff[headers[index]] = np.nan
                percent_diffs.append(percent_diff)
            last_row = row
        if len(percent_diffs) > 0:
            max_percent_diff = 0
            max_diff_param = None
            for param, val in percent_diffs[-1].items():
                if abs(val) > abs(max_percent_diff):
                    max_percent_diff = val
                    max_diff_param = param
            print(
                "Maximum (absolute value) percent difference from last read line is {} % for parameter '{}'."
                "".format(max_percent_diff, max_diff_param))
            if cfg[MAIN_SEC][RESID_IN_BEST]:
                print("Percent change in residual: {} %"
                      "".format(
                          percent_diffs[-1][RESIDUAL +
                                            cfg[MAIN_SEC][SUM_HEAD_SUFFIX]]))

        # format for gnuplot and np.loadtxt
        f_out = create_out_fname(summary_file,
                                 suffix='_perc_diff',
                                 ext='.csv',
                                 base_dir=cfg[MAIN_SEC][OUT_BASE_DIR])
        write_csv(percent_diffs, f_out, headers, extrasaction="ignore")

        f_out = create_out_fname(summary_file,
                                 ext='.csv',
                                 base_dir=cfg[MAIN_SEC][OUT_BASE_DIR])
        with open(f_out, 'w') as s_file:
            s_file.write(','.join(headers) + '\n')
            np.savetxt(s_file, all_output, fmt='%8.6f', delimiter=',')
        print('Wrote file: {}'.format(f_out))

        # in addition to csv (above), print format for gnuplot and np.loadtxt
        with open(summary_file, 'w') as s_file:
            np.savetxt(s_file, all_output, fmt='%12.6f')
        print("Wrote file: {}".format(summary_file))
    else:
        # Separate statement because a 1D array is handled differently than a 2D array (newline=' ')
        with open(summary_file, 'w') as s_file:
            np.savetxt(s_file, latest_output, fmt='%12.6f', newline=' ')
        print("Wrote results from {} to new summary file {}".format(
            best_file, summary_file))
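The percent-difference bookkeeping above is easier to see in isolation. Below is a minimal,
hypothetical sketch of the same idea: compare each row of a parameter table to the previous row and
report the change as a percentage, guarding near-zero previous values with a tolerance. Only the
names TOL and headers mirror the code above; everything else is illustrative.

import numpy as np

TOL = 1e-8
headers = ["epsilon", "sigma"]


def percent_changes(rows, tol=TOL):
    """Percent change of each row vs. the previous one; nan if unchanged, inf if changed from ~zero."""
    diffs = []
    last_row = None
    for row in rows:
        if last_row is not None:
            change = {}
            for i, (val, prev) in enumerate(zip(row, last_row)):
                d = val - prev
                if abs(d) <= tol:
                    change[headers[i]] = np.nan  # effectively unchanged
                elif abs(prev) > tol:
                    change[headers[i]] = round(d / prev * 100, 2)
                else:
                    change[headers[i]] = np.inf  # changed from a (near-)zero value
            diffs.append(change)
        last_row = row
    return diffs


print(percent_changes([[1.0, 2.0], [1.1, 2.0], [1.1, 0.0]]))
# [{'epsilon': 10.0, 'sigma': nan}, {'epsilon': nan, 'sigma': -100.0}]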
Exemple #57
0
def process_files(comp_f_list, col_name, base_out_name, delimiter, sep_out_flag, out_location):
    """
    Want to grab the timestep, first and 2nd mole found, first and 2nd ci^2
    print the timestep, residue ci^2
    @param comp_f_list: a list of lists of file names to process (file read during input processing)
    @param col_name: name of column to use for alignment
    @param base_out_name: name of file to be created, or suffix if multiple files to be created
    @param delimiter: string, delimiter separating file names on lines of the comp_f_list
    @param sep_out_flag: a boolean to note if separate output files should be made based on each row of input
    @param out_location: user-specified location for the output files, if specified
    @return: @raise InvalidDataError:
    """
    all_dicts = defaultdict(dict)

    # if need multiple output files, designate them by adding a prefix
    prefix = ''
    # if there will be multiple output files, make sure do not reuse a prefix, so keep copy of used names
    prefix_used = []
    # if one output file from multiple sets of file to combine, will change write_mode to append later
    write_mode = 'w'

    # run names do not have to be specified in the output if there is only one row (set of files to
    #   combine), or if there will be separate output files
    if len(comp_f_list) < 2 or sep_out_flag:
        add_run_name = False
        headers = []
    else:
        add_run_name = True
        headers = [RUN_NAME]

    for line_num, line in enumerate(comp_f_list):
        dict_keys = None
        if sep_out_flag:
            headers = []
            all_dicts = defaultdict(dict)
        # split on the delimiter, strip any white space, and drop empty entries; wrap in a list so the
        #   result can be iterated more than once under python 3
        comp_files = list(filter(None, [c_file.strip() for c_file in line.split(delimiter)]))

        # get the common part of the name, if it exists; otherwise, use the line index as the name
        for file_index, file_name in enumerate(comp_files):
            base_name = os.path.splitext(os.path.basename(file_name))[0]
            if file_index == 0:
                run_name = base_name
            else:
                run_name = longest_common_substring(run_name, base_name)
        if run_name == '':
            # run_name is used in the output as a string, so convert the line index
            run_name = str(line_num) + "_"

        for c_file in comp_files:
            new_dict = read_csv_to_dict(c_file, col_name)
            if dict_keys is None:
                dict_keys = new_dict.keys()
            else:
                dict_keys = set(dict_keys).intersection(new_dict.keys())
            # get the keys of the inner dictionary; the methods differ between python 2 and 3, so use six
            new_dict_keys = six.next(six.itervalues(new_dict)).keys()
            # expect to gather new headers only when making a new file (write_mode == 'w'); for subsequent
            #   files, no more headers are gathered. When printed, extra columns will be skipped and
            #   missing columns will show no data
            if write_mode == 'w':
                for key in new_dict_keys:
                    if key in headers:
                        # it is okay to already have the header if it is the alignment column;
                        #   if appending, we also expect the header name to already be present
                        if key != col_name:
                            warning("Non-unique column name {} found in {}. "
                                    "Values will be overwritten.".format(key, c_file))
                    else:
                        headers.append(key)
            for new_key, new_vals in new_dict.items():
                all_dicts[new_key].update(new_vals)

        final_dict = []
        for key in sorted(dict_keys):
            final_dict.append(all_dicts[key])

        if add_run_name:
            for each_dict in final_dict:
                each_dict.update({RUN_NAME: run_name})

        # it is possible to have no overlap in the alignment column
        if len(final_dict) > 0:
            if sep_out_flag:
                prefix = run_name
                if prefix == '' or prefix in prefix_used:
                    prefix = str(line_num) + "_"
            # have a consistent output by sorting the headers, but keep the aligning column first
            # only needs to be done for printing the first time
            if write_mode == 'w':
                headers.remove(col_name)
                headers = [col_name] + sorted(headers)
                if add_run_name:
                    headers.remove(RUN_NAME)
                    headers = [RUN_NAME] + headers
            f_name = create_out_fname(base_out_name, prefix=prefix, base_dir=out_location)
            prefix_used.append(prefix)
            write_csv(final_dict, f_name, headers, mode=write_mode)
            if not sep_out_flag and write_mode == 'w':
                write_mode = 'a'
        else:
            raise InvalidDataError("No common values found for column {} among files: {}"
                                   "".format(col_name, ", ".join(comp_files)))
Exemple #58
0
def process_file(data_file, mcfg, delimiter=','):
    list_vectors, headers = read_csv_to_list(data_file, delimiter=delimiter, header=True)

    col_index_dict = {}
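    # map each configured sub-section to {column index in headers: configured value}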
    for section in SUB_SECTIONS:
        col_index_dict[section] = {}
        for key, val in mcfg[section].items():
            if key in headers:
                # the parser already ensured that the entries are unique
                col_index_dict[section][headers.index(key)] = val
            else:
                raise InvalidDataError("Key '{}' found in configuration file but not in data file: "
                                       "{}".format(key, data_file))

    # set up bins, if needed
    bin_arrays = {}
    bin_labels = {}
    bin_counts = {}
    bin_ctrs = {}
    max_bins = {}
    for bin_col, col_bin_data in col_index_dict[BIN_SEC].items():
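        # col_bin_data holds (bin_min, bin_max, num_bins, max_rows_per_bin), as provided by the configuration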
        bin_min = col_bin_data[0]
        bin_max = col_bin_data[1]
        num_bins = col_bin_data[2]
        max_bins[bin_col] = col_bin_data[3]
        # num_bins was already validated to be at least 1, so this will not divide by zero
        bin_width = (bin_max - bin_min) / num_bins
        # interior bin edges for np.searchsorted (unlike np.histogram, which takes the full edge list)
        col_bins = np.arange(bin_min + bin_width, bin_max, bin_width)
        # set up for recording assigned bin center
        bin_ctrs[bin_col] = [round_to_print(ctr) for ctr in np.arange(bin_min + bin_width/2, bin_max, bin_width)]
        bin_counts[bin_col] = [0] * len(bin_ctrs[bin_col])
        bin_arrays[bin_col] = col_bins
        bin_labels[bin_col] = '{0}_bin'.format(headers[bin_col])
        headers = [bin_labels[bin_col]] + headers
        # allow filtering based on min and max
        col_index_dict[MIN_SEC][bin_col] = bin_min
        col_index_dict[MAX_SEC][bin_col] = bin_max

    initial_row_num = len(list_vectors)
    filtered_vectors = []
    for row in list_vectors:
        keep_row = True
        for col, max_val in col_index_dict[MAX_SEC].items():
            if row[col] > max_val:
                keep_row = False
        for col, min_val in col_index_dict[MIN_SEC].items():
            if row[col] < min_val:
                keep_row = False
        if keep_row:
            for col_id, col_bins in bin_arrays.items():
                bin_index = np.searchsorted(col_bins, row[col_id])
                row = [bin_ctrs[col_id][bin_index]] + row
                bin_counts[col_id][bin_index] += 1
            filtered_vectors.append(row)
    print("Keeping {} of {} rows based on filtering criteria".format(len(filtered_vectors), initial_row_num))

    # Print output and determine if the output needs to be adjusted because of a max number of entries per bin
    ctr_format = "{:^11} {:^8}"
    ctr_format_max = "{:^11} {:^8} {:^7}"
    excess_bins = {}
    for col_bin in bin_arrays:
        print("Histogram data for column '{}': ".format(bin_labels[col_bin]))
        if max_bins[col_bin] is None:
            print(ctr_format.format('bin_ctr', 'count'))
            for bin_index, bin_ctr in enumerate(bin_ctrs[col_bin]):
                print(ctr_format.format(bin_ctr, bin_counts[col_bin][bin_index]))
        else:
            bin_max = max_bins[col_bin]
            excess_bins[col_bin] = {}
            print(ctr_format_max.format('bin_ctr', 'found', 'keep'))
            for bin_index, bin_ctr in enumerate(bin_ctrs[col_bin]):
                num_found = bin_counts[col_bin][bin_index]
                if num_found > bin_max:
                    num_keep = bin_max
                    # use bin_ctr as the key because that is what is saved on the row; store the integer
                    #   quotient (// keeps integer division under python 3) and remainder used below to
                    #   decide which excess rows to drop
                    excess_bins[col_bin][bin_ctrs[col_bin][bin_index]] = {QUOT: num_found // bin_max,
                                                                          MOD: num_found % bin_max}
                else:
                    num_keep = num_found
                print(ctr_format_max.format(bin_ctr, num_found, num_keep))

    if len(excess_bins) == 1:
        count_bin = {}
        delete_rows = set()
        mod_r = {}
        quot_r = {}
        for col_bin in excess_bins:
            for bin_remove, bin_dict in excess_bins[col_bin].items():
                mod_r[bin_remove] = bin_dict[MOD]
                quot_r[bin_remove] = bin_dict[QUOT]
                count_bin[bin_remove] = 0
            for row_id, row in enumerate(filtered_vectors):
                bin_name = row[0]
                if bin_name in excess_bins[col_bin]:
                    count_bin[bin_name] += 1
                    # keep a row only when its occurrence count is a multiple of the quotient and past
                    #   the remainder; this leaves exactly the max number of rows in each excess bin
                    if count_bin[bin_name] % quot_r[bin_name] != 0 or count_bin[bin_name] <= mod_r[bin_name]:
                        delete_rows.add(row_id)
            filtered_vectors = [row for row_id, row in enumerate(filtered_vectors) if row_id not in delete_rows]
    if len(excess_bins) > 1:
        warning("No filtering based on a max number of entries will be done; this feature is currently implemented "
                "only for binning with one column's values.")

    f_name = create_out_fname(data_file, prefix='filtered_', ext='.csv')
    list_to_csv([headers] + filtered_vectors, f_name, delimiter=',')
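The bin setup above uses interior edges for np.searchsorted rather than the full edge list that
np.histogram expects. A minimal sketch with made-up bounds and data, showing how each value is
assigned the center of its bin and counted:

import numpy as np

# hypothetical binning spec: 4 bins over [0.0, 2.0)
bin_min, bin_max, num_bins = 0.0, 2.0, 4
bin_width = (bin_max - bin_min) / num_bins

# interior edges only: np.searchsorted returns 0 for values below the first edge,
#   so num_bins - 1 edges map values onto num_bins bins
edges = np.arange(bin_min + bin_width, bin_max, bin_width)        # [0.5, 1.0, 1.5]
centers = np.arange(bin_min + bin_width / 2, bin_max, bin_width)  # [0.25, 0.75, 1.25, 1.75]

counts = [0] * len(centers)
for val in [0.1, 0.6, 0.7, 1.9]:
    idx = np.searchsorted(edges, val)  # index of the bin containing val
    counts[idx] += 1
    print("{} -> bin centered at {}".format(val, centers[idx]))
print(counts)  # [1, 2, 0, 1]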