Example #1
0
def process_file(file_to_process, cfg):
    """
    Builds a single output column by joining, for every row read, the configured prefix, the value from
    the first configured column, the configured middle string, the value from the second configured
    column, and the configured suffix. The joined strings are written, one per row, to cfg[OUT_FILE].
    @param file_to_process: the file with column to be combined
    @param cfg: the configuration of this run
    @return: nothing; raises InvalidDataError if a configured column header is not in the file
    """
    # map each column header to its requested type conversion, skipping columns without one
    requested_conv = {cfg[col_key]: cfg[conv_key]
                      for col_key, conv_key in ((COL1, COL1_CONV), (COL2, COL2_CONV))
                      if cfg[conv_key] is not None}
    # the reader is handed None (not an empty dict) when no conversion was specified
    conv_dict = requested_conv if requested_conv else None

    raw_col_data = read_csv(file_to_process, data_conv=conv_dict, quote_style=csv.QUOTE_NONNUMERIC)

    # both configured headers must be present; they are checked against the first row read
    for wanted in (cfg[COL1], cfg[COL2]):
        if wanted in raw_col_data[0]:
            continue
        raise InvalidDataError("Specified column header '{}' was not found in file: {}"
                               "".format(wanted, file_to_process))

    combined = []
    for entry in raw_col_data:
        pieces = (cfg[PREFIX], entry[cfg[COL1]], cfg[MIDDLE], entry[cfg[COL2]], cfg[SUFFIX])
        # each output row is a one-item list holding the joined string
        combined.append(["".join(str(piece) for piece in pieces)])

    list_to_csv(combined, cfg[OUT_FILE], delimiter=',', quote_style=csv.QUOTE_MINIMAL)
Example #2
0
def process_file(file_to_process, cfg):
    """
    Joins two columns of the given file, row by row, into one string per row (with the configured
    prefix, middle, and suffix strings around and between the two values) and writes those strings
    to the output file named in the configuration
    @param file_to_process: the file with column to be combined
    @param cfg: the configuration of this run
    @return: nothing; raises InvalidDataError if a configured column header is not in the file
    """
    # conv_dict stays None unless at least one column has a type conversion specified
    conv_dict = None
    for col_key, conv_key in ((COL1, COL1_CONV), (COL2, COL2_CONV)):
        if cfg[conv_key] is None:
            continue
        if conv_dict is None:
            conv_dict = {}
        conv_dict[cfg[col_key]] = cfg[conv_key]

    raw_col_data = read_csv(file_to_process, data_conv=conv_dict, quote_style=csv.QUOTE_NONNUMERIC)

    required_headers = (cfg[COL1], cfg[COL2])
    for header in required_headers:
        # the first row read is used to determine which headers are available
        if header not in raw_col_data[0]:
            raise InvalidDataError("Specified column header '{}' was not found in file: {}"
                                   "".format(header, file_to_process))

    # one single-item row per input row
    to_print = [
        ["".join(map(str, (cfg[PREFIX], row[cfg[COL1]], cfg[MIDDLE], row[cfg[COL2]], cfg[SUFFIX])))]
        for row in raw_col_data
    ]

    list_to_csv(to_print, cfg[OUT_FILE], delimiter=',', quote_style=csv.QUOTE_MINIMAL)
Example #3
0
def process_file(data_file, mcfg, delimiter=','):
    """
    Reads a delimited file that has a header row and limits the values in the configured columns:
    a value above its column's configured maximum is replaced by that maximum, and a value below its
    column's configured minimum is replaced by that minimum. All rows (with the header row) are then
    written to a comma-delimited file whose name is created from data_file with a '.csv' extension.
    @param data_file: the file to read
    @param mcfg: configuration; for each section in SUB_SECTIONS, a dict of column header to value
    @param delimiter: the delimiter used when reading data_file
    @return: nothing; raises InvalidDataError if a configured key is not a header in data_file
    """
    list_vectors, headers = read_csv_to_list(data_file, delimiter=delimiter, header=True)

    # translate the configured header names to column indices, section by section
    col_index_dict = {}
    for section in SUB_SECTIONS:
        limit_by_col = {}
        col_index_dict[section] = limit_by_col
        for key, val in mcfg[section].items():
            if key not in headers:
                raise InvalidDataError(
                    "Key '{}' found in configuration file but not in data file: "
                    "{}".format(key, data_file))
            # Parser already made sure that unique entries
            limit_by_col[headers.index(key)] = val

    edited_vectors = []
    for row in list_vectors:
        # rows are edited in place; the maximum is applied before the minimum
        for col, ceiling in col_index_dict[MAX_SEC].items():
            if row[col] > ceiling:
                row[col] = ceiling
        for col, floor in col_index_dict[MIN_SEC].items():
            if row[col] < floor:
                row[col] = floor
        edited_vectors.append(row)

    out_name = create_out_fname(data_file, ext='.csv')
    list_to_csv([headers] + edited_vectors, out_name, delimiter=',')
Example #4
0
def print_gofr(cfg, gofr_data):
    """
    Normalizes the accumulated bin counts for each requested pair type and writes them to a csv file,
    with the distance for each bin in the first column.
    Each count is divided by (distance squared * number of steps counted * 4 * pi * bin width).
    @param cfg: the configuration of this run (which pair types to report, bin width, output location)
    @param gofr_data: dict holding the bins, the bin counts, and the number of steps counted per pair type
    @return: nothing; output is written to a file
    """
    g_dr = cfg[GOFR_DR]
    # every bin value after the first, shifted down by half of the bin width
    dr_array = gofr_data[GOFR_BINS][1:] - g_dr / 2
    gofr_out_fieldnames = [GOFR_R]
    gofr_output = dr_array

    # pair types that are always reported when requested: (cfg flag, steps key, bin count key, field name)
    pair_specs = (
        (CALC_HO_GOFR, HO_STEPS_COUNTED, HO_BIN_COUNT, GOFR_HO),
        (CALC_OO_GOFR, OO_STEPS_COUNTED, OO_BIN_COUNT, GOFR_OO),
        (CALC_HH_GOFR, HH_STEPS_COUNTED, HH_BIN_COUNT, GOFR_HH),
        (CALC_OH_GOFR, OH_STEPS_COUNTED, OH_BIN_COUNT, GOFR_OH),
    )
    for calc_key, steps_key, count_key, field_name in pair_specs:
        if not cfg[calc_key]:
            continue
        normal_fac = np.square(dr_array) * gofr_data[steps_key] * 4 * np.pi * g_dr
        pair_gofr = np.divide(gofr_data[count_key], normal_fac)
        gofr_out_fieldnames.append(field_name)
        gofr_output = np.column_stack((gofr_output, pair_gofr))

    # the type-based pairs are only reported if they were found in at least one step
    if cfg[CALC_TYPE_GOFR]:
        if gofr_data[TYPE_STEPS_COUNTED] > 0:
            normal_fac = np.square(dr_array) * gofr_data[TYPE_STEPS_COUNTED] * 4 * np.pi * g_dr
            gofr_type = np.divide(gofr_data[TYPE_BIN_COUNT], normal_fac)
            gofr_out_fieldnames.append(GOFR_TYPE)
            gofr_output = np.column_stack((gofr_output, gofr_type))
        else:
            warning("Did not find any timesteps with the pairs in {}. "
                    "This output will not be printed.".format(CALC_TYPE_GOFR))

    f_out = create_out_fname(cfg[DUMP_FILE_LIST], suffix='_gofrs', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
    # am not using the dict writer because the gofr output is a np.array
    out_rows = [gofr_out_fieldnames] + gofr_output.tolist()
    list_to_csv(out_rows, f_out, print_message=cfg[PRINT_PROGRESS], round_digits=ROUND_DIGITS)
Example #5
0
def print_gofr(cfg, gofr_data):
    """
    Writes the normalized bin counts for every requested pair type to a csv file; the first column
    holds the distance assigned to each bin.
    @param cfg: the configuration of this run (which pair types to report, bin width, output location)
    @param gofr_data: dict holding the bins, the bin counts, and the number of steps counted per pair type
    @return: nothing; output is written to a file
    """
    g_dr = cfg[GOFR_DR]
    # every bin value after the first, shifted down by half of the bin width
    dr_array = gofr_data[GOFR_BINS][1:] - g_dr / 2

    def normalized(steps_key, count_key):
        # counts divided by (distance squared * steps counted * 4 * pi * bin width)
        normal_fac = np.square(dr_array) * gofr_data[steps_key] * 4 * np.pi * g_dr
        return np.divide(gofr_data[count_key], normal_fac)

    gofr_out_fieldnames = [GOFR_R]
    gofr_output = dr_array

    if cfg[CALC_HO_GOFR]:
        gofr_output = np.column_stack((gofr_output, normalized(HO_STEPS_COUNTED, HO_BIN_COUNT)))
        gofr_out_fieldnames.append(GOFR_HO)
    if cfg[CALC_OO_GOFR]:
        gofr_output = np.column_stack((gofr_output, normalized(OO_STEPS_COUNTED, OO_BIN_COUNT)))
        gofr_out_fieldnames.append(GOFR_OO)
    if cfg[CALC_HH_GOFR]:
        gofr_output = np.column_stack((gofr_output, normalized(HH_STEPS_COUNTED, HH_BIN_COUNT)))
        gofr_out_fieldnames.append(GOFR_HH)
    if cfg[CALC_OH_GOFR]:
        gofr_output = np.column_stack((gofr_output, normalized(OH_STEPS_COUNTED, OH_BIN_COUNT)))
        gofr_out_fieldnames.append(GOFR_OH)
    if cfg[CALC_TYPE_GOFR]:
        # this column is skipped (with a warning) if no step contained the requested pairs
        if gofr_data[TYPE_STEPS_COUNTED] > 0:
            gofr_output = np.column_stack((gofr_output, normalized(TYPE_STEPS_COUNTED, TYPE_BIN_COUNT)))
            gofr_out_fieldnames.append(GOFR_TYPE)
        else:
            warning(
                "Did not find any timesteps with the pairs in {}. "
                "This output will not be printed.".format(CALC_TYPE_GOFR)
            )

    f_out = create_out_fname(cfg[DUMP_FILE_LIST], suffix="_gofrs", ext=".csv", base_dir=cfg[OUT_BASE_DIR])
    out_rows = [gofr_out_fieldnames] + gofr_output.tolist()
    list_to_csv(out_rows, f_out)
Example #6
0
def create_hists(data_file, header_row, hist_data, out_dir):
    """
    Creates a histogram plot for each column in hist_data and writes the counts returned for all
    columns, side by side, to a single csv file.
    @param data_file: the data file; passed to the plot function and used to name the counts file
    @param header_row: column headers, indexed by the keys of hist_data
    @param hist_data: the data to be plotted, keyed by column
    @param out_dir: base directory for the output
    @return: nothing; output is written to files
    """
    counts_to_print = []
    if len(hist_data) > 0:
        for col in hist_data:
            new_counts = create_hist_plot(hist_data[col], header_row[col], out_dir, data_file)

            # the first set of counts found is used as is
            if len(counts_to_print) == 0:
                counts_to_print = new_counts
                continue

            n_old, n_new = len(counts_to_print), len(new_counts)
            old_width = len(counts_to_print[0])
            new_width = len(new_counts[0])
            merged = []
            # place the new columns to the right of the old; whichever side runs out of rows
            # first is padded with empty strings
            for row in range(max(n_old, n_new)):
                left = counts_to_print[row] if row < n_old else [""] * old_width
                right = new_counts[row] if row < n_new else [""] * new_width
                merged.append(left + right)
            counts_to_print = copy.deepcopy(merged)
    f_name = create_out_fname(data_file, prefix="counts_", ext=".csv", base_dir=out_dir)
    list_to_csv(counts_to_print, f_name, delimiter=",")
Example #7
0
def create_hists(data_file, header_row, hist_data, out_dir):
    """
    Makes a histogram plot for every column in hist_data, then writes one csv file in which the
    counts returned for each column are placed next to each other.
    @param data_file: the data file; passed to the plot function and used to name the counts file
    @param header_row: column headers, indexed by the keys of hist_data
    @param hist_data: the data to be plotted, keyed by column
    @param out_dir: base directory for the output
    @return: nothing; output is written to files
    """
    counts_to_print = []
    if len(hist_data) > 0:
        for col in hist_data:
            count_to_print = create_hist_plot(hist_data[col], header_row[col], out_dir, data_file)

            if len(counts_to_print) == 0:
                # nothing to combine with yet
                counts_to_print = count_to_print
            else:
                len1 = len(counts_to_print)
                len2 = len(count_to_print)
                width1 = len(counts_to_print[0])
                width2 = len(count_to_print[0])
                # rows that both lists have
                combined_list = [counts_to_print[idx] + count_to_print[idx] for idx in range(min(len1, len2))]
                # rows only in the existing list: blank cells fill the new columns
                combined_list.extend(counts_to_print[idx] + [""] * width2 for idx in range(len2, len1))
                # rows only in the new list: blank cells fill the existing columns
                combined_list.extend([""] * width1 + count_to_print[idx] for idx in range(len1, len2))
                counts_to_print = copy.deepcopy(combined_list)
    f_name = create_out_fname(data_file, prefix='counts_', ext='.csv', base_dir=out_dir)
    list_to_csv(counts_to_print, f_name, delimiter=',')
Example #8
0
def process_file(data_file, out_dir, len_buffer, delimiter, min_max_dict, header=False, make_hist=False):
    """
    Calculates per-column statistics (min, max, average, sample standard deviation, and percentiles)
    for the numeric data in a file, prints them to standard out when there are fewer than 12 columns,
    and writes them to a csv file. Optionally compares the statistics to initial/lower/upper values
    and creates histograms.

    @param data_file: the file of numeric data to summarize
    @param out_dir: base directory for the output files
    @param len_buffer: if not None, a value added to each column's maximum and reported as an extra row
    @param delimiter: the delimiter separating columns in data_file
    @param min_max_dict: None, or a sequence of three dicts keyed by column header:
        [0] initial values, [1] lower bounds, and [2] upper bounds
    @param header: whether data_file has a header row
    @param make_hist: whether to gather histogram data and create histograms
    @return: nothing; an InvalidDataError raised while reading the file is re-raised with a hint
        about the program options appended
    """
    try:
        dim_vectors, header_row, hist_data = np_float_array_from_file(
            data_file, delimiter=delimiter, header=header, gather_hist=make_hist
        )

    except InvalidDataError as e:
        raise InvalidDataError(
            "{}\n"
            "Run program with '-h' to see options, such as specifying header row (-n) "
            "and/or delimiter (-d)".format(e)
        )

    if header:
        # the blank first cell leaves room for the row labels added below
        to_print = [[""] + header_row]
    else:
        to_print = []

    max_vector = dim_vectors.max(axis=0)
    min_vector = dim_vectors.min(axis=0)
    avg_vector = dim_vectors.mean(axis=0)
    med_vector = np.percentile(dim_vectors, 50, axis=0)

    # noinspection PyTypeChecker
    to_print += [
        ["Min values:"] + min_vector.tolist(),
        ["Max values:"] + max_vector.tolist(),
        ["Avg values:"] + avg_vector.tolist(),
        ["Std dev:"] + dim_vectors.std(axis=0, ddof=1).tolist(),
        ["5% percentile:"] + np.percentile(dim_vectors, 4.55, axis=0).tolist(),
        ["32% percentile:"] + np.percentile(dim_vectors, 31.73, axis=0).tolist(),
        ["50% percentile:"] + med_vector.tolist(),
        ["68% percentile:"] + np.percentile(dim_vectors, 68.27, axis=0).tolist(),
        ["95% percentile:"] + np.percentile(dim_vectors, 95.45, axis=0).tolist(),
    ]
    if len_buffer is not None:
        to_print.append(["Max plus {} buffer:".format(len_buffer)] + (max_vector + len_buffer).tolist())

    if min_max_dict is not None:
        # columns without an entry in min_max_dict keep nan in the comparison rows
        nan_list = [np.nan] * len(header_row)
        avg_ini_diff = ["Avg % Diff:"] + nan_list
        med_ini_diff = ["Med % Diff:"] + nan_list
        med_is_min = ["Median is Min:"] + nan_list
        med_is_max = ["Median is Max:"] + nan_list
        # NOTE(review): to_print[0] is the header row only when header is True; confirm that
        # min_max_dict is never supplied without a header row.
        # The loop variable must not be named 'header': that would replace the 'header' flag that is
        # needed below to choose the print format of the first row.
        for col_num, col_header in enumerate(to_print[0]):
            if col_header in min_max_dict[0]:
                ini_val = min_max_dict[0][col_header]
                low_val = min_max_dict[1][col_header]
                upp_val = min_max_dict[2][col_header]
                # col_num is offset by one from the data columns because of the row-label cell
                avg_val = avg_vector[col_num - 1]
                med_val = med_vector[col_num - 1]
                min_val = min_vector[col_num - 1]
                max_val = max_vector[col_num - 1]
                # relative tolerances, but never smaller than TOL itself
                min_tol = max(TOL * max(abs(min_val), abs(low_val)), TOL)
                med_tol = max(TOL * abs(med_val), TOL)
                max_tol = max(TOL * max(abs(max_val), abs(upp_val)), TOL)
                if (low_val - min_val) > min_tol:
                    warning(
                        "Minimum value found for header '{}' ({}) is less than lower bound ({})"
                        "".format(col_header, min_val, low_val)
                    )
                if (max_val - upp_val) > max_tol:
                    warning(
                        "Maximum value found for header '{}' ({}) is greater than upper bound ({})"
                        "".format(col_header, max_val, upp_val)
                    )
                avg_ini_diff[col_num] = (avg_val - ini_val) / ini_val * 100
                med_ini_diff[col_num] = (med_val - ini_val) / ini_val * 100
                # 1 flags a median that matches the bound (within tolerance); 0 otherwise
                if abs(med_val - low_val) > med_tol:
                    med_is_min[col_num] = 0
                else:
                    med_is_min[col_num] = 1
                if abs(med_val - upp_val) > med_tol:
                    med_is_max[col_num] = 0
                else:
                    med_is_max[col_num] = 1
        for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            to_print.append(min_max_list)

    # Printing to standard out: do not print quotes around strings because using csv writer
    if len(dim_vectors[0]) < 12:
        for index, row in enumerate(to_print):
            # formatting for header
            if index == 0 and header:
                print("{:>20s} {}".format(row[0], " ".join(["{:>16s}".format(x.strip()) for x in row[1:]])))
            # formatting for vals
            else:
                print("{:>20s} {}".format(row[0], " ".join(["{:16.6f}".format(x) for x in row[1:]])))

    f_name = create_out_fname(data_file, prefix="stats_", ext=".csv", base_dir=out_dir)
    list_to_csv(to_print, f_name)

    if make_hist:
        create_hists(data_file, header_row, hist_data, out_dir)
Example #9
0
def process_file(data_file,  mcfg, delimiter=','):
    """
    Filters the rows of a delimited data file (which must have a header row) and writes the rows kept
    to a comma-delimited file whose name is created from data_file with the prefix 'filtered_'.

    Rows are dropped if a value is above a configured maximum or below a configured minimum. For each
    column with binning configured, the center of the bin to which the row's value belongs is added as a
    new first column (named '<header>_bin'), and the number of rows per bin is printed. If exactly one
    binned column has a maximum number of rows per bin, rows in over-full bins are thinned so that
    only that maximum number is kept.

    @param data_file: the file to read
    @param mcfg: configuration; for each section in SUB_SECTIONS, a dict keyed by column header. Values in
        the MAX_SEC and MIN_SEC sections are limits; values in the BIN_SEC section are sequences of
        (bin min, bin max, number of bins, max rows per bin or None)
    @param delimiter: the delimiter used when reading data_file
    @return: nothing; raises InvalidDataError if a configured key is not a header in data_file
    """
    list_vectors, headers = read_csv_to_list(data_file, delimiter=delimiter, header=True)

    col_index_dict = {}
    for section in SUB_SECTIONS:
        col_index_dict[section] = {}
        for key, val in mcfg[section].items():
            if key in headers:
                # Parser already made sure that unique entries
                col_index_dict[section][headers.index(key)] = val
            else:
                raise InvalidDataError("Key '{}' found in configuration file but not in data file: "
                                       "{}".format(key, data_file))

    # set up bins, if needed
    bin_arrays = {}
    bin_labels = {}
    bin_counts = {}
    bin_ctrs = {}
    max_bins = {}
    # The column indices refer to the layout of the data file, so 'headers' is left untouched and the
    # bin columns are prepended to a separate list of output headers. (Prepending to 'headers' itself
    # would make the next binned column's index point at the wrong header.)
    out_headers = headers
    for bin_col, col_bin_data in col_index_dict[BIN_SEC].items():
        bin_min = col_bin_data[0]
        bin_max = col_bin_data[1]
        num_bins = col_bin_data[2]
        max_bins[bin_col] = col_bin_data[3]
        # already checked that 1 or more bins, so will not divide by zero
        bin_width = (bin_max - bin_min) / num_bins
        # set up for np.searchsorted, not np.histogram
        col_bins = np.arange(bin_min + bin_width, bin_max, bin_width)
        # set up for recording assigned bin center
        bin_ctrs[bin_col] = [round_to_print(ctr) for ctr in np.arange(bin_min + bin_width/2, bin_max, bin_width)]
        bin_counts[bin_col] = [0] * len(bin_ctrs[bin_col])
        bin_arrays[bin_col] = col_bins
        bin_labels[bin_col] = '{0}_bin'.format(headers[bin_col])
        out_headers = [bin_labels[bin_col]] + out_headers
        # allow filtering based on min and max
        col_index_dict[MIN_SEC][bin_col] = bin_min
        col_index_dict[MAX_SEC][bin_col] = bin_max

    # Each bin column is prepended in turn, so the first binned column ends up furthest to the right
    # among the added columns; record where each one lands in the output rows
    num_bin_cols = len(bin_arrays)
    bin_positions = {col_id: num_bin_cols - 1 - order for order, col_id in enumerate(bin_arrays)}

    initial_row_num = len(list_vectors)
    filtered_vectors = []
    for row in list_vectors:
        keep_row = True
        for col, max_val in col_index_dict[MAX_SEC].items():
            if row[col] > max_val:
                keep_row = False
        for col, min_val in col_index_dict[MIN_SEC].items():
            if row[col] < min_val:
                keep_row = False
        if keep_row:
            binned_row = row
            for col_id, col_bins in bin_arrays.items():
                # look up the value in the row as read, not in the row with bin columns already added
                bin_index = np.searchsorted(col_bins, row[col_id])
                binned_row = [bin_ctrs[col_id][bin_index]] + binned_row
                bin_counts[col_id][bin_index] += 1
            filtered_vectors.append(binned_row)
    print("Keeping {} of {} rows based on filtering criteria".format(len(filtered_vectors), initial_row_num))

    # Print output and determine if the output needs to be adjusted because of a max number of entries per bin
    ctr_format = "{:^11} {:^8}"
    ctr_format_max = "{:^11} {:^8} {:^7}"
    excess_bins = {}
    for col_bin in bin_arrays:
        print("Histogram data for column '{}': ".format(bin_labels[col_bin]))
        if max_bins[col_bin] is None:
            print(ctr_format.format('bin_ctr', 'count'))
            for bin_index, bin_ctr in enumerate(bin_ctrs[col_bin]):
                print(ctr_format.format(bin_ctr, bin_counts[col_bin][bin_index]))
        else:
            bin_max = max_bins[col_bin]
            excess_bins[col_bin] = {}
            print(ctr_format_max.format('bin_ctr', 'found', 'keep'))
            for bin_index, bin_ctr in enumerate(bin_ctrs[col_bin]):
                num_found = bin_counts[col_bin][bin_index]
                if num_found > bin_max:
                    num_keep = bin_max
                    # The thinning below needs the whole-number quotient and the remainder; true
                    # division would give a fractional quotient and almost every row would be dropped
                    quotient, remainder = divmod(num_found, bin_max)
                    # use bin_ctr as key because that is what is saved on the row
                    excess_bins[col_bin][bin_ctr] = {QUOT: quotient, MOD: remainder}
                else:
                    num_keep = num_found
                print(ctr_format_max.format(bin_ctr, num_found, num_keep))

    if len(excess_bins) == 1:
        for col_bin, bins_over_max in excess_bins.items():
            ctr_position = bin_positions[col_bin]
            seen_in_bin = {bin_ctr: 0 for bin_ctr in bins_over_max}
            # a set, so the membership test used when rebuilding the list of rows is fast
            delete_rows = set()
            for row_id, row in enumerate(filtered_vectors):
                bin_name = row[ctr_position]
                if bin_name in bins_over_max:
                    seen_in_bin[bin_name] += 1
                    seen = seen_in_bin[bin_name]
                    # keep every QUOT-th row of the bin, after skipping the first MOD rows; this
                    # leaves exactly the maximum number of rows in the bin
                    if seen % bins_over_max[bin_name][QUOT] != 0 or seen <= bins_over_max[bin_name][MOD]:
                        delete_rows.add(row_id)
            filtered_vectors = [row for row_id, row in enumerate(filtered_vectors) if row_id not in delete_rows]
    if len(excess_bins) > 1:
        warning("No filtering based on a max number of entries will be done; this feature is currently implemented "
                "only for binning with one column's values.")

    f_name = create_out_fname(data_file, prefix='filtered_', ext='.csv')
    list_to_csv([out_headers] + filtered_vectors, f_name, delimiter=',')
Example #10
0
def process_pdb(cfg, atom_num_dict, mol_num_dict, element_dict):
    """
    Reads the PDB file named in the configuration, edits its 'ATOM  ' lines as configured, and writes a
    new PDB file. Depending on the configuration, atoms are renumbered and reordered, molecule numbers
    are changed and/or renumbered sequentially, a range of atoms is checked for water atom ordering
    (OH2, H1, H2), and element types are filled in. If any residues are listed under RESID_QMMM, files
    listing the atoms of those residues are also written.

    @param cfg: the configuration of this run, including the last character position of each PDB field
    @param atom_num_dict: dict of atom count (position in the order read) to new atom id; atoms not
        in the dict keep their count as their id
    @param mol_num_dict: dict of molecule number as read to the molecule number to use instead
    @param element_dict: dict of atom type (exactly as sliced from the line) to element
    @return: nothing; raises InvalidDataError if an atom in a RESID_QMMM residue has an atom type
        that is not in element_dict
    """
    pdb_loc = cfg[PDB_FILE]
    pdb_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    # to allow warning to be printed once and only once
    missing_types = []
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []

    with open(pdb_loc) as f:
        wat_count = 0
        atom_count = 0
        mol_count = 1

        current_mol = None
        last_mol_num = None
        atoms_content = []
        # integer atom ids, kept parallel to atoms_content so atoms can be sorted numerically
        atom_ids = []

        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]]
            # head_content to contain the 'REMARK' and 'CRYST1' lines
            if line_head == 'REMARK' or line_head == 'CRYST1':
                pdb_data[HEAD_CONTENT].append(line)

            # atoms_content to contain the fields of each 'ATOM  ' line
            elif line_head == 'ATOM  ':

                # My template PDB has ***** after atom_count 99999. Thus, I'm renumbering. Otherwise, use this:
                # atom_num = line[cfg[PDB_LINE_TYPE_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]]
                # For renumbering, making sure prints in the correct format, including num of characters:
                atom_count += 1

                # For reordering atoms
                atom_id = atom_num_dict.get(atom_count, atom_count)

                if atom_id > 99999:
                    atom_num = format(atom_id, 'x')
                    if len(atom_num) > 5:
                        warning("Hex representation of {} is {}, which is greater than 5 characters. This "
                                "will affect the PDB output formatting.".format(atom_id, atom_num))
                else:
                    atom_num = '{:5d}'.format(atom_id)

                atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]]
                res_type = line[cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]]
                mol_num = int(line[cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                occ_t = line[cfg[PDB_Z_LAST_CHAR]:cfg[PDB_LAST_T_CHAR]]
                element = line[cfg[PDB_LAST_T_CHAR]:cfg[PDB_LAST_ELEM_CHAR]]
                last_cols = line[cfg[PDB_LAST_ELEM_CHAR]:]

                # For user-specified changing of molecule number
                mol_num = mol_num_dict.get(mol_num, mol_num)

                # If doing water molecule checking...
                if cfg[FIRST_WAT_ID] <= atom_count <= cfg[LAST_WAT_ID]:
                    # atoms in the water range are expected in repeating groups of three: OH2, H1, H2
                    if (wat_count % 3) == 0:
                        current_mol = mol_num
                        if atom_type != '  OH2 ':
                            warning('Expected an OH2 atom to be the first atom of a water molecule. '
                                    'Check line: {}'.format(line))
                    else:
                        if current_mol != mol_num:
                            warning('Water not in order on line: {}'.format(line))
                        if (wat_count % 3) == 1:
                            if atom_type != '  H1  ':
                                warning('Expected an H1 atom to be the second atom of a water molecule. '
                                        'Check line: {}'.format(line))
                        else:
                            if atom_type != '  H2  ':
                                warning('Expected an H2 atom to be the third atom of a water molecule. '
                                        'Check line: {}'.format(line))
                    wat_count += 1

                if mol_num in cfg[RESID_QMMM] and atom_type not in SKIP_ATOM_TYPES:
                    if atom_type == C_ALPHA:
                        # the alpha carbon is only recorded for the link output
                        ca_res_atom_id_dict[mol_num] = atom_id
                    else:
                        if atom_type == C_BETA:
                            cb_res_atom_id_dict[mol_num] = atom_id
                        if atom_type in element_dict:
                            element = element_dict[atom_type]
                        else:
                            raise InvalidDataError("Did not find atom type '{}' in the element dictionary. Please "
                                                   "provide a new atom type, element dictionary (using keyword {} "
                                                   "in the configuration file) that includes all atom types in the "
                                                   "residues identified with the '{}' key."
                                                   "".format(atom_type, ELEMENT_DICT_FILE, RESID_QMMM))
                        qmmm_elem_id_dict.setdefault(element, []).append(atom_id)
                        # one less than the atom id is saved for the VMD list
                        atoms_for_vmd.append(atom_id - 1)

                if cfg[ADD_ELEMENTS] and atom_count <= cfg[LAST_ADD_ELEM]:
                    if atom_type in element_dict:
                        element = element_dict[atom_type]
                    else:
                        if atom_type not in missing_types:
                            warning("Please add atom type '{}' to dictionary of elements. Will not write/overwrite "
                                    "element type in the pdb output.".format(atom_type))
                            missing_types.append(atom_type)

                # For numbering molecules from 1 to end
                if cfg[RENUM_MOL]:
                    if last_mol_num is None:
                        last_mol_num = mol_num

                    # a change in the molecule number read marks the start of the next molecule
                    if mol_num != last_mol_num:
                        last_mol_num = mol_num
                        mol_count += 1
                        if mol_count == 10000:
                            warning("Molecule numbers greater than 9999 will be printed in hex")

                    # Due to PDB format constraints, need to print in hex starting at 9999 molecules.
                    if mol_count > 9999:
                        mol_num = format(mol_count, 'x')
                        if len(mol_num) > 4:
                            warning("Hex representation of {} is {}, which is greater than 4 characters. This "
                                    "will affect the PDB output formatting.".format(mol_count, mol_num))
                    else:
                        mol_num = '{:4d}'.format(mol_count)

                line_struct = [line_head, atom_num, atom_type, res_type, mol_num, pdb_x, pdb_y, pdb_z,
                               occ_t, element, last_cols]
                atoms_content.append(line_struct)
                atom_ids.append(atom_id)

            # tail_content to contain every other (non-blank) line
            else:
                pdb_data[TAIL_CONTENT].append(line)

    # Only sort if there is renumbering
    if len(atom_num_dict) > 0:
        # Sort on the integer id rather than on the formatted atom number: the formatted value is a
        # string, and ids above 99999 are written in hex, which does not sort in numeric order
        id_entry_pairs = sorted(zip(atom_ids, atoms_content), key=lambda pair: pair[0])
        pdb_data[ATOMS_CONTENT] = [entry for _, entry in id_entry_pairs]
    else:
        pdb_data[ATOMS_CONTENT] = atoms_content

    if cfg[PDB_NEW_FILE] is None:
        f_name = create_out_fname(cfg[PDB_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR])
    else:
        f_name = create_out_fname(cfg[PDB_NEW_FILE], base_dir=cfg[OUT_BASE_DIR])
    print_pdb(pdb_data[HEAD_CONTENT], pdb_data[ATOMS_CONTENT], pdb_data[TAIL_CONTENT],
              f_name, cfg[PDB_FORMAT])

    if len(cfg[RESID_QMMM]) > 0:
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        # the first write uses mode 'w'; every later write to the same file uses mode 'a'
        print_mode = "w"
        for elem in qmmm_elem_id_dict:
            print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode)
        f_name = create_out_fname('vmd_protein_atoms.dat', base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
Example #11
0
def process_pdb(cfg, atom_num_dict, mol_num_dict, element_dict):
    """
    Reads the PDB file named by cfg[PDB_FILE], renumbers atoms (always) and molecules (optionally), optionally
    fills in element types, and writes the edited PDB. When QMMM residues are listed in the configuration,
    also writes the 'amino_id.dat' and 'vmd_protein_atoms.dat' helper files.
    @param cfg: the configuration of this run
    @param atom_num_dict: dict of atom position in the file (1-based count of ATOM lines) to the new atom number;
                          if not empty, the atoms are also sorted by their new number
    @param mol_num_dict: dict of molecule number read from the file to the molecule number to use instead
    @param element_dict: dict of atom type (the fixed-width column text, including padding) to element
    @return: nothing; results are written to files
    @raise InvalidDataError: if an atom type in a QMMM residue is not found in element_dict
    """
    pdb_loc = cfg[PDB_FILE]
    pdb_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    # to allow warning to be printed once and only once
    missing_types = []
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []
    atoms_content = []
    # Numeric atom ids, parallel to atoms_content. The printed atom number is a string (decimal, or hex above
    # 99999), which does not sort in numeric order, so the numeric id is kept for sorting.
    atom_ids = []

    with open(pdb_loc) as f:
        wat_count = 0
        atom_count = 0
        mol_count = 1

        current_mol = None
        last_mol_num = None

        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]]
            # head_content to contain Everything before 'Atoms' section
            if line_head == 'REMARK' or line_head == 'CRYST1':
                pdb_data[HEAD_CONTENT].append(line)

            # atoms_content to contain everything but the xyz
            elif line_head == 'ATOM  ':

                # The template PDB may have ***** after atom 99999, so the atom number column is not read;
                # atoms are renumbered by their position in the file instead.
                atom_count += 1

                # For reordering atoms
                if atom_count in atom_num_dict:
                    atom_id = atom_num_dict[atom_count]
                else:
                    atom_id = atom_count

                # Only 5 characters are available for the atom number, so switch to hex above 99999
                if atom_id > 99999:
                    atom_num = format(atom_id, 'x')
                    if len(atom_num) > 5:
                        warning("Hex representation of {} is {}, which is greater than 5 characters. This "
                                "will affect the PDB output formatting.".format(atom_id, atom_num))
                else:
                    atom_num = '{:5d}'.format(atom_id)

                atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]]
                res_type = line[cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]]
                mol_num = int(line[cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                occ_t = line[cfg[PDB_Z_LAST_CHAR]:cfg[PDB_LAST_T_CHAR]]
                element = line[cfg[PDB_LAST_T_CHAR]:cfg[PDB_LAST_ELEM_CHAR]]
                last_cols = line[cfg[PDB_LAST_ELEM_CHAR]:]

                # For user-specified changing of molecule number
                if mol_num in mol_num_dict:
                    mol_num = mol_num_dict[mol_num]

                # If doing water molecule checking: atoms are expected in the repeating order OH2, H1, H2
                if cfg[FIRST_WAT_ID] <= atom_count <= cfg[LAST_WAT_ID]:
                    wat_pos = wat_count % 3
                    if wat_pos == 0:
                        current_mol = mol_num
                        if atom_type != '  OH2 ':
                            warning('Expected an OH2 atom to be the first atom of a water molecule. '
                                    'Check line: {}'.format(line))
                    else:
                        if current_mol != mol_num:
                            warning('Water not in order on line:', line)
                        if wat_pos == 1:
                            if atom_type != '  H1  ':
                                warning('Expected an H1 atom to be the second atom of a water molecule. '
                                        'Check line: {}'.format(line))
                        else:
                            if atom_type != '  H2  ':
                                warning('Expected an H2 atom to be the third atom of a water molecule. '
                                        'Check line: {}'.format(line))
                    wat_count += 1

                # Collect atom ids for the QMMM helper files. This check uses the molecule number before any
                # RENUM_MOL renumbering below.
                if mol_num in cfg[RESID_QMMM] and atom_type not in SKIP_ATOM_TYPES:
                    if atom_type == C_ALPHA:
                        ca_res_atom_id_dict[mol_num] = atom_id
                    else:
                        if atom_type == C_BETA:
                            cb_res_atom_id_dict[mol_num] = atom_id
                        if atom_type in element_dict:
                            element = element_dict[atom_type]
                        else:
                            raise InvalidDataError("Did not find atom type '{}' in the element dictionary. Please "
                                                   "provide a new atom type, element dictionary (using keyword {} "
                                                   "in the configuration file) that includes all atom types in the "
                                                   "residues identified with the '{}' key."
                                                   "".format(atom_type, ELEMENT_DICT_FILE, RESID_QMMM))
                        if element in qmmm_elem_id_dict:
                            qmmm_elem_id_dict[element].append(atom_id)
                        else:
                            qmmm_elem_id_dict[element] = [atom_id]
                        # this list uses base-zero atom indexes
                        atoms_for_vmd.append(atom_id - 1)

                if cfg[ADD_ELEMENTS] and atom_count <= cfg[LAST_ADD_ELEM]:
                    if atom_type in element_dict:
                        element = element_dict[atom_type]
                    else:
                        if atom_type not in missing_types:
                            warning("Please add atom type '{}' to dictionary of elements. Will not write/overwrite "
                                    "element type in the pdb output.".format(atom_type))
                            missing_types.append(atom_type)

                # For numbering molecules from 1 to end
                if cfg[RENUM_MOL]:
                    if last_mol_num is None:
                        last_mol_num = mol_num

                    if mol_num != last_mol_num:
                        last_mol_num = mol_num
                        mol_count += 1
                        if mol_count == 10000:
                            warning("Molecule numbers greater than 9999 will be printed in hex")

                    # Due to PDB format constraints, need to print in hex starting at 9999 molecules.
                    if mol_count > 9999:
                        mol_num = format(mol_count, 'x')
                        if len(mol_num) > 4:
                            warning("Hex representation of {} is {}, which is greater than 4 characters. This "
                                    "will affect the PDB output formatting.".format(mol_count, mol_num))
                    else:
                        mol_num = '{:4d}'.format(mol_count)

                line_struct = [line_head, atom_num, atom_type, res_type, mol_num, pdb_x, pdb_y, pdb_z,
                               occ_t, element, last_cols]
                atoms_content.append(line_struct)
                atom_ids.append(atom_id)

            # tail_content to contain everything after the 'Atoms' section
            else:
                pdb_data[TAIL_CONTENT].append(line)

    # Only sort if there is renumbering; sort on the numeric id (stable, so ties keep file order)
    if len(atom_num_dict) > 0:
        sorted_idx = sorted(range(len(atoms_content)), key=lambda idx: atom_ids[idx])
        pdb_data[ATOMS_CONTENT] = [atoms_content[idx] for idx in sorted_idx]
    else:
        pdb_data[ATOMS_CONTENT] = atoms_content

    if cfg[PDB_NEW_FILE] is None:
        f_name = create_out_fname(cfg[PDB_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR])
    else:
        f_name = create_out_fname(cfg[PDB_NEW_FILE], base_dir=cfg[OUT_BASE_DIR])
    print_pdb(pdb_data[HEAD_CONTENT], pdb_data[ATOMS_CONTENT], pdb_data[TAIL_CONTENT],
              f_name, cfg[PDB_FORMAT])

    if len(cfg[RESID_QMMM]) > 0:
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        # the first write creates/overwrites the file; all later writes append to it
        print_mode = "w"
        for elem in qmmm_elem_id_dict:
            print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode)
        f_name = create_out_fname('vmd_protein_atoms.dat', base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
Example #12
0
def process_file(data_file, out_dir, len_buffer, delimiter, min_max_dict, header=False, make_hist=False):
    """
    Calculates per-column statistics (min, max, average, standard deviation and percentiles) for the data in
    the given file, prints them to standard out when there are fewer than 12 columns, and saves them to a
    'stats_'-prefixed csv file.
    @param data_file: the file with the data to summarize
    @param out_dir: base directory for the output file(s)
    @param len_buffer: if not None, a value added to each column's maximum and reported as an extra row
    @param delimiter: the delimiter passed on to the file reader
    @param min_max_dict: if not None, an object indexed by 0, 1, and 2 holding dicts keyed by column header with
                         (respectively) the initial value, lower bound, and upper bound for that column
    @param header: passed on to the file reader; when true, the header row it returns is used as the first
                   output row
    @param make_hist: passed on to the file reader as gather_hist; when true, histograms are also created
    @return: nothing; results are printed and written to file
    @raise InvalidDataError: if the data file could not be read as an array of floats
    """
    try:
        dim_vectors, header_row, hist_data = np_float_array_from_file(data_file, delimiter=delimiter,
                                                                      header=header, gather_hist=make_hist)

    except InvalidDataError as e:
        raise InvalidDataError("{}\n"
                               "Run program with '-h' to see options, such as specifying header row (-n) "
                               "and/or delimiter (-d)".format(e))

    if header:
        to_print = [[''] + header_row]
    else:
        to_print = []

    max_vector = dim_vectors.max(axis=0)
    min_vector = dim_vectors.min(axis=0)
    avg_vector = dim_vectors.mean(axis=0)
    med_vector = np.percentile(dim_vectors, 50, axis=0)

    # noinspection PyTypeChecker
    to_print += [['Min values:'] + min_vector.tolist(),
                 ['Max values:'] + max_vector.tolist(),
                 ['Avg values:'] + avg_vector.tolist(),
                 ['Std dev:'] + dim_vectors.std(axis=0, ddof=1).tolist(),
                 ['5% percentile:'] + np.percentile(dim_vectors, 4.55, axis=0).tolist(),
                 ['32% percentile:'] + np.percentile(dim_vectors, 31.73, axis=0).tolist(),
                 ['50% percentile:'] + med_vector.tolist(),
                 ['68% percentile:'] + np.percentile(dim_vectors, 68.27, axis=0).tolist(),
                 ['95% percentile:'] + np.percentile(dim_vectors, 95.45, axis=0).tolist(),
                 ]
    if len_buffer is not None:
        to_print.append(['Max plus {} buffer:'.format(len_buffer)] + (max_vector + len_buffer).tolist())

    if min_max_dict is not None:
        # columns without an entry in min_max_dict keep nan in these rows
        nan_list = [np.nan] * len(header_row)
        avg_ini_diff = ['Avg % Diff:'] + nan_list
        med_ini_diff = ['Med % Diff:'] + nan_list
        med_is_min = ['Median is Min:'] + nan_list
        med_is_max = ['Median is Max:'] + nan_list
        # The loop variable must not be called 'header': that would overwrite the 'header' flag used below.
        # to_print[0] has a row label in position 0, so the data column index is col_num - 1.
        for col_num, col_name in enumerate(to_print[0]):
            if col_name in min_max_dict[0]:
                ini_val = min_max_dict[0][col_name]
                low_val = min_max_dict[1][col_name]
                upp_val = min_max_dict[2][col_name]
                avg_val = avg_vector[col_num - 1]
                med_val = med_vector[col_num - 1]
                min_val = min_vector[col_num - 1]
                max_val = max_vector[col_num - 1]
                # relative tolerances, but never smaller than TOL itself
                min_tol = max(TOL * max(abs(min_val), abs(low_val)), TOL)
                med_tol = max(TOL * abs(med_val), TOL)
                max_tol = max(TOL * max(abs(max_val), abs(upp_val)), TOL)
                if (low_val - min_val) > min_tol:
                    warning("Minimum value found for header '{}' ({}) is less than lower bound ({})"
                            "".format(col_name, min_val, low_val))
                if (max_val - upp_val) > max_tol:
                    warning("Maximum value found for header '{}' ({}) is greater than upper bound ({})"
                            "".format(col_name, max_val, upp_val))
                avg_ini_diff[col_num] = (avg_val - ini_val) / ini_val * 100
                med_ini_diff[col_num] = (med_val - ini_val) / ini_val * 100
                # flags: 1 if the median is within tolerance of the bound, otherwise 0
                med_is_min[col_num] = 0 if abs(med_val - low_val) > med_tol else 1
                med_is_max[col_num] = 0 if abs(med_val - upp_val) > med_tol else 1
        for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            to_print.append(min_max_list)

    # Printing to standard out: do not print quotes around strings because using csv writer
    if len(dim_vectors[0]) < 12:
        for index, row in enumerate(to_print):
            # formatting for header
            if index == 0 and header:
                print("{:>20s} {}".format(row[0],
                                          ' '.join(['{:>16s}'.format(x.strip()) for x in row[1:]])))
            # formatting for vals
            else:
                print("{:>20s} {}".format(row[0], ' '.join(['{:16.6f}'.format(x) for x in row[1:]])))

    f_name = create_out_fname(data_file, prefix='stats_', ext='.csv', base_dir=out_dir)
    list_to_csv(to_print, f_name)

    if make_hist:
        create_hists(data_file, header_row, hist_data, out_dir)
Example #13
0
def process_psf(cfg, atom_num_dict, mol_num_dict, element_dict, radii_dict):
    """
    Reads the PSF file named by cfg[PSF_FILE], optionally renumbers atoms and/or molecules in its atoms
    section, writes the edited PSF if anything was renumbered, and (if requested) writes helper files listing
    the QM atoms, MM atom-type radii, and base-zero atom indexes.
    @param cfg: the configuration of this run
    @param atom_num_dict: dict of atom number read from the file to the atom number to use instead; if not
                          empty, the atoms section is also sorted by the new numbers
    @param mol_num_dict: dict of residue id read from the file to the residue id to use instead
    @param element_dict: dict of atom type to element, used to group the QM atoms by element
    @param radii_dict: dict of atom type to radius, used when writing 'mm_kinds.dat'
    @return: nothing; results are written to files
    @raise InvalidDataError: if an atom type in a selected QM/QMMM residue is not found in element_dict
    """

    with open(cfg[PSF_FILE]) as f:
        psf_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
        num_atoms_pat = re.compile(r"(\d+).*NATOM$")

        num_atoms = 1
        section = SEC_HEAD

        # for printing qmmm info
        qmmm_elem_id_dict = {}
        ca_res_atom_id_dict = {}
        cb_res_atom_id_dict = {}
        atoms_for_vmd = []
        types_for_mm_kind = set()
        qmmm_charge = 0

        # for RENUM_MOL
        last_resid = None
        cur_mol_num = 0

        for line in f:
            s_line = line.strip()
            # head_content to contain Everything before 'Atoms' section
            # also capture the number of atoms
            if section == SEC_HEAD:
                psf_data[HEAD_CONTENT].append(line.rstrip())

                atoms_match = num_atoms_pat.match(s_line)
                if atoms_match:
                    # the first capture group of the pattern is the number of atoms
                    num_atoms = int(atoms_match.group(1))
                    section = SEC_ATOMS

            elif section == SEC_ATOMS:
                if len(s_line) == 0:
                    continue
                split_line = s_line.split()
                atom_num = int(split_line[0])
                segid = split_line[1]
                resid = int(split_line[2])
                resname = split_line[3]
                atom_type = split_line[4]
                charmm_type = split_line[5]
                charge = float(split_line[6])
                atom_wt = float(split_line[7])
                zero = split_line[8]

                # For reordering atoms
                if atom_num in atom_num_dict:
                    atom_num = atom_num_dict[atom_num]

                # For user-specified changing of molecule number
                if resid in mol_num_dict:
                    resid = mol_num_dict[resid]

                # For numbering molecules from 1 to end: a new number each time the residue id changes
                if cfg[RENUM_MOL]:
                    if resid != last_resid:
                        last_resid = resid
                        cur_mol_num += 1
                    resid = cur_mol_num

                atom_struct = [
                    atom_num, segid, resid, resname, atom_type, charmm_type,
                    charge, atom_wt, zero
                ]
                psf_data[ATOMS_CONTENT].append(atom_struct)

                # 'and' binds tighter than 'or': every atom of a RESID_QM residue is selected, while atoms of
                # a RESID_QMMM residue are selected only if their type is not in SKIP_ATOM_TYPES. The
                # parentheses only make that existing grouping explicit. Note the residue id used here is
                # the one after any renumbering above.
                if resid in cfg[RESID_QM] or (resid in cfg[RESID_QMMM]
                                              and atom_type not in cfg[SKIP_ATOM_TYPES]):
                    if resid in cfg[RESID_QMMM] and atom_type == C_ALPHA:
                        ca_res_atom_id_dict[resid] = atom_num
                    else:
                        if resid in cfg[RESID_QMMM] and atom_type == C_BETA:
                            cb_res_atom_id_dict[resid] = atom_num
                        if atom_type in element_dict:
                            element = element_dict[atom_type]
                        else:
                            raise InvalidDataError(
                                "Did not find atom type '{}' in the element dictionary. Please "
                                "provide a new atom type, element dictionary (using keyword {} "
                                "in the configuration file) that includes all atom types in the "
                                "residues identified with the '{}' key."
                                "".format(atom_type, ELEMENT_DICT_FILE,
                                          RESID_QMMM))
                        if element in qmmm_elem_id_dict:
                            qmmm_elem_id_dict[element].append(atom_num)
                        else:
                            qmmm_elem_id_dict[element] = [atom_num]
                        qmmm_charge += charge
                        # this list uses base-zero atom indexes
                        atoms_for_vmd.append(atom_num - 1)

                if cfg[PRINT_FOR_CP2K]:
                    types_for_mm_kind.add(atom_type)

                if len(psf_data[ATOMS_CONTENT]) == num_atoms:
                    section = SEC_TAIL
            # tail_content to contain everything after the 'Atoms' section
            elif section == SEC_TAIL:
                psf_data[TAIL_CONTENT].append(line.rstrip())

    if len(atom_num_dict) > 0:
        warning(
            "This program does not yet edit any sections other than the atoms section. "
            "If you are renumbering atoms, the bonds, angles, dihedrals, impropers, and "
            "cross-terms sections will not match.")
        psf_data[ATOMS_CONTENT] = sorted(psf_data[ATOMS_CONTENT],
                                         key=lambda entry: entry[0])

    # Only write a new psf if something could have changed
    if cfg[RENUM_MOL] or len(atom_num_dict) + len(mol_num_dict) > 0:
        if cfg[PSF_NEW_FILE] is None:
            f_name = create_out_fname(cfg[PSF_FILE],
                                      suffix="_new",
                                      base_dir=cfg[OUT_BASE_DIR])
        else:
            f_name = cfg[PSF_NEW_FILE]
        list_to_file(psf_data[HEAD_CONTENT] + psf_data[ATOMS_CONTENT] +
                     psf_data[TAIL_CONTENT],
                     f_name,
                     list_format=cfg[PSF_FORMAT])

    if cfg[PRINT_FOR_CP2K]:
        print("Total charge from QM atoms: {:.2f}".format(qmmm_charge))
        # create CP2K input listing amino atom ids
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        # the first write creates/overwrites the file; all later writes append to it
        print_mode = "w"
        for elem in qmmm_elem_id_dict:
            print_qm_kind(qmmm_elem_id_dict[elem],
                          elem,
                          f_name,
                          mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict,
                       cb_res_atom_id_dict,
                       f_name,
                       mode=print_mode)
        # create CP2K input listing MM atom type radii
        f_name = create_out_fname('mm_kinds.dat', base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"

        for atom_type in types_for_mm_kind:
            try:
                print_mm_kind(atom_type,
                              radii_dict[atom_type],
                              f_name,
                              mode=print_mode)
                print_mode = 'a'
            except KeyError:
                warning(
                    "Did not find atom type '{}' in the atom_type to radius dictionary: {}\n"
                    "    '{}' printed without this type; user may manually add its radius specification.\n"
                    "    To print this file with all MM types, use the keyword '{}' in the configuration file \n"
                    "    to identify a file with atom_type,radius (one per line, comma-separated) with all "
                    "MM types in the psf".format(atom_type,
                                                 cfg[RADII_DICT_FILE],
                                                 'mm_kinds.dat',
                                                 RADII_DICT_FILE))

        # create VMD input listing amino atom indexes (base-zero counting)
        f_name = create_out_fname('vmd_protein_atoms.dat',
                                  base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
Example #14
0
def process_file(data_file, mcfg, delimiter=','):
    """
    Filters the rows of a csv file by per-column minimum and maximum values, optionally assigns rows to
    bins (adding the bin center as a new leading column and printing a histogram), optionally thins bins that
    hold more than a maximum number of rows, and writes the result to a 'filtered_'-prefixed csv file.
    @param data_file: the csv file (with a header row) to filter
    @param mcfg: dict keyed by each section in SUB_SECTIONS; each value is a dict of column header to that
                 section's setting. For the bin section the setting is indexed as
                 (bin min, bin max, number of bins, max rows per bin or None)
    @param delimiter: the delimiter used in data_file
    @return: nothing; results are printed and written to file
    @raise InvalidDataError: if a configured column header is not in the data file
    """
    list_vectors, headers = read_csv_to_list(data_file,
                                             delimiter=delimiter,
                                             header=True)

    col_index_dict = {}
    for section in SUB_SECTIONS:
        col_index_dict[section] = {}
        for key, val in mcfg[section].items():
            if key not in headers:
                raise InvalidDataError(
                    "Key '{}' found in configuration file but not in data file: "
                    "{}".format(key, data_file))
            # Parser already made sure that unique entries
            col_index_dict[section][headers.index(key)] = val

    # set up bins, if needed
    bin_arrays = {}
    bin_labels = {}
    bin_counts = {}
    bin_ctrs = {}
    max_bins = {}
    # One fixed ordering of the binned columns, so the added header cells and the added row cells line up
    bin_cols = list(col_index_dict[BIN_SEC])
    # 'headers' keeps the original column positions (the configured indexes refer to them);
    # the bin labels are added to a separate list for output
    out_headers = list(headers)
    for bin_col in bin_cols:
        col_bin_data = col_index_dict[BIN_SEC][bin_col]
        bin_min = col_bin_data[0]
        bin_max = col_bin_data[1]
        num_bins = col_bin_data[2]
        max_bins[bin_col] = col_bin_data[3]
        # already checked that 1 or more bins, so will not divide by zero
        bin_width = (bin_max - bin_min) / num_bins
        # set up for np.searchsorted, not np.histogram: only the interior bin edges are needed
        bin_arrays[bin_col] = np.arange(bin_min + bin_width, bin_max, bin_width)
        # set up for recording assigned bin center
        bin_ctrs[bin_col] = [
            round_to_print(ctr)
            for ctr in np.arange(bin_min + bin_width / 2, bin_max, bin_width)
        ]
        bin_counts[bin_col] = [0] * len(bin_ctrs[bin_col])
        bin_labels[bin_col] = '{0}_bin'.format(headers[bin_col])
        out_headers = [bin_labels[bin_col]] + out_headers
        # allow filtering based on min and max
        col_index_dict[MIN_SEC][bin_col] = bin_min
        col_index_dict[MAX_SEC][bin_col] = bin_max

    # Each binned column is prepended in turn, so the first one processed ends up furthest from the front
    bin_row_index = {col: len(bin_cols) - 1 - pos for pos, col in enumerate(bin_cols)}

    initial_row_num = len(list_vectors)
    filtered_vectors = []
    for row in list_vectors:
        if any(row[col] > max_val for col, max_val in col_index_dict[MAX_SEC].items()):
            continue
        if any(row[col] < min_val for col, min_val in col_index_dict[MIN_SEC].items()):
            continue
        # Look up every binned value in the original row before any bin center is prepended
        bin_cells = []
        for col_id in bin_cols:
            bin_index = np.searchsorted(bin_arrays[col_id], row[col_id])
            bin_cells.insert(0, bin_ctrs[col_id][bin_index])
            bin_counts[col_id][bin_index] += 1
        filtered_vectors.append(bin_cells + row if bin_cells else row)
    print("Keeping {} of {} rows based on filtering criteria".format(
        len(filtered_vectors), initial_row_num))

    # Print output and determine if the output needs to be adjusted because of a max number of entries per bin
    ctr_format = "{:^11} {:^8}"
    ctr_format_max = "{:^11} {:^8} {:^7}"
    excess_bins = {}
    for col_bin in bin_cols:
        print("Histogram data for column '{}': ".format(bin_labels[col_bin]))
        if max_bins[col_bin] is None:
            print(ctr_format.format('bin_ctr', 'count'))
            for bin_index, bin_ctr in enumerate(bin_ctrs[col_bin]):
                print(
                    ctr_format.format(bin_ctr, bin_counts[col_bin][bin_index]))
        else:
            max_per_bin = max_bins[col_bin]
            excess_bins[col_bin] = {}
            print(ctr_format_max.format('bin_ctr', 'found', 'keep'))
            for bin_index, bin_ctr in enumerate(bin_ctrs[col_bin]):
                num_found = bin_counts[col_bin][bin_index]
                if num_found > max_per_bin:
                    num_keep = max_per_bin
                    # use bin_ctr as key because that is what is saved on the row
                    # Floor division: the thinning below needs the whole-number quotient and remainder
                    excess_bins[col_bin][bin_ctr] = {
                        QUOT: num_found // max_per_bin,
                        MOD: num_found % max_per_bin
                    }
                else:
                    num_keep = num_found
                print(ctr_format_max.format(bin_ctr, num_found, num_keep))

    if len(excess_bins) == 1:
        col_bin = next(iter(excess_bins))
        over_full = excess_bins[col_bin]
        ctr_index = bin_row_index[col_bin]
        count_bin = {bin_ctr: 0 for bin_ctr in over_full}
        delete_rows = set()
        for row_id, row in enumerate(filtered_vectors):
            bin_name = row[ctr_index]
            if bin_name in over_full:
                count_bin[bin_name] += 1
                # In an over-full bin, drop the first MOD rows, then keep only every QUOT-th row
                if (count_bin[bin_name] % over_full[bin_name][QUOT] != 0
                        or count_bin[bin_name] <= over_full[bin_name][MOD]):
                    delete_rows.add(row_id)
        filtered_vectors = [
            row for row_id, row in enumerate(filtered_vectors)
            if row_id not in delete_rows
        ]
    elif len(excess_bins) > 1:
        warning(
            "No filtering based on a max number of entries will be done; this feature is currently implemented "
            "only for binning with one column's values.")

    f_name = create_out_fname(data_file, prefix='filtered_', ext='.csv')
    list_to_csv([out_headers] + filtered_vectors, f_name, delimiter=',')
Example #15
0
def process_psf(cfg, atom_num_dict, mol_num_dict, element_dict, radii_dict):
    """
    Reads the psf file named by cfg[PSF_FILE], applies any requested atom/molecule renumbering to the
    atoms section, and writes the requested output files
    @param cfg: the configuration of this run
    @param atom_num_dict: maps an atom number read from the psf to the atom number to use in its place
    @param mol_num_dict: maps a residue id read from the psf to the residue id to use in its place
    @param element_dict: maps an atom type to the element under which its atom number is grouped
    @param radii_dict: maps an atom type to the radius passed to print_mm_kind
    @return: nothing; output is written to files (and the total QM charge is printed to stdout)
    @raise InvalidDataError: if a selected QM/QMMM atom has an atom type that is not in element_dict
    """
    psf_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    num_atoms_pat = re.compile(r"(\d+).*NATOM$")

    # placeholder; replaced by the count captured from the NATOM line
    num_atoms = 1
    section = SEC_HEAD

    # for printing qmmm info
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []
    types_for_mm_kind = set()
    qmmm_charge = 0

    # for RENUM_MOL
    last_resid = None
    cur_mol_num = 0

    with open(cfg[PSF_FILE]) as f:
        # iterate the file object directly; there is no need to hold every line in memory first
        for line in f:
            s_line = line.strip()
            # head_content to contain everything before the 'Atoms' section
            # also capture the number of atoms
            if section == SEC_HEAD:
                psf_data[HEAD_CONTENT].append(line.rstrip())

                atoms_match = num_atoms_pat.match(s_line)
                if atoms_match:
                    # group(0) is the whole match; the captured atom count is group(1)
                    num_atoms = int(atoms_match.group(1))
                    section = SEC_ATOMS

            elif section == SEC_ATOMS:
                if not s_line:
                    continue
                split_line = s_line.split()
                atom_num = int(split_line[0])
                segid = split_line[1]
                resid = int(split_line[2])
                resname = split_line[3]
                atom_type = split_line[4]
                charmm_type = split_line[5]
                charge = float(split_line[6])
                atom_wt = float(split_line[7])
                zero = split_line[8]

                # For reordering atoms
                if atom_num in atom_num_dict:
                    atom_num = atom_num_dict[atom_num]

                # For user-specified changing of molecule number
                if resid in mol_num_dict:
                    resid = mol_num_dict[resid]

                if cfg[RENUM_MOL]:
                    # every change in the (possibly remapped) resid starts the next sequential molecule number
                    if resid != last_resid:
                        last_resid = resid
                        cur_mol_num += 1
                    resid = cur_mol_num

                atom_struct = [atom_num, segid, resid, resname, atom_type, charmm_type, charge, atom_wt, zero]
                psf_data[ATOMS_CONTENT].append(atom_struct)

                # 'and' binds tighter than 'or': every atom of a RESID_QM residue is selected, while an atom of
                # a RESID_QMMM residue is selected only when its type is not in SKIP_ATOM_TYPES. The parentheses
                # only make that evaluation order explicit.
                if resid in cfg[RESID_QM] or (resid in cfg[RESID_QMMM] and atom_type not in cfg[SKIP_ATOM_TYPES]):
                    in_qmmm = resid in cfg[RESID_QMMM]
                    if in_qmmm and atom_type == C_ALPHA:
                        # the alpha carbon of a QMMM residue is only recorded for the link section;
                        # it is not added to the element lists, the charge total, or the VMD list
                        ca_res_atom_id_dict[resid] = atom_num
                    else:
                        if in_qmmm and atom_type == C_BETA:
                            cb_res_atom_id_dict[resid] = atom_num
                        if atom_type in element_dict:
                            element = element_dict[atom_type]
                        else:
                            raise InvalidDataError(
                                "Did not find atom type '{}' in the element dictionary. Please "
                                "provide a new atom type, element dictionary (using keyword {} "
                                "in the configuration file) that includes all atom types in the "
                                "residues identified with the '{}' key."
                                "".format(atom_type, ELEMENT_DICT_FILE, RESID_QMMM)
                            )
                        qmmm_elem_id_dict.setdefault(element, []).append(atom_num)
                        qmmm_charge += charge
                        # VMD list uses base-zero counting
                        atoms_for_vmd.append(atom_num - 1)

                if cfg[PRINT_FOR_CP2K]:
                    types_for_mm_kind.add(atom_type)

                if len(psf_data[ATOMS_CONTENT]) == num_atoms:
                    section = SEC_TAIL
            # tail_content to contain everything after the 'Atoms' section
            elif section == SEC_TAIL:
                psf_data[TAIL_CONTENT].append(line.rstrip())

    if atom_num_dict:
        # adjacent string literals are joined with no separator, so each piece carries its own trailing space
        warning(
            "This program does not yet edit any sections other than the atoms section. "
            "If you are renumbering atoms, the bonds, angles, dihedrals, impropers, and "
            "cross-terms sections will not match."
        )
        psf_data[ATOMS_CONTENT] = sorted(psf_data[ATOMS_CONTENT], key=lambda entry: entry[0])

    # a new psf is written only when something in the atoms section may have changed
    if cfg[RENUM_MOL] or atom_num_dict or mol_num_dict:
        if cfg[PSF_NEW_FILE] is None:
            f_name = create_out_fname(cfg[PSF_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR])
        else:
            f_name = cfg[PSF_NEW_FILE]
        list_to_file(
            psf_data[HEAD_CONTENT] + psf_data[ATOMS_CONTENT] + psf_data[TAIL_CONTENT],
            f_name,
            list_format=cfg[PSF_FORMAT],
        )

    if cfg[PRINT_FOR_CP2K]:
        print("Total charge from QM atoms: {:.2f}".format(qmmm_charge))
        # create CP2K input listing amino atom ids
        f_name = create_out_fname("amino_id.dat", base_dir=cfg[OUT_BASE_DIR])
        # the first call is made with mode "w"; every later call on the same file uses "a"
        print_mode = "w"
        for elem in qmmm_elem_id_dict:
            print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode)
            print_mode = "a"
        print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode)

        # create CP2K input listing MM atom type radii
        f_name = create_out_fname("mm_kinds.dat", base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"
        for atom_type in types_for_mm_kind:
            # only the radius lookup is guarded, so a KeyError raised while printing is not
            # misreported as a missing radius
            try:
                radius = radii_dict[atom_type]
            except KeyError:
                warning(
                    "Did not find atom type '{}' in the atom_type to radius dictionary: {}\n"
                    "    '{}' printed without this type; user may manually add its radius specification.\n"
                    "    To print this file with all MM types, use the keyword '{}' in the configuration file \n"
                    "    to identify a file with atom_type,radius (one per line, comma-separated) with all "
                    "MM types in the psf".format(atom_type, cfg[RADII_DICT_FILE], "mm_kinds.dat", RADII_DICT_FILE)
                )
                continue
            print_mm_kind(atom_type, radius, f_name, mode=print_mode)
            print_mode = "a"

        # create VMD input listing amino atom indexes (base-zero counting)
        f_name = create_out_fname("vmd_protein_atoms.dat", base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=" ")