def process_file(file_to_process, cfg):
    """
    Completes the work of this script based on the provided cfg

    @param file_to_process: the file with columns to be combined
    @param cfg: the configuration of this run
    @return: errors or nothing
    """
    to_print = []

    # determine if any type conversion has been specified & create conv dict if needed
    if cfg[COL1_CONV] is None and cfg[COL2_CONV] is None:
        conv_dict = None
    else:
        conv_dict = {}
        if cfg[COL1_CONV] is not None:
            conv_dict[cfg[COL1]] = cfg[COL1_CONV]
        if cfg[COL2_CONV] is not None:
            conv_dict[cfg[COL2]] = cfg[COL2_CONV]

    raw_col_data = read_csv(file_to_process, data_conv=conv_dict, quote_style=csv.QUOTE_NONNUMERIC)
    for header in cfg[COL1], cfg[COL2]:
        if header not in raw_col_data[0]:
            raise InvalidDataError("Specified column header '{}' was not found in file: {}"
                                   "".format(header, file_to_process))
    for row in raw_col_data:
        to_print.append(["".join(map(str, [cfg[PREFIX], row[cfg[COL1]], cfg[MIDDLE],
                                           row[cfg[COL2]], cfg[SUFFIX]]))])
    list_to_csv(to_print, cfg[OUT_FILE], delimiter=',', quote_style=csv.QUOTE_MINIMAL)
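# Illustrative, stand-alone sketch (not part of the original script) of the joining step above:
# each output row is prefix + first-column value + middle + second-column value + suffix, built as
# one string. The helper name, column keys, and affix defaults below are invented for the example.
def _combine_two_columns(rows, col1, col2, prefix="", middle=" ", suffix=""):
    """Mirror the to_print rows built above: one single-element list per input row."""
    return [["".join(map(str, [prefix, row[col1], middle, row[col2], suffix]))] for row in rows]

# Example: _combine_two_columns([{"x": 1.0, "y": 2.0}], "x", "y", prefix="(", middle=", ", suffix=")")
# returns [["(1.0, 2.0)"]].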
def process_file(data_file, mcfg, delimiter=','):
    list_vectors, headers = read_csv_to_list(data_file, delimiter=delimiter, header=True)

    col_index_dict = {}
    for section in SUB_SECTIONS:
        col_index_dict[section] = {}
        for key, val in mcfg[section].items():
            if key in headers:
                # Parser already made sure that entries are unique
                col_index_dict[section][headers.index(key)] = val
            else:
                raise InvalidDataError("Key '{}' found in configuration file but not in data file: "
                                       "{}".format(key, data_file))

    edited_vectors = []
    for row in list_vectors:
        for col, max_val in col_index_dict[MAX_SEC].items():
            if row[col] > max_val:
                row[col] = max_val
        for col, min_val in col_index_dict[MIN_SEC].items():
            if row[col] < min_val:
                row[col] = min_val
        edited_vectors.append(row)

    f_name = create_out_fname(data_file, ext='.csv')
    list_to_csv([headers] + edited_vectors, f_name, delimiter=',')
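# Stand-alone sketch (invented column indexes and bounds) of the clamping applied above: values in
# the configured columns are capped at the section's maximum and floored at its minimum.
def _clamp_row(row, max_by_col, min_by_col):
    """Clamp row values in place, as the loops over col_index_dict[MAX_SEC]/[MIN_SEC] do above."""
    for col, max_val in max_by_col.items():
        if row[col] > max_val:
            row[col] = max_val
    for col, min_val in min_by_col.items():
        if row[col] < min_val:
            row[col] = min_val
    return row

# Example: _clamp_row([5.0, -3.0], {0: 4.0}, {1: 0.0}) returns [4.0, 0.0].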
def print_gofr(cfg, gofr_data):
    g_dr = cfg[GOFR_DR]
    dr_array = gofr_data[GOFR_BINS][1:] - g_dr / 2
    gofr_out_fieldnames = [GOFR_R]
    gofr_output = dr_array

    if cfg[CALC_HO_GOFR]:
        normal_fac = np.square(dr_array) * gofr_data[HO_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_ho = np.divide(gofr_data[HO_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_HO)
        gofr_output = np.column_stack((gofr_output, gofr_ho))
    if cfg[CALC_OO_GOFR]:
        normal_fac = np.square(dr_array) * gofr_data[OO_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_oo = np.divide(gofr_data[OO_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_OO)
        gofr_output = np.column_stack((gofr_output, gofr_oo))
    if cfg[CALC_HH_GOFR]:
        normal_fac = np.square(dr_array) * gofr_data[HH_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_hh = np.divide(gofr_data[HH_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_HH)
        gofr_output = np.column_stack((gofr_output, gofr_hh))
    if cfg[CALC_OH_GOFR]:
        normal_fac = np.square(dr_array) * gofr_data[OH_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_oh = np.divide(gofr_data[OH_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_OH)
        gofr_output = np.column_stack((gofr_output, gofr_oh))
    if cfg[CALC_TYPE_GOFR]:
        if gofr_data[TYPE_STEPS_COUNTED] > 0:
            normal_fac = np.square(dr_array) * gofr_data[TYPE_STEPS_COUNTED] * 4 * np.pi * g_dr
            gofr_type = np.divide(gofr_data[TYPE_BIN_COUNT], normal_fac)
            gofr_out_fieldnames.append(GOFR_TYPE)
            gofr_output = np.column_stack((gofr_output, gofr_type))
        else:
            warning("Did not find any timesteps with the pairs in {}. "
                    "This output will not be printed.".format(CALC_TYPE_GOFR))

    f_out = create_out_fname(cfg[DUMP_FILE_LIST], suffix='_gofrs', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
    # am not using the dict writer because the gofr output is a np.array
    list_to_csv([gofr_out_fieldnames] + gofr_output.tolist(), f_out,
                print_message=cfg[PRINT_PROGRESS], round_digits=ROUND_DIGITS)
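# The normalization above divides raw pair counts by the ideal-gas shell factor 4 * pi * r**2 * dr,
# scaled by the number of timesteps counted. A minimal, stand-alone numpy sketch of that step;
# the bin width, edges, counts, and step count below are invented sample values.
import numpy as np

g_dr = 0.1                                      # bin width, as cfg[GOFR_DR] above
bin_edges = np.arange(0.0, 1.0 + g_dr, g_dr)    # stands in for gofr_data[GOFR_BINS]
bin_counts = np.ones(len(bin_edges) - 1)        # stands in for e.g. gofr_data[HO_BIN_COUNT]
steps_counted = 100                             # stands in for e.g. gofr_data[HO_STEPS_COUNTED]

r = bin_edges[1:] - g_dr / 2                    # bin centers, as dr_array above
normal_fac = np.square(r) * steps_counted * 4 * np.pi * g_dr
gofr = np.divide(bin_counts, normal_fac)        # the g(r) column written to the CSV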
def create_hists(data_file, header_row, hist_data, out_dir):
    counts_to_print = []
    if len(hist_data) > 0:
        for col in hist_data:
            count_to_print = create_hist_plot(hist_data[col], header_row[col], out_dir, data_file)
            if len(counts_to_print) == 0:
                counts_to_print = count_to_print
            else:
                len1 = len(counts_to_print)
                len2 = len(count_to_print)
                width1 = len(counts_to_print[0])
                width2 = len(count_to_print[0])
                combined_list = []
                for row in range(min(len1, len2)):
                    combined_list.append(counts_to_print[row] + count_to_print[row])
                for row in range(len2, len1):
                    combined_list.append(counts_to_print[row] + [""] * width2)
                for row in range(len1, len2):
                    # noinspection PyTypeChecker
                    combined_list.append([""] * width1 + count_to_print[row])
                counts_to_print = copy.deepcopy(combined_list)
    f_name = create_out_fname(data_file, prefix="counts_", ext=".csv", base_dir=out_dir)
    list_to_csv(counts_to_print, f_name, delimiter=",")
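# The loop above pastes each column's count table to the right of the accumulated table, padding the
# shorter table with empty strings. A small stand-alone sketch of that merge; the helper name and
# sample tables are invented for illustration.
def _merge_side_by_side(left, right):
    merged = []
    for i in range(min(len(left), len(right))):
        merged.append(left[i] + right[i])
    for i in range(len(right), len(left)):            # left is longer: pad the right-hand columns
        merged.append(left[i] + [""] * len(right[0]))
    for i in range(len(left), len(right)):            # right is longer: pad the left-hand columns
        merged.append([""] * len(left[0]) + right[i])
    return merged

# Example: _merge_side_by_side([[1, 2], [3, 4]], [[5, 6]]) returns [[1, 2, 5, 6], [3, 4, "", ""]].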
def process_file(data_file, out_dir, len_buffer, delimiter, min_max_dict, header=False, make_hist=False):
    try:
        dim_vectors, header_row, hist_data = np_float_array_from_file(data_file, delimiter=delimiter,
                                                                      header=header, gather_hist=make_hist)
    except InvalidDataError as e:
        raise InvalidDataError("{}\n"
                               "Run program with '-h' to see options, such as specifying header row (-n) "
                               "and/or delimiter (-d)".format(e))

    if header:
        to_print = [[""] + header_row]
    else:
        to_print = []

    max_vector = dim_vectors.max(axis=0)
    min_vector = dim_vectors.min(axis=0)
    avg_vector = dim_vectors.mean(axis=0)
    med_vector = np.percentile(dim_vectors, 50, axis=0)

    # noinspection PyTypeChecker
    to_print += [["Min values:"] + min_vector.tolist(),
                 ["Max values:"] + max_vector.tolist(),
                 ["Avg values:"] + avg_vector.tolist(),
                 ["Std dev:"] + dim_vectors.std(axis=0, ddof=1).tolist(),
                 ["5% percentile:"] + np.percentile(dim_vectors, 4.55, axis=0).tolist(),
                 ["32% percentile:"] + np.percentile(dim_vectors, 31.73, axis=0).tolist(),
                 ["50% percentile:"] + med_vector.tolist(),
                 ["68% percentile:"] + np.percentile(dim_vectors, 68.27, axis=0).tolist(),
                 ["95% percentile:"] + np.percentile(dim_vectors, 95.45, axis=0).tolist(),
                 ]
    if len_buffer is not None:
        to_print.append(["Max plus {} buffer:".format(len_buffer)] + (max_vector + len_buffer).tolist())

    if min_max_dict is not None:
        nan_list = [np.nan] * len(header_row)
        avg_ini_diff = ["Avg % Diff:"] + nan_list
        med_ini_diff = ["Med % Diff:"] + nan_list
        med_is_min = ["Median is Min:"] + nan_list
        med_is_max = ["Median is Max:"] + nan_list
        for col_num, header in enumerate(to_print[0]):
            if header in min_max_dict[0]:
                ini_val = min_max_dict[0][header]
                low_val = min_max_dict[1][header]
                upp_val = min_max_dict[2][header]
                avg_val = avg_vector[col_num - 1]
                med_val = med_vector[col_num - 1]
                min_val = min_vector[col_num - 1]
                max_val = max_vector[col_num - 1]
                min_tol = max(TOL * max(abs(min_val), abs(low_val)), TOL)
                med_tol = max(TOL * abs(med_val), TOL)
                max_tol = max(TOL * max(abs(max_val), abs(upp_val)), TOL)
                if (low_val - min_val) > min_tol:
                    warning("Minimum value found for header '{}' ({}) is less than lower bound ({})"
                            "".format(header, min_val, low_val))
                if (max_val - upp_val) > max_tol:
                    warning("Maximum value found for header '{}' ({}) is greater than upper bound ({})"
                            "".format(header, max_val, upp_val))
                avg_ini_diff[col_num] = (avg_val - ini_val) / ini_val * 100
                med_ini_diff[col_num] = (med_val - ini_val) / ini_val * 100
                if abs(med_val - low_val) > med_tol:
                    med_is_min[col_num] = 0
                else:
                    med_is_min[col_num] = 1
                if abs(med_val - upp_val) > med_tol:
                    med_is_max[col_num] = 0
                else:
                    med_is_max[col_num] = 1
        for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            to_print.append(min_max_list)

    # Printing to standard out: do not print quotes around strings because using csv writer
    if len(dim_vectors[0]) < 12:
        for index, row in enumerate(to_print):
            if index == 0 and header:
                # formatting for header
                print("{:>20s} {}".format(row[0], " ".join(["{:>16s}".format(x.strip()) for x in row[1:]])))
            else:
                # formatting for values
                print("{:>20s} {}".format(row[0], " ".join(["{:16.6f}".format(x) for x in row[1:]])))

    f_name = create_out_fname(data_file, prefix="stats_", ext=".csv", base_dir=out_dir)
    list_to_csv(to_print, f_name)

    if make_hist:
        create_hists(data_file, header_row, hist_data, out_dir)
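# Stand-alone sketch (toy data, not the script's input) of the per-column statistics gathered above:
# min, max, mean, sample standard deviation, and percentiles chosen to approximate +/- 1 and 2 sigma
# coverage (31.73/68.27 and 4.55/95.45) around the median.
import numpy as np

dim_vectors = np.array([[1.0, 10.0],
                        [2.0, 20.0],
                        [3.0, 30.0]])
print(dim_vectors.min(axis=0))                    # [ 1. 10.]
print(dim_vectors.max(axis=0))                    # [ 3. 30.]
print(dim_vectors.mean(axis=0))                   # [ 2. 20.]
print(dim_vectors.std(axis=0, ddof=1))            # sample standard deviation (ddof=1), as above
print(np.percentile(dim_vectors, 50, axis=0))     # median row
print(np.percentile(dim_vectors, 95.45, axis=0))  # ~ +2 sigma percentile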
def process_file(data_file, mcfg, delimiter=','):
    list_vectors, headers = read_csv_to_list(data_file, delimiter=delimiter, header=True)

    col_index_dict = {}
    for section in SUB_SECTIONS:
        col_index_dict[section] = {}
        for key, val in mcfg[section].items():
            if key in headers:
                # Parser already made sure that entries are unique
                col_index_dict[section][headers.index(key)] = val
            else:
                raise InvalidDataError("Key '{}' found in configuration file but not in data file: "
                                       "{}".format(key, data_file))

    # set up bins, if needed
    bin_arrays = {}
    bin_labels = {}
    bin_counts = {}
    bin_ctrs = {}
    max_bins = {}
    for bin_col, col_bin_data in col_index_dict[BIN_SEC].items():
        bin_min = col_bin_data[0]
        bin_max = col_bin_data[1]
        num_bins = col_bin_data[2]
        max_bins[bin_col] = col_bin_data[3]
        # already checked that there is 1 or more bins, so will not divide by zero
        bin_width = (bin_max - bin_min) / num_bins
        # set up for np.searchsorted, not np.histogram
        col_bins = np.arange(bin_min + bin_width, bin_max, bin_width)
        # set up for recording the assigned bin center
        bin_ctrs[bin_col] = [round_to_print(ctr) for ctr in np.arange(bin_min + bin_width / 2, bin_max, bin_width)]
        bin_counts[bin_col] = [0] * len(bin_ctrs[bin_col])
        bin_arrays[bin_col] = col_bins
        bin_labels[bin_col] = '{0}_bin'.format(headers[bin_col])
        headers = [bin_labels[bin_col]] + headers
        # allow filtering based on min and max
        col_index_dict[MIN_SEC][bin_col] = bin_min
        col_index_dict[MAX_SEC][bin_col] = bin_max

    initial_row_num = len(list_vectors)
    filtered_vectors = []
    for row in list_vectors:
        keep_row = True
        for col, max_val in col_index_dict[MAX_SEC].items():
            if row[col] > max_val:
                keep_row = False
        for col, min_val in col_index_dict[MIN_SEC].items():
            if row[col] < min_val:
                keep_row = False
        if keep_row:
            for col_id, col_bins in bin_arrays.items():
                bin_index = np.searchsorted(col_bins, row[col_id])
                row = [bin_ctrs[col_id][bin_index]] + row
                bin_counts[col_id][bin_index] += 1
            filtered_vectors.append(row)
    print("Keeping {} of {} rows based on filtering criteria".format(len(filtered_vectors), initial_row_num))

    # Print output and determine if the output needs to be adjusted because of a max number of entries per bin
    ctr_format = "{:^11} {:^8}"
    ctr_format_max = "{:^11} {:^8} {:^7}"
    excess_bins = {}
    for col_bin in bin_arrays:
        print("Histogram data for column '{}': ".format(bin_labels[col_bin]))
        if max_bins[col_bin] is None:
            print(ctr_format.format('bin_ctr', 'count'))
            for bin_index, bin_ctr in enumerate(bin_ctrs[col_bin]):
                print(ctr_format.format(bin_ctr, bin_counts[col_bin][bin_index]))
        else:
            bin_max = max_bins[col_bin]
            excess_bins[col_bin] = {}
            print(ctr_format_max.format('bin_ctr', 'found', 'keep'))
            for bin_index, bin_ctr in enumerate(bin_ctrs[col_bin]):
                num_found = bin_counts[col_bin][bin_index]
                if num_found > bin_max:
                    num_keep = bin_max
                    # use bin_ctr as key because that is what is saved on the row;
                    # integer division so the quotient can be used in the modulo test below
                    excess_bins[col_bin][bin_ctrs[col_bin][bin_index]] = {QUOT: num_found // bin_max,
                                                                          MOD: num_found % bin_max}
                else:
                    num_keep = num_found
                print(ctr_format_max.format(bin_ctr, num_found, num_keep))

    if len(excess_bins) == 1:
        count_bin = {}
        delete_rows = []
        mod_r = {}
        quot_r = {}
        for col_bin in excess_bins:
            for bin_remove, bin_dict in excess_bins[col_bin].items():
                mod_r[bin_remove] = bin_dict[MOD]
                quot_r[bin_remove] = bin_dict[QUOT]
                count_bin[bin_remove] = 0
            r_count = 0
            for row_id, row in enumerate(filtered_vectors):
                bin_name = row[0]
                if bin_name in excess_bins[col_bin]:
                    count_bin[bin_name] += 1
                    if count_bin[bin_name] % quot_r[bin_name] != 0 or count_bin[bin_name] <= mod_r[bin_name]:
                        delete_rows.append(row_id)
                        r_count += 1
        filtered_vectors = [row for row_id, row in enumerate(filtered_vectors) if row_id not in delete_rows]
    if len(excess_bins) > 1:
        warning("No filtering based on a max number of entries will be done; this feature is currently implemented "
                "only for binning with one column's values.")

    f_name = create_out_fname(data_file, prefix='filtered_', ext='.csv')
    list_to_csv([headers] + filtered_vectors, f_name, delimiter=',')
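# The binning above builds interior bin edges for np.searchsorted (not np.histogram): a value below
# the first interior edge maps to bin 0, and the matching bin center is prepended to the row.
# A stand-alone sketch with invented bounds and value:
import numpy as np

bin_min, bin_max, num_bins = 0.0, 10.0, 5
bin_width = (bin_max - bin_min) / num_bins
col_bins = np.arange(bin_min + bin_width, bin_max, bin_width)            # interior edges: [2. 4. 6. 8.]
bin_ctrs = list(np.arange(bin_min + bin_width / 2, bin_max, bin_width))  # centers: [1. 3. 5. 7. 9.]

value = 4.5
bin_index = np.searchsorted(col_bins, value)   # 2, so the value falls in the bin centered at 5.0
print(bin_ctrs[bin_index])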
def process_pdb(cfg, atom_num_dict, mol_num_dict, element_dict):
    pdb_loc = cfg[PDB_FILE]
    pdb_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    # to allow each warning to be printed once and only once
    missing_types = []
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []

    with open(pdb_loc) as f:
        wat_count = 0
        atom_count = 0
        mol_count = 1

        current_mol = None
        last_mol_num = None
        atoms_content = []

        for line in f:
            line = line.strip()
            line_len = len(line)
            if line_len == 0:
                continue
            line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]]

            # head_content to contain everything before the 'Atoms' section
            if line_head == 'REMARK' or line_head == 'CRYST1':
                pdb_data[HEAD_CONTENT].append(line)

            # atoms_content to contain everything but the xyz
            elif line_head == 'ATOM  ':
                # The template PDB has ***** after atom_count 99999, so atoms are renumbered here.
                # Otherwise, the number could be read with:
                #     atom_num = line[cfg[PDB_LINE_TYPE_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]]
                # For renumbering, make sure the number prints in the correct format, including number of characters:
                atom_count += 1

                # For reordering atoms
                if atom_count in atom_num_dict:
                    atom_id = atom_num_dict[atom_count]
                else:
                    atom_id = atom_count

                if atom_id > 99999:
                    atom_num = format(atom_id, 'x')
                    if len(atom_num) > 5:
                        warning("Hex representation of {} is {}, which is greater than 5 characters. This "
                                "will affect the PDB output formatting.".format(atom_id, atom_num))
                else:
                    atom_num = '{:5d}'.format(atom_id)

                atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]]
                res_type = line[cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]]
                mol_num = int(line[cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                occ_t = line[cfg[PDB_Z_LAST_CHAR]:cfg[PDB_LAST_T_CHAR]]
                element = line[cfg[PDB_LAST_T_CHAR]:cfg[PDB_LAST_ELEM_CHAR]]
                last_cols = line[cfg[PDB_LAST_ELEM_CHAR]:]

                # For user-specified changing of molecule number
                if mol_num in mol_num_dict:
                    mol_num = mol_num_dict[mol_num]

                # If doing water molecule checking...
                if cfg[FIRST_WAT_ID] <= atom_count <= cfg[LAST_WAT_ID]:
                    if (wat_count % 3) == 0:
                        current_mol = mol_num
                        if atom_type != ' OH2 ':
                            warning('Expected an OH2 atom to be the first atom of a water molecule. '
                                    'Check line: {}'.format(line))
                    else:
                        if current_mol != mol_num:
                            warning('Water not in order on line:', line)
                        if (wat_count % 3) == 1:
                            if atom_type != ' H1 ':
                                warning('Expected an H1 atom to be the second atom of a water molecule. '
                                        'Check line: {}'.format(line))
                        else:
                            if atom_type != ' H2 ':
                                warning('Expected an H2 atom to be the third atom of a water molecule. '
                                        'Check line: {}'.format(line))
                    wat_count += 1

                if mol_num in cfg[RESID_QMMM] and atom_type not in SKIP_ATOM_TYPES:
                    if atom_type == C_ALPHA:
                        ca_res_atom_id_dict[mol_num] = atom_id
                    else:
                        if atom_type == C_BETA:
                            cb_res_atom_id_dict[mol_num] = atom_id
                        if atom_type in element_dict:
                            element = element_dict[atom_type]
                        else:
                            raise InvalidDataError("Did not find atom type '{}' in the element dictionary. Please "
                                                   "provide a new atom type, element dictionary (using keyword {} "
                                                   "in the configuration file) that includes all atom types in the "
                                                   "residues identified with the '{}' key."
                                                   "".format(atom_type, ELEMENT_DICT_FILE, RESID_QMMM))
                        if element in qmmm_elem_id_dict:
                            qmmm_elem_id_dict[element].append(atom_id)
                        else:
                            qmmm_elem_id_dict[element] = [atom_id]
                        atoms_for_vmd.append(atom_id - 1)

                if cfg[ADD_ELEMENTS] and atom_count <= cfg[LAST_ADD_ELEM]:
                    if atom_type in element_dict:
                        element = element_dict[atom_type]
                    else:
                        if atom_type not in missing_types:
                            warning("Please add atom type '{}' to the dictionary of elements. Will not write/overwrite "
                                    "element type in the pdb output.".format(atom_type))
                            missing_types.append(atom_type)

                # For numbering molecules from 1 to end
                if cfg[RENUM_MOL]:
                    if last_mol_num is None:
                        last_mol_num = mol_num
                    if mol_num != last_mol_num:
                        last_mol_num = mol_num
                        mol_count += 1
                        if mol_count == 10000:
                            warning("Molecule numbers greater than 9999 will be printed in hex")
                    # Due to PDB format constraints, need to print in hex above 9999 molecules.
                    if mol_count > 9999:
                        mol_num = format(mol_count, 'x')
                        if len(mol_num) > 4:
                            warning("Hex representation of {} is {}, which is greater than 4 characters. This "
                                    "will affect the PDB output formatting.".format(mol_count, mol_num))
                    else:
                        mol_num = '{:4d}'.format(mol_count)

                line_struct = [line_head, atom_num, atom_type, res_type, mol_num, pdb_x, pdb_y, pdb_z, occ_t,
                               element, last_cols]
                atoms_content.append(line_struct)

            # tail_content to contain everything after the 'Atoms' section
            else:
                pdb_data[TAIL_CONTENT].append(line)

    # Only sort if there is renumbering
    if len(atom_num_dict) > 0:
        pdb_data[ATOMS_CONTENT] = sorted(atoms_content, key=lambda entry: entry[1])
    else:
        pdb_data[ATOMS_CONTENT] = atoms_content

    if cfg[PDB_NEW_FILE] is None:
        f_name = create_out_fname(cfg[PDB_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR])
    else:
        f_name = create_out_fname(cfg[PDB_NEW_FILE], base_dir=cfg[OUT_BASE_DIR])
    print_pdb(pdb_data[HEAD_CONTENT], pdb_data[ATOMS_CONTENT], pdb_data[TAIL_CONTENT],
              f_name, cfg[PDB_FORMAT])

    if len(cfg[RESID_QMMM]) > 0:
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"
        for elem in qmmm_elem_id_dict:
            print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode)
        f_name = create_out_fname('vmd_protein_atoms.dat', base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
def process_psf(cfg, atom_num_dict, mol_num_dict, element_dict, radii_dict):
    with open(cfg[PSF_FILE]) as f:
        psf_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
        num_atoms_pat = re.compile(r"(\d+).*NATOM$")
        num_atoms = 1
        section = SEC_HEAD

        # for printing qmmm info
        qmmm_elem_id_dict = {}
        ca_res_atom_id_dict = {}
        cb_res_atom_id_dict = {}
        atoms_for_vmd = []
        types_for_mm_kind = set()
        qmmm_charge = 0

        # for RENUM_MOL
        last_resid = None
        cur_mol_num = 0

        for line in f.readlines():
            s_line = line.strip()
            # head_content to contain everything before the 'Atoms' section;
            # also capture the number of atoms
            if section == SEC_HEAD:
                psf_data[HEAD_CONTENT].append(line.rstrip())
                atoms_match = num_atoms_pat.match(s_line)
                if atoms_match:
                    # regex is 1-based
                    num_atoms = int(atoms_match.group(1))
                    section = SEC_ATOMS

            elif section == SEC_ATOMS:
                if len(s_line) == 0:
                    continue
                split_line = s_line.split()
                atom_num = int(split_line[0])
                segid = split_line[1]
                resid = int(split_line[2])
                resname = split_line[3]
                atom_type = split_line[4]
                charmm_type = split_line[5]
                charge = float(split_line[6])
                atom_wt = float(split_line[7])
                zero = split_line[8]

                # For reordering atoms
                if atom_num in atom_num_dict:
                    atom_num = atom_num_dict[atom_num]

                # For user-specified changing of molecule number
                if resid in mol_num_dict:
                    resid = mol_num_dict[resid]

                if cfg[RENUM_MOL]:
                    if resid != last_resid:
                        last_resid = resid
                        cur_mol_num += 1
                    resid = cur_mol_num

                atom_struct = [atom_num, segid, resid, resname, atom_type, charmm_type, charge, atom_wt, zero]
                psf_data[ATOMS_CONTENT].append(atom_struct)

                if resid in cfg[RESID_QM] or (resid in cfg[RESID_QMMM] and atom_type not in cfg[SKIP_ATOM_TYPES]):
                    if resid in cfg[RESID_QMMM] and atom_type == C_ALPHA:
                        ca_res_atom_id_dict[resid] = atom_num
                    else:
                        if resid in cfg[RESID_QMMM] and atom_type == C_BETA:
                            cb_res_atom_id_dict[resid] = atom_num
                        if atom_type in element_dict:
                            element = element_dict[atom_type]
                        else:
                            raise InvalidDataError("Did not find atom type '{}' in the element dictionary. Please "
                                                   "provide a new atom type, element dictionary (using keyword {} "
                                                   "in the configuration file) that includes all atom types in the "
                                                   "residues identified with the '{}' key."
                                                   "".format(atom_type, ELEMENT_DICT_FILE, RESID_QMMM))
                        if element in qmmm_elem_id_dict:
                            qmmm_elem_id_dict[element].append(atom_num)
                        else:
                            qmmm_elem_id_dict[element] = [atom_num]
                        qmmm_charge += charge
                        atoms_for_vmd.append(atom_num - 1)

                if cfg[PRINT_FOR_CP2K]:
                    types_for_mm_kind.add(atom_type)

                if len(psf_data[ATOMS_CONTENT]) == num_atoms:
                    section = SEC_TAIL

            # tail_content to contain everything after the 'Atoms' section
            elif section == SEC_TAIL:
                psf_data[TAIL_CONTENT].append(line.rstrip())

    if len(atom_num_dict) > 0:
        warning("This program does not yet edit any sections other than the atoms section. "
                "If you are renumbering atoms, the bonds, angles, dihedrals, impropers, and "
                "cross-terms sections will not match.")
        psf_data[ATOMS_CONTENT] = sorted(psf_data[ATOMS_CONTENT], key=lambda entry: entry[0])

    if cfg[RENUM_MOL] or len(atom_num_dict) + len(mol_num_dict) > 0:
        if cfg[PSF_NEW_FILE] is None:
            f_name = create_out_fname(cfg[PSF_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR])
        else:
            f_name = cfg[PSF_NEW_FILE]
        list_to_file(psf_data[HEAD_CONTENT] + psf_data[ATOMS_CONTENT] + psf_data[TAIL_CONTENT],
                     f_name, list_format=cfg[PSF_FORMAT])

    if cfg[PRINT_FOR_CP2K]:
        print("Total charge from QM atoms: {:.2f}".format(qmmm_charge))

        # create CP2K input listing amino atom ids
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"
        for elem in qmmm_elem_id_dict:
            print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode)

        # create CP2K input listing MM atom type radii
        f_name = create_out_fname('mm_kinds.dat', base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"
        for atom_type in types_for_mm_kind:
            try:
                print_mm_kind(atom_type, radii_dict[atom_type], f_name, mode=print_mode)
                print_mode = 'a'
            except KeyError:
                warning("Did not find atom type '{}' in the atom_type to radius dictionary: {}\n"
                        "    '{}' printed without this type; user may manually add its radius specification.\n"
                        "    To print this file with all MM types, use the keyword '{}' in the configuration file \n"
                        "    to identify a file with atom_type,radius (one per line, comma-separated) with all "
                        "MM types in the psf".format(atom_type, cfg[RADII_DICT_FILE], 'mm_kinds.dat', RADII_DICT_FILE))

        # create VMD input listing amino atom indexes (base-zero counting)
        f_name = create_out_fname('vmd_protein_atoms.dat', base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')