def main(argv=None): """ Runs the main program. :param argv: The command line arguments. :return: The return code for the program's termination. """ args, ret = parse_cmdline(argv) if ret != GOOD_RET or args is None: return ret kbt = calc_kbt(args.temp) if args.src_file is not None: proc_data = to_zero_point(calc_rad(args.src_file, kbt)) write_csv(proc_data, create_out_fname(args.src_file, prefix=OUT_PFX), RAD_KEY_SEQ) else: found_files = find_files_by_dir(args.base_dir, args.pattern) logger.debug("Found '{}' dirs with files to process".format( len(found_files))) # noinspection PyCompatibility for f_dir, files in found_files.items(): if not files: logger.warn("No files found for dir '{}'".format(f_dir)) continue for pmf_path in ([os.path.join(f_dir, tgt) for tgt in files]): proc_data = to_zero_point(calc_rad(pmf_path, kbt)) f_name = create_out_fname(pmf_path, prefix=OUT_PFX) if allow_write(f_name, overwrite=args.overwrite): write_csv(proc_data, f_name, RAD_KEY_SEQ) return GOOD_RET # success
def print_content(atom_id_dict, cfg, content, data_file, highlight_content, section_order, type_dict):
    data_content = content[SEC_HEAD]
    select_data_content = []
    for section in section_order:
        # empty list will become an empty line
        data_content += [''] + [section, '']
        select_data_content += [section]
        sec_format = SEC_FORMAT_DICT[section][0]
        comment_col = SEC_FORMAT_DICT[section][1]
        for line in content[section]:
            data_content.append(sec_format.format(*line[:comment_col]) + " ".join(line[comment_col:]))
        for line in highlight_content[section]:
            select_data_content.append(sec_format.format(*line[:comment_col]) + " ".join(line[comment_col:]))

    # Only print a "new" data file if something is changed
    dict_lens = len(atom_id_dict)
    for name, t_dict in type_dict.items():
        dict_lens += len(t_dict)
    if dict_lens > 0 or cfg[SORT_ME]:
        f_name = create_out_fname(data_file, suffix='_new', ext='.data')
        list_to_file(data_content, f_name)
        print('Completed writing {}'.format(f_name))

    if (len(cfg[PRINT_DATA_ATOMS]) + len(cfg[PRINT_OWN_ATOMS])) > 0:
        f_name = create_out_fname(data_file, suffix='_selected', ext='.txt')
        list_to_file(select_data_content, f_name)
        print('Completed writing {}'.format(f_name))
def main(argv=None): """ Runs the main program. @param argv: The command line arguments. :return: The return code for the program's termination. """ args, ret = parse_cmdline(argv) if ret != 0: return ret if args.src_file is not None: proc_data = calc_for_wham(args.src_file) write_csv(proc_data, create_out_fname(args.src_file, prefix=OUT_PFX), COLVAR_WHAM_KEY_SEQ) else: found_files = find_files_by_dir(args.base_dir, args.pattern) logger.debug("Found '%d' dirs with files to process", len(found_files)) # noinspection PyCompatibility for f_dir, files in found_files.iteritems(): if not files: logger.warn("No files found for dir '%s'", f_dir) continue for colvar_path in ([os.path.join(f_dir, tgt) for tgt in files]): proc_data = calc_for_wham(colvar_path) f_name = create_out_fname(colvar_path, prefix=OUT_PFX) if allow_write(f_name, overwrite=args.overwrite): list_to_file([str(d['r']) for d in proc_data if 'r' in d], f_name) # write_csv(proc_data, f_name, COLVAR_WHAM_KEY_SEQ, extrasaction="ignore") return 0 # success
def main(argv=None): """ Runs the main program. :param argv: The command line arguments. :return: The return code for the program's termination. """ args, ret = parse_cmdline(argv) if ret != GOOD_RET or args is None: return ret kbt = calc_kbt(args.temp) if args.src_file is not None: proc_data = to_zero_point(calc_rad(args.src_file, kbt)) write_csv(proc_data, create_out_fname(args.src_file, prefix=OUT_PFX), RAD_KEY_SEQ) else: found_files = find_files_by_dir(args.base_dir, args.pattern) logger.debug("Found '{}' dirs with files to process".format(len(found_files))) # noinspection PyCompatibility for f_dir, files in found_files.iteritems(): if not files: logger.warn("No files found for dir '{}'".format(f_dir)) continue for pmf_path in ([os.path.join(f_dir, tgt) for tgt in files]): proc_data = to_zero_point(calc_rad(pmf_path, kbt)) f_name = create_out_fname(pmf_path, prefix=OUT_PFX) if allow_write(f_name, overwrite=args.overwrite): write_csv(proc_data, f_name, RAD_KEY_SEQ) return GOOD_RET # success
def adjust_atom_xyz(cfg, data_tpl_content):
    """
    If this option is selected, adjust the xyz coordinates as specified
    @param cfg: configuration for the run
    @param data_tpl_content: processed data from the template
    @return: will print new data files or raise InvalidDataError
    """
    if cfg[ADJUST_ATOM] > data_tpl_content[NUM_ATOMS]:
        raise InvalidDataError("Keyword '{}' specified atom index {} to have its XYZ coordinates adjusted, "
                               "but found only {} atoms in the data template file: {}"
                               "".format(ADJUST_ATOM, cfg[ADJUST_ATOM],
                                         data_tpl_content[NUM_ATOMS], cfg[DATA_TPL_FILE]))
    diff_vector = np.asarray(np.subtract(cfg[XYZ2], cfg[XYZ1]))
    inc_vector = np.divide(diff_vector, cfg[XYZ_STEPS])

    head_content = data_tpl_content[HEAD_CONTENT]
    atoms_content = data_tpl_content[ATOMS_CONTENT]
    tail_content = data_tpl_content[TAIL_CONTENT]

    # since python is zero-based, must subtract 1
    adjust_atom_num = cfg[ADJUST_ATOM] - 1

    for multiplier in range(-cfg[XYZ_STEPS_EXTEND], cfg[XYZ_STEPS] + cfg[XYZ_STEPS_EXTEND]):
        f_name = create_out_fname(cfg[DATA_TPL_FILE], suffix='_' + str(multiplier), ext='.data')
        atoms_content[adjust_atom_num][4:7] = np.round(multiplier * inc_vector + cfg[XYZ1], 6)
        list_to_file(head_content + atoms_content + tail_content, f_name)
def fill_save_tpl(cfg, tpl_str, tpl_vals_dict, tpl_name, filled_tpl_name, print_info=True):
    """
    Use the dictionary to make the file name and filled template. Then save the file.
    @param cfg: configuration for run
    @param tpl_str: the string to be filled to make the filled tpl file
    @param tpl_vals_dict: dictionary of tpl keys and vals
    @param tpl_name: the cfg key for the template file name
    @param filled_tpl_name: the cfg key for the filled template file name
    @param print_info: print to standard out when a file is printed
    """
    try:
        filled_tpl_str = tpl_str.format(**tpl_vals_dict)
    except KeyError as e:
        raise KeyError("Key '{}' not found in the configuration but required for template file: {}"
                       "".format(e.message, tpl_name))

    try:
        filled_fname_str = filled_tpl_name.format(**tpl_vals_dict)
    except KeyError as e:
        raise KeyError("Key '{}' not found in the configuration but required for filled template file name: {}"
                       "".format(e.message, filled_tpl_name))

    tpl_vals_dict[NEW_FNAME] = create_out_fname(filled_fname_str, base_dir=cfg[OUT_DIR])
    str_to_file(filled_tpl_str, tpl_vals_dict[NEW_FNAME], print_info=print_info)
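# A minimal usage sketch for fill_save_tpl above. The keys and file names here are
# hypothetical (not from the original source); the real cfg keys come from the calling
# script's constants:
#     tpl_str = "run_name = {job_name}\ntemperature = {temp}\n"
#     tpl_vals_dict = {'job_name': 'test_run', 'temp': 310}
#     fill_save_tpl(cfg, tpl_str, tpl_vals_dict, 'job.tpl', '{job_name}.inp')
# This would write 'test_run.inp' (placed in cfg[OUT_DIR] by create_out_fname) with both
# placeholders substituted, and store the new path in tpl_vals_dict[NEW_FNAME].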
def process_cp2k_file(cp2k_file, data_tpl_content, data_template_fname):
    new_atoms_section = None
    qmmm_energy = None
    with open(cp2k_file) as f:
        data_tpl_content[HEAD_CONTENT][0] = "Created on {} by {} version {} from template file {} and " \
                                            "cp2k output file {}".format(datetime.now(), __name__, __version__,
                                                                         data_template_fname, cp2k_file)
        for line in f:
            line = line.strip()
            if ENERGY_PAT.match(line):
                qmmm_energy = line.split()[-1]
            if COORD_PAT.match(line):
                # Now advance to first line of coordinates
                for _ in range(3):
                    next(f)
                new_atoms_section = process_coords(f, data_tpl_content)

    # If we successfully returned the new_atoms_section, make new file
    if new_atoms_section is None:
        raise InvalidDataError("Did not find atom coordinates in file: {}".format(cp2k_file))
    print("{} energy: {}".format(cp2k_file, qmmm_energy))
    f_name = create_out_fname(cp2k_file, ext='.data')
    list_to_file(data_tpl_content[HEAD_CONTENT] + new_atoms_section + data_tpl_content[TAIL_CONTENT],
                 f_name, print_message=False)
def process_file(f_list, new_f_name):
    value_dict = {}
    with open(f_list) as f:
        for f_name in f.readlines():
            f_name = f_name.strip()
            with open(f_name) as d:
                for line in d.readlines():
                    line = line.strip()
                    split_line = line.split()
                    entries = len(split_line)
                    # For this purpose, subtract 1 (hydronium) and divide by 3
                    # (integer division, so the count can be used directly as a dict key)
                    water_mol_number = (entries - 1) // 3
                    if water_mol_number in value_dict:
                        value_dict[water_mol_number] += 1
                    else:
                        value_dict[water_mol_number] = 1
    if new_f_name is None:
        new_f_name = create_out_fname(f_list, suffix='_count')
    with open(new_f_name, 'w') as w_file:
        for key in value_dict:
            w_file.write(str(key) + "," + str(value_dict[key]) + "\n")
            print(key, value_dict[key])
def copy_par_result_file(cfg, tpl_vals_dict, print_info=False):
    """
    To keep a copy of a par file, make the new file name and copy the previously created par file
    @param cfg: configuration for run
    @param tpl_vals_dict: dictionary to fill strings
    @param print_info: boolean to determine if to print to standard out that a copy was made
    @return: KeyError if required variable is not defined
    """
    if cfg[TRIAL_NAME] is not None:
        try:
            tpl_vals_dict[TRIAL_NAME] = cfg[TRIAL_NAME].format(**tpl_vals_dict)
        except KeyError as e:
            raise KeyError("Missing key name {} required for '{}': '{}'. Program will terminate."
                           "".format(e, TRIAL_NAME, cfg[TRIAL_NAME]))
    for copy_name in [PAR_COPY_NAME, RESULT_COPY]:
        if cfg[copy_name] is not None:
            try:
                base_name = cfg[copy_name].format(**tpl_vals_dict)
            except KeyError as e:
                raise KeyError("Missing key name {} required for '{}': '{}'. File will not be copied."
                               "".format(e, copy_name, cfg[copy_name]))
            new_fname = create_out_fname(base_name, base_dir=cfg[COPY_DIR])
            if copy_name == PAR_COPY_NAME:
                shutil.copyfile(tpl_vals_dict[NEW_FNAME], new_fname)
            else:
                # if os.path.isfile(tpl_vals_dict[RESULT_FILE]):
                shutil.copyfile(cfg[RESULT_FILE], new_fname)
            if print_info:
                print(" Copied to: {}".format(new_fname))
def process_file(data_file, mcfg, delimiter=','):
    list_vectors, headers = read_csv_to_list(data_file, delimiter=delimiter, header=True)

    col_index_dict = {}
    for section in SUB_SECTIONS:
        col_index_dict[section] = {}
        for key, val in mcfg[section].items():
            if key in headers:
                # the parser already made sure the entries are unique
                col_index_dict[section][headers.index(key)] = val
            else:
                raise InvalidDataError("Key '{}' found in configuration file but not in data file: "
                                       "{}".format(key, data_file))

    edited_vectors = []
    for row in list_vectors:
        for col, max_val in col_index_dict[MAX_SEC].items():
            if row[col] > max_val:
                row[col] = max_val
        for col, min_val in col_index_dict[MIN_SEC].items():
            if row[col] < min_val:
                row[col] = min_val
        edited_vectors.append(row)

    f_name = create_out_fname(data_file, ext='.csv')
    list_to_csv([headers] + edited_vectors, f_name, delimiter=',')
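# A hedged usage sketch for the clamping logic above (MAX_SEC and MIN_SEC are the package's
# own constants; the column names and limits below are hypothetical):
#     mcfg = {MAX_SEC: {'energy': 0.0}, MIN_SEC: {'dist': 1.5}}
# With that configuration, every value in the 'energy' column is capped at 0.0 and every
# value in the 'dist' column is raised to at least 1.5 before the edited rows are written
# back out as a csv named for the input data file.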
def testOutFname(self):
    """
    Check for prefix addition.
    """
    self.assertTrue(create_out_fname(ORIG_WHAM_PATH, prefix=OUT_PFX).endswith(os.sep + OUT_PFX + ORIG_WHAM_FNAME))
def process_pdb_tpl(cfg): tpl_loc = cfg[PDB_TPL_FILE] tpl_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []} atom_id = 0 with open(tpl_loc) as f: for line in f: line = line.strip() if len(line) == 0: continue line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]] # head_content to contain Everything before 'Atoms' section # also capture the number of atoms # match 5 letters so don't need to set up regex for the ones that have numbers following the letters # noinspection SpellCheckingInspection if line_head[:-1] in ['HEADE', 'TITLE', 'REMAR', 'CRYST', 'MODEL', 'COMPN', 'NUMMD', 'ORIGX', 'SCALE', 'SOURC', 'AUTHO', 'CAVEA', 'EXPDT', 'MDLTY', 'KEYWD', 'OBSLT', 'SPLIT', 'SPRSD', 'REVDA', 'JRNL ', 'DBREF', 'SEQRE', 'HET ', 'HETNA', 'HETSY', 'FORMU', 'HELIX', 'SHEET', 'SSBON', 'LINK ', 'CISPE', 'SITE ', ]: tpl_data[HEAD_CONTENT].append(line) # atoms_content to contain everything but the xyz elif line_head == 'ATOM ': # By renumbering, handles the case when a PDB template has ***** after atom_id 99999. # For renumbering, making sure prints in the correct format, including num of characters: atom_id += 1 if atom_id > 99999: atom_num = format(atom_id, 'x') else: atom_num = '{:5d}'.format(atom_id) # Alternately, use this: # atom_num = line[cfg[PDB_LINE_TYPE_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]] atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]] res_type = line[cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]] # There is already a try when calling the subroutine, so maybe I don't need to? mol_num = int(line[cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]]) pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]]) pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]]) pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]]) last_cols = line[cfg[PDB_Z_LAST_CHAR]:] line_struct = [line_head, atom_num, atom_type, res_type, mol_num, pdb_x, pdb_y, pdb_z, last_cols] tpl_data[ATOMS_CONTENT].append(line_struct) # tail_content to contain everything after the 'Atoms' section else: tpl_data[TAIL_CONTENT].append(line) if logger.isEnabledFor(logging.DEBUG): f_name = create_out_fname('reproduced_tpl', ext='.pdb', base_dir=cfg[OUT_BASE_DIR]) list_to_file(tpl_data[HEAD_CONTENT] + tpl_data[ATOMS_CONTENT] + tpl_data[TAIL_CONTENT], f_name, list_format=cfg[PDB_FORMAT]) return tpl_data
def comp_files(cfg, atom_id_dict, type_dicts):
    """
    Compares each section of data files
    @param cfg: configuration information for current run
    @param atom_id_dict: dictionary for changing the atom id
    @param type_dicts: dictionary for changing atom and interaction types
    @return:
    """
    first_content, first_section_order = proc_data_file(cfg, cfg[DATA_FILE], atom_id_dict, type_dicts)
    second_content, second_section_order = proc_data_file(cfg, cfg[DATA_COMP], atom_id_dict, type_dicts)

    for section in second_section_order:
        if section not in first_section_order:
            warning("Skipping section '{}'; section found in the file: {}\n"
                    " but not in file: {}".format(section, cfg[DATA_COMP], cfg[DATA_FILE]))

    diffs = ["Differences in head section:"]
    compare_heads(first_content[SEC_HEAD], second_content[SEC_HEAD], diffs)

    for section in first_section_order:
        if section not in second_section_order:
            warning("Skipping section '{}'; section found in the file: {}\n"
                    " but not in file: {}".format(section, cfg[DATA_FILE], cfg[DATA_COMP]))
        elif section in [SEC_VELOS]:
            diffs.append("\nSkipping section '{}'".format(section))
        elif section in COMP_ORD_SEC_COL_DICT:
            diffs.append("\nDifferences in section '{}':".format(section))
            num_col_to_compare = COMP_ORD_SEC_COL_DICT[section]
            compare_lists(first_content[section], second_content[section], 0, num_col_to_compare, diffs,
                          SEC_FORMAT_DICT[section][0], SEC_FORMAT_DICT[section][1])
        elif section in NUM_SEC_DICT:
            diffs.append("\nDifferences in section '{}':".format(section))
            num_col_to_compare = NUM_SEC_DICT[section][1]
            compare_lists(first_content[section], second_content[section], 1, num_col_to_compare, diffs,
                          SEC_FORMAT_DICT[section][0], SEC_FORMAT_DICT[section][1])
        else:
            print("Encountered unexpected section '{}'".format(section))

    f_name = create_out_fname(cfg[DATA_COMP], prefix='diffs_', ext='.txt')
    list_to_file(diffs, f_name)
    print('Completed writing {}'.format(f_name))
def process_file(f_name, b_str, e_str, new_f_name):
    if new_f_name is None:
        new_f_name = create_out_fname(f_name, suffix='_amend')
    # open the old file first; then, if there is a problem with it, no new file will be created
    with open(f_name) as f:
        with open(new_f_name, 'w') as w_file:
            for line in f:
                line = line.strip()
                w_file.write(b_str + line + e_str + "\n")
    print("Wrote file: {}".format(new_f_name))
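# A brief usage sketch (file name and wrapper strings are hypothetical):
#     process_file('residues.txt', 'resid ', ' end', None)
# would write each line of residues.txt as "resid <line> end" to a new file, presumably
# named residues_amend.txt, since create_out_fname is called with suffix='_amend'.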
def write_result(result, src_file, overwrite=False, basedir=None):
    """Writes the result to a file named for the given source file.

    :param result: The result to write.
    :param src_file: The original source file name.
    :param overwrite: Whether to overwrite an existing file name.
    :param basedir: The base directory to target (uses the source file's base directory if not specified)
    """
    f_name = create_out_fname(src_file, prefix=OUT_PFX, base_dir=basedir)
    if allow_write(f_name, overwrite=overwrite):
        write_csv(result, f_name, OUT_KEY_SEQ)
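# allow_write is not defined in these snippets. A plausible minimal sketch of the guard it
# provides (an assumption, not the package's actual implementation):
def allow_write_sketch(f_name, overwrite=False):
    """Return True if writing f_name is OK: the file does not exist yet, or overwrite was requested."""
    import os
    if os.path.exists(f_name) and not overwrite:
        print("Not overwriting existing file '{}'".format(f_name))
        return False
    return True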
def print_per_frame(dump_file, cfg, data_to_print, out_fieldnames, write_mode):
    f_out = create_out_fname(dump_file, suffix='_sum', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
    write_csv(data_to_print, f_out, out_fieldnames, extrasaction="ignore", mode=write_mode,
              round_digits=ROUND_DIGITS, print_message=cfg[PRINT_PROGRESS])
def process_cv_file(cv_file, time_col, cv_col, row_index, time_conv):
    data_to_print = []
    with open(cv_file) as f:
        for line in f:
            if row_index == 0:
                row_index = 1
            else:
                data = [x.strip() for x in line.split()]
                try:
                    timestep = int(float(data[time_col]) * time_conv)
                    cv = float(data[cv_col])
                    data_to_print.append([timestep, cv])
                except ValueError as e:
                    warning("Expected a number for the time column ({}) and cv column ({}). Found {} and {}."
                            "".format(time_col, cv_col, data[time_col], data[cv_col]), e)
                    return INVALID_DATA
    d_out = create_out_fname(cv_file, suffix='_converted', ext='.txt')
    list_to_file(data_to_print, d_out)
    print('Wrote file: {}'.format(d_out))
    d_out = create_out_fname(cv_file, suffix='_converted', ext='.csv')
    list_to_file(data_to_print, d_out, delimiter=',')
    print('Wrote file: {}'.format(d_out))
def print_gofr(cfg, gofr_data):
    g_dr = cfg[GOFR_DR]
    dr_array = gofr_data[GOFR_BINS][1:] - g_dr / 2
    gofr_out_fieldnames = [GOFR_R]
    gofr_output = dr_array
    if cfg[CALC_HO_GOFR]:
        normal_fac = np.square(dr_array) * gofr_data[HO_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_ho = np.divide(gofr_data[HO_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_HO)
        gofr_output = np.column_stack((gofr_output, gofr_ho))
    if cfg[CALC_OO_GOFR]:
        normal_fac = np.square(dr_array) * gofr_data[OO_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_oo = np.divide(gofr_data[OO_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_OO)
        gofr_output = np.column_stack((gofr_output, gofr_oo))
    if cfg[CALC_HH_GOFR]:
        normal_fac = np.square(dr_array) * gofr_data[HH_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_hh = np.divide(gofr_data[HH_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_HH)
        gofr_output = np.column_stack((gofr_output, gofr_hh))
    if cfg[CALC_OH_GOFR]:
        normal_fac = np.square(dr_array) * gofr_data[OH_STEPS_COUNTED] * 4 * np.pi * g_dr
        gofr_oh = np.divide(gofr_data[OH_BIN_COUNT], normal_fac)
        gofr_out_fieldnames.append(GOFR_OH)
        gofr_output = np.column_stack((gofr_output, gofr_oh))
    if cfg[CALC_TYPE_GOFR]:
        if gofr_data[TYPE_STEPS_COUNTED] > 0:
            normal_fac = np.square(dr_array) * gofr_data[TYPE_STEPS_COUNTED] * 4 * np.pi * g_dr
            gofr_type = np.divide(gofr_data[TYPE_BIN_COUNT], normal_fac)
            gofr_out_fieldnames.append(GOFR_TYPE)
            gofr_output = np.column_stack((gofr_output, gofr_type))
        else:
            warning("Did not find any timesteps with the pairs in {}. "
                    "This output will not be printed.".format(CALC_TYPE_GOFR))

    f_out = create_out_fname(cfg[DUMP_FILE_LIST], suffix='_gofrs', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
    # am not using the dict writer because the gofr output is a np.array
    list_to_csv([gofr_out_fieldnames] + gofr_output.tolist(), f_out,
                print_message=cfg[PRINT_PROGRESS], round_digits=ROUND_DIGITS)
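# Note on the normalization above: each histogram of pair counts is divided by
# 4 * pi * r**2 * dr * (number of timesteps counted), i.e. the volume of the spherical
# shell at radius r times the number of frames, which is the usual shell normalization for
# a radial distribution function g(r). Any remaining density or pair-count factor is
# presumably handled where the bin counts are accumulated, which is not shown here.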
def proc_file(file_name):
    with open(file_name) as d:
        nodups_lines = ['']
        for line in d:
            line = line.strip()
            if len(line) == 0:
                continue
            elif line == nodups_lines[-1]:
                continue
            else:
                nodups_lines.append(line)
    print('Completed reading {}.\n'.format(file_name))
    f_out_name = create_out_fname(file_name, suffix='_nodups')
    list_to_file(nodups_lines[1:], f_out_name)
    print('Wrote {}.\n'.format(f_out_name))
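# Only consecutive duplicates are collapsed above: each line is compared to the last line
# kept (nodups_lines[-1]), so a value that repeats later, after different content, is
# retained. For example (hypothetical input on the left, output on the right):
#     a            a
#     a     -->    b
#     b            a
#     a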
def main(argv=None): """ Runs the main program. :param argv: The command line arguments. :return: The return code for the program's termination. """ args, ret = parse_cmdline(argv) if ret != GOOD_RET: return ret deduped = compress_dups(read_csv(args.file, all_conv=float), args.column) write_csv(deduped, create_out_fname(args.file, prefix=PREFIX), read_csv_header(args.file)) return GOOD_RET # success
def testWriteCsv(self):
    tmp_dir = None
    data = csv_data()
    try:
        tmp_dir = tempfile.mkdtemp()
        tgt_fname = create_out_fname(SHORT_WHAM_PATH, prefix=OUT_PFX, base_dir=tmp_dir)
        write_csv(data, tgt_fname, RAD_KEY_SEQ)
        csv_result = read_csv(tgt_fname, data_conv={FREE_KEY: str_to_bool,
                                                    CORR_KEY: float,
                                                    COORD_KEY: str, })
        self.assertEqual(len(data), len(csv_result))
        for i, csv_row in enumerate(csv_result):
            self.assertDictEqual(data[i], csv_row)
    finally:
        shutil.rmtree(tmp_dir)
def read_file_list(file_list, out_dir): """ @param file_list: the list of files to be read @param out_dir: user-specified output directory """ summary_header = ['num_atoms', 'sum_x', 'sum_y', 'sum_z', 'total'] summary_array = None with open(file_list) as f: for f_file in f: f_file = f_file.strip() if len(f_file) == 0: continue elif os.path.isfile(f_file): summary = process_cp2k_force_file(f_file, out_dir) if summary is not None: if summary_array is None: summary_array = summary else: summary_array = np.vstack((summary, summary_array)) else: warning('Could not read file {} in file list {}. ' 'Continuing to the next line in file list.'.format(f_file, file_list)) # print(np.amax(summary_array, axis=1)) if summary_array is None: warning("No valid cp2k force output files were read.") elif summary_array.size == 5: print('For the one CP2K force file read:') print(' ' + ' '.join(summary_header)) print(' '.join(['%10.0f' % summary_array[0]] + ['%10.3f' % F for F in summary_array[1:]])) else: f_out = create_out_fname(file_list, prefix='force_sums_', base_dir=out_dir, ext='.csv') list_to_file(summary_array, f_out) with open(f_out, 'w') as logfile: logfile.write(','.join(summary_header) + "\n") # noinspection PyTypeChecker for line in summary_array: logfile.write(','.join(['%d' % line[0]] + ['%f' % F for F in line[1:]]) + "\n") print('Finished reading all cp2k force files. Printed each atomic force sum to: {}'.format(f_out)) min_vals = np.amin(summary_array, axis=0) max_vals = np.amax(summary_array, axis=0) print(' ' + ' '.join(summary_header)) print('min_vals: ' + ' '.join(['%10.0f' % min_vals[0]] + ['%10.3f' % F for F in min_vals[1:]])) print('max_vals: ' + ' '.join(['%10.0f' % max_vals[0]] + ['%10.3f' % F for F in max_vals[1:]]))
def main(argv=None): """ Runs the main program. :param argv: The command line arguments. :return: The return code for the program's termination. """ args, ret = parse_cmdline(argv) if ret != 0: return ret pairs = parse_pairs(args.pair_files) dists = atom_distances(args.file, pairs) write_results(create_out_fname(args.file, prefix='pairs_', ext='.csv'), dists, pairs) return 0 # success
def create_hist_plot(hist_dict, header, out_dir, data_file): """ See https://stanford.edu/~mwaskom/software/seaborn/examples/horizontal_barplot.html @param hist_dict: dict of label, count @param header: name of dictionary @param out_dir: str, name of directory where files are to be saved @param data_file: name of data file @return: a list of lists (label, count) """ # remove spaces in name header = "".join(header.split()) # convert dict to list for creating bar chat bar_data = [[key, val] for key, val in hist_dict.items()] bar_data.sort(key=itemgetter(0)) bar_data.sort(key=itemgetter(1), reverse=True) # bar chart background style sns.set(style="whitegrid", font='Arial') # color options include pastel sns.set_color_codes("deep") # Initialize the matplotlib figure f, ax = plt.subplots(figsize=(6, 6)) # Create pandas dataframe new_df = pd.DataFrame(bar_data, columns=["key", "count"]) # Plot sns.barplot(x="count", y="key", data=new_df, label="Total", color="b") # other options: xlim=(0, 24) ax.set(xlabel="Count", ylabel="") ax.set_title(header) with warnings.catch_warnings(): warnings.simplefilter("ignore") plt.tight_layout() f_name = create_out_fname(data_file, suffix=header, base_dir=out_dir, ext=".png") plt.savefig(f_name, dpi=300) print("Wrote file: {}".format(f_name)) # quote strings for printing so csv properly read, and add header count_to_print = [[header + "_key", header + "_count"]] for row in bar_data: count_to_print.append([row[0], row[1]]) return count_to_print
def main(argv=None): """ Runs the main program. :param argv: The command line arguments. :return: The return code for the program's termination. """ args, ret = parse_cmdline(argv) if ret != GOOD_RET or args is None: return ret try: if args.list_file is None: file_list = [] base_file_name = args.file else: file_list = file_rows_to_list(args.list_file) base_file_name = args.list_file if args.file is not None: file_list.append(args.file) dists = OrderedDict() pairs = parse_pairs(args.pair_files) write_mode = 'w' for l_file in file_list: dists.update(atom_distances(l_file, pairs)) if len(dists) > 0: write_results(create_out_fname(base_file_name, prefix='pairs_', ext='.csv'), dists, pairs, write_mode=write_mode) write_mode = 'a' except IOError as e: warning("Problems reading file: {}".format(e)) return IO_ERROR except InvalidDataError as e: warning("Invalid Data Error: {}".format(e)) return IO_ERROR return GOOD_RET # success
def process_log_files(source_name, log_file_list):
    """
    Loops through all files and prints output
    @param source_name: the source name to use as the base for creating an outfile name
    @param log_file_list: list of file names to read and process
    """
    result_list = []
    out_fname = create_out_fname(source_name, suffix='_sum', ext=".csv")
    for log_file in log_file_list:
        result_list += process_log(log_file)
    if len(result_list) == 0:
        warning("Found no lammps log data to process from: {}".format(source_name))
    else:
        write_csv(result_list, out_fname, LOG_FIELDNAMES, extrasaction="ignore")
def plot_corr(f_name): """ Given a csv, plot it as a heat map @param f_name: file name to save the correlation @return: """ corr_data = pd.read_csv(f_name, index_col=0) i_name = create_out_fname(f_name, ext='.png') # Generate a mask for the upper triangle plot_mask = np.zeros_like(corr_data, dtype=np.bool) plot_mask[np.triu_indices_from(plot_mask)] = True # Set up the matplotlib figure sns.set(style="white") # f, ax = plt.subplots(figsize=(11, 9)) plt.subplots(figsize=(11, 9)) # Draw the heatmap with the plot_mask and correct aspect ratio sns.heatmap( corr_data, mask=plot_mask, vmin=0.0, vmax=100.0, square=True, # xticklabels=2, # yticklabels=2, linewidths=.5, cbar_kws={ "shrink": .5, }, ) plt.xticks(rotation='vertical') plt.yticks(rotation='horizontal') # print output plt.savefig(i_name) print("Wrote file: {}".format(i_name))
def adjust_atom_dist(cfg, data_tpl_content):
    """
    If this option is selected, adjust the xyz coordinates to specified distances
    @param cfg: configuration for the run
    @param data_tpl_content: processed data from the template
    @return: will print new data files or raise InvalidDataError
    """
    for atom_num in cfg[ATOMS_DIST]:
        if atom_num > data_tpl_content[NUM_ATOMS]:
            raise InvalidDataError("Keyword '{}' specified atom indexes {} but found only "
                                   "{} atoms in the data template file: {}"
                                   "".format(ATOMS_DIST, cfg[ATOMS_DIST],
                                             data_tpl_content[NUM_ATOMS], cfg[DATA_TPL_FILE]))

    # since python is zero-based, must subtract 1
    pivot_atom_num = cfg[ATOMS_DIST][0] - 1
    pivot_atom = data_tpl_content[ATOMS_CONTENT][pivot_atom_num]
    pivot_xyz = np.array(pivot_atom[4:7])

    moving_atom_num = cfg[ATOMS_DIST][1] - 1
    moving_atom = data_tpl_content[ATOMS_CONTENT][moving_atom_num]
    moving_xyz = np.array(moving_atom[4:7])

    diff_vector = pbc_calc_vector(moving_xyz, pivot_xyz, data_tpl_content[BOX_SIZE])
    base_dist = np.linalg.norm(diff_vector)

    head_content = data_tpl_content[HEAD_CONTENT]
    atoms_content = data_tpl_content[ATOMS_CONTENT]
    tail_content = data_tpl_content[TAIL_CONTENT]

    for new_dist in cfg[NEW_DIST_LIST]:
        multiplier = new_dist / base_dist
        f_name = create_out_fname(cfg[DATA_TPL_FILE], suffix='_' + str(new_dist), ext='.data')
        atoms_content[moving_atom_num][4:7] = np.round(multiplier * diff_vector + pivot_xyz, 6)
        list_to_file(head_content + atoms_content + tail_content, f_name)
def find_rel_e(extracted_data, cfg, ref_e_dict): """ calculate relative energy, if data found @param extracted_data: dictionary of data found from chk file @param cfg: configuration for run @param ref_e_dict: reference energies, if available @return: """ tot_resid = 0 num_resid = 0 for data_dict in extracted_data: this_group = data_dict[REL_E_GROUP] if this_group: rel_ene_ref = cfg[REL_E_SEC][this_group][REL_E_REF] if this_group is None or np.isnan(rel_ene_ref): data_dict[REL_E] = np.nan else: rel_e = data_dict[ENV_ENE] - rel_ene_ref data_dict[REL_E] = rel_e file_name = data_dict[FILE_NAME] if file_name in ref_e_dict: ref_e = ref_e_dict[file_name] resid = np.round(np.sqrt((ref_e - rel_e)**2), 6) data_dict[REF_E] = ref_e data_dict[E_RESID] = resid tot_resid += resid num_resid += 1 f_out = create_out_fname(cfg[CHK_FILE_LIST], suffix='_sum', ext='.csv', base_dir=cfg[OUT_BASE_DIR]) write_csv(extracted_data, f_out, ENE_FIELD_NAMES, extrasaction="ignore") if len(ref_e_dict) > 1: print("Calculated total energy residual from {} files: {}".format( num_resid, tot_resid))
def process_file(base_file, data_file):
    # TODO: add in reading vectors
    base_dict = read_csv(base_file, quote_style=csv.QUOTE_NONNUMERIC)[0]
    data_dict_list = read_csv(data_file, quote_style=csv.QUOTE_NONNUMERIC)

    data_headers = [INDEX, RMSD] + read_csv_header(data_file)
    num_vals = len(base_dict.values())
    for data_id, data_dict in enumerate(data_dict_list):
        rmsd = 0.0
        for key, val in base_dict.items():
            try:
                rmsd += (data_dict[key] - val) ** 2
            except KeyError:
                raise InvalidDataError("Could not find key '{}' from base file in compared data file."
                                       "".format(key))
        data_dict[INDEX] = data_id
        data_dict[RMSD] = round((rmsd / num_vals) ** 0.5, 2)

    out_name = create_out_fname(data_file, prefix=RMSD + '_')
    write_csv(data_dict_list, out_name, data_headers)
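# A worked example of the per-row calculation above (hypothetical column names and values):
#     base row:     {'a': 1.0, 'b': 3.0}
#     compared row: {'a': 2.0, 'b': 5.0}
#     rmsd = sqrt(((2.0 - 1.0)**2 + (5.0 - 3.0)**2) / 2) = sqrt(2.5) ≈ 1.58
# Each output row thus gains an INDEX column (its position in the file) and an RMSD column.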
def process_pdb_files(cfg, data_tpl_content):
    # # For printing a dictionary
    # new_atom_type_dict = {}
    with open(cfg[PDBS_FILE]) as f:
        for pdb_file in f.readlines():
            pdb_atom_line = []
            pdb_file = pdb_file.strip()
            with open(pdb_file) as d:
                atom_num = 0
                for line in d.readlines():
                    pdb_section = line[:cfg[PDB_SECTION_LAST_CHAR]]
                    if pdb_section == 'ATOM ':
                        # atom_nums = line[cfg[PDB_SECTION_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]]
                        # atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_INFO_LAST_CHAR]]
                        # There is already a try when calling the subroutine, so maybe I don't need to?
                        # mol_num = int(line[cfg[PDB_ATOM_INFO_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                        pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                        pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                        pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                        # last_cols = line[cfg[PDB_Z_LAST_CHAR]:]
                        # if data_tpl_content[ATOMS_CONTENT][atom_num][2] != data_tpl_content[ATOM_TYPE_DICT][atom_type]:
                        #     print(atom_num, atom_type, data_tpl_content[ATOMS_CONTENT][atom_num][2],
                        #           data_tpl_content[ATOM_TYPE_DICT][atom_type])
                        # # For printing a dictionary
                        # new_atom_type_dict[atom_type] = data_tpl_content[ATOMS_CONTENT][atom_num][2]
                        pdb_atom_line.append(data_tpl_content[ATOMS_CONTENT][atom_num][:4] +
                                             [pdb_x, pdb_y, pdb_z] +
                                             data_tpl_content[ATOMS_CONTENT][atom_num][4:])
                        atom_num += 1
            if atom_num != data_tpl_content[NUM_ATOMS]:
                raise InvalidDataError('The length of the "Atoms" section ({}) in the pdb does not equal '
                                       'the number of atoms in the data template file ({}).'
                                       ''.format(atom_num, data_tpl_content[NUM_ATOMS]))
            d_out = create_out_fname(pdb_file, suffix='_from_py', ext='.data')
            list_to_file(data_tpl_content[HEAD_CONTENT] + pdb_atom_line + data_tpl_content[TAIL_CONTENT],
                         d_out)
            print('Wrote file: {}'.format(d_out))
def create_hists(data_file, header_row, hist_data, out_dir): counts_to_print = [] if len(hist_data) > 0: for col in hist_data: count_to_print = create_hist_plot(hist_data[col], header_row[col], out_dir, data_file) if len(counts_to_print) == 0: counts_to_print = count_to_print else: len1 = len(counts_to_print) len2 = len(count_to_print) width1 = len(counts_to_print[0]) width2 = len(count_to_print[0]) combined_list = [] for row in range(min(len1, len2)): combined_list.append(counts_to_print[row] + count_to_print[row]) for row in range(len2, len1): combined_list.append(counts_to_print[row] + [""] * width2) for row in range(len1, len2): # noinspection PyTypeChecker combined_list.append([""] * width1 + count_to_print[row]) counts_to_print = copy.deepcopy(combined_list) f_name = create_out_fname(data_file, prefix='counts_', ext='.csv', base_dir=out_dir) list_to_csv(counts_to_print, f_name, delimiter=',')
def process_file(data_file, out_dir, len_buffer, delimiter, min_max_dict, header=False, make_hist=False):
    try:
        dim_vectors, header_row, hist_data = np_float_array_from_file(data_file, delimiter=delimiter,
                                                                      header=header, gather_hist=make_hist)
    except InvalidDataError as e:
        raise InvalidDataError("{}\n"
                               "Run program with '-h' to see options, such as specifying header row (-n) "
                               "and/or delimiter (-d)".format(e))

    if header:
        to_print = [[""] + header_row]
    else:
        to_print = []

    max_vector = dim_vectors.max(axis=0)
    min_vector = dim_vectors.min(axis=0)
    avg_vector = dim_vectors.mean(axis=0)
    med_vector = np.percentile(dim_vectors, 50, axis=0)

    # noinspection PyTypeChecker
    to_print += [["Min values:"] + min_vector.tolist(),
                 ["Max values:"] + max_vector.tolist(),
                 ["Avg values:"] + avg_vector.tolist(),
                 ["Std dev:"] + dim_vectors.std(axis=0, ddof=1).tolist(),
                 ["5% percentile:"] + np.percentile(dim_vectors, 4.55, axis=0).tolist(),
                 ["32% percentile:"] + np.percentile(dim_vectors, 31.73, axis=0).tolist(),
                 ["50% percentile:"] + med_vector.tolist(),
                 ["68% percentile:"] + np.percentile(dim_vectors, 68.27, axis=0).tolist(),
                 ["95% percentile:"] + np.percentile(dim_vectors, 95.45, axis=0).tolist(),
                 ]
    if len_buffer is not None:
        to_print.append(["Max plus {} buffer:".format(len_buffer)] + (max_vector + len_buffer).tolist())

    if min_max_dict is not None:
        nan_list = [np.nan] * len(header_row)
        avg_ini_diff = ["Avg % Diff:"] + nan_list
        med_ini_diff = ["Med % Diff:"] + nan_list
        med_is_min = ["Median is Min:"] + nan_list
        med_is_max = ["Median is Max:"] + nan_list
        for col_num, header in enumerate(to_print[0]):
            if header in min_max_dict[0]:
                ini_val = min_max_dict[0][header]
                low_val = min_max_dict[1][header]
                upp_val = min_max_dict[2][header]
                avg_val = avg_vector[col_num - 1]
                med_val = med_vector[col_num - 1]
                min_val = min_vector[col_num - 1]
                max_val = max_vector[col_num - 1]
                min_tol = max(TOL * max(abs(min_val), abs(low_val)), TOL)
                med_tol = max(TOL * abs(med_val), TOL)
                max_tol = max(TOL * max(abs(max_val), abs(upp_val)), TOL)
                if (low_val - min_val) > min_tol:
                    warning("Minimum value found for header '{}' ({}) is less than lower bound ({})"
                            "".format(header, min_val, low_val))
                if (max_val - upp_val) > max_tol:
                    warning("Maximum value found for header '{}' ({}) is greater than upper bound ({})"
                            "".format(header, max_val, upp_val))
                avg_ini_diff[col_num] = (avg_val - ini_val) / ini_val * 100
                med_ini_diff[col_num] = (med_val - ini_val) / ini_val * 100
                if abs(med_val - low_val) > med_tol:
                    med_is_min[col_num] = 0
                else:
                    med_is_min[col_num] = 1
                if abs(med_val - upp_val) > med_tol:
                    med_is_max[col_num] = 0
                else:
                    med_is_max[col_num] = 1
            # else:
            #     for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            #         min_max_list.append(np.nan)
        for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            to_print.append(min_max_list)

    # Printing to standard out: do not print quotes around strings because using csv writer
    # print("Number of dimensions ({}) based on first line of file: {}".format(len(dim_vectors[0]), data_file))
    if len(dim_vectors[0]) < 12:
        for index, row in enumerate(to_print):
            # formatting for header
            if index == 0 and header:
                print("{:>20s} {}".format(row[0], " ".join(["{:>16s}".format(x.strip()) for x in row[1:]])))
            # formatting for vals
            else:
                print("{:>20s} {}".format(row[0], " ".join(["{:16.6f}".format(x) for x in row[1:]])))

    f_name = create_out_fname(data_file, prefix="stats_", ext=".csv", base_dir=out_dir)
    list_to_csv(to_print, f_name)
    # list_to_file(to_print, f_name, delimiter=',')

    if make_hist:
        create_hists(data_file, header_row, hist_data, out_dir)
def process_dump_file(cfg, dump_file, atom_num_dict, atom_type_dict, mol_num_dict): section = None box = np.zeros((3,)) counter = 1 num_atoms = 0 head_content = [] steps_count = 0 step_stop = cfg[MAX_STEPS] * cfg[OUT_FREQ] timestep = None with open(dump_file) as d: d_out = create_out_fname(dump_file, suffix='_reorder', base_dir=cfg[OUT_BASE_DIR]) write_mode = 'w' for line in d: line = line.strip() if section == SEC_ATOMS: split_line = line.split() # If there is an incomplete line in a dump file, move on to the next file if len(split_line) < 7: break atom_num = int(split_line[0]) if atom_num in atom_num_dict: atom_num = atom_num_dict[atom_num] mol_num = int(split_line[1]) if mol_num in mol_num_dict: mol_num = mol_num_dict[mol_num] # Default RENUM_START_MOL is neg 1; if still less than zero, user did not specify renumbering if 0 <= cfg[RENUM_START_MOL] <= mol_num: mol_num += cfg[RENUM_SHIFT] atom_type = int(split_line[2]) if atom_type in atom_type_dict: atom_type = atom_type_dict[atom_type] charge = float(split_line[3]) x, y, z = map(float, split_line[4:7]) atom_struct = [atom_num, mol_num, atom_type, charge, x, y, z] atom_data.append(atom_struct) if counter == num_atoms: if len(atom_num_dict) > 0: atom_data = sorted(atom_data, key=lambda atom: atom[0]) steps_count += 1 if steps_count % cfg[OUT_FREQ] == 0: print_to_dump_file(head_content, atom_data, d_out, mode=write_mode) if write_mode == 'w': write_mode = 'a' if steps_count == step_stop: print("Reached the maximum number of steps ({})".format(cfg[MAX_STEPS])) counter = 1 break # reset for next timestep head_content = [] counter = 0 section = None counter += 1 else: head_content.append(line) if section is None: section = find_dump_section_state(line) if section is None: raise InvalidDataError('Unexpected line in file {}: {}'.format(d, line)) elif section == SEC_TIMESTEP: timestep = line # Reset variables atom_data = [] section = None elif section == SEC_NUM_ATOMS: num_atoms = int(line) section = None elif section == SEC_BOX_SIZE: split_line = line.split() diff = float(split_line[1]) - float(split_line[0]) box[counter - 1] = diff if counter == 3: counter = 0 section = None counter += 1 if counter == 1: print("Completed reading: {}".format(dump_file)) else: warning("Dump file {} step {} did not have the full list of atom numbers. " "Continuing program.".format(dump_file, timestep))
def process_data_file(cfg, chk_atom_type, data_dict, data_file, data_tpl_content): with open(data_file) as d: pdb_data_section = copy.deepcopy(data_tpl_content[ATOMS_CONTENT]) pdb_atom_num = len(pdb_data_section) section = SEC_HEAD atom_id = 0 num_atoms = None atom_types = [] for line in d: line = line.strip() # not currently keeping anything from the header; just check num atoms if section == SEC_HEAD: if ATOMS_PAT.match(line): section = SEC_ATOMS elif num_atoms is None: atoms_match = NUM_ATOMS_PAT.match(line) if atoms_match: # regex is 1-based num_atoms = int(atoms_match.group(1)) if num_atoms != pdb_atom_num: raise InvalidDataError( "Mismatched numbers of atoms: \n" " Found {} atoms in file: {}\n" " and {} atoms in file: {}\n" "".format(pdb_atom_num, cfg[PDB_TPL_FILE], num_atoms, data_file)) # atoms_content to contain only xyz; also perform some checking elif section == SEC_ATOMS: if len(line) == 0: continue split_line = line.split() # Not currently checking molecule number # If decide to do so, should make a count from 1 as the PDB is read; the PDB does not # have to start from 1, but the data file counts molecules from 1. For now, decided # checking atom type is a sufficient check # mol_num = int(split_line[1]) # Keep as string; json save as string and this helps compare atom_types.append(split_line[2]) pdb_data_section[atom_id][5:8] = map(float, split_line[4:7]) atom_id += 1 # Check after increment because the counter started at 0 if atom_id == num_atoms: # Since the tail will come only from the template, nothing more is needed. break # Now that finished reading the file... if atom_id != num_atoms: raise InvalidDataError( 'In data file: {}\n' ' header section lists {} atoms, but found {} atoms'.format( data_file, num_atoms, atom_id)) if chk_atom_type: for data_type, atom in zip(atom_types, pdb_data_section): try: pdb_type = atom[2] + atom[3] if pdb_type not in data_dict[data_type]: warning( 'Did not find type {} in dictionary of values for atom_type {}: ({})' ''.format(pdb_type, data_type, data_dict[data_type])) # print("atom", atom_type, data_dict[atom_type]) except KeyError: warning( 'Did not find data file atom type {} in the atom type dictionary {}' ''.format(data_type, cfg[ATOM_TYPE_DICT_FILE])) f_name = create_out_fname(data_file, ext='.pdb', base_dir=cfg[OUT_BASE_DIR]) list_to_file(data_tpl_content[HEAD_CONTENT] + pdb_data_section + data_tpl_content[TAIL_CONTENT], f_name, list_format=cfg[PDB_FORMAT])
def process_evb_files(cfg):
    """
    Want to grab the timestep and highest prot ci^2, highest wat ci^2, and print them
    @param cfg: configuration data read from ini file
    @return:
    @raise InvalidDataError:
    """
    first_file_flag = True
    evb_file_list = []

    if cfg[EVB_FILE] is not None:
        evb_file_list.append(cfg[EVB_FILE])

    # Separate try-catch block here because want it to continue rather than exit;
    # exit below if there are no files to process
    try:
        with open(cfg[EVB_FILES]) as f:
            for evb_file in f:
                evb_file_list.append(evb_file.strip())
    except IOError as e:
        warning("Problems reading file:", e)

    if len(evb_file_list) == 0:
        raise InvalidDataError("Found no evb file names to read. Specify one file with the keyword '{}' or \n"
                               "a file containing a list of evb files with the keyword '{}'."
                               "".format(EVB_FILE, EVB_FILES))

    for evb_file in evb_file_list:
        data_to_print, subset_to_print, wat_mol_data_to_print = process_evb_file(evb_file, cfg)
        no_print = []
        if cfg[PRINT_PER_FILE] is True:
            if cfg[PRINT_KEY_PROPS]:
                if len(data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_evb_info', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                    write_csv(data_to_print, f_out, KEY_PROPS_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_KEY_PROPS)
            if cfg[PRINT_CI_SUBSET]:
                if len(subset_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_ci_sq_ts', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                    write_csv(subset_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_CI_SUBSET)
            if cfg[PRINT_CI_SQ]:
                if len(data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_ci_sq', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                    write_csv(data_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_CI_SQ)
            if cfg[PRINT_CEC]:
                if len(data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_cec', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                    write_csv(data_to_print, f_out, CEC_COORD_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_CEC)
            if cfg[PRINT_WAT_MOL]:
                if len(wat_mol_data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_wat_mols', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                    write_csv(wat_mol_data_to_print, f_out, PROT_WAT_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_WAT_MOL)
            if len(no_print) > 0:
                warning("{} set to true, but found no data from: {} \n"
                        "No output will be printed for this file."
                        "".format(",".join(map(single_quote, no_print)), evb_file))

        if cfg[PRINT_PER_LIST]:
            if first_file_flag:
                print_mode = 'w'
                first_file_flag = False
            else:
                print_mode = 'a'
            if cfg[PRINT_CI_SQ]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_ci_sq', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                write_csv(data_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_CI_SUBSET]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_ci_sq_ts', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                write_csv(subset_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_WAT_MOL]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_wat_mols', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                write_csv(wat_mol_data_to_print, f_out, PROT_WAT_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_CEC]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_cec', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                write_csv(data_to_print, f_out, CEC_COORD_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_KEY_PROPS]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_evb_info', ext='.csv', base_dir=cfg[OUT_BASE_DIR])
                write_csv(data_to_print, f_out, KEY_PROPS_FIELDNAMES, extrasaction="ignore", mode=print_mode)
def process_data_tpl(cfg): tpl_loc = cfg[DATA_TPL_FILE] tpl_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: [], PROT_RES_MOL: [], H3O_MOL: [], WATER_MOLS: defaultdict(list), FIRST_H3O_H_INDEX: None} section = SEC_HEAD num_atoms_pat = re.compile(r"(\d+).*atoms$") atoms_pat = re.compile(r"^Atoms.*") # put in dummy x y z x = 0.0 y = 0.0 z = 0.0 total_charge = 0.0 # For debugging total charge calc_charge_atom_nums = {} for name in CALC_CHARGE_NAMES: calc_charge_atom_nums[cfg[name]] = name with open(tpl_loc) as f: for line in f: line = line.strip() # head_content to contain Everything before 'Atoms' section # also capture the number of atoms if section == SEC_HEAD: tpl_data[HEAD_CONTENT].append(line) if NUM_ATOMS not in tpl_data: atoms_match = num_atoms_pat.match(line) if atoms_match: # regex is 1-based tpl_data[NUM_ATOMS] = int(atoms_match.group(1)) if atoms_pat.match(line): section = SEC_ATOMS tpl_data[HEAD_CONTENT].append('') # atoms_content to contain everything but the xyz: atom_num, mol_num, atom_type, charge, type' elif section == SEC_ATOMS: if len(line) == 0: continue split_line = line.split() atom_num = int(split_line[0]) mol_num = int(split_line[1]) atom_type = int(split_line[2]) charge = float(split_line[3]) description = ' '.join(split_line[7:]) atom_struct = [atom_num, mol_num, atom_type, charge, x, y, z, description] tpl_data[ATOMS_CONTENT].append(atom_struct) total_charge += charge if atom_type == cfg[H3O_O_TYPE]: tpl_data[H3O_MOL].append(atom_struct) tpl_data[H3O_O_CHARGE] = charge elif atom_type == cfg[H3O_H_TYPE]: if tpl_data[FIRST_H3O_H_INDEX] is None: tpl_data[FIRST_H3O_H_INDEX] = len(tpl_data[H3O_MOL]) tpl_data[H3O_MOL].append(atom_struct) tpl_data[H3O_H_CHARGE] = charge elif mol_num == cfg[PROT_RES_MOL_ID]: tpl_data[PROT_RES_MOL].append(atom_struct) elif atom_type == cfg[WAT_O_TYPE] or atom_type == cfg[WAT_H_TYPE]: tpl_data[WATER_MOLS][mol_num].append(atom_struct) if atom_num == tpl_data[NUM_ATOMS]: section = SEC_TAIL # Perform checks total charge if abs(total_charge) < TOL: print('The data file system is neutral (total charge {:.2e})'.format(total_charge)) else: warning('The data file system is not neutral. Total charge {0:.6f}'.format(total_charge)) if len(tpl_data[PROT_RES_MOL]) == 0: raise InvalidDataError('Did not find the input {} ({}).'.format(PROT_RES_MOL, cfg[PROT_RES_MOL])) for mol_list in [H3O_MOL, WATER_MOLS]: if len(tpl_data[mol_list]) == 0: raise InvalidDataError('In reading the data file, found no {}. Check the data file and ' 'the input atom types: \n{} = {}\n{} = {}\n{} = {}\n' '{} = {}\n{} = {}.' ''.format(mol_list, PROT_H_TYPE, cfg[PROT_H_TYPE], H3O_O_TYPE, cfg[H3O_O_TYPE], H3O_H_TYPE, cfg[H3O_H_TYPE], WAT_O_TYPE, cfg[WAT_O_TYPE], WAT_H_TYPE, cfg[WAT_H_TYPE])) elif atom_num in calc_charge_atom_nums: print('After atom {0} ({1}), the total charge is: {2:.3f}'.format(atom_num, calc_charge_atom_nums[atom_num], total_charge)) # tail_content to contain everything after the 'Atoms' section elif section == SEC_TAIL: tpl_data[TAIL_CONTENT].append(line) # Validate data section if len(tpl_data[ATOMS_CONTENT]) != tpl_data[NUM_ATOMS]: raise InvalidDataError('In the file {}, The length of the "Atoms" section ({}) does not equal ' 'the number of atoms ({}).'.format(tpl_loc, len(tpl_data[ATOMS_CONTENT]), tpl_data[NUM_ATOMS])) if cfg[REPROD_TPL]: f_out = create_out_fname('reproduced_tpl', base_dir=cfg[OUT_BASE_DIR], ext='.data') list_to_file(tpl_data[HEAD_CONTENT] + tpl_data[ATOMS_CONTENT][:] + tpl_data[TAIL_CONTENT], f_out) return tpl_data
def process_dump_file(cfg, data_tpl_content, dump_file): section = None box = np.zeros((3,)) counter = 1 atom_list_order = [PRE_RES, PROT_RES, POST_RES, HYD_MOL, WAT_MOL, POST_WAT] dump_atom_data = [] atom_lists = {PRE_RES: [], PROT_RES: [], POST_RES: [], HYD_MOL: [], WAT_MOL: [], POST_WAT: [] } with open(dump_file) as d: for line in d: line = line.strip() if section is None: section = find_dump_section_state(line) if section is None: raise InvalidDataError('Unexpected line in file {}: {}'.format(dump_file, line)) elif section == SEC_TIMESTEP: timestep = line # Reset variables water_dict = defaultdict(list) dump_atom_data = [] excess_proton = None hydronium = [] for a_list in atom_lists: atom_lists[a_list] = [] section = None elif section == SEC_NUM_ATOMS: if data_tpl_content[NUM_ATOMS] != int(line): raise InvalidDataError('At timestep {} in file {}, the listed number of atoms ({}) does ' 'not equal the number of atoms in the template data file ' '({}).'.format(timestep, dump_file, line, data_tpl_content[NUM_ATOMS])) section = None elif section == SEC_BOX_SIZE: split_line = line.split() diff = float(split_line[1]) - float(split_line[0]) box[counter - 1] = diff if counter == 3: counter = 0 section = None counter += 1 elif section == SEC_ATOMS: split_line = line.split() # If there is an incomplete line in a dump file, move on to the next file if len(split_line) < 7: continue atom_num = int(split_line[0]) mol_num = int(split_line[1]) atom_type = int(split_line[2]) charge = float(split_line[3]) x, y, z = map(float, split_line[4:7]) description = '' atom_struct = [atom_num, mol_num, atom_type, charge, x, y, z, description] # Keep track of separate portions of the system to allow sorting and processing if mol_num == cfg[PROT_RES_MOL_ID]: if atom_type == cfg[PROT_H_TYPE] and atom_num not in cfg[PROT_H_IGNORE]: excess_proton = atom_struct else: atom_lists[PROT_RES].append(atom_struct) elif atom_type == cfg[H3O_O_TYPE] or atom_type == cfg[H3O_H_TYPE]: hydronium.append(atom_struct) elif atom_type == cfg[WAT_O_TYPE] or atom_type == cfg[WAT_H_TYPE]: water_dict[mol_num].append(atom_struct) # Save everything else in three chunks for recombining sections post-processing elif len(atom_lists[PROT_RES]) == 0: atom_lists[PRE_RES].append(atom_struct) elif len(water_dict) == 0: atom_lists[POST_RES].append(atom_struct) else: atom_lists[POST_WAT].append(atom_struct) if counter == data_tpl_content[NUM_ATOMS]: counter = 0 section = None # Now that finished reading all atom lines... # Check and process! if len(water_dict) == 0: raise InvalidDataError('Found no water molecules. Check that the input types {} = {} ' 'and {} = {} are in the dump ' 'file.'.format(WAT_O_TYPE, cfg[WAT_O_TYPE], WAT_H_TYPE, cfg[WAT_H_TYPE])) if excess_proton is None: if len(hydronium) != 4: raise InvalidDataError('Did not find an excess proton or one hydronium ion. Check dump ' 'file and input types: {} = {}; {} = {}; {} = {}' .format(PROT_H_TYPE, cfg[PROT_H_TYPE], H3O_O_TYPE, cfg[H3O_O_TYPE], H3O_H_TYPE, cfg[H3O_H_TYPE])) else: if len(hydronium) != 0: raise InvalidDataError('Found an excess proton and a hydronium atoms. 
Check dump file ' 'and input types: {} = {}; {} = {}; {} = {}' .format(PROT_H_TYPE, cfg[PROT_H_TYPE], H3O_O_TYPE, cfg[H3O_O_TYPE], H3O_H_TYPE, cfg[H3O_H_TYPE])) deprotonate(cfg, atom_lists[PROT_RES], excess_proton, hydronium, water_dict, box, data_tpl_content) # Ensure in correct order for printing atom_lists[HYD_MOL] = assign_hyd_mol(cfg, hydronium) atom_lists[WAT_MOL] = sort_wat_mols(cfg, water_dict) for a_list in atom_list_order: dump_atom_data += atom_lists[a_list] # overwrite atom_num, mol_num, atom_type, charge, then description for index in range(len(dump_atom_data)): if dump_atom_data[index][3] == data_tpl_content[ATOMS_CONTENT][index][3] or \ dump_atom_data[index][0] in cfg[PROT_TYPE_IGNORE_ATOMS]: dump_atom_data[index][0:4] = data_tpl_content[ATOMS_CONTENT][index][0:4] dump_atom_data[index][7] = ' '.join(data_tpl_content[ATOMS_CONTENT][index][7:]) else: raise InvalidDataError("In reading file: {}\n found atom index {} with charge {} which " "does not match the charge in the data template ({}). \n" "To ignore this mis-match, list " "the atom's index number in the keyword '{}' in the ini file." "".format(dump_file, dump_atom_data[index][0], dump_atom_data[index][3], data_tpl_content[ATOMS_CONTENT][index][3], PROT_TYPE_IGNORE_ATOMS)) d_out = create_out_fname(dump_file, suffix='_' + str(timestep), ext='.data', base_dir=cfg[OUT_BASE_DIR]) data_tpl_content[HEAD_CONTENT][0] = "Created by evbdump2data from {} " \ "timestep {}".format(dump_file, timestep) list_to_file(data_tpl_content[HEAD_CONTENT] + dump_atom_data + data_tpl_content[TAIL_CONTENT], d_out) counter += 1 if counter == 1: print("Completed reading dumpfile {}".format(dump_file)) else: warning("Dump file {} step {} did not have the full list of atom numbers. " "Continuing program.".format(dump_file, timestep))
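# Editor's note: a small illustrative sketch (input format assumed from the SEC_BOX_SIZE handling
# above; the values are hypothetical) of the box bookkeeping: each bounds line holds "lo hi", and
# the box edge length along that axis is their difference.
import numpy as np

def box_from_bounds(bounds_lines):
    """Return a length-3 array of box edge lengths from three 'lo hi' lines."""
    box = np.zeros((3,))
    for axis, bound_line in enumerate(bounds_lines):
        low, high = map(float, bound_line.split()[:2])
        box[axis] = high - low
    return box

# Example: box_from_bounds(["0.0 30.0", "-5.0 25.0", "0.0 60.0"]) -> array([30., 30., 60.])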
def print_per_frame(dump_file, cfg, data_to_print, out_fieldnames, write_mode): f_out = create_out_fname(dump_file, suffix="_sum", ext=".csv", base_dir=cfg[OUT_BASE_DIR]) write_csv(data_to_print, f_out, out_fieldnames, extrasaction="ignore", mode=write_mode)
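# Editor's note: hedged usage sketch for print_per_frame. The helper names and keyword arguments
# are taken from this module's own calls; the field names and data below are hypothetical. The
# first frame is written with mode 'w', later frames appended with mode 'a', so one summary CSV
# accumulates per dump file.
# example_fields = ['timestep', 'num_waters']
# print_per_frame('run1.dump', cfg, [{'timestep': 0, 'num_waters': 512}], example_fields, 'w')
# print_per_frame('run1.dump', cfg, [{'timestep': 10, 'num_waters': 512}], example_fields, 'a')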
def process_data_file(cfg, chk_atom_type, data_dict, data_file, data_tpl_content): with open(data_file) as d: pdb_data_section = copy.deepcopy(data_tpl_content[ATOMS_CONTENT]) pdb_atom_num = len(pdb_data_section) section = SEC_HEAD atom_id = 0 num_atoms = None atom_types = [] for line in d: line = line.strip() # not currently keeping anything from the header; just check num atoms if section == SEC_HEAD: if ATOMS_PAT.match(line): section = SEC_ATOMS elif num_atoms is None: atoms_match = NUM_ATOMS_PAT.match(line) if atoms_match: # regex is 1-based num_atoms = int(atoms_match.group(1)) if num_atoms != pdb_atom_num: raise InvalidDataError("Mismatched numbers of atoms: \n" " Found {} atoms in file: {}\n" " and {} atoms in file: {}\n" "".format(pdb_atom_num, cfg[PDB_TPL_FILE], num_atoms, data_file)) # atoms_content to contain only xyz; also perform some checking elif section == SEC_ATOMS: if len(line) == 0: continue split_line = line.split() # Not currently checking molecule number # If decide to do so, should make a count from 1 as the PDB is read; the PDB does not # have to start from 1, but the data file counts molecules from 1. For now, decided # checking atom type is a sufficient check # mol_num = int(split_line[1]) # Keep as string; json save as string and this helps compare atom_types.append(split_line[2]) pdb_data_section[atom_id][5:8] = map(float, split_line[4:7]) atom_id += 1 # Check after increment because the counter started at 0 if atom_id == num_atoms: # Since the tail will come only from the template, nothing more is needed. break # Now that finished reading the file... if atom_id != num_atoms: raise InvalidDataError('In data file: {}\n' ' header section lists {} atoms, but found {} atoms'.format(data_file, num_atoms, atom_id)) if chk_atom_type: for data_type, atom in zip(atom_types, pdb_data_section): try: pdb_type = atom[2] + atom[3] if pdb_type not in data_dict[data_type]: warning('Did not find type {} in dictionary of values for atom_type {}: ({})' ''.format(pdb_type, data_type, data_dict[data_type])) # print("atom", atom_type, data_dict[atom_type]) except KeyError: warning('Did not find data file atom type {} in the atom type dictionary {}' ''.format(data_type, cfg[ATOM_TYPE_DICT_FILE])) f_name = create_out_fname(data_file, ext='.pdb', base_dir=cfg[OUT_BASE_DIR]) list_to_file(data_tpl_content[HEAD_CONTENT] + pdb_data_section + data_tpl_content[TAIL_CONTENT], f_name, list_format=cfg[PDB_FORMAT])
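# Editor's note: minimal sketch (not the original implementation; names are illustrative) of the
# atom-type cross-check performed above: each data-file atom type is looked up in a dictionary
# mapping it to the PDB atom+residue types seen before, and anything missing or inconsistent is
# reported.
def check_atom_types(data_types, pdb_types, type_dict):
    """Yield (data_type, pdb_type) pairs that are missing from or inconsistent with type_dict."""
    for data_type, pdb_type in zip(data_types, pdb_types):
        known = type_dict.get(data_type)
        if known is None or pdb_type not in known:
            yield data_type, pdb_type

# Example: list(check_atom_types(['12', '12'], ['CA ALA', 'CB ALA'], {'12': ['CA ALA']}))
# -> [('12', 'CB ALA')]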
def make_summary(output_file, summary_file, cfg): low, high, headers = get_param_info(cfg) latest_output = np.loadtxt(output_file, dtype=np.float64) # append last best resid low = np.append(low, np.nan) high = np.append(high, np.nan) headers.append('resid') base_dir = os.path.dirname(output_file) latest_output = np.append(latest_output, get_resid(base_dir)) if os.path.isfile(summary_file): last_row = None percent_diffs = [] previous_output = np.loadtxt(summary_file, dtype=np.float64) all_output = np.vstack((previous_output, latest_output)) for row in all_output: if last_row is not None: diff = row - last_row percent_diff = {} # Check data for small values, hitting upper or lower bound, and calc % diff for index, val in enumerate(np.nditer(row)): if abs(val) < TOL: warning("Small value ({}) encountered for parameter {} (col {})" "".format(val, headers[index], index)) if abs(diff[index]) > TOL: if abs(last_row[index]) > TOL: percent_diff[headers[index]] = "%8.2f" % (diff[index] / last_row[index] * 100) else: percent_diff[headers[index]] = ' ' if abs(val-low[index]) < TOL: warning("Value ({}) near lower bound ({}) encountered for parameter {} (col {})." "".format(val, low[index], headers[index], index)) if abs(val-high[index]) < TOL: warning("Value ({}) near upper bound ({}) encountered for parameter {} (col {})." "".format(val, high[index], headers[index], index)) else: percent_diff[headers[index]] = ' ' percent_diffs.append(percent_diff) last_row = row # format for gnuplot and np.loadtxt f_out = create_out_fname(summary_file, suffix='_perc_diff', ext='.csv', base_dir=cfg[MAIN_SEC][OUT_BASE_DIR]) write_csv(percent_diffs, f_out, headers, extrasaction="ignore") print('Wrote file: {}'.format(f_out)) f_out = create_out_fname(summary_file, ext='.csv', base_dir=cfg[MAIN_SEC][OUT_BASE_DIR]) with open(f_out, 'w') as s_file: s_file.write(','.join(headers)+'\n') np.savetxt(s_file, all_output, fmt='%8.6f', delimiter=',') print('Wrote file: {}'.format(f_out)) # in addition to csv (above), print format for gnuplot and np.loadtxt with open(summary_file, 'w') as s_file: np.savetxt(s_file, all_output, fmt='%12.6f') print(summary_file) print("Wrote summary file {}".format(summary_file)) else: # have this as sep statement, because now printing a 1D array, handled differently than 2D array (newline=' ') with open(summary_file, 'w') as s_file: np.savetxt(s_file, latest_output, fmt='%12.6f', newline=' ') print("Wrote results from {} to new summary file {}".format(output_file, summary_file))
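# Editor's note: a standalone sketch (assumptions: a 2-D float array and a small tolerance, as
# used above; the function name is illustrative) of the row-over-row percent-difference logic in
# make_summary, without the bound checks.
import numpy as np

def percent_diffs_by_row(all_output, tol=1e-6):
    """Return a list of dicts mapping column index -> percent change from the previous row."""
    diffs = []
    for prev_row, row in zip(all_output[:-1], all_output[1:]):
        row_diff = {}
        for index, (old, new) in enumerate(zip(prev_row, row)):
            if abs(new - old) > tol and abs(old) > tol:
                row_diff[index] = (new - old) / old * 100.0
        diffs.append(row_diff)
    return diffs

# Example: percent_diffs_by_row(np.array([[1.0, 2.0], [1.1, 2.0]])) -> [{0: ~10.0}]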
def process_pdb(cfg, atom_num_dict, mol_num_dict, element_dict): pdb_loc = cfg[PDB_FILE] pdb_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []} # to allow warning to be printed once and only once missing_types = [] qmmm_elem_id_dict = {} ca_res_atom_id_dict = {} cb_res_atom_id_dict = {} atoms_for_vmd = [] with open(pdb_loc) as f: wat_count = 0 atom_count = 0 mol_count = 1 current_mol = None last_mol_num = None atoms_content = [] for line in f: line = line.strip() line_len = len(line) if line_len == 0: continue line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]] # head_content to contain Everything before 'Atoms' section # also capture the number of atoms if line_head == 'REMARK' or line_head == 'CRYST1': pdb_data[HEAD_CONTENT].append(line) # atoms_content to contain everything but the xyz elif line_head == 'ATOM ': # My template PDB has ***** after atom_count 99999. Thus, I'm renumbering. Otherwise, use this: # atom_num = line[cfg[PDB_LINE_TYPE_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]] # For renumbering, making sure prints in the correct format, including num of characters: atom_count += 1 # For reordering atoms if atom_count in atom_num_dict: atom_id = atom_num_dict[atom_count] else: atom_id = atom_count if atom_id > 99999: atom_num = format(atom_id, 'x') if len(atom_num) > 5: warning("Hex representation of {} is {}, which is greater than 5 characters. This " "will affect the PDB output formatting.".format(atom_id, atom_num)) else: atom_num = '{:5d}'.format(atom_id) atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]] res_type = line[cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]] mol_num = int(line[cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]]) pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]]) pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]]) pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]]) occ_t = line[cfg[PDB_Z_LAST_CHAR]:cfg[PDB_LAST_T_CHAR]] element = line[cfg[PDB_LAST_T_CHAR]:cfg[PDB_LAST_ELEM_CHAR]] last_cols = line[cfg[PDB_LAST_ELEM_CHAR]:] # For user-specified changing of molecule number if mol_num in mol_num_dict: mol_num = mol_num_dict[mol_num] # If doing water molecule checking... if cfg[FIRST_WAT_ID] <= atom_count <= cfg[LAST_WAT_ID]: if (wat_count % 3) == 0: current_mol = mol_num if atom_type != ' OH2 ': warning('Expected an OH2 atom to be the first atom of a water molecule. ' 'Check line: {}'.format(line)) # last_cols = ' 0.00 0.00 S2 O' else: if current_mol != mol_num: warning('Water not in order on line:', line) if (wat_count % 3) == 1: if atom_type != ' H1 ': warning('Expected an H1 atom to be the second atom of a water molecule. ' 'Check line: {}'.format(line)) else: if atom_type != ' H2 ': warning('Expected an H2 atom to be the third atom of a water molecule. ' 'Check line: {}'.format(line)) wat_count += 1 if mol_num in cfg[RESID_QMMM] and atom_type not in SKIP_ATOM_TYPES: if atom_type == C_ALPHA: ca_res_atom_id_dict[mol_num] = atom_id else: if atom_type == C_BETA: cb_res_atom_id_dict[mol_num] = atom_id if atom_type in element_dict: element = element_dict[atom_type] else: raise InvalidDataError("Did not find atom type '{}' in the element dictionary. Please " "provide a new atom type, element dictionary (using keyword {} " "in the configuration file) that includes all atom types in the " "residues identified with the '{}' key." "".format(atom_type, ELEMENT_DICT_FILE, RESID_QMMM)) if element in qmmm_elem_id_dict: qmmm_elem_id_dict[element].append(atom_id) else: qmmm_elem_id_dict[element] = [atom_id] atoms_for_vmd.append(atom_id - 1) if cfg[ADD_ELEMENTS] and atom_count <= cfg[LAST_ADD_ELEM]: if atom_type in element_dict: element = element_dict[atom_type] else: if atom_type not in missing_types: warning("Please add atom type '{}' to dictionary of elements. Will not write/overwrite " "element type in the pdb output.".format(atom_type)) missing_types.append(atom_type) # For numbering molecules from 1 to end if cfg[RENUM_MOL]: if last_mol_num is None: last_mol_num = mol_num if mol_num != last_mol_num: last_mol_num = mol_num mol_count += 1 if mol_count == 10000: warning("Molecule numbers greater than 9999 will be printed in hex") # Due to PDB format constraints, need to print in hex starting at 10000 molecules. if mol_count > 9999: mol_num = format(mol_count, 'x') if len(mol_num) > 4: warning("Hex representation of {} is {}, which is greater than 4 characters. This " "will affect the PDB output formatting.".format(mol_count, mol_num)) else: mol_num = '{:4d}'.format(mol_count) line_struct = [line_head, atom_num, atom_type, res_type, mol_num, pdb_x, pdb_y, pdb_z, occ_t, element, last_cols] atoms_content.append(line_struct) # tail_content to contain everything after the 'Atoms' section else: pdb_data[TAIL_CONTENT].append(line) # Only sort if there is renumbering if len(atom_num_dict) > 0: pdb_data[ATOMS_CONTENT] = sorted(atoms_content, key=lambda entry: entry[1]) else: pdb_data[ATOMS_CONTENT] = atoms_content if cfg[PDB_NEW_FILE] is None: f_name = create_out_fname(cfg[PDB_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR]) else: f_name = create_out_fname(cfg[PDB_NEW_FILE], base_dir=cfg[OUT_BASE_DIR]) print_pdb(pdb_data[HEAD_CONTENT], pdb_data[ATOMS_CONTENT], pdb_data[TAIL_CONTENT], f_name, cfg[PDB_FORMAT]) if len(cfg[RESID_QMMM]) > 0: f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR]) print_mode = "w" for elem in qmmm_elem_id_dict: print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode) print_mode = 'a' print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode) f_name = create_out_fname('vmd_protein_atoms.dat', base_dir=cfg[OUT_BASE_DIR]) list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
def process_data_file(atom_type_dict, data_file, data_tpl_content, new_data_section): with open(data_file) as d: section = SEC_HEAD atom_id = 0 num_atoms = None for line in d.readlines(): line = line.strip() # not keeping anything from the header if section == SEC_HEAD: if ATOMS_PAT.match(line): section = SEC_ATOMS elif num_atoms is None: atoms_match = NUM_ATOMS_PAT.match(line) if atoms_match: # regex is 1-based num_atoms = int(atoms_match.group(1)) if num_atoms != len(data_tpl_content[ATOMS_CONTENT]): raise InvalidDataError('The number of atoms in the template file ({}) does ' 'not equal the number of atoms ({}) in the data file: {}.' ''.format(data_tpl_content[NUM_ATOMS], num_atoms, data_file)) # atoms_content to grab xyz and pbc rep; also perform some checking elif section == SEC_ATOMS: if len(line) == 0: continue split_line = line.split() # Not currently checking molecule number; the number may be wrong and the data still correct, # because of the reordering I did to match the template ordering. # Thus, I don't need: # mol_num = int(split_line[1]) # Perform checking that the atom type in the corresponding line of the template file matches # the current file try: old_atom_type = int(split_line[2]) # Add in the xyz coordinates new_data_section[atom_id][4:7] = map(float, split_line[4:7]) except (IndexError, ValueError): raise InvalidDataError("In attempting to read {} atoms from file: {}\n " "expected, but did not find, three ints followed by four floats on " "line: {}\n " "Check input".format(data_tpl_content[NUM_ATOMS], data_file, line)) # If there is an atom_type_dict, and the read atom type is in it.... if old_atom_type in atom_type_dict: new_atom_type = data_tpl_content[ATOMS_CONTENT][atom_id][2] matching_new_atom_type = atom_type_dict[old_atom_type] if new_atom_type != matching_new_atom_type: print('Data mismatch on atom_id {:3d}, line: {}\n Expected type {} but found type {}' ''.format(atom_id + 1, line, matching_new_atom_type, new_atom_type)) # and pbc ids, if they are there, before comments try: new_data_section[atom_id][7] = ' '.join([str(int(pbc_id)) for pbc_id in split_line[8:10]] + [new_data_section[atom_id][7]]) except (ValueError, IndexError): # if there is no pbc id info and/or comment info, no problem. Keep on. pass atom_id += 1 # Check after increment because the counter started at 0 if atom_id == num_atoms: # Since the tail will come only from the template, nothing more is needed. break # Now that finished reading the file... # Check total length # (will be wrong if got to tail before reaching num_atoms) if atom_id != num_atoms: raise InvalidDataError('The number of atoms read from the file {} ({}) does not equal ' 'the listed number of atoms ({}).'.format(data_file, atom_id, num_atoms)) # Now make new file f_name = create_out_fname(data_file, suffix='_new', ext='.data') list_to_file(data_tpl_content[HEAD_CONTENT] + new_data_section + data_tpl_content[TAIL_CONTENT], f_name) print('Completed writing {}'.format(f_name))
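# Editor's note: a standalone sketch (column positions follow the surrounding code; the sample
# line and function name are hypothetical) of the pbc-image handling above: when the two optional
# image-flag columns are present, keep them as strings in front of the existing comment text.
def append_pbc_ids(split_line, existing_comment):
    """Return 'ix iy <comment>' when two image flags are present, else the comment unchanged."""
    image_flags = split_line[8:10]
    if len(image_flags) == 2:
        return ' '.join(image_flags + [existing_comment])
    return existing_comment

# Example: append_pbc_ids(['1', '1', '5', '-0.8', '1.0', '2.0', '3.0', '0', '1', '-1'], '# CA')
# -> '1 -1 # CA'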
def make_summary(cfg): """ If the option is specified, add the last best fit output file to the list of outputs and evaluate changes @param cfg: configuration for the run @return: """ best_file = cfg[MAIN_SEC][BEST_FILE] summary_file = cfg[MAIN_SEC][SUMMARY_FILE] low, high, headers = get_param_info(cfg) latest_output = np.loadtxt(best_file, dtype=np.float64) if os.path.isfile(summary_file): last_row = None percent_diffs = [] previous_output = np.loadtxt(summary_file, dtype=np.float64) all_output = np.vstack((previous_output, latest_output)) for row in all_output: if last_row is not None: diff = row - last_row percent_diff = {} # Check data for small values, hitting upper or lower bound, and calc % diff for index, val in enumerate(np.nditer(row)): if abs(val) < TOL: warning( "Small value ({}) encountered for parameter {} (col {})" "".format(val, headers[index], index)) if abs(diff[index]) > TOL: if abs(last_row[index]) > TOL: percent_diff[headers[index]] = round( diff[index] / last_row[index] * 100, 2) else: if abs(diff[index]) > TOL: percent_diff[headers[index]] = np.inf if abs(val - low[index]) < TOL: warning( "Value ({}) near lower bound ({}) encountered for parameter {} (col {})." "".format(val, low[index], headers[index], index)) if abs(val - high[index]) < TOL: warning( "Value ({}) near upper bound ({}) encountered for parameter {} (col {})." "".format(val, high[index], headers[index], index)) else: percent_diff[headers[index]] = np.nan percent_diffs.append(percent_diff) last_row = row if len(percent_diffs) > 0: max_percent_diff = 0 max_diff_param = None for param, val in percent_diffs[-1].items(): if abs(val) > abs(max_percent_diff): max_percent_diff = val max_diff_param = param print( "Maximum (absolute value) percent difference from last read line is {} % for parameter '{}'." "".format(max_percent_diff, max_diff_param)) if cfg[MAIN_SEC][RESID_IN_BEST]: print("Percent change in residual: {} %" "".format( percent_diffs[-1][RESIDUAL + cfg[MAIN_SEC][SUM_HEAD_SUFFIX]])) # format for gnuplot and np.loadtxt f_out = create_out_fname(summary_file, suffix='_perc_diff', ext='.csv', base_dir=cfg[MAIN_SEC][OUT_BASE_DIR]) write_csv(percent_diffs, f_out, headers, extrasaction="ignore") f_out = create_out_fname(summary_file, ext='.csv', base_dir=cfg[MAIN_SEC][OUT_BASE_DIR]) with open(f_out, 'w') as s_file: s_file.write(','.join(headers) + '\n') np.savetxt(s_file, all_output, fmt='%8.6f', delimiter=',') print('Wrote file: {}'.format(f_out)) # in addition to csv (above), print format for gnuplot and np.loadtxt with open(summary_file, 'w') as s_file: np.savetxt(s_file, all_output, fmt='%12.6f') print("Wrote file: {}".format(summary_file)) else: # have this as sep statement, because now printing a 1D array, handled differently than 2D array (newline=' ') with open(summary_file, 'w') as s_file: np.savetxt(s_file, latest_output, fmt='%12.6f', newline=' ') print("Wrote results from {} to new summary file {}".format( best_file, summary_file))
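# Editor's note: minimal sketch (names are illustrative, not from the original module) of
# reporting the largest absolute percent change from the last row of percent_diffs, skipping NaN
# entries, as done above.
import math

def max_abs_change(percent_diff_row):
    """Return (parameter, value) with the largest |percent change|, or (None, 0.0) if empty."""
    best_param, best_val = None, 0.0
    for param, val in percent_diff_row.items():
        if isinstance(val, float) and math.isnan(val):
            continue
        if abs(val) > abs(best_val):
            best_param, best_val = param, val
    return best_param, best_val

# Example: max_abs_change({'k_bond': -12.5, 'r_eq': 0.3, 'resid': float('nan')})
# -> ('k_bond', -12.5)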
def process_files(comp_f_list, col_name, base_out_name, delimiter, sep_out_flag, out_location): """ Want to grab the timestep, first and 2nd mole found, first and 2nd ci^2 print the timestep, residue ci^2 @param comp_f_list: a list of lists of file names to process (file read during input processing) @param col_name: name of column to use for alignment @param base_out_name: name of file to be created, or suffix if multiple files to be created @param delimiter: string, delimiter separating file names on lines of the comp_f_list @param sep_out_flag: a boolean to note if separate output files should be made based on each row of input @param out_location: user-specified location for the output files, if specified @return: @raise InvalidDataError: """ all_dicts = defaultdict(dict) # if need multiple output files, designate them by adding a prefix prefix = '' # if there will be multiple output files, make sure do not reuse a prefix, so keep copy of used names prefix_used = [] # if one output file from multiple sets of file to combine, will change write_mode to append later write_mode = 'w' # we don't have to specify run names in the output if there one row set of files to combine, # or if there will be separate output files if len(comp_f_list) < 2 or sep_out_flag: add_run_name = False headers = [] else: add_run_name = True headers = [RUN_NAME] for line_num, line in enumerate(comp_f_list): dict_keys = None if sep_out_flag: headers = [] all_dicts = defaultdict(dict) # separate on delimiter, strip any white space, and also get rid of empty entries comp_files = filter(None, [c_file.strip() for c_file in line.split(delimiter)]) # get the common part of the name, if it exists; otherwise, give the name the line index for file_index, file_name in enumerate(comp_files): base_name = os.path.splitext(os.path.basename(file_name))[0] if file_index == 0: run_name = base_name else: run_name = longest_common_substring(run_name, base_name) if run_name == '': # because will use run_name as a string, need to convert it run_name = str(line_num) + "_" for c_file in comp_files: new_dict = read_csv_to_dict(c_file, col_name) if dict_keys is None: dict_keys = new_dict.keys() else: dict_keys = set(dict_keys).intersection(new_dict.keys()) new_dict_keys = six.next(six.itervalues(new_dict)).keys() # Get the keys for the inner dictionary; diff methods for python 2 and 3 so use six # expect to only get new headers when making a new file (write_mode == 'w') # for the next file, will not gather more headers. When printed, extra cols will be skipped, and # missing columns will have no data shown if write_mode == 'w': for key in new_dict_keys: if key in headers: # okay if already have header if the header is the column. # If we are going to append, we also expect to already have the header name if key != col_name: warning("Non-unique column name {} found in {}. 
" "Values will be overwritten.".format(key, c_file)) else: headers.append(key) for new_key in new_dict.items(): all_dicts[new_key[0]].update(new_key[1]) final_dict = [] for key in sorted(dict_keys): final_dict.append(all_dicts[key]) # final_dict.append(all_dicts[key].update({RUN_NAME: run_name})) if add_run_name: for each_dict in final_dict: each_dict.update({RUN_NAME: run_name}) # Possible to have no overlap in align column if len(final_dict) > 0: # make sure col_name appears first by taking it out before sorting if sep_out_flag: prefix = run_name if prefix == '' or prefix in prefix_used: prefix = str(line_num) + "_" # have a consistent output by sorting the headers, but keep the aligning column first # only needs to be done for printing the first time if write_mode == 'w': headers.remove(col_name) headers = [col_name] + sorted(headers) if add_run_name: headers.remove(RUN_NAME) headers = [RUN_NAME] + headers f_name = create_out_fname(base_out_name, prefix=prefix, base_dir=out_location) prefix_used.append(prefix) write_csv(final_dict, f_name, headers, mode=write_mode) if not sep_out_flag and write_mode == 'w': write_mode = 'a' else: raise InvalidDataError("No common values found for column {} among files: {}" "".format(col_name, ", ".join(comp_files)))
def process_file(data_file, mcfg, delimiter=','): list_vectors, headers = read_csv_to_list(data_file, delimiter=delimiter, header=True) col_index_dict = {} for section in SUB_SECTIONS: col_index_dict[section] = {} for key, val in mcfg[section].items(): if key in headers: # Parser already made sure that unique entries col_index_dict[section][headers.index(key)] = val else: raise InvalidDataError("Key '{}' found in configuration file but not in data file: " "{}".format(key, data_file)) # set up bins, if needed bin_arrays = {} bin_labels = {} bin_counts = {} bin_ctrs = {} max_bins = {} for bin_col, col_bin_data in col_index_dict[BIN_SEC].items(): bin_min = col_bin_data[0] bin_max = col_bin_data[1] num_bins = col_bin_data[2] max_bins[bin_col] = col_bin_data[3] # already checked that 1 or more bins, so will not divide by zero bin_width = (bin_max - bin_min) / num_bins # set up for np.searchsorted, not np.histogram col_bins = np.arange(bin_min + bin_width, bin_max, bin_width) # set up for recording assigned bin center bin_ctrs[bin_col] = [round_to_print(ctr) for ctr in np.arange(bin_min + bin_width/2, bin_max, bin_width)] bin_counts[bin_col] = [0] * len(bin_ctrs[bin_col]) bin_arrays[bin_col] = col_bins bin_labels[bin_col] = '{0}_bin'.format(headers[bin_col]) headers = [bin_labels[bin_col]] + headers # allow filtering based on min and max col_index_dict[MIN_SEC][bin_col] = bin_min col_index_dict[MAX_SEC][bin_col] = bin_max initial_row_num = len(list_vectors) filtered_vectors = [] for row in list_vectors: keep_row = True for col, max_val in col_index_dict[MAX_SEC].items(): if row[col] > max_val: keep_row = False for col, min_val in col_index_dict[MIN_SEC].items(): if row[col] < min_val: keep_row = False if keep_row: for col_id, col_bins in bin_arrays.items(): bin_index = np.searchsorted(col_bins, row[col_id]) row = [bin_ctrs[col_id][bin_index]] + row bin_counts[col_id][bin_index] += 1 filtered_vectors.append(row) print("Keeping {} of {} rows based on filtering criteria".format(len(filtered_vectors), initial_row_num)) # Print output and determine if the output needs to be adjusted because of a max number of entries per bin ctr_format = "{:^11} {:^8}" ctr_format_max = "{:^11} {:^8} {:^7}" excess_bins = {} for col_bin in bin_arrays: print("Histogram data for column '{}': ".format(bin_labels[col_bin])) if max_bins[col_bin] is None: print(ctr_format.format('bin_ctr', 'count')) for bin_index, bin_ctr in enumerate(bin_ctrs[col_bin]): print(ctr_format.format(bin_ctr, bin_counts[col_bin][bin_index])) else: bin_max = max_bins[col_bin] excess_bins[col_bin] = {} print(ctr_format_max.format('bin_ctr', 'found', 'keep')) for bin_index, bin_ctr in enumerate(bin_ctrs[col_bin]): num_found = bin_counts[col_bin][bin_index] if num_found > bin_max: num_keep = bin_max # use bin_ctr as key because that is what is saved on the row excess_bins[col_bin][bin_ctrs[col_bin][bin_index]] = {QUOT: num_found / bin_max, MOD: num_found % bin_max} else: num_keep = num_found print(ctr_format_max.format(bin_ctr, num_found, num_keep)) if len(excess_bins) == 1: count_bin = {} delete_rows = [] mod_r = {} quot_r = {} for col_bin in excess_bins: for bin_remove, bin_dict in excess_bins[col_bin].items(): mod_r[bin_remove] = bin_dict[MOD] quot_r[bin_remove] = bin_dict[QUOT] count_bin[bin_remove] = 0 r_count = 0 for row_id, row in enumerate(filtered_vectors): bin_name = row[0] # print(bin_name) if bin_name in excess_bins[col_bin]: count_bin[bin_name] += 1 if count_bin[bin_name] % quot_r[bin_name] != 0 or count_bin[bin_name] <= mod_r[bin_name]: 
delete_rows.append(row_id) # print(row_id) r_count += 1 filtered_vectors = [row for row_id, row in enumerate(filtered_vectors) if row_id not in delete_rows] if len(excess_bins) > 1: warning("No filtering based on a max number of entries will be done; this feature is currently implemented " "only for binning with one column's values.") f_name = create_out_fname(data_file, prefix='filtered_', ext='.csv') list_to_csv([headers] + filtered_vectors, f_name, delimiter=',')
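# Editor's note: minimal sketch (illustrative values and name) of the binning convention above:
# bin edges are built for np.searchsorted (interior edges only), and each value is labelled with
# the center of the bin it falls in.
import numpy as np

def bin_centers_for(values, bin_min, bin_max, num_bins):
    """Return the bin-center label for each value, mirroring the searchsorted setup above."""
    bin_width = (bin_max - bin_min) / num_bins
    edges = np.arange(bin_min + bin_width, bin_max, bin_width)        # interior edges only
    centers = np.arange(bin_min + bin_width / 2, bin_max, bin_width)  # one center per bin
    return [centers[np.searchsorted(edges, val)] for val in values]

# Example: bin_centers_for([0.1, 1.4, 1.9], 0.0, 2.0, 4) -> [0.25, 1.25, 1.75]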