def main():
    """Count the models of every simulation and print a summary.

    For each path yielded by ``SimIt(PELE_sim_paths)`` the report files are
    collected, ``parallel_models_counter`` is mapped over them in a process
    pool of ``proc_number`` workers and the per-report results are summed.
    All totals are printed first; the totals that fall below
    ``warning_threshold`` are then printed again under a warnings header.
    """
    # Parse args
    PELE_sim_paths, output_path, proc_number, warning_threshold = parse_args()

    totals_by_sim = {}
    for PELE_sim_path in SimIt(PELE_sim_paths):
        report_finder = SimIt(PELE_sim_path)
        report_finder.build_repo_it(output_path, 'report')
        report_files = list(report_finder.repo_it)

        with Pool(proc_number) as workers:
            per_report = workers.map(parallel_models_counter, report_files)

        totals_by_sim[PELE_sim_path] = sum(per_report)

    # Both sections share the same line layout
    line_template = ' - {:<100}: {:10d} models'

    print('Results:')
    for sim, total in totals_by_sim.items():
        print(line_template.format(str(sim), total))

    # Only simulations below the threshold are repeated as warnings
    flagged = {
        sim: total
        for sim, total in totals_by_sim.items()
        if total < warning_threshold
    }
    print('Warnings:')
    for sim, total in flagged.items():
        print(line_template.format(str(sim), total))
def main():
    """Build a bar plot of H bond statistics from one or more H bond files.

    The H bond file paths returned by ``parse_args`` are expanded with
    ``glob``. Every matching file is loaded with ``create_df`` and reduced
    to a per-file counter according to ``mode``:

    - ``"count"``: ``count``
    - ``"relative_frequency"``: ``count_norm``
    - ``"frequent_interactions"``: ``count_norm`` followed by
      ``discard_non_frequent`` with ``lim``
    - ``"mean_energies"``: ``count_energy`` using the metrics extracted
      from the simulation reports (column index 4)

    The per-file counters are merged with ``combine_results`` and plotted
    with ``generate_barplot``. The plot's output path is derived from the
    last processed H bond file; it is ``None`` when no relative output
    path was supplied.

    Raises:
        ValueError: if ``mode`` is not one of the supported modes or if no
            file matches the supplied paths.
    """
    # NOTE(review): PELE_report_name is parsed but never used below; the
    # report name passed to build_repo_it is hard-coded to 'report'.
    # Confirm whether it should be forwarded instead.
    (hb_paths, mode, lim, epochs_to_ignore, trajectories_to_ignore,
     models_to_ignore, relative_output_path, proc_number,
     PELE_output_path, PELE_report_name) = parse_args()

    # Fail early: an unknown mode would otherwise leave `counter` unbound
    supported_modes = ("count", "relative_frequency",
                       "frequent_interactions", "mean_energies")
    if mode not in supported_modes:
        raise ValueError('Unsupported mode \'{}\''.format(mode))

    # A single pattern and a list of patterns are handled the same way
    patterns = hb_paths if isinstance(hb_paths, list) else [hb_paths]
    hb_paths_list = []
    for pattern in patterns:
        hb_paths_list += glob.glob(pattern)

    # Without any file there is nothing to plot and no output path to use
    if not hb_paths_list:
        raise ValueError('No H bonds file matched the supplied paths')

    general_results = {}
    for hb_path in hb_paths_list:
        df = create_df(hb_path)

        # Calculate hbond_atoms, which is a dict with PELE_ids as key and
        # corresponding lists of H bonds as values
        hbond_atoms = get_hbond_atoms_from_df(df, hb_path, epochs_to_ignore,
                                              trajectories_to_ignore,
                                              models_to_ignore)

        if relative_output_path is not None:
            output_path = Path(hb_path).parent.joinpath(relative_output_path)
        else:
            output_path = relative_output_path

        if mode == "count":
            counter = count(hbond_atoms)
        elif mode == "relative_frequency":
            counter = count_norm(hbond_atoms)
        elif mode == "frequent_interactions":
            counter = count_norm(hbond_atoms)
            counter = discard_non_frequent(counter, lim)
        else:  # mode == "mean_energies"
            sim_it = SimIt(Path(hb_path).parent)
            sim_it.build_repo_it(PELE_output_path, 'report')
            reports = list(sim_it.repo_it)

            PELE_ids = extract_PELE_ids(reports)
            metrics = extract_metrics(reports, (4, ), proc_number)

            # Flatten every chunk of metrics into a plain list of floats
            ies = [
                list(map(float, np.concatenate(ies_chunk)))
                for ies_chunk in metrics
            ]

            ie_by_PELE_id = get_metric_by_PELE_id(PELE_ids, ies)
            counter = count_energy(hbond_atoms, ie_by_PELE_id)

        general_results[hb_path] = counter

    combined_results = combine_results(general_results, mode)

    generate_barplot(combined_results, mode, lim, output_path)
def main():
    """Write summary statistics of subpocket intersections per simulation.

    For every simulation path yielded by ``SimIt(PELE_sim_paths)`` the csv
    file ``csv_file`` is read, the columns whose name contains
    ``'_intersection'`` are selected and, for each of them, the mean, min,
    max, median and the 5th/25th/75th/95th percentiles are written to
    ``output_name`` inside the simulation directory. Simulations without
    the csv file or without any matching column are skipped.
    """
    suffix = '_intersection'

    def subpocket_name(column):
        # Drop the suffix itself. str.strip(suffix) would instead remove
        # any leading/trailing characters contained in the suffix and
        # mangle the subpocket name.
        if column.endswith(suffix):
            return column[:-len(suffix)]
        return column

    # Parse args
    PELE_sim_paths, csv_file, output_name = parse_args()

    all_sim_it = SimIt(PELE_sim_paths)

    print(' - Simulations that will be analyzed:')
    for sim_path in all_sim_it:
        print(' - {}'.format(sim_path.name))

    for PELE_sim_path in all_sim_it:
        print('')
        print(' - Analyzing {}'.format(PELE_sim_path))

        csv_path = PELE_sim_path.joinpath(csv_file)
        if not csv_path.is_file():
            print(' - Skipping simulation because intersections csv file '
                  + 'was missing')
            continue

        data = pd.read_csv(str(csv_path))
        # Discard the unnamed index columns that pandas may have saved
        data = data.loc[:, ~data.columns.str.contains('^Unnamed')]

        columns = []
        print(' - Subpockets found:')
        for col in data.columns:
            if suffix in col:
                columns.append(col)
                print(' - {}'.format(subpocket_name(col)))

        if len(columns) == 0:
            print(' - Skipping simulation because no subpocket was found')
            continue

        with open(str(PELE_sim_path.joinpath(output_name)), 'w') as f:
            f.write(' - Subpocket results:\n')
            for col in columns:
                intersects = data.loc[:, col].to_numpy()
                f.write(' - {}:\n'.format(subpocket_name(col)))

                # (line template, value) pairs in the order they are written
                stats = (
                    (' - Mean: {: 7.2f}\n', np.mean(intersects)),
                    (' - Min: {: 7.2f}\n', np.min(intersects)),
                    (' - 5th percentile: {: 7.2f}\n',
                     np.percentile(intersects, 5)),
                    (' - 1st quartile: {: 7.2f}\n',
                     np.percentile(intersects, 25)),
                    (' - Median: {: 7.2f}\n', np.median(intersects)),
                    (' - 3rd quartile: {: 7.2f}\n',
                     np.percentile(intersects, 75)),
                    (' - 95th percentile: {: 7.2f}\n',
                     np.percentile(intersects, 95)),
                    (' - Max: {: 7.2f}\n', np.max(intersects)),
                )
                for template, value in stats:
                    f.write(template.format(value))
def main():
    """Plot the frequency of selected H bonds against pIC50.

    For every simulation path yielded by ``SimIt(PELE_sim_paths)`` the
    filtered H bonds csv (``;``-separated) is read and, for each requested
    H bond, the first not-yet-selected column whose name contains it is
    kept together with the ``donors`` and ``acceptors`` columns. The
    collected rows are merged with the IC50 table on ``path`` and pIC50 is
    computed as ``-log10(IC50 / 1000000)``. If ``normalize`` is set, each
    H bond column is divided by ``donors + acceptors``.

    One subplot per H bond is drawn, two per row, with a linear regression
    line and its r2 score, and the figure is saved to
    ``Hbond_correlations.png``.

    The H bond columns that are plotted are the ones selected for the last
    simulation that was read.

    Raises:
        ValueError: if no H bonds to track were defined or if no H bond
            data could be retrieved from the supplied simulations.
    """
    # Parse args
    PELE_sim_paths, filtered_hbonds_path, ic50_csv, hbonds, normalize = \
        parse_args()

    if len(hbonds) == 0:
        raise ValueError('No H bonds to track were defined')

    all_sim_it = SimIt(PELE_sim_paths)

    print(' - Simulations that will be analyzed:')
    for sim_path in all_sim_it:
        print(' - {}'.format(sim_path.name))

    data = pd.DataFrame()
    # Defined up front so it exists even when every simulation is skipped
    spec_hbonds = []
    for PELE_sim_path in all_sim_it:
        print('')
        print(' - Reading data from {}'.format(PELE_sim_path))

        hbonds_csv = PELE_sim_path.joinpath(filtered_hbonds_path)
        if not hbonds_csv.is_file():
            print(' - Skipping simulation because filtered H bonds csv file '
                  + 'was missing')
            continue

        sim_data = pd.read_csv(hbonds_csv, sep=';')

        # Resolve every requested H bond to the first matching column
        spec_hbonds = []
        for hbond in hbonds:
            for col in sim_data.columns:
                if hbond in col and col not in spec_hbonds:
                    spec_hbonds.append(col)
                    break

        print(' - Retrieving H bonds: {}'.format(spec_hbonds))

        # Work on a copy so that adding 'path' never touches the csv frame
        sim_data = sim_data.loc[:, spec_hbonds + ['donors',
                                                  'acceptors']].copy()
        sim_data['path'] = PELE_sim_path.name

        data = pd.concat((data, sim_data))

    if data.empty or not spec_hbonds:
        raise ValueError('No H bond data could be retrieved from the '
                         + 'supplied simulations')

    print(' - Retrieving IC50 values')
    ic50 = pd.read_csv(ic50_csv)

    data = data.merge(ic50, left_on='path', right_on='path')
    data['pIC50'] = -np.log10(data.loc[:, 'IC50'] / 1000000)

    if normalize:
        print(' - Normalizing H bonds')
        data['donors+acceptors'] = data['donors'] + data['acceptors']
        for hbond in spec_hbonds:
            data[hbond] = data[hbond] / data['donors+acceptors']

    # Two plots per row; an odd number of H bonds needs one extra row
    n_rows = (len(spec_hbonds) + 1) // 2

    # squeeze=False keeps axs two-dimensional even with a single row, so
    # that axs[row][column] indexing is always valid
    fig, axs = plt.subplots(n_rows, 2, figsize=(15, 5 * n_rows),
                            squeeze=False)
    fig.suptitle('H bond frequency vs -pIC50')

    X_all = data.loc[:, spec_hbonds].values
    y_all = data['pIC50'].values
    labels_by_point = data['path'].values

    for i, hbond in enumerate(spec_hbonds):
        ax = axs[i // 2][i % 2]
        ax.set_title(hbond)
        if normalize:
            ax.set_ylabel('Normalized frequency')
        else:
            ax.set_ylabel('Frequency')
        ax.set_xlabel('-pIC50')

        x_array = X_all[:, i]

        ax.plot(y_all, x_array, ls='', c='r', marker='x')
        ax.set_axisbelow(True)
        ax.grid(True, color='white')
        ax.set_facecolor('lightgray')

        lin_reg = LinearRegression()
        lin_reg.fit(y_all.reshape(-1, 1), x_array.reshape(-1, 1))
        y_pred = lin_reg.predict(y_all.reshape(-1, 1))

        # Extremes of the fitted values, computed once per subplot
        lowest_pred = min(y_pred)
        highest_pred = max(y_pred)

        for x, xp, y, l in zip(x_array, y_pred, y_all, labels_by_point):
            # Remember where the regression line starts and ends
            if xp == lowest_pred:
                min_y = y
            if xp == highest_pred:
                max_y = y
            ax.annotate(l.split('_')[-1], (y, x),
                        textcoords="offset points",
                        xytext=(0, 10), ha='center')

        ax.plot((min_y, max_y), (lowest_pred, highest_pred), 'k--',
                linewidth=1)
        ax.autoscale(tight=False)

        # Invisible handle: the legend is only used to display the score
        handles = [
            mpl_patches.Rectangle((0, 0), 1, 1, fc="white", ec="white",
                                  lw=0, alpha=0)
        ]
        score = "r2 = {:.3f}".format(metrics.r2_score(x_array, y_pred))
        labels = [score]

        ax.legend(handles, labels, loc='best', fontsize='small',
                  fancybox=True, framealpha=0.7, handlelength=0,
                  handletextpad=0)

    # Empty unpaired axis
    if len(spec_hbonds) % 2 == 1:
        fig.delaxes(axs[-1][1])

    plt.tight_layout(rect=(0, 0, 1, 0.97))
    plt.savefig('Hbond_correlations.png')
    plt.close()
def main():
    """Plot subpocket non-polar occupancy percentiles against pIC50.

    A first pass over the simulation paths yielded by
    ``SimIt(PELE_sim_paths)`` collects every csv column whose name contains
    ``'_nonpolar_intersection'``. A second pass computes, for each
    simulation and each of those columns, the ``percentile``-th percentile
    of the column values. The results are merged with the IC50 table on
    ``path`` and pIC50 is computed as ``-log10(IC50 / 1000000)``.

    One subplot per subpocket is drawn, two per row, with a linear
    regression line and its r2 score, and the figure is saved to
    ``subpocket_nonpolar_correlations.png``.

    Raises:
        ValueError: if no non-polar intersection column is found in any of
            the supplied simulations.
    """
    suffix = '_nonpolar_intersection'

    def subpocket_name(column):
        # Drop the suffix itself. str.strip(suffix) would instead remove
        # any leading/trailing characters contained in the suffix and
        # mangle the subpocket name.
        if column.endswith(suffix):
            return column[:-len(suffix)]
        return column

    # Parse args
    PELE_sim_paths, csv_file_name, ic50_csv, percentile = parse_args()

    all_sim_it = SimIt(PELE_sim_paths)

    print(' - Simulations that will be analyzed:')
    for sim_path in all_sim_it:
        print(' - {}'.format(sim_path.name))

    # First pass: gather the subpocket columns present in any simulation
    columns = []
    for PELE_sim_path in all_sim_it:
        if not PELE_sim_path.joinpath(csv_file_name).is_file():
            print(' - Skipping simulation because subpockets csv file '
                  + 'was missing')
            continue

        data = pd.read_csv(PELE_sim_path.joinpath(csv_file_name))
        data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
        for col in data.columns:
            if suffix in col and col not in columns:
                columns.append(col)

    print(' - Subpockets found:')
    for col in columns:
        print(' - {}'.format(subpocket_name(col)))

    if len(columns) == 0:
        raise ValueError('Subpocket nonpolar intersections were missing in '
                         + 'the simulation paths that were supplied')

    # Second pass: one row per simulation with the requested percentile
    rows = []
    for PELE_sim_path in all_sim_it:
        print('')
        print(' - Reading data from {}'.format(PELE_sim_path))

        if not PELE_sim_path.joinpath(csv_file_name).is_file():
            print(' - Skipping simulation because intersection csv file '
                  + 'is missing')
            continue

        print(' - Retrieving subpocket intersections')
        data = pd.read_csv(PELE_sim_path.joinpath(csv_file_name))

        row = [PELE_sim_path.name]
        for col in columns:
            row.append(np.percentile(data[col].values, percentile))
        rows.append(row)

    subpocket_results = pd.DataFrame(rows, columns=['path'] + columns)

    print(' - Retrieving IC50 values')
    ic50 = pd.read_csv(ic50_csv)

    subpocket_results = subpocket_results.merge(ic50, left_on='path',
                                                right_on='path')
    subpocket_results['pIC50'] = -np.log10(
        subpocket_results.loc[:, 'IC50'] / 1000000)

    # Two plots per row; an odd number of subpockets needs one extra row
    n_rows = (len(columns) + 1) // 2

    # squeeze=False keeps axs two-dimensional even with a single row, so
    # that axs[row][column] indexing is always valid
    fig, axs = plt.subplots(n_rows, 2, figsize=(15, 5 * n_rows),
                            squeeze=False)
    fig.suptitle('Subpocket non-polar occupancy vs -pIC50')

    X_all = subpocket_results.loc[:, columns].values
    y_all = subpocket_results['pIC50'].values
    paths = subpocket_results['path'].values

    for i, col in enumerate(columns):
        name = subpocket_name(col)
        ax = axs[i // 2][i % 2]
        ax.set_title(name)
        ax.set_ylabel('{}-percentile of {} occupancies'.format(
            percentile, name))
        ax.set_xlabel('-pIC50')

        x_array = X_all[:, i]

        ax.plot(y_all, x_array, ls='', c='r', marker='x')
        ax.set_axisbelow(True)
        ax.grid(True, color='white')
        ax.set_facecolor('lightgray')

        lin_reg = LinearRegression()
        lin_reg.fit(y_all.reshape(-1, 1), x_array.reshape(-1, 1))
        y_pred = lin_reg.predict(y_all.reshape(-1, 1))

        # Extremes of the fitted values, computed once per subplot
        lowest_pred = min(y_pred)
        highest_pred = max(y_pred)

        for x, xp, y, path in zip(x_array, y_pred, y_all, paths):
            # Remember where the regression line starts and ends
            if xp == lowest_pred:
                min_y = y
            if xp == highest_pred:
                max_y = y
            ax.annotate(path, (y, x), textcoords="offset points",
                        xytext=(0, 10), ha='center')

        ax.plot((min_y, max_y), (lowest_pred, highest_pred), 'k--',
                linewidth=1)
        ax.autoscale(tight=False)

        # Invisible handle: the legend is only used to display the score
        handles = [
            mpl_patches.Rectangle((0, 0), 1, 1, fc="white", ec="white",
                                  lw=0, alpha=0)
        ]
        score = "r2 = {:.3f}".format(skmetrics.r2_score(x_array, y_pred))
        labels = [score]

        ax.legend(handles, labels, loc='best', fontsize='small',
                  fancybox=True, framealpha=0.7, handlelength=0,
                  handletextpad=0)

    # Empty unpaired axis
    if len(columns) % 2 == 1:
        fig.delaxes(axs[-1][1])

    plt.tight_layout(rect=(0, 0, 1, 0.97))
    plt.savefig('subpocket_nonpolar_correlations.png')
    plt.close()
def main():
    """Compute the persistence of the tracked H bonds for each simulation.

    For every simulation path yielded by ``SimIt(PELE_sim_paths)`` the
    H bonds file and the ligand rotamer library are located; simulations
    missing either file, or whose H bonds file yields no data, are
    skipped. The persistence of every tracked H bond is obtained with
    ``hbond_persistance`` and the results are printed and written to two
    files inside the simulation directory:

    - ``output_path``: one line per tracked H bond with its persistence
      values sorted in descending order, ``;``-separated.
    - ``<output_path stem>_summary.out``: a header line and one line with
      the rotamers, donors, acceptors, number of models and the maximum
      and mean persistence of every tracked H bond.

    An H bond without persistence data is reported with a max and mean
    of 0.
    """
    # Parse args
    PELE_sim_paths, hbonds_relative_path, hbonds, output_path, lig_resname = \
        parse_args()

    output_path = Path(output_path)
    # Built from the file name so that only the final suffix is replaced,
    # regardless of what the parent directories are called
    summary_path = output_path.with_name(output_path.stem + '_summary.out')

    hbonds_to_track = get_hbond_linkers(hbonds)

    print(' - Persistance will be calculated on H bonds :')
    print_hbonds(hbonds_to_track)

    all_sim_it = SimIt(PELE_sim_paths)

    for PELE_sim_path in all_sim_it:
        print('')
        print(' - Filtering H bonds from {}'.format(PELE_sim_path))

        hbonds_path = PELE_sim_path.joinpath(hbonds_relative_path)
        lig_rotamers_path = PELE_sim_path.joinpath(
            'DataLocal/LigandRotamerLibs/{}.rot.assign'.format(lig_resname))

        if not hbonds_path.is_file():
            print(' - Skipping simulation because hbonds file was '
                  + 'missing')
            continue

        if not lig_rotamers_path.is_file():
            print(' - Skipping simulation because ligand rotamer library was'
                  + ' missing')
            continue

        hbond_data, n_donors, n_acceptors = extract_hbond_linkers(hbonds_path)

        print(' - Detected {} sets of H bonds'.format(len(hbond_data)))

        # The check is on the data read from the file, not on the H bonds
        # requested by the user
        if len(hbond_data) == 0:
            print(' - Skipping simulation because no H bonds were found')
            continue

        persistance_by_hbond = hbond_persistance(hbond_data, hbonds_to_track)
        n_rotamers = get_ligand_rotatable_bonds(lig_rotamers_path)

        # Max and mean persistence per tracked H bond, computed once and
        # reused for both the console report and the summary file
        max_by_hbond = {}
        mean_by_hbond = {}
        for hb_linker in hbonds_to_track:
            values = persistance_by_hbond.get(hb_linker, [0, ])
            max_by_hbond[hb_linker] = np.max(values)
            mean_by_hbond[hb_linker] = np.mean(values)

        print(' - Results:')
        print(' - Ligand rotamers: {:10d}'.format(n_rotamers))
        print(' - Ligand donors: {:10d}'.format(n_donors))
        print(' - Ligand acceptors: {:10d}'.format(n_acceptors))
        print(' - Total models: {:10d}'.format(len(hbond_data)))

        if len(hbonds_to_track) > 0:
            print(' - Maximum persistance by H bond:')
            for hb_linker in hbonds_to_track:
                print(' - {}:{}:{:20s} {:10d}'.format(
                    hb_linker.chain, hb_linker.residue,
                    ','.join(list(hb_linker.atoms)),
                    max_by_hbond[hb_linker]))

            print(' - Mean persistance by H bond:')
            for hb_linker in hbonds_to_track:
                print(' - {}:{}:{:20s} {:10.1f}'.format(
                    hb_linker.chain, hb_linker.residue,
                    ','.join(list(hb_linker.atoms)),
                    mean_by_hbond[hb_linker]))

        with open(str(PELE_sim_path.joinpath(output_path)), 'w') as f:
            for hb_linker in hbonds_to_track:
                f.write('{}:{}:{};'.format(hb_linker.chain,
                                           hb_linker.residue,
                                           ','.join(hb_linker.atoms)))
                f.write(';'.join(
                    map(str,
                        sorted(persistance_by_hbond.get(hb_linker, []),
                               reverse=True))))
                f.write('\n')

        with open(str(PELE_sim_path.joinpath(summary_path)), 'w') as f:
            f.write('rotamers;donors;acceptors;models')
            for hb_linker in hbonds_to_track:
                f.write(';maxp_{}:{}:{}'.format(hb_linker.chain,
                                                hb_linker.residue,
                                                ','.join(hb_linker.atoms)))
                f.write(';meanp_{}:{}:{}'.format(hb_linker.chain,
                                                 hb_linker.residue,
                                                 ','.join(hb_linker.atoms)))
            f.write('\n')
            f.write('{};{};{};'.format(n_rotamers, n_donors, n_acceptors))
            f.write('{}'.format(len(hbond_data)))
            for hb_linker in hbonds_to_track:
                f.write(';{:d}'.format(max_by_hbond[hb_linker]))
                f.write(';{:.1f}'.format(mean_by_hbond[hb_linker]))
def main():
    """Find the best total and interaction energies of each simulation.

    For every simulation path yielded by ``SimIt(PELE_sim_paths)`` the
    report files are collected and ``parallel_metrics_getter`` (bound to
    ``ie_col``) is mapped over them in a process pool of ``proc_number``
    workers. The lowest total energy and the lowest interaction energy are
    located, together with the report directory, trajectory number (the
    digits of the report file name) and model index they belong to.

    Inside the simulation's output directory the function writes
    ``results.out`` (ligand heavy atoms, ligand mass and both best
    energies) and saves the corresponding models as
    ``best_total_energy.pdb`` and ``best_interaction_energy.pdb``.
    Simulations without the topology file are skipped.

    When a simulation has no models at all, the best energies are written
    as ``inf`` and no PDB file is saved.
    """
    # Parse args
    PELE_sim_paths, PELE_output_path, proc_number, output_relative_path, \
        ie_col, topology_relative_path, lig_resname = parse_args()

    def trajectory_number(report):
        # The trajectory shares the number found in the report file name
        return int(''.join(filter(str.isdigit, report.name)))

    def save_model(model_id, topology, destination):
        # model_id is (report directory, trajectory number, model index)
        directory, number, index = model_id
        trajectory = md.load(
            str(directory.joinpath('trajectory_{}.xtc'.format(number))),
            top=str(topology))
        trajectory[index].save_pdb(str(destination))

    all_sim_it = SimIt(PELE_sim_paths)

    p_function = partial(parallel_metrics_getter, ie_col)

    for PELE_sim_path in all_sim_it:
        sim_it = SimIt(PELE_sim_path)
        sim_it.build_repo_it(PELE_output_path, 'report')
        print(' - Analyzing {}'.format(PELE_sim_path))

        topology_path = PELE_sim_path.joinpath(topology_relative_path)
        if not topology_path.is_file():
            print(' - Skipping simulation because topology file with '
                  + 'connectivity was missing')
            continue

        reports = list(sim_it.repo_it)
        with Pool(proc_number) as pool:
            results = pool.map(p_function, reports)

        # Start from +inf so that the lowest energy is always selected,
        # including when every energy is zero or positive
        min_te = float('inf')
        min_ie = float('inf')
        min_te_PDB_id = None
        min_ie_PDB_id = None

        for repo, (tes, ies) in zip(reports, results):
            for i, te in enumerate(tes):
                te = float(te)
                if te < min_te:
                    min_te = te
                    min_te_PDB_id = (repo.parent, trajectory_number(repo), i)

            for i, ie in enumerate(ies):
                ie = float(ie)
                if ie < min_ie:
                    min_ie = ie
                    min_ie_PDB_id = (repo.parent, trajectory_number(repo), i)

        ligand_heavy_atoms, ligand_mass = extract_ligand_properties(
            topology_path, lig_resname)

        output_path = PELE_sim_path.joinpath(output_relative_path)
        # Also creates missing parent directories and tolerates an
        # already existing output directory
        output_path.mkdir(parents=True, exist_ok=True)

        with open(str(output_path.joinpath('results.out')), 'w') as f:
            f.write('lig_heavy_atoms,lig_mass,')
            f.write('best_total_energy,best_interaction_energy\n')
            f.write('{},{:.3f},{},{}\n'.format(ligand_heavy_atoms,
                                               ligand_mass, min_te, min_ie))

        if min_te_PDB_id is not None:
            save_model(min_te_PDB_id, topology_path,
                       output_path.joinpath('best_total_energy.pdb'))

        if min_ie_PDB_id is not None:
            save_model(min_ie_PDB_id, topology_path,
                       output_path.joinpath('best_interaction_energy.pdb'))