def draw_plot(ax, x, y, color, x_axis, y_axis, title):
    """Scatter ``y`` against ``x`` on ``ax`` and annotate it with fit statistics.

    The points are drawn through ``scatterplot.draw_actual_plot`` with a
    marker size of 40, the three statistics are written onto the axes with
    ``conv.add_text_dict`` under the keys "PCC", "Rho" and "MAE", and a
    reference line spanning ``min(x)`` to ``max(x)`` is added with
    ``scatterplot.add_x_y_line``.

    Args:
        ax: axes object handed to the plotting helpers.
        x, y: paired sequences of values to compare.
        color: colour argument forwarded to ``draw_actual_plot``.
        x_axis, y_axis, title: text arguments forwarded, in that order, to
            ``draw_actual_plot``.

    Returns:
        ``[coeff, rho, mae]`` - the first value returned by ``pearsonr``,
        the first value returned by ``spearmanr`` and the result of
        ``mean_abs_error`` for ``(x, y)``.
    """
    scatterplot.draw_actual_plot(
        ax, x, y, color, x_axis, y_axis, title, size=40
    )

    # Only the statistic itself is kept; the second value returned by each
    # correlation routine (presumably the p-value) is discarded.
    pearson_coeff, _ = pearsonr(x, y)
    spearman_rho, _ = spearmanr(x, y)
    abs_error = mean_abs_error(x, y)

    annotations = {
        "PCC": pearson_coeff,
        "Rho": spearman_rho,
        "MAE": abs_error,
    }
    conv.add_text_dict(ax, annotations)

    # The reference line is bounded by the range of x only.
    lowest, highest = min(x), max(x)
    scatterplot.add_x_y_line(ax, min_val=lowest, max_val=highest)

    return [pearson_coeff, spearman_rho, abs_error]
def gen_scatterplot(ax, x_axis, y_axis, z_axis, min_naive_by_pdb):
    """Overlay two delta series against a shared x series on ``ax``.

    ``get_dist_deltas(name, "All", min_naive_by_pdb)`` is evaluated for
    ``x_axis``, ``y_axis`` and ``z_axis`` (in that order).  The ``y_axis``
    deltas are drawn in black ('k') and the ``z_axis`` deltas in red ('r'),
    both against the ``x_axis`` deltas, each labelled with its own series
    name.  A reference line and a legend are then added to the axes.

    Args:
        ax: axes object handed to the plotting helpers.
        x_axis: name of the series plotted along x; also used to build the
            text arguments passed to ``draw_actual_plot``.
        y_axis: name of the first overlaid series (legend label).
        z_axis: name of the second overlaid series (legend label).
        min_naive_by_pdb: data forwarded untouched to ``get_dist_deltas``.
    """
    x_deltas, y_deltas, z_deltas = [
        get_dist_deltas(series_name, "All", min_naive_by_pdb)
        for series_name in (x_axis, y_axis, z_axis)
    ]

    # Both series share the same text arguments; only data, colour and
    # legend label differ.
    delta_text = "Delta to Min RMSD (A)"
    x_axis_text = x_axis + " " + delta_text
    overlays = (
        (y_deltas, 'k', y_axis),
        (z_deltas, 'r', z_axis),
    )
    for series_deltas, series_color, series_label in overlays:
        scatterplot.draw_actual_plot(
            ax, x_deltas, series_deltas, series_color,
            x_axis, x_axis_text, delta_text,
            size=15, label=series_label,
        )

    scatterplot.add_x_y_line(ax)
    conv.add_legend(ax)
def _max_protocol_count(*dg_dicts):
    """Return the largest number of protocol entries stored under any record of ``dg_dicts``."""
    return max(
        len(protocols)
        for dg_dict in dg_dicts
        for protocols in dg_dict.values()
    )


def _delta_to_wt(dg_dict, wt_name, filename):
    """Return ``{protocol: dg[filename] - dg[wt_name]}`` for every protocol of the wt record."""
    record = dg_dict[filename]
    return dict(
        (key, record[key] - wt_dg) for key, wt_dg in dg_dict[wt_name].items()
    )


def _plot_ddg_rows(rosetta_ddg, amber_ddg, pred_ddg, known_ddg,
                   corr_values_dict, n_panels):
    """Create a figure and plot Rosetta (index 0), Amber (1) and Pred (2) against the known values."""
    fig, axarr = conv.create_ax(n_panels, 3, shx=True, shy=True)
    plot_ddg_dict(rosetta_ddg, known_ddg, axarr, 0, "Rosetta", corr_values_dict)
    plot_ddg_dict(amber_ddg, known_ddg, axarr, 1, "Amber", corr_values_dict)
    plot_ddg_dict(pred_ddg, known_ddg, axarr, 2, "Pred", corr_values_dict)
    return fig


def _plot_corr_summary(corr_values_dict, out_csv_path):
    """Plot every Rosetta/Amber correlation series against the "Pred" series and save it as "all"."""
    fig_all, axarr_all = conv.create_ax(len(corr_values_dict["Rosetta"]), 3)
    labels = ["-PCC", "-Rho", "-Mae"]

    #assumes that Rosetta has more protocols than Amber
    for x_ind, (protocol, vals) in enumerate(corr_values_dict["Rosetta"].items()):
        # None when Amber has no series for this protocol
        amber_vals = corr_values_dict["Amber"].get(protocol)
        pred_vals = corr_values_dict["Pred"]["Pred"]

        for ind, (val_list, label) in enumerate(zip(vals, labels)):
            ax = axarr_all[ind, x_ind]
            series = [[val_list, pred_vals[ind], "Rosetta " + protocol]]
            if amber_vals is not None:
                series.append([amber_vals[ind], pred_vals[ind], "Amber " + protocol])
            scatterplot.plot_series(ax, series, protocol, "Pred", label,
                                    colors=['coral', 'cyan'], size=40)
            scatterplot.add_x_y_line(ax)

    conv.save_fig(fig_all, out_csv_path + "/all.txt", "ddg", 16, 12)


def main(rec_corr_path, ddg_path, amber_pdb_path, rosetta_pdb_path, out_csv_path):
    """Plot Amber, Rosetta and predicted ddG values against known ddG values.

    Plots generated:
      * Per record file - known ddg vs. each protocol.  A protocol may have
        more than one plot depending on filtering method (mean, bottom 3).
      * For all record files - the collected correlation values of each
        protocol against those of "Pred" (saved under the name "all").
      * For all record files - known ddg vs. each protocol over the
        aggregated records (saved under the name "all_corr").

    Every file matching ``rec_corr_path + "*.rc"`` is read with
    ``read_csv_list``.  Each row is unpacked as
    ``(record_id, prefix, filename, known_ddg, pred_ddg)``; the row whose
    prefix contains "wt" is the reference, and every other row's energies
    are expressed as a difference to it.

    Args:
        rec_corr_path: prefix used to glob the ``*.rc`` record files.
        ddg_path: directory holding the ``amber``, ``rosetta``,
            ``amber_inter_mean``, ``rosetta_inter_mean`` and
            ``rosetta_inter`` sub-directories.
        amber_pdb_path: unused; kept for interface compatibility.
        rosetta_pdb_path: unused; kept for interface compatibility.
        out_csv_path: directory prefix for the saved figures.
    """
    amber_csv_path = os.path.join(ddg_path, "amber")
    rosetta_csv_path = os.path.join(ddg_path, "rosetta")
    amber_inter_mean_path = os.path.join(ddg_path, "amber_inter_mean")
    rosetta_inter_mean_path = os.path.join(ddg_path, "rosetta_inter_mean")
    rosetta_inter_path = os.path.join(ddg_path, "rosetta_inter")

    #corr_values_dict has the following shape -
    #  "Pred" : { "Pred" : [ddg_vals] },
    #  "Amber" : { "Mean.." : [ddg_vals], "Bott.." : [ddg_vals] },
    #  "Rosetta" : { "Mean.." : [ddg_vals], "Bott.." : [ddg_vals] }
    # NOTE(review): presumably populated by plot_ddg_dict - confirm.
    corr_values_dict = {}

    all_amber_ddg_dict = {}
    all_rosetta_ddg_dict = {}
    all_known_ddg_dict = {}
    all_pred_ddg_dict = {}

    for rec_corr in glob.glob(rec_corr_path + "*.rc"):
        print(rec_corr)
        rec_corr_list = read_csv_list(rec_corr)

        # Skip empty files, and files with no known ddg (3-column rows).
        if len(rec_corr_list) == 0 or len(rec_corr_list[0]) == 3:
            continue

        #read in all amber csvs that correspond to column 3 in rec_corr file and rosetta ones too
        amber_dg_dict = {}
        rosetta_dg_dict = {}
        for _record_id, prefix, filename, _known, _pred in rec_corr_list:
            amber_csv = os.path.join(amber_csv_path, filename + ".csv")
            rosetta_csv = os.path.join(rosetta_csv_path, filename + ".csv")
            amber_dg_dict[filename] = {
                "Mean Binding Energy": get_mean_csv(amber_csv, protocol="amber"),
                "Bottom 3 Binding Energy": get_bottom3_csv(amber_csv, protocol="amber"),
                "Mean Interaction Energy": get_mean_txt(
                    os.path.join(amber_inter_mean_path, filename + ".txt")),
            }
            rosetta_dg_dict[filename] = {
                "Mean Binding Energy": get_mean_csv(rosetta_csv, protocol="rosetta"),
                "Bottom 3 Binding Energy": get_bottom3_csv(rosetta_csv, protocol="rosetta"),
                "Mean Interaction Energy": get_mean_txt(
                    os.path.join(rosetta_inter_mean_path, filename + ".txt")),
                "Bottom 3 Interaction Energy": get_bottom3_csv(
                    os.path.join(rosetta_inter_path, filename + ".csv")),
            }

        #find wt csv that correspond to wt row in rec_corr_file (column 2)
        wt_csv_name = [rec[2] for rec in rec_corr_list if "wt" in rec[1]][0]

        amber_ddg_dict = {}
        rosetta_ddg_dict = {}
        known_ddg_dict = {}
        pred_ddg_dict = {}

        #loops thru other (non-wt) records in rec_corr_list
        for _rec, prefix, filename, known, pred in rec_corr_list:
            if "wt" in prefix:
                continue
            amber_ddg_dict.setdefault(filename, {}).update(
                _delta_to_wt(amber_dg_dict, wt_csv_name, filename))
            rosetta_ddg_dict.setdefault(filename, {}).update(
                _delta_to_wt(rosetta_dg_dict, wt_csv_name, filename))
            known_ddg_dict[filename] = {"Known": float(known)}
            pred_ddg_dict[filename] = {"Pred": float(pred)}

        all_amber_ddg_dict.update(amber_ddg_dict)
        all_rosetta_ddg_dict.update(rosetta_ddg_dict)
        all_known_ddg_dict.update(known_ddg_dict)
        all_pred_ddg_dict.update(pred_ddg_dict)

        # One panel per protocol; the same count sizes the axes and the figure.
        n_panels = _max_protocol_count(amber_dg_dict, rosetta_dg_dict)
        fig = _plot_ddg_rows(rosetta_ddg_dict, amber_ddg_dict, pred_ddg_dict,
                             known_ddg_dict, corr_values_dict, n_panels)
        rec_corr_stem = os.path.splitext(os.path.basename(rec_corr))[0]
        conv.save_fig(fig, out_csv_path + "/" + rec_corr_stem + ".txt",
                      "ddg", n_panels * 4, 12)

    #Plot all correlation values
    _plot_corr_summary(corr_values_dict, out_csv_path)

    # Size the aggregate figure from the aggregated dicts themselves rather
    # than from whatever per-file dicts the last loop iteration left behind.
    n_all_panels = _max_protocol_count(all_amber_ddg_dict, all_rosetta_ddg_dict)
    fig_all_corr = _plot_ddg_rows(all_rosetta_ddg_dict, all_amber_ddg_dict,
                                  all_pred_ddg_dict, all_known_ddg_dict,
                                  corr_values_dict, n_all_panels)
    conv.save_fig(fig_all_corr, out_csv_path + "/all_corr.txt", "ddg",
                  n_all_panels * 4, 12)
def main(list_input_dirs, energies_names, output_pre):
    """Plot a score per PDB over a grid of weights applied to two score sets.

    Decoy and native scores are read from two input directories and passed
    through ``scorefileparse.pdbs_intersect``.  For every PDB in the first
    resulting decoy set, each of the two decoy score sets is multiplied by
    a weight ``2 ** e`` for ``e`` in ``-10, -8, ..., 8``; the weighted sets
    are merged and the first value returned by
    ``disc.given_data_run_disc`` is recorded for that weight pair.  One
    log2-log2 scatter plot coloured by that value is drawn per PDB, plus a
    final plot of the per-PDB average, and the figure is saved with
    ``conv.save_fig``.

    Args:
        list_input_dirs: two entries, each indexable as
            ``[input_dir, scoretype]``.
        energies_names: per-score-type column names.  Each element is read
            as ``[scoretype, column, ...]``; a container that can be indexed
            directly by score type is still accepted as a fallback.
        output_pre: output prefix handed to ``conv.save_fig``.

    Raises:
        ValueError: if the two inputs have no PDB in common.
    """
    #read in and rename arguments
    inp_dir1 = list_input_dirs[0][0]
    scoretype1 = list_input_dirs[0][1]
    inp_dir2 = list_input_dirs[1][0]
    scoretype2 = list_input_dirs[1][1]

    # Map each score type to its column names (first element is the name).
    column_dict = dict((c[0], c[1:]) for c in energies_names)

    def columns_for(scoretype):
        # Indexing energies_names with the score type fails when it is a
        # list of [name, col, ...] rows, so prefer the mapping built above
        # and keep the direct lookup only as a fallback.
        if scoretype in column_dict:
            return column_dict[scoretype]
        return energies_names[scoretype]

    dec1, nat1 = scorefileparse.read_dec_nat(inp_dir1, columns_for(scoretype1), scoretype1)
    dec2, nat2 = scorefileparse.read_dec_nat(inp_dir2, columns_for(scoretype2), scoretype2)

    [dec_inter1, nat_inter1, dec_inter2, nat_inter2] = scorefileparse.pdbs_intersect(
        [dec1, nat1, dec2, nat2])

    n_pdbs = len(dec_inter1)
    if n_pdbs == 0:
        raise ValueError("no PDBs shared between the two input directories")

    # Weights 2**-10, 2**-8, ..., 2**8 applied independently to each set.
    weights = [2 ** exponent for exponent in xrange(-10, 10, 2)]

    sum_discs = Counter()
    fig, axarr = conv.create_ax(1, n_pdbs + 1, True, True)

    for x_ind, pdb in enumerate(sorted(dec_inter1.keys())):
        discs_per_pdb = {}
        for weight_1 in weights:
            for weight_2 in weights:
                weighted_1 = scorefileparse.weight_dict(dec_inter1[pdb], weight_1)
                weighted_2 = scorefileparse.weight_dict(dec_inter2[pdb], weight_2)
                merged = scorefileparse.merge_dicts([weighted_1, weighted_2])
                ddata1 = scorefileparse.convert_disc(merged)

                disc_divs = [1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 6.0]
                # Only the first returned value is plotted.
                disc1, _, _ = disc.given_data_run_disc(ddata1, True, disc_divs)
                discs_per_pdb[(weight_1, weight_2)] = disc1

        ax = axarr[x_ind, 0]
        ax.set_xscale('log', basex=2)
        ax.set_yscale('log', basey=2)

        # Sort once so x, y and the colour values stay aligned.
        ordered = sorted(discs_per_pdb.items())
        x = [w1 for (w1, w2), v in ordered]
        y = [w2 for (w1, w2), v in ordered]
        d = [v for (w1, w2), v in ordered]

        s = scatterplot.draw_actual_plot(ax, x, y, d, pdb, scoretype1, scoretype2, 'bwr')
        fig.colorbar(s, ax=ax)
        scatterplot.add_x_y_line(ax, 0, 600)

        # Counter.update adds values, accumulating a per-weight-pair total.
        sum_discs.update(discs_per_pdb)

    # Final panel: the accumulated totals divided by the number of PDBs.
    ax = axarr[n_pdbs, 0]
    ordered = sorted(sum_discs.items())
    x = [w1 for (w1, w2), v in ordered]
    y = [w2 for (w1, w2), v in ordered]
    d = [v / n_pdbs for (w1, w2), v in ordered]

    #fix titles of axes
    ax.set_xscale('log', basex=2)
    ax.set_yscale('log', basey=2)

    s = scatterplot.draw_actual_plot(ax, x, y, d, "All", scoretype1, scoretype2, cm='bwr')
    fig.colorbar(s, ax=ax)
    scatterplot.add_x_y_line(ax, 0, 600)

    conv.save_fig(fig, output_pre, "_weights_v_disc", 3, n_pdbs * 3)