def main(data_file, output_prefix, degree_file, width, height):
    """Plot mean "Frac" per sequence as a histogram and per degree as a bar chart.

    data_file is read for each sequence's "Frac" entry and degree_file for
    each sequence's "Degree" entry.  Two figures are saved under
    output_prefix with the suffixes "hist" and "bar", both sized
    width x height.
    """
    frac_by_seq = seq_IO.read_sequences(data_file, additional_params=True,
                                        header=True, list_vals=True)
    degree_by_seq = seq_IO.read_sequences(degree_file, additional_params=True,
                                          header=True)

    # Bucket the mean fraction of every sequence under that sequence's degree.
    degree_frac = defaultdict(list)
    for seq_name, seq_info in frac_by_seq.items():
        bucket = degree_frac[degree_by_seq[seq_name]['Degree']]
        bucket.append(np.mean(seq_info["Frac"]))

    data = [np.mean(seq_info["Frac"]) for seq_info in frac_by_seq.values()]
    # values() and keys() of the same unmodified dict iterate in the same
    # order, so the bars line up with the tick labels used below.
    degree_frac_avg = [np.mean(fracs) for fracs in degree_frac.values()]
    degree_frac_std = [np.std(fracs) for fracs in degree_frac.values()]

    hist_fig, hist_axes = pconv.create_ax(1, 1, shx=False, shy=False)
    hist.draw_actual_plot(hist_axes[0, 0], data, "", "", normed=False,
                          nbins=30, edgecolor=None, log=False)
    pconv.save_fig(hist_fig, output_prefix, "hist", width, height,
                   tight=True, size=10)

    bar_fig, bar_axes = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(bar_axes[0, 0], degree_frac_avg, 'g', "", "Degree",
                         "Fraction Shortest Path Uncleaved",
                         tick_label=degree_frac.keys(), yerr=degree_frac_std)
    pconv.save_fig(bar_fig, output_prefix, "bar", width, height,
                   tight=True, size=10)
def main(epistasis_file):
    """Summarise an epistasis CSV as a stacked bar plot, a scatter plot and a Venn diagram.

    Each data row (the first line is skipped as a header) is split on commas.
    Rows are keyed by (tokens[0], tokens[2]); a row is skipped when the
    reversed key (tokens[2], tokens[0]) is already stored.  Three figures are
    saved next to epistasis_file with suffixes "plot", "pred_v_cl" and "venn".
    """
    # Record layout, by index:
    #   0 starting ratio, 1 ending ratio, 2 epistasis, 3 ending fitness,
    #   4 number of mutations, 5 InterSeqs, 6 InterFits, 7 InterRatios
    dict_epistasis = {}
    with open(epistasis_file) as handle:
        rows = handle.readlines()
        for row in rows[1:]:
            tokens = row.split(',')
            if dict_epistasis.get((tokens[2], tokens[0])) is not None:
                continue
            inter_seqs = tokens[6::3]
            dict_epistasis[(tokens[0], tokens[2])] = [
                float(tokens[1]), float(tokens[3]), float(tokens[5]),
                tokens[4], len(inter_seqs), inter_seqs,
                tokens[7::3], tokens[8::3],
            ]

    neg_epistasis = [0] * 4
    no_epistasis = [0] * 4
    pos_epistasis = [0] * 4
    n_functional = [0] * 4
    n_should_be_functional = [0] * 4
    n_total = [0] * 4
    # One slot per mutation count 2..5.
    for ind, n_mut in enumerate(xrange(2, 6)):
        same_count = [rec for rec in dict_epistasis.values() if rec[4] == n_mut]
        neg_epistasis[ind] = sum(1 for rec in same_count if rec[2] < -0.000005)
        no_epistasis[ind] = sum(1 for rec in same_count if abs(rec[2]) < 0.000005)
        pos_epistasis[ind] = sum(1 for rec in same_count if rec[2] > 0.000005)
        n_functional[ind] = sum(1 for rec in same_count if rec[3] == "CLEAVED")
        n_should_be_functional[ind] = sum(
            1 for rec in same_count if all(fit == "CLEAVED" for fit in rec[6]))
        n_total[ind] = float(len(same_count))

    seq_func = set()
    seq_pred_func = set()
    for pair, rec in dict_epistasis.items():
        if rec[3] == "CLEAVED":
            seq_func.add(pair[1])
        if all(fit == "CLEAVED" for fit in rec[6]):
            seq_pred_func.add(pair[1])

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    fig2, axarr2 = pconv.create_ax(1, 1)

    # Stack the bars: "No" at the bottom, then "Neg.", then "Pos.".
    bar_ax = axarr[0, 0]
    stacked_no_neg = [no + neg for no, neg in zip(no_epistasis, neg_epistasis)]
    artists = []
    artists.extend(plot_epi(no_epistasis, n_total, bar_ax, "No", color="gray"))
    artists.extend(plot_epi(neg_epistasis, n_total, bar_ax, "Neg.",
                            bottom=no_epistasis, color="white"))
    artists.extend(plot_epi(pos_epistasis, n_total, bar_ax, "Pos.",
                            bottom=stacked_no_neg, color="black"))

    # n_total holds floats, so these are true divisions; a mutation count
    # with no rows raises ZeroDivisionError here.
    n_func_frac = [func / total for func, total in zip(n_functional, n_total)]
    n_pred_frac = [pred / total
                   for pred, total in zip(n_should_be_functional, n_total)]

    scatterplot.plot_series(
        axarr2[0, 0],
        [(range(2, 6), n_func_frac, "% Cleaved"),
         (range(2, 6), n_pred_frac, "% Pred Cleaved")],
        "", "Number of Mutations", "Fraction of Total Cases",
        size=40, connect_dots=True, alpha=1.0)
    axarr2[0, 0].set_ylim([0, 4.0])

    fig_venn, axarr_venn = pconv.create_ax(1, 1)
    venn2([seq_func, seq_pred_func], set_labels=["Cleaved", "Pred Cleaved"],
          ax=axarr_venn[0, 0])

    lgd = bar_ax.legend(artists, ["No", "Neg.", "Pos."], loc="center left",
                        bbox_to_anchor=(1.05, 0.5), borderaxespad=0.,
                        prop={'size': 9}, ncol=1, fancybox=True)

    pconv.save_fig(fig, epistasis_file, "plot", 3, 2.5, tight=False, size=9,
                   extra_artists=lgd)
    pconv.save_fig(fig2, epistasis_file, "pred_v_cl", 5, 5, tight=True, size=10)
    pconv.save_fig(fig_venn, epistasis_file, "venn", 5, 5, tight=True, size=14)
def plot_dict(dict_to_plot, samplesize, output_pre, suffix, x_axis, norm="all"):
    """Draw one curve per entry of dict_to_plot and save the combined figure.

    Each value of dict_to_plot is a (val, sample_name) pair; the matching
    entry of samplesize is looked up by sample_name and passed to plot_curve
    together with val.  The output name is output_pre followed by norm
    (or "none" when norm is None).
    """
    n_panels = len(dict_to_plot)
    fig, axarr = conv.create_ax(n_panels, 1, shx=True, shy=True)

    for ind, (key, (val, sample_name)) in enumerate(dict_to_plot.items()):
        # Assumes exactly one samplesize entry matches; IndexError otherwise.
        matching = [sizes for name, sizes in samplesize.items()
                    if name == sample_name]
        samplesize_list = matching[0]
        # Panel title is the joined key, truncated to 28 characters.
        title = ''.join(key)[0:28]
        plot_curve(axarr[0, ind], val, samplesize_list, "", title, x_axis,
                   "Sample Size", norm=norm)

    norm_tag = "none" if norm is None else norm
    conv.save_fig(fig, output_pre + norm_tag, suffix, n_panels * 4, 4,
                  tight=True, size=10)
def main(json_file, output_prefix, metric):
    """Plot histograms of graph metrics for a graph stored as node-link JSON.

    Args:
        json_file: path of the JSON file loaded and passed to
            json_graph.node_link_graph.
        output_prefix: prefix handed to pconv.save_fig for the output figure.
        metric: name of the single metric to plot, or "metrics" to plot every
            metric computed in this function.

    Raises:
        KeyError: if metric is neither "metrics" nor a computed metric name.
    """
    with open(json_file) as data_file:
        data = json.load(data_file)
    G = json_graph.node_link_graph(data)

    metrics = {}
    #metrics["degree"] = degree(G)
    # list() keeps this a concrete sequence whether .values() returns a
    # list or a view.
    metrics["closeness"] = list(closeness_centrality(G).values())
    #TODO: add any other metrics here using a similar format to above line.

    if metric != "metrics":
        # Fail before any figure is created if the metric is not available.
        if metric not in metrics:
            raise KeyError("unknown metric {0!r}; available: {1}".format(
                metric, ", ".join(sorted(metrics))))
        labels_to_plot = [metric]
    else:
        labels_to_plot = list(metrics)

    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)

    nbins = 20
    for ind, key in enumerate(labels_to_plot):
        # Look up by the loop variable; the previous literal "key" lookup
        # raised KeyError on every call.
        hist.draw_actual_plot(axarr[0, ind], metrics[key], "",
                              key.capitalize(), normed=True, nbins=nbins)
        axarr[0, ind].ticklabel_format(axis='x', style='sci',
                                       scilimits=(-2, 2))
        #pconv.add_legend(axarr[0,ind], location="middle right")

    pconv.save_fig(fig, output_prefix, "metrics", n_to_plot * 5, 5,
                   tight=True, size=12)
def main(sequence_ratio_file, width, height, pattern, legend):
    """Plot a per-shell bar chart of the median value with std error bars.

    Column 0 of every row read from sequence_ratio_file is skipped; each
    remaining column is treated as one shell.  pattern and legend are accepted
    but not used by this function.
    """
    sequences = seq_IO.read_sequences(sequence_ratio_file,
                                      additional_params=True)

    # Transpose: one list of values per shell (columns 1..end).
    n_columns = len(sequences[0])
    shell_data = [[row[shell] for row in sequences]
                  for shell in xrange(1, n_columns)]

    # Bar heights are medians, error bars are standard deviations.
    avg = [np.median(values) for values in shell_data]
    std = [np.std(values) for values in shell_data]
    label = xrange(1, 4)

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr[0, 0], avg,
                         ['lightsteelblue', 'lightblue', 'darkgray'],
                         "", "Shell", "Fraction Cleaved",
                         tick_label=label, yerr=std)
    pconv.save_fig(fig, sequence_ratio_file, "plot", width, height,
                   tight=True, size=10)
def main(list_sequence_names, output_prefix):
    """Histogram the label make-up of the one-mutation neighbours of cleaved sequences.

    list_sequence_names holds (filename, label) pairs and must include the
    labels "CLEAVED", "MIDDLE" and "UNCLEAVED".  For each cleaved sequence the
    fractions of its Hamming-distance-1 neighbours in each label are computed
    and three histograms are saved with the suffix "fraction_neighbors".
    """
    list_sequences = []           # one sequence collection per label
    extended_list_sequences = []  # flat list of all sequences (not used below)
    labels = []                   # labels, parallel to list_sequences
    for filename, label in list_sequence_names:
        sequences = seq_IO.read_sequences(filename)
        list_sequences.append(sequences)
        extended_list_sequences.extend(sequences[:])
        labels.append(label)

    cleaved_pool = list_sequences[labels.index("CLEAVED")]
    middle_pool = list_sequences[labels.index("MIDDLE")]
    uncleaved_pool = list_sequences[labels.index("UNCLEAVED")]

    def count_neighbors(seq, pool):
        # Number of sequences in pool exactly one substitution away from seq.
        return sum(1 for other in pool if conv.hamdist(seq, other) == 1)

    frac_uncleaved = {}
    frac_cleaved = {}
    frac_middle = {}
    for seq in cleaved_pool:
        n_cleaved = count_neighbors(seq, cleaved_pool)
        n_uncleaved = count_neighbors(seq, uncleaved_pool)
        n_middle = count_neighbors(seq, middle_pool)
        # Sequences whose only neighbours are "MIDDLE" (or none) are skipped.
        if not (n_cleaved > 0 or n_uncleaved > 0):
            continue
        total = n_uncleaved + n_middle + n_cleaved
        frac_uncleaved[seq] = float(n_uncleaved) / total
        frac_cleaved[seq] = float(n_cleaved) / total
        frac_middle[seq] = float(n_middle) / total

    fig, ax = pconv.create_ax(3, 1)
    panels = [
        (frac_cleaved, "Fraction of Neighbors Cleaved"),
        (frac_middle, "Fraction of Neighbors Middle"),
        (frac_uncleaved, "Fraction of Neighbors Uncleaved"),
    ]
    for col, (fractions, x_label) in enumerate(panels):
        hist.draw_actual_plot(ax[0, col], fractions.values(),
                              "Landscape Near Cleaved Sequences", x_label,
                              log=False, normed=False, nbins=20)

    pconv.save_fig(fig, output_prefix, "fraction_neighbors", 15, 5, size=10)
def main(sequences_ratio_file):
    """Bar-plot the mean of three ratio columns per sequence with min/max error bars.

    Each row read from sequences_ratio_file is indexed as: 0 = sequence
    label, 1-3 = the three ratio values, 4 = an extra column that is read but
    not plotted.
    """
    sequences_ratio = seq_IO.read_sequences(sequences_ratio_file,
                                            additional_params=True)

    triples = [[row[1], row[2], row[3]] for row in sequences_ratio]
    seq_cleaved_dict = [row[4] for row in sequences_ratio]  # not used below
    seqs = [row[0] for row in sequences_ratio]

    avg_ratio = []
    min_ratio = []  # distance from the mean down to the smallest value
    max_ratio = []  # distance from the mean up to the largest value
    for triple in triples:
        avg_ratio.append(sum(triple) / 3.0)
        min_ratio.append(sum(triple) / 3.0 - min(triple))
        max_ratio.append(max(triple) - sum(triple) / 3.0)

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr[0, 0], avg_ratio, 'c', "", "Sequence",
                         "FLAG/HA Ratio", tick_label=seqs,
                         yerr=[min_ratio, max_ratio])
    pconv.save_fig(fig, sequences_ratio_file, "plot", 4, 4, tight=True,
                   size=12)
def plot_dict(dict_to_plot, samplesize, output_pre, suffix, x_axis, norm="all"):
    """Draw one curve per entry of dict_to_plot and save the combined figure.

    Each value of dict_to_plot is a (val, sample_name) pair; the matching
    entry of samplesize is looked up by sample_name and passed to plot_curve
    together with val.  The output name is output_pre followed by norm
    (or "none" when norm is None).
    """
    n_panels = len(dict_to_plot)
    fig, axarr = conv.create_ax(n_panels, 1, shx=True, shy=True)

    for ind, (key, (val, sample_name)) in enumerate(dict_to_plot.items()):
        # Assumes exactly one samplesize entry matches; IndexError otherwise.
        matching = [sizes for name, sizes in samplesize.items()
                    if name == sample_name]
        samplesize_list = matching[0]
        # Panel title is the joined key, truncated to 28 characters.
        title = ''.join(key)[0:28]
        plot_curve(axarr[0, ind], val, samplesize_list, "", title, x_axis,
                   "Sample Size", norm=norm)

    norm_tag = "none" if norm is None else norm
    conv.save_fig(fig, output_pre + norm_tag, suffix, n_panels * 4, 4,
                  tight=True, size=10)
def main(list_sequence_names, canonical_list, output_prefix, func_labels, unfunc_labels):
    """Plot, per canonical sequence, the functional fraction at 1-5 mutations.

    For every canonical sequence, the sequences of each (filename, label)
    pair are binned by Hamming distance to it.  Labels in func_labels count
    as functional, labels in unfunc_labels as non-functional, and any other
    label is ignored.  One series per canonical sequence is drawn on a single
    axis; the saved file name uses the last canonical sequence processed.
    """
    series = []
    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    for canonical in canonical_list_seq:
        dict_sequences = {}
        for filename, label in list_sequence_names:
            sequences = seq_IO.read_sequences(filename)
            distances = [conv.hamdist(seq, canonical) for seq in sequences]
            # NOTE: each bin holds the SUM of the matching distances
            # (i * count), not the count; the factor i cancels in the
            # ratio computed below.
            per_distance = {}
            for i in xrange(1, 6):
                per_distance[i] = sum(d for d in distances if d == i)
            dict_sequences[label] = per_distance

        x = []
        y = []
        for i in xrange(1, 6):
            func = 0.0
            unfunc = 0.0
            for label, dict_sums in dict_sequences.items():
                if label in func_labels:
                    func += dict_sums[i]
                elif label in unfunc_labels:
                    unfunc += dict_sums[i]
            # Distances with no non-functional variants produce no point.
            if unfunc == 0:
                continue
            x.append(i)
            y.append(func / (func + unfunc))

        print(x)
        print(y)
        series.append([x, y, canonical])

    fig, ax = pconv.create_ax(1, 1)
    target_ax = ax[0, 0]
    scatterplot.plot_series(target_ax, series, title="",
                            x_axis="# of Mutations",
                            y_axis="Fraction of Variants that are Functional",
                            alpha=1.0, connect_dots=True, size=30,
                            edgecolors='k')
    target_ax.set_xlim(xmin=1, xmax=5)
    target_ax.set_xticks(xrange(1, 6))

    pconv.save_fig(fig, output_prefix, canonical + "_fraction_func_mutant",
                   6, 6, size=15)
def main(data_file, title, output_prefix):
    """Save a normalised 30-bin histogram of the "Degree" value of every sequence.

    title is used (capitalised) as the x-axis label and (as given) as the
    output-file suffix.
    """
    sequences = seq_IO.read_sequences(data_file, additional_params=True,
                                      header=True)
    data = [seq_info["Degree"] for seq_info in sequences.values()]

    fig, axarr = pconv.create_ax(1, 1, shx=False, shy=False)
    hist.draw_actual_plot(axarr[0, 0], data, "", title.capitalize(),
                          normed=True, nbins=30, edgecolor=None, log=False)
    pconv.save_fig(fig, output_prefix, title, 5, 5, tight=True, size=10)
def main(list_sequence_names, output_prefix):
    """Draw a three-set Venn diagram of the sequence sets named in list_sequence_names.

    list_sequence_names holds (filename, label) pairs; the labels, joined with
    "_" and followed by "_venn", form the output-file suffix.
    """
    sequence_list = []
    labels = []
    for filename, label in list_sequence_names:
        members = set(seq_IO.read_sequences(filename))
        sequence_list.append(members)
        labels.append(label)

    fig, ax = pconv.create_ax(1, 1)
    venn3(sequence_list, set_labels=labels, ax=ax[0, 0])

    suffix = '_'.join(labels) + "_venn"
    pconv.save_fig(fig, output_prefix, suffix, 10, 10, size=12)
def main(sequences_ratio_file):
    """Bar-plot the mean of three ratio columns per sequence with min/max error bars.

    Each row read from sequences_ratio_file is indexed as: 0 = sequence
    label, 1-3 = the three ratio values, 4 = an extra column that is read but
    not plotted.
    """
    sequences_ratio = seq_IO.read_sequences(sequences_ratio_file,
                                            additional_params=True)

    triples = [[row[1], row[2], row[3]] for row in sequences_ratio]
    seq_cleaved_dict = [row[4] for row in sequences_ratio]  # not used below
    seqs = [row[0] for row in sequences_ratio]

    avg_ratio = []
    min_ratio = []  # distance from the mean down to the smallest value
    max_ratio = []  # distance from the mean up to the largest value
    for triple in triples:
        avg_ratio.append(sum(triple) / 3.0)
        min_ratio.append(sum(triple) / 3.0 - min(triple))
        max_ratio.append(max(triple) - sum(triple) / 3.0)

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr[0, 0], avg_ratio, 'c', "", "Sequence",
                         "FLAG/HA Ratio", tick_label=seqs,
                         yerr=[min_ratio, max_ratio])
    pconv.save_fig(fig, sequences_ratio_file, "plot", 4, 4, tight=True,
                   size=12)
def main(list_sequence_names, output_prefix):
    """Bar-plot how many sequences each sample has per label.

    list_sequence_names holds (filename, label, sample) triples whose label
    must be "CLEAVED", "UNCLEAVED" or "MIDDLE" (any other label raises
    KeyError).  Counts are ordered by sorted sample name; tick labels come
    from the sorted "CLEAVED" samples.
    """
    counts_by_label = {"CLEAVED": {}, "UNCLEAVED": {}, "MIDDLE": {}}
    for filename, label, sample in list_sequence_names:
        counts_by_label[label][sample] = len(seq_IO.read_sequences(filename))

    # One (counts, label) series per label, in plotting order.
    lines = []
    for label in ("CLEAVED", "MIDDLE", "UNCLEAVED"):
        ordered_counts = [count for _sample, count
                          in sorted(counts_by_label[label].items())]
        lines.append((ordered_counts, label))

    fig, ax = pconv.create_ax(1, 1)
    bar.plot_series(ax[0, 0], lines, title="", x_axis="Variant Name",
                    y_axis="Number of Substrate Sequences Sampled",
                    tick_label=sorted(counts_by_label["CLEAVED"].keys()))
    pconv.save_fig(fig, output_prefix, "cleaved_uncleaved_middle", 6, 6,
                   tight=True, size=10)
def main(list_nodes, output_prefix, metric, create_keys=False):
    """Histogram node attributes split by the node's "type".

    Nodes read from list_nodes are partitioned into "CLEAVED", "MIDDLE" and
    "UNCLEAVED" by their "type" entry.  When metric is "metrics", every
    attribute of the "YNYIN" entry except a fixed exclusion list is plotted;
    otherwise only metric is plotted.
    """
    read_options = {"additional_params": True, "header": True}
    if create_keys:
        read_options["create_keys"] = True
    sequences = seq_IO.read_sequences(list_nodes, **read_options)

    def nodes_of_type(type_name):
        # Sub-dict of the nodes whose "type" equals type_name.
        return {key: val for key, val in sequences.items()
                if val["type"] == type_name}

    cleaved_seq = nodes_of_type("CLEAVED")
    middle_seq = nodes_of_type("MIDDLE")
    uncleaved_seq = nodes_of_type("UNCLEAVED")

    print(len(cleaved_seq))

    if metric != "metrics":
        labels_to_plot = [metric]
    else:
        labels_non_plot = ["label", "fitness", "type", "canonical", "timeset"]
        # The attribute names are taken from the hard-coded "YNYIN" entry.
        labels_to_plot = sorted(key for key in sequences["YNYIN"].keys()
                                if key not in labels_non_plot)

    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)

    nbins = 10
    for ind, key in enumerate(labels_to_plot):
        log = (key == "pageranks")

        # NOTE(review): for "Fraction_Cleaved" the data computation is
        # disabled, so `data` keeps its value from a previous iteration
        # (or is undefined on the first one).
        if key != "Fraction_Cleaved":
            data = [get_data_from_dict(cleaved_seq, key),
                    get_data_from_dict(middle_seq, key),
                    get_data_from_dict(uncleaved_seq, key)]
        normed = True

        print(key)
        panel = axarr[0, ind]
        hist.draw_actual_plot(panel, data, "", key.capitalize(), log=log,
                              normed=normed,
                              label=["Cleaved", "Middle", "Uncleaved"],
                              nbins=nbins)
        panel.ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))

    pconv.save_fig(fig, output_prefix, metric, n_to_plot * 2.5, 2.5,
                   tight=True, size=9)
def gen_plots(c, r, ax2, output_pre, dirname, st=""):
    """Compute a coefficient and p-value for 30 sliding windows and plot them.

    Windows run from i to i+9 for i = 1..30.  Each window gets its own panel
    (axarr[i-1, 0]); the coefficient/p-value summary is drawn on the last
    panel (axarr[30, 0]) and again on the caller-supplied ax2.  The saved
    figure's suffix combines the third-from-last component of dirname with st.
    """
    fig, axarr = conv.create_ax(1, 31, shx=True, shy=True)

    counters = []
    coeffs = []
    pvals = []
    for start in xrange(1, 31):
        stop = start + 9
        merged = combine_counts_ratios(c, r, start, stop, st=st)
        window_title = "Sliding Window: {0} to {1}".format(start, stop)
        coeff, pval = find_coeff_pval(merged, axarr[start - 1, 0], window_title)
        print(" ".join(str(item) for item in (start, coeff, pval)))
        counters.append(start)
        coeffs.append(coeff)
        pvals.append(pval)

    plot_coeff_pval(axarr[30, 0], counters, coeffs, pvals)

    path_parts = os.path.normpath(dirname).split(os.sep)
    suffix = path_parts[-3]
    conv.save_fig(fig, output_pre + "correlation_plot.txt",
                  "{0}_{1}".format(suffix, st), 4, 20 * 4)

    plot_coeff_pval(ax2, counters, coeffs, pvals, suffix)
def main(ratios_file1, ratios_file2, output_pre, use_sel):
    """Plot the correlation between the counts of two ratio files.

    With use_sel, element 2 of read_ratios' result is used and titles are the
    second "_"-separated field of each file's base name; otherwise element 1
    and the seventh field are used.  The left panel uses all common points,
    the right panel the filtered ones.
    """
    # (index into the read_ratios result, index of the title field)
    if use_sel:
        counts_index, title_field = 2, 1
    else:
        counts_index, title_field = 1, 6

    counts1_dict = read_ratios(ratios_file1)[counts_index]
    counts2_dict = read_ratios(ratios_file2)[counts_index]
    title1 = os.path.basename(ratios_file1).split("_")[title_field]
    title2 = os.path.basename(ratios_file2).split("_")[title_field]

    fig, axarr = conv.create_ax(2, 1)

    c1, c2 = common_points(counts1_dict, counts2_dict)
    plot_corr(c1, c2, axarr[0, 0], title1, title2)

    c1, c2 = common_points(counts1_dict, counts2_dict, filtered=True)
    plot_corr(c1, c2, axarr[0, 1], title1, title2)

    conv.save_fig(fig, output_pre + "/corrcounts.txt", title1 + "_" + title2,
                  20, 10, tight=True)
def main(args):
    """Plot, per PDB, the scores of two input directories against each other.

    Decoy and native scores are read from args.input_dir_1 / args.input_dir_2
    (score types args.scoretype1 / args.scoretype2), filtered by
    args.rmsd_cutoff and normalised.  Column 0 of each row shows the
    intersected data, column 1 the filter_norm_pdbs-filtered data.  The
    figure is neither saved nor returned by this function.
    """
    # Titles are the base names of the two input directories.
    title1 = os.path.basename(args.input_dir_1)
    title2 = os.path.basename(args.input_dir_2)

    d1, n1 = scorefileparse.read_dec_nat(args.input_dir_1, [], args.scoretype1, True)
    d2, n2 = scorefileparse.read_dec_nat(args.input_dir_2, [], args.scoretype2, True)

    cutoff = args.rmsd_cutoff
    dec1 = scorefileparse.filter_pdbs_by_rmsd(d1, cutoff)
    nat1 = scorefileparse.filter_pdbs_by_rmsd(n1, cutoff)
    dec2 = scorefileparse.filter_pdbs_by_rmsd(d2, cutoff)
    nat2 = scorefileparse.filter_pdbs_by_rmsd(n2, cutoff)

    # Natives are normalised against the decoys of the same input.
    dec_norm1 = scorefileparse.norm_pdbs(dec1)
    nat_norm1 = scorefileparse.norm_pdbs(nat1, dec1)
    dec_norm2 = scorefileparse.norm_pdbs(dec2)
    nat_norm2 = scorefileparse.norm_pdbs(nat2, dec2)

    dec_inter1, nat_inter1, dec_inter2, nat_inter2 = scorefileparse.pdbs_intersect(
        [dec_norm1, nat_norm1, dec_norm2, nat_norm2])
    dec_inter1, dec_inter2 = scorefileparse.pdbs_scores_intersect(
        [dec_inter1, dec_inter2])
    nat_inter1, nat_inter2 = scorefileparse.pdbs_scores_intersect(
        [nat_inter1, nat_inter2])

    # The filtered variants start from the normalised (not intersected) data.
    dec_filt1 = scorefileparse.filter_norm_pdbs(dec_norm1)
    nat_filt1 = scorefileparse.filter_norm_pdbs(nat_norm1)
    dec_filt2 = scorefileparse.filter_norm_pdbs(dec_norm2)
    nat_filt2 = scorefileparse.filter_norm_pdbs(nat_norm2)

    dec_finter1, dec_finter2 = scorefileparse.pdbs_scores_intersect(
        [dec_filt1, dec_filt2])
    nat_finter1, nat_finter2 = scorefileparse.pdbs_scores_intersect(
        [nat_filt1, nat_filt2])

    fig, axarr = conv.create_ax(2, len(dec_inter1))

    for row, pdb in enumerate(sorted(dec_inter1.keys())):
        plot(dec_inter1, dec_inter2, nat_inter1, nat_inter2,
             axarr[row, 0], pdb, title1, title2)
        plot(dec_finter1, dec_finter2, nat_finter1, nat_finter2,
             axarr[row, 1], pdb, title1, title2)
def main(args):
    """Scatter-plot the "disc" value against the energy gap for every PDB.

    args[1] is the input directory and args[2] the score type.  The energy
    gap is the lowest native score (over entries whose second element is
    below 2.0) minus the lowest decoy score.  The figure is saved as
    "<input dir>/test.txt" with the suffix "disc_v_egap".
    """
    inp_dir = args[1]
    scoretype = args[2]

    dec, nat = scorefileparse.read_dec_nat(inp_dir, [], scoretype)
    disc = discparse.read_dir(inp_dir)

    # Natives are normalised against the decoys.
    dec_norm = scorefileparse.norm_pdbs(dec)
    nat_norm = scorefileparse.norm_pdbs(nat, dec)

    dec_inter, nat_inter, disc_inter = scorefileparse.pdbs_intersect(
        [dec_norm, nat_norm, disc])

    #labels = ["Average","1.0","1.5","2.0","2.5","3.0","4.0","6.0"]
    labels = ["Average"]
    n_labels = len(labels)
    energy_gap = [[] for _ in labels]
    avg_disc = [[] for _ in labels]

    for pdb in dec_inter.keys():
        for ind in xrange(0, n_labels):
            lowest_dec = min([entry[0] for entry in dec_inter[pdb].values()])
            # min() raises ValueError if no native entry passes the < 2.0 test.
            lowest_nat = min([entry[0] for entry in nat_inter[pdb].values()
                              if entry[1] < 2.0])
            energy_gap[ind].append(lowest_nat - lowest_dec)
            avg_disc[ind].append(disc_inter[pdb][0])

    fig, axarr = conv.create_ax(n_labels, 1)

    for x_ind, label in enumerate(labels):
        ax = axarr[0, x_ind]
        xs = avg_disc[x_ind]
        ys = energy_gap[x_ind]
        scatterplot.draw_actual_plot(ax, xs, ys, [], label, "Disc", "Energy Gap")
        scatterplot.plot_regression(ax, xs, ys, False, False)

    filename = inp_dir + "/test.txt"
    conv.save_fig(fig, filename, "disc_v_egap", n_labels * 3, 4)
def gen_plots(c, r, ax2, output_pre, dirname, st=""):
    """Compute a coefficient and p-value for 30 sliding windows and plot them.

    Windows run from i to i+9 for i = 1..30.  Each window gets its own panel
    (axarr[i-1, 0]); the coefficient/p-value summary is drawn on the last
    panel (axarr[30, 0]) and again on the caller-supplied ax2.  The saved
    figure's suffix combines the third-from-last component of dirname with st.
    """
    fig, axarr = conv.create_ax(1, 31, shx=True, shy=True)

    counters = []
    coeffs = []
    pvals = []
    for start in xrange(1, 31):
        stop = start + 9
        merged = combine_counts_ratios(c, r, start, stop, st=st)
        window_title = "Sliding Window: {0} to {1}".format(start, stop)
        coeff, pval = find_coeff_pval(merged, axarr[start - 1, 0], window_title)
        print(" ".join(str(item) for item in (start, coeff, pval)))
        counters.append(start)
        coeffs.append(coeff)
        pvals.append(pval)

    plot_coeff_pval(axarr[30, 0], counters, coeffs, pvals)

    path_parts = os.path.normpath(dirname).split(os.sep)
    suffix = path_parts[-3]
    conv.save_fig(fig, output_pre + "correlation_plot.txt",
                  "{0}_{1}".format(suffix, st), 4, 20 * 4)

    plot_coeff_pval(ax2, counters, coeffs, pvals, suffix)
def process_dir(dirnames, unsel, output_pre):
    """Run gen_plots in three modes for every directory and save the summary figure.

    For each directory the first file matching "counts_<unsel>*_PRO_qc"
    (None when there is no match) and the first file matching
    "ratios_*_PRO_qc" are read.  The summary figure has one column per
    directory and one row per mode ("", "mean", "median").
    """
    n_dirs = len(dirnames)
    fig_all, axarr_all = conv.create_ax(n_dirs, 3, shx=True, shy=True)

    for ind, dirname in enumerate(dirnames):
        print(dirname)

        count_matches = glob.glob(dirname + '/counts_' + unsel + '*_PRO_qc')
        c_fn = count_matches[0] if len(count_matches) != 0 else None
        # A missing ratios file is not tolerated: IndexError if nothing matches.
        r_fn = glob.glob(dirname + '/ratios_*_PRO_qc')[0]

        c, r = read_files(c_fn, r_fn)

        for row, mode in enumerate(("", "mean", "median")):
            gen_plots(c, r, axarr_all[row, ind], output_pre, dirname, st=mode)

    conv.save_fig(fig_all, output_pre + "all_coeff_pval.txt", "",
                  4 * n_dirs, 12)
def process_dir(dirnames, unsel, output_pre):
    """Run gen_plots in three modes for every directory and save the summary figure.

    For each directory the first file matching "counts_<unsel>*_PRO_qc"
    (None when there is no match) and the first file matching
    "ratios_*_PRO_qc" are read.  The summary figure has one column per
    directory and one row per mode ("", "mean", "median").
    """
    n_dirs = len(dirnames)
    fig_all, axarr_all = conv.create_ax(n_dirs, 3, shx=True, shy=True)

    for ind, dirname in enumerate(dirnames):
        print(dirname)

        count_matches = glob.glob(dirname + '/counts_' + unsel + '*_PRO_qc')
        c_fn = count_matches[0] if len(count_matches) != 0 else None
        # A missing ratios file is not tolerated: IndexError if nothing matches.
        r_fn = glob.glob(dirname + '/ratios_*_PRO_qc')[0]

        c, r = read_files(c_fn, r_fn)

        for row, mode in enumerate(("", "mean", "median")):
            gen_plots(c, r, axarr_all[row, ind], output_pre, dirname, st=mode)

    conv.save_fig(fig_all, output_pre + "all_coeff_pval.txt", "",
                  4 * n_dirs, 12)
def main(list_sequence_names, canonical_list, output_prefix ):
    """Histogram the min/mean/max Hamming distance from each cleaved sequence to the uncleaved set.

    The cleaved and uncleaved files are the first entries of
    list_sequence_names labelled "CLEAVED" and "UNCLEAVED".  The three
    distances are also printed for every cleaved sequence that appears in
    the canonical list.
    """
    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    cleaved_file = [name for name, label in list_sequence_names
                    if label == "CLEAVED"][0]
    cleaved_seqs = seq_IO.read_sequences(cleaved_file)
    uncleaved_file = [name for name, label in list_sequence_names
                      if label == "UNCLEAVED"][0]
    uncleaved_seqs = seq_IO.read_sequences(uncleaved_file)

    min_dist = []
    avg_dist = []
    max_dist = []
    for seq in cleaved_seqs:
        distances = [conv.hamdist(seq, unc) for unc in uncleaved_seqs]
        nearest = min(distances)
        mean_distance = numpy.mean(distances)
        farthest = max(distances)
        min_dist.append(nearest)
        avg_dist.append(mean_distance)
        max_dist.append(farthest)
        if seq in canonical_list_seq:
            print(seq)
            print(nearest)
            print(mean_distance)
            print(farthest)

    fig, ax = pconv.create_ax(1, 3)
    panels = [
        (min_dist, "Min. Distance from Boundary", "Minimum Distances"),
        (avg_dist, "Avg. Distance from Boundary", "Average Distances"),
        (max_dist, "Max. Distance from Boundary", "Maximum Distances"),
    ]
    for row, (values, panel_title, x_label) in enumerate(panels):
        hist.draw_actual_plot(ax[row, 0], values, panel_title, x_label,
                              log=False, normed=True, label=None, nbins=15,
                              stacked=False)

    pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
def main(list_sequence_names, canonical_list, output_prefix, func_labels, unfunc_labels):
    """Plot, per canonical sequence, the functional fraction at 1-5 mutations.

    For every canonical sequence, the sequences of each (filename, label)
    pair are binned by Hamming distance to it.  Labels in func_labels count
    as functional, labels in unfunc_labels as non-functional, and any other
    label is ignored.  One series per canonical sequence is drawn on a single
    axis; the saved file name uses the last canonical sequence processed.
    """
    series = []
    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    for canonical in canonical_list_seq:
        dict_sequences = {}
        for filename, label in list_sequence_names:
            sequences = seq_IO.read_sequences(filename)
            distances = [conv.hamdist(seq, canonical) for seq in sequences]
            # NOTE: each bin holds the SUM of the matching distances
            # (i * count), not the count; the factor i cancels in the
            # ratio computed below.
            per_distance = {}
            for i in xrange(1, 6):
                per_distance[i] = sum(d for d in distances if d == i)
            dict_sequences[label] = per_distance

        x = []
        y = []
        for i in xrange(1, 6):
            func = 0.0
            unfunc = 0.0
            for label, dict_sums in dict_sequences.items():
                if label in func_labels:
                    func += dict_sums[i]
                elif label in unfunc_labels:
                    unfunc += dict_sums[i]
            # Distances with no non-functional variants produce no point.
            if unfunc == 0:
                continue
            x.append(i)
            y.append(func / (func + unfunc))

        print(x)
        print(y)
        series.append([x, y, canonical])

    fig, ax = pconv.create_ax(1, 1)
    target_ax = ax[0, 0]
    scatterplot.plot_series(target_ax, series, title="",
                            x_axis="# of Mutations",
                            y_axis="Fraction of Variants that are Functional",
                            alpha=1.0, connect_dots=True, size=30,
                            edgecolors='k')
    target_ax.set_xlim(xmin=1, xmax=5)
    target_ax.set_xticks(xrange(1, 6))

    pconv.save_fig(fig, output_prefix, canonical + "_fraction_func_mutant",
                   6, 6, size=15)
def plot(disc_metrics_1, disc_metrics_2, title1, title2, output_pre, add_slash=True):
    """Scatter each metric of disc_metrics_1 against the same metric of disc_metrics_2.

    Both arguments are indexed as metrics[pdb][metric_name].  The PDBs and
    metric names are taken from disc_metrics_1; one panel with a regression
    line is drawn per metric.  The output path is output_pre, an optional "/",
    then "<title1>_<title2>.txt".
    """
    pdbs = sorted(disc_metrics_1.keys())
    reference_metrics = disc_metrics_1[pdbs[0]]
    n_metrics = len(reference_metrics)

    fig, axarr = conv.create_ax(n_metrics, 1)

    for x_ind, metric_name in enumerate(reference_metrics.keys()):
        ax = axarr[0, x_ind]
        x = [disc_metrics_1[pdb][metric_name] for pdb in pdbs]
        y = [disc_metrics_2[pdb][metric_name] for pdb in pdbs]
        scatterplot.draw_actual_plot(ax, x, y, 'b', metric_name, title1,
                                     title2, size=20, edgecolors='k')
        scatterplot.plot_regression(ax, x, y, False)

    separator = "/" if add_slash else ""
    filename = output_pre + separator + title1 + "_" + title2 + ".txt"

    conv.save_fig(fig, filename, "disc_v_disc", n_metrics * 3, 3, size=9)
def main(data_file, output_prefix, degree_file, width, height):
    """Plot mean "Frac" per sequence as a histogram and per degree as a bar chart.

    data_file is read for each sequence's "Frac" entry and degree_file for
    each sequence's "Degree" entry.  Two figures are saved under
    output_prefix with the suffixes "hist" and "bar", both sized
    width x height.
    """
    frac_by_seq = seq_IO.read_sequences(data_file, additional_params=True,
                                        header=True, list_vals=True)
    degree_by_seq = seq_IO.read_sequences(degree_file, additional_params=True,
                                          header=True)

    # Bucket the mean fraction of every sequence under that sequence's degree.
    degree_frac = defaultdict(list)
    for seq_name, seq_info in frac_by_seq.items():
        bucket = degree_frac[degree_by_seq[seq_name]['Degree']]
        bucket.append(np.mean(seq_info["Frac"]))

    data = [np.mean(seq_info["Frac"]) for seq_info in frac_by_seq.values()]
    # values() and keys() of the same unmodified dict iterate in the same
    # order, so the bars line up with the tick labels used below.
    degree_frac_avg = [np.mean(fracs) for fracs in degree_frac.values()]
    degree_frac_std = [np.std(fracs) for fracs in degree_frac.values()]

    hist_fig, hist_axes = pconv.create_ax(1, 1, shx=False, shy=False)
    hist.draw_actual_plot(hist_axes[0, 0], data, "", "", normed=False,
                          nbins=30, edgecolor=None, log=False)
    pconv.save_fig(hist_fig, output_prefix, "hist", width, height,
                   tight=True, size=10)

    bar_fig, bar_axes = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(bar_axes[0, 0], degree_frac_avg, 'g', "", "Degree",
                         "Fraction Shortest Path Uncleaved",
                         tick_label=degree_frac.keys(), yerr=degree_frac_std)
    pconv.save_fig(bar_fig, output_prefix, "bar", width, height,
                   tight=True, size=10)
def main(epistasis_file):
    """Draw Venn diagrams comparing observed and predicted fitness classes.

    Each data row (the first line is skipped as a header) is split on commas
    and stored under tokens[0].  Three diagrams are saved next to
    epistasis_file with the suffixes "venn" (cleaved), "vennun" (uncleaved)
    and "vennmid" (middle).  The per-mutation-count bar and scatter plots are
    disabled in this version.
    """
    # Record layout, by index:
    #   0 tokens[1], 1 ending fitness (tokens[2]), 2 epistasis as float,
    #   3 every other token from index 4 on,
    #   4 every other token from index 5 on, stripped (intermediate fitnesses)
    dict_epistasis = {}
    with open(epistasis_file) as handle:
        rows = handle.readlines()
        for row in rows[1:]:
            tokens = row.split(',')
            # NOTE(review): this guard looks up a tuple key, but entries are
            # stored under the plain string tokens[0], so it never finds
            # anything and later rows overwrite earlier ones - confirm intent.
            if dict_epistasis.get((tokens[2], tokens[0])) is not None:
                continue
            dict_epistasis[tokens[0]] = [
                tokens[1],
                tokens[2],
                float(tokens[3]),
                tokens[4::2],
                [tok.strip() for tok in tokens[5::2]],
            ]

    def observed(fitness):
        # Keys whose ending fitness equals the given class.
        return set(key for key, rec in dict_epistasis.items()
                   if rec[1] == fitness)

    seq_func = observed("CLEAVED")
    # Predicted cleaved: every intermediate is cleaved.
    seq_pred_func = set(
        key for key, rec in dict_epistasis.items()
        if all(fit == "CLEAVED" for fit in rec[4]))

    seq_unfunc = observed("UNCLEAVED")
    # Predicted uncleaved: any uncleaved intermediate, or exactly two middle ones.
    seq_pred_unfunc = set(
        key for key, rec in dict_epistasis.items()
        if any(fit == "UNCLEAVED" for fit in rec[4])
        or sum(fit == "MIDDLE" for fit in rec[4]) == 2)

    seq_midfunc = observed("MIDDLE")
    # Predicted middle: any middle intermediate.
    seq_pred_midfunc = set(
        key for key, rec in dict_epistasis.items()
        if any(fit == "MIDDLE" for fit in rec[4]))

    fig_venn, axarr_venn = pconv.create_ax(1, 1)
    fig_vennun, axarr_vennun = pconv.create_ax(1, 1)
    fig_vennmid, axarr_vennmid = pconv.create_ax(1, 1)

    venn2([seq_func, seq_pred_func],
          set_labels=["Cleaved", "Pred Cleaved"], ax=axarr_venn[0, 0])
    venn2([seq_unfunc, seq_pred_unfunc],
          set_labels=["Uncleaved", "Pred Uncleaved"], ax=axarr_vennun[0, 0])
    venn2([seq_midfunc, seq_pred_midfunc],
          set_labels=["Middle", "Pred Middle"], ax=axarr_vennmid[0, 0])

    pconv.save_fig(fig_venn, epistasis_file, "venn", 5, 5, tight=False, size=14)
    pconv.save_fig(fig_vennun, epistasis_file, "vennun", 5, 5, tight=False,
                   size=14)
    pconv.save_fig(fig_vennmid, epistasis_file, "vennmid", 5, 5, tight=False,
                   size=14)
def main(sequence_ratio_file, width, height, pattern, legend):
    """Bar-plot the FLAG/HA ratio per sequence, optionally with a legend.

    Each row read from sequence_ratio_file is indexed as: 0 = sequence,
    1 = ratio, 2 = error, 3 = label, and optionally 4 = colour.  Without a
    colour column the colour comes from convert_label_color(label).  With
    legend set, the labels "CLEAVED", "MIDDLE" and "UNCLEAVED" are left out
    of the legend.
    """
    sequence_ratio = seq_IO.read_sequences(sequence_ratio_file,
                                           additional_params=True)

    seqs = [row[0] for row in sequence_ratio]
    avg_ratio = [row[1] for row in sequence_ratio]
    std = [row[2] for row in sequence_ratio]
    label = [row[3] for row in sequence_ratio]

    # Only the first row is checked for the optional colour column.
    has_color_column = len(sequence_ratio[0]) > 4
    if has_color_column:
        color = [row[4] for row in sequence_ratio]
    else:
        color = [convert_label_color(name) for name in label]

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    target_ax = axarr[0, 0]

    if not legend:
        bar.draw_actual_plot(target_ax, avg_ratio, color, "", "",
                             "FLAG/HA Ratio", tick_label=seqs, yerr=std,
                             pattern=pattern)
        lgd = None
    else:
        hidden = ["CLEAVED", "MIDDLE", "UNCLEAVED"]
        label_legend = [None if name in hidden else name for name in label]
        patches, labels = bar.draw_actual_plot(
            target_ax, avg_ratio, color, "", "", "FLAG/HA Ratio",
            tick_label=seqs, yerr=std, pattern=pattern, label=label_legend)
        lgd = target_ax.legend(patches, labels, loc="upper center",
                               bbox_to_anchor=(0.5, 1.05), borderaxespad=0.,
                               prop={'size': 9}, ncol=2, fancybox=True)
        print(patches)
        print(labels)

    target_ax.set_ylim([0, 1.3])
    pconv.save_fig(fig, sequence_ratio_file, "plot", width, height,
                   tight=True, size=10, extra_artists=lgd)
def main(epistasis_file):
    """Summarise an epistasis CSV as a stacked bar, a line plot and a Venn.

    Every non-header row is split on commas.  Rows are keyed by
    ``(tokens[0], tokens[2])``; a row is skipped when the reversed pair is
    already stored.  The stored value is a list laid out as (names taken
    from the original author's note):

        [0] Starting Ratio   float(tokens[1])
        [1] Ending Ratio     float(tokens[3])
        [2] Epistasis        float(tokens[5])
        [3] Ending Fitness   tokens[4]
        [4] # of Mutations   len(tokens[6::3])
        [5] InterSeqs        tokens[6::3]
        [6] InterFits        tokens[7::3]
        [7] InterRatios      tokens[8::3]

    Counts are gathered for mutation counts 2 through 5 and three figures
    are written through ``pconv.save_fig`` with suffixes "plot",
    "pred_v_cl" and "venn".

    Args:
        epistasis_file: Path of the CSV to read; also used as the output
            base passed to ``pconv.save_fig``.
    """
    cutoff = 0.000005  # |epistasis| below this counts as "no epistasis"

    with open(epistasis_file) as handle:
        rows = handle.readlines()[1:]  # drop the header line

    dict_epistasis = {}
    for row in rows:
        tokens = row.split(',')
        if dict_epistasis.get((tokens[2], tokens[0])) is not None:
            continue  # the reversed pair is already recorded
        dict_epistasis[(tokens[0], tokens[2])] = [
            float(tokens[1]), float(tokens[3]), float(tokens[5]),
            tokens[4], len(tokens[6::3]),
            tokens[6::3], tokens[7::3], tokens[8::3],
        ]

    def is_negative(entry):
        return entry[2] < -cutoff

    def is_neutral(entry):
        return abs(entry[2]) < cutoff

    def is_positive(entry):
        return entry[2] > cutoff

    def ends_cleaved(entry):
        return entry[3] == "CLEAVED"

    def all_inter_cleaved(entry):
        return all(fit == "CLEAVED" for fit in entry[6])

    def anything(entry):
        return True

    def tally(accept, n_mut):
        """Count stored entries with n_mut mutations that satisfy accept."""
        return sum(1 for entry in dict_epistasis.values()
                   if accept(entry) and entry[4] == n_mut)

    neg_epistasis = [tally(is_negative, n) for n in range(2, 6)]
    no_epistasis = [tally(is_neutral, n) for n in range(2, 6)]
    pos_epistasis = [tally(is_positive, n) for n in range(2, 6)]
    n_functional = [tally(ends_cleaved, n) for n in range(2, 6)]
    n_should_be_functional = [tally(all_inter_cleaved, n)
                              for n in range(2, 6)]
    n_total = [float(tally(anything, n)) for n in range(2, 6)]

    seq_func = set(key[1] for key, val in dict_epistasis.items()
                   if ends_cleaved(val))
    seq_pred_func = set(key[1] for key, val in dict_epistasis.items()
                        if all_inter_cleaved(val))

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    fig2, axarr2 = pconv.create_ax(1, 1)

    # Stacked bars: "No" at the bottom, then "Neg.", then "Pos.".
    stacked_bottom = [no + neg
                      for no, neg in zip(no_epistasis, neg_epistasis)]
    artists = []
    artists.extend(
        plot_epi(no_epistasis, n_total, axarr[0, 0], "No", color="gray"))
    artists.extend(
        plot_epi(neg_epistasis, n_total, axarr[0, 0], "Neg.",
                 bottom=no_epistasis, color="white"))
    artists.extend(
        plot_epi(pos_epistasis, n_total, axarr[0, 0], "Pos.",
                 bottom=stacked_bottom, color="black"))

    n_func_frac = [func / total
                   for func, total in zip(n_functional, n_total)]
    n_pred_frac = [pred / total
                   for pred, total in zip(n_should_be_functional, n_total)]

    cleaved_series = (range(2, 6), n_func_frac, "% Cleaved")
    predicted_series = (range(2, 6), n_pred_frac, "% Pred Cleaved")
    scatterplot.plot_series(axarr2[0, 0],
                            [cleaved_series, predicted_series],
                            "", "Number of Mutations",
                            "Fraction of Total Cases",
                            size=40, connect_dots=True, alpha=1.0)
    axarr2[0, 0].set_ylim([0, 4.0])

    fig_venn, axarr_venn = pconv.create_ax(1, 1)
    venn2([seq_func, seq_pred_func],
          set_labels=["Cleaved", "Pred Cleaved"],
          ax=axarr_venn[0, 0])

    lgd = axarr[0, 0].legend(artists, ["No", "Neg.", "Pos."],
                             loc="center left", bbox_to_anchor=(1.05, 0.5),
                             borderaxespad=0., prop={'size': 9}, ncol=1,
                             fancybox=True)

    pconv.save_fig(fig, epistasis_file, "plot", 3, 2.5, tight=False,
                   size=9, extra_artists=lgd)
    pconv.save_fig(fig2, epistasis_file, "pred_v_cl", 5, 5, tight=True,
                   size=10)
    pconv.save_fig(fig_venn, epistasis_file, "venn", 5, 5, tight=True,
                   size=14)
def main(seq_file, canonical_file, output_prefix):
    """Plot, per canonical sequence, a fraction against the number of steps.

    For every canonical sequence a graph is built over the sequences listed
    in ``seq_file``: two entries are connected when ``gsconv.hamdist`` of
    their sequences is below 2.  If the canonical sequence is not in the
    file, the output of ``gsconv.gen_hamdist_one(canonical)`` plus the
    canonical itself are appended with indices continuing after the file's
    entries.  ``trans_matrix`` turns the graph into a matrix, and the values
    returned by ``find_frac`` / ``square_matrix`` for steps 1..22 form one
    plotted series per canonical.

    Args:
        seq_file: Text file with one sequence per line.
        canonical_file: Path handed to ``seq_IO.read_sequences`` to obtain
            the canonical sequences.
        output_prefix: Output base passed to ``conv.save_fig``.
    """
    import sys  # local import: only needed for the print-options value

    series = []
    canonical_list_seq = seq_IO.read_sequences(canonical_file)
    print("Beginning Script: {0}".format(datetime.datetime.now()))

    # The sequence file is the same for every canonical: read it once
    # instead of re-opening it on each iteration.
    with open(seq_file) as strings:
        seq_list = strings.read().splitlines()

    # threshold='nan' is rejected by newer numpy releases; sys.maxsize is
    # the documented way to request untruncated array printing.
    numpy.set_printoptions(threshold=sys.maxsize)

    for canonical in canonical_list_seq:
        seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
        orig_len = len(seq_ind_list)

        if canonical not in seq_list:
            one_away = gsconv.gen_hamdist_one(canonical)
            # Keep the canonical last, and only once.
            one_away = [o for o in one_away if o != canonical] + [canonical]
            seq_ind_list = seq_ind_list + [
                (o, ind) for ind, o in enumerate(one_away, orig_len)
            ]

        edges = [(seq2, seq)
                 for seq, seq2 in itertools.combinations(seq_ind_list, 2)
                 if gsconv.hamdist(seq2[0], seq[0]) < 2]
        print(len(seq_ind_list))
        print("Generated Edges: {0}".format(datetime.datetime.now()))

        canon_ind = [i for (s, i) in seq_ind_list if s == canonical][0]
        T_mat = trans_matrix(seq_ind_list, edges)

        x = [0]
        y = [0]
        print("Transformed Matrix: {0}".format(datetime.datetime.now()))
        x.append(1)
        y.append(find_frac(T_mat, canon_ind, orig_len))

        T_mat_new = T_mat
        for i in range(2, 23):
            x.append(i)
            T_mat_new, frac = square_matrix(T_mat_new, T_mat, canon_ind,
                                            orig_len)
            y.append(frac)
            print("Raised Matrix {0}: {1}".format(
                i, datetime.datetime.now()))

        series.append([x, y, canonical])

    fig, ax = conv.create_ax(1, 1)
    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']
    scatterplot.plot_series(ax[0, 0], series, title="",
                            x_axis="Number of Steps", colors=color,
                            y_axis="Fraction Cleaved Variants Reached",
                            alpha=0.85, connect_dots=True, size=15,
                            edgecolors='k', linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(list(range(1, 23, 3)))
    lgd = conv.add_legend(ax[0, 0], location='upper center',
                          bbox_to_anchor=(0.5, 1.05), ncol=2, size=8)
    conv.save_fig(fig, output_prefix, "fraction_func", 2.5, 3, size=9.5,
                  extra_artists=lgd)
    print("Outputted Figure: {0}".format(datetime.datetime.now()))
def plot_weights_dS(w_dS_sorted, col_dS, unique_z_vals, title1, title2, x_axis_name, y_axis_name, z_axis_name):
    """Draw one filled contour panel of dS per unique z value and save it.

    Columns 0, 1 and 2 of ``w_dS_sorted`` are used as the x, y and z
    coordinates; column ``col_dS`` holds the plotted value.  In each panel
    the minimum is marked and annotated: red for the panel whose z value
    matches the global minimum row, black otherwise.

    Args:
        w_dS_sorted: 2-D numpy array indexed as described above.
        col_dS: Column index of the plotted value; 3 selects the
            'combined_S - Rosetta_S' colorbar label, anything else
            'combined_S - Amber_S'.
        unique_z_vals: numpy array of the distinct z values (one panel each).
        title1, title2: Used only to build the output file name.
        x_axis_name, y_axis_name, z_axis_name: Prefixes for axis labels and
            panel titles.

    NOTE(review): reads the module-level ``args.output_pre`` for the output
    directory -- confirm ``args`` is defined before this is called.
    """
    n_unique_z_vals = unique_z_vals.shape[0]
    fig, axarr = conv.create_ax(n_unique_z_vals, 1, True,True)
    #find values in row of total_min
    min_vals = w_dS_sorted[w_dS_sorted[:,col_dS]==np.amin(w_dS_sorted[:,col_dS]),:]
    min_dS = min_vals[0,col_dS]
    max_dS = np.amax(w_dS_sorted[:,col_dS])
    dS_list = sorted(w_dS_sorted[:,col_dS].tolist())
    # Contour levels are every nlev-th sorted dS value.
    # NOTE(review): nlev must be an int for the slice step, so this relies on
    # Python 2 integer division; with fewer than 2 values nlev is 0 and the
    # slice below raises.
    nlev = len(dS_list)/10 if len(dS_list)>10 else len(dS_list)/2
    levels = dS_list[0::nlev]
    subtitle_prefix = z_axis_name + ": "
    xlabel = x_axis_name + " Weights"
    ylabel = y_axis_name + " Weights"
    for ax_ind,z_val in enumerate(sorted(unique_z_vals.tolist())):
        ax = axarr[0,ax_ind]
        # Rows belonging to this z slice.
        x = w_dS_sorted[w_dS_sorted[:,2]==z_val,0]
        y = w_dS_sorted[w_dS_sorted[:,2]==z_val,1]
        dS = w_dS_sorted[w_dS_sorted[:,2]==z_val,col_dS]
        # assumes each slice forms a grid with n_unique_z_vals columns -- TODO confirm
        X = x.reshape(-1, n_unique_z_vals)
        Y = y.reshape(-1, n_unique_z_vals)
        DS = dS.reshape(-1, n_unique_z_vals)
        curr_min_ind = np.argmin(dS)
        CS = ax.contourf(X, Y, DS, extend='both', levels=levels)
        CSlines = ax.contour(X, Y, DS, linestyles='solid', colors=('w',), levels=levels)
        if z_val == min_vals[0,2]:
            # This panel contains the (first) global minimum row.
            min_x = [min_vals[0,0]]
            min_y = [min_vals[0,1]]
            min_ds = [min_vals[0,col_dS]]
            ann_txt = "Global Minimum: {2:.2f} at ({0:.2f}, {1:.2f})".format(min_x[0], min_y[0], min_ds[0])
            ax.scatter(min_x, min_y, c='r', zorder=1)
        else:
            # Otherwise mark the minimum within this slice only.
            min_x = [x[curr_min_ind]]
            min_y = [y[curr_min_ind]]
            min_ds = [dS[curr_min_ind]]
            ann_txt = "Minimum: {2:.2f} at ({0:.2f}, {1:.2f})".format(min_x[0], min_y[0], min_ds[0])
            ax.scatter(min_x, min_y, c='k', zorder=1)
        ax.annotate(ann_txt, xy=(min_x[0], min_y[0]), xytext=(-20,20), textcoords='offset points', ha='center', va='bottom', bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3), arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.5', color='red'),size=10)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title("{0} : {1:.2f}".format(subtitle_prefix,z_val))
    # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
    # the colorbar is taken to be created once, from the last panel's
    # contour sets, after the loop -- confirm against the original file.
    cbar = plt.colorbar(CS)
    if col_dS == 3:
        cbar.ax.set_ylabel('combined_S - Rosetta_S')
    else:
        cbar.ax.set_ylabel('combined_S - Amber_S')
    # Add the contour line levels to the colorbar
    cbar.add_lines(CSlines)
    filename = args.output_pre + "/" + title1 + "_" + title2 + "_" + str(col_dS) + ".txt"
    conv.save_fig(fig, filename, "_weights_vs_deltaS", 4*n_unique_z_vals, 4)
def main(list_sequence_names, canonical_list, output_prefix):
    """Histogram the Hamming distances from cleaved to uncleaved sequences.

    For every cleaved sequence the minimum, mean and maximum of
    ``conv.hamdist`` to all uncleaved sequences are collected.  The three
    collections are drawn as histograms in a three-row figure saved with
    the suffix "dist_from_bounds".  Sequences that also appear in the
    canonical list have their three values printed.

    Args:
        list_sequence_names: Iterable of ``(path, label)`` pairs; the first
            path labelled "CLEAVED" and the first labelled "UNCLEAVED" are
            read.
        canonical_list: Path handed to ``seq_IO.read_sequences`` for the
            canonical sequences.
        output_prefix: Output base passed to ``pconv.save_fig``.
    """
    def first_path_labelled(wanted):
        return [path for path, tag in list_sequence_names
                if tag == wanted][0]

    canonical_list_seq = seq_IO.read_sequences(canonical_list)
    cleaved_seqs = seq_IO.read_sequences(first_path_labelled("CLEAVED"))
    uncleaved_seqs = seq_IO.read_sequences(first_path_labelled("UNCLEAVED"))

    min_dist, avg_dist, max_dist = [], [], []
    for seq in cleaved_seqs:
        distances = [conv.hamdist(seq, unc) for unc in uncleaved_seqs]
        closest = min(distances)
        average = numpy.mean(distances)
        farthest = max(distances)
        min_dist.append(closest)
        avg_dist.append(average)
        max_dist.append(farthest)
        if seq in canonical_list_seq:
            print(seq)
            print(closest)
            print(average)
            print(farthest)

    fig, ax = pconv.create_ax(1, 3)
    panels = [
        (min_dist, "Min. Distance from Boundary", "Minimum Distances"),
        (avg_dist, "Avg. Distance from Boundary", "Average Distances"),
        (max_dist, "Max. Distance from Boundary", "Maximum Distances"),
    ]
    for row, (values, title, x_label) in enumerate(panels):
        hist.draw_actual_plot(ax[row, 0], values, title, x_label,
                              log=False, normed=True, label=None,
                              nbins=15, stacked=False)

    pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
def main(seq_file, canonical_file, output_prefix):
    """Plot, per canonical sequence, a fraction against the number of steps.

    For every canonical sequence a graph is built over the sequences listed
    in ``seq_file``: two entries are connected when ``gsconv.hamdist`` of
    their sequences is below 2.  If the canonical sequence is not in the
    file, the output of ``gsconv.gen_hamdist_one(canonical)`` plus the
    canonical itself are appended with indices continuing after the file's
    entries.  ``trans_matrix`` turns the graph into a matrix, and the values
    returned by ``find_frac`` / ``square_matrix`` for steps 1..22 form one
    plotted series per canonical.

    Args:
        seq_file: Text file with one sequence per line.
        canonical_file: Path handed to ``seq_IO.read_sequences`` to obtain
            the canonical sequences.
        output_prefix: Output base passed to ``conv.save_fig``.
    """
    import sys  # local import: only needed for the print-options value

    series = []
    canonical_list_seq = seq_IO.read_sequences(canonical_file)
    print("Beginning Script: {0}".format(datetime.datetime.now()))

    # The sequence file is the same for every canonical: read it once
    # instead of re-opening it on each iteration.
    with open(seq_file) as strings:
        seq_list = strings.read().splitlines()

    # threshold='nan' is rejected by newer numpy releases; sys.maxsize is
    # the documented way to request untruncated array printing.
    numpy.set_printoptions(threshold=sys.maxsize)

    for canonical in canonical_list_seq:
        seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
        orig_len = len(seq_ind_list)

        if canonical not in seq_list:
            one_away = gsconv.gen_hamdist_one(canonical)
            # Keep the canonical last, and only once.
            one_away = [o for o in one_away if o != canonical] + [canonical]
            seq_ind_list = seq_ind_list + [
                (o, ind) for ind, o in enumerate(one_away, orig_len)
            ]

        edges = [(seq2, seq)
                 for seq, seq2 in itertools.combinations(seq_ind_list, 2)
                 if gsconv.hamdist(seq2[0], seq[0]) < 2]
        print(len(seq_ind_list))
        print("Generated Edges: {0}".format(datetime.datetime.now()))

        canon_ind = [i for (s, i) in seq_ind_list if s == canonical][0]
        T_mat = trans_matrix(seq_ind_list, edges)

        x = [0]
        y = [0]
        print("Transformed Matrix: {0}".format(datetime.datetime.now()))
        x.append(1)
        y.append(find_frac(T_mat, canon_ind, orig_len))

        T_mat_new = T_mat
        for i in range(2, 23):
            x.append(i)
            T_mat_new, frac = square_matrix(T_mat_new, T_mat, canon_ind,
                                            orig_len)
            y.append(frac)
            print("Raised Matrix {0}: {1}".format(
                i, datetime.datetime.now()))

        series.append([x, y, canonical])

    fig, ax = conv.create_ax(1, 1)
    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']
    scatterplot.plot_series(ax[0, 0], series, title="",
                            x_axis="Number of Steps", colors=color,
                            y_axis="Fraction Cleaved Variants Reached",
                            alpha=0.85, connect_dots=True, size=15,
                            edgecolors='k', linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(list(range(1, 23, 3)))
    lgd = conv.add_legend(ax[0, 0], location='upper center',
                          bbox_to_anchor=(0.5, 1.05), ncol=2, size=8)
    conv.save_fig(fig, output_prefix, "fraction_func", 2.5, 3, size=9.5,
                  extra_artists=lgd)
    print("Outputted Figure: {0}".format(datetime.datetime.now()))
def main(list_nodes, output_prefix, metric):
    """Histogram node metrics of cleaved sequences, grouped by label count.

    Each ``(path, label)`` pair in ``list_nodes`` is read and split by the
    "type" field into cleaved / middle / uncleaved dictionaries stored under
    its label.  Cleaved sequences are then grouped by the number of labels
    under which they occur (1 to 4), and one histogram panel per metric is
    drawn with one series per group.  A second, narrow figure holding a
    'Blues' gradient is saved with the suffix "colorbar".

    Args:
        list_nodes: Iterable of ``(path, label)`` pairs.
        output_prefix: Output base passed to ``pconv.save_fig``.
        metric: "metrics" plots every field of ``sequences["DEMEE"]`` (from
            the last file read) that is not in the exclusion list;
            "Fraction_Cleaved" plots
            ``average_fraction_neighbors_cleaved``; any other value is
            plotted as a single field name.
    """
    cleaved_seq = {}
    uncleaved_seq = {}
    middle_seq = {}
    for nodes, label in list_nodes:
        sequences = seq_IO.read_sequences(nodes, additional_params=True,
                                          header=True)
        cleaved_seq[label] = {key: val for key, val in sequences.items()
                              if val["type"] == "CLEAVED"}
        middle_seq[label] = {key: val for key, val in sequences.items()
                             if val["type"] == "MIDDLE"}
        uncleaved_seq[label] = {key: val for key, val in sequences.items()
                                if val["type"] == "UNCLEAVED"}

    if metric == "metrics":
        labels_non_plot = ["label", "fitness", "type", "canonical"]
        orig_labels_to_plot = sorted(
            key for key in sequences["DEMEE"].keys()
            if key not in labels_non_plot)
        labels_to_plot = sorted(orig_labels_to_plot)
    else:
        orig_labels_to_plot = [metric]
        labels_to_plot = [metric]

    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)
    nbins = 10

    # A sequence appears once per label it is cleaved under, so the groups
    # below deliberately keep those repeats.
    list_seqs = [k for d in cleaved_seq.values() for k in d.keys()]
    count_seqs = Counter(list_seqs)
    seqs_4_l = [s for s in list_seqs if count_seqs[s] == 4]
    seqs_3_l = [s for s in list_seqs if count_seqs[s] == 3]
    seqs_2_l = [s for s in list_seqs if count_seqs[s] == 2]
    seqs_1_l = [s for s in list_seqs if count_seqs[s] == 1]

    if metric != "Fraction_Cleaved":
        seqs_4 = list_metrics(cleaved_seq, seqs_4_l, orig_labels_to_plot)
        seqs_3 = list_metrics(cleaved_seq, seqs_3_l, orig_labels_to_plot)
        seqs_2 = list_metrics(cleaved_seq, seqs_2_l, orig_labels_to_plot)
        seqs_1 = list_metrics(cleaved_seq, seqs_1_l, orig_labels_to_plot)

    # Four shades, lightest first, assigned to the series by position.
    series_colors = [tuple(c) for c in
                     plt.cm.Blues(np.linspace(0.2, 1, 4)).tolist()]

    for ind, key in enumerate(labels_to_plot):
        log = key == "pageranks"
        normed = True
        # The series labels are built next to the data so that they always
        # match its length and order (previously five labels were passed
        # for four series, in an order that fit neither branch).
        if key == "Fraction_Cleaved":
            data = [
                average_fraction_neighbors_cleaved(
                    cleaved_seq, uncleaved_seq, middle_seq, seqs_4_l),
                average_fraction_neighbors_cleaved(
                    cleaved_seq, uncleaved_seq, middle_seq, seqs_3_l),
                average_fraction_neighbors_cleaved(
                    cleaved_seq, uncleaved_seq, middle_seq, seqs_2_l),
                average_fraction_neighbors_cleaved(
                    cleaved_seq, uncleaved_seq, middle_seq, seqs_1_l),
            ]
            series_labels = ["Cl. by 4", "Cl. by 3", "Cl. by 2", "Cl. by 1"]
        else:
            data = [
                get_data_from_dict(seqs_1, key),
                get_data_from_dict(seqs_2, key),
                get_data_from_dict(seqs_3, key),
                get_data_from_dict(seqs_4, key),
            ]
            series_labels = ["Cl. by 1", "Cl. by 2", "Cl. by 3", "Cl. by 4"]

        hist.draw_actual_plot(axarr[0, ind], data, "", key.capitalize(),
                              colors=series_colors, log=log, normed=normed,
                              label=series_labels, nbins=nbins)
        axarr[0, ind].ticklabel_format(axis='x', style='sci',
                                       scilimits=(-2, 2))
        #pconv.add_legend(axarr[0,ind], location="upper right")

    pconv.save_fig(fig, output_prefix, metric, n_to_plot * 3, 3,
                   tight=True, size=9)

    fig_bar, axarr_bar = pconv.create_ax(1, 1, shx=False, shy=False)
    gradient = np.linspace(1, 0.2, 256)
    # Two identical columns give imshow a 2-D image to draw.
    gradient = np.array(list(zip(gradient, gradient)))
    axarr_bar[0, 0].imshow(gradient, aspect='auto',
                           cmap=plt.get_cmap('Blues'))
    # Hide every tick and tick label.  Booleans are used because newer
    # matplotlib no longer interprets the string 'off' as False.
    plt.tick_params(
        axis='both',        # apply to both the x and the y axis
        which='both',       # major and minor ticks
        bottom=False,       # no ticks along the bottom edge
        top=False,          # no ticks along the top edge
        labelbottom=False,  # no labels along the bottom edge
        left=False,         # no ticks along the left edge
        right=False,        # no ticks along the right edge
        labelright=False)   # no labels along the right edge
    pconv.save_fig(fig_bar, output_prefix, "colorbar", 0.3, 3, tight=True)
def main(seq_file, canonical_file, output_prefix):
    """Plot a per-step fraction for two hard-coded canonical sequences.

    A graph is built over the sequences in ``seq_file``: every sequence is
    linked to each output of ``gsconv.gen_hamdist_one`` that is also in the
    file, each unordered pair being stored once.  ``trans_matrix`` turns the
    graph into a matrix; for steps 1..22 the value returned by
    ``find_frac`` is recorded for each canonical sequence and plotted.

    Args:
        seq_file: Text file with one sequence per line.
        canonical_file: Currently unused -- the canonical list is hard-coded
            below and the read is commented out.
        output_prefix: Output base passed to ``conv.save_fig``.
    """
    import sys  # local import: only needed for the print-options value

    #canonical_list_seq = seq_IO.read_sequences(canonical_file)
    canonical_list_seq = ["DEMEE", "DEMED"]
    print("Beginning Script: {0}".format(datetime.datetime.now()))

    with open(seq_file) as strings:
        seq_list = strings.read().splitlines()
    seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
    seq_ind_dict = {seq: ind for seq, ind in seq_ind_list}
    orig_len = len(seq_ind_list)

    edges = []
    edges_set = set()
    print("Read in Data: {0}".format(datetime.datetime.now()))
    for seq, seq_ind in seq_ind_dict.items():
        # Collect the in-file neighbours once, so the result is the same
        # whether gen_hamdist_one returns a list or a one-shot iterator.
        present = [n for n in gsconv.gen_hamdist_one(seq)
                   if n in seq_ind_dict]
        edges_set.update((seq, n) for n in present)
        # Skip a pair whose reverse was already recorded by an earlier
        # sequence, so each undirected edge is emitted once.
        edges += [((seq, seq_ind), (n, seq_ind_dict[n]))
                  for n in present if (n, seq) not in edges_set]

    print(len(seq_ind_list))
    print("Generated Edges: {0}".format(datetime.datetime.now()))

    # threshold='nan' is rejected by newer numpy releases; sys.maxsize is
    # the documented way to request untruncated array printing.
    numpy.set_printoptions(threshold=sys.maxsize)

    # First index at which each canonical sequence occurs in the file.
    canon_ind_dict = {
        canonical: [i for (s, i) in seq_ind_list if s == canonical][0]
        for canonical in canonical_list_seq
    }

    T_mat = trans_matrix(seq_ind_list, edges)
    print("Transformed Matrix: {0}".format(datetime.datetime.now()))

    canon_x = {can: [0, 1] for can in canonical_list_seq}
    canon_y = {
        can: [0.0, find_frac(T_mat, canon_ind_dict[can], orig_len)]
        for can in canonical_list_seq
    }
    print("Made x and y dicts: {0}".format(datetime.datetime.now()))

    T_mat_new = T_mat
    for i in range(2, 23):
        T_mat_new = square_matrix(T_mat_new, T_mat)
        for can in canonical_list_seq:
            canon_x[can].append(i)
            canon_y[can].append(
                find_frac(T_mat_new, canon_ind_dict[can], orig_len))
        print("Raised Matrix {0}: {1}".format(i, datetime.datetime.now()))

    series = [[canon_x[can], canon_y[can], can]
              for can in canonical_list_seq]

    fig, ax = conv.create_ax(1, 1)
    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']
    scatterplot.plot_series(ax[0, 0], series, title="",
                            x_axis="Number of Steps", colors=color,
                            y_axis="Fraction Cleaved Variants Reached",
                            alpha=0.85, connect_dots=True, size=15,
                            edgecolors='k', linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(list(range(1, 23, 3)))
    lgd = conv.add_legend(ax[0, 0], location='upper center',
                          bbox_to_anchor=(0.5, 1.05), ncol=2, size=8)
    conv.save_fig(fig, output_prefix, "fraction_func", 2.5, 3, size=9.5,
                  extra_artists=lgd)
    print("Outputted Figure: {0}".format(datetime.datetime.now()))
def main(seq_file, canonical_file, output_prefix):
    """Plot a per-step fraction for two hard-coded canonical sequences.

    A graph is built over the sequences in ``seq_file``: every sequence is
    linked to each output of ``gsconv.gen_hamdist_one`` that is also in the
    file, each unordered pair being stored once.  ``trans_matrix`` turns the
    graph into a matrix; for steps 1..22 the value returned by
    ``find_frac`` is recorded for each canonical sequence and plotted.

    Args:
        seq_file: Text file with one sequence per line.
        canonical_file: Currently unused -- the canonical list is hard-coded
            below and the read is commented out.
        output_prefix: Output base passed to ``conv.save_fig``.
    """
    import sys  # local import: only needed for the print-options value

    #canonical_list_seq = seq_IO.read_sequences(canonical_file)
    canonical_list_seq = ["DEMEE", "DEMED"]
    print("Beginning Script: {0}".format(datetime.datetime.now()))

    with open(seq_file) as strings:
        seq_list = strings.read().splitlines()
    seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
    seq_ind_dict = {seq: ind for seq, ind in seq_ind_list}
    orig_len = len(seq_ind_list)

    edges = []
    edges_set = set()
    print("Read in Data: {0}".format(datetime.datetime.now()))
    for seq, seq_ind in seq_ind_dict.items():
        # Collect the in-file neighbours once, so the result is the same
        # whether gen_hamdist_one returns a list or a one-shot iterator.
        present = [n for n in gsconv.gen_hamdist_one(seq)
                   if n in seq_ind_dict]
        edges_set.update((seq, n) for n in present)
        # Skip a pair whose reverse was already recorded by an earlier
        # sequence, so each undirected edge is emitted once.
        edges += [((seq, seq_ind), (n, seq_ind_dict[n]))
                  for n in present if (n, seq) not in edges_set]

    print(len(seq_ind_list))
    print("Generated Edges: {0}".format(datetime.datetime.now()))

    # threshold='nan' is rejected by newer numpy releases; sys.maxsize is
    # the documented way to request untruncated array printing.
    numpy.set_printoptions(threshold=sys.maxsize)

    # First index at which each canonical sequence occurs in the file.
    canon_ind_dict = {
        canonical: [i for (s, i) in seq_ind_list if s == canonical][0]
        for canonical in canonical_list_seq
    }

    T_mat = trans_matrix(seq_ind_list, edges)
    print("Transformed Matrix: {0}".format(datetime.datetime.now()))

    canon_x = {can: [0, 1] for can in canonical_list_seq}
    canon_y = {
        can: [0.0, find_frac(T_mat, canon_ind_dict[can], orig_len)]
        for can in canonical_list_seq
    }
    print("Made x and y dicts: {0}".format(datetime.datetime.now()))

    T_mat_new = T_mat
    for i in range(2, 23):
        T_mat_new = square_matrix(T_mat_new, T_mat)
        for can in canonical_list_seq:
            canon_x[can].append(i)
            canon_y[can].append(
                find_frac(T_mat_new, canon_ind_dict[can], orig_len))
        print("Raised Matrix {0}: {1}".format(i, datetime.datetime.now()))

    series = [[canon_x[can], canon_y[can], can]
              for can in canonical_list_seq]

    fig, ax = conv.create_ax(1, 1)
    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']
    scatterplot.plot_series(ax[0, 0], series, title="",
                            x_axis="Number of Steps", colors=color,
                            y_axis="Fraction Cleaved Variants Reached",
                            alpha=0.85, connect_dots=True, size=15,
                            edgecolors='k', linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(list(range(1, 23, 3)))
    lgd = conv.add_legend(ax[0, 0], location='upper center',
                          bbox_to_anchor=(0.5, 1.05), ncol=2, size=8)
    conv.save_fig(fig, output_prefix, "fraction_func", 2.5, 3, size=9.5,
                  extra_artists=lgd)
    print("Outputted Figure: {0}".format(datetime.datetime.now()))
def main(rec_corr_path, ddg_path, amber_pdb_path, rosetta_pdb_path, out_csv_path):
    """Plot computed ddG values against known ddG for each ``*.rc`` file.

    For every record-correlation file matching ``rec_corr_path + "*.rc"``
    the per-file Amber and Rosetta energies are loaded, the wild-type row's
    energies are subtracted to obtain ddG values, and a three-row figure
    (Rosetta, Amber, Pred versus known) is saved.  Afterwards two summary
    figures are written to ``out_csv_path``: "all.txt" (values collected in
    ``corr_values_dict``) and "all_corr.txt" (the pooled ddG dictionaries).

    Args:
        rec_corr_path: Prefix that is globbed with "*.rc" appended.
        ddg_path: Directory containing the "amber", "rosetta",
            "amber_inter_mean", "rosetta_inter_mean" and "rosetta_inter"
            sub-directories.
        amber_pdb_path: Not used in this function.
        rosetta_pdb_path: Not used in this function.
        out_csv_path: Directory prefix for the saved figures.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; statements whose level was ambiguous are marked below.
    """
    amber_csv_path = os.path.join(ddg_path, "amber")
    rosetta_csv_path = os.path.join(ddg_path, "rosetta")
    amber_inter_mean_path = os.path.join(ddg_path, "amber_inter_mean")
    rosetta_inter_mean_path = os.path.join(ddg_path, "rosetta_inter_mean")
    rosetta_inter_path = os.path.join(ddg_path, "rosetta_inter")
    #Plots to generate
    #Per pdb - known ddg vs. many diff protocols, each with their own row. A protocol may have more than one plot depending on filtering method (i.e. mean, bottom 3, pareto)
    #For all pdbs - pred rosetta corr values vs. many diff protocols, each with their series color. A protocol may have more than one plot depending on filtering method (i.e. mean, bottom 3, pareto). 3 rows one for each corr value
    #For all pdbs - known ddg vs. many diff protocols, each with their own row. A protocol may have more than one plot depending on filtering method (i.e. mean, bottom 3, pareto)
    list_rec_corr_names = glob.glob(rec_corr_path + "*.rc")
    #corr_values_dict has the following shape - "Pred" : { "Pred" : [ddg_vals] }, "Amber" : { "Mean.." : [ddg_vals], "Bott.." : [ddg_vals]}, "Rosetta" : { "Mean.." : [ddg_vals], "Bott.." : [ddg_vals]}
    # It is passed to every plot_ddg_dict call below and read afterwards;
    # presumably plot_ddg_dict fills it in -- verify against that helper.
    corr_values_dict = {}
    # Pooled ddG values across all processed files, keyed by filename.
    all_amber_ddg_dict = {}
    all_rosetta_ddg_dict = {}
    all_known_ddg_dict = {}
    all_pred_ddg_dict = {}
    # k_ddg and p_ddg are never used in this function.
    k_ddg = []
    p_ddg = []
    for rec_corr in list_rec_corr_names:
        print rec_corr
        rec_corr_list = read_csv_list(rec_corr)
        #no known ddg
        if len(rec_corr_list[0]) == 3:
            continue
        amber_dg_dict = {}
        rosetta_dg_dict = {}
        #read in all amber csvs that correspond to column 3 in rec_corr file and rosetta ones too
        for record_id, prefix, filename, known_ddg, pred_ddg in rec_corr_list:
            amber_dg_dict[filename] = {
                "Mean Binding Energy" : get_mean_csv(os.path.join(amber_csv_path,filename+".csv"), protocol="amber"),
                "Bottom 3 Binding Energy" : get_bottom3_csv(os.path.join(amber_csv_path,filename+".csv"), protocol="amber"),
                "Mean Interaction Energy" : get_mean_txt(os.path.join(amber_inter_mean_path,filename+".txt")) }
            rosetta_dg_dict[filename] = {
                "Mean Binding Energy" : get_mean_csv(os.path.join(rosetta_csv_path,filename+".csv"), protocol="rosetta"),
                "Bottom 3 Binding Energy" : get_bottom3_csv(os.path.join(rosetta_csv_path,filename+".csv"), protocol="rosetta"),
                "Mean Interaction Energy" : get_mean_txt(os.path.join(rosetta_inter_mean_path,filename+".txt")),
                "Bottom 3 Interaction Energy" : get_bottom3_csv(os.path.join(rosetta_inter_path,filename+".csv")) }
        #find wt csv that correspond to wt row in rec_corr_file (column 2)
        # Raises IndexError when no row's second column contains "wt".
        wt_csv_name = [ rec[2] for rec in rec_corr_list if "wt" in rec[1] ][0]
        amber_ddg_dict = {}
        rosetta_ddg_dict = {}
        known_ddg_dict = {}
        pred_ddg_dict = {}
        #loops thru other records in rec_corr_dict
        for rec, prefix, filename, k,p in rec_corr_list:
            if "wt" not in prefix:
                if amber_ddg_dict.get(filename) is None:
                    amber_ddg_dict[filename] = {}
                if rosetta_ddg_dict.get(filename) is None:
                    rosetta_ddg_dict[filename] = {}
                # ddG = energy of this record minus the wild-type energy,
                # per protocol key.
                for key, dg in amber_dg_dict[wt_csv_name].items():
                    amber_ddg_dict[filename][key] = amber_dg_dict[filename][key] - dg
                for key, dg in rosetta_dg_dict[wt_csv_name].items():
                    rosetta_ddg_dict[filename][key] = rosetta_dg_dict[filename][key] - dg
                # NOTE(review): these two assignments are taken to be inside
                # the non-wild-type branch -- confirm against the original.
                known_ddg_dict[filename] = { "Known" : float(k) }
                pred_ddg_dict[filename] = { "Pred" : float(p) }
        all_amber_ddg_dict.update(amber_ddg_dict)
        all_rosetta_ddg_dict.update(rosetta_ddg_dict)
        all_known_ddg_dict.update(known_ddg_dict)
        all_pred_ddg_dict.update(pred_ddg_dict)
        # One column per protocol key, three rows (Rosetta, Amber, Pred).
        fig, axarr = conv.create_ax(max([len(d) for k, d in amber_dg_dict.items() ]+[len(d) for k,d in rosetta_dg_dict.items()]), 3, shx=True, shy=True)
        plot_ddg_dict(rosetta_ddg_dict,known_ddg_dict,axarr,0,"Rosetta",corr_values_dict)
        plot_ddg_dict(amber_ddg_dict,known_ddg_dict,axarr,1,"Amber",corr_values_dict)
        plot_ddg_dict(pred_ddg_dict,known_ddg_dict,axarr,2,"Pred",corr_values_dict)
        conv.save_fig(fig, out_csv_path + "/" + os.path.splitext(os.path.basename(rec_corr))[0] + ".txt", "ddg", max([len(d) for k, d in amber_dg_dict.items() ]+[len(d) for k,d in rosetta_dg_dict.items()])*4, 12)
    #Plot all correlation values
    fig_all, axarr_all = conv.create_ax(len(corr_values_dict["Rosetta"]),3)
    #assumes that Rosetta has more protocols than Amber
    for x_ind,(protocol, vals) in enumerate(corr_values_dict["Rosetta"].items()):
        if corr_values_dict["Amber"].get(protocol) is not None:
            amber_vals = corr_values_dict["Amber"][protocol]
        else:
            amber_vals = None
        pred_vals = corr_values_dict["Pred"]["Pred"]
        labels=["-PCC","-Rho","-Mae"]
        # One row per label; zip stops at the shorter of vals and labels.
        for ind,(val_list,label) in enumerate(zip(vals,labels)):
            series = [[val_list,pred_vals[ind],"Rosetta "+protocol]]
            if amber_vals is not None:
                series.append([amber_vals[ind],pred_vals[ind],"Amber "+protocol])
            scatterplot.plot_series(axarr_all[ind,x_ind], series, protocol,"Pred",label,colors=['coral','cyan'], size=40)
            scatterplot.add_x_y_line(axarr_all[ind,x_ind])
            #if x_ind == 2:
            #    axarr_all[x_ind,y_ind].set_xlim([-0.2,10.0])
            #    axarr_all[x_ind,y_ind].set_ylim([-0.2,10.0])
            #    scatterplot.add_x_y_line(axarr_all[x_ind,y_ind],0.0,10.0)
            #else:
                #axarr_all[x_ind,y_ind].set_xlim([-1.2,1.2])
            #    axarr_all[x_ind,y_ind].set_ylim([-1.2,1.2])
            #    scatterplot.add_x_y_line(axarr_all[x_ind,y_ind],-1.0,1.0)
    conv.save_fig(fig_all, out_csv_path + "/all.txt", "ddg", 16, 12)
    fig_all_corr, axarr_all_corr = conv.create_ax(max([len(d) for k, d in all_amber_ddg_dict.items() ]+[len(d) for k,d in all_rosetta_ddg_dict.items()]), 3, shx=True, shy=True)
    plot_ddg_dict(all_rosetta_ddg_dict,all_known_ddg_dict,axarr_all_corr,0,"Rosetta",corr_values_dict)
    plot_ddg_dict(all_amber_ddg_dict,all_known_ddg_dict,axarr_all_corr,1,"Amber",corr_values_dict)
    plot_ddg_dict(all_pred_ddg_dict,all_known_ddg_dict,axarr_all_corr,2,"Pred",corr_values_dict)
    # NOTE(review): the width below is computed from amber_dg_dict /
    # rosetta_dg_dict, i.e. the values left over from the last processed
    # file, whereas the axes above were sized from the pooled all_* dicts;
    # these names are unbound if every file was skipped.
    conv.save_fig(fig_all_corr, out_csv_path + "/all_corr.txt", "ddg",max([len(d) for k, d in amber_dg_dict.items() ]+[len(d) for k,d in rosetta_dg_dict.items()])*4, 12)
def main(input_dir_1, scoretype1, input_dir_2, scoretype2, rmsd_cutoff, output_pre ):
    """Compare two score sets per PDB and plot RMSD summaries.

    Decoy and native scores are read from both input directories, filtered
    by ``rmsd_cutoff``, normalised and intersected.  For every PDB
    ``plot_r_v_r`` and ``plot_pareto`` are drawn; the RMSD values returned
    by ``plot_pareto`` are then used for line plots (saved with suffixes
    "_line_R" and "_line_A"), distribution plots ("_distdeltas") and scatter
    plots ("_scattdeltas").

    Args:
        input_dir_1, input_dir_2: Directories handed to
            ``scorefileparse.read_dec_nat``; their base names become plot
            titles and part of the output file name.
        scoretype1, scoretype2: Score types handed to ``read_dec_nat``.
        rmsd_cutoff: Cutoff handed to ``filter_pdbs_by_rmsd``.
        output_pre: Directory prefix of the output file name.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source -- confirm against the original file.
    """
    #read in and rename arguments
    title1 = os.path.basename(input_dir_1)
    title2 = os.path.basename(input_dir_2)
    d1, n1 = scorefileparse.read_dec_nat(input_dir_1, scoretype1, repl_orig=False)
    d2, n2 = scorefileparse.read_dec_nat(input_dir_2, scoretype2, repl_orig=False)
    dec1 = scorefileparse.filter_pdbs_by_rmsd(d1, rmsd_cutoff)
    nat1 = scorefileparse.filter_pdbs_by_rmsd(n1, rmsd_cutoff)
    dec2 = scorefileparse.filter_pdbs_by_rmsd(d2, rmsd_cutoff)
    nat2 = scorefileparse.filter_pdbs_by_rmsd(n2, rmsd_cutoff)
    # Natives are normalised with the decoys passed as a second argument.
    dec_norm1 = scorefileparse.norm_pdbs(dec1)
    nat_norm1 = scorefileparse.norm_pdbs(nat1,dec1)
    dec_norm2 = scorefileparse.norm_pdbs(dec2)
    nat_norm2 = scorefileparse.norm_pdbs(nat2,dec2)
    [dec_inter1, nat_inter1, dec_inter2, nat_inter2] = scorefileparse.pdbs_intersect([dec_norm1, nat_norm1, dec_norm2, nat_norm2])
    [dec_inter1, dec_inter2] = scorefileparse.pdbs_scores_intersect([dec_inter1, dec_inter2])
    [nat_inter1, nat_inter2] = scorefileparse.pdbs_scores_intersect([nat_inter1, nat_inter2])
    # The *_filt* / *_finter* values below are computed but not used later
    # in this function.
    dec_filt1 = scorefileparse.filter_norm_pdbs(dec_norm1)
    nat_filt1 = scorefileparse.filter_norm_pdbs(nat_norm1)
    dec_filt2 = scorefileparse.filter_norm_pdbs(dec_norm2)
    nat_filt2 = scorefileparse.filter_norm_pdbs(nat_norm2)
    [dec_finter1, dec_finter2] = scorefileparse.pdbs_scores_intersect([dec_filt1, dec_filt2])
    [nat_finter1, nat_finter2] = scorefileparse.pdbs_scores_intersect([nat_filt1, nat_filt2])
    # Two columns, one row per PDB.  This figure is never saved: the
    # save_fig call for it is commented out further down.
    fig, axarr = conv.create_ax(2, len(dec_inter1))
    # label -> ([pdbs], [rmsds]) in PDB iteration order.
    line_plot_data = {}
    # pdb -> {label: rmsd}
    min_naive_by_pdb = {}
    for x_ind,pdb in enumerate(sorted(dec_inter1.keys())):
        ax = axarr[x_ind, 0]
        plot_r_v_r(dec_inter1, dec_inter2, nat_inter1, nat_inter2, ax, pdb, title1, title2)
        ax = axarr[x_ind, 1]
        min_naive = plot_pareto(dec_inter1, dec_inter2, nat_inter1, nat_inter2, ax, pdb, title1, title2)
        # Only referenced by the commented-out filter below.
        keys_to_include = ["Amber", "Rosetta","All","Pareto10"]
        for key, (rank1, rank2, rmsd) in min_naive.items():
            #if key not in keys_to_include:
            #    continue
            if line_plot_data.get(key) is None:
                line_plot_data[key] = ([],[])
            line_plot_data[key][0].append(pdb)
            line_plot_data[key][1].append(rmsd)
            if min_naive_by_pdb.get(pdb) is None:
                min_naive_by_pdb[pdb] = {}
            min_naive_by_pdb[pdb][key] = rmsd
    #organize data
    # Rank the PDBs by their "All" RMSD, then reorder every series to follow
    # that ranking.
    indices = list(range(len(line_plot_data["All"][1])))
    indices.sort(key=lambda x: line_plot_data["All"][1][x])
    ranked_pdbs_by_rmsd_all = {}
    for i, x in enumerate(indices):
        ranked_pdbs_by_rmsd_all[line_plot_data["All"][0][x]] = i
    for label, (pdbs, rmsds) in line_plot_data.items():
        line_plot_data[label] = tuple(zip(*sorted(zip(pdbs,rmsds), key=lambda x: ranked_pdbs_by_rmsd_all[x[0]] )))
    filename = output_pre + "/" + title1 + "_" + title2 + ".txt"
    #suffix="rmsd_v_rmsd_{0}".format(rmsd_cutoff)
    #conv.save_fig(fig, filename, suffix, 7, len(dec_inter1)*3)
    #plot line plot
    # all_pareto_labels is filled but not read afterwards in this function.
    all_pareto_labels = []
    for initial in ["R","A"]:
        ordered_labels = ["All", "Amber", "Rosetta"]
        for i in range(1,11):
            ordered_labels.append("Pareto{0}{1}".format(initial,i))
            all_pareto_labels.append("Pareto{0}{1}".format(initial,i))
        lines = [ (line_plot_data[label][0], line_plot_data[label][1], label) for label in ordered_labels ]
        fig2, axarr2 = conv.create_ax(1, len(ordered_labels), shx=True, shy=True)
        # Row i shows the first i+1 series, so each row adds one series.
        for i, label in enumerate(ordered_labels):
            line.plot_series(axarr2[i,0], lines[0:i+1], "RMSD vs. pdb", "PDB", "RMSD", linestyle='')
            conv.add_legend(axarr2[i,0])
        conv.save_fig(fig2, filename, "_line_{0}".format(initial), 10, len(ordered_labels)*5)
    #plot histogram plot
    # (top, bottom) label pairs handed to gen_dist_plot, one figure row each.
    hist_comp = [ ("Amber","All"), ("Rosetta", "All"), ("ParetoR10", "All"), ("ParetoA10", "All")]
    hist_comp.extend([ ("ParetoR{0}".format(ind),"Rosetta") for ind in range(1,11) ])
    hist_comp.extend([ ("ParetoR{0}".format(ind),"Amber") for ind in range(1,11) ])
    hist_comp.extend([ ("ParetoA{0}".format(ind),"Rosetta") for ind in range(1,11) ])
    hist_comp.extend([ ("ParetoA{0}".format(ind), "Amber") for ind in range(1,11) ])
    fig3, axarr3 = conv.create_ax(2, len(hist_comp), shx=False, shy=False)
    for ind, (top, bottom) in enumerate(hist_comp):
        gen_dist_plot(axarr3[ind,0], axarr3[ind,1], top, bottom, min_naive_by_pdb)
    conv.save_fig(fig3, filename, "_distdeltas", 7, len(hist_comp)*5, tight=False)
    #plot scatterplot
    # Row 0 holds the ParetoR panels, row 1 the ParetoA panels.
    fig4, axarr4 = conv.create_ax(10, 2)
    for i in range(1,11):
        gen_scatterplot(axarr4[0,i-1], "ParetoR{0}".format(i), "Rosetta", "Amber", min_naive_by_pdb)
        gen_scatterplot(axarr4[1,i-1], "ParetoA{0}".format(i), "Rosetta", "Amber", min_naive_by_pdb)
    conv.save_fig(fig4, filename, "_scattdeltas", 30, 6)
def main(list_nodes, output_prefix, metric, create_keys=False):
    """Histogram node metrics split into cleaved / middle / uncleaved.

    The node file is read, its entries are partitioned by their "type"
    field, and one histogram panel per metric is drawn with the three
    partitions as separate series.

    Args:
        list_nodes: Path handed to ``seq_IO.read_sequences``.
        output_prefix: Output base passed to ``pconv.save_fig``.
        metric: "metrics" plots every field of ``sequences["YNYIN"]`` that
            is not in the exclusion list; any other value is plotted as a
            single field name.
        create_keys: When truthy, ``create_keys=True`` is passed on to
            ``seq_IO.read_sequences``; otherwise the argument is omitted.
    """
    read_options = dict(additional_params=True, header=True)
    if create_keys:
        read_options["create_keys"] = True
    sequences = seq_IO.read_sequences(list_nodes, **read_options)

    def entries_of_type(wanted):
        return {key: val for key, val in sequences.items()
                if val["type"] == wanted}

    cleaved_seq = entries_of_type("CLEAVED")
    middle_seq = entries_of_type("MIDDLE")
    uncleaved_seq = entries_of_type("UNCLEAVED")
    print(len(cleaved_seq))

    if metric != "metrics":
        labels_to_plot = [metric]
    else:
        labels_non_plot = ["label", "fitness", "type", "canonical",
                           "timeset"]
        labels_to_plot = sorted(
            [key for key in sequences["YNYIN"].keys()
             if key not in labels_non_plot])

    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)
    nbins = 10

    for ind, key in enumerate(labels_to_plot):
        log = key == "pageranks"
        # NOTE(review): for "Fraction_Cleaved" no data is assigned (the
        # computation is disabled), so the draw call below uses whatever
        # `data` holds from an earlier panel, or fails if there is none.
        if key != "Fraction_Cleaved":
            data = [get_data_from_dict(group, key)
                    for group in (cleaved_seq, middle_seq, uncleaved_seq)]
        normed = True
        # NOTE(review): nesting reconstructed from a collapsed source; the
        # print is taken to run for every panel.
        print(key)
        panel = axarr[0, ind]
        hist.draw_actual_plot(panel, data, "", key.capitalize(), log=log,
                              normed=normed,
                              label=["Cleaved", "Middle", "Uncleaved"],
                              nbins=nbins)
        panel.ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))

    pconv.save_fig(fig, output_prefix, metric, n_to_plot * 2.5, 2.5,
                   tight=True, size=9)
def main(list_input_dirs, energies_names, output_pre):
    """Plot the disc value over a grid of weight pairs for two score types.

    For every PDB present in both inputs, each score set is scaled by a
    weight from a powers-of-two grid (2**-10 .. 2**8, step 2 in the exponent),
    the two weighted sets are merged, and ``disc.given_data_run_disc`` is run
    on the result.  One scatter panel per PDB shows that value over
    (weight_1, weight_2); a final "All" panel shows the mean over PDBs.

    Args:
        list_input_dirs: Indexable with at least two entries, each indexable
            as ``(input_dir, score_type)``.
        energies_names: Indexed by score type; the result is passed as the
            second argument of ``scorefileparse.read_dec_nat``.
        output_pre: Prefix handed to ``conv.save_fig``.
    """
    inp_dir1 = list_input_dirs[0][0]
    scoretype1 = list_input_dirs[0][1]
    inp_dir2 = list_input_dirs[1][0]
    scoretype2 = list_input_dirs[1][1]

    # NOTE(review): the original also built an unused {c[0]: c[1:]} mapping
    # from energies_names.  That hints the lookups below may have been meant
    # to go through such a mapping -- confirm against the caller.
    dec1, nat1 = scorefileparse.read_dec_nat(inp_dir1, energies_names[scoretype1], scoretype1)
    dec2, nat2 = scorefileparse.read_dec_nat(inp_dir2, energies_names[scoretype2], scoretype2)

    dec_inter1, _nat_inter1, dec_inter2, _nat_inter2 = scorefileparse.pdbs_intersect(
        [dec1, nat1, dec2, nat2])

    n_pdbs = len(dec_inter1)
    # One row per PDB plus one trailing row for the across-PDB summary.
    fig, axarr = conv.create_ax(1, n_pdbs + 1, True, True)

    weights = [2 ** exponent for exponent in range(-10, 10, 2)]
    disc_divs = (1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 6.0)

    sum_discs = Counter()

    for x_ind, pdb in enumerate(sorted(dec_inter1.keys())):
        discs_per_pdb = {}
        for weight_1 in weights:
            for weight_2 in weights:
                weighted_1 = scorefileparse.weight_dict(dec_inter1[pdb], weight_1)
                weighted_2 = scorefileparse.weight_dict(dec_inter2[pdb], weight_2)
                merged = scorefileparse.merge_dicts([weighted_1, weighted_2])
                ddata = scorefileparse.convert_disc(merged)
                # A fresh list per call, as in the original, in case the
                # callee modifies its argument.
                disc_value, _dists, _counts = disc.given_data_run_disc(ddata, True, list(disc_divs))
                discs_per_pdb[(weight_1, weight_2)] = disc_value

        ax = axarr[x_ind, 0]
        # basex/basey is the spelling used by older matplotlib releases
        # (newer ones use base=); kept to match the rest of this file.
        ax.set_xscale('log', basex=2)
        ax.set_yscale('log', basey=2)

        # Sort once so the x, y and value lists stay aligned.
        ordered = sorted(discs_per_pdb.items())
        x = [w1 for (w1, _w2), _val in ordered]
        y = [w2 for (_w1, w2), _val in ordered]
        d = [val for _key, val in ordered]

        s = scatterplot.draw_actual_plot(ax, x, y, d, pdb, scoretype1, scoretype2, 'bwr')
        fig.colorbar(s, ax=ax)
        scatterplot.add_x_y_line(ax, 0, 600)

        # Counter.update adds values key by key, giving per-weight-pair sums.
        sum_discs.update(discs_per_pdb)

    ax = axarr[n_pdbs, 0]

    ordered = sorted(sum_discs.items())
    x = [w1 for (w1, _w2), _val in ordered]
    y = [w2 for (_w1, w2), _val in ordered]
    # float() keeps this a true mean under Python 2 even for integer values.
    d = [val / float(n_pdbs) for _key, val in ordered]

    ax.set_xscale('log', basex=2)
    ax.set_yscale('log', basey=2)

    s = scatterplot.draw_actual_plot(ax, x, y, d, "All", scoretype1, scoretype2, cm='bwr')
    fig.colorbar(s, ax=ax)
    scatterplot.add_x_y_line(ax, 0, 600)

    # Height covers all n_pdbs + 1 rows; the original used n_pdbs * 3 and so
    # left no height for the summary row.
    conv.save_fig(fig, output_pre, "_weights_v_disc", 3, (n_pdbs + 1) * 3)