Example #1
0
def main(data_file, output_prefix, degree_file, width, height):
    """Plot mean "Frac" values as a histogram and as a per-degree bar chart.

    Records are loaded from ``data_file`` and ``degree_file`` through
    ``seq_IO.read_sequences``.  Each sequence's "Frac" entry is averaged with
    ``np.mean``; the averages are drawn as a 30-bin histogram (saved with the
    "hist" suffix) and, grouped by the sequence's "Degree" value, as a bar
    chart of group mean with standard-deviation error bars (saved with the
    "bar" suffix).  ``width`` and ``height`` are forwarded to
    ``pconv.save_fig`` for both figures.
    """
    sequences = seq_IO.read_sequences(
        data_file, additional_params=True, header=True, list_vals=True)
    seq_degree = seq_IO.read_sequences(
        degree_file, additional_params=True, header=True)

    # Bucket every sequence's mean "Frac" under the degree of that sequence.
    fracs_by_degree = defaultdict(list)
    for name, record in sequences.items():
        bucket = fracs_by_degree[seq_degree[name]['Degree']]
        bucket.append(np.mean(record["Frac"]))

    data = [np.mean(record["Frac"]) for _, record in sequences.items()]

    degree_frac_avg = [np.mean(fracs) for _, fracs in fracs_by_degree.items()]
    degree_frac_std = [np.std(fracs) for _, fracs in fracs_by_degree.items()]

    # Figure 1: distribution of the per-sequence means.
    hist_fig, hist_axes = pconv.create_ax(1, 1, shx=False, shy=False)
    hist.draw_actual_plot(hist_axes[0, 0], data, "", "",
                          normed=False, nbins=30, edgecolor=None, log=False)
    pconv.save_fig(hist_fig, output_prefix, "hist", width, height,
                   tight=True, size=10)

    # Figure 2: one bar per degree, in the dictionary's iteration order.
    bar_fig, bar_axes = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(bar_axes[0, 0], degree_frac_avg, 'g', "", "Degree",
                         "Fraction Shortest Path Uncleaved",
                         tick_label=fracs_by_degree.keys(),
                         yerr=degree_frac_std)
    pconv.save_fig(bar_fig, output_prefix, "bar", width, height,
                   tight=True, size=10)
def main(epistasis_file):
    """Summarise an epistasis CSV and save three figures next to it.

    Every data row (the first line is skipped as a header) is split on commas
    and stored under the key ``(tokens[0], tokens[2])`` unless the reversed
    key ``(tokens[2], tokens[0])`` is already present.  For mutation counts
    2 to 5 the rows are tallied by epistasis sign and by "CLEAVED" status,
    then drawn as a stacked epistasis plot ("plot"), a cleaved versus
    predicted-cleaved series plot ("pred_v_cl") and a two-set Venn diagram
    ("venn").
    """
    # Stored value layout: Starting Ratio, Ending Ratio, Epistasis,
    # Ending Fitness, # of Mutations, list of InterSeqs, list of InterFits,
    # list of InterRatios.
    records = {}

    with open(epistasis_file) as handle:
        rows = handle.readlines()
        for row in rows[1:]:  # ignore header line
            tokens = row.split(',')
            # Skip a pair whose reversed key has already been recorded.
            if records.get((tokens[2], tokens[0])) is not None:
                continue
            records[(tokens[0], tokens[2])] = [
                float(tokens[1]),
                float(tokens[3]),
                float(tokens[5]),
                tokens[4],
                len(tokens[6::3]),
                tokens[6::3],
                tokens[7::3],
                tokens[8::3],
            ]

    def tally(predicate):
        # One count per mutation number 2..5 of the records accepted by
        # `predicate` that have exactly that many mutations.
        return [
            sum(1 for _, rec in records.items()
                if predicate(rec) and rec[4] == n_mut)
            for n_mut in xrange(2, 6)
        ]

    neg_epistasis = tally(lambda rec: rec[2] < -0.000005)
    no_epistasis = tally(lambda rec: abs(rec[2]) < 0.000005)
    pos_epistasis = tally(lambda rec: rec[2] > 0.000005)
    n_functional = tally(lambda rec: rec[3] == "CLEAVED")
    n_should_be_functional = tally(
        lambda rec: all(v == "CLEAVED" for v in rec[6]))
    n_total = [float(count) for count in tally(lambda rec: True)]

    seq_func = set(key[1] for key, rec in records.items()
                   if rec[3] == "CLEAVED")
    seq_pred_func = set(key[1] for key, rec in records.items()
                        if all(v == "CLEAVED" for v in rec[6]))

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    fig2, axarr2 = pconv.create_ax(1, 1)

    # Stacked series: "No" at the bottom, then "Neg.", then "Pos.".
    epi_axis = axarr[0, 0]
    stacked_base = [no + neg for no, neg in zip(no_epistasis, neg_epistasis)]
    artists = []
    artists.extend(plot_epi(no_epistasis, n_total, epi_axis, "No",
                            color="gray"))
    artists.extend(plot_epi(neg_epistasis, n_total, epi_axis, "Neg.",
                            bottom=no_epistasis, color="white"))
    artists.extend(plot_epi(pos_epistasis, n_total, epi_axis, "Pos.",
                            bottom=stacked_base, color="black"))

    n_func_frac = [func / total
                   for func, total in zip(n_functional, n_total)]
    n_pred_frac = [pred / total
                   for pred, total in zip(n_should_be_functional, n_total)]
    scatterplot.plot_series(
        axarr2[0, 0],
        [(range(2, 6), n_func_frac, "% Cleaved"),
         (range(2, 6), n_pred_frac, "% Pred Cleaved")],
        "", "Number of Mutations", "Fraction of Total Cases",
        size=40, connect_dots=True, alpha=1.0)
    axarr2[0, 0].set_ylim([0, 4.0])

    fig_venn, axarr_venn = pconv.create_ax(1, 1)
    venn2([seq_func, seq_pred_func],
          set_labels=["Cleaved", "Pred Cleaved"],
          ax=axarr_venn[0, 0])

    lgd = epi_axis.legend(artists, ["No", "Neg.", "Pos."],
                          loc="center left", bbox_to_anchor=(1.05, 0.5),
                          borderaxespad=0., prop={'size': 9}, ncol=1,
                          fancybox=True)

    pconv.save_fig(fig, epistasis_file, "plot", 3, 2.5, tight=False, size=9,
                   extra_artists=lgd)
    pconv.save_fig(fig2, epistasis_file, "pred_v_cl", 5, 5, tight=True,
                   size=10)
    pconv.save_fig(fig_venn, epistasis_file, "venn", 5, 5, tight=True,
                   size=14)
Example #3
0
def plot_dict(dict_to_plot,
              samplesize,
              output_pre,
              suffix,
              x_axis,
              norm="all"):
    """Draw one curve panel per entry of ``dict_to_plot`` and save the figure.

    Each value of ``dict_to_plot`` is a ``(val, sample_name)`` pair; the
    sample sizes for the panel are the ``samplesize`` entry whose key equals
    ``sample_name``.  The panel title is the joined key cut to 28 characters.
    The output name is ``output_pre`` followed by ``norm`` (or "none" when
    ``norm`` is None).
    """
    n_panels = len(dict_to_plot)
    fig, axarr = conv.create_ax(n_panels, 1, shx=True, shy=True)

    for column, (key, (val, sample_name)) in enumerate(dict_to_plot.items()):
        # Assume only one entry of `samplesize` matches the sample name.
        matching = [sizes for name, sizes in samplesize.items()
                    if name == sample_name]
        samplesize_list = matching[0]
        title = ''.join(key)[:28]
        plot_curve(axarr[0, column], val, samplesize_list, "", title, x_axis,
                   "Sample Size", norm=norm)

    norm_tag = "none" if norm is None else norm
    conv.save_fig(fig, output_pre + norm_tag, suffix, n_panels * 4, 4,
                  tight=True, size=10)
Example #4
0
def main(json_file, output_prefix, metric):
    """Plot histograms of graph metrics for a node-link JSON graph.

    The graph is loaded from ``json_file`` with
    ``json_graph.node_link_graph``.  Only the "closeness" metric is currently
    computed.  ``metric`` selects a single metric by name, or every computed
    metric when it equals "metrics".  The figure is saved through
    ``pconv.save_fig`` with the "metrics" suffix.

    Raises:
        KeyError: if ``metric`` names a metric that is not computed here.
    """
    with open(json_file) as data_file:
        data = json.load(data_file)

    G = json_graph.node_link_graph(data)

    metrics = {}

    #metrics["degree"] = degree(G)
    metrics["closeness"] = list(closeness_centrality(G).values())
    #TODO: add any other metrics here using a similar format to above line.

    if metric != "metrics":
        # Fail before any figure is created, with a message naming the
        # metrics that are actually available.
        if metric not in metrics:
            raise KeyError("unknown metric %r; available: %s"
                           % (metric, ", ".join(sorted(metrics))))
        labels_to_plot = [metric]
    else:
        labels_to_plot = list(metrics.keys())
    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)

    nbins = 20

    for ind, key in enumerate(labels_to_plot):
        # Look the values up by the loop variable; the previous code indexed
        # with the literal string "key", which is never a metric name.
        hist.draw_actual_plot(axarr[0, ind], metrics[key], "",
                              key.capitalize(), normed=True, nbins=nbins)
        axarr[0, ind].ticklabel_format(axis='x', style='sci',
                                       scilimits=(-2, 2))

        #pconv.add_legend(axarr[0,ind], location="middle right")
    pconv.save_fig(fig, output_prefix, "metrics", n_to_plot * 5, 5,
                   tight=True, size=12)
Example #5
0
def main(sequence_ratio_file, width, height, pattern, legend):
    """Draw a bar chart of the per-column median of a sequence table.

    Rows come from ``seq_IO.read_sequences``; column 0 is skipped and every
    remaining column is reduced to its median (bar height) and standard
    deviation (error bar).  The figure is saved next to
    ``sequence_ratio_file`` with the "plot" suffix.  ``pattern`` and
    ``legend`` are accepted but not used in this function.
    """
    sequences = seq_IO.read_sequences(sequence_ratio_file,
                                      additional_params=True)

    # Transpose: one list of values per column, starting at column 1.
    n_columns = len(sequences[0])
    shell_data = [[row[column] for row in sequences]
                  for column in xrange(1, n_columns)]

    avg = [np.median(values) for values in shell_data]
    std = [np.std(values) for values in shell_data]
    label = xrange(1, 4)

    # The raw standard deviation is used as a symmetric error bar; it is not
    # clipped, so a bar may extend below zero.
    err = std

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)

    bar_colors = ['lightsteelblue', 'lightblue', 'darkgray']
    bar.draw_actual_plot(axarr[0, 0], avg, bar_colors, "", "Shell",
                         "Fraction Cleaved", tick_label=label, yerr=err)
    pconv.save_fig(fig, sequence_ratio_file, "plot", width, height,
                   tight=True, size=10)
def main(list_sequence_names, output_prefix):
    """Histogram the neighbor composition around every "CLEAVED" sequence.

    ``list_sequence_names`` holds ``[filename, label]`` pairs and must include
    the labels "CLEAVED", "MIDDLE" and "UNCLEAVED".  For each cleaved
    sequence, the sequences at ``conv.hamdist`` equal to 1 are counted per
    label and turned into fractions.  Sequences with neither a cleaved nor an
    uncleaved neighbor are left out.  Three histograms are saved with the
    "fraction_neighbors" suffix.
    """
    list_sequences = []  # one list of sequences per label
    extended_list_sequences = []  # flat list of sequences (not read below)
    labels = []  # labels for list_sequences

    for filename, label in list_sequence_names:
        loaded = seq_IO.read_sequences(filename)
        list_sequences.append(loaded)
        extended_list_sequences.extend(loaded[:])
        labels.append(label)

    cleaved_ind = labels.index("CLEAVED")
    middle_ind = labels.index("MIDDLE")
    uncleaved_ind = labels.index("UNCLEAVED")

    def count_neighbors(center, pool):
        # Number of sequences in `pool` at distance exactly 1 from `center`.
        return sum(1 for other in pool if conv.hamdist(center, other) == 1)

    frac_uncleaved = {}
    frac_cleaved = {}
    frac_middle = {}
    for seq in list_sequences[cleaved_ind]:
        n_cleaved = count_neighbors(seq, list_sequences[cleaved_ind])
        n_uncleaved = count_neighbors(seq, list_sequences[uncleaved_ind])
        n_middle = count_neighbors(seq, list_sequences[middle_ind])
        if n_cleaved <= 0 and n_uncleaved <= 0:
            continue
        total = n_uncleaved + n_middle + n_cleaved
        frac_uncleaved[seq] = float(n_uncleaved) / total
        frac_cleaved[seq] = float(n_cleaved) / total
        frac_middle[seq] = float(n_middle) / total

    fig, ax = pconv.create_ax(3, 1)

    panels = [
        (frac_cleaved, "Fraction of Neighbors Cleaved"),
        (frac_middle, "Fraction of Neighbors Middle"),
        (frac_uncleaved, "Fraction of Neighbors Uncleaved"),
    ]
    for column, (fractions, x_label) in enumerate(panels):
        hist.draw_actual_plot(ax[0, column], fractions.values(),
                              "Landscape Near Cleaved Sequences", x_label,
                              log=False, normed=False, nbins=20)

    pconv.save_fig(fig, output_prefix, "fraction_neighbors", 15, 5, size=10)
Example #7
0
def main(sequences_ratio_file):
    """Bar-plot the mean of three ratio columns for every sequence.

    Each row from ``seq_IO.read_sequences`` supplies a label (column 0) and
    three ratios (columns 1 to 3).  The bar height is their mean and the
    asymmetric error bars reach the smallest and the largest of the three.
    The figure is saved next to the input file with the "plot" suffix.
    """
    sequences_ratio = seq_IO.read_sequences(sequences_ratio_file,
                                            additional_params=True)

    triplets = [[row[1], row[2], row[3]] for row in sequences_ratio]
    cleaved_column = [row[4] for row in sequences_ratio]  # not plotted
    seqs = [row[0] for row in sequences_ratio]

    avg_ratio = []
    min_ratio = []  # distance from the mean down to the smallest ratio
    max_ratio = []  # distance from the mean up to the largest ratio
    for triplet in triplets:
        avg_ratio.append(sum(triplet) / 3.0)
    for triplet in triplets:
        min_ratio.append(sum(triplet) / 3.0 - min(triplet))
    for triplet in triplets:
        max_ratio.append(max(triplet) - sum(triplet) / 3.0)

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr[0, 0], avg_ratio, 'c', "", "Sequence",
                         "FLAG/HA Ratio", tick_label=seqs,
                         yerr=[min_ratio, max_ratio])
    pconv.save_fig(fig, sequences_ratio_file, "plot", 4, 4, tight=True,
                   size=12)
def plot_dict(dict_to_plot, samplesize, output_pre, suffix, x_axis, norm="all"):
    """Plot every ``(val, sample_name)`` entry of ``dict_to_plot`` in its own panel.

    The sample-size list of a panel is the ``samplesize`` value stored under
    ``sample_name``.  Titles are the joined key, truncated to 28 characters.
    The figure is saved under ``output_pre`` plus ``norm`` ("none" when
    ``norm`` is None).
    """
    panel_count = len(dict_to_plot)
    fig, axarr = conv.create_ax(panel_count, 1, shx=True, shy=True)

    for ind, item in enumerate(dict_to_plot.items()):
        key, (val, sample_name) = item
        # Assume only one item meets that criteria.
        candidates = [sizes for name, sizes in samplesize.items()
                      if name == sample_name]
        plot_curve(axarr[0, ind], val, candidates[0], "",
                   ''.join(key)[0:28], x_axis, "Sample Size", norm=norm)

    if norm is not None:
        prefix = output_pre + norm
    else:
        prefix = output_pre + "none"
    conv.save_fig(fig, prefix, suffix, panel_count * 4, 4, tight=True, size=10)
Example #9
0
def main(list_sequence_names, canonical_list, output_prefix, func_labels,
         unfunc_labels):
    """Plot the functional fraction of variants against mutation count.

    For every canonical sequence read from ``canonical_list``, the sequences
    of each ``[filename, label]`` entry in ``list_sequence_names`` are binned
    by their ``conv.hamdist`` distance (1 to 5) from the canonical sequence.
    At each distance the fraction ``func / (func + unfunc)`` is computed from
    the labels listed in ``func_labels`` and ``unfunc_labels``; distances
    without any unfunctional sequence are omitted.  One series per canonical
    sequence is drawn and the figure is saved with a suffix built from the
    last canonical sequence.

    Raises:
        ValueError: if ``canonical_list`` yields no sequences (previously
            this surfaced as a NameError when the output suffix was built).
    """
    canonical_list_seq = seq_IO.read_sequences(canonical_list)
    if len(canonical_list_seq) == 0:
        raise ValueError("canonical_list contains no sequences: "
                         + str(canonical_list))

    # The labelled files do not depend on the canonical sequence, so each
    # one is read once instead of once per canonical sequence.
    labelled_sequences = [(label, seq_IO.read_sequences(filename))
                          for filename, label in list_sequence_names]

    mutation_counts = list(range(1, 6))
    series = []

    for canonical in canonical_list_seq:

        # label -> {distance: number of sequences at that distance}
        dict_sequences = {}
        for label, sequences in labelled_sequences:
            distances = [conv.hamdist(seq, canonical) for seq in sequences]
            # Count the sequences at each distance.  The earlier code summed
            # the distances themselves, which scaled both terms of the ratio
            # below by the same factor and therefore gave the same fraction.
            dict_sequences[label] = {
                i: sum(1 for d in distances if d == i)
                for i in mutation_counts
            }

        x = []
        y = []
        for i in mutation_counts:
            func = 0.0
            unfunc = 0.0
            for label, dict_counts in dict_sequences.items():
                if label in func_labels:
                    func = func + dict_counts[i]
                elif label in unfunc_labels:
                    unfunc = unfunc + dict_counts[i]
            # Distances with no unfunctional sequence are left out.
            if unfunc != 0:
                x.append(i)
                y.append(func / (func + unfunc))
        print(x)
        print(y)
        series.append([x, y, canonical])

    fig, ax = pconv.create_ax(1, 1)

    scatterplot.plot_series(ax[0, 0],
                            series,
                            title="",
                            x_axis="# of Mutations",
                            y_axis="Fraction of Variants that are Functional",
                            alpha=1.0,
                            connect_dots=True,
                            size=30,
                            edgecolors='k')
    ax[0, 0].set_xlim(xmin=1, xmax=5)
    ax[0, 0].set_xticks(mutation_counts)
    # `canonical` is the last canonical sequence processed above.
    pconv.save_fig(fig,
                   output_prefix,
                   canonical + "_fraction_func_mutant",
                   6,
                   6,
                   size=15)
Example #10
0
def main(data_file, title, output_prefix):
    """Save a normalised 30-bin histogram of every record's "Degree" value.

    ``title`` is used, capitalised, as the fourth positional argument of
    ``hist.draw_actual_plot`` and, unchanged, as the suffix passed to
    ``pconv.save_fig``.
    """
    sequences = seq_IO.read_sequences(data_file, additional_params=True,
                                      header=True)

    data = []
    for _, record in sequences.items():
        data.append(record["Degree"])

    fig, axarr = pconv.create_ax(1, 1, shx=False, shy=False)
    panel = axarr[0, 0]

    hist.draw_actual_plot(panel, data, "", title.capitalize(), normed=True,
                          nbins=30, edgecolor=None, log=False)

    pconv.save_fig(fig, output_prefix, title, 5, 5, tight=True, size=10)
Example #11
0
def main(list_sequence_names, output_prefix):
    """Draw a three-set Venn diagram of the sequences in the listed files.

    ``list_sequence_names`` holds ``[filename, label]`` pairs.  Each file is
    read into a set and the sets are passed to ``venn3`` in input order.  The
    figure suffix is the labels joined with "_" followed by "_venn".
    """
    sequence_list = []
    labels = []

    for entry in list_sequence_names:
        filename, label = entry
        members = set(seq_IO.read_sequences(filename))
        sequence_list.append(members)
        labels.append(label)

    fig, ax = pconv.create_ax(1, 1)

    venn3(sequence_list, set_labels=labels, ax=ax[0, 0])

    suffix = '_'.join(labels) + "_venn"
    pconv.save_fig(fig, output_prefix, suffix, 10, 10, size=12)
Example #12
0
def main(sequences_ratio_file):
    """Plot, per sequence, the mean of its three ratios with min/max error bars.

    Column 0 of every row read by ``seq_IO.read_sequences`` labels the bar;
    columns 1 to 3 are the ratios.  The lower and upper error bars are the
    distances from the mean to the smallest and largest ratio.  The figure is
    saved next to the input file with the "plot" suffix.
    """
    sequences_ratio = seq_IO.read_sequences(sequences_ratio_file,
                                            additional_params=True)

    ratio_rows = [[entry[1], entry[2], entry[3]] for entry in sequences_ratio]
    cleaved_flags = [entry[4] for entry in sequences_ratio]  # not plotted
    seqs = [entry[0] for entry in sequences_ratio]

    def mean_of_three(values):
        return sum(values) / 3.0

    avg_ratio = [mean_of_three(values) for values in ratio_rows]
    min_ratio = [mean_of_three(values) - min(values) for values in ratio_rows]
    max_ratio = [max(values) - mean_of_three(values) for values in ratio_rows]

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    error_bars = [min_ratio, max_ratio]
    bar.draw_actual_plot(axarr[0, 0], avg_ratio, 'c', "", "Sequence",
                         "FLAG/HA Ratio", tick_label=seqs, yerr=error_bars)
    pconv.save_fig(fig, sequences_ratio_file, "plot", 4, 4, tight=True,
                   size=12)
Example #13
0
def main(list_sequence_names, output_prefix):
    """Bar-plot how many sequences each sample has per cleavage label.

    ``list_sequence_names`` holds ``[filename, label, sample]`` triples whose
    label is "CLEAVED", "MIDDLE" or "UNCLEAVED".  The number of sequences in
    each file is stored per label and sample, and one series per label is
    drawn with samples in sorted order.  Tick labels are the sorted sample
    names of the "CLEAVED" label.
    """
    counts_by_label = {"CLEAVED": {}, "UNCLEAVED": {}, "MIDDLE": {}}

    for filename, label, sample in list_sequence_names:
        counts_by_label[label][sample] = len(seq_IO.read_sequences(filename))

    lines = []
    for label in ("CLEAVED", "MIDDLE", "UNCLEAVED"):
        ordered_counts = [count for _, count
                          in sorted(counts_by_label[label].items())]
        lines.append((ordered_counts, label))

    fig, ax = pconv.create_ax(1, 1)

    bar.plot_series(ax[0, 0], lines, title="", x_axis="Variant Name",
                    y_axis="Number of Substrate Sequences Sampled",
                    tick_label=sorted(counts_by_label["CLEAVED"].keys()))
    pconv.save_fig(fig, output_prefix, "cleaved_uncleaved_middle", 6, 6,
                   tight=True, size=10)
Example #14
0
def main(list_nodes, output_prefix, metric, create_keys=False):
    """Plot per-metric histograms split by "CLEAVED"/"MIDDLE"/"UNCLEAVED".

    Records are read from ``list_nodes`` with ``seq_IO.read_sequences`` and
    partitioned by their "type" field.  ``metric`` names one record field to
    plot, or "metrics" to plot every field of the "YNYIN" record except the
    bookkeeping ones.  One histogram panel is drawn per selected field; the
    "pageranks" field is drawn with ``log=True``.

    Raises:
        NotImplementedError: if "Fraction_Cleaved" is selected.  Its data
            computation is disabled, and the previous code either failed with
            a NameError or silently re-plotted the preceding metric's data.
    """
    if not create_keys:
        sequences = seq_IO.read_sequences(list_nodes, additional_params=True,
                                          header=True)
    else:
        sequences = seq_IO.read_sequences(list_nodes, additional_params=True,
                                          header=True, create_keys=True)

    cleaved_seq = {key: val for key, val in sequences.items()
                   if val["type"] == "CLEAVED"}
    middle_seq = {key: val for key, val in sequences.items()
                  if val["type"] == "MIDDLE"}
    uncleaved_seq = {key: val for key, val in sequences.items()
                     if val["type"] == "UNCLEAVED"}

    print(len(cleaved_seq))
    if metric == "metrics":
        labels_non_plot = ["label", "fitness", "type", "canonical", "timeset"]
        # NOTE(review): the field names are taken from the record keyed
        # "YNYIN"; this raises KeyError when the input has no such record.
        labels_to_plot = sorted([key for key in sequences["YNYIN"].keys()
                                 if key not in labels_non_plot])
    else:
        labels_to_plot = [metric]

    if "Fraction_Cleaved" in labels_to_plot:
        raise NotImplementedError(
            "plotting 'Fraction_Cleaved' is not supported: its data "
            "computation is disabled")

    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)

    nbins = 10

    for ind, key in enumerate(labels_to_plot):
        log = (key == "pageranks")
        data = [get_data_from_dict(cleaved_seq, key),
                get_data_from_dict(middle_seq, key),
                get_data_from_dict(uncleaved_seq, key)]
        print(key)
        hist.draw_actual_plot(axarr[0, ind], data, "", key.capitalize(),
                              log=log, normed=True,
                              label=["Cleaved", "Middle", "Uncleaved"],
                              nbins=nbins)
        axarr[0, ind].ticklabel_format(axis='x', style='sci',
                                       scilimits=(-2, 2))

        #pconv.add_legend(axarr[0,ind], location="middle right")
    pconv.save_fig(fig, output_prefix, metric, n_to_plot * 2.5, 2.5,
                   tight=True, size=9)
Example #15
0
def gen_plots(c, r, ax2, output_pre, dirname, st=""):
    """Compute a coefficient and p-value for 30 sliding windows and plot them.

    Window ``i`` (1 to 30) spans ``i`` to ``i + 9``.  Its data come from
    ``combine_counts_ratios`` and its statistics from ``find_coeff_pval``,
    which is also handed panel ``i - 1`` of a 31-panel figure.  The last panel
    and ``ax2`` both receive the summary drawn by ``plot_coeff_pval``.  The
    figure suffix uses the third-from-last component of ``dirname``.
    """
    fig, axarr = conv.create_ax(1, 31, shx=True, shy=True)

    counters, coeffs, pvals = [], [], []
    for start in xrange(1, 31):
        stop = start + 9
        merged = combine_counts_ratios(c, r, start, stop, st=st)
        panel = axarr[start - 1, 0]
        window_title = "Sliding Window: {0} to {1}".format(start, stop)
        coeff, pval = find_coeff_pval(merged, panel, window_title)
        print("%s %s %s" % (start, coeff, pval))
        counters.append(start)
        coeffs.append(coeff)
        pvals.append(pval)

    plot_coeff_pval(axarr[30, 0], counters, coeffs, pvals)

    path_parts = os.path.normpath(dirname).split(os.sep)
    suffix = path_parts[-3]
    conv.save_fig(fig, output_pre + "correlation_plot.txt",
                  "{0}_{1}".format(suffix, st), 4, 20 * 4)

    plot_coeff_pval(ax2, counters, coeffs, pvals, suffix)
Example #16
0
def main(ratios_file1, ratios_file2, output_pre, use_sel):
    """Plot the correlation between the counts of two ratios files.

    When ``use_sel`` is true, element 2 of the ``read_ratios`` result is used
    and titles are token 1 of each "_"-split file name; otherwise element 1
    and token 6.  Two panels are drawn, one from all common points and one
    from the filtered common points.
    """
    if use_sel:
        counts_index, title_index = 2, 1
    else:
        counts_index, title_index = 1, 6

    counts1_dict = read_ratios(ratios_file1)[counts_index]
    counts2_dict = read_ratios(ratios_file2)[counts_index]
    title1 = os.path.basename(ratios_file1).split("_")[title_index]
    title2 = os.path.basename(ratios_file2).split("_")[title_index]

    fig, axarr = conv.create_ax(2, 1)

    # Left panel: every common point; right panel: filtered common points.
    for column, extra in enumerate(({}, {"filtered": True})):
        c1, c2 = common_points(counts1_dict, counts2_dict, **extra)
        plot_corr(c1, c2, axarr[0, column], title1, title2)

    conv.save_fig(fig, output_pre + "/corrcounts.txt", title1 + "_" + title2,
                  20, 10, tight=True)
def main(args):
    """Compare two score sets per PDB, unfiltered and filtered.

    Decoy and native scores are read from ``args.input_dir_1`` and
    ``args.input_dir_2``, filtered by ``args.rmsd_cutoff``, normalised and
    intersected.  For each PDB, the first panel of its row shows the
    intersected scores and the second panel the scores that additionally
    passed ``scorefileparse.filter_norm_pdbs``.

    NOTE(review): the visible code neither saves nor returns the figure;
    confirm whether a save call is missing.
    """
    #read in and rename arguments
    title1 = os.path.basename(args.input_dir_1)
    title2 = os.path.basename(args.input_dir_2)

    d1, n1 = scorefileparse.read_dec_nat(args.input_dir_1, [], args.scoretype1, True)
    d2, n2 = scorefileparse.read_dec_nat(args.input_dir_2, [], args.scoretype2, True)

    dec1 = scorefileparse.filter_pdbs_by_rmsd(d1, args.rmsd_cutoff)
    nat1 = scorefileparse.filter_pdbs_by_rmsd(n1, args.rmsd_cutoff)
    dec2 = scorefileparse.filter_pdbs_by_rmsd(d2, args.rmsd_cutoff)
    nat2 = scorefileparse.filter_pdbs_by_rmsd(n2, args.rmsd_cutoff)

    # Natives are normalised against the decoys of the same set.
    dec_norm1 = scorefileparse.norm_pdbs(dec1)
    nat_norm1 = scorefileparse.norm_pdbs(nat1, dec1)
    dec_norm2 = scorefileparse.norm_pdbs(dec2)
    nat_norm2 = scorefileparse.norm_pdbs(nat2, dec2)

    [dec_inter1, nat_inter1, dec_inter2, nat_inter2] = scorefileparse.pdbs_intersect([dec_norm1, nat_norm1, dec_norm2, nat_norm2])
    [dec_inter1, dec_inter2] = scorefileparse.pdbs_scores_intersect([dec_inter1, dec_inter2])
    [nat_inter1, nat_inter2] = scorefileparse.pdbs_scores_intersect([nat_inter1, nat_inter2])

    dec_filt1 = scorefileparse.filter_norm_pdbs(dec_norm1)
    nat_filt1 = scorefileparse.filter_norm_pdbs(nat_norm1)
    dec_filt2 = scorefileparse.filter_norm_pdbs(dec_norm2)
    nat_filt2 = scorefileparse.filter_norm_pdbs(nat_norm2)

    [dec_finter1, dec_finter2] = scorefileparse.pdbs_scores_intersect([dec_filt1, dec_filt2])
    [nat_finter1, nat_finter2] = scorefileparse.pdbs_scores_intersect([nat_filt1, nat_filt2])

    fig, axarr = conv.create_ax(2, len(dec_inter1))

    # Every statement of the loop body now shares one indentation level; the
    # previous mix of spaces and tabs placed the plot calls at a deeper,
    # invalid level.
    for x_ind, pdb in enumerate(sorted(dec_inter1.keys())):

        ax = axarr[x_ind, 0]

        plot(dec_inter1, dec_inter2, nat_inter1, nat_inter2, ax, pdb, title1, title2)

        ax = axarr[x_ind, 1]

        plot(dec_finter1, dec_finter2, nat_finter1, nat_finter2, ax, pdb, title1, title2)
def main(args):
    """Scatter the disc value of every PDB against its energy gap.

    ``args[1]`` is the input directory and ``args[2]`` the score type.  The
    energy gap of a PDB is the lowest native score among natives whose second
    field is below 2.0, minus the lowest decoy score.  The x value is
    element 0 of the PDB's entry from ``discparse.read_dir``.  The figure is
    saved under ``<input dir>/test.txt`` with the "disc_v_egap" suffix.
    """
    # Read in and rename arguments.
    inp_dir = args[1]
    scoretype = args[2]

    dec, nat = scorefileparse.read_dec_nat(inp_dir, [], scoretype)
    disc = discparse.read_dir(inp_dir)

    dec_norm = scorefileparse.norm_pdbs(dec)
    nat_norm = scorefileparse.norm_pdbs(nat, dec)

    dec_inter, nat_inter, disc_inter = scorefileparse.pdbs_intersect(
        [dec_norm, nat_norm, disc])

    #labels = ["Average","1.0","1.5","2.0","2.5","3.0","4.0","6.0"]
    labels = ["Average"]
    energy_gap = [[] for _ in labels]
    avg_disc = [[] for _ in labels]

    for pdb in dec_inter.keys():
        for ind, _ in enumerate(labels):
            lowest_dec = min(entry[0] for entry in dec_inter[pdb].values())
            lowest_nat = min(entry[0] for entry in nat_inter[pdb].values()
                             if entry[1] < 2.0)
            energy_gap[ind].append(lowest_nat - lowest_dec)
            avg_disc[ind].append(disc_inter[pdb][0])

    fig, axarr = conv.create_ax(len(labels), 1)

    for x_ind, label in enumerate(labels):
        ax = axarr[0, x_ind]
        xs = avg_disc[x_ind]
        ys = energy_gap[x_ind]
        scatterplot.draw_actual_plot(ax, xs, ys, [], label, "Disc",
                                     "Energy Gap")
        scatterplot.plot_regression(ax, xs, ys, False, False)

    title = os.path.basename(inp_dir)  # not used below

    filename = inp_dir + "/test.txt"

    conv.save_fig(fig, filename, "disc_v_egap", len(labels) * 3, 4)
Example #19
0
def gen_plots(c, r, ax2, output_pre, dirname, st=""):
    """Run the 30 sliding-window correlations and draw their summary.

    For each window start 1 to 30 (window end is start + 9), the data from
    ``combine_counts_ratios`` are passed to ``find_coeff_pval`` together with
    the matching panel of a 31-panel figure.  The collected coefficients and
    p-values are drawn on the last panel and on ``ax2``.  The saved figure's
    suffix is the third-from-last path component of ``dirname`` joined with
    ``st``.
    """
    fig, axarr = conv.create_ax(1, 31, shx=True, shy=True)

    results = []  # (window start, coefficient, p-value)
    for first in xrange(1, 31):
        last = first + 9
        combined = combine_counts_ratios(c, r, first, last, st=st)
        coeff, pval = find_coeff_pval(
            combined, axarr[first - 1, 0],
            "Sliding Window: {0} to {1}".format(first, last))
        print("%s %s %s" % (first, coeff, pval))
        results.append((first, coeff, pval))

    counters = [entry[0] for entry in results]
    coeffs = [entry[1] for entry in results]
    pvals = [entry[2] for entry in results]

    plot_coeff_pval(axarr[30, 0], counters, coeffs, pvals)
    suffix = os.path.normpath(dirname).split(os.sep)[-3]
    figure_tag = "{0}_{1}".format(suffix, st)
    conv.save_fig(fig, output_pre + "correlation_plot.txt", figure_tag,
                  4, 20 * 4)

    plot_coeff_pval(ax2, counters, coeffs, pvals, suffix)
Example #20
0
def process_dir(dirnames, unsel, output_pre):
    """Generate the sliding-window plots for every directory in ``dirnames``.

    In each directory the first file matching ``counts_<unsel>*_PRO_qc`` (or
    None when nothing matches) and the first file matching
    ``ratios_*_PRO_qc`` are loaded with ``read_files``.  ``gen_plots`` is then
    run three times, with ``st`` set to "", "mean" and "median", each drawing
    on its own row of the shared summary figure.
    """
    n_dirs = len(dirnames)
    fig_all, axarr_all = conv.create_ax(n_dirs, 3, shx=True, shy=True)

    for ind, dirname in enumerate(dirnames):
        print(dirname)
        counts_fn = dirname + '/counts_' + unsel + '*_PRO_qc'
        ratios_fn = dirname + '/ratios_*_PRO_qc'

        # The counts file is optional; the ratios file is required.
        counts_matches = glob.glob(counts_fn)
        c_fn = counts_matches[0] if len(counts_matches) != 0 else None
        r_fn = glob.glob(ratios_fn)[0]

        c, r = read_files(c_fn, r_fn)

        for row, statistic in enumerate(("", "mean", "median")):
            gen_plots(c, r, axarr_all[row, ind], output_pre, dirname,
                      st=statistic)

    conv.save_fig(fig_all, output_pre + "all_coeff_pval.txt", "",
                  4 * n_dirs, 12)
Example #21
0
def process_dir(dirnames, unsel, output_pre):
    """Run ``gen_plots`` in three modes for each directory and save a summary.

    For every directory, the counts file (first match of
    ``counts_<unsel>*_PRO_qc``, or None if absent) and the ratios file (first
    match of ``ratios_*_PRO_qc``) are read with ``read_files``.  Rows 0, 1 and
    2 of the summary figure receive the plots for ``st=""``, ``st="mean"`` and
    ``st="median"``.
    """
    fig_all, axarr_all = conv.create_ax(len(dirnames), 3, shx=True, shy=True)

    modes = ["", "mean", "median"]
    for column, dirname in enumerate(dirnames):
        print(dirname)
        counts_pattern = dirname + '/counts_' + unsel + '*_PRO_qc'
        ratios_pattern = dirname + '/ratios_*_PRO_qc'

        found_counts = glob.glob(counts_pattern)
        if found_counts:
            c_fn = found_counts[0]
        else:
            c_fn = None  # no counts file for this directory
        r_fn = glob.glob(ratios_pattern)[0]

        c, r = read_files(c_fn, r_fn)

        for row in range(3):
            gen_plots(c, r, axarr_all[row, column], output_pre, dirname,
                      st=modes[row])

    width = 4 * len(dirnames)
    conv.save_fig(fig_all, output_pre + "all_coeff_pval.txt", "", width, 12)
def main(list_sequence_names, canonical_list, output_prefix ):
    """Histogram each cleaved sequence's distances to the uncleaved set.

    The first file labelled "CLEAVED" and the first labelled "UNCLEAVED" in
    ``list_sequence_names`` are read.  For every cleaved sequence the minimum,
    mean and maximum ``conv.hamdist`` to the uncleaved sequences are recorded;
    the three values are also printed for sequences present in
    ``canonical_list``.  Three histograms are saved with the
    "dist_from_bounds" suffix.
    """
    series = []  # not used below

    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    def first_file_labelled(wanted):
        # File name of the first entry carrying the label `wanted`.
        return [name for name, label in list_sequence_names
                if label == wanted][0]

    cleaved_seqs = seq_IO.read_sequences(first_file_labelled("CLEAVED"))
    uncleaved_seqs = seq_IO.read_sequences(first_file_labelled("UNCLEAVED"))

    min_dist = []
    avg_dist = []
    max_dist = []

    for seq in cleaved_seqs:
        distances = [conv.hamdist(seq, unc) for unc in uncleaved_seqs]
        min_dist.append(min(distances))
        avg_dist.append(numpy.mean(distances))
        max_dist.append(max(distances))
        if seq in canonical_list_seq:
            # Report the statistics just recorded for a canonical sequence.
            for value in (seq, min_dist[-1], avg_dist[-1], max_dist[-1]):
                print(value)

    fig, ax = pconv.create_ax(1, 3)

    panels = [
        (min_dist, "Min. Distance from Boundary", "Minimum Distances"),
        (avg_dist, "Avg. Distance from Boundary", "Average Distances"),
        (max_dist, "Max. Distance from Boundary", "Maximum Distances"),
    ]
    for row, (values, panel_title, x_label) in enumerate(panels):
        hist.draw_actual_plot(ax[row, 0], values, panel_title, x_label,
                              log=False, normed=True, label=None, nbins=15,
                              stacked=False)

    #ax[0,0].set_xlim(xmin=1,xmax=5)
    #ax[0,0].set_xticks(xrange(1,6))
    pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
def main(list_sequence_names, canonical_list, output_prefix, func_labels, unfunc_labels):
    """Plot the functional fraction of variants against mutation count.

    For every canonical sequence read from ``canonical_list``, the sequences
    of each ``[filename, label]`` entry in ``list_sequence_names`` are binned
    by their ``conv.hamdist`` distance (1 to 5) from the canonical sequence.
    At each distance the fraction ``func / (func + unfunc)`` is computed from
    the labels listed in ``func_labels`` and ``unfunc_labels``; distances
    without any unfunctional sequence are omitted.  One series per canonical
    sequence is drawn and the figure is saved with a suffix built from the
    last canonical sequence.

    Raises:
        ValueError: if ``canonical_list`` yields no sequences (previously
            this surfaced as a NameError when the output suffix was built).
    """
    canonical_list_seq = seq_IO.read_sequences(canonical_list)
    if len(canonical_list_seq) == 0:
        raise ValueError("canonical_list contains no sequences: "
                         + str(canonical_list))

    # The labelled files do not depend on the canonical sequence, so each
    # one is read once instead of once per canonical sequence.
    labelled_sequences = [(label, seq_IO.read_sequences(filename))
                          for filename, label in list_sequence_names]

    mutation_counts = list(range(1, 6))
    series = []

    for canonical in canonical_list_seq:

        # label -> {distance: number of sequences at that distance}
        dict_sequences = {}
        for label, sequences in labelled_sequences:
            distances = [conv.hamdist(seq, canonical) for seq in sequences]
            # Count the sequences at each distance.  The earlier code summed
            # the distances themselves, which scaled both terms of the ratio
            # below by the same factor and therefore gave the same fraction.
            dict_sequences[label] = {
                i: sum(1 for d in distances if d == i)
                for i in mutation_counts
            }

        x = []
        y = []
        for i in mutation_counts:
            func = 0.0
            unfunc = 0.0
            for label, dict_counts in dict_sequences.items():
                if label in func_labels:
                    func = func + dict_counts[i]
                elif label in unfunc_labels:
                    unfunc = unfunc + dict_counts[i]
            # Distances with no unfunctional sequence are left out.
            if unfunc != 0:
                x.append(i)
                y.append(func / (func + unfunc))
        print(x)
        print(y)
        series.append([x, y, canonical])

    fig, ax = pconv.create_ax(1, 1)

    scatterplot.plot_series(ax[0, 0], series, title="",
                            x_axis="# of Mutations",
                            y_axis="Fraction of Variants that are Functional",
                            alpha=1.0, connect_dots=True, size=30,
                            edgecolors='k')
    ax[0, 0].set_xlim(xmin=1, xmax=5)
    ax[0, 0].set_xticks(mutation_counts)
    # `canonical` is the last canonical sequence processed above.
    pconv.save_fig(fig, output_prefix, canonical + "_fraction_func_mutant",
                   6, 6, size=15)
def plot(disc_metrics_1, disc_metrics_2, title1, title2, output_pre, add_slash=True):
    """Scatter every metric of ``disc_metrics_1`` against ``disc_metrics_2``.

    Both arguments map a PDB name to a mapping of metric name to value.  One
    panel is drawn per metric of the first (sorted) PDB, with one point per
    PDB and a regression overlay.  The output file name is ``output_pre``,
    an optional "/" (when ``add_slash`` is true), then
    ``<title1>_<title2>.txt``.
    """
    pdbs = sorted(disc_metrics_1.keys())
    n_metrics = len(disc_metrics_1[pdbs[0]])

    fig, axarr = conv.create_ax(n_metrics, 1)

    metric_names = disc_metrics_1[pdbs[0]].keys()
    for x_ind, metric_name in enumerate(metric_names):
        ax = axarr[0, x_ind]
        x = []
        y = []
        for pdb in pdbs:
            x.append(disc_metrics_1[pdb][metric_name])
            y.append(disc_metrics_2[pdb][metric_name])

        scatterplot.draw_actual_plot(ax, x, y, 'b', metric_name, title1,
                                     title2, size=20, edgecolors='k')
        scatterplot.plot_regression(ax, x, y, False)

    separator = "/" if add_slash else ""
    filename = output_pre + separator + title1 + "_" + title2 + ".txt"
    suffix = "disc_v_disc"

    conv.save_fig(fig, filename, suffix, n_metrics * 3, 3, size=9)
Example #25
0
def main(data_file, output_prefix, degree_file, width, height):
    """Save a histogram of mean "Frac" values and a bar chart of them by degree.

    ``data_file`` supplies, per sequence, a "Frac" entry that is averaged with
    ``np.mean``.  ``degree_file`` supplies each sequence's "Degree".  The
    histogram (suffix "hist") shows all averages in 30 bins.  The bar chart
    (suffix "bar") shows the mean of the averages for every degree, with the
    standard deviation as error bar.  Both figures are saved with the given
    ``width`` and ``height``.
    """
    read_options = dict(additional_params=True, header=True)
    sequences = seq_IO.read_sequences(data_file, list_vals=True,
                                      **read_options)
    seq_degree = seq_IO.read_sequences(degree_file, **read_options)

    # degree -> list of mean "Frac" values of the sequences with that degree
    degree_frac = defaultdict(list)
    for seq, seq_dict in sequences.items():
        same_degree = degree_frac[seq_degree[seq]['Degree']]
        same_degree.append(np.mean(seq_dict["Frac"]))

    data = []
    for seq, seq_dict in sequences.items():
        data.append(np.mean(seq_dict["Frac"]))

    degree_frac_avg = []
    for degree, list_fracs in degree_frac.items():
        degree_frac_avg.append(np.mean(list_fracs))
    degree_frac_std = []
    for degree, list_fracs in degree_frac.items():
        degree_frac_std.append(np.std(list_fracs))

    save_options = dict(tight=True, size=10)

    fig, axarr = pconv.create_ax(1, 1, shx=False, shy=False)
    hist.draw_actual_plot(axarr[0, 0], data, "", "", normed=False, nbins=30,
                          edgecolor=None, log=False)
    pconv.save_fig(fig, output_prefix, "hist", width, height, **save_options)

    fig2, axarr2 = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr2[0, 0], degree_frac_avg, 'g', "", "Degree",
                         "Fraction Shortest Path Uncleaved",
                         tick_label=degree_frac.keys(),
                         yerr=degree_frac_std)
    pconv.save_fig(fig2, output_prefix, "bar", width, height, **save_options)
Example #26
0
def main(epistasis_file):
    """Save Venn diagrams of observed versus predicted cleavage classes.

    The comma-separated ``epistasis_file`` is read (its first line is a
    header and is skipped).  For each of the classes CLEAVED, UNCLEAVED and
    MIDDLE, the set of keys whose recorded label equals that class is
    compared with the set of keys predicted to belong to it from the labels
    of their intermediates.  One two-set Venn diagram per class is written
    through ``pconv.save_fig`` with the suffixes "venn", "vennun" and
    "vennmid".
    """
    # records[tokens[0]] = [tokens[1], tokens[2], float(tokens[3]),
    #                       tokens[4::2], stripped tokens[5::2]]
    # Per the original note: starting fitness, ending fitness, epistasis and
    # the interleaved sequences / fitness labels of the intermediates.
    records = {}

    with open(epistasis_file) as handle:
        rows = handle.readlines()

    for row in rows[1:]:  # skip the header line
        fields = row.split(',')
        # NOTE(review): the lookup uses a (tokens[2], tokens[0]) tuple while
        # entries are stored under the plain tokens[0] string, so this guard
        # never rejects a row and later rows overwrite earlier ones.  Confirm
        # the intended duplicate handling before changing it.
        if (fields[2], fields[0]) not in records:
            records[fields[0]] = [
                fields[1],
                fields[2],
                float(fields[3]),
                fields[4::2],
                [label.strip() for label in fields[5::2]],
            ]

    def observed(wanted):
        # Keys whose recorded label (slot 1) equals `wanted`.
        return {key for key, rec in records.items() if rec[1] == wanted}

    seq_func = observed("CLEAVED")
    seq_pred_func = {
        key for key, rec in records.items()
        if all(label == "CLEAVED" for label in rec[4])
    }

    seq_unfunc = observed("UNCLEAVED")
    seq_pred_unfunc = {
        key for key, rec in records.items()
        if any(label == "UNCLEAVED" for label in rec[4])
        or sum(label == "MIDDLE" for label in rec[4]) == 2
    }

    seq_midfunc = observed("MIDDLE")
    seq_pred_midfunc = {
        key for key, rec in records.items()
        if any(label == "MIDDLE" for label in rec[4])
    }

    # (file suffix, [observed set, predicted set], set labels)
    diagrams = [
        ("venn", [seq_func, seq_pred_func], ["Cleaved", "Pred Cleaved"]),
        ("vennun", [seq_unfunc, seq_pred_unfunc],
         ["Uncleaved", "Pred Uncleaved"]),
        ("vennmid", [seq_midfunc, seq_pred_midfunc],
         ["Middle", "Pred Middle"]),
    ]

    # Create every figure first, then draw, then save (same order as before).
    canvases = [pconv.create_ax(1, 1) for _ in diagrams]

    for (_, axes), (_, sets, set_labels) in zip(canvases, diagrams):
        venn2(sets, set_labels=set_labels, ax=axes[0, 0])

    for (figure, _), (suffix, _, _) in zip(canvases, diagrams):
        pconv.save_fig(figure,
                       epistasis_file,
                       suffix,
                       5,
                       5,
                       tight=False,
                       size=14)
Example #27
0
def main(sequence_ratio_file, width, height, pattern, legend):
    """Draw a bar chart of per-sequence FLAG/HA ratios with error bars.

    Each record returned by ``seq_IO.read_sequences`` is indexed
    positionally: 0 = sequence (tick label), 1 = average ratio, 2 = standard
    deviation (used as the error bar), 3 = label, and optionally 4 = colour.
    When no colour column is present the colour is derived from the label
    with ``convert_label_color``.

    If ``legend`` is truthy, a legend is built from the labels other than
    CLEAVED / MIDDLE / UNCLEAVED and passed to ``pconv.save_fig`` as an extra
    artist.  The figure is saved with the suffix "plot".
    """
    records = seq_IO.read_sequences(sequence_ratio_file,
                                    additional_params=True)

    seqs = [rec[0] for rec in records]
    avg_ratio = [rec[1] for rec in records]
    err = [rec[2] for rec in records]  # standard deviation used unchanged
    label = [rec[3] for rec in records]

    has_colour_column = len(records[0]) > 4
    if has_colour_column:
        color = [rec[4] for rec in records]
    else:
        color = [convert_label_color(name) for name in label]

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)

    # Keyword arguments common to both the legend and no-legend call.
    bar_options = dict(tick_label=seqs, yerr=err, pattern=pattern)

    if not legend:
        bar.draw_actual_plot(axarr[0, 0], avg_ratio, color, "", "",
                             "FLAG/HA Ratio", **bar_options)
        lgd = None
    else:
        # The three standard class names get no legend entry.
        standard = ["CLEAVED", "MIDDLE", "UNCLEAVED"]
        bar_options["label"] = [
            None if name in standard else name for name in label
        ]
        patches, labels = bar.draw_actual_plot(axarr[0, 0], avg_ratio, color,
                                               "", "", "FLAG/HA Ratio",
                                               **bar_options)
        lgd = axarr[0, 0].legend(patches,
                                 labels,
                                 loc="upper center",
                                 bbox_to_anchor=(0.5, 1.05),
                                 borderaxespad=0.,
                                 prop={'size': 9},
                                 ncol=2,
                                 fancybox=True)
        print(patches)
        print(labels)

    axarr[0, 0].set_ylim([0, 1.3])
    pconv.save_fig(fig,
                   sequence_ratio_file,
                   "plot",
                   width,
                   height,
                   tight=True,
                   size=10,
                   extra_artists=lgd)
Example #28
0
def main(epistasis_file):
    """Plot epistasis statistics grouped by number of mutations (2 to 5).

    Reads the comma-separated ``epistasis_file`` (header line skipped) and
    saves three figures through ``pconv.save_fig``:

    * "plot": stacked bars of no / negative / positive epistasis counts,
      drawn by ``plot_epi``;
    * "pred_v_cl": fraction of cases whose ending label is CLEAVED versus
      fraction whose intermediates are all CLEAVED;
    * "venn": Venn diagram of those two groups of ending sequences.
    """
    # Keyed by (tokens[0], tokens[2]).  Value layout, per the original note:
    # [start ratio, end ratio, epistasis, ending fitness, # of mutations,
    #  InterSeqs, InterFits, InterRatios]
    pair_records = {}

    with open(epistasis_file) as handle:
        rows = handle.readlines()

    for row in rows[1:]:  # skip the header line
        tokens = row.split(',')
        # A pair already stored in the opposite orientation is not repeated.
        if (tokens[2], tokens[0]) in pair_records:
            continue
        start_ratio = float(tokens[1])
        end_ratio = float(tokens[3])
        epistasis = float(tokens[5])
        inter_seqs = tokens[6::3]
        pair_records[(tokens[0], tokens[2])] = [
            start_ratio, end_ratio, epistasis, tokens[4],
            len(inter_seqs), inter_seqs, tokens[7::3], tokens[8::3]
        ]

    tolerance = 0.000005
    neg_epistasis = [0] * 4
    no_epistasis = [0] * 4
    pos_epistasis = [0] * 4
    n_functional = [0] * 4
    n_should_be_functional = [0] * 4
    n_total = [0.0] * 4  # floats, so the fractions below use true division

    # One pass over the records; slot k holds the cases with k + 2 mutations.
    for rec in pair_records.values():
        slot = rec[4] - 2
        if slot < 0 or slot > 3:
            continue
        n_total[slot] += 1
        if rec[2] < -tolerance:
            neg_epistasis[slot] += 1
        if abs(rec[2]) < tolerance:
            no_epistasis[slot] += 1
        if rec[2] > tolerance:
            pos_epistasis[slot] += 1
        if rec[3] == "CLEAVED":
            n_functional[slot] += 1
        if all(fit == "CLEAVED" for fit in rec[6]):
            n_should_be_functional[slot] += 1

    seq_func = {
        pair[1] for pair, rec in pair_records.items() if rec[3] == "CLEAVED"
    }
    seq_pred_func = {
        pair[1] for pair, rec in pair_records.items()
        if all(fit == "CLEAVED" for fit in rec[6])
    }

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    fig2, axarr2 = pconv.create_ax(1, 1)

    # Stack the bars: "No" at the bottom, then "Neg.", then "Pos.".
    artists = []
    artists.extend(
        plot_epi(no_epistasis, n_total, axarr[0, 0], "No", color="gray"))
    artists.extend(
        plot_epi(neg_epistasis, n_total, axarr[0, 0], "Neg.",
                 bottom=no_epistasis, color="white"))
    stack_base = [no + neg for no, neg in zip(no_epistasis, neg_epistasis)]
    artists.extend(
        plot_epi(pos_epistasis, n_total, axarr[0, 0], "Pos.",
                 bottom=stack_base, color="black"))

    # NOTE(review): a mutation count with no cases leaves 0.0 in n_total and
    # these divisions raise ZeroDivisionError.
    n_func_frac = []
    for func, total in zip(n_functional, n_total):
        n_func_frac.append(func / total)
    n_pred_frac = []
    for pred, total in zip(n_should_be_functional, n_total):
        n_pred_frac.append(pred / total)

    line_series = [(range(2, 6), n_func_frac, "% Cleaved"),
                   (range(2, 6), n_pred_frac, "% Pred Cleaved")]
    scatterplot.plot_series(axarr2[0, 0],
                            line_series,
                            "",
                            "Number of Mutations",
                            "Fraction of Total Cases",
                            size=40,
                            connect_dots=True,
                            alpha=1.0)
    axarr2[0, 0].set_ylim([0, 4.0])

    fig_venn, axarr_venn = pconv.create_ax(1, 1)
    venn2([seq_func, seq_pred_func],
          set_labels=["Cleaved", "Pred Cleaved"],
          ax=axarr_venn[0, 0])

    lgd = axarr[0, 0].legend(artists, ["No", "Neg.", "Pos."],
                             loc="center left",
                             bbox_to_anchor=(1.05, 0.5),
                             borderaxespad=0.,
                             prop={'size': 9},
                             ncol=1,
                             fancybox=True)

    pconv.save_fig(fig, epistasis_file, "plot", 3, 2.5,
                   tight=False, size=9, extra_artists=lgd)
    pconv.save_fig(fig2, epistasis_file, "pred_v_cl", 5, 5,
                   tight=True, size=10)
    pconv.save_fig(fig_venn, epistasis_file, "venn", 5, 5,
                   tight=True, size=14)
Example #29
0
def main(seq_file, canonical_file, output_prefix):
    """Plot, per canonical sequence, the fraction reached versus step count.

    For every sequence read from ``canonical_file`` a graph is built over
    the sequences listed in ``seq_file`` (one per line), with an edge between
    any two entries whose Hamming distance is below 2.  When the canonical
    sequence is not in the file, its one-mutation neighbours (from
    ``gsconv.gen_hamdist_one``) and the canonical itself are appended after
    the file's sequences.  ``trans_matrix`` turns the graph into a matrix,
    and ``find_frac`` / ``square_matrix`` supply the plotted value for steps
    1 through 22.  All series are drawn on one axis and saved with the
    suffix "fraction_func".

    Progress timestamps are printed to stdout.
    """
    import sys  # local import: only needed for the print threshold below

    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_file)

    print("Beginning Script: {0}".format(datetime.datetime.now()))

    # Never abbreviate printed arrays.  The former threshold='nan' relied on
    # Python 2 str/int comparison and is rejected by current NumPy.
    numpy.set_printoptions(threshold=sys.maxsize)

    # The sequence file does not depend on the canonical: read it once.
    with open(seq_file) as strings:
        seq_list = strings.read().splitlines()
    orig_len = len(seq_list)

    for canonical in canonical_list_seq:

        seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
        if canonical not in seq_list:
            one_away = gsconv.gen_hamdist_one(canonical)
            one_away = [o for o in one_away if o != canonical] + [canonical]
            first_new = len(seq_ind_list)
            seq_ind_list = seq_ind_list + [
                (o, ind) for (ind, o) in enumerate(one_away, first_new)
            ]

        edges = [(seq2, seq)
                 for seq, seq2 in itertools.combinations(seq_ind_list, 2)
                 if gsconv.hamdist(seq2[0], seq[0]) < 2]
        print(len(seq_ind_list))
        print("Generated Edges: {0}".format(datetime.datetime.now()))

        canon_ind = [i for (s, i) in seq_ind_list if s == canonical][0]

        T_mat = trans_matrix(seq_ind_list, edges)

        print("Transformed Matrix: {0}".format(datetime.datetime.now()))

        # Step 0 is plotted as 0; step 1 uses the matrix as built.
        x = [0, 1]
        y = [0, find_frac(T_mat, canon_ind, orig_len)]

        T_mat_new = T_mat

        for i in range(2, 23):
            x.append(i)
            T_mat_new, frac = square_matrix(T_mat_new, T_mat, canon_ind,
                                            orig_len)
            y.append(frac)

            print("Raised Matrix {0}: {1}".format(i, datetime.datetime.now()))

        series.append([x, y, canonical])

    fig, ax = conv.create_ax(1, 1)

    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']

    scatterplot.plot_series(ax[0, 0],
                            series,
                            title="",
                            x_axis="Number of Steps",
                            colors=color,
                            y_axis="Fraction Cleaved Variants Reached",
                            alpha=0.85,
                            connect_dots=True,
                            size=15,
                            edgecolors='k',
                            linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(range(1, 23, 3))
    lgd = conv.add_legend(ax[0, 0],
                          location='upper center',
                          bbox_to_anchor=(0.5, 1.05),
                          ncol=2,
                          size=8)
    conv.save_fig(fig,
                  output_prefix,
                  "fraction_func",
                  2.5,
                  3,
                  size=9.5,
                  extra_artists=lgd)

    print("Outputted Figure: {0}".format(datetime.datetime.now()))
def plot_weights_dS(w_dS_sorted, col_dS, unique_z_vals, title1, title2, x_axis_name, y_axis_name, z_axis_name):
    """Draw one filled contour panel of dS per unique z value and save it.

    Columns 0, 1 and 2 of ``w_dS_sorted`` are used as the x, y and z
    coordinates and column ``col_dS`` as the plotted value.  For each value
    in ``unique_z_vals`` the matching rows are reshaped into grids with
    ``len(unique_z_vals)`` columns and contoured.  Each panel is annotated
    with its minimum; the panel holding the overall minimum is marked in red
    and labelled "Global Minimum".  A colour bar is added for the last panel
    drawn and the figure is saved through ``conv.save_fig``.

    NOTE(review): the output path is built from the module-level ``args``
    object (``args.output_pre``), which must exist when this is called.
    """
    n_unique_z_vals = unique_z_vals.shape[0]

    fig, axarr = conv.create_ax(n_unique_z_vals, 1, True, True)

    # Rows attaining the overall minimum of the dS column.
    dS_column = w_dS_sorted[:, col_dS]
    min_vals = w_dS_sorted[dS_column == np.amin(dS_column), :]

    # Contour levels: every nlev-th sorted dS value.  Floor division keeps
    # the step an integer on Python 3, and the step is clamped to 1 because
    # a zero slice step raises ValueError.
    dS_list = sorted(dS_column.tolist())
    nlev = len(dS_list) // 10 if len(dS_list) > 10 else len(dS_list) // 2
    nlev = max(nlev, 1)
    levels = dS_list[0::nlev]

    subtitle_prefix = z_axis_name + ": "
    xlabel = x_axis_name + " Weights"
    ylabel = y_axis_name + " Weights"

    for ax_ind, z_val in enumerate(sorted(unique_z_vals.tolist())):
        ax = axarr[0, ax_ind]
        in_slice = w_dS_sorted[:, 2] == z_val
        x = w_dS_sorted[in_slice, 0]
        y = w_dS_sorted[in_slice, 1]
        dS = w_dS_sorted[in_slice, col_dS]
        X = x.reshape(-1, n_unique_z_vals)
        Y = y.reshape(-1, n_unique_z_vals)
        DS = dS.reshape(-1, n_unique_z_vals)

        curr_min_ind = np.argmin(dS)

        CS = ax.contourf(X, Y, DS,
                         extend='both', levels=levels)

        CSlines = ax.contour(X, Y, DS, linestyles='solid',
                             colors=('w',), levels=levels)
        if z_val == min_vals[0, 2]:
            # This panel contains the overall minimum.
            min_x = [min_vals[0, 0]]
            min_y = [min_vals[0, 1]]
            min_ds = [min_vals[0, col_dS]]
            ann_txt = "Global Minimum: {2:.2f} at ({0:.2f}, {1:.2f})".format(min_x[0], min_y[0], min_ds[0])
            ax.scatter(min_x, min_y, c='r', zorder=1)
        else:
            # Minimum within this z slice only.
            min_x = [x[curr_min_ind]]
            min_y = [y[curr_min_ind]]
            min_ds = [dS[curr_min_ind]]
            ann_txt = "Minimum: {2:.2f} at ({0:.2f}, {1:.2f})".format(min_x[0], min_y[0], min_ds[0])
            ax.scatter(min_x, min_y, c='k', zorder=1)

        ax.annotate(ann_txt, xy=(min_x[0], min_y[0]), xytext=(-20, 20),
                    textcoords='offset points', ha='center', va='bottom',
                    bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3),
                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.5',
                                    color='red'), size=10)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title("{0} : {1:.2f}".format(subtitle_prefix, z_val))

    # The colour bar is built from the last panel's contour sets.
    cbar = plt.colorbar(CS)
    if col_dS == 3:
        cbar.ax.set_ylabel('combined_S - Rosetta_S')
    else:
        cbar.ax.set_ylabel('combined_S - Amber_S')
    # Add the contour line levels to the colorbar
    cbar.add_lines(CSlines)

    filename = args.output_pre + "/" + title1 + "_" + title2 + "_" + str(col_dS) + ".txt"

    conv.save_fig(fig, filename, "_weights_vs_deltaS", 4*n_unique_z_vals, 4)
def main(list_sequence_names, canonical_list, output_prefix):
    """Histogram Hamming distances from cleaved to uncleaved sequences.

    ``list_sequence_names`` holds (file name, label) pairs; the first file
    labelled "CLEAVED" and the first labelled "UNCLEAVED" are read.  For
    each cleaved sequence the minimum, mean and maximum ``conv.hamdist`` to
    the uncleaved sequences are collected, and the three values are printed
    for sequences that also appear in ``canonical_list``.  Three normalised
    histograms are saved with the suffix "dist_from_bounds".
    """
    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    cleaved_file = [
        name for name, tag in list_sequence_names if tag == "CLEAVED"
    ][0]
    cleaved_seqs = seq_IO.read_sequences(cleaved_file)

    uncleaved_file = [
        name for name, tag in list_sequence_names if tag == "UNCLEAVED"
    ][0]
    uncleaved_seqs = seq_IO.read_sequences(uncleaved_file)

    min_dist, avg_dist, max_dist = [], [], []

    for seq in cleaved_seqs:
        distances = [conv.hamdist(seq, unc) for unc in uncleaved_seqs]
        min_dist.append(min(distances))
        avg_dist.append(numpy.mean(distances))
        max_dist.append(max(distances))
        if seq in canonical_list_seq:
            # Report the summary values just recorded for this sequence.
            for value in (seq, min_dist[-1], avg_dist[-1], max_dist[-1]):
                print(value)

    fig, ax = pconv.create_ax(1, 3)

    # (axes row, values, first text argument, second text argument)
    panels = [
        (0, min_dist, "Min. Distance from Boundary", "Minimum Distances"),
        (1, avg_dist, "Avg. Distance from Boundary", "Average Distances"),
        (2, max_dist, "Max. Distance from Boundary", "Maximum Distances"),
    ]
    for row, values, text_a, text_b in panels:
        hist.draw_actual_plot(ax[row, 0],
                              values,
                              text_a,
                              text_b,
                              log=False,
                              normed=True,
                              label=None,
                              nbins=15,
                              stacked=False)

    pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
def main(seq_file, canonical_file, output_prefix):
    """Plot, per canonical sequence, the fraction reached versus step count.

    For every sequence read from ``canonical_file`` a graph is built over
    the sequences listed in ``seq_file`` (one per line), with an edge between
    any two entries whose Hamming distance is below 2.  When the canonical
    sequence is not in the file, its one-mutation neighbours (from
    ``gsconv.gen_hamdist_one``) and the canonical itself are appended after
    the file's sequences.  ``trans_matrix`` turns the graph into a matrix,
    and ``find_frac`` / ``square_matrix`` supply the plotted value for steps
    1 through 22.  All series are drawn on one axis and saved with the
    suffix "fraction_func".

    Progress timestamps are printed to stdout.
    """
    import sys  # local import: only needed for the print threshold below

    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_file)

    print("Beginning Script: {0}".format(datetime.datetime.now()))

    # Never abbreviate printed arrays.  The former threshold='nan' relied on
    # Python 2 str/int comparison and is rejected by current NumPy.
    numpy.set_printoptions(threshold=sys.maxsize)

    # The sequence file does not depend on the canonical: read it once.
    with open(seq_file) as strings:
        seq_list = strings.read().splitlines()
    orig_len = len(seq_list)

    for canonical in canonical_list_seq:

        seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
        if canonical not in seq_list:
            one_away = gsconv.gen_hamdist_one(canonical)
            one_away = [o for o in one_away if o != canonical] + [canonical]
            first_new = len(seq_ind_list)
            seq_ind_list = seq_ind_list + [
                (o, ind) for (ind, o) in enumerate(one_away, first_new)
            ]

        edges = [(seq2, seq)
                 for seq, seq2 in itertools.combinations(seq_ind_list, 2)
                 if gsconv.hamdist(seq2[0], seq[0]) < 2]
        print(len(seq_ind_list))
        print("Generated Edges: {0}".format(datetime.datetime.now()))

        canon_ind = [i for (s, i) in seq_ind_list if s == canonical][0]

        T_mat = trans_matrix(seq_ind_list, edges)

        print("Transformed Matrix: {0}".format(datetime.datetime.now()))

        # Step 0 is plotted as 0; step 1 uses the matrix as built.
        x = [0, 1]
        y = [0, find_frac(T_mat, canon_ind, orig_len)]

        T_mat_new = T_mat

        for i in range(2, 23):
            x.append(i)
            T_mat_new, frac = square_matrix(T_mat_new, T_mat, canon_ind,
                                            orig_len)
            y.append(frac)

            print("Raised Matrix {0}: {1}".format(i, datetime.datetime.now()))

        series.append([x, y, canonical])

    fig, ax = conv.create_ax(1, 1)

    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']

    scatterplot.plot_series(ax[0, 0],
                            series,
                            title="",
                            x_axis="Number of Steps",
                            colors=color,
                            y_axis="Fraction Cleaved Variants Reached",
                            alpha=0.85,
                            connect_dots=True,
                            size=15,
                            edgecolors='k',
                            linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(range(1, 23, 3))
    lgd = conv.add_legend(ax[0, 0],
                          location='upper center',
                          bbox_to_anchor=(0.5, 1.05),
                          ncol=2,
                          size=8)
    conv.save_fig(fig,
                  output_prefix,
                  "fraction_func",
                  2.5,
                  3,
                  size=9.5,
                  extra_artists=lgd)

    print("Outputted Figure: {0}".format(datetime.datetime.now()))
Example #33
0
def main(list_nodes, output_prefix, metric):
    """Histogram node metrics of cleaved sequences grouped by overlap count.

    ``list_nodes`` holds (file name, label) pairs.  Each file is read with
    ``seq_IO.read_sequences`` and its entries are split by their "type"
    field into CLEAVED / MIDDLE / UNCLEAVED dictionaries keyed by label.
    Cleaved sequences are grouped by the number of labels (1 to 4) in which
    they occur, and one histogram panel per metric is drawn with one data
    series per group.

    ``metric`` selects what is plotted: "metrics" plots every field of the
    node records except label / fitness / type / canonical;
    "Fraction_Cleaved" plots ``average_fraction_neighbors_cleaved``; any
    other value is used as the single field name to plot.  A separate
    "colorbar" figure showing the Blues gradient is saved as well.
    """
    cleaved_seq = {}
    uncleaved_seq = {}
    middle_seq = {}

    for nodes, label in list_nodes:
        sequences = seq_IO.read_sequences(nodes, additional_params=True, header=True)

        cleaved_seq[label] = {key: val for key, val in sequences.items() if val["type"] == "CLEAVED"}
        middle_seq[label] = {key: val for key, val in sequences.items() if val["type"] == "MIDDLE"}
        uncleaved_seq[label] = {key: val for key, val in sequences.items() if val["type"] == "UNCLEAVED"}

    if metric == "metrics":
        labels_non_plot = ["label", "fitness", "type", "canonical"]
        # NOTE(review): field names are taken from the hard-coded "DEMEE"
        # entry of the last file read; that entry must exist.
        orig_labels_to_plot = sorted([key for key in sequences["DEMEE"].keys() if key not in labels_non_plot])
        labels_to_plot = sorted(orig_labels_to_plot)
    else:
        orig_labels_to_plot = [metric]
        labels_to_plot = [metric]

    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)

    nbins = 10

    # Every occurrence of a cleaved sequence across labels, then its count.
    list_seqs = [k for d in cleaved_seq.values() for k in d.keys()]
    count_seqs = Counter(list_seqs)

    seqs_4_l = [s for s in list_seqs if count_seqs[s] == 4]
    seqs_3_l = [s for s in list_seqs if count_seqs[s] == 3]
    seqs_2_l = [s for s in list_seqs if count_seqs[s] == 2]
    seqs_1_l = [s for s in list_seqs if count_seqs[s] == 1]

    if metric != "Fraction_Cleaved":
        seqs_4 = list_metrics(cleaved_seq, seqs_4_l, orig_labels_to_plot)
        seqs_3 = list_metrics(cleaved_seq, seqs_3_l, orig_labels_to_plot)
        seqs_2 = list_metrics(cleaved_seq, seqs_2_l, orig_labels_to_plot)
        seqs_1 = list_metrics(cleaved_seq, seqs_1_l, orig_labels_to_plot)

    # One colour per data series (four groups).
    colors = [tuple(c) for c in plt.cm.Blues(np.linspace(0.2, 1, 4)).tolist()]

    for ind, key in enumerate(labels_to_plot):
        log = key == "pageranks"
        # The legend labels are listed in the same order as the data series
        # of each branch.  Previously five labels ("Cl. by 5" first) were
        # passed for four series, so labels and data did not line up.
        if key == "Fraction_Cleaved":
            data = [average_fraction_neighbors_cleaved(cleaved_seq, uncleaved_seq, middle_seq, seqs_4_l),
                    average_fraction_neighbors_cleaved(cleaved_seq, uncleaved_seq, middle_seq, seqs_3_l),
                    average_fraction_neighbors_cleaved(cleaved_seq, uncleaved_seq, middle_seq, seqs_2_l),
                    average_fraction_neighbors_cleaved(cleaved_seq, uncleaved_seq, middle_seq, seqs_1_l)]
            series_labels = ["Cl. by 4", "Cl. by 3", "Cl. by 2", "Cl. by 1"]
        else:
            data = [get_data_from_dict(seqs_1, key),
                    get_data_from_dict(seqs_2, key),
                    get_data_from_dict(seqs_3, key),
                    get_data_from_dict(seqs_4, key)]
            series_labels = ["Cl. by 1", "Cl. by 2", "Cl. by 3", "Cl. by 4"]
        hist.draw_actual_plot(axarr[0, ind], data, "", key.capitalize(), colors=colors, log=log, normed=True, label=series_labels, nbins=nbins)
        axarr[0, ind].ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))

    pconv.save_fig(fig, output_prefix, metric, n_to_plot*3, 3, tight=True, size=9)

    fig_bar, axarr_bar = pconv.create_ax(1, 1, shx=False, shy=False)

    # Two identical columns give imshow a 256 x 2 image of the gradient.
    # column_stack replaces np.array(zip(...)), which only works while zip
    # returns a list.
    gradient = np.linspace(1, 0.2, 256)
    gradient = np.column_stack((gradient, gradient))
    axarr_bar[0, 0].imshow(gradient, aspect='auto', cmap=plt.get_cmap('Blues'))
    # Hide ticks and tick labels.  Booleans are used because current
    # Matplotlib treats the string 'off' as a truthy value.
    plt.tick_params(
        axis='both',        # apply to both axes
        which='both',       # major and minor ticks
        bottom=False,       # no ticks along the bottom edge
        top=False,          # no ticks along the top edge
        labelbottom=False,  # no labels along the bottom edge
        left=False,         # no ticks along the left edge
        right=False,        # no ticks along the right edge
        labelright=False)   # no labels along the right edge

    pconv.save_fig(fig_bar, output_prefix, "colorbar", 0.3, 3, tight=True)
def main(seq_file, canonical_file, output_prefix):
    """Plot the fraction reached versus step count for DEMEE and DEMED.

    The sequences in ``seq_file`` (one per line) form a graph whose edges
    join each sequence to those of its ``gsconv.gen_hamdist_one`` neighbours
    that are also in the file.  ``trans_matrix`` builds a matrix from the
    graph once; ``square_matrix`` advances it for steps 2 through 22 and
    ``find_frac`` supplies the plotted value for each canonical sequence at
    every step.  The figure is saved with the suffix "fraction_func".

    NOTE(review): ``canonical_file`` is currently unused; the canonical
    sequences are hard-coded below.

    Progress timestamps are printed to stdout.
    """
    import sys  # local import: only needed for the print threshold below

    #canonical_list_seq = seq_IO.read_sequences(canonical_file)
    canonical_list_seq = ["DEMEE", "DEMED"]
    print("Beginning Script: {0}".format(datetime.datetime.now()))

    with open(seq_file) as strings:
        seq_list = strings.read().splitlines()
    seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]

    seq_ind_dict = {seq: ind for seq, ind in seq_ind_list}

    orig_len = len(seq_ind_list)

    edges = []
    edges_set = set()
    print("Read in Data: {0}".format(datetime.datetime.now()))

    for seq, seq_ind in seq_ind_dict.items():
        neighbors = gsconv.gen_hamdist_one(seq)
        # Record this sequence's pairs first, then keep only the pairs whose
        # reverse orientation has not been recorded.
        edges_set.update([(seq, n) for n in neighbors if n in seq_ind_dict])
        edges += [((seq, seq_ind), (n, seq_ind_dict[n])) for n in neighbors
                  if n in seq_ind_dict and (n, seq) not in edges_set]

    print(len(seq_ind_list))
    print("Generated Edges: {0}".format(datetime.datetime.now()))

    # Never abbreviate printed arrays.  The former threshold='nan' relied on
    # Python 2 str/int comparison and is rejected by current NumPy.
    numpy.set_printoptions(threshold=sys.maxsize)

    # Index of the first occurrence of each canonical sequence in the file.
    canon_ind_dict = {
        canonical: [i for (s, i) in seq_ind_list if s == canonical][0]
        for canonical in canonical_list_seq
    }

    T_mat = trans_matrix(seq_ind_list, edges)
    print("Transformed Matrix: {0}".format(datetime.datetime.now()))

    # Step 0 is plotted as 0.0; step 1 uses the matrix as built.
    canon_x = {can: [0, 1] for can in canonical_list_seq}
    canon_y = {
        can: [0.0, find_frac(T_mat, canon_ind_dict[can], orig_len)]
        for can in canonical_list_seq
    }

    print("Made x and y dicts: {0}".format(datetime.datetime.now()))

    T_mat_new = T_mat

    for i in range(2, 23):

        T_mat_new = square_matrix(T_mat_new, T_mat)

        for can in canonical_list_seq:
            canon_x[can].append(i)
            canon_y[can].append(
                find_frac(T_mat_new, canon_ind_dict[can], orig_len))

        print("Raised Matrix {0}: {1}".format(i, datetime.datetime.now()))

    series = [[canon_x[can], canon_y[can], can] for can in canonical_list_seq]

    fig, ax = conv.create_ax(1, 1)

    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']

    scatterplot.plot_series(ax[0, 0], series, title="", x_axis="Number of Steps", colors=color, y_axis="Fraction Cleaved Variants Reached", alpha=0.85, connect_dots=True, size=15, edgecolors='k', linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(range(1, 23, 3))
    lgd = conv.add_legend(ax[0, 0], location='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, size=8)
    conv.save_fig(fig, output_prefix, "fraction_func", 2.5, 3, size=9.5, extra_artists=lgd)

    print("Outputted Figure: {0}".format(datetime.datetime.now()))
Example #35
0
def main(seq_file, canonical_file, output_prefix):
    """Plot the fraction reached versus step count for DEMEE and DEMED.

    The sequences in ``seq_file`` (one per line) form a graph whose edges
    join each sequence to those of its ``gsconv.gen_hamdist_one`` neighbours
    that are also in the file.  ``trans_matrix`` builds a matrix from the
    graph once; ``square_matrix`` advances it for steps 2 through 22 and
    ``find_frac`` supplies the plotted value for each canonical sequence at
    every step.  The figure is saved with the suffix "fraction_func".

    NOTE(review): ``canonical_file`` is currently unused; the canonical
    sequences are hard-coded below.

    Progress timestamps are printed to stdout.
    """
    import sys  # local import: only needed for the print threshold below

    #canonical_list_seq = seq_IO.read_sequences(canonical_file)
    canonical_list_seq = ["DEMEE", "DEMED"]
    print("Beginning Script: {0}".format(datetime.datetime.now()))

    with open(seq_file) as strings:
        seq_list = strings.read().splitlines()
    seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]

    seq_ind_dict = {seq: ind for seq, ind in seq_ind_list}

    orig_len = len(seq_ind_list)

    edges = []
    edges_set = set()
    print("Read in Data: {0}".format(datetime.datetime.now()))

    for seq, seq_ind in seq_ind_dict.items():
        neighbors = gsconv.gen_hamdist_one(seq)
        # Record this sequence's pairs first, then keep only the pairs whose
        # reverse orientation has not been recorded.
        edges_set.update([(seq, n) for n in neighbors if n in seq_ind_dict])
        edges += [((seq, seq_ind), (n, seq_ind_dict[n])) for n in neighbors
                  if n in seq_ind_dict and (n, seq) not in edges_set]

    print(len(seq_ind_list))
    print("Generated Edges: {0}".format(datetime.datetime.now()))

    # Never abbreviate printed arrays.  The former threshold='nan' relied on
    # Python 2 str/int comparison and is rejected by current NumPy.
    numpy.set_printoptions(threshold=sys.maxsize)

    # Index of the first occurrence of each canonical sequence in the file.
    canon_ind_dict = {
        canonical: [i for (s, i) in seq_ind_list if s == canonical][0]
        for canonical in canonical_list_seq
    }

    T_mat = trans_matrix(seq_ind_list, edges)
    print("Transformed Matrix: {0}".format(datetime.datetime.now()))

    # Step 0 is plotted as 0.0; step 1 uses the matrix as built.
    canon_x = {can: [0, 1] for can in canonical_list_seq}
    canon_y = {
        can: [0.0, find_frac(T_mat, canon_ind_dict[can], orig_len)]
        for can in canonical_list_seq
    }

    print("Made x and y dicts: {0}".format(datetime.datetime.now()))

    T_mat_new = T_mat

    for i in range(2, 23):

        T_mat_new = square_matrix(T_mat_new, T_mat)

        for can in canonical_list_seq:
            canon_x[can].append(i)
            canon_y[can].append(
                find_frac(T_mat_new, canon_ind_dict[can], orig_len))

        print("Raised Matrix {0}: {1}".format(i, datetime.datetime.now()))

    series = [[canon_x[can], canon_y[can], can] for can in canonical_list_seq]

    fig, ax = conv.create_ax(1, 1)

    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']

    scatterplot.plot_series(ax[0, 0],
                            series,
                            title="",
                            x_axis="Number of Steps",
                            colors=color,
                            y_axis="Fraction Cleaved Variants Reached",
                            alpha=0.85,
                            connect_dots=True,
                            size=15,
                            edgecolors='k',
                            linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(range(1, 23, 3))
    lgd = conv.add_legend(ax[0, 0],
                          location='upper center',
                          bbox_to_anchor=(0.5, 1.05),
                          ncol=2,
                          size=8)
    conv.save_fig(fig,
                  output_prefix,
                  "fraction_func",
                  2.5,
                  3,
                  size=9.5,
                  extra_artists=lgd)

    print("Outputted Figure: {0}".format(datetime.datetime.now()))
def main(rec_corr_path, ddg_path, amber_pdb_path, rosetta_pdb_path, out_csv_path):
    """Plot Rosetta, Amber and "Pred" ddG values against known ddG values.

    Every record file matched by ``rec_corr_path + "*.rc"`` whose rows carry
    a known ddG is processed: per-structure energies are read, turned into
    ddG values relative to that file's wild-type row, and plotted against the
    known ddG (one figure per file).  Two summary figures follow: one built
    from ``corr_values_dict`` and one that pools the ddG values of all files.

    Args:
        rec_corr_path: Prefix globbed as ``rec_corr_path + "*.rc"``.
        ddg_path: Directory containing the ``amber``, ``rosetta``,
            ``amber_inter_mean``, ``rosetta_inter_mean`` and
            ``rosetta_inter`` sub-directories.
        amber_pdb_path: Unused.
        rosetta_pdb_path: Unused.
        out_csv_path: Directory prefix under which the figures are saved.
    """
    amber_csv_path = os.path.join(ddg_path, "amber")
    rosetta_csv_path = os.path.join(ddg_path, "rosetta")
    amber_inter_mean_path = os.path.join(ddg_path, "amber_inter_mean")
    rosetta_inter_mean_path = os.path.join(ddg_path, "rosetta_inter_mean")
    rosetta_inter_path = os.path.join(ddg_path, "rosetta_inter")

    def _n_protocols(*per_file_dicts):
        # Largest number of protocol entries found for any single structure;
        # the figures get one column per protocol.
        return max(len(protocols)
                   for per_file in per_file_dicts
                   for protocols in per_file.values())

    # Plots to generate
    # Per pdb - known ddg vs. many diff protocols, each with their own row.
    #   A protocol may have more than one plot depending on the filtering
    #   method (i.e. mean, bottom 3, pareto).
    # For all pdbs - pred rosetta corr values vs. many diff protocols, each
    #   with their series color; 3 rows, one for each corr value.
    # For all pdbs - known ddg vs. many diff protocols, each with their own
    #   row.

    # corr_values_dict has the following shape:
    #   "Pred"    : { "Pred"   : [ddg_vals] },
    #   "Amber"   : { "Mean.." : [ddg_vals], "Bott.." : [ddg_vals] },
    #   "Rosetta" : { "Mean.." : [ddg_vals], "Bott.." : [ddg_vals] }
    # It is only read in this function; presumably plot_ddg_dict fills it.
    corr_values_dict = {}

    all_amber_ddg_dict = {}
    all_rosetta_ddg_dict = {}
    all_known_ddg_dict = {}
    all_pred_ddg_dict = {}

    for rec_corr in glob.glob(rec_corr_path + "*.rc"):
        print(rec_corr)
        rec_corr_list = read_csv_list(rec_corr)
        # Three-column rows mean the file has no known ddg: nothing to plot.
        if len(rec_corr_list[0]) == 3:
            continue

        # Read every amber/rosetta energy file named in column 3.
        amber_dg_dict = {}
        rosetta_dg_dict = {}
        for _record_id, _prefix, filename, _known, _pred in rec_corr_list:
            amber_csv = os.path.join(amber_csv_path, filename + ".csv")
            rosetta_csv = os.path.join(rosetta_csv_path, filename + ".csv")
            amber_dg_dict[filename] = {
                "Mean Binding Energy":
                    get_mean_csv(amber_csv, protocol="amber"),
                "Bottom 3 Binding Energy":
                    get_bottom3_csv(amber_csv, protocol="amber"),
                "Mean Interaction Energy":
                    get_mean_txt(os.path.join(amber_inter_mean_path,
                                              filename + ".txt")),
            }
            rosetta_dg_dict[filename] = {
                "Mean Binding Energy":
                    get_mean_csv(rosetta_csv, protocol="rosetta"),
                "Bottom 3 Binding Energy":
                    get_bottom3_csv(rosetta_csv, protocol="rosetta"),
                "Mean Interaction Energy":
                    get_mean_txt(os.path.join(rosetta_inter_mean_path,
                                              filename + ".txt")),
                "Bottom 3 Interaction Energy":
                    get_bottom3_csv(os.path.join(rosetta_inter_path,
                                                 filename + ".csv")),
            }

        # The wild-type reference is the first row whose prefix (column 2)
        # contains "wt"; an IndexError here means the file has no such row.
        wt_csv_name = [rec[2] for rec in rec_corr_list if "wt" in rec[1]][0]
        wt_amber = amber_dg_dict[wt_csv_name]
        wt_rosetta = rosetta_dg_dict[wt_csv_name]

        amber_ddg_dict = {}
        rosetta_ddg_dict = {}
        known_ddg_dict = {}
        pred_ddg_dict = {}

        # ddG of every non-wild-type record = its dG minus the wild-type dG.
        for _record_id, prefix, filename, known, pred in rec_corr_list:
            if "wt" in prefix:
                continue
            amber_ddg = amber_ddg_dict.setdefault(filename, {})
            rosetta_ddg = rosetta_ddg_dict.setdefault(filename, {})
            for key, wt_dg in wt_amber.items():
                amber_ddg[key] = amber_dg_dict[filename][key] - wt_dg
            for key, wt_dg in wt_rosetta.items():
                rosetta_ddg[key] = rosetta_dg_dict[filename][key] - wt_dg
            known_ddg_dict[filename] = {"Known": float(known)}
            pred_ddg_dict[filename] = {"Pred": float(pred)}

        all_amber_ddg_dict.update(amber_ddg_dict)
        all_rosetta_ddg_dict.update(rosetta_ddg_dict)
        all_known_ddg_dict.update(known_ddg_dict)
        all_pred_ddg_dict.update(pred_ddg_dict)

        n_cols = _n_protocols(amber_dg_dict, rosetta_dg_dict)
        fig, axarr = conv.create_ax(n_cols, 3, shx=True, shy=True)
        plot_ddg_dict(rosetta_ddg_dict, known_ddg_dict, axarr, 0, "Rosetta", corr_values_dict)
        plot_ddg_dict(amber_ddg_dict, known_ddg_dict, axarr, 1, "Amber", corr_values_dict)
        plot_ddg_dict(pred_ddg_dict, known_ddg_dict, axarr, 2, "Pred", corr_values_dict)

        rec_name = os.path.splitext(os.path.basename(rec_corr))[0]
        conv.save_fig(fig, out_csv_path + "/" + rec_name + ".txt", "ddg", n_cols * 4, 12)

    # Plot all correlation values: one column per Rosetta protocol, one row
    # per correlation measure.
    fig_all, axarr_all = conv.create_ax(len(corr_values_dict["Rosetta"]), 3)

    labels = ["-PCC", "-Rho", "-Mae"]
    pred_vals = corr_values_dict["Pred"]["Pred"]

    # Assumes that Rosetta has more protocols than Amber: Amber series are
    # only drawn for protocols that Rosetta has as well.
    for x_ind, (protocol, vals) in enumerate(corr_values_dict["Rosetta"].items()):
        amber_vals = corr_values_dict["Amber"].get(protocol)
        for ind, (val_list, label) in enumerate(zip(vals, labels)):
            series = [[val_list, pred_vals[ind], "Rosetta " + protocol]]
            if amber_vals is not None:
                series.append([amber_vals[ind], pred_vals[ind], "Amber " + protocol])
            scatterplot.plot_series(axarr_all[ind, x_ind], series, protocol, "Pred", label,
                                    colors=['coral', 'cyan'], size=40)
            scatterplot.add_x_y_line(axarr_all[ind, x_ind])

    conv.save_fig(fig_all, out_csv_path + "/all.txt", "ddg", 16, 12)

    # Pooled figure over all files.  The width is derived from the same
    # column count that sized the axes, not from whichever per-file dicts the
    # loop above happened to process last.
    n_cols_all = _n_protocols(all_amber_ddg_dict, all_rosetta_ddg_dict)
    fig_all_corr, axarr_all_corr = conv.create_ax(n_cols_all, 3, shx=True, shy=True)
    plot_ddg_dict(all_rosetta_ddg_dict, all_known_ddg_dict, axarr_all_corr, 0, "Rosetta", corr_values_dict)
    plot_ddg_dict(all_amber_ddg_dict, all_known_ddg_dict, axarr_all_corr, 1, "Amber", corr_values_dict)
    plot_ddg_dict(all_pred_ddg_dict, all_known_ddg_dict, axarr_all_corr, 2, "Pred", corr_values_dict)

    conv.save_fig(fig_all_corr, out_csv_path + "/all_corr.txt", "ddg", n_cols_all * 4, 12)
def main(input_dir_1, scoretype1, input_dir_2, scoretype2, rmsd_cutoff, output_pre ):
    """Compare two score sets per PDB and plot RMSD summaries.

    Reads the decoy/native data of both input directories through
    ``scorefileparse``, filters them by ``rmsd_cutoff``, normalises them and
    restricts them to their common entries.  For every PDB the rank-vs-rank
    and Pareto panels are drawn and the RMSD reported per label by
    ``plot_pareto`` is collected.  Three figures are then saved: line plots
    (``_line_R`` / ``_line_A``), distribution plots (``_distdeltas``) and
    scatter plots (``_scattdeltas``).

    Args:
        input_dir_1: First input directory; its basename is used as title.
        scoretype1: Score type passed to ``read_dec_nat`` for the first set.
        input_dir_2: Second input directory; its basename is used as title.
        scoretype2: Score type passed to ``read_dec_nat`` for the second set.
        rmsd_cutoff: Cutoff handed to ``filter_pdbs_by_rmsd``.
        output_pre: Directory prefix of the saved figures.
    """
    title1 = os.path.basename(input_dir_1)
    title2 = os.path.basename(input_dir_2)

    d1, n1 = scorefileparse.read_dec_nat(input_dir_1, scoretype1, repl_orig=False)
    d2, n2 = scorefileparse.read_dec_nat(input_dir_2, scoretype2, repl_orig=False)

    dec1 = scorefileparse.filter_pdbs_by_rmsd(d1, rmsd_cutoff)
    nat1 = scorefileparse.filter_pdbs_by_rmsd(n1, rmsd_cutoff)
    dec2 = scorefileparse.filter_pdbs_by_rmsd(d2, rmsd_cutoff)
    nat2 = scorefileparse.filter_pdbs_by_rmsd(n2, rmsd_cutoff)

    # Natives are normalised against the decoys of the same set.
    dec_norm1 = scorefileparse.norm_pdbs(dec1)
    nat_norm1 = scorefileparse.norm_pdbs(nat1, dec1)
    dec_norm2 = scorefileparse.norm_pdbs(dec2)
    nat_norm2 = scorefileparse.norm_pdbs(nat2, dec2)

    [dec_inter1, nat_inter1, dec_inter2, nat_inter2] = scorefileparse.pdbs_intersect(
        [dec_norm1, nat_norm1, dec_norm2, nat_norm2])
    [dec_inter1, dec_inter2] = scorefileparse.pdbs_scores_intersect([dec_inter1, dec_inter2])
    [nat_inter1, nat_inter2] = scorefileparse.pdbs_scores_intersect([nat_inter1, nat_inter2])

    # NOTE(review): the filtered variants are never read again below.  The
    # calls are kept in case the helpers modify their arguments in place;
    # confirm against scorefileparse and drop them if they are pure.
    dec_filt1 = scorefileparse.filter_norm_pdbs(dec_norm1)
    nat_filt1 = scorefileparse.filter_norm_pdbs(nat_norm1)
    dec_filt2 = scorefileparse.filter_norm_pdbs(dec_norm2)
    nat_filt2 = scorefileparse.filter_norm_pdbs(nat_norm2)
    scorefileparse.pdbs_scores_intersect([dec_filt1, dec_filt2])
    scorefileparse.pdbs_scores_intersect([nat_filt1, nat_filt2])

    # NOTE: this figure is drawn into but never saved (its save_fig call,
    # with suffix "rmsd_v_rmsd_<cutoff>", is disabled).
    fig, axarr = conv.create_ax(2, len(dec_inter1))

    line_plot_data = {}     # label -> ([pdbs], [rmsds])
    min_naive_by_pdb = {}   # pdb -> {label: rmsd}

    for x_ind, pdb in enumerate(sorted(dec_inter1.keys())):
        plot_r_v_r(dec_inter1, dec_inter2, nat_inter1, nat_inter2,
                   axarr[x_ind, 0], pdb, title1, title2)
        min_naive = plot_pareto(dec_inter1, dec_inter2, nat_inter1, nat_inter2,
                                axarr[x_ind, 1], pdb, title1, title2)

        for key, (_rank1, _rank2, rmsd) in min_naive.items():
            pdbs, rmsds = line_plot_data.setdefault(key, ([], []))
            pdbs.append(pdb)
            rmsds.append(rmsd)
            min_naive_by_pdb.setdefault(pdb, {})[key] = rmsd

    # Rank the PDBs by their "All" RMSD and order every series by that rank.
    all_pdbs, all_rmsds = line_plot_data["All"]
    order = sorted(range(len(all_rmsds)), key=lambda idx: all_rmsds[idx])
    ranked_pdbs_by_rmsd_all = dict(
        (all_pdbs[idx], rank) for rank, idx in enumerate(order))

    for label, (pdbs, rmsds) in list(line_plot_data.items()):
        by_rank = sorted(zip(pdbs, rmsds),
                         key=lambda pair: ranked_pdbs_by_rmsd_all[pair[0]])
        line_plot_data[label] = tuple(zip(*by_rank))

    filename = output_pre + "/" + title1 + "_" + title2 + ".txt"

    # Line plots: one figure per Pareto family ("R" and "A"); panel i shows
    # the first i + 1 series cumulatively.
    for initial in ["R", "A"]:
        ordered_labels = ["All", "Amber", "Rosetta"]
        ordered_labels.extend(
            "Pareto{0}{1}".format(initial, i) for i in range(1, 11))

        lines = [(line_plot_data[label][0], line_plot_data[label][1], label)
                 for label in ordered_labels]

        fig2, axarr2 = conv.create_ax(1, len(ordered_labels), shx=True, shy=True)

        for i in range(len(ordered_labels)):
            line.plot_series(axarr2[i, 0], lines[0:i + 1], "RMSD vs. pdb", "PDB", "RMSD", linestyle='')
            conv.add_legend(axarr2[i, 0])

        conv.save_fig(fig2, filename, "_line_{0}".format(initial), 10, len(ordered_labels) * 5)

    # Distribution plots: (top, bottom) label pairs, one row per pair.
    hist_comp = [("Amber", "All"), ("Rosetta", "All"), ("ParetoR10", "All"), ("ParetoA10", "All")]
    for initial in ("R", "A"):
        for reference in ("Rosetta", "Amber"):
            hist_comp.extend(
                ("Pareto{0}{1}".format(initial, ind), reference)
                for ind in range(1, 11))

    fig3, axarr3 = conv.create_ax(2, len(hist_comp), shx=False, shy=False)

    for ind, (top, bottom) in enumerate(hist_comp):
        gen_dist_plot(axarr3[ind, 0], axarr3[ind, 1], top, bottom, min_naive_by_pdb)

    conv.save_fig(fig3, filename, "_distdeltas", 7, len(hist_comp) * 5, tight=False)

    # Scatter plots: row 0 holds ParetoR1..10, row 1 holds ParetoA1..10.
    fig4, axarr4 = conv.create_ax(10, 2)
    for i in range(1, 11):
        gen_scatterplot(axarr4[0, i - 1], "ParetoR{0}".format(i), "Rosetta", "Amber", min_naive_by_pdb)
        gen_scatterplot(axarr4[1, i - 1], "ParetoA{0}".format(i), "Rosetta", "Amber", min_naive_by_pdb)

    conv.save_fig(fig4, filename, "_scattdeltas", 30, 6)
Example #38
0
def main(list_nodes, output_prefix, metric, create_keys=False):
    """Draw per-metric histograms comparing cleaved, middle and uncleaved sequences.

    The sequences read from ``list_nodes`` are split by their ``"type"``
    field ("CLEAVED", "MIDDLE", "UNCLEAVED") and, for every metric to plot,
    the three groups are drawn as one histogram panel.  The figure is saved
    through ``pconv.save_fig`` with ``metric`` as suffix.

    Args:
        list_nodes: Input handed to ``seq_IO.read_sequences``.
        output_prefix: Prefix handed to ``pconv.save_fig``.
        metric: Name of the single field to plot, or ``"metrics"`` to plot
            every field of the reference record except the bookkeeping ones.
        create_keys: When true, ``create_keys=True`` is forwarded to
            ``seq_IO.read_sequences``.

    Raises:
        NotImplementedError: If "Fraction_Cleaved" is among the metrics to
            plot; see the comment at the check below.
    """
    read_kwargs = {"additional_params": True, "header": True}
    if create_keys:
        read_kwargs["create_keys"] = True
    sequences = seq_IO.read_sequences(list_nodes, **read_kwargs)

    def _of_type(type_name):
        # Sub-dictionary of the sequences whose "type" equals type_name.
        return dict((key, val) for key, val in sequences.items()
                    if val["type"] == type_name)

    cleaved_seq = _of_type("CLEAVED")
    middle_seq = _of_type("MIDDLE")
    uncleaved_seq = _of_type("UNCLEAVED")

    print(len(cleaved_seq))

    if metric == "metrics":
        labels_non_plot = ["label", "fitness", "type", "canonical", "timeset"]
        # NOTE(review): the plottable fields are taken from the record of the
        # hard-coded sequence "YNYIN"; this raises KeyError when the input
        # does not contain that sequence.
        labels_to_plot = sorted(key for key in sequences["YNYIN"].keys()
                                if key not in labels_non_plot)
    else:
        labels_to_plot = [metric]

    # "Fraction_Cleaved" is not read from the records: its data used to come
    # from conv.fraction_neighbors_cleaved(cleaved, uncleaved, middle, group)
    # evaluated once per group, and that computation is disabled.  Without it
    # the panel had no data of its own (NameError on the first panel, or a
    # silent re-plot of the previous metric), so fail loudly instead.
    if "Fraction_Cleaved" in labels_to_plot:
        raise NotImplementedError(
            "The 'Fraction_Cleaved' histogram is disabled: its data "
            "computation is not available.")

    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)

    nbins = 10

    for ind, key in enumerate(labels_to_plot):
        data = [
            get_data_from_dict(cleaved_seq, key),
            get_data_from_dict(middle_seq, key),
            get_data_from_dict(uncleaved_seq, key),
        ]
        print(key)
        hist.draw_actual_plot(axarr[0, ind],
                              data,
                              "",
                              key.capitalize(),
                              # only the pageranks panel is drawn with log=True
                              log=(key == "pageranks"),
                              normed=True,
                              label=["Cleaved", "Middle", "Uncleaved"],
                              nbins=nbins)
        axarr[0, ind].ticklabel_format(axis='x',
                                       style='sci',
                                       scilimits=(-2, 2))

    pconv.save_fig(fig,
                   output_prefix,
                   metric,
                   n_to_plot * 2.5,
                   2.5,
                   tight=True,
                   size=9)
def main(list_input_dirs, energies_names, output_pre):
    """Scan weight combinations of two score sets and plot the resulting values.

    For every PDB shared by both inputs, each pair of weights
    ``(2**w1, 2**w2)`` with ``w1, w2`` in ``range(-10, 10, 2)`` is applied to
    the two decoy score sets, the weighted sets are merged and
    ``disc.given_data_run_disc`` is evaluated on the result.  One scatter
    panel per PDB shows that value over the weight grid; a final "All" panel
    shows the per-weight mean over all PDBs.

    Args:
        list_input_dirs: Sequence whose first two items are
            ``(input_dir, scoretype)`` pairs.
        energies_names: Per-score-type column names.  Either indexable by the
            score type directly, or a sequence of ``[scoretype, col, ...]``
            lists.
        output_pre: Prefix handed to ``conv.save_fig``.
    """
    inp_dir1, scoretype1 = list_input_dirs[0][0], list_input_dirs[0][1]
    inp_dir2, scoretype2 = list_input_dirs[1][0], list_input_dirs[1][1]

    column_dict = {}
    for c in energies_names:
        column_dict[c[0]] = c[1:]

    def _columns_for(scoretype):
        # Direct indexing is tried first so existing callers behave as
        # before.  NOTE(review): when energies_names is a list of
        # [scoretype, col, ...] rows, indexing it with the score-type label
        # raises TypeError; presumably column_dict was meant for that case,
        # so it is used as the fallback - confirm.
        try:
            return energies_names[scoretype]
        except TypeError:
            return column_dict[scoretype]

    dec1, nat1 = scorefileparse.read_dec_nat(inp_dir1, _columns_for(scoretype1), scoretype1)
    dec2, nat2 = scorefileparse.read_dec_nat(inp_dir2, _columns_for(scoretype2), scoretype2)

    [dec_inter1, nat_inter1, dec_inter2, nat_inter2] = scorefileparse.pdbs_intersect(
        [dec1, nat1, dec2, nat2])

    n_pdbs = len(dec_inter1)
    # One row per PDB plus one for the "All" summary panel.
    n_rows = n_pdbs + 1

    sum_discs = Counter()

    fig, axarr = conv.create_ax(1, n_rows, True, True)

    for x_ind, pdb in enumerate(sorted(dec_inter1.keys())):

        discs_per_pdb = {}   # (weight_1, weight_2) -> disc value

        for w_1 in range(-10, 10, 2):
            for w_2 in range(-10, 10, 2):
                weight_1 = 2 ** w_1
                weight_2 = 2 ** w_2
                weighted_1 = scorefileparse.weight_dict(dec_inter1[pdb], weight_1)
                weighted_2 = scorefileparse.weight_dict(dec_inter2[pdb], weight_2)
                merged = scorefileparse.merge_dicts([weighted_1, weighted_2])
                ddata1 = scorefileparse.convert_disc(merged)

                disc_divs = [1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 6.0]

                disc1, _d, _counts = disc.given_data_run_disc(ddata1, True, disc_divs)
                discs_per_pdb[(weight_1, weight_2)] = disc1

        ax = axarr[x_ind, 0]
        ax.set_xscale('log', basex=2)
        ax.set_yscale('log', basey=2)

        # Weight pairs are unique, so sorting the items orders by weights.
        ordered = sorted(discs_per_pdb.items())
        x = [w1 for (w1, w2), v in ordered]
        y = [w2 for (w1, w2), v in ordered]
        d = [v for (w1, w2), v in ordered]

        s = scatterplot.draw_actual_plot(ax, x, y, d, pdb, scoretype1, scoretype2, 'bwr')
        fig.colorbar(s, ax=ax)
        scatterplot.add_x_y_line(ax, 0, 600)

        # Counter.update adds the values, accumulating a per-weight sum.
        sum_discs.update(discs_per_pdb)

    # Summary panel: per-weight mean over all PDBs.
    ax = axarr[n_pdbs, 0]

    ordered = sorted(sum_discs.items())
    x = [w1 for (w1, w2), v in ordered]
    y = [w2 for (w1, w2), v in ordered]
    # float() keeps the mean a true division even for integer sums.
    d = [v / float(n_pdbs) for (w1, w2), v in ordered]

    ax.set_xscale('log', basex=2)
    ax.set_yscale('log', basey=2)

    s = scatterplot.draw_actual_plot(ax, x, y, d, "All", scoretype1, scoretype2, cm='bwr')
    fig.colorbar(s, ax=ax)
    scatterplot.add_x_y_line(ax, 0, 600)

    # The height covers every row, including the "All" panel.
    conv.save_fig(fig, output_pre, "_weights_v_disc", 3, n_rows * 3)