def main(list_sequence_names, output_prefix):
    
    list_sequences = [] #list of list of sequences, where each item represents a label 
    extended_list_sequences = [] #flat list of sequences
    labels = [] #labels for list_sequences

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename) 
        list_sequences.append(sequences)
        extended_list_sequences.extend(sequences[:])
        labels.append(label)

    cleaved_ind = labels.index("CLEAVED")
    middle_ind = labels.index("MIDDLE")
    uncleaved_ind = labels.index("UNCLEAVED")
    frac_uncleaved = {}
    frac_cleaved = {}
    frac_middle = {}
    for seq in list_sequences[cleaved_ind]:
        cleaved_seqs = sum([1 for s in list_sequences[cleaved_ind] if conv.hamdist(seq,s) == 1])
        uncleaved_seqs = sum([1 for s in list_sequences[uncleaved_ind] if conv.hamdist(seq,s) == 1])
        middle_seqs = sum([1 for s in list_sequences[middle_ind] if conv.hamdist(seq,s) == 1])
        if cleaved_seqs > 0 or uncleaved_seqs > 0:
            total = uncleaved_seqs + middle_seqs + cleaved_seqs
            frac_uncleaved[seq] = float(uncleaved_seqs)/total
            frac_cleaved[seq] = float(cleaved_seqs)/total
            frac_middle[seq] = float(middle_seqs)/total
    fig, ax = pconv.create_ax(3, 1)

    hist.draw_actual_plot(ax[0,0], frac_cleaved.values(), "Landscape Near Cleaved Sequences", "Fraction of Neighbors Cleaved", log=False, normed=False, nbins=20)
    hist.draw_actual_plot(ax[0,1], frac_middle.values(), "Landscape Near Cleaved Sequences", "Fraction of Neighbors Middle", log=False, normed=False, nbins=20)
    hist.draw_actual_plot(ax[0,2], frac_uncleaved.values(), "Landscape Near Cleaved Sequences", "Fraction of Neighbors Uncleaved", log=False, normed=False, nbins=20)

    pconv.save_fig(fig, output_prefix, "fraction_neighbors", 15, 5, size=10)
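# Note: `conv.hamdist` is not defined in this dump, but every call site above compares
# two equal-length peptide sequences and treats the result as a count of differing
# positions, i.e. a plain Hamming distance. A minimal, purely illustrative stand-in
# (an assumption, not necessarily the repo's actual implementation):
def hamdist(seq_a, seq_b):
    """Count the positions at which two equal-length sequences differ."""
    assert len(seq_a) == len(seq_b), "Hamming distance needs equal-length sequences"
    return sum(1 for a, b in zip(seq_a, seq_b) if a != b)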
def main(sequence_list, canonical_seq_list, known_cleaved):    

    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)

    canonical_seqs = seq_IO.read_sequences(canonical_seq_list)

    known_cleaved_list = seq_IO.read_sequences(known_cleaved)

    base = os.path.splitext(sequence_list)[0]

    cleaved_seqs = [ (s[0],s[1],s[2],min([conv.hamdist(s[0],c) for c in canonical_seqs])) for s in sequences if s[1] == 'CLEAVED' and s[2] > 2.0 and s[0] not in known_cleaved_list]
    uncleaved_seqs = [ (s[0],s[1],s[2],min([conv.hamdist(s[0],c) for c in canonical_seqs])) for s in sequences if s[1] == 'UNCLEAVED' and s[2] < -2.0 and s[0] not in known_cleaved_list]

    cl_s_dist = [ s[2] for s in cleaved_seqs]
    uncl_s_dist = [s[2] for s in uncleaved_seqs]

    print max(cl_s_dist)
    print min(uncl_s_dist)

    cleaved_seqs_low_ham = sorted(cleaved_seqs, key=lambda x: (x[3], -x[2]))[0:4]
    cleaved_seqs_hi_ham = sorted(cleaved_seqs, key=lambda x: (-x[3], -x[2]))[0:4]
    uncleaved_seqs_low_ham = sorted(uncleaved_seqs, key=lambda x: (x[3], x[2]))[0:4]
    uncleaved_seqs_hi_ham = sorted(uncleaved_seqs, key=lambda x: (-x[3], x[2]))[0:4]

    outfile = '%s_selected.csv' % (base)

    out = open(outfile,"w")
    out.write("Cleaved_seqs_low_hamming_distance\n")
    out.write("\n".join( [ ",".join(map(str,s)) for s in cleaved_seqs_low_ham ] )) 
    out.write("\nCleaved_seqs_high_hamming_distance\n")
    out.write("\n".join( [ ",".join(map(str,s)) for s in cleaved_seqs_hi_ham ] ))
    out.write("\nUncleaved_seqs_low_hamming_distance\n")
    out.write("\n".join( [ ",".join(map(str,s)) for s in uncleaved_seqs_low_ham ] ))
    out.write("\nUncleaved_seqs_high_hamming_distance\n")
    out.write("\n".join( [ ",".join(map(str,s)) for s in uncleaved_seqs_hi_ham ] ))
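# Note: `seq_IO.read_sequences` is the repo's own reader and its exact file format is
# not shown here. From the call sites, it seems to return bare sequence strings by
# default and, with `additional_params=True`, tuples of (sequence, extra columns...),
# with `ind_type` optionally converting a column by index (e.g. {1: float}). The
# stand-in below is a rough, hypothetical sketch that assumes one comma- or
# whitespace-delimited record per line; the real reader may differ.
def read_sequences_sketch(filename, additional_params=False, ind_type=None):
    records = []
    with open(filename) as handle:
        for line in handle:
            fields = line.replace(",", " ").split()
            if not fields:
                continue
            if not additional_params:
                records.append(fields[0])
            else:
                if ind_type:
                    fields = [ind_type.get(i, str)(f) for i, f in enumerate(fields)]
                records.append(tuple(fields))
    return records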
def main(sequence_list, trained_cleaved, trained_uncleaved):

    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)

    trained_cleaved_list = seq_IO.read_sequences(trained_cleaved)

    trained_uncleaved_list = seq_IO.read_sequences(trained_uncleaved)

    base = os.path.splitext(sequence_list)[0]

    cleaved_seqs = [(s[0], s[1],
                     min([conv.hamdist(s[0], c)
                          for c in trained_cleaved_list])) for s in sequences
                    if s[1] == 'CLEAVED']
    uncleaved_seqs = [
        (s[0], s[1],
         min([conv.hamdist(s[0], c) for c in trained_uncleaved_list]))
        for s in sequences if s[1] == 'UNCLEAVED'
    ]

    outfile = '%s_selected_hamm.csv' % (base)

    out = open(outfile, "w")
    out.write("Cleaved_seqs\n")
    out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs]))
    out.write("\nUncleaved_seqs\n")
    out.write("\n".join([",".join(map(str, s)) for s in uncleaved_seqs]))
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    
    list_sequences = [] #list of list of sequences, where each item represents a label 
    extended_list_sequences = [] #flat list of sequences
    labels = [] #labels for list_sequences

    #canonical_seqs = seq_IO.read_sequences(canonical_file)
    canonical_seqs = ["DEMEE"] #left other code here in case want to try it from all cleaved sequences

    dict_sequences = {}

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float})
        new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ] 
        list_sequences.append(new_seqs)

        extended_list_sequences.extend(new_seqs[:])
        dict_sequences.update({ n[0] : n for n in new_seqs })
        labels.append(label)

    edges = []
    edges_set = set()
    print "Read in Data: {0}".format(datetime.datetime.now()) 

    for seq, fitness, canonical_like in extended_list_sequences:
        neighbors = conv.gen_hamdist_one(seq)
        edges_set.update([ (seq, n) for n in neighbors if n in dict_sequences ])
        edges += [((seq, fitness, canonical_like), dict_sequences[n] ) for n in neighbors if n in dict_sequences and (n,seq) not in edges_set ]
    
    print "Generated Edges: {0}".format(datetime.datetime.now())
    print edges[0:10]
    seq_id = { seq[0] : ind for ind, seq in enumerate(extended_list_sequences) } 
  
    nodes = []
    for seqs, label in zip(list_sequences, labels):
        nodes.extend([ { "id" : seq_id[seq[0]], "sequence" : seq[0], "status" : label, "fitness" : seq[1], "canonical_like" : seq[2] } for seq in seqs ])  
   
    print "Generated List of Nodes: {0}".format(datetime.datetime.now()) 
    links = []

    for canonical_seq in canonical_seqs: 
        print canonical_seq
        for ((seq1,fit1,can1),(seq2,fit2,can2)) in edges:
            dist_seq1 = conv.hamdist(canonical_seq, seq1)
            dist_seq2 = conv.hamdist(canonical_seq, seq2)
            fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2
            fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1
            fit_upper = fit_upper if fit_upper > 0 else 0.001
            seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2
            seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1
            links.append({ "source" : seq_id[seq_lower], "target" : seq_id[seq_upper], "weight" : fit_lower/float(fit_upper) } )

    print "Generated List of Edges: {0}".format(datetime.datetime.now())

    output = { "nodes" : nodes, "links" : links }

    with open('{0}nodes_edges.json'.format(output_prefix), 'w') as fp:
        json.dump(output, fp)

    print "Dumped Nodes and Edges Lists: {0}".format(datetime.datetime.now())    
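# Note: `conv.gen_hamdist_one(seq)` is used above to enumerate every neighbor of `seq`
# at Hamming distance one, and the result is iterated more than once, so it is
# presumably a list. A minimal sketch under the assumption of the standard 20-letter
# amino-acid alphabet (the repo's actual alphabet and implementation may differ):
def gen_hamdist_one(seq, alphabet="ACDEFGHIKLMNPQRSTVWY"):
    """Return all sequences differing from `seq` at exactly one position."""
    return [seq[:i] + letter + seq[i + 1:]
            for i in range(len(seq))
            for letter in alphabet
            if letter != seq[i]]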
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    
    list_sequences = [] #list of list of sequences, where each item represents a label 
    extended_list_sequences = [] #flat list of sequences
    labels = [] #labels for list_sequences

    #canonical_seqs = seq_IO.read_sequences(canonical_file)
    canonical_seqs = ["DEMEE"] #left other code here in case want to try it from all cleaved sequences

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float})
        new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ] 
        list_sequences.append(new_seqs)

        extended_list_sequences.extend(new_seqs[:])
        dict_sequences = { n[0] : n for n in new_seqs }
        labels.append(label)

    edges = []
    edges_set = set()
    print "Read in Data: {0}".format(datetime.datetime.now()) 

    for seq, fitness, canonical_like in extended_list_sequences:
        neighbors = conv.gen_hamdist_one(seq)
        edges_set.update([ (seq, n) for n in neighbors if n in dict_sequences ])
        edges += [((seq, fitness, canonical_like), dict_sequences[n] ) for n in neighbors if n in dict_sequences and (n,seq) not in edges_set ]
    
    print "Generated Edges: {0}".format(datetime.datetime.now())
    print edges[0:10]
    seq_id = { seq[0] : ind for ind, seq in enumerate(extended_list_sequences) } 
  
    nodes = []
    for seqs, label in zip(list_sequences, labels):
        nodes.extend([ { "id" : seq_id[seq[0]], "sequence" : seq[0], "status" : label, "fitness" : seq[1], "canonical_like" : seq[2] } for seq in seqs ])  
   
    print "Generated List of Nodes: {0}".format(datetime.datetime.now()) 
    links = []

    for canonical_seq in canonical_seqs: 
        print canonical_seq
        for ((seq1,fit1,can1),(seq2,fit2,can2)) in edges:
            dist_seq1 = conv.hamdist(canonical_seq, seq1)
            dist_seq2 = conv.hamdist(canonical_seq, seq2)
            fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2
            fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1
            fit_upper = fit_upper if fit_upper > 0 else 0.001
            seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2
            seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1
            links.append({ "source" : seq_id[seq_lower], "target" : seq_id[seq_upper], "weight" : fit_lower/float(fit_upper) } )

    print "Generated List of Edges: {0}".format(datetime.datetime.now())

    output = { "nodes" : nodes, "links" : links }

    with open('{0}nodes_edges.json'.format(output_prefix), 'w') as fp:
        json.dump(output, fp)

    print "Dumped Nodes and Edges Lists: {0}".format(datetime.datetime.now())    
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):

    list_sequences = [
    ]  #list of list of sequences, where each item represents a label
    extended_list_sequences = []  #flat list of sequences
    labels = []  #labels for list_sequences

    canonical_seqs = seq_IO.read_sequences(canonical_file)

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename,
                                          additional_params=True,
                                          ind_type={1: float})
        new_seqs = [(seq, fitness,
                     min([conv.hamdist(seq, can)
                          for can in canonical_seqs]) <= 2)
                    for seq, fitness in sequences]
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs[:])
        labels.append(label)

    outfile_nodes = '%s_nodes.csv' % (output_prefix)

    edges = [
        (seq2, seq)
        for seq, seq2 in itertools.combinations(extended_list_sequences, 2)
        if conv.hamdist(seq2[0], seq[0]) == hamming_dist
    ]

    for canonical_seq in canonical_seqs:
        outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq)
        edges_out = open(outfile_edges, "w")
        edges_out.write("Source,Target,Weight\n")
        print canonical_seq
        for ([seq1, fit1, can1], [seq2, fit2, can2]) in edges:
            dist_seq1 = conv.hamdist(canonical_seq, seq1)
            dist_seq2 = conv.hamdist(canonical_seq, seq2)
            fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2
            fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1
            fit_upper = fit_upper if fit_upper > 0 else 0.001
            seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2
            seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1
            out_str = "{0},{1},{2}\n".format(seq_lower, seq_upper,
                                             fit_lower / float(fit_upper))
            edges_out.write(
                out_str)  #does this have the correct directionality?
        edges_out.close()

    already_written_nodes = []

    nodes_out = open(outfile_nodes, "w")
    nodes_out.write("Id,Label,Type,Fitness,Canonical\n")
    for seqs, label in zip(list_sequences, labels):
        nodes_out.write("\n".join(
            "{0},{0},{1},{2},{3}".format(x, label, fitness, can)
            for (x, fitness, can) in seqs if x not in already_written_nodes))
        already_written_nodes.extend([s[0] for s in seqs])
        nodes_out.write("\n")
def main(sequence_list, canonical_seq_list, known_cleaved):

    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)

    canonical_seqs = seq_IO.read_sequences(canonical_seq_list)

    known_cleaved_list = seq_IO.read_sequences(known_cleaved)

    base = os.path.splitext(sequence_list)[0]

    cleaved_seqs = [
        (s[0], s[1], s[2], min([conv.hamdist(s[0], c)
                                for c in canonical_seqs])) for s in sequences
        if s[1] == 'CLEAVED' and s[2] > 2.0 and s[0] not in known_cleaved_list
    ]
    uncleaved_seqs = [(s[0], s[1], s[2],
                       min([conv.hamdist(s[0], c) for c in canonical_seqs]))
                      for s in sequences if s[1] == 'UNCLEAVED' and s[2] < -2.0
                      and s[0] not in known_cleaved_list]

    cl_s_dist = [s[2] for s in cleaved_seqs]
    uncl_s_dist = [s[2] for s in uncleaved_seqs]

    print max(cl_s_dist)
    print min(uncl_s_dist)

    cleaved_seqs_low_ham = sorted(cleaved_seqs, key=lambda x:
                                  (x[3], -x[2]))[0:4]
    cleaved_seqs_hi_ham = sorted(cleaved_seqs, key=lambda x:
                                 (-x[3], -x[2]))[0:4]
    uncleaved_seqs_low_ham = sorted(uncleaved_seqs, key=lambda x:
                                    (x[3], x[2]))[0:4]
    uncleaved_seqs_hi_ham = sorted(uncleaved_seqs, key=lambda x:
                                   (-x[3], x[2]))[0:4]

    outfile = '%s_selected.csv' % (base)

    out = open(outfile, "w")
    out.write("Cleaved_seqs_low_hamming_distance\n")
    out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs_low_ham]))
    out.write("\nCleaved_seqs_high_hamming_distance\n")
    out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs_hi_ham]))
    out.write("\nUncleaved_seqs_low_hamming_distance\n")
    out.write("\n".join(
        [",".join(map(str, s)) for s in uncleaved_seqs_low_ham]))
    out.write("\nUncleaved_seqs_high_hamming_distance\n")
    out.write("\n".join([",".join(map(str, s))
                         for s in uncleaved_seqs_hi_ham]))
def find_seqs_more_than_first(can, sequences, set_sequences, hamm_dist):
    if hamm_dist == -1:
        set_sequences = set([seq for seq in sequences if chem_sim(seq, can)])
    else:
        set_sequences = set(
            [seq for seq in sequences if conv.hamdist(seq, can) > hamm_dist])
    return set_sequences
def main(input_dir, canonical_file, output_prefix, hamm_dist):

    list_seq_files = glob.glob(os.path.join(input_dir, "*_cleaved.txt"))

    dict_sequences = {}
    canonical_sequences = []
    canonical_sequences = seq_IO.read_sequences(canonical_file)

    for filename in list_seq_files:
        sequences = seq_IO.read_sequences(filename)
        for can in canonical_sequences:
            if hamm_dist == -1:
                seq_sim = [seq for seq in sequences if chem_sim(seq, can)]
            else:
                seq_sim = [
                    seq for seq in sequences
                    if conv.hamdist(seq, can) <= hamm_dist
                ]
            if seq_sim:
                dict_sequences[(filename, can)] = seq_sim

    outfile_canon = '%scanonical_sim_cleaved%d.csv' % (output_prefix,
                                                       hamm_dist)

    canon_out = open(outfile_canon, "w")

    for (filename, can), seqs in dict_sequences.items():
        canon_out.write(filename + "," + can + "," + ','.join(seqs) + "\n")
def find_seqs_less_than(can, sequences, set_sequences, hamm_dist):
    if hamm_dist == -1:
        set_sequences = set_sequences.union(
            [seq for seq in sequences if chem_sim(seq, can)])
    else:
        set_sequences = set_sequences.union(
            [seq for seq in sequences if conv.hamdist(seq, can) <= hamm_dist])
    return set_sequences
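# A hedged usage sketch for the find_seqs_* helpers in this dump: `find_seqs_less_than`
# unions new matches into the set it is given, so it can be folded over a list of
# canonical sequences to collect everything within `hamm_dist` of at least one of them.
# The wrapper below is hypothetical and only illustrates that calling pattern.
def seqs_near_any_canonical(canonical_seqs, sequences, hamm_dist):
    close_seqs = set()
    for can in canonical_seqs:
        close_seqs = find_seqs_less_than(can, sequences, close_seqs, hamm_dist)
    return close_seqs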
def main(list_sequence_names, output_prefix, canonical_file):
    
    list_sequences = [] #list of list of sequences, where each item represents a label 
    extended_list_sequences = [] #flat list of sequences
    labels = [] #labels for list_sequences


    canonical_seqs = seq_IO.read_sequences(canonical_file)

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float})
        list_sequences.append(sequences)
        extended_list_sequences.extend(sequences[:])
        labels.append(label)

    dict_sequences = { seq : fitness for (seq, fitness) in extended_list_sequences }

    epi = {}

    for canonical_seq in canonical_seqs: 
        mut_func = { "Both_Functional" : [], "Both_Nonfunctional" : [], "One_Functional" : [] }
        mut_nonfunc = { "Both_Functional" : [], "Both_Nonfunctional" : [], "One_Functional" : [] }

        outfile_epi = '%s_%s_epi.csv' % (output_prefix, canonical_seq)
        epi_out = open(outfile_epi,"w")
        print canonical_seq
        epi = {}
        double_mut = [ seq for seq in extended_list_sequences if conv.hamdist(canonical_seq, seq[0]) == 2 ]
        for seq_fit in extended_list_sequences:
            seq = seq_fit[0]
            fit = seq_fit[1] 
            mut_dict = mut_func if fit == 1000 else mut_nonfunc
            list_fit = get_inter_fitness(canonical_seq, seq, dict_sequences)
            if len(list_fit) <= 1:
                continue
            if all(list_fit):
                if seq_fit in double_mut:

                    sum_fit = sum(list_fit)
                    print sum_fit
                    if sum_fit == 2000:
                        mut_dict["Both_Functional"].append((canonical_seq, seq))
                    elif sum_fit == 0:
                        mut_dict["Both_Nonfunctional"].append((canonical_seq, seq))
                    elif sum_fit == 1000:
                        mut_dict["One_Functional"].append((canonical_seq, seq))
                epi[seq] = (calc_epi(list_fit, fit),list_fit+[fit])

        epi_out.write("Total Double Mutants,%s\n" % (len(double_mut)))

        for label, list_muts in mut_func.items():
            for (can, seq) in list_muts:
                epi_out.write("End Functional,%s,%s,%s\n" % (label,can,seq) )
        for label, list_muts in mut_nonfunc.items():
            for (can, seq) in list_muts:
                epi_out.write("End Nonfunctional,%s,%s,%s\n" % (label,can,seq) )
        epi_out.write("\n".join(["{0},{1},{2}".format(seq, epi_val, ",".join([str(f) for f in fits])) for seq, (epi_val, fits) in epi.items()]))
        epi_out.close()
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    
    list_sequences = [] #list of list of sequences, where each item represents a label 
    extended_list_sequences = [] #flat list of sequences
    labels = [] #labels for list_sequences

    canonical_seqs = seq_IO.read_sequences(canonical_file)

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float})
        new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ] 
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs[:])
        labels.append(label)

    outfile_nodes = '%s_nodes.csv' % (output_prefix)

    edges = [(seq2,seq) for seq,seq2 in itertools.combinations(extended_list_sequences,2) if conv.hamdist(seq2[0],seq[0]) == hamming_dist ]

    for canonical_seq in canonical_seqs: 
        outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq)
        edges_out = open(outfile_edges,"w")
        edges_out.write("Source,Target,Weight\n")
        print canonical_seq
        for ([seq1,fit1,can1],[seq2,fit2,can2]) in edges:
            dist_seq1 = conv.hamdist(canonical_seq, seq1)
            dist_seq2 = conv.hamdist(canonical_seq, seq2)
            fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2
            fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1
            fit_upper = fit_upper if fit_upper > 0 else 0.001
            seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2
            seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1        
            out_str = "{0},{1},{2}\n".format(seq_lower,seq_upper,fit_lower/float(fit_upper))
            edges_out.write(out_str) #does this have the correct directionality?
        edges_out.close()

    already_written_nodes = []
   
    nodes_out = open(outfile_nodes,"w")
    nodes_out.write("Id,Label,Type,Fitness,Canonical\n")
    for seqs,label in zip(list_sequences,labels):
        nodes_out.write("\n".join("{0},{0},{1},{2},{3}".format(x, label, fitness,can) for (x,fitness,can) in seqs if x not in already_written_nodes))
        already_written_nodes.extend([ s[0] for s in seqs])
        nodes_out.write("\n")
def main(list_sequence_names, canonical_list, output_prefix, func_labels,
         unfunc_labels):

    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    for canonical in canonical_list_seq:

        dict_sequences = {}

        for [filename, label] in list_sequence_names:
            sequences = seq_IO.read_sequences(filename)
            distances = [conv.hamdist(seq, canonical) for seq in sequences]

            dict_sequences[label] = {
                i: sum([d for d in distances if d == i])
                for i in xrange(1, 6)
            }

        x = []
        y = []
        for i in xrange(1, 6):
            func = 0.0
            unfunc = 0.0
            for label, dict_sums in dict_sequences.items():
                if label in func_labels:
                    func = func + dict_sums[i]
                elif label in unfunc_labels:
                    unfunc = unfunc + dict_sums[i]
            if unfunc != 0:
                x.append(i)
                y.append(func / (func + unfunc))
        print x
        print y
        series.append([x, y, canonical])
    fig, ax = pconv.create_ax(1, 1)

    scatterplot.plot_series(ax[0, 0],
                            series,
                            title="",
                            x_axis="# of Mutations",
                            y_axis="Fraction of Variants that are Functional",
                            alpha=1.0,
                            connect_dots=True,
                            size=30,
                            edgecolors='k')
    ax[0, 0].set_xlim(xmin=1, xmax=5)
    ax[0, 0].set_xticks(xrange(1, 6))
    pconv.save_fig(fig,
                   output_prefix,
                   canonical + "_fraction_func_mutant",
                   6,
                   6,
                   size=15)
def main(json_file, output_prefix, novel_seqs_file, canonical_file):

    print "Started Script: {0}".format(datetime.datetime.now())
    
    with open(json_file) as data_file:    
        data = json.load(data_file)

    G = json_graph.node_link_graph(data, directed=False)

    print "Finished Reading in Graph: {0}".format(datetime.datetime.now())

    id_seq = networkx.get_node_attributes(G, "sequence")
    id_status = networkx.get_node_attributes(G, "status")
    seq_id = { seq : node_id for node_id, seq in id_seq.items()}

    print "Created inverse lookup table: {0}".format(datetime.datetime.now())

    novel_seqs = seq_IO.read_sequences(novel_seqs_file)
    canonical_seqs = seq_IO.read_sequences(canonical_file)

    novel_fracs = {}    

    print "Ready to enter loop: {0}".format(datetime.datetime.now())

    for n in novel_seqs:
        novel_fracs[n] = {}
        hamm_dist = sorted([ (conv.hamdist(n,c),c) for c in canonical_seqs ])
        min_hamm_dist = hamm_dist[0][0]
        print "Found hamming distances: {0}".format(datetime.datetime.now())

        for hamm, c in hamm_dist:
            #only analyze min_dist canonical sequences
            if hamm != min_hamm_dist:
                continue
            novel_fracs[n][c] = []
            #generate list of 5 paths
            #paths = itertools.islice(networkx.all_shortest_paths(G, seq_id[n], seq_id[c]), 5)
            paths = [ networkx.shortest_path(G, seq_id[n], seq_id[c]) ]

            for path in paths:
                inter_nodes = path[1:-1]
                novel_fracs[n][c].append(float(sum([ 1 for node_id in inter_nodes if id_status[node_id] == "UNCLEAVED" ]))/len(inter_nodes))
    
    base_n_file = os.path.basename(os.path.splitext(novel_seqs_file)[0])
    base_c_file = os.path.basename(os.path.splitext(canonical_file)[0])

    with open("{0}_frac_paths_{1}_{2}.txt".format(output_prefix, base_n_file, base_c_file), 'w') as o:
        for n, c_dict in novel_fracs.items():
            for c, fracs_list in c_dict.items():
                o.write("{0},{1},".format(n,c))
                o.write(",".join(map(str,fracs_list)))
                o.write("\n")

    print "Output paths: {0}".format(datetime.datetime.now())
def main(sequence_list, trained_cleaved, trained_uncleaved):    

    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)

    trained_cleaved_list = seq_IO.read_sequences(trained_cleaved)

    trained_uncleaved_list = seq_IO.read_sequences(trained_uncleaved)

    base = os.path.splitext(sequence_list)[0]

    cleaved_seqs = [ (s[0],s[1],min([conv.hamdist(s[0],c) for c in trained_cleaved_list])) for s in sequences if s[1] == 'CLEAVED' ]
    uncleaved_seqs = [ (s[0],s[1],min([conv.hamdist(s[0],c) for c in trained_uncleaved_list])) for s in sequences if s[1] == 'UNCLEAVED' ]

    outfile = '%s_selected_hamm.csv' % (base)

    out = open(outfile,"w")
    out.write("Cleaved_seqs\n")
    out.write("\n".join( [ ",".join(map(str,s)) for s in cleaved_seqs ] )) 
    out.write("\nUncleaved_seqs\n")
    out.write("\n".join( [ ",".join(map(str,s)) for s in uncleaved_seqs] ))
def main(list_sequence_names, canonical_list, output_prefix ):

    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    cleaved_seqs = seq_IO.read_sequences( [ s for s,l in list_sequence_names if l == "CLEAVED" ][0] )

    uncleaved_seqs = seq_IO.read_sequences( [ s for s,l in list_sequence_names if l == "UNCLEAVED" ][0] )

    min_dist = []
    avg_dist = []
    max_dist = []

    for seq in cleaved_seqs:

        distances = [ conv.hamdist(seq, unc) for unc in uncleaved_seqs ]
        min_dist.append(min(distances))
        avg_dist.append(numpy.mean(distances))
        max_dist.append(max(distances))
        if seq in canonical_list_seq:
            print seq
            print min_dist[-1]
            print avg_dist[-1]
            print max_dist[-1]
    

    fig, ax = pconv.create_ax(1, 3)


    hist.draw_actual_plot(ax[0,0], min_dist, "Min. Distance from Boundary", "Minimum Distances", log=False, normed=True, label=None, nbins=15, stacked=False)
    hist.draw_actual_plot(ax[1,0], avg_dist, "Avg. Distance from Boundary", "Average Distances", log=False, normed=True, label=None, nbins=15, stacked=False)
    hist.draw_actual_plot(ax[2,0], max_dist, "Max. Distance from Boundary", "Maximum Distances", log=False, normed=True, label=None, nbins=15, stacked=False)


    #ax[0,0].set_xlim(xmin=1,xmax=5)
    #ax[0,0].set_xticks(xrange(1,6))
    pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
def main(list_sequence_names, canonical_list, output_prefix, func_labels, unfunc_labels):

    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    for canonical in canonical_list_seq:
	
        dict_sequences = {}

        for [filename, label] in list_sequence_names:
            sequences = seq_IO.read_sequences(filename) 
            distances = [ conv.hamdist(seq, canonical) for seq in sequences ]
        
            dict_sequences[label] =  { i : sum([d for d in distances if d == i]) for i in xrange(1,6) } 

        x = []
        y = []
        for i in xrange(1,6):
            func=0.0
            unfunc=0.0
            for label, dict_sums in dict_sequences.items():
                if label in func_labels:
                    func = func + dict_sums[i]
                elif label in unfunc_labels:
                    unfunc = unfunc + dict_sums[i]
            if unfunc != 0:
                x.append(i)
                y.append( func/(func+unfunc) )
        print x
        print y
        series.append([x, y, canonical])
    fig, ax = pconv.create_ax(1, 1)

    scatterplot.plot_series( ax[0,0], series, title="", x_axis="# of Mutations", y_axis="Fraction of Variants that are Functional", alpha=1.0, connect_dots=True, size=30, edgecolors='k')
    ax[0,0].set_xlim(xmin=1,xmax=5)
    ax[0,0].set_xticks(xrange(1,6))
    pconv.save_fig(fig, output_prefix, canonical + "_fraction_func_mutant", 6, 6, size=15)
def main(input_dir, canonical_file, output_prefix, hamm_dist):

    list_seq_files = glob.glob(os.path.join(input_dir, "*_cleaved.txt"))
    
    dict_sequences = {}
    canonical_sequences = [] 
    canonical_sequences = seq_IO.read_sequences(canonical_file)

    for filename in list_seq_files:
        sequences = seq_IO.read_sequences(filename) 
        for can in canonical_sequences:
            if hamm_dist == -1:
                seq_sim = [ seq for seq in sequences if chem_sim(seq, can) ]
            else:
                seq_sim = [ seq for seq in sequences if conv.hamdist(seq,can) <= hamm_dist ]
            if seq_sim:
                dict_sequences[(filename, can)] = seq_sim

    outfile_canon = '%scanonical_sim_cleaved%d.csv' % (output_prefix, hamm_dist)

    canon_out = open(outfile_canon,"w")

    for (filename, can), seqs in dict_sequences.items():
        canon_out.write(filename + "," + can + "," + ','.join(seqs) + "\n")
def main(json_file, output_prefix, novel_seqs_file, canonical_file):

    print "Started Script: {0}".format(datetime.datetime.now())

    with open(json_file) as data_file:
        data = json.load(data_file)

    G = json_graph.node_link_graph(data, directed=False)

    print "Finished Reading in Graph: {0}".format(datetime.datetime.now())

    id_seq = networkx.get_node_attributes(G, "sequence")
    id_status = networkx.get_node_attributes(G, "status")
    seq_id = {seq: node_id for node_id, seq in id_seq.items()}

    print "Created inverse lookup table: {0}".format(datetime.datetime.now())

    novel_seqs = seq_IO.read_sequences(novel_seqs_file)
    canonical_seqs = seq_IO.read_sequences(canonical_file)

    novel_fracs = {}

    print "Ready to enter loop: {0}".format(datetime.datetime.now())

    for n in novel_seqs:
        novel_fracs[n] = {}
        hamm_dist = sorted([(conv.hamdist(n, c), c) for c in canonical_seqs])
        min_hamm_dist = hamm_dist[0][0]
        print "Found hamming distances: {0}".format(datetime.datetime.now())

        for hamm, c in hamm_dist:
            #only analyze min_dist canonical sequences
            if hamm != min_hamm_dist:
                continue
            novel_fracs[n][c] = []
            #generate list of 5 paths
            #paths = itertools.islice(networkx.all_shortest_paths(G, seq_id[n], seq_id[c]), 5)
            paths = [networkx.shortest_path(G, seq_id[n], seq_id[c])]

            for path in paths:
                inter_nodes = path[1:-1]
                novel_fracs[n][c].append(
                    float(
                        sum([
                            1 for node_id in inter_nodes
                            if id_status[node_id] == "UNCLEAVED"
                        ])) / len(inter_nodes))

    base_n_file = os.path.basename(os.path.splitext(novel_seqs_file)[0])
    base_c_file = os.path.basename(os.path.splitext(canonical_file)[0])

    with open(
            "{0}_frac_paths_{1}_{2}.txt".format(output_prefix, base_n_file,
                                                base_c_file), 'w') as o:
        for n, c_dict in novel_fracs.items():
            for c, fracs_list in c_dict.items():
                o.write("{0},{1},".format(n, c))
                o.write(",".join(map(str, fracs_list)))
                o.write("\n")

    print "Output paths: {0}".format(datetime.datetime.now())
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    
    list_sequences = [] #list of list of sequences, where each item represents a label 
    extended_list_sequences = [] #flat list of sequences
    labels = [] #labels for list_sequences

    #canonical_seqs = seq_IO.read_sequences(canonical_file)
    canonical_seqs = ['DEMEE']

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float})
        new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ] 
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs[:])
        labels.append(label)

    outfile_nodes = '%s_nodes.csv' % (output_prefix)

    edges = [(seq2,seq) for seq,seq2 in itertools.combinations(extended_list_sequences,2) if conv.hamdist(seq2[0],seq[0]) == hamming_dist ]

    tallies = { 2 : {2:0,1.5:0,1:0}, 1.5 : {2:0,1.5:0,1:0}, 1 : {2:0,1.5:0,1:0} }
    

    for edge in edges:
        tallies[edge[0][1]][edge[1][1]] += 1

    frequencies = { 2 : {}, 1.5 : {}, 1 : {} }

    for source, tallies_dict in tallies.items():
        n_tallies = float(sum(tallies_dict.values()))
        frequencies[source] = { k : v/n_tallies for k, v in tallies_dict.items() }

    new_edges = []

    for edge in edges:
        fitness_source = edge[0][1]
        fitness_target = np.random.choice([2,1.5,1],p=[frequencies[fitness_source][2],frequencies[fitness_source][1.5],frequencies[fitness_source][1]])
        seqs = list_sequences[labels.index(conv_fitness_label(fitness_target))]
        new_edges.append((edge[0], seqs[np.random.randint(0, len(seqs))])) #randint's upper bound is exclusive, so len(seqs) lets every sequence be drawn

    edges = new_edges
      
    for canonical_seq in canonical_seqs: 
        outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq)
        edges_out = open(outfile_edges,"w")
        edges_out.write("Source,Target,Weight\n")
        print canonical_seq
        for ([seq1,fit1,can1],[seq2,fit2,can2]) in edges:
            dist_seq1 = conv.hamdist(canonical_seq, seq1)
            dist_seq2 = conv.hamdist(canonical_seq, seq2)
            fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2
            fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1
            fit_upper = fit_upper if fit_upper > 0 else 0.001
            seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2
            seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1        
            out_str = "{0},{1},{2}\n".format(seq_lower,seq_upper,fit_lower/float(fit_upper))
            edges_out.write(out_str) #does this have the correct directionality?
        edges_out.close()

    already_written_nodes = []
   
    nodes_out = open(outfile_nodes,"w")
    nodes_out.write("Id,Label,Type,Fitness,Canonical\n")
    for seqs,label in zip(list_sequences,labels):
        nodes_out.write("\n".join("{0},{0},{1},{2},{3}".format(x, label, fitness,can) for (x,fitness,can) in seqs if x not in already_written_nodes))
        already_written_nodes.extend([ s[0] for s in seqs])
        nodes_out.write("\n")
def main(seq_file, canonical_file, output_prefix):

    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_file)

    print "Beginning Script: {0}".format(datetime.datetime.now())

    for canonical in canonical_list_seq:

        with open(seq_file) as strings:
            seq_list = strings.read().splitlines()
            seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
        orig_len = len(seq_ind_list)
        if canonical not in seq_list:
            one_away = gsconv.gen_hamdist_one(canonical)
            one_away = [o for o in one_away if o != canonical] + [canonical]
            seq_ind_list = seq_ind_list[:] + [
                (o, ind) for (ind, o) in enumerate(one_away, len(seq_ind_list))
            ]

        edges = [(seq2, seq)
                 for seq, seq2 in itertools.combinations(seq_ind_list, 2)
                 if gsconv.hamdist(seq2[0], seq[0]) < 2]
        print len(seq_ind_list)
        print "Generated Edges: {0}".format(datetime.datetime.now())

        numpy.set_printoptions(threshold='nan')

        canon_ind = [i for (s, i) in seq_ind_list if s == canonical][0]

        T_mat = trans_matrix(seq_ind_list, edges)
        #print raise_matrix(T_mat,1)
        #print raise_matrix(T_mat,3)
        #T = raise_matrix(T_mat,10)
        #T = raise_matrix(T_mat,20)
        x = [0]
        y = [0]

        print "Transformed Matrix: {0}".format(datetime.datetime.now())

        x.append(1)
        y.append(find_frac(T_mat, canon_ind, orig_len))

        T_mat_new = T_mat

        for i in range(2, 23):
            x.append(i)
            T_mat_new, frac = square_matrix(T_mat_new, T_mat, canon_ind,
                                            orig_len)
            y.append(frac)

            print "Raised Matrix {0}: {1}".format(i, datetime.datetime.now())

        series.append([x, y, canonical])

    fig, ax = conv.create_ax(1, 1)

    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']

    scatterplot.plot_series(ax[0, 0],
                            series,
                            title="",
                            x_axis="Number of Steps",
                            colors=color,
                            y_axis="Fraction Cleaved Variants Reached",
                            alpha=0.85,
                            connect_dots=True,
                            size=15,
                            edgecolors='k',
                            linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(xrange(1, 23, 3))
    lgd = conv.add_legend(ax[0, 0],
                          location='upper center',
                          bbox_to_anchor=(0.5, 1.05),
                          ncol=2,
                          size=8)
    conv.save_fig(fig,
                  output_prefix,
                  "fraction_func",
                  2.5,
                  3,
                  size=9.5,
                  extra_artists=lgd)

    print "Outputted Figure: {0}".format(datetime.datetime.now())
def main(seq_file, canonical_file, output_prefix):

    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_file)

    print "Beginning Script: {0}".format(datetime.datetime.now())

    for canonical in canonical_list_seq:

        with open(seq_file) as strings:
            seq_list = strings.read().splitlines()
            seq_ind_list = [ (seq, ind) for ind, seq in enumerate(seq_list) ]
        orig_len = len(seq_ind_list)
        if canonical not in seq_list:
            one_away = gsconv.gen_hamdist_one(canonical)
            one_away = [ o for o in one_away if o != canonical ] + [canonical]
            seq_ind_list = seq_ind_list[:] + [ (o, ind) for (ind, o) in enumerate(one_away, len(seq_ind_list)) ]

        edges = [(seq2,seq) for seq,seq2 in itertools.combinations(seq_ind_list,2) if gsconv.hamdist(seq2[0],seq[0]) < 2 ]
        print len(seq_ind_list)
        print "Generated Edges: {0}".format(datetime.datetime.now())    

        numpy.set_printoptions(threshold='nan')

        canon_ind=[ i for (s, i) in seq_ind_list if s == canonical ][0]

        T_mat = trans_matrix(seq_ind_list,edges)
        #print raise_matrix(T_mat,1)
        #print raise_matrix(T_mat,3)
        #T = raise_matrix(T_mat,10)
        #T = raise_matrix(T_mat,20)
        x = [0]
        y = [0]

        print "Transformed Matrix: {0}".format(datetime.datetime.now())

        x.append(1)
        y.append(find_frac(T_mat, canon_ind, orig_len))

        T_mat_new = T_mat

        for i in range(2,23):
            x.append(i)
            T_mat_new, frac = square_matrix(T_mat_new,T_mat,canon_ind, orig_len)
            y.append(frac)

            print "Raised Matrix {0}: {1}".format(i, datetime.datetime.now())

        series.append([x,y,canonical])

    fig, ax = conv.create_ax(1, 1)

    color=['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']

    scatterplot.plot_series( ax[0,0], series, title="", x_axis="Number of Steps", colors=color, y_axis="Fraction Cleaved Variants Reached", alpha=0.85, connect_dots=True, size=15, edgecolors='k', linewidth=0)
    ax[0,0].set_xlim(xmin=1)
    ax[0,0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0,0].set_xticks(xrange(1,23,3))
    lgd = conv.add_legend(ax[0,0], location='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, size=8)
    conv.save_fig(fig, output_prefix, "fraction_func", 2.5, 3, size=9.5, extra_artists=lgd)

    print "Outputted Figure: {0}".format(datetime.datetime.now())    
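# Note: `trans_matrix`, `find_frac`, and `square_matrix` are defined elsewhere in the
# repo. A plausible reading of the call sites above: `trans_matrix` builds a
# row-stochastic one-step random-walk matrix over the Hamming-distance-<2 edge list,
# `find_frac` reports the fraction of the original `orig_len` sequences reachable
# (nonzero probability) from the canonical sequence's row, and `square_matrix`
# advances the walk by one more step by multiplying the accumulated matrix with the
# one-step matrix. The sketch below follows those assumptions and is illustrative
# only, not the repo's actual implementation.
import numpy

def trans_matrix_sketch(seq_ind_list, edges):
    n = len(seq_ind_list)
    T = numpy.zeros((n, n))
    for (seq_a, ind_a), (seq_b, ind_b) in edges:
        T[ind_a, ind_b] = 1.0
        T[ind_b, ind_a] = 1.0
    row_sums = T.sum(axis=1)
    row_sums[row_sums == 0] = 1.0  # leave isolated nodes as all-zero rows
    return T / row_sums[:, numpy.newaxis]

def find_frac_sketch(T, canon_ind, orig_len):
    reached = numpy.count_nonzero(T[canon_ind, :orig_len])
    return reached / float(orig_len)

def square_matrix_sketch(T_accum, T_step, canon_ind, orig_len):
    T_next = T_accum.dot(T_step)
    return T_next, find_frac_sketch(T_next, canon_ind, orig_len)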
def main(list_sequence_names, output_prefix):

    list_sequences = [
    ]  #list of list of sequences, where each item represents a label
    extended_list_sequences = []  #flat list of sequences
    labels = []  #labels for list_sequences

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename,
                                          additional_params=True,
                                          ind_type={
                                              1: float,
                                              2: float
                                          })
        print sequences[0:10]
        list_sequences.append(sequences)
        extended_list_sequences.extend(sequences[:])
        labels.append(label)

    print len(extended_list_sequences)
    dict_seq_fit = {
        seq: fitness
        for (seq, fitness, ratio) in extended_list_sequences
    }
    dict_seq_ratio = {
        seq: ratio
        for (seq, fitness, ratio) in extended_list_sequences
    }
    print len(dict_seq_fit)

    epi = {}
    outfile_epi = '%s_epi_double.csv' % (output_prefix)
    epi_double_out = open(outfile_epi, "w")
    outfile_epi = '%s_epi.csv' % (output_prefix)
    epi_out = open(outfile_epi, "w")

    mut_func = {
        "Both_Functional": [],
        "Both_Nonfunctional": [],
        "One_Functional": []
    }
    mut_nonfunc = {
        "Both_Functional": [],
        "Both_Nonfunctional": [],
        "One_Functional": []
    }

    prod = itertools.product(list_sequences[labels.index("CLEAVED")],
                             extended_list_sequences)
    pairs = set()
    counter = 0
    counter_prod = 0
    for x, y in prod:
        counter_prod += 1
        if x[0] != y[0]:
            counter += 1
            pairs.add(frozenset((x, y)))
    print counter_prod
    print len(pairs)
    print counter
    print "done making set"
    for can, seq_fit in pairs:
        canonical_seq = can[0]
        seq = seq_fit[0]
        fit = seq_fit[1]
        mut_dict = mut_func if fit == 1 else mut_nonfunc

        dist = conv.hamdist(canonical_seq, seq)
        if dist <= 1:
            continue
        list_inter, list_fit = get_inter_fitness(canonical_seq, seq,
                                                 dict_seq_fit)
        if None not in list_fit:
            if dist == 2:
                sum_fit = sum(list_fit)
                if sum_fit > 1.95:
                    mut_dict["Both_Functional"].append(
                        (canonical_seq, seq, list_inter, list_fit))
                elif sum_fit < 0.05:
                    mut_dict["Both_Nonfunctional"].append(
                        (canonical_seq, seq, list_inter, list_fit))
                else:  #either one uncleaved or one middle
                    mut_dict["One_Functional"].append(
                        (canonical_seq, seq, list_inter, list_fit))
            epi[(canonical_seq, seq)] = (calc_epi(list_fit, fit), fit,
                                         list_fit, list_inter)
    print "done calc epi"
    '''epi_double_out.write("Starting,Starting_Ratio,Ending,Ending_Ratio,Status_Ending,Status_Intermediates,Inter1_Seq,Inter1_Fit,Inter1_Ratio,Inter2_Seq,Inter2_Fit,Inter2_Ratio\n")
    for label, list_muts in mut_func.items():
        for (can, seq, list_inter, list_fit) in list_muts:
            epi_double_out.write("{start},{start_ratio},{end},{end_ratio},End_Cleaved,{label},{data}\n".format(label=label,start=can,end=seq,
					start_ratio=dict_seq_ratio[can],end_ratio=dict_seq_ratio[seq],
					data = ",".join([ "{0},{1},{2}".format(seq,fitness_to_str(fit),dict_seq_ratio[seq]) for seq,fit in zip(list_inter,list_fit)])) )
    for label, list_muts in mut_nonfunc.items():
        for (can, seq, list_inter, list_fit) in list_muts:
            epi_double_out.write("{start},{start_ratio},{end},{end_ratio},End_Uncleaved,{label},{data}\n".format(label=label,start=can,end=seq,
                                        start_ratio=dict_seq_ratio[can],end_ratio=dict_seq_ratio[seq],
                                        data = ",".join([ "{0},{1},{2}".format(seq,fit,dict_seq_ratio[seq]) for seq,fit in zip(list_inter,list_fit)])) ) 
    '''
    epi_out.write(
        "Starting,Starting_Ratio,Ending,Ending_Ratio,Ending_Fitness,Epistasis,List_Seqs_Fitnesses_Ratios_Intermediates\n"
    )
    epi_out.write("\n".join([
        "{0},{1},{2},{3},{4},{5},{6}".format(
            can, dict_seq_ratio[can], seq, dict_seq_ratio[seq],
            fitness_to_str(fit), e, ",".join([
                "{0},{1},{2}".format(s, fitness_to_str(f), dict_seq_ratio[s])
                for f, s in zip(list_fit, list_inter)
            ])) for (can, seq), (e, fit, list_fit, list_inter) in epi.items()
    ]))
    epi_out.close()
    epi_double_out.close()
    print "done writing epi"
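# Note: `get_inter_fitness` (and `calc_epi`) are defined elsewhere in the repo. From
# the call sites above, `get_inter_fitness` appears to return the single-substitution
# intermediates between a canonical sequence and a mutant, paired with their fitness
# values looked up in `dict_seq_fit` (None when an intermediate was never observed).
# The sketch below is an assumption that matches that usage, not the repo's code.
def get_inter_fitness_sketch(canonical_seq, seq, dict_seq_fit):
    """Return ([single-step intermediate sequences], [their fitnesses or None])."""
    diff_positions = [i for i, (a, b) in enumerate(zip(canonical_seq, seq)) if a != b]
    list_inter = [canonical_seq[:i] + seq[i] + canonical_seq[i + 1:]
                  for i in diff_positions]
    list_fit = [dict_seq_fit.get(inter) for inter in list_inter]
    return list_inter, list_fit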
def main(list_sequence_names, canonical_list, output_prefix):

    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    cleaved_seqs = seq_IO.read_sequences(
        [s for s, l in list_sequence_names if l == "CLEAVED"][0])

    uncleaved_seqs = seq_IO.read_sequences(
        [s for s, l in list_sequence_names if l == "UNCLEAVED"][0])

    min_dist = []
    avg_dist = []
    max_dist = []

    for seq in cleaved_seqs:

        distances = [conv.hamdist(seq, unc) for unc in uncleaved_seqs]
        min_dist.append(min(distances))
        avg_dist.append(numpy.mean(distances))
        max_dist.append(max(distances))
        if seq in canonical_list_seq:
            print seq
            print min_dist[-1]
            print avg_dist[-1]
            print max_dist[-1]

    fig, ax = pconv.create_ax(1, 3)

    hist.draw_actual_plot(ax[0, 0],
                          min_dist,
                          "Min. Distance from Boundary",
                          "Minimum Distances",
                          log=False,
                          normed=True,
                          label=None,
                          nbins=15,
                          stacked=False)
    hist.draw_actual_plot(ax[1, 0],
                          avg_dist,
                          "Avg. Distance from Boundary",
                          "Average Distances",
                          log=False,
                          normed=True,
                          label=None,
                          nbins=15,
                          stacked=False)
    hist.draw_actual_plot(ax[2, 0],
                          max_dist,
                          "Max. Distance from Boundary",
                          "Maximum Distances",
                          log=False,
                          normed=True,
                          label=None,
                          nbins=15,
                          stacked=False)

    #ax[0,0].set_xlim(xmin=1,xmax=5)
    #ax[0,0].set_xticks(xrange(1,6))
    pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
def find_seqs_less_than(can, sequences, set_sequences, hamm_dist):
    if hamm_dist == -1:
        set_sequences = set_sequences.union([ seq for seq in sequences if chem_sim(seq, can) ])
    else:
        set_sequences = set_sequences.union([ seq for seq in sequences if conv.hamdist(seq,can) <= hamm_dist ])
    return set_sequences
def find_seqs_more_than_first(can, sequences, set_sequences, hamm_dist):
    if hamm_dist == -1:
        set_sequences = set([ seq for seq in sequences if chem_sim(seq, can) ])
    else:
        set_sequences = set([ seq for seq in sequences if conv.hamdist(seq,can) > hamm_dist ])
    return set_sequences