def main(list_sequence_names, output_prefix):
    """For every cleaved sequence, count its Hamming-distance-1 neighbors in
    each class and histogram the fraction that are cleaved/middle/uncleaved."""
    per_label_seqs = []   # one sequence list per labeled input file
    all_seqs = []         # flattened copy of every sequence read
    labels = []           # label of each entry in per_label_seqs
    for [filename, label] in list_sequence_names:
        seqs = seq_IO.read_sequences(filename)
        per_label_seqs.append(seqs)
        all_seqs.extend(seqs[:])
        labels.append(label)
    cleaved = per_label_seqs[labels.index("CLEAVED")]
    middle = per_label_seqs[labels.index("MIDDLE")]
    uncleaved = per_label_seqs[labels.index("UNCLEAVED")]
    frac_uncleaved = {}
    frac_cleaved = {}
    frac_middle = {}
    for seq in cleaved:
        # neighbor counts at exactly one substitution away
        n_cl = sum(1 for s in cleaved if conv.hamdist(seq, s) == 1)
        n_un = sum(1 for s in uncleaved if conv.hamdist(seq, s) == 1)
        n_mid = sum(1 for s in middle if conv.hamdist(seq, s) == 1)
        # only record sequences with at least one cleaved or uncleaved neighbor
        if n_cl > 0 or n_un > 0:
            total = float(n_un + n_mid + n_cl)
            frac_uncleaved[seq] = n_un / total
            frac_cleaved[seq] = n_cl / total
            frac_middle[seq] = n_mid / total
    fig, ax = pconv.create_ax(3, 1)
    hist.draw_actual_plot(ax[0,0], frac_cleaved.values(), "Landscape Near Cleaved Sequences", "Fraction of Neighbors Cleaved", log=False, normed=False, nbins=20)
    hist.draw_actual_plot(ax[0,1], frac_middle.values(), "Landscape Near Cleaved Sequences", "Fraction of Neighbors Middle", log=False, normed=False, nbins=20)
    hist.draw_actual_plot(ax[0,2], frac_uncleaved.values(), "Landscape Near Cleaved Sequences", "Fraction of Neighbors Uncleaved", log=False, normed=False, nbins=20)
    pconv.save_fig(fig, output_prefix, "fraction_neighbors", 15, 5, size=10)
def main(sequence_list, canonical_seq_list, known_cleaved): sequences = seq_IO.read_sequences(sequence_list, additional_params=True) canonical_seqs = seq_IO.read_sequences(canonical_seq_list) known_cleaved_list = seq_IO.read_sequences(known_cleaved) base = os.path.splitext(sequence_list)[0] cleaved_seqs = [ (s[0],s[1],s[2],min([conv.hamdist(s[0],c) for c in canonical_seqs])) for s in sequences if s[1] == 'CLEAVED' and s[2] > 2.0 and s[0] not in known_cleaved_list] uncleaved_seqs = [ (s[0],s[1],s[2],min([conv.hamdist(s[0],c) for c in canonical_seqs])) for s in sequences if s[1] == 'UNCLEAVED' and s[2] < -2.0 and s[0] not in known_cleaved_list] cl_s_dist = [ s[2] for s in cleaved_seqs] uncl_s_dist = [s[2] for s in uncleaved_seqs] print max(cl_s_dist) print min(uncl_s_dist) cleaved_seqs_low_ham = sorted(cleaved_seqs, key=lambda x: (x[3], -x[2]))[0:4] cleaved_seqs_hi_ham = sorted(cleaved_seqs, key=lambda x: (-x[3], -x[2]))[0:4] uncleaved_seqs_low_ham = sorted(uncleaved_seqs, key=lambda x: (x[3], x[2]))[0:4] uncleaved_seqs_hi_ham = sorted(uncleaved_seqs, key=lambda x: (-x[3], x[2]))[0:4] outfile = '%s_selected.csv' % (base) out = open(outfile,"w") out.write("Cleaved_seqs_low_hamming_distance\n") out.write("\n".join( [ ",".join(map(str,s)) for s in cleaved_seqs_low_ham ] )) out.write("\nCleaved_seqs_high_hamming_distance\n") out.write("\n".join( [ ",".join(map(str,s)) for s in cleaved_seqs_hi_ham ] )) out.write("\nUncleaved_seqs_low_hamming_distance\n") out.write("\n".join( [ ",".join(map(str,s)) for s in uncleaved_seqs_low_ham ] )) out.write("\nUncleaved_seqs_high_hamming_distance\n") out.write("\n".join( [ ",".join(map(str,s)) for s in uncleaved_seqs_hi_ham ] ))
def main(sequence_list, trained_cleaved, trained_uncleaved):
    """Annotate each sequence with its minimum Hamming distance to the trained
    set of the same class and write the result to <base>_selected_hamm.csv."""
    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)
    trained_cleaved_list = seq_IO.read_sequences(trained_cleaved)
    trained_uncleaved_list = seq_IO.read_sequences(trained_uncleaved)
    base = os.path.splitext(sequence_list)[0]
    # (seq, status, min distance to the matching trained set)
    cleaved_seqs = [(s[0], s[1], min([conv.hamdist(s[0], c) for c in trained_cleaved_list]))
                    for s in sequences if s[1] == 'CLEAVED']
    uncleaved_seqs = [(s[0], s[1], min([conv.hamdist(s[0], c) for c in trained_uncleaved_list]))
                      for s in sequences if s[1] == 'UNCLEAVED']
    outfile = '%s_selected_hamm.csv' % (base)
    # context manager closes the handle (the original leaked the open file)
    with open(outfile, "w") as out:
        out.write("Cleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs]))
        out.write("\nUncleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in uncleaved_seqs]))
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    """Build a node/link JSON graph connecting sequences one substitution apart,
    with each edge weighted by the fitness ratio oriented toward the canonical
    sequence, and dump it to <output_prefix>nodes_edges.json.

    NOTE(review): hamming_dist and canonical_file are currently unused — the
    canonical set is hard-coded below; confirm before relying on the arguments.
    """
    list_sequences = []  #list of list of sequences, where each item represents a label
    extended_list_sequences = []  #flat list of sequences
    labels = []  #labels for list_sequences
    #canonical_seqs = seq_IO.read_sequences(canonical_file)
    canonical_seqs = ["DEMEE"]
    #left other code here in case want to try it from all cleaved sequences
    dict_sequences = {}
    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float})
        # (sequence, fitness, flag: within 2 substitutions of some canonical sequence)
        new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ]
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs[:])
        dict_sequences.update({ n[0] : n for n in new_seqs })  # sequence -> full record, across all files
        labels.append(label)
    edges = []
    edges_set = set()  # directed pairs already seen, to avoid duplicating (a,b)/(b,a)
    print "Read in Data: {0}".format(datetime.datetime.now())
    for seq, fitness, canonical_like in extended_list_sequences:
        # every string one substitution away; keep only those present in the data
        neighbors = conv.gen_hamdist_one(seq)
        edges_set.update([ (seq, n) for n in neighbors if n in dict_sequences ])
        edges += [((seq, fitness, canonical_like), dict_sequences[n] ) for n in neighbors if n in dict_sequences and (n,seq) not in edges_set ]
    print "Generated Edges: {0}".format(datetime.datetime.now())
    print edges[0:10]
    # node id = position of the sequence in the flat list
    seq_id = { seq[0] : ind for ind, seq in enumerate(extended_list_sequences) }
    nodes = []
    for seqs, label in zip(list_sequences, labels):
        nodes.extend([ { "id" : seq_id[seq[0]], "sequence" : seq[0], "status" : label, "fitness" : seq[1], "canonical_like" : seq[2] } for seq in seqs ])
    print "Generated List of Nodes: {0}".format(datetime.datetime.now())
    links = []
    for canonical_seq in canonical_seqs:
        print canonical_seq
        for ((seq1,fit1,can1),(seq2,fit2,can2)) in edges:
            # orient each edge from the sequence closer to the canonical sequence
            dist_seq1 = conv.hamdist(canonical_seq, seq1)
            dist_seq2 = conv.hamdist(canonical_seq, seq2)
            fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2
            fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1
            fit_upper = fit_upper if fit_upper > 0 else 0.001  # floor to avoid dividing by zero
            seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2
            seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1
            links.append({ "source" : seq_id[seq_lower], "target" : seq_id[seq_upper], "weight" : fit_lower/float(fit_upper) } )
    print "Generated List of Edges: {0}".format(datetime.datetime.now())
    output = { "nodes" : nodes, "links" : links }
    with open('{0}nodes_edges.json'.format(output_prefix), 'w') as fp:
        json.dump(output, fp)
    print "Dumped Nodes and Edges Lists: {0}".format(datetime.datetime.now())
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file): list_sequences = [] #list of list of sequences, where each item represents a label extended_list_sequences = [] #flat list of sequences labels = [] #labels for list_sequences #canonical_seqs = seq_IO.read_sequences(canonical_file) canonical_seqs = ["DEMEE"] #left other code here in case want to try it from all cleaved sequences for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float}) new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ] list_sequences.append(new_seqs) extended_list_sequences.extend(new_seqs[:]) dict_sequences = { n[0] : n for n in new_seqs } labels.append(label) edges = [] edges_set = set() print "Read in Data: {0}".format(datetime.datetime.now()) for seq, fitness, canonical_like in extended_list_sequences: neighbors = conv.gen_hamdist_one(seq) edges_set.update([ (seq, n) for n in neighbors if n in dict_sequences ]) edges += [((seq, fitness, canonical_like), dict_sequences[n] ) for n in neighbors if n in dict_sequences and (n,seq) not in edges_set ] print "Generated Edges: {0}".format(datetime.datetime.now()) print edges[0:10] seq_id = { seq[0] : ind for ind, seq in enumerate(extended_list_sequences) } nodes = [] for seqs, label in zip(list_sequences, labels): nodes.extend([ { "id" : seq_id[seq[0]], "sequence" : seq[0], "status" : label, "fitness" : seq[1], "canonical_like" : seq[2] } for seq in seqs ]) print "Generated List of Nodes: {0}".format(datetime.datetime.now()) links = [] for canonical_seq in canonical_seqs: print canonical_seq for ((seq1,fit1,can1),(seq2,fit2,can2)) in edges: dist_seq1 = conv.hamdist(canonical_seq, seq1) dist_seq2 = conv.hamdist(canonical_seq, seq2) fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2 fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1 fit_upper = fit_upper if fit_upper > 0 else 0.001 
seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2 seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1 links.append({ "source" : seq_id[seq_lower], "target" : seq_id[seq_upper], "weight" : fit_lower/float(fit_upper) } ) print "Generated List of Edges: {0}".format(datetime.datetime.now()) output = { "nodes" : nodes, "links" : links } with open('{0}nodes_edges.json'.format(output_prefix), 'w') as fp: json.dump(output, fp) print "Dumped Nodes and Edges Lists: {0}".format(datetime.datetime.now())
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file): list_sequences = [ ] #list of list of sequences, where each item represents a label extended_list_sequences = [] #flat list of sequences labels = [] #labels for list_sequences canonical_seqs = seq_IO.read_sequences(canonical_file) for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1: float}) new_seqs = [(seq, fitness, min([conv.hamdist(seq, can) for can in canonical_seqs]) <= 2) for seq, fitness in sequences] list_sequences.append(new_seqs) extended_list_sequences.extend(new_seqs[:]) labels.append(label) outfile_nodes = '%s_nodes.csv' % (output_prefix) edges = [ (seq2, seq) for seq, seq2 in itertools.combinations(extended_list_sequences, 2) if conv.hamdist(seq2[0], seq[0]) == hamming_dist ] for canonical_seq in canonical_seqs: outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq) edges_out = open(outfile_edges, "w") edges_out.write("Source,Target,Weight\n") print canonical_seq for ([seq1, fit1, can1], [seq2, fit2, can2]) in edges: dist_seq1 = conv.hamdist(canonical_seq, seq1) dist_seq2 = conv.hamdist(canonical_seq, seq2) fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2 fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1 fit_upper = fit_upper if fit_upper > 0 else 0.001 seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2 seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1 out_str = "{0},{1},{2}\n".format(seq_lower, seq_upper, fit_lower / float(fit_upper)) edges_out.write( out_str) #does this have the correct directionality? 
edges_out.close() already_written_nodes = [] nodes_out = open(outfile_nodes, "w") nodes_out.write("Id,Label,Type,Fitness,Canonical\n") for seqs, label in zip(list_sequences, labels): nodes_out.write("\n".join( "{0},{0},{1},{2},{3}".format(x, label, fitness, can) for (x, fitness, can) in seqs if x not in already_written_nodes)) already_written_nodes.extend([s[0] for s in seqs]) nodes_out.write("\n")
def main(sequence_list, canonical_seq_list, known_cleaved): sequences = seq_IO.read_sequences(sequence_list, additional_params=True) canonical_seqs = seq_IO.read_sequences(canonical_seq_list) known_cleaved_list = seq_IO.read_sequences(known_cleaved) base = os.path.splitext(sequence_list)[0] cleaved_seqs = [ (s[0], s[1], s[2], min([conv.hamdist(s[0], c) for c in canonical_seqs])) for s in sequences if s[1] == 'CLEAVED' and s[2] > 2.0 and s[0] not in known_cleaved_list ] uncleaved_seqs = [(s[0], s[1], s[2], min([conv.hamdist(s[0], c) for c in canonical_seqs])) for s in sequences if s[1] == 'UNCLEAVED' and s[2] < -2.0 and s[0] not in known_cleaved_list] cl_s_dist = [s[2] for s in cleaved_seqs] uncl_s_dist = [s[2] for s in uncleaved_seqs] print max(cl_s_dist) print min(uncl_s_dist) cleaved_seqs_low_ham = sorted(cleaved_seqs, key=lambda x: (x[3], -x[2]))[0:4] cleaved_seqs_hi_ham = sorted(cleaved_seqs, key=lambda x: (-x[3], -x[2]))[0:4] uncleaved_seqs_low_ham = sorted(uncleaved_seqs, key=lambda x: (x[3], x[2]))[0:4] uncleaved_seqs_hi_ham = sorted(uncleaved_seqs, key=lambda x: (-x[3], x[2]))[0:4] outfile = '%s_selected.csv' % (base) out = open(outfile, "w") out.write("Cleaved_seqs_low_hamming_distance\n") out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs_low_ham])) out.write("\nCleaved_seqs_high_hamming_distance\n") out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs_hi_ham])) out.write("\nUncleaved_seqs_low_hamming_distance\n") out.write("\n".join( [",".join(map(str, s)) for s in uncleaved_seqs_low_ham])) out.write("\nUncleaved_seqs_high_hamming_distance\n") out.write("\n".join([",".join(map(str, s)) for s in uncleaved_seqs_hi_ham]))
def find_seqs_more_than_first(can, sequences, set_sequences, hamm_dist):
    """Return the set of sequences 'far' from can: when hamm_dist == -1 those
    that are chemically similar (chem_sim), otherwise those strictly more than
    hamm_dist substitutions away.

    Note: the incoming set_sequences argument is ignored and replaced.
    """
    if hamm_dist == -1:
        matches = (seq for seq in sequences if chem_sim(seq, can))
    else:
        matches = (seq for seq in sequences if conv.hamdist(seq, can) > hamm_dist)
    return set(matches)
def main(input_dir, canonical_file, output_prefix, hamm_dist):
    """Scan every *_cleaved.txt file in input_dir for sequences similar to each
    canonical sequence (chemically when hamm_dist == -1, else within hamm_dist
    substitutions) and write all hits to one CSV keyed by (file, canonical)."""
    list_seq_files = glob.glob(os.path.join(input_dir, "*_cleaved.txt"))
    dict_sequences = {}  # (filename, canonical) -> list of similar sequences
    canonical_sequences = seq_IO.read_sequences(canonical_file)
    for filename in list_seq_files:
        sequences = seq_IO.read_sequences(filename)
        for can in canonical_sequences:
            if hamm_dist == -1:
                # -1 sentinel: use chemical similarity instead of Hamming distance
                seq_sim = [seq for seq in sequences if chem_sim(seq, can)]
            else:
                seq_sim = [seq for seq in sequences if conv.hamdist(seq, can) <= hamm_dist]
            if seq_sim:
                dict_sequences[(filename, can)] = seq_sim
    outfile_canon = '%scanonical_sim_cleaved%d.csv' % (output_prefix, hamm_dist)
    # context manager closes the handle (the original leaked the open file)
    with open(outfile_canon, "w") as canon_out:
        for (filename, can), seqs in dict_sequences.items():
            canon_out.write(filename + "," + can + "," + ','.join(seqs) + "\n")
def find_seqs_less_than(can, sequences, set_sequences, hamm_dist):
    """Return set_sequences unioned with the sequences 'near' can: chemically
    similar ones when hamm_dist == -1, otherwise those at most hamm_dist
    substitutions away."""
    if hamm_dist == -1:
        near = [seq for seq in sequences if chem_sim(seq, can)]
    else:
        near = [seq for seq in sequences if conv.hamdist(seq, can) <= hamm_dist]
    return set_sequences.union(near)
def main(list_sequence_names, output_prefix, canonical_file): list_sequences = [] #list of list of sequences, where each item represents a label extended_list_sequences = [] #flat list of sequences labels = [] #labels for list_sequences canonical_seqs = seq_IO.read_sequences(canonical_file) for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float}) list_sequences.append(sequences) extended_list_sequences.extend(sequences[:]) labels.append(label) dict_sequences = { seq : fitness for (seq, fitness) in extended_list_sequences } epi = {} for canonical_seq in canonical_seqs: mut_func = { "Both_Functional" : [], "Both_Nonfunctional" : [], "One_Functional" : [] } mut_nonfunc = { "Both_Functional" : [], "Both_Nonfunctional" : [], "One_Functional" : [] } outfile_epi = '%s_%s_epi.csv' % (output_prefix, canonical_seq) epi_out = open(outfile_epi,"w") print canonical_seq epi = {} double_mut = [ seq for seq in extended_list_sequences if conv.hamdist(canonical_seq, seq[0]) == 2 ] for seq_fit in extended_list_sequences: seq = seq_fit[0] fit = seq_fit[1] mut_dict = mut_func if fit == 1000 else mut_nonfunc list_fit = get_inter_fitness(canonical_seq, seq, dict_sequences) if len(list_fit) <= 1: continue if all(list_fit): if seq_fit in double_mut: sum_fit = sum(list_fit) print sum_fit if sum_fit == 2000: mut_dict["Both_Functional"].append((canonical_seq, seq)) elif sum_fit == 0: mut_dict["Both_Nonfunctional"].append((canonical_seq, seq)) elif sum_fit == 1000: mut_dict["One_Functional"].append((canonical_seq, seq)) epi[seq] = (calc_epi(list_fit, fit),list_fit+[fit]) epi_out.write("Total Double Mutants,%s\n" % (len(double_mut))) for label, list_muts in mut_func.items(): for (can, seq) in list_muts: epi_out.write("End Functional,%s,%s,%s\n" % (label,can,seq) ) for label, list_muts in mut_nonfunc.items(): for (can, seq) in list_muts: epi_out.write("End Functional,%s,%s,%s\n" % (label,can,seq) ) 
epi_out.write("\n".join(["{0},{1},{2}".format(seq,epi,",".join([str(f) for f in fits])) for seq, (epi,fits) in epi.items()] ) ) epi_out.close()
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file): list_sequences = [] #list of list of sequences, where each item represents a label extended_list_sequences = [] #flat list of sequences labels = [] #labels for list_sequences canonical_seqs = seq_IO.read_sequences(canonical_file) for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float}) new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ] list_sequences.append(new_seqs) extended_list_sequences.extend(new_seqs[:]) labels.append(label) outfile_nodes = '%s_nodes.csv' % (output_prefix) edges = [(seq2,seq) for seq,seq2 in itertools.combinations(extended_list_sequences,2) if conv.hamdist(seq2[0],seq[0]) == hamming_dist ] for canonical_seq in canonical_seqs: outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq) edges_out = open(outfile_edges,"w") edges_out.write("Source,Target,Weight\n") print canonical_seq for ([seq1,fit1,can1],[seq2,fit2,can2]) in edges: dist_seq1 = conv.hamdist(canonical_seq, seq1) dist_seq2 = conv.hamdist(canonical_seq, seq2) fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2 fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1 fit_upper = fit_upper if fit_upper > 0 else 0.001 seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2 seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1 out_str = "{0},{1},{2}\n".format(seq_lower,seq_upper,fit_lower/float(fit_upper)) edges_out.write(out_str) #does this have the correct directionality? edges_out.close() already_written_nodes = [] nodes_out = open(outfile_nodes,"w") nodes_out.write("Id,Label,Type,Fitness,Canonical\n") for seqs,label in zip(list_sequences,labels): nodes_out.write("\n".join("{0},{0},{1},{2},{3}".format(x, label, fitness,can) for (x,fitness,can) in seqs if x not in already_written_nodes)) already_written_nodes.extend([ s[0] for s in seqs]) nodes_out.write("\n")
def main(list_sequence_names, canonical_list, output_prefix, func_labels, unfunc_labels):
    """For each canonical sequence, plot the fraction of variants that are
    functional at 1-5 mutations away from it."""
    series = []
    canonical_list_seq = seq_IO.read_sequences(canonical_list)
    for canonical in canonical_list_seq:
        dict_sequences = {}
        for [filename, label] in list_sequence_names:
            sequences = seq_IO.read_sequences(filename)
            distances = [conv.hamdist(seq, canonical) for seq in sequences]
            # NOTE(review): this sums the distance values themselves (i * count),
            # not the count of sequences at distance i.  The i factor cancels in
            # the func/(func+unfunc) ratio below, so the plot is unaffected, but
            # a plain count may have been intended — confirm.
            dict_sequences[label] = {
                i: sum([d for d in distances if d == i])
                for i in xrange(1, 6)
            }
        x = []
        y = []
        for i in xrange(1, 6):
            func = 0.0
            unfunc = 0.0
            for label, dict_sums in dict_sequences.items():
                if label in func_labels:
                    func = func + dict_sums[i]
                elif label in unfunc_labels:
                    unfunc = unfunc + dict_sums[i]
            # skip distances with no nonfunctional variants (avoids 0/0)
            if unfunc != 0:
                x.append(i)
                y.append(func / (func + unfunc))
        print x
        print y
        series.append([x, y, canonical])
    fig, ax = pconv.create_ax(1, 1)
    scatterplot.plot_series(ax[0, 0],
                            series,
                            title="",
                            x_axis="# of Mutations",
                            y_axis="Fraction of Variants that are Functional",
                            alpha=1.0,
                            connect_dots=True,
                            size=30,
                            edgecolors='k')
    ax[0, 0].set_xlim(xmin=1, xmax=5)
    ax[0, 0].set_xticks(xrange(1, 6))
    # NOTE(review): `canonical` here is the loop variable leaked from the for
    # loop above, i.e. the *last* canonical sequence — confirm that is intended.
    pconv.save_fig(fig, output_prefix, canonical + "_fraction_func_mutant", 6, 6, size=15)
def main(json_file, output_prefix, novel_seqs_file, canonical_file): print "Started Script: {0}".format(datetime.datetime.now()) with open(json_file) as data_file: data = json.load(data_file) G = json_graph.node_link_graph(data, directed=False) print "Finished Reading in Graph: {0}".format(datetime.datetime.now()) id_seq = networkx.get_node_attributes(G, "sequence") id_status = networkx.get_node_attributes(G, "status") seq_id = { seq : node_id for node_id, seq in id_seq.items()} print "Created inverse lookup table: {0}".format(datetime.datetime.now()) novel_seqs = seq_IO.read_sequences(novel_seqs_file) canonical_seqs = seq_IO.read_sequences(canonical_file) novel_fracs = {} print "Ready to enter loop: {0}".format(datetime.datetime.now()) for n in novel_seqs: novel_fracs[n] = {} hamm_dist = sorted([ (conv.hamdist(n,c),c) for c in canonical_seqs ]) min_hamm_dist = hamm_dist[0][0] print "Found hamming distances: {0}".format(datetime.datetime.now()) for hamm, c in hamm_dist: #only analyze min_dist canonical sequences if hamm != min_hamm_dist: continue novel_fracs[n][c] = [] #generate list of 5 paths #paths = itertools.islice(networkx.all_shortest_paths(G, seq_id[n], seq_id[c]), 5) paths = [ networkx.shortest_path(G, seq_id[n], seq_id[c]) ] for path in paths: inter_nodes = path[1:-1] novel_fracs[n][c].append(float(sum([ 1 for node_id in inter_nodes if id_status[node_id] == "UNCLEAVED" ]))/len(inter_nodes)) base_n_file = os.path.basename(os.path.splitext(novel_seqs_file)[0]) base_c_file = os.path.basename(os.path.splitext(canonical_file)[0]) with open("{0}_frac_paths_{1}_{2}.txt".format(output_prefix, base_n_file, base_c_file), 'w') as o: for n, c_dict in novel_fracs.items(): for c, fracs_list in c_dict.items(): o.write("{0},{1},".format(n,c)) o.write(",".join(map(str,fracs_list))) o.write("\n") print "Output paths: {0}".format(datetime.datetime.now())
def main(sequence_list, trained_cleaved, trained_uncleaved):
    """Annotate each sequence with its minimum Hamming distance to the trained
    set of the same class and write the result to <base>_selected_hamm.csv."""
    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)
    trained_cleaved_list = seq_IO.read_sequences(trained_cleaved)
    trained_uncleaved_list = seq_IO.read_sequences(trained_uncleaved)
    base = os.path.splitext(sequence_list)[0]
    # (seq, status, min distance to the matching trained set)
    cleaved_seqs = [(s[0], s[1], min([conv.hamdist(s[0], c) for c in trained_cleaved_list]))
                    for s in sequences if s[1] == 'CLEAVED']
    uncleaved_seqs = [(s[0], s[1], min([conv.hamdist(s[0], c) for c in trained_uncleaved_list]))
                      for s in sequences if s[1] == 'UNCLEAVED']
    outfile = '%s_selected_hamm.csv' % (base)
    # context manager closes the handle (the original leaked the open file)
    with open(outfile, "w") as out:
        out.write("Cleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs]))
        out.write("\nUncleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in uncleaved_seqs]))
def main(list_sequence_names, canonical_list, output_prefix ): series = [] canonical_list_seq = seq_IO.read_sequences(canonical_list) cleaved_seqs = seq_IO.read_sequences( [ s for s,l in list_sequence_names if l == "CLEAVED" ][0] ) uncleaved_seqs = seq_IO.read_sequences( [ s for s,l in list_sequence_names if l == "UNCLEAVED" ][0] ) min_dist = [] avg_dist = [] max_dist = [] for seq in cleaved_seqs: distances = [ conv.hamdist(seq, unc) for unc in uncleaved_seqs ] min_dist.append(min(distances)) avg_dist.append(numpy.mean(distances)) max_dist.append(max(distances)) if seq in canonical_list_seq: print seq print min_dist[-1] print avg_dist[-1] print max_dist[-1] fig, ax = pconv.create_ax(1, 3) hist.draw_actual_plot(ax[0,0], min_dist, "Min. Distance from Boundary", "Minimum Distances", log=False, normed=True, label=None, nbins=15, stacked=False) hist.draw_actual_plot(ax[1,0], avg_dist, "Avg. Distance from Boundary", "Average Distances", log=False, normed=True, label=None, nbins=15, stacked=False) hist.draw_actual_plot(ax[2,0], max_dist, "Max. Distance from Boundary", "Maximum Distances", log=False, normed=True, label=None, nbins=15, stacked=False) #ax[0,0].set_xlim(xmin=1,xmax=5) #ax[0,0].set_xticks(xrange(1,6)) pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
def main(list_sequence_names, canonical_list, output_prefix, func_labels, unfunc_labels):
    """For each canonical sequence, plot the fraction of variants that are
    functional at 1-5 mutations away from it."""
    series = []
    canonical_list_seq = seq_IO.read_sequences(canonical_list)
    for canonical in canonical_list_seq:
        dict_sequences = {}
        for [filename, label] in list_sequence_names:
            sequences = seq_IO.read_sequences(filename)
            distances = [ conv.hamdist(seq, canonical) for seq in sequences ]
            # NOTE(review): this sums the distance values themselves (i * count),
            # not the count of sequences at distance i.  The i factor cancels in
            # the func/(func+unfunc) ratio below, so the plot is unaffected, but
            # a plain count may have been intended — confirm.
            dict_sequences[label] = { i : sum([d for d in distances if d == i]) for i in xrange(1,6) }
        x = []
        y = []
        for i in xrange(1,6):
            func=0.0
            unfunc=0.0
            for label, dict_sums in dict_sequences.items():
                if label in func_labels:
                    func = func + dict_sums[i]
                elif label in unfunc_labels:
                    unfunc = unfunc + dict_sums[i]
            # skip distances with no nonfunctional variants (avoids 0/0)
            if unfunc != 0:
                x.append(i)
                y.append( func/(func+unfunc) )
        print x
        print y
        series.append([x, y, canonical])
    fig, ax = pconv.create_ax(1, 1)
    scatterplot.plot_series( ax[0,0], series, title="", x_axis="# of Mutations", y_axis="Fraction of Variants that are Functional", alpha=1.0, connect_dots=True, size=30, edgecolors='k')
    ax[0,0].set_xlim(xmin=1,xmax=5)
    ax[0,0].set_xticks(xrange(1,6))
    # NOTE(review): `canonical` here is the loop variable leaked from the for
    # loop above, i.e. the *last* canonical sequence — confirm that is intended.
    pconv.save_fig(fig, output_prefix, canonical + "_fraction_func_mutant", 6, 6, size=15)
def main(input_dir, canonical_file, output_prefix, hamm_dist):
    """Scan every *_cleaved.txt file in input_dir for sequences similar to each
    canonical sequence (chemically when hamm_dist == -1, else within hamm_dist
    substitutions) and write all hits to one CSV keyed by (file, canonical)."""
    list_seq_files = glob.glob(os.path.join(input_dir, "*_cleaved.txt"))
    dict_sequences = {}  # (filename, canonical) -> list of similar sequences
    canonical_sequences = seq_IO.read_sequences(canonical_file)
    for filename in list_seq_files:
        sequences = seq_IO.read_sequences(filename)
        for can in canonical_sequences:
            if hamm_dist == -1:
                # -1 sentinel: use chemical similarity instead of Hamming distance
                seq_sim = [seq for seq in sequences if chem_sim(seq, can)]
            else:
                seq_sim = [seq for seq in sequences if conv.hamdist(seq, can) <= hamm_dist]
            if seq_sim:
                dict_sequences[(filename, can)] = seq_sim
    outfile_canon = '%scanonical_sim_cleaved%d.csv' % (output_prefix, hamm_dist)
    # context manager closes the handle (the original leaked the open file)
    with open(outfile_canon, "w") as canon_out:
        for (filename, can), seqs in dict_sequences.items():
            canon_out.write(filename + "," + can + "," + ','.join(seqs) + "\n")
def main(json_file, output_prefix, novel_seqs_file, canonical_file): print "Started Script: {0}".format(datetime.datetime.now()) with open(json_file) as data_file: data = json.load(data_file) G = json_graph.node_link_graph(data, directed=False) print "Finished Reading in Graph: {0}".format(datetime.datetime.now()) id_seq = networkx.get_node_attributes(G, "sequence") id_status = networkx.get_node_attributes(G, "status") seq_id = {seq: node_id for node_id, seq in id_seq.items()} print "Created inverse lookup table: {0}".format(datetime.datetime.now()) novel_seqs = seq_IO.read_sequences(novel_seqs_file) canonical_seqs = seq_IO.read_sequences(canonical_file) novel_fracs = {} print "Ready to enter loop: {0}".format(datetime.datetime.now()) for n in novel_seqs: novel_fracs[n] = {} hamm_dist = sorted([(conv.hamdist(n, c), c) for c in canonical_seqs]) min_hamm_dist = hamm_dist[0][0] print "Found hamming distances: {0}".format(datetime.datetime.now()) for hamm, c in hamm_dist: #only analyze min_dist canonical sequences if hamm != min_hamm_dist: continue novel_fracs[n][c] = [] #generate list of 5 paths #paths = itertools.islice(networkx.all_shortest_paths(G, seq_id[n], seq_id[c]), 5) paths = [networkx.shortest_path(G, seq_id[n], seq_id[c])] for path in paths: inter_nodes = path[1:-1] novel_fracs[n][c].append( float( sum([ 1 for node_id in inter_nodes if id_status[node_id] == "UNCLEAVED" ])) / len(inter_nodes)) base_n_file = os.path.basename(os.path.splitext(novel_seqs_file)[0]) base_c_file = os.path.basename(os.path.splitext(canonical_file)[0]) with open( "{0}_frac_paths_{1}_{2}.txt".format(output_prefix, base_n_file, base_c_file), 'w') as o: for n, c_dict in novel_fracs.items(): for c, fracs_list in c_dict.items(): o.write("{0},{1},".format(n, c)) o.write(",".join(map(str, fracs_list))) o.write("\n") print "Output paths: {0}".format(datetime.datetime.now())
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file): list_sequences = [] #list of list of sequences, where each item represents a label extended_list_sequences = [] #flat list of sequences labels = [] #labels for list_sequences #canonical_seqs = seq_IO.read_sequences(canonical_file) canonical_seqs = ['DEMEE'] for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float}) new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ] list_sequences.append(new_seqs) extended_list_sequences.extend(new_seqs[:]) labels.append(label) outfile_nodes = '%s_nodes.csv' % (output_prefix) edges = [(seq2,seq) for seq,seq2 in itertools.combinations(extended_list_sequences,2) if conv.hamdist(seq2[0],seq[0]) == hamming_dist ] tallies = { 2 : {2:0,1.5:0,1:0}, 1.5 : {2:0,1.5:0,1:0}, 1 : {2:0,1.5:0,1:0} } for edge in edges: tallies[edge[0][1]][edge[1][1]] += 1 frequencies = { 2 : {}, 1.5 : {}, 1 : {} } for source, tallies_dict in tallies.items(): n_tallies = float(sum(tallies_dict.values())) frequencies[source] = { k : v/n_tallies for k, v in tallies_dict.items() } new_edges = [] for edge in edges: fitness_source = edge[0][1] fitness_target = np.random.choice([2,1.5,1],p=[frequencies[fitness_source][2],frequencies[fitness_source][1.5],frequencies[fitness_source][1]]) seqs = list_sequences[labels.index(conv_fitness_label(fitness_target))] new_edges.append((edge[0],seqs[np.random.randint(0,len(seqs)-1)])) edges = new_edges for canonical_seq in canonical_seqs: outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq) edges_out = open(outfile_edges,"w") edges_out.write("Source,Target,Weight\n") print canonical_seq for ([seq1,fit1,can1],[seq2,fit2,can2]) in edges: dist_seq1 = conv.hamdist(canonical_seq, seq1) dist_seq2 = conv.hamdist(canonical_seq, seq2) fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2 fit_upper = fit2 if 
dist_seq1 < dist_seq2 else fit1 fit_upper = fit_upper if fit_upper > 0 else 0.001 seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2 seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1 out_str = "{0},{1},{2}\n".format(seq_lower,seq_upper,fit_lower/float(fit_upper)) edges_out.write(out_str) #does this have the correct directionality? edges_out.close() already_written_nodes = [] nodes_out = open(outfile_nodes,"w") nodes_out.write("Id,Label,Type,Fitness,Canonical\n") for seqs,label in zip(list_sequences,labels): nodes_out.write("\n".join("{0},{0},{1},{2},{3}".format(x, label, fitness,can) for (x,fitness,can) in seqs if x not in already_written_nodes)) already_written_nodes.extend([ s[0] for s in seqs]) nodes_out.write("\n")
def main(seq_file, canonical_file, output_prefix): series = [] canonical_list_seq = seq_IO.read_sequences(canonical_file) print "Beginning Script: {0}".format(datetime.datetime.now()) for canonical in canonical_list_seq: with open(seq_file) as strings: seq_list = strings.read().splitlines() seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)] orig_len = len(seq_ind_list) if canonical not in seq_list: one_away = gsconv.gen_hamdist_one(canonical) one_away = [o for o in one_away if o != canonical] + [canonical] seq_ind_list = seq_ind_list[:] + [ (o, ind) for (ind, o) in enumerate(one_away, len(seq_ind_list)) ] edges = [(seq2, seq) for seq, seq2 in itertools.combinations(seq_ind_list, 2) if gsconv.hamdist(seq2[0], seq[0]) < 2] print len(seq_ind_list) print "Generated Edges: {0}".format(datetime.datetime.now()) numpy.set_printoptions(threshold='nan') canon_ind = [i for (s, i) in seq_ind_list if s == canonical][0] T_mat = trans_matrix(seq_ind_list, edges) #print raise_matrix(T_mat,1) #print raise_matrix(T_mat,3) #T = raise_matrix(T_mat,10) #T = raise_matrix(T_mat,20) x = [0] y = [0] print "Transformed Matrix: {0}".format(datetime.datetime.now()) x.append(1) y.append(find_frac(T_mat, canon_ind, orig_len)) T_mat_new = T_mat for i in range(2, 23): x.append(i) T_mat_new, frac = square_matrix(T_mat_new, T_mat, canon_ind, orig_len) y.append(frac) print "Raised Matrix {0}: {1}".format(i, datetime.datetime.now()) series.append([x, y, canonical]) fig, ax = conv.create_ax(1, 1) color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue'] scatterplot.plot_series(ax[0, 0], series, title="", x_axis="Number of Steps", colors=color, y_axis="Fraction Cleaved Variants Reached", alpha=0.85, connect_dots=True, size=15, edgecolors='k', linewidth=0) ax[0, 0].set_xlim(xmin=1) ax[0, 0].set_ylim(ymin=0.0, ymax=1.0) ax[0, 0].set_xticks(xrange(1, 23, 3)) lgd = conv.add_legend(ax[0, 0], location='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, size=8) conv.save_fig(fig, 
output_prefix, "fraction_func", 2.5, 3, size=9.5, extra_artists=lgd) print "Outputted Figure: {0}".format(datetime.datetime.now())
def main(seq_file, canonical_file, output_prefix): series = [] canonical_list_seq = seq_IO.read_sequences(canonical_file) print "Beginning Script: {0}".format(datetime.datetime.now()) for canonical in canonical_list_seq: with open(seq_file) as strings: seq_list = strings.read().splitlines() seq_ind_list = [ (seq, ind) for ind, seq in enumerate(seq_list) ] orig_len = len(seq_ind_list) if canonical not in seq_list: one_away = gsconv.gen_hamdist_one(canonical) one_away = [ o for o in one_away if o != canonical ] + [canonical] seq_ind_list = seq_ind_list[:] + [ (o, ind) for (ind, o) in enumerate(one_away, len(seq_ind_list)) ] edges = [(seq2,seq) for seq,seq2 in itertools.combinations(seq_ind_list,2) if gsconv.hamdist(seq2[0],seq[0]) < 2 ] print len(seq_ind_list) print "Generated Edges: {0}".format(datetime.datetime.now()) numpy.set_printoptions(threshold='nan') canon_ind=[ i for (s, i) in seq_ind_list if s == canonical ][0] T_mat = trans_matrix(seq_ind_list,edges) #print raise_matrix(T_mat,1) #print raise_matrix(T_mat,3) #T = raise_matrix(T_mat,10) #T = raise_matrix(T_mat,20) x = [0] y = [0] print "Transformed Matrix: {0}".format(datetime.datetime.now()) x.append(1) y.append(find_frac(T_mat, canon_ind, orig_len)) T_mat_new = T_mat for i in range(2,23): x.append(i) T_mat_new, frac = square_matrix(T_mat_new,T_mat,canon_ind, orig_len) y.append(frac) print "Raised Matrix {0}: {1}".format(i, datetime.datetime.now()) series.append([x,y,canonical]) fig, ax = conv.create_ax(1, 1) color=['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue'] scatterplot.plot_series( ax[0,0], series, title="", x_axis="Number of Steps", colors=color, y_axis="Fraction Cleaved Variants Reached", alpha=0.85, connect_dots=True, size=15, edgecolors='k', linewidth=0) ax[0,0].set_xlim(xmin=1) ax[0,0].set_ylim(ymin=0.0, ymax=1.0) ax[0,0].set_xticks(xrange(1,23,3)) lgd = conv.add_legend(ax[0,0], location='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, size=8) conv.save_fig(fig, output_prefix, 
"fraction_func", 2.5, 3, size=9.5, extra_artists=lgd) print "Outputted Figure: {0}".format(datetime.datetime.now())
def main(list_sequence_names, output_prefix): list_sequences = [ ] #list of list of sequences, where each item represents a label extended_list_sequences = [] #flat list of sequences labels = [] #labels for list_sequences for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={ 1: float, 2: float }) print sequences[0:10] list_sequences.append(sequences) extended_list_sequences.extend(sequences[:]) labels.append(label) print len(extended_list_sequences) dict_seq_fit = { seq: fitness for (seq, fitness, ratio) in extended_list_sequences } dict_seq_ratio = { seq: ratio for (seq, fitness, ratio) in extended_list_sequences } print len(dict_seq_fit) epi = {} outfile_epi = '%s_epi_double.csv' % (output_prefix) epi_double_out = open(outfile_epi, "w") outfile_epi = '%s_epi.csv' % (output_prefix) epi_out = open(outfile_epi, "w") mut_func = { "Both_Functional": [], "Both_Nonfunctional": [], "One_Functional": [] } mut_nonfunc = { "Both_Functional": [], "Both_Nonfunctional": [], "One_Functional": [] } prod = itertools.product(list_sequences[labels.index("CLEAVED")], extended_list_sequences) pairs = set() counter = 0 counter_prod = 0 for x, y in prod: counter_prod += 1 if x[0] != y[0]: counter += 1 pairs.add(frozenset((x, y))) print counter_prod print len(pairs) print counter print "done making set" for can, seq_fit in pairs: canonical_seq = can[0] seq = seq_fit[0] fit = seq_fit[1] mut_dict = mut_func if fit == 1 else mut_nonfunc dist = conv.hamdist(canonical_seq, seq) if dist <= 1: continue list_inter, list_fit = get_inter_fitness(canonical_seq, seq, dict_seq_fit) if None not in list_fit: if dist == 2: sum_fit = sum(list_fit) if sum_fit > 1.95: mut_dict["Both_Functional"].append( (canonical_seq, seq, list_inter, list_fit)) elif sum_fit < 0.05: mut_dict["Both_Nonfunctional"].append( (canonical_seq, seq, list_inter, list_fit)) else: #either one uncleaved or one middle mut_dict["One_Functional"].append( 
(canonical_seq, seq, list_inter, list_fit)) epi[(canonical_seq, seq)] = (calc_epi(list_fit, fit), fit, list_fit, list_inter) print "done calc epi" '''epi_double_out.write("Starting,Starting_Ratio,Ending,Ending_Ratio,Status_Ending,Status_Intermediates,Inter1_Seq,Inter1_Fit,Inter1_Ratio,Inter2_Seq,Inter2_Fit,Inter2_Ratio\n") for label, list_muts in mut_func.items(): for (can, seq, list_inter, list_fit) in list_muts: epi_double_out.write("{start},{start_ratio},{end},{end_ratio},End_Cleaved,{label},{data}\n".format(label=label,start=can,end=seq, start_ratio=dict_seq_ratio[can],end_ratio=dict_seq_ratio[seq], data = ",".join([ "{0},{1},{2}".format(seq,fitness_to_str(fit),dict_seq_ratio[seq]) for seq,fit in zip(list_inter,list_fit)])) ) for label, list_muts in mut_nonfunc.items(): for (can, seq, list_inter, list_fit) in list_muts: epi_double_out.write("{start},{start_ratio},{end},{end_ratio},End_Uncleaved,{label},{data}\n".format(label=label,start=can,end=seq, start_ratio=dict_seq_ratio[can],end_ratio=dict_seq_ratio[seq], data = ",".join([ "{0},{1},{2}".format(seq,fit,dict_seq_ratio[seq]) for seq,fit in zip(list_inter,list_fit)])) ) ''' epi_out.write( "Starting,Starting_Ratio,Ending,Ending_Ratio,Ending_Fitness,Epistasis,List_Seqs_Fitnesses_Ratios_Intermediates\n" ) epi_out.write("\n".join([ "{0},{1},{2},{3},{4},{5},{6}".format( can, dict_seq_ratio[can], seq, dict_seq_ratio[seq], fitness_to_str(fit), e, ",".join([ "{0},{1},{2}".format(s, fitness_to_str(f), dict_seq_ratio[s]) for f, s in zip(list_fit, list_inter) ])) for (can, seq), (e, fit, list_fit, list_inter) in epi.items() ])) epi_out.close() epi_double_out.close() print "done writing epi"
def main(list_sequence_names, canonical_list, output_prefix): series = [] canonical_list_seq = seq_IO.read_sequences(canonical_list) cleaved_seqs = seq_IO.read_sequences( [s for s, l in list_sequence_names if l == "CLEAVED"][0]) uncleaved_seqs = seq_IO.read_sequences( [s for s, l in list_sequence_names if l == "UNCLEAVED"][0]) min_dist = [] avg_dist = [] max_dist = [] for seq in cleaved_seqs: distances = [conv.hamdist(seq, unc) for unc in uncleaved_seqs] min_dist.append(min(distances)) avg_dist.append(numpy.mean(distances)) max_dist.append(max(distances)) if seq in canonical_list_seq: print seq print min_dist[-1] print avg_dist[-1] print max_dist[-1] fig, ax = pconv.create_ax(1, 3) hist.draw_actual_plot(ax[0, 0], min_dist, "Min. Distance from Boundary", "Minimum Distances", log=False, normed=True, label=None, nbins=15, stacked=False) hist.draw_actual_plot(ax[1, 0], avg_dist, "Avg. Distance from Boundary", "Average Distances", log=False, normed=True, label=None, nbins=15, stacked=False) hist.draw_actual_plot(ax[2, 0], max_dist, "Max. Distance from Boundary", "Maximum Distances", log=False, normed=True, label=None, nbins=15, stacked=False) #ax[0,0].set_xlim(xmin=1,xmax=5) #ax[0,0].set_xticks(xrange(1,6)) pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
def find_seqs_less_than(can, sequences, set_sequences, hamm_dist):
    """Return a new set: set_sequences plus every sequence in `sequences`
    that is "close" to `can`.

    hamm_dist == -1 selects chemically similar sequences (chem_sim);
    otherwise sequences within Hamming distance hamm_dist (inclusive).
    The incoming set is not mutated.
    """
    if hamm_dist == -1:
        close = [s for s in sequences if chem_sim(s, can)]
    else:
        close = [s for s in sequences if conv.hamdist(s, can) <= hamm_dist]
    return set_sequences.union(close)
def find_seqs_more_than_first(can, sequences, set_sequences, hamm_dist):
    """Build a FRESH selection relative to `can`, discarding the incoming
    `set_sequences` (hence "_first").

    hamm_dist == -1 selects chemically similar sequences (chem_sim);
    otherwise sequences STRICTLY farther than hamm_dist.
    NOTE(review): the -1 branch selects similar sequences, same as
    find_seqs_less_than -- confirm this asymmetry is intended.
    """
    if hamm_dist == -1:
        selected = {s for s in sequences if chem_sim(s, can)}
    else:
        selected = {s for s in sequences if conv.hamdist(s, can) > hamm_dist}
    return selected