def main(input_dir, canonical_file, output_prefix, hamm_dist):
    """For each *_cleaved.txt file, collect sequences similar to each
    canonical sequence and write one CSV row per (file, canonical) pair.

    Similarity is chemical similarity (chem_sim) when hamm_dist == -1,
    otherwise Hamming distance <= hamm_dist.
    """
    list_seq_files = glob.glob(os.path.join(input_dir, "*_cleaved.txt"))
    dict_sequences = {}
    canonical_sequences = seq_IO.read_sequences(canonical_file)
    for filename in list_seq_files:
        sequences = seq_IO.read_sequences(filename)
        for can in canonical_sequences:
            if hamm_dist == -1:
                # -1 is a sentinel meaning "use chemical similarity instead"
                seq_sim = [seq for seq in sequences if chem_sim(seq, can)]
            else:
                seq_sim = [seq for seq in sequences
                           if conv.hamdist(seq, can) <= hamm_dist]
            if seq_sim:
                dict_sequences[(filename, can)] = seq_sim
    outfile_canon = '%scanonical_sim_cleaved%d.csv' % (output_prefix, hamm_dist)
    # with-statement guarantees the handle is flushed and closed
    # (the original left the file object open)
    with open(outfile_canon, "w") as canon_out:
        for (filename, can), seqs in dict_sequences.items():
            canon_out.write(filename + "," + can + "," + ','.join(seqs) + "\n")
def main(data_file, output_prefix, degree_file, width, height):
    """Plot a histogram of the per-sequence mean 'Frac' values, plus a bar
    chart of that mean (with std-dev error bars) grouped by node degree."""
    sequences = seq_IO.read_sequences(data_file, additional_params=True,
                                      header=True, list_vals=True)
    seq_degree = seq_IO.read_sequences(degree_file, additional_params=True,
                                       header=True)
    # bucket each sequence's mean fraction under its degree
    degree_frac = defaultdict(list)
    for seq, params in sequences.items():
        degree_frac[seq_degree[seq]['Degree']].append(np.mean(params["Frac"]))
    data = [np.mean(params["Frac"]) for seq, params in sequences.items()]
    degree_frac_avg = [np.mean(fracs) for deg, fracs in degree_frac.items()]
    degree_frac_std = [np.std(fracs) for deg, fracs in degree_frac.items()]
    fig, axarr = pconv.create_ax(1, 1, shx=False, shy=False)
    hist.draw_actual_plot(axarr[0, 0], data, "", "", normed=False, nbins=30,
                          edgecolor=None, log=False)
    #axarr[0,0].ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
    pconv.save_fig(fig, output_prefix, "hist", width, height, tight=True, size=10)
    fig2, axarr2 = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr2[0, 0], degree_frac_avg, 'g', "", "Degree",
                         "Fraction Shortest Path Uncleaved",
                         tick_label=degree_frac.keys(), yerr=degree_frac_std)
    #axarr[0,0].set_ylim([0,1.3])
    pconv.save_fig(fig2, output_prefix, "bar", width, height, tight=True, size=10)
def main(sequence_list, canonical_seq_list, known_cleaved): sequences = seq_IO.read_sequences(sequence_list, additional_params=True) canonical_seqs = seq_IO.read_sequences(canonical_seq_list) known_cleaved_list = seq_IO.read_sequences(known_cleaved) base = os.path.splitext(sequence_list)[0] cleaved_seqs = [ (s[0],s[1],s[2],min([conv.hamdist(s[0],c) for c in canonical_seqs])) for s in sequences if s[1] == 'CLEAVED' and s[2] > 2.0 and s[0] not in known_cleaved_list] uncleaved_seqs = [ (s[0],s[1],s[2],min([conv.hamdist(s[0],c) for c in canonical_seqs])) for s in sequences if s[1] == 'UNCLEAVED' and s[2] < -2.0 and s[0] not in known_cleaved_list] cl_s_dist = [ s[2] for s in cleaved_seqs] uncl_s_dist = [s[2] for s in uncleaved_seqs] print max(cl_s_dist) print min(uncl_s_dist) cleaved_seqs_low_ham = sorted(cleaved_seqs, key=lambda x: (x[3], -x[2]))[0:4] cleaved_seqs_hi_ham = sorted(cleaved_seqs, key=lambda x: (-x[3], -x[2]))[0:4] uncleaved_seqs_low_ham = sorted(uncleaved_seqs, key=lambda x: (x[3], x[2]))[0:4] uncleaved_seqs_hi_ham = sorted(uncleaved_seqs, key=lambda x: (-x[3], x[2]))[0:4] outfile = '%s_selected.csv' % (base) out = open(outfile,"w") out.write("Cleaved_seqs_low_hamming_distance\n") out.write("\n".join( [ ",".join(map(str,s)) for s in cleaved_seqs_low_ham ] )) out.write("\nCleaved_seqs_high_hamming_distance\n") out.write("\n".join( [ ",".join(map(str,s)) for s in cleaved_seqs_hi_ham ] )) out.write("\nUncleaved_seqs_low_hamming_distance\n") out.write("\n".join( [ ",".join(map(str,s)) for s in uncleaved_seqs_low_ham ] )) out.write("\nUncleaved_seqs_high_hamming_distance\n") out.write("\n".join( [ ",".join(map(str,s)) for s in uncleaved_seqs_hi_ham ] ))
def main(sequence_list, trained_cleaved, trained_uncleaved):
    """Annotate each classified sequence with its minimum Hamming distance to
    the matching training set and write both groups to <base>_selected_hamm.csv."""
    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)
    trained_cleaved_list = seq_IO.read_sequences(trained_cleaved)
    trained_uncleaved_list = seq_IO.read_sequences(trained_uncleaved)
    base = os.path.splitext(sequence_list)[0]
    # tuples: (seq, status, min Hamming distance to the relevant training set)
    cleaved_seqs = [(s[0], s[1],
                     min([conv.hamdist(s[0], c) for c in trained_cleaved_list]))
                    for s in sequences if s[1] == 'CLEAVED']
    uncleaved_seqs = [(s[0], s[1],
                       min([conv.hamdist(s[0], c) for c in trained_uncleaved_list]))
                      for s in sequences if s[1] == 'UNCLEAVED']
    outfile = '%s_selected_hamm.csv' % (base)
    # with-statement ensures the output file is closed (it was leaked before)
    with open(outfile, "w") as out:
        out.write("Cleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs]))
        out.write("\nUncleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in uncleaved_seqs]))
def main(input_file, canonical_file, output_prefix, hamm_dist, less_than, more_than):
    """Collect sequences whose Hamming distance to the canonical set falls
    below (--less_than) or above (--more_than) the threshold, then write the
    resulting set to a CSV named after the chosen mode."""
    set_sequences = set()
    canonical_sequences = seq_IO.read_sequences(canonical_file)
    sequences = seq_IO.read_sequences(input_file)
    for ind, can in enumerate(canonical_sequences):
        # flag validation kept inside the loop so an empty canonical list
        # still silently produces an empty output file, as before
        if less_than and more_than:
            raise ValueError('Cannot set both --less_than and --more_than')
        elif less_than:
            set_sequences = find_seqs_less_than(can, sequences,
                                                set_sequences, hamm_dist)
        elif more_than and ind == 0:
            # the first canonical sequence seeds the "more than" set
            set_sequences = find_seqs_more_than_first(can, sequences,
                                                      set_sequences, hamm_dist)
        elif more_than:
            set_sequences = find_seqs_more_than(can, sequences,
                                                set_sequences, hamm_dist)
        else:
            raise ValueError(
                'Cannot have both --less_than and --more_than as false')
    less_v_more = "less" if less_than else "more"
    outfile_canon = '%scanonical_sim_cleaved_%s_%d.csv' % (
        output_prefix, less_v_more, hamm_dist)
    with open(outfile_canon, "w") as canon_out:
        canon_out.write('\n'.join(set_sequences))
def main(list_sequence_names, output_prefix, canonical_file): list_sequences = [] #list of list of sequences, where each item represents a label extended_list_sequences = [] #flat list of sequences labels = [] #labels for list_sequences canonical_seqs = seq_IO.read_sequences(canonical_file) for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float}) list_sequences.append(sequences) extended_list_sequences.extend(sequences[:]) labels.append(label) dict_sequences = { seq : fitness for (seq, fitness) in extended_list_sequences } epi = {} for canonical_seq in canonical_seqs: mut_func = { "Both_Functional" : [], "Both_Nonfunctional" : [], "One_Functional" : [] } mut_nonfunc = { "Both_Functional" : [], "Both_Nonfunctional" : [], "One_Functional" : [] } outfile_epi = '%s_%s_epi.csv' % (output_prefix, canonical_seq) epi_out = open(outfile_epi,"w") print canonical_seq epi = {} double_mut = [ seq for seq in extended_list_sequences if conv.hamdist(canonical_seq, seq[0]) == 2 ] for seq_fit in extended_list_sequences: seq = seq_fit[0] fit = seq_fit[1] mut_dict = mut_func if fit == 1000 else mut_nonfunc list_fit = get_inter_fitness(canonical_seq, seq, dict_sequences) if len(list_fit) <= 1: continue if all(list_fit): if seq_fit in double_mut: sum_fit = sum(list_fit) print sum_fit if sum_fit == 2000: mut_dict["Both_Functional"].append((canonical_seq, seq)) elif sum_fit == 0: mut_dict["Both_Nonfunctional"].append((canonical_seq, seq)) elif sum_fit == 1000: mut_dict["One_Functional"].append((canonical_seq, seq)) epi[seq] = (calc_epi(list_fit, fit),list_fit+[fit]) epi_out.write("Total Double Mutants,%s\n" % (len(double_mut))) for label, list_muts in mut_func.items(): for (can, seq) in list_muts: epi_out.write("End Functional,%s,%s,%s\n" % (label,can,seq) ) for label, list_muts in mut_nonfunc.items(): for (can, seq) in list_muts: epi_out.write("End Functional,%s,%s,%s\n" % (label,can,seq) ) 
epi_out.write("\n".join(["{0},{1},{2}".format(seq,epi,",".join([str(f) for f in fits])) for seq, (epi,fits) in epi.items()] ) ) epi_out.close()
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file): list_sequences = [ ] #list of list of sequences, where each item represents a label extended_list_sequences = [] #flat list of sequences labels = [] #labels for list_sequences canonical_seqs = seq_IO.read_sequences(canonical_file) for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1: float}) new_seqs = [(seq, fitness, min([conv.hamdist(seq, can) for can in canonical_seqs]) <= 2) for seq, fitness in sequences] list_sequences.append(new_seqs) extended_list_sequences.extend(new_seqs[:]) labels.append(label) outfile_nodes = '%s_nodes.csv' % (output_prefix) edges = [ (seq2, seq) for seq, seq2 in itertools.combinations(extended_list_sequences, 2) if conv.hamdist(seq2[0], seq[0]) == hamming_dist ] for canonical_seq in canonical_seqs: outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq) edges_out = open(outfile_edges, "w") edges_out.write("Source,Target,Weight\n") print canonical_seq for ([seq1, fit1, can1], [seq2, fit2, can2]) in edges: dist_seq1 = conv.hamdist(canonical_seq, seq1) dist_seq2 = conv.hamdist(canonical_seq, seq2) fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2 fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1 fit_upper = fit_upper if fit_upper > 0 else 0.001 seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2 seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1 out_str = "{0},{1},{2}\n".format(seq_lower, seq_upper, fit_lower / float(fit_upper)) edges_out.write( out_str) #does this have the correct directionality? 
edges_out.close() already_written_nodes = [] nodes_out = open(outfile_nodes, "w") nodes_out.write("Id,Label,Type,Fitness,Canonical\n") for seqs, label in zip(list_sequences, labels): nodes_out.write("\n".join( "{0},{0},{1},{2},{3}".format(x, label, fitness, can) for (x, fitness, can) in seqs if x not in already_written_nodes)) already_written_nodes.extend([s[0] for s in seqs]) nodes_out.write("\n")
def main(list_sequence_names, canonical_list, output_prefix, func_labels, unfunc_labels): series = [] canonical_list_seq = seq_IO.read_sequences(canonical_list) for canonical in canonical_list_seq: dict_sequences = {} for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename) distances = [conv.hamdist(seq, canonical) for seq in sequences] dict_sequences[label] = { i: sum([d for d in distances if d == i]) for i in xrange(1, 6) } x = [] y = [] for i in xrange(1, 6): func = 0.0 unfunc = 0.0 for label, dict_sums in dict_sequences.items(): if label in func_labels: func = func + dict_sums[i] elif label in unfunc_labels: unfunc = unfunc + dict_sums[i] if unfunc != 0: x.append(i) y.append(func / (func + unfunc)) print x print y series.append([x, y, canonical]) fig, ax = pconv.create_ax(1, 1) scatterplot.plot_series(ax[0, 0], series, title="", x_axis="# of Mutations", y_axis="Fraction of Variants that are Functional", alpha=1.0, connect_dots=True, size=30, edgecolors='k') ax[0, 0].set_xlim(xmin=1, xmax=5) ax[0, 0].set_xticks(xrange(1, 6)) pconv.save_fig(fig, output_prefix, canonical + "_fraction_func_mutant", 6, 6, size=15)
def main(json_file, output_prefix, novel_seqs_file, canonical_file): print "Started Script: {0}".format(datetime.datetime.now()) with open(json_file) as data_file: data = json.load(data_file) G = json_graph.node_link_graph(data, directed=False) print "Finished Reading in Graph: {0}".format(datetime.datetime.now()) id_seq = networkx.get_node_attributes(G, "sequence") id_status = networkx.get_node_attributes(G, "status") seq_id = { seq : node_id for node_id, seq in id_seq.items()} print "Created inverse lookup table: {0}".format(datetime.datetime.now()) novel_seqs = seq_IO.read_sequences(novel_seqs_file) canonical_seqs = seq_IO.read_sequences(canonical_file) novel_fracs = {} print "Ready to enter loop: {0}".format(datetime.datetime.now()) for n in novel_seqs: novel_fracs[n] = {} hamm_dist = sorted([ (conv.hamdist(n,c),c) for c in canonical_seqs ]) min_hamm_dist = hamm_dist[0][0] print "Found hamming distances: {0}".format(datetime.datetime.now()) for hamm, c in hamm_dist: #only analyze min_dist canonical sequences if hamm != min_hamm_dist: continue novel_fracs[n][c] = [] #generate list of 5 paths #paths = itertools.islice(networkx.all_shortest_paths(G, seq_id[n], seq_id[c]), 5) paths = [ networkx.shortest_path(G, seq_id[n], seq_id[c]) ] for path in paths: inter_nodes = path[1:-1] novel_fracs[n][c].append(float(sum([ 1 for node_id in inter_nodes if id_status[node_id] == "UNCLEAVED" ]))/len(inter_nodes)) base_n_file = os.path.basename(os.path.splitext(novel_seqs_file)[0]) base_c_file = os.path.basename(os.path.splitext(canonical_file)[0]) with open("{0}_frac_paths_{1}_{2}.txt".format(output_prefix, base_n_file, base_c_file), 'w') as o: for n, c_dict in novel_fracs.items(): for c, fracs_list in c_dict.items(): o.write("{0},{1},".format(n,c)) o.write(",".join(map(str,fracs_list))) o.write("\n") print "Output paths: {0}".format(datetime.datetime.now())
def main(sequence_list, canonical_seq_list, known_cleaved): sequences = seq_IO.read_sequences(sequence_list, additional_params=True) canonical_seqs = seq_IO.read_sequences(canonical_seq_list) known_cleaved_list = seq_IO.read_sequences(known_cleaved) base = os.path.splitext(sequence_list)[0] cleaved_seqs = [ (s[0], s[1], s[2], min([conv.hamdist(s[0], c) for c in canonical_seqs])) for s in sequences if s[1] == 'CLEAVED' and s[2] > 2.0 and s[0] not in known_cleaved_list ] uncleaved_seqs = [(s[0], s[1], s[2], min([conv.hamdist(s[0], c) for c in canonical_seqs])) for s in sequences if s[1] == 'UNCLEAVED' and s[2] < -2.0 and s[0] not in known_cleaved_list] cl_s_dist = [s[2] for s in cleaved_seqs] uncl_s_dist = [s[2] for s in uncleaved_seqs] print max(cl_s_dist) print min(uncl_s_dist) cleaved_seqs_low_ham = sorted(cleaved_seqs, key=lambda x: (x[3], -x[2]))[0:4] cleaved_seqs_hi_ham = sorted(cleaved_seqs, key=lambda x: (-x[3], -x[2]))[0:4] uncleaved_seqs_low_ham = sorted(uncleaved_seqs, key=lambda x: (x[3], x[2]))[0:4] uncleaved_seqs_hi_ham = sorted(uncleaved_seqs, key=lambda x: (-x[3], x[2]))[0:4] outfile = '%s_selected.csv' % (base) out = open(outfile, "w") out.write("Cleaved_seqs_low_hamming_distance\n") out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs_low_ham])) out.write("\nCleaved_seqs_high_hamming_distance\n") out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs_hi_ham])) out.write("\nUncleaved_seqs_low_hamming_distance\n") out.write("\n".join( [",".join(map(str, s)) for s in uncleaved_seqs_low_ham])) out.write("\nUncleaved_seqs_high_hamming_distance\n") out.write("\n".join([",".join(map(str, s)) for s in uncleaved_seqs_hi_ham]))
def main(sequence_ratio_file, width, height, pattern, legend):
    """Bar-plot the median (with std-dev error bars) of each shell column in
    the sequence/ratio file."""
    sequences = seq_IO.read_sequences(sequence_ratio_file, additional_params=True)
    # columns 1..N-1 hold per-shell values; column 0 is the sequence itself
    shell_data = [[seq[shell] for seq in sequences]
                  for shell in xrange(1, len(sequences[0]))]
    avg = [np.median(column) for column in shell_data]
    std = [np.std(column) for column in shell_data]
    label = xrange(1, 4)
    #check if std has to be fixed
    #if sum([ 1 for a, s in zip(avg_ratio, std) if a - s < 0 ]):
    #    min_err = [ a - s if a - s >= 0.0 else 0 for a,s in zip(avg_ratio, std) ]
    #    max_err = [ a + s for a,s in zip(avg_ratio, std) ]
    #    err = [min_err, max_err]
    #else:
    #    err = std
    err = std
    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr[0, 0], avg,
                         ['lightsteelblue', 'lightblue', 'darkgray'], "",
                         "Shell", "Fraction Cleaved", tick_label=label,
                         yerr=err)
    #axarr[0,0].set_ylim([0,1.3])
    pconv.save_fig(fig, sequence_ratio_file, "plot", width, height,
                   tight=True, size=10)
def main(list_sequence_names, output_prefix):
    """Histogram, over cleaved sequences with at least one cleaved or
    uncleaved neighbor, the fraction of Hamming-1 neighbors in each class."""
    per_label_seqs = []  # one sequence list per input label
    labels = []
    for [filename, label] in list_sequence_names:
        per_label_seqs.append(seq_IO.read_sequences(filename))
        labels.append(label)
    cleaved = per_label_seqs[labels.index("CLEAVED")]
    middle = per_label_seqs[labels.index("MIDDLE")]
    uncleaved = per_label_seqs[labels.index("UNCLEAVED")]
    frac_uncleaved = {}
    frac_cleaved = {}
    frac_middle = {}
    for seq in cleaved:
        # count Hamming-distance-1 neighbors of seq in each class
        n_cleaved = sum([1 for s in cleaved if conv.hamdist(seq, s) == 1])
        n_uncleaved = sum([1 for s in uncleaved if conv.hamdist(seq, s) == 1])
        n_middle = sum([1 for s in middle if conv.hamdist(seq, s) == 1])
        if n_cleaved > 0 or n_uncleaved > 0:
            total = n_uncleaved + n_middle + n_cleaved
            frac_uncleaved[seq] = float(n_uncleaved) / total
            frac_cleaved[seq] = float(n_cleaved) / total
            frac_middle[seq] = float(n_middle) / total
    fig, ax = pconv.create_ax(3, 1)
    hist.draw_actual_plot(ax[0, 0], frac_cleaved.values(),
                          "Landscape Near Cleaved Sequences",
                          "Fraction of Neighbors Cleaved",
                          log=False, normed=False, nbins=20)
    hist.draw_actual_plot(ax[0, 1], frac_middle.values(),
                          "Landscape Near Cleaved Sequences",
                          "Fraction of Neighbors Middle",
                          log=False, normed=False, nbins=20)
    hist.draw_actual_plot(ax[0, 2], frac_uncleaved.values(),
                          "Landscape Near Cleaved Sequences",
                          "Fraction of Neighbors Uncleaved",
                          log=False, normed=False, nbins=20)
    pconv.save_fig(fig, output_prefix, "fraction_neighbors", 15, 5, size=10)
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    """Build a node/edge JSON graph of all sequences.

    Nodes carry label/fitness/canonical-likeness; edges connect
    Hamming-distance-1 neighbors, directed from the sequence closer to the
    canonical one and weighted by the ratio of their fitnesses.
    """
    list_sequences = [] #list of list of sequences, where each item represents a label
    extended_list_sequences = [] #flat list of sequences
    labels = [] #labels for list_sequences
    #canonical_seqs = seq_IO.read_sequences(canonical_file)
    canonical_seqs = ["DEMEE"] #left other code here in case want to try it from all cleaved sequences
    # sequence -> (seq, fitness, canonical_like) lookup, accumulated over ALL files
    dict_sequences = {}
    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float})
        # canonical_like flag: within 2 substitutions of any canonical sequence
        new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ]
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs[:])
        dict_sequences.update({ n[0] : n for n in new_seqs })
        labels.append(label)
    edges = []
    # (seq, neighbor) pairs already recorded, used to skip the reversed duplicate
    edges_set = set()
    print "Read in Data: {0}".format(datetime.datetime.now())
    for seq, fitness, canonical_like in extended_list_sequences:
        # all sequences exactly one substitution away from seq
        neighbors = conv.gen_hamdist_one(seq)
        edges_set.update([ (seq, n) for n in neighbors if n in dict_sequences ])
        # keep an edge only if the reversed pair was not already seen
        edges += [((seq, fitness, canonical_like), dict_sequences[n] ) for n in neighbors if n in dict_sequences and (n,seq) not in edges_set ]
    print "Generated Edges: {0}".format(datetime.datetime.now())
    print edges[0:10]
    seq_id = { seq[0] : ind for ind, seq in enumerate(extended_list_sequences) }
    nodes = []
    for seqs, label in zip(list_sequences, labels):
        nodes.extend([ { "id" : seq_id[seq[0]], "sequence" : seq[0], "status" : label, "fitness" : seq[1], "canonical_like" : seq[2] } for seq in seqs ])
    print "Generated List of Nodes: {0}".format(datetime.datetime.now())
    links = []
    for canonical_seq in canonical_seqs:
        print canonical_seq
        for ((seq1,fit1,can1),(seq2,fit2,can2)) in edges:
            dist_seq1 = conv.hamdist(canonical_seq, seq1)
            dist_seq2 = conv.hamdist(canonical_seq, seq2)
            # orient the edge from the sequence nearer the canonical one
            fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2
            fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1
            # floor at 0.001 to avoid dividing by a zero fitness below
            fit_upper = fit_upper if fit_upper > 0 else 0.001
            seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2
            seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1
            links.append({ "source" : seq_id[seq_lower], "target" : seq_id[seq_upper], "weight" : fit_lower/float(fit_upper) } )
    print "Generated List of Edges: {0}".format(datetime.datetime.now())
    output = { "nodes" : nodes, "links" : links }
    with open('{0}nodes_edges.json'.format(output_prefix), 'w') as fp:
        json.dump(output, fp)
    print "Dumped Nodes and Edges Lists: {0}".format(datetime.datetime.now())
def main(list_sequence_names, output_prefix):
    """Write per-class CSVs of neighbor-fraction statistics (cleaved / middle
    / uncleaved) for every sequence in each class."""
    seq_lists = []  # one sequence list per input label
    labels = []
    for [filename, label] in list_sequence_names:
        seq_lists.append(seq_IO.read_sequences(filename))
        labels.append(label)
    cleaved = seq_lists[labels.index("CLEAVED")]
    middle = seq_lists[labels.index("MIDDLE")]
    uncleaved = seq_lists[labels.index("UNCLEAVED")]
    fracs_cleaved = conv.fraction_neighbors_all(cleaved, uncleaved, middle, cleaved)
    fracs_uncleaved = conv.fraction_neighbors_all(cleaved, uncleaved, middle, uncleaved)
    fracs_middle = conv.fraction_neighbors_all(cleaved, uncleaved, middle, middle)
    # one CSV per class, same layout, written cleaved -> middle -> uncleaved
    for fname, fracs in [("{0}_cleaved.csv".format(output_prefix), fracs_cleaved),
                         ("{0}_middle.csv".format(output_prefix), fracs_middle),
                         ("{0}_uncleaved.csv".format(output_prefix), fracs_uncleaved)]:
        with open(fname, 'w') as f:
            f.write("Sequence,Frac_Cleaved,Frac_Middle,Frac_Uncleaved\n")
            f.write("".join(["{0},{1},{2},{3}\n".format(k, str(v[0]), str(v[1]), str(v[2]))
                             for k, v in fracs.items()]))
def main(list_sequence_names, conversion_type="alpha"):
    """Convert sequences to feature vectors ('alpha' or 'binary' encoding)
    and write them to <base>_sequence_features_<type>.csv.

    Passing the literal string "random" generates random sequences instead
    of reading a file.
    """
    if list_sequence_names == "random":
        # 5 = current sequence length of interest; could be parameterized
        sequences = conv.generate_random_seqs(5)
    else:
        sequences = seq_IO.read_sequences(list_sequence_names)
    if conversion_type == "alpha":
        sequence_features = conv_alpha_seq(sequences)
    elif conversion_type == "binary":
        sequence_features = conv_binary_seq(sequences)
    else:
        raise Exception("Conversion type must be binary or alpha")
    base = os.path.splitext(list_sequence_names)[0]
    outfile = '%s_sequence_features_%s.csv' % (base, conversion_type)
    # with-statement ensures the output file is closed (it was leaked before)
    with open(outfile, "w") as out:
        #out.write(','.join(["Sequence"] + [ str(i) for i in xrange(1,len(sequences[0])+1)] ))
        #out.write("\n")
        for seq, features in sorted(sequence_features.items()):
            out.write(",".join([seq] + (features)))
            #out.write(",".join( features ) )
            out.write("\n")
def main(list_sequence_names, conversion_type="alpha"):
    """Convert sequences to feature vectors ('alpha' or 'binary' encoding)
    and write them to <base>_sequence_features_<type>.csv.

    Passing the literal string "random" generates random sequences instead
    of reading a file.
    """
    if list_sequence_names == "random":
        # 5 = current sequence length of interest; could be parameterized
        sequences = conv.generate_random_seqs(5)
    else:
        sequences = seq_IO.read_sequences(list_sequence_names)
    if conversion_type == "alpha":
        sequence_features = conv_alpha_seq(sequences)
    elif conversion_type == "binary":
        sequence_features = conv_binary_seq(sequences)
    else:
        raise Exception("Conversion type must be binary or alpha")
    base = os.path.splitext(list_sequence_names)[0]
    outfile = '%s_sequence_features_%s.csv' % (base, conversion_type)
    # with-statement ensures the output file is closed (it was leaked before)
    with open(outfile, "w") as out:
        #out.write(','.join(["Sequence"] + [ str(i) for i in xrange(1,len(sequences[0])+1)] ))
        #out.write("\n")
        for seq, features in sorted(sequence_features.items()):
            out.write(",".join([seq] + (features)))
            #out.write(",".join( features ) )
            out.write("\n")
def main(sequences_ratio_file):
    """Bar-plot the mean FLAG/HA ratio per sequence, with asymmetric
    min/max error bars derived from the three replicate measurements."""
    sequences_ratio = seq_IO.read_sequences(sequences_ratio_file,
                                            additional_params=True)
    # column 0: sequence; columns 1-3: replicate ratios; column 4: cleaved flag
    seq_ratio_dict = [[row[1], row[2], row[3]] for row in sequences_ratio]
    seq_cleaved_dict = [row[4] for row in sequences_ratio]
    seqs = [row[0] for row in sequences_ratio]
    avg_ratio = [sum(reps) / 3.0 for reps in seq_ratio_dict]
    # error bars span from the minimum to the maximum replicate value
    min_ratio = [sum(reps) / 3.0 - min(reps) for reps in seq_ratio_dict]
    max_ratio = [max(reps) - sum(reps) / 3.0 for reps in seq_ratio_dict]
    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr[0, 0], avg_ratio, 'c', "", "Sequence",
                         "FLAG/HA Ratio", tick_label=seqs,
                         yerr=[min_ratio, max_ratio])
    pconv.save_fig(fig, sequences_ratio_file, "plot", 4, 4, tight=True, size=12)
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file): list_sequences = [] #list of list of sequences, where each item represents a label extended_list_sequences = [] #flat list of sequences labels = [] #labels for list_sequences #canonical_seqs = seq_IO.read_sequences(canonical_file) canonical_seqs = ["DEMEE"] #left other code here in case want to try it from all cleaved sequences for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float}) new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ] list_sequences.append(new_seqs) extended_list_sequences.extend(new_seqs[:]) dict_sequences = { n[0] : n for n in new_seqs } labels.append(label) edges = [] edges_set = set() print "Read in Data: {0}".format(datetime.datetime.now()) for seq, fitness, canonical_like in extended_list_sequences: neighbors = conv.gen_hamdist_one(seq) edges_set.update([ (seq, n) for n in neighbors if n in dict_sequences ]) edges += [((seq, fitness, canonical_like), dict_sequences[n] ) for n in neighbors if n in dict_sequences and (n,seq) not in edges_set ] print "Generated Edges: {0}".format(datetime.datetime.now()) print edges[0:10] seq_id = { seq[0] : ind for ind, seq in enumerate(extended_list_sequences) } nodes = [] for seqs, label in zip(list_sequences, labels): nodes.extend([ { "id" : seq_id[seq[0]], "sequence" : seq[0], "status" : label, "fitness" : seq[1], "canonical_like" : seq[2] } for seq in seqs ]) print "Generated List of Nodes: {0}".format(datetime.datetime.now()) links = [] for canonical_seq in canonical_seqs: print canonical_seq for ((seq1,fit1,can1),(seq2,fit2,can2)) in edges: dist_seq1 = conv.hamdist(canonical_seq, seq1) dist_seq2 = conv.hamdist(canonical_seq, seq2) fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2 fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1 fit_upper = fit_upper if fit_upper > 0 else 0.001 
seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2 seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1 links.append({ "source" : seq_id[seq_lower], "target" : seq_id[seq_upper], "weight" : fit_lower/float(fit_upper) } ) print "Generated List of Edges: {0}".format(datetime.datetime.now()) output = { "nodes" : nodes, "links" : links } with open('{0}nodes_edges.json'.format(output_prefix), 'w') as fp: json.dump(output, fp) print "Dumped Nodes and Edges Lists: {0}".format(datetime.datetime.now())
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file): list_sequences = [] #list of list of sequences, where each item represents a label extended_list_sequences = [] #flat list of sequences labels = [] #labels for list_sequences canonical_seqs = seq_IO.read_sequences(canonical_file) for [filename, label] in list_sequence_names: sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float}) new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ] list_sequences.append(new_seqs) extended_list_sequences.extend(new_seqs[:]) labels.append(label) outfile_nodes = '%s_nodes.csv' % (output_prefix) edges = [(seq2,seq) for seq,seq2 in itertools.combinations(extended_list_sequences,2) if conv.hamdist(seq2[0],seq[0]) == hamming_dist ] for canonical_seq in canonical_seqs: outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq) edges_out = open(outfile_edges,"w") edges_out.write("Source,Target,Weight\n") print canonical_seq for ([seq1,fit1,can1],[seq2,fit2,can2]) in edges: dist_seq1 = conv.hamdist(canonical_seq, seq1) dist_seq2 = conv.hamdist(canonical_seq, seq2) fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2 fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1 fit_upper = fit_upper if fit_upper > 0 else 0.001 seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2 seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1 out_str = "{0},{1},{2}\n".format(seq_lower,seq_upper,fit_lower/float(fit_upper)) edges_out.write(out_str) #does this have the correct directionality? edges_out.close() already_written_nodes = [] nodes_out = open(outfile_nodes,"w") nodes_out.write("Id,Label,Type,Fitness,Canonical\n") for seqs,label in zip(list_sequences,labels): nodes_out.write("\n".join("{0},{0},{1},{2},{3}".format(x, label, fitness,can) for (x,fitness,can) in seqs if x not in already_written_nodes)) already_written_nodes.extend([ s[0] for s in seqs]) nodes_out.write("\n")
def main(list_nodes, output_prefix, metric, create_keys=False):
    """Plot histograms of per-node metrics, split by cleavage type.

    metric == "metrics" plots every column except the bookkeeping ones;
    otherwise only the named metric is plotted.
    """
    if not create_keys:
        sequences = seq_IO.read_sequences(list_nodes, additional_params=True, header=True)
    else:
        sequences = seq_IO.read_sequences(list_nodes, additional_params=True, header=True, create_keys=True)
    # partition sequences by their cleavage status
    cleaved_seq = { key : val for key, val in sequences.items() if val["type"] == "CLEAVED" }
    middle_seq = { key : val for key, val in sequences.items() if val["type"] == "MIDDLE" }
    uncleaved_seq = { key : val for key, val in sequences.items() if val["type"] == "UNCLEAVED" }
    print len(cleaved_seq)
    if metric == "metrics":
        # bookkeeping columns that are not plottable metrics
        labels_non_plot = ["label", "fitness", "type", "canonical", "timeset"]
        #labels_to_plot = sorted([ key for key in sequences["YNYIN"].keys() if key not in labels_non_plot ] + ["Fraction_Cleaved"])
        # NOTE(review): "YNYIN" is a hard-coded sample sequence used only to
        # discover the metric names — assumes it is always present in the input
        labels_to_plot = sorted([ key for key in sequences["YNYIN"].keys() if key not in labels_non_plot ])
    else:
        labels_to_plot = [metric]
    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)
    nbins = 10
    for ind, key in enumerate(labels_to_plot):
        # pagerank values span orders of magnitude -> log-scale histogram
        if key == "pageranks":
            log = True
        else:
            log = False
        if key == "Fraction_Cleaved":
            # NOTE(review): the assignment of `data` for this branch is
            # commented out, so reaching it (metric == "Fraction_Cleaved")
            # raises NameError on `data` below — confirm before re-enabling
            # data = [ conv.fraction_neighbors_cleaved(cleaved_seq.keys(), uncleaved_seq.keys(), middle_seq.keys(), cleaved_seq.keys()).values(),
            #          conv.fraction_neighbors_cleaved(cleaved_seq.keys(), uncleaved_seq.keys(), middle_seq.keys(), middle_seq.keys()).values(),
            #          conv.fraction_neighbors_cleaved(cleaved_seq.keys(), uncleaved_seq.keys(), middle_seq.keys(), uncleaved_seq.keys()).values()]
            normed = True
        else:
            data = [ get_data_from_dict(cleaved_seq, key), get_data_from_dict(middle_seq, key), get_data_from_dict(uncleaved_seq, key) ]
            normed = True
        print key
        hist.draw_actual_plot(axarr[0,ind], data, "", key.capitalize(), log=log, normed=normed, label=["Cleaved", "Middle", "Uncleaved"], nbins=nbins)
        axarr[0,ind].ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        #pconv.add_legend(axarr[0,ind], location="middle right")
    pconv.save_fig(fig, output_prefix, metric, n_to_plot*2.5, 2.5, tight=True, size=9)
def main(sequence_list, trained_cleaved, trained_uncleaved):
    """Annotate each classified sequence with its minimum Hamming distance to
    the matching training set and write both groups to <base>_selected_hamm.csv."""
    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)
    trained_cleaved_list = seq_IO.read_sequences(trained_cleaved)
    trained_uncleaved_list = seq_IO.read_sequences(trained_uncleaved)
    base = os.path.splitext(sequence_list)[0]
    # tuples: (seq, status, min Hamming distance to the relevant training set)
    cleaved_seqs = [(s[0], s[1],
                     min([conv.hamdist(s[0], c) for c in trained_cleaved_list]))
                    for s in sequences if s[1] == 'CLEAVED']
    uncleaved_seqs = [(s[0], s[1],
                       min([conv.hamdist(s[0], c) for c in trained_uncleaved_list]))
                      for s in sequences if s[1] == 'UNCLEAVED']
    outfile = '%s_selected_hamm.csv' % (base)
    # with-statement ensures the output file is closed (it was leaked before)
    with open(outfile, "w") as out:
        out.write("Cleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs]))
        out.write("\nUncleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in uncleaved_seqs]))
def read_sequence_lists(list_sequence_names):
    """Read every (filename, label) pair.

    Returns a tuple of (per-label sequence lists, flat list of all
    sequences, labels), with labels parallel to the per-label lists.
    """
    list_sequences = []  # list of list of sequences, one item per label
    extended_list_sequences = []  # flat list of sequences
    labels = []  # labels for list_sequences
    for [filename, label] in list_sequence_names:
        seqs = seq_IO.read_sequences(filename, additional_params=True,
                                     ind_type={1: float, 2: float})
        list_sequences.append(seqs)
        extended_list_sequences.extend(seqs[:])
        labels.append(label)
    return list_sequences, extended_list_sequences, labels
def main(data_file, title, output_prefix):
    """Histogram the 'Degree' attribute across all sequences in data_file."""
    sequences = seq_IO.read_sequences(data_file, additional_params=True,
                                      header=True)
    degrees = [attrs["Degree"] for seq, attrs in sequences.items()]
    fig, axarr = pconv.create_ax(1, 1, shx=False, shy=False)
    hist.draw_actual_plot(axarr[0, 0], degrees, "", title.capitalize(),
                          normed=True, nbins=30, edgecolor=None, log=False)
    #axarr[0,0].ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
    pconv.save_fig(fig, output_prefix, title, 5, 5, tight=True, size=10)
def main(list_sequence_names, canonical_list, output_prefix):
    """Plot histograms of the minimum, average and maximum Hamming distance
    from each cleaved sequence to the set of uncleaved sequences."""
    canonical_seqs = seq_IO.read_sequences(canonical_list)
    cleaved_seqs = seq_IO.read_sequences([s for s, l in list_sequence_names if l == "CLEAVED"][0])
    uncleaved_seqs = seq_IO.read_sequences([s for s, l in list_sequence_names if l == "UNCLEAVED"][0])
    min_dist, avg_dist, max_dist = [], [], []
    for seq in cleaved_seqs:
        dists = [conv.hamdist(seq, other) for other in uncleaved_seqs]
        lo, mean, hi = min(dists), numpy.mean(dists), max(dists)
        min_dist.append(lo)
        avg_dist.append(mean)
        max_dist.append(hi)
        if seq in canonical_seqs:
            # Echo stats for canonical sequences so they can be eyeballed.
            print(seq)
            print(lo)
            print(mean)
            print(hi)
    fig, ax = pconv.create_ax(1, 3)
    hist.draw_actual_plot(ax[0, 0], min_dist, "Min. Distance from Boundary", "Minimum Distances", log=False, normed=True, label=None, nbins=15, stacked=False)
    hist.draw_actual_plot(ax[1, 0], avg_dist, "Avg. Distance from Boundary", "Average Distances", log=False, normed=True, label=None, nbins=15, stacked=False)
    hist.draw_actual_plot(ax[2, 0], max_dist, "Max. Distance from Boundary", "Maximum Distances", log=False, normed=True, label=None, nbins=15, stacked=False)
    #ax[0,0].set_xlim(xmin=1,xmax=5)
    #ax[0,0].set_xticks(xrange(1,6))
    pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
def main(list_sequence_names, canonical_list, output_prefix, func_labels, unfunc_labels):
    """For each canonical sequence, plot the fraction of variants at Hamming
    distances 1-5 that carry a functional label.

    list_sequence_names -- (filename, label) pairs of sequence lists
    func_labels / unfunc_labels -- labels counted as functional / non-functional
    """
    series = []
    canonical_list_seq = seq_IO.read_sequences(canonical_list)
    for canonical in canonical_list_seq:
        counts_by_label = {}
        for filename, label in list_sequence_names:
            sequences = seq_IO.read_sequences(filename)
            distances = [conv.hamdist(seq, canonical) for seq in sequences]
            # BUG FIX: the original stored sum(d for d == i) == i*count; the
            # stray factor of i cancelled in the ratio below, so plotted output
            # is unchanged, but the stored quantity was misleading.  Store the
            # plain count of variants at distance i.
            counts_by_label[label] = {i: distances.count(i) for i in xrange(1, 6)}
        x = []
        y = []
        for i in xrange(1, 6):
            func = 0.0
            unfunc = 0.0
            for label, dist_counts in counts_by_label.items():
                if label in func_labels:
                    func += dist_counts[i]
                elif label in unfunc_labels:
                    unfunc += dist_counts[i]
            if unfunc != 0:
                x.append(i)
                y.append(func / (func + unfunc))
        print(x)
        print(y)
        series.append([x, y, canonical])
    fig, ax = pconv.create_ax(1, 1)
    scatterplot.plot_series(ax[0, 0], series, title="", x_axis="# of Mutations", y_axis="Fraction of Variants that are Functional", alpha=1.0, connect_dots=True, size=30, edgecolors='k')
    ax[0, 0].set_xlim(xmin=1, xmax=5)
    ax[0, 0].set_xticks(xrange(1, 6))
    # NOTE(review): `canonical` here is the last loop value, so the figure
    # name reflects only the final canonical sequence -- confirm intended.
    pconv.save_fig(fig, output_prefix, canonical + "_fraction_func_mutant", 6, 6, size=15)
def main(list_sequence_names, output_prefix):
    """Draw a three-way Venn diagram of the sequence sets named in
    list_sequence_names ((filename, label) pairs)."""
    seq_sets = []
    labels = []
    for filename, label in list_sequence_names:
        seq_sets.append(set(seq_IO.read_sequences(filename)))
        labels.append(label)
    fig, ax = pconv.create_ax(1, 1)
    venn3(seq_sets, set_labels=labels, ax=ax[0, 0])
    pconv.save_fig(fig, output_prefix, '_'.join(labels) + "_venn", 10, 10, size=12)
def main(list_nodes, output_prefix):
    """Write one node-list file per modularity class that contains more than
    1% of all nodes."""
    sequences = seq_IO.read_sequences(list_nodes, additional_params=True, header=True)
    class_counts = Counter(get_data_from_dict(sequences, "modularity_class"))
    total = float(sum(class_counts.values()))
    for mod_class, n_nodes in class_counts.items():
        if n_nodes / total <= 0.01:
            continue  # skip rare classes
        members = [seq for seq, params in sequences.items() if params["modularity_class"] == mod_class]
        with open(output_prefix + "_{0}.txt".format(mod_class), 'w') as f:
            f.write('\n'.join(members))
def main(list_sequence_names, output_prefix, index):
    """For one 10,000-sequence slice of the UNCLEAVED set, record the fraction
    of cleaved sequences found in shells 1-3 of each sequence's neighborhood.

    index is 1-based: the slice covers [(index-1)*10000, index*10000).
    Writes <output_prefix>_uncleaved_<index>.csv with columns Sequence,1,2,3.
    """
    list_sequences = []  # one sequence list per label
    labels = []          # label for each entry of list_sequences
    for filename, label in list_sequence_names:
        list_sequences.append(seq_IO.read_sequences(filename))
        labels.append(label)
    print("Read in Sequences at: {0}".format(datetime.datetime.now()))
    cleaved_ind = labels.index("CLEAVED")
    #middle_ind = labels.index("MIDDLE")
    uncleaved_ind = labels.index("UNCLEAVED")
    cleaved_set = set(list_sequences[cleaved_ind])
    uncleaved_set = set(list_sequences[uncleaved_ind])
    # Adjacency lists and per-sequence cleaved-neighbor fractions; the MIDDLE
    # class is excluded (empty set / ignore_middle).
    adj_list_cleaved = conv.adj_list(cleaved_set, uncleaved_set, set(), cleaved_set, ignore_middle=False)
    adj_list_uncleaved = conv.adj_list(cleaved_set, uncleaved_set, set(), uncleaved_set, ignore_middle=False)
    fracs_cleaved = conv.fraction_neighbors_all(list_sequences[cleaved_ind], list_sequences[uncleaved_ind], [], list_sequences[cleaved_ind], ignore_middle=True)
    fracs_uncleaved = conv.fraction_neighbors_all(list_sequences[cleaved_ind], list_sequences[uncleaved_ind], [], list_sequences[uncleaved_ind], ignore_middle=True)
    print("Created Adj List and Fracs at: {0}".format(datetime.datetime.now()))
    # Merge into single lookups covering both classes.
    adj_list_cleaved.update(adj_list_uncleaved)
    fracs_cleaved.update(fracs_uncleaved)
    fracs_per_seq = {}
    start_ind = (index - 1) * 10000
    end_ind = index * 10000
    n_seqs = len(list_sequences[uncleaved_ind])
    if start_ind > n_seqs:
        # BUG FIX: the original used a bare `exit` (a no-op expression), so an
        # out-of-range index silently wrote a header-only file; bail out instead.
        print("This index is not valid")
        return
    if end_ind > n_seqs:
        end_ind = n_seqs
    for seq in list_sequences[uncleaved_ind][start_ind:end_ind]:
        new_neighbors = [seq]
        fracs_per_seq[seq] = []
        for _shell in xrange(0, 3):
            frac, new_neighbors = find_fraction_for_shell(new_neighbors, adj_list_cleaved, fracs_cleaved)
            fracs_per_seq[seq].append(frac)
    print("Found Fracs for Uncleaved Sequences at: {0}".format(datetime.datetime.now()))
    with open("{0}_uncleaved_{1}.csv".format(output_prefix, index), 'w') as f:
        f.write("Sequence,1,2,3\n")
        f.write("".join(["{0},{1},{2},{3}\n".format(k, str(v[0]), str(v[1]), str(v[2])) for k, v in fracs_per_seq.items()]))
def main(list_sequence_names, output_prefix, source):
    """For each amino-acid sequence in `source`, compute the fraction of its
    single-nucleotide DNA neighbors that encode a cleaved sequence, both over
    all neighbors and over neighbors whose translation has no stop codon."""
    per_label = []  # one sequence list per label
    labels = []
    for filename, label in list_sequence_names:
        per_label.append(seq_IO.read_sequences(filename))
        labels.append(label)
    print("Read in Sequences at: {0}".format(datetime.datetime.now()))
    cleaved_ind = labels.index("CLEAVED")
    uncleaved_ind = labels.index("UNCLEAVED")
    # Every DNA encoding of every cleaved amino-acid sequence.
    cleaved_dna = set(dna_seq for aa_seq in per_label[cleaved_ind] for dna_seq in dna_conv.rev_translate(aa_seq))
    print("Converted to dna at: {0} for # sequences: {1}".format(datetime.datetime.now(), len(cleaved_dna)))
    total = float(len(cleaved_dna))
    fracs = {}
    for s in source:
        source_dna = dna_conv.rev_translate(s)
        # Union of the Hamming-distance-1 DNA neighborhoods of every encoding of s.
        neighbors_set = set.union(*[set(dna_conv.gen_hamdist_one(seq)) for seq in source_dna])
        # '_' in the translation marks a stop codon.
        neighbors_nostop = set(n for n in neighbors_set if '_' not in dna_conv.translate(n))
        cl_neighbors = neighbors_set.intersection(cleaved_dna)
        fracs[s] = (float(len(cl_neighbors)) / len(neighbors_set),
                    float(len(cl_neighbors)) / len(neighbors_nostop))
    print("Found Fracs for Cleaved Sequences at: {0}".format(datetime.datetime.now()))
    with open("{0}_frac_neighbors_dna.csv".format(output_prefix), 'w') as f:
        f.write("\n".join("{0},{1},{2}".format(s, str(frac1), str(frac2)) for s, (frac1, frac2) in fracs.items()))
def main(input_file, canonical_file, output_prefix, hamm_dist, less_than, more_than):
    """Collect sequences whose Hamming distance to the canonical sequences is
    within (less_than) or beyond (more_than) hamm_dist, and write them to
    <output_prefix>canonical_sim_cleaved_<less|more>_<hamm_dist>.csv.

    Exactly one of less_than / more_than must be set; raises ValueError
    otherwise.
    """
    # IMPROVEMENT: validate the mutually-exclusive flags once up front (the
    # original re-checked them on every canonical sequence and never raised
    # when the canonical list was empty).
    if less_than and more_than:
        raise ValueError('Cannot set both --less_than and --more_than')
    if not (less_than or more_than):
        raise ValueError('Cannot have both --less_than and --more_than as false')
    set_sequences = set()
    canonical_sequences = seq_IO.read_sequences(canonical_file)
    sequences = seq_IO.read_sequences(input_file)
    for ind, can in enumerate(canonical_sequences):
        if less_than:
            set_sequences = find_seqs_less_than(can, sequences, set_sequences, hamm_dist)
        elif ind == 0:
            # First canonical sequence seeds the "more than" set.
            set_sequences = find_seqs_more_than_first(can, sequences, set_sequences, hamm_dist)
        else:
            set_sequences = find_seqs_more_than(can, sequences, set_sequences, hamm_dist)
    less_v_more = "less" if less_than else "more"
    outfile_canon = '%scanonical_sim_cleaved_%s_%d.csv' % (output_prefix, less_v_more, hamm_dist)
    with open(outfile_canon, "w") as canon_out:
        canon_out.write('\n'.join(set_sequences))
def main(input_dir, canonical_file, output_prefix, hamm_dist):
    """For every *_cleaved.txt file in input_dir, record the sequences similar
    to each canonical sequence and write them all to one CSV.

    hamm_dist == -1 selects chemical similarity (chem_sim) instead of a
    Hamming-distance cutoff.
    """
    list_seq_files = glob.glob(os.path.join(input_dir, "*_cleaved.txt"))
    dict_sequences = {}
    canonical_sequences = seq_IO.read_sequences(canonical_file)
    for filename in list_seq_files:
        sequences = seq_IO.read_sequences(filename)
        for can in canonical_sequences:
            if hamm_dist == -1:
                seq_sim = [seq for seq in sequences if chem_sim(seq, can)]
            else:
                seq_sim = [seq for seq in sequences if conv.hamdist(seq, can) <= hamm_dist]
            if seq_sim:
                dict_sequences[(filename, can)] = seq_sim
    outfile_canon = '%scanonical_sim_cleaved%d.csv' % (output_prefix, hamm_dist)
    # BUG FIX: the output handle was never closed; a context manager
    # guarantees the file is flushed and closed.
    with open(outfile_canon, "w") as canon_out:
        for (filename, can), seqs in dict_sequences.items():
            canon_out.write(filename + "," + can + "," + ','.join(seqs) + "\n")
def main(sequences_ratio_file):
    """Bar-plot the mean FLAG/HA ratio (over three replicate columns) per
    sequence, with asymmetric error bars spanning [min, max] replicates."""
    rows = seq_IO.read_sequences(sequences_ratio_file, additional_params=True)
    seqs = [row[0] for row in rows]
    replicates = [[row[1], row[2], row[3]] for row in rows]
    cleaved_col = [row[4] for row in rows]  # read as in the original, but not plotted
    means = [sum(reps) / 3.0 for reps in replicates]
    err_low = [m - min(reps) for m, reps in zip(means, replicates)]
    err_high = [max(reps) - m for m, reps in zip(means, replicates)]
    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr[0, 0], means, 'c', "", "Sequence", "FLAG/HA Ratio", tick_label=seqs, yerr=[err_low, err_high])
    pconv.save_fig(fig, sequences_ratio_file, "plot", 4, 4, tight=True, size=12)
def _write_frac_csv(path, fracs):
    """Helper: write one CSV mapping sequence -> (frac cleaved, middle, uncleaved)."""
    with open(path, 'w') as f:
        f.write("Sequence,Frac_Cleaved,Frac_Middle,Frac_Uncleaved\n")
        f.write("".join(["{0},{1},{2},{3}\n".format(k, str(v[0]), str(v[1]), str(v[2])) for k, v in fracs.items()]))


def main(list_sequence_names, output_prefix):
    """For every sequence in each class (CLEAVED / MIDDLE / UNCLEAVED),
    compute the fraction of its neighbors in each class and write one CSV per
    class.  The triplicated file-writing code is factored into _write_frac_csv.
    """
    list_sequences = []  # one sequence list per label
    labels = []
    for filename, label in list_sequence_names:
        list_sequences.append(seq_IO.read_sequences(filename))
        labels.append(label)
    cleaved = list_sequences[labels.index("CLEAVED")]
    middle = list_sequences[labels.index("MIDDLE")]
    uncleaved = list_sequences[labels.index("UNCLEAVED")]
    fracs_cleaved = conv.fraction_neighbors_all(cleaved, uncleaved, middle, cleaved)
    fracs_uncleaved = conv.fraction_neighbors_all(cleaved, uncleaved, middle, uncleaved)
    fracs_middle = conv.fraction_neighbors_all(cleaved, uncleaved, middle, middle)
    _write_frac_csv("{0}_cleaved.csv".format(output_prefix), fracs_cleaved)
    _write_frac_csv("{0}_middle.csv".format(output_prefix), fracs_middle)
    _write_frac_csv("{0}_uncleaved.csv".format(output_prefix), fracs_uncleaved)
def main(list_sequence_names, output_prefix):
    """Bar-plot, per sample, the number of sequences in each cleavage class."""
    counts = {"CLEAVED": {}, "UNCLEAVED": {}, "MIDDLE": {}}
    for filename, label, sample in list_sequence_names:
        counts[label][sample] = len(seq_IO.read_sequences(filename))
    lines = []
    for label in ("CLEAVED", "MIDDLE", "UNCLEAVED"):
        # Values ordered by sample name so all series share the same x order.
        ordered = [count for _sample, count in sorted(counts[label].items())]
        lines.append((ordered, label))
    fig, ax = pconv.create_ax(1, 1)
    bar.plot_series(ax[0, 0], lines, title="", x_axis="Variant Name", y_axis="Number of Substrate Sequences Sampled", tick_label=sorted(counts["CLEAVED"].keys()))
    pconv.save_fig(fig, output_prefix, "cleaved_uncleaved_middle", 6, 6, tight=True, size=10)
def main(json_file, output_prefix, nbunch_file):
    """Write <output_prefix>_degree.txt with "sequence,degree" lines for every
    sequence listed in nbunch_file, using the node-link graph in json_file."""
    with open(json_file) as data_file:
        G = json_graph.node_link_graph(json.load(data_file))
    sequences = seq_IO.read_sequences(nbunch_file)
    id_seq = networkx.get_node_attributes(G, "sequence")
    # Invert node_id -> sequence so we can look nodes up by sequence.
    seq_id = dict((seq, node_id) for node_id, seq in id_seq.items())
    nbunch = [seq_id[s] for s in sequences]
    degrees = networkx.degree(G, nbunch)
    lines = ["{0},{1}".format(id_seq[node], str(deg)) for node, deg in degrees.items()]
    with open("{0}_degree.txt".format(output_prefix), 'w') as o:
        o.write("\n".join(lines))
def main(json_file, output_prefix, nbunch_file):
    """Look up the graph degree of each sequence named in nbunch_file and
    write "sequence,degree" lines to <output_prefix>_degree.txt."""
    with open(json_file) as data_file:
        graph = json_graph.node_link_graph(json.load(data_file))
    wanted = seq_IO.read_sequences(nbunch_file)
    id_seq = networkx.get_node_attributes(graph, "sequence")
    seq_id = {}
    for node_id, seq in id_seq.items():
        seq_id[seq] = node_id  # invert the attribute map
    degrees = networkx.degree(graph, [seq_id[s] for s in wanted])
    with open("{0}_degree.txt".format(output_prefix), 'w') as out:
        out.write("\n".join("{0},{1}".format(id_seq[node], str(deg)) for node, deg in degrees.items()))
def main(list_sequence_names, output_prefix, source):
    """Starting from the DNA encodings of `source`, expand two neighbor shells
    through the cleaved-DNA adjacency list and record the fraction reached at
    each step (step 0 = 1/total)."""
    per_label = []  # one sequence list per label
    labels = []
    for filename, label in list_sequence_names:
        per_label.append(seq_IO.read_sequences(filename))
        labels.append(label)
    print("Read in Sequences at: {0}".format(datetime.datetime.now()))
    cleaved_ind = labels.index("CLEAVED")
    uncleaved_ind = labels.index("UNCLEAVED")
    # Every DNA encoding of every cleaved amino-acid sequence.
    cleaved_dna = set(dna_seq for aa_seq in per_label[cleaved_ind] for dna_seq in dna_conv.rev_translate(aa_seq))
    print("Converted to dna at: {0} for # sequences: {1}".format(datetime.datetime.now(), len(cleaved_dna)))
    adj_list_cleaved = dna_conv.adj_list_cleaved(cleaved_dna, cleaved_dna)
    print("Created Adj List and Fracs at: {0}".format(datetime.datetime.now()))
    total = float(len(cleaved_dna))
    list_x = [0]
    list_y = [1 / total]
    new_neighbors = dna_conv.rev_translate(source)
    for shell in xrange(1, 3):
        frac, new_neighbors = find_fraction_for_shell(new_neighbors, adj_list_cleaved, total)
        list_x.append(shell)
        list_y.append(frac)
    print("Found Fracs for Cleaved Sequences at: {0}".format(datetime.datetime.now()))
    with open("{0}_{1}.csv".format(output_prefix, source), 'w') as f:
        f.write("\n".join("{0},{1}".format(str(x), str(y)) for x, y in zip(list_x, list_y)))
def main(sequence_ratio_file, width, height, pattern, legend):
    """Bar-plot mean FLAG/HA ratios with std-dev error bars, colored either
    by an explicit per-row color column or by the row's label."""
    rows = seq_IO.read_sequences(sequence_ratio_file, additional_params=True)
    seqs = [r[0] for r in rows]
    avg_ratio = [r[1] for r in rows]
    err = [r[2] for r in rows]  # std-devs used directly as symmetric error bars
    label = [r[3] for r in rows]
    if len(rows[0]) > 4:
        color = [r[4] for r in rows]  # explicit color column present
    else:
        color = [convert_label_color(l) for l in label]
    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    lgd = None
    if legend:
        # Suppress the generic class labels from the legend.
        label_legend = [l if l not in ["CLEAVED", "MIDDLE", "UNCLEAVED"] else None for l in label]
        patches, labels = bar.draw_actual_plot(axarr[0, 0], avg_ratio, color, "", "", "FLAG/HA Ratio", tick_label=seqs, yerr=err, pattern=pattern, label=label_legend)
        lgd = axarr[0, 0].legend(patches, labels, loc="upper center", bbox_to_anchor=(0.5, 1.05), borderaxespad=0., prop={'size': 9}, ncol=2, fancybox=True)
        print(patches)
        print(labels)
    else:
        bar.draw_actual_plot(axarr[0, 0], avg_ratio, color, "", "", "FLAG/HA Ratio", tick_label=seqs, yerr=err, pattern=pattern)
    axarr[0, 0].set_ylim([0, 1.3])
    pconv.save_fig(fig, sequence_ratio_file, "plot", width, height, tight=True, size=10, extra_artists=lgd)
def main(seq_file, canonical_file, output_prefix):
    # For each canonical sequence: build a graph over the sequences in
    # seq_file (edges join pairs at Hamming distance < 2), form a transition
    # matrix, and track the fraction of the original sequences reached by a
    # walk started at the canonical sequence after 1..22 steps.  All series
    # are plotted together at the end.
    series = []
    canonical_list_seq = seq_IO.read_sequences(canonical_file)
    print "Beginning Script: {0}".format(datetime.datetime.now())
    for canonical in canonical_list_seq:
        with open(seq_file) as strings:
            seq_list = strings.read().splitlines()
        # Pair each sequence with a stable integer index (matrix row/column).
        seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
        orig_len = len(seq_ind_list)
        if canonical not in seq_list:
            # Graft the canonical sequence (plus its Hamming-1 neighborhood)
            # onto the graph so the walk has a starting node.
            # NOTE(review): neighbors already present in seq_list get appended
            # again under new indices here -- confirm duplicates are intended.
            one_away = gsconv.gen_hamdist_one(canonical)
            one_away = [o for o in one_away if o != canonical] + [canonical]
            seq_ind_list = seq_ind_list[:] + [(o, ind) for (ind, o) in enumerate(one_away, len(seq_ind_list))]
        # Edge list: pairs closer than Hamming distance 2.
        edges = [(seq2, seq) for seq, seq2 in itertools.combinations(seq_ind_list, 2) if gsconv.hamdist(seq2[0], seq[0]) < 2]
        print len(seq_ind_list)
        print "Generated Edges: {0}".format(datetime.datetime.now())
        numpy.set_printoptions(threshold='nan')
        # Matrix index of the canonical (walk start) sequence.
        canon_ind = [i for (s, i) in seq_ind_list if s == canonical][0]
        T_mat = trans_matrix(seq_ind_list, edges)
        #print raise_matrix(T_mat,1)
        #print raise_matrix(T_mat,3)
        #T = raise_matrix(T_mat,10)
        #T = raise_matrix(T_mat,20)
        x = [0]
        y = [0]
        print "Transformed Matrix: {0}".format(datetime.datetime.now())
        x.append(1)
        y.append(find_frac(T_mat, canon_ind, orig_len))
        T_mat_new = T_mat
        for i in range(2, 23):
            x.append(i)
            # Advance the matrix power via square_matrix (presumably
            # T_mat_new * T_mat -- defined elsewhere; confirm) and read off
            # the fraction of original sequences reached after i steps.
            T_mat_new, frac = square_matrix(T_mat_new, T_mat, canon_ind, orig_len)
            y.append(frac)
            print "Raised Matrix {0}: {1}".format(i, datetime.datetime.now())
        series.append([x, y, canonical])
    fig, ax = conv.create_ax(1, 1)
    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']
    scatterplot.plot_series(ax[0, 0], series, title="", x_axis="Number of Steps", colors=color, y_axis="Fraction Cleaved Variants Reached", alpha=0.85, connect_dots=True, size=15, edgecolors='k', linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(xrange(1, 23, 3))
    lgd = conv.add_legend(ax[0, 0], location='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, size=8)
    conv.save_fig(fig, output_prefix, "fraction_func", 2.5, 3, size=9.5, extra_artists=lgd)
    print "Outputted Figure: {0}".format(datetime.datetime.now())
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    # Build a randomized sequence network: tally how often edges connect each
    # pair of fitness classes (2 / 1.5 / 1), then rewire every edge to a
    # random target drawn with those class frequencies, and write Gephi-style
    # node and edge CSVs (edge weight = fitness ratio toward the canonical).
    list_sequences = []  #list of list of sequences, where each item represents a label
    extended_list_sequences = []  #flat list of sequences
    labels = []  #labels for list_sequences
    #canonical_seqs = seq_IO.read_sequences(canonical_file)
    # NOTE(review): canonical_file is ignored; the canonical set is hard-coded.
    canonical_seqs = ['DEMEE']
    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1:float})
        # (sequence, fitness, within-2-of-a-canonical flag)
        new_seqs = [ (seq,fitness,min([ conv.hamdist(seq,can) for can in canonical_seqs ]) <= 2) for seq,fitness in sequences ]
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs[:])
        labels.append(label)
    outfile_nodes = '%s_nodes.csv' % (output_prefix)
    # Edges join sequence pairs at exactly hamming_dist.
    edges = [(seq2,seq) for seq,seq2 in itertools.combinations(extended_list_sequences,2) if conv.hamdist(seq2[0],seq[0]) == hamming_dist ]
    # tallies[source fitness][target fitness] = number of observed edges.
    tallies = { 2 : {2:0,1.5:0,1:0}, 1.5 : {2:0,1.5:0,1:0}, 1 : {2:0,1.5:0,1:0} }
    for edge in edges:
        tallies[edge[0][1]][edge[1][1]] += 1
    # Normalize each source row into target-class frequencies.
    frequencies = { 2 : {}, 1.5 : {}, 1 : {} }
    for source, tallies_dict in tallies.items():
        n_tallies = float(sum(tallies_dict.values()))
        frequencies[source] = { k : v/n_tallies for k, v in tallies_dict.items() }
    # Rewire: keep each edge's source, draw a new target class from the
    # observed frequencies, then pick a random sequence of that class.
    new_edges = []
    for edge in edges:
        fitness_source = edge[0][1]
        fitness_target = np.random.choice([2,1.5,1],p=[frequencies[fitness_source][2],frequencies[fitness_source][1.5],frequencies[fitness_source][1]])
        seqs = list_sequences[labels.index(conv_fitness_label(fitness_target))]
        # NOTE(review): randint(0, len(seqs)-1) can never return the last
        # index (numpy's upper bound is exclusive) -- confirm intended.
        new_edges.append((edge[0],seqs[np.random.randint(0,len(seqs)-1)]))
    edges = new_edges
    for canonical_seq in canonical_seqs:
        outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq)
        edges_out = open(outfile_edges,"w")
        edges_out.write("Source,Target,Weight\n")
        print canonical_seq
        for ([seq1,fit1,can1],[seq2,fit2,can2]) in edges:
            # Orient each edge from the endpoint closer to the canonical
            # sequence; weight is fitness(closer)/fitness(farther).
            dist_seq1 = conv.hamdist(canonical_seq, seq1)
            dist_seq2 = conv.hamdist(canonical_seq, seq2)
            fit_lower = fit1 if dist_seq1 < dist_seq2 else fit2
            fit_upper = fit2 if dist_seq1 < dist_seq2 else fit1
            fit_upper = fit_upper if fit_upper > 0 else 0.001  # avoid divide-by-zero
            seq_lower = seq1 if dist_seq1 < dist_seq2 else seq2
            seq_upper = seq2 if dist_seq1 < dist_seq2 else seq1
            out_str = "{0},{1},{2}\n".format(seq_lower,seq_upper,fit_lower/float(fit_upper))
            edges_out.write(out_str) #does this have the correct directionality?
        edges_out.close()
    # Emit each node once (first label wins for duplicates).
    already_written_nodes = []
    nodes_out = open(outfile_nodes,"w")
    nodes_out.write("Id,Label,Type,Fitness,Canonical\n")
    for seqs,label in zip(list_sequences,labels):
        nodes_out.write("\n".join("{0},{0},{1},{2},{3}".format(x, label, fitness,can) for (x,fitness,can) in seqs if x not in already_written_nodes))
        already_written_nodes.extend([ s[0] for s in seqs])
        nodes_out.write("\n")
def main(list_sequence_names, canonical_list, output_prefix):
    """Histogram the min/avg/max Hamming distance from every cleaved sequence
    to the uncleaved set; canonical sequences' stats are echoed to stdout."""
    canonical_list_seq = seq_IO.read_sequences(canonical_list)
    cleaved_file = [s for s, l in list_sequence_names if l == "CLEAVED"][0]
    uncleaved_file = [s for s, l in list_sequence_names if l == "UNCLEAVED"][0]
    cleaved_seqs = seq_IO.read_sequences(cleaved_file)
    uncleaved_seqs = seq_IO.read_sequences(uncleaved_file)
    min_dist = []
    avg_dist = []
    max_dist = []
    for seq in cleaved_seqs:
        distances = [conv.hamdist(seq, unc) for unc in uncleaved_seqs]
        min_dist.append(min(distances))
        avg_dist.append(numpy.mean(distances))
        max_dist.append(max(distances))
        if seq in canonical_list_seq:
            print(seq)
            print(min_dist[-1])
            print(avg_dist[-1])
            print(max_dist[-1])
    fig, ax = pconv.create_ax(1, 3)
    panels = [(ax[0, 0], min_dist, "Min. Distance from Boundary", "Minimum Distances"),
              (ax[1, 0], avg_dist, "Avg. Distance from Boundary", "Average Distances"),
              (ax[2, 0], max_dist, "Max. Distance from Boundary", "Maximum Distances")]
    for axis, values, x_label, plot_title in panels:
        hist.draw_actual_plot(axis, values, x_label, plot_title, log=False, normed=True, label=None, nbins=15, stacked=False)
    pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
def main(list_nodes, output_prefix, metric):
    # Compare metric distributions for sequences grouped by how many of the
    # input node sets (one per label) classify them as CLEAVED (4, 3, 2 or 1),
    # drawing one histogram panel per metric plus a standalone colorbar figure.
    cleaved_seq = {}
    uncleaved_seq = {}
    middle_seq = {}
    for nodes, label in list_nodes:
        sequences = seq_IO.read_sequences(nodes, additional_params=True, header=True)
        cleaved_seq[label] = { key : val for key, val in sequences.items() if val["type"] == "CLEAVED" }
        middle_seq[label] = { key : val for key, val in sequences.items() if val["type"] == "MIDDLE" }
        uncleaved_seq[label] = { key : val for key, val in sequences.items() if val["type"] == "UNCLEAVED" }
    if metric == "metrics":
        labels_non_plot = ["label", "fitness", "type", "canonical"]
        # NOTE(review): "DEMEE" is a hard-coded exemplar sequence used only to
        # enumerate metric names -- assumes it appears in the last file read.
        orig_labels_to_plot = sorted([ key for key in sequences["DEMEE"].keys() if key not in labels_non_plot ])
        labels_to_plot = sorted(orig_labels_to_plot)
    else:
        orig_labels_to_plot = [metric]
        labels_to_plot = [metric]
    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)
    nbins = 10
    # Count in how many label sets each sequence is classified CLEAVED.
    list_seqs = [ k for d in cleaved_seq.values() for k in d.keys() ]
    count_seqs = Counter(list_seqs)
    #seqs_5_l = [ s for s in list_seqs if count_seqs[s] == 5 ]
    seqs_4_l = [ s for s in list_seqs if count_seqs[s] == 4 ]
    seqs_3_l = [ s for s in list_seqs if count_seqs[s] == 3 ]
    seqs_2_l = [ s for s in list_seqs if count_seqs[s] == 2 ]
    seqs_1_l = [ s for s in list_seqs if count_seqs[s] == 1 ]
    if metric != "Fraction_Cleaved":
        # Gather the metric values for each agreement group.
        #seqs_5 = list_metrics( cleaved_seq, seqs_5_l, orig_labels_to_plot)
        seqs_4 = list_metrics( cleaved_seq, seqs_4_l, orig_labels_to_plot)
        seqs_3 = list_metrics( cleaved_seq, seqs_3_l, orig_labels_to_plot)
        seqs_2 = list_metrics( cleaved_seq, seqs_2_l, orig_labels_to_plot)
        seqs_1 = list_metrics( cleaved_seq, seqs_1_l, orig_labels_to_plot)
    for ind, key in enumerate(labels_to_plot):
        if key == "pageranks":
            log = True
        else:
            log = False
        if key == "Fraction_Cleaved":
            data = [ #average_fraction_neighbors_cleaved(cleaved_seq, uncleaved_seq, middle_seq, seqs_5_l),
                average_fraction_neighbors_cleaved(cleaved_seq, uncleaved_seq, middle_seq, seqs_4_l),
                average_fraction_neighbors_cleaved(cleaved_seq, uncleaved_seq, middle_seq, seqs_3_l),
                average_fraction_neighbors_cleaved(cleaved_seq, uncleaved_seq, middle_seq, seqs_2_l),
                average_fraction_neighbors_cleaved(cleaved_seq, uncleaved_seq, middle_seq, seqs_1_l)]
            normed=True
        else:
            # NOTE(review): group order here (1,2,3,4) is reversed relative to
            # the Fraction_Cleaved branch (4,3,2,1) while the legend lists
            # "Cl. by 5".."Cl. by 1" and only 4 datasets/colors exist --
            # confirm the label/data alignment.
            data = [ #get_data_from_dict(seqs_5, key),
                get_data_from_dict(seqs_1, key), get_data_from_dict(seqs_2, key), get_data_from_dict(seqs_3, key), get_data_from_dict(seqs_4, key) ]
            normed=True
        hist.draw_actual_plot(axarr[0,ind], data, "", key.capitalize(), colors = [ tuple(c) for c in plt.cm.Blues(np.linspace(0.2, 1, 4)).tolist()], log=log, normed=normed, label=["Cl. by 5", "Cl. by 4", "Cl. by 3", "Cl. by 2", "Cl. by 1"], nbins=nbins)
        axarr[0,ind].ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        #pconv.add_legend(axarr[0,ind], location="upper right")
    pconv.save_fig(fig, output_prefix, metric, n_to_plot*3, 3, tight=True, size=9)
    # Standalone colorbar figure showing the Blues gradient used above.
    fig_bar, axarr_bar = pconv.create_ax(1, 1, shx=False, shy=False)
    gradient = np.linspace(1, 0.2, 256)
    #gradient = np.hstack((gradient, gradient))
    gradient = np.array(zip(gradient,gradient))
    axarr_bar[0,0].imshow(gradient, aspect='auto', cmap=plt.get_cmap('Blues'))
    #axarr_bar[0,0].set_axis_off()
    plt.tick_params(
        axis='both',        # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom='off',       # ticks along the bottom edge are off
        top='off',          # ticks along the top edge are off
        labelbottom='off',  # labels along the bottom edge are off
        left='off',         # ticks along the bottom edge are off
        right='off',        # ticks along the top edge are off
        labelright='off')   # labels along the bottom edge are off
    pconv.save_fig(fig_bar, output_prefix, "colorbar", 0.3, 3, tight=True)
def main(list_sequence_names, output_prefix):
    # Enumerate all (cleaved sequence, other sequence) pairs at Hamming
    # distance >= 2, look up the fitnesses of the intermediate single mutants,
    # compute epistasis for each pair, and write the results to
    # <output_prefix>_epi.csv (the _epi_double.csv output is currently
    # disabled -- see the dead string block below).
    list_sequences = [ ]  #list of list of sequences, where each item represents a label
    extended_list_sequences = []  #flat list of sequences
    labels = []  #labels for list_sequences
    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={ 1: float, 2: float })
        print sequences[0:10]
        list_sequences.append(sequences)
        extended_list_sequences.extend(sequences[:])
        labels.append(label)
    print len(extended_list_sequences)
    # Lookup tables: sequence -> fitness and sequence -> ratio.
    dict_seq_fit = { seq: fitness for (seq, fitness, ratio) in extended_list_sequences }
    dict_seq_ratio = { seq: ratio for (seq, fitness, ratio) in extended_list_sequences }
    print len(dict_seq_fit)
    epi = {}
    outfile_epi = '%s_epi_double.csv' % (output_prefix)
    epi_double_out = open(outfile_epi, "w")
    outfile_epi = '%s_epi.csv' % (output_prefix)
    epi_out = open(outfile_epi, "w")
    mut_func = { "Both_Functional": [], "Both_Nonfunctional": [], "One_Functional": [] }
    mut_nonfunc = { "Both_Functional": [], "Both_Nonfunctional": [], "One_Functional": [] }
    # Deduplicate unordered pairs of (cleaved, any) sequences via frozensets.
    prod = itertools.product(list_sequences[labels.index("CLEAVED")], extended_list_sequences)
    pairs = set()
    counter = 0
    counter_prod = 0
    for x, y in prod:
        counter_prod += 1
        if x[0] != y[0]:
            counter += 1
            pairs.add(frozenset((x, y)))
    print counter_prod
    print len(pairs)
    print counter
    print "done making set"
    # NOTE(review): unpacking a 2-element frozenset gives the elements in an
    # arbitrary order, so which endpoint plays `can` vs `seq_fit` is not
    # deterministic -- confirm this is acceptable.
    for can, seq_fit in pairs:
        canonical_seq = can[0]
        seq = seq_fit[0]
        fit = seq_fit[1]
        # Bucket by whether the end sequence is functional (fitness == 1).
        mut_dict = mut_func if fit == 1 else mut_nonfunc
        dist = conv.hamdist(canonical_seq, seq)
        if dist <= 1:
            continue  # need at least one intermediate mutant
        list_inter, list_fit = get_inter_fitness(canonical_seq, seq, dict_seq_fit)
        if None not in list_fit:  # all intermediates have known fitness
            if dist == 2:
                # Classify the double mutant by its two intermediates.
                sum_fit = sum(list_fit)
                if sum_fit > 1.95:
                    mut_dict["Both_Functional"].append((canonical_seq, seq, list_inter, list_fit))
                elif sum_fit < 0.05:
                    mut_dict["Both_Nonfunctional"].append((canonical_seq, seq, list_inter, list_fit))
                else:  #either one uncleaved or one middle
                    mut_dict["One_Functional"].append((canonical_seq, seq, list_inter, list_fit))
            epi[(canonical_seq, seq)] = (calc_epi(list_fit, fit), fit, list_fit, list_inter)
    print "done calc epi"
    '''epi_double_out.write("Starting,Starting_Ratio,Ending,Ending_Ratio,Status_Ending,Status_Intermediates,Inter1_Seq,Inter1_Fit,Inter1_Ratio,Inter2_Seq,Inter2_Fit,Inter2_Ratio\n")
    for label, list_muts in mut_func.items():
        for (can, seq, list_inter, list_fit) in list_muts:
            epi_double_out.write("{start},{start_ratio},{end},{end_ratio},End_Cleaved,{label},{data}\n".format(label=label,start=can,end=seq, start_ratio=dict_seq_ratio[can],end_ratio=dict_seq_ratio[seq], data = ",".join([ "{0},{1},{2}".format(seq,fitness_to_str(fit),dict_seq_ratio[seq]) for seq,fit in zip(list_inter,list_fit)])) )
    for label, list_muts in mut_nonfunc.items():
        for (can, seq, list_inter, list_fit) in list_muts:
            epi_double_out.write("{start},{start_ratio},{end},{end_ratio},End_Uncleaved,{label},{data}\n".format(label=label,start=can,end=seq, start_ratio=dict_seq_ratio[can],end_ratio=dict_seq_ratio[seq], data = ",".join([ "{0},{1},{2}".format(seq,fit,dict_seq_ratio[seq]) for seq,fit in zip(list_inter,list_fit)])) )
    '''
    epi_out.write("Starting,Starting_Ratio,Ending,Ending_Ratio,Ending_Fitness,Epistasis,List_Seqs_Fitnesses_Ratios_Intermediates\n")
    epi_out.write("\n".join([ "{0},{1},{2},{3},{4},{5},{6}".format(can, dict_seq_ratio[can], seq, dict_seq_ratio[seq], fitness_to_str(fit), e, ",".join([ "{0},{1},{2}".format(s, fitness_to_str(f), dict_seq_ratio[s]) for f, s in zip(list_fit, list_inter) ])) for (can, seq), (e, fit, list_fit, list_inter) in epi.items() ]))
    epi_out.close()
    epi_double_out.close()
    print "done writing epi"
def main(list_sequence_names, output_prefix, index):
    """For one 10,000-sequence slice of the CLEAVED set, record the fraction
    of cleaved sequences found in shells 1-3 of each sequence's neighborhood.

    index is 1-based: the slice covers [(index-1)*10000, index*10000).
    Writes <output_prefix>_cleaved_<index>.csv with columns Sequence,1,2,3.
    """
    list_sequences = []  # one sequence list per label
    labels = []          # label for each entry of list_sequences
    for filename, label in list_sequence_names:
        list_sequences.append(seq_IO.read_sequences(filename))
        labels.append(label)
    print("Read in Sequences at: {0}".format(datetime.datetime.now()))
    cleaved_ind = labels.index("CLEAVED")
    #middle_ind = labels.index("MIDDLE")
    uncleaved_ind = labels.index("UNCLEAVED")
    cleaved_set = set(list_sequences[cleaved_ind])
    uncleaved_set = set(list_sequences[uncleaved_ind])
    # Adjacency lists and per-sequence cleaved-neighbor fractions; the MIDDLE
    # class is excluded (empty set / ignore_middle).
    adj_list_cleaved = conv.adj_list(cleaved_set, uncleaved_set, set(), cleaved_set, ignore_middle=False)
    adj_list_uncleaved = conv.adj_list(cleaved_set, uncleaved_set, set(), uncleaved_set, ignore_middle=False)
    fracs_cleaved = conv.fraction_neighbors_all(list_sequences[cleaved_ind], list_sequences[uncleaved_ind], [], list_sequences[cleaved_ind], ignore_middle=True)
    fracs_uncleaved = conv.fraction_neighbors_all(list_sequences[cleaved_ind], list_sequences[uncleaved_ind], [], list_sequences[uncleaved_ind], ignore_middle=True)
    print("Created Adj List and Fracs at: {0}".format(datetime.datetime.now()))
    # Merge into single lookups covering both classes.
    adj_list_cleaved.update(adj_list_uncleaved)
    fracs_cleaved.update(fracs_uncleaved)
    fracs_per_seq = {}
    start_ind = (index - 1) * 10000
    end_ind = index * 10000
    n_seqs = len(list_sequences[cleaved_ind])
    if start_ind > n_seqs:
        # BUG FIX: the original used a bare `exit` (a no-op expression), so an
        # out-of-range index silently wrote a header-only file; bail out instead.
        print("This index is not valid")
        return
    if end_ind > n_seqs:
        end_ind = n_seqs
    for seq in list_sequences[cleaved_ind][start_ind:end_ind]:
        new_neighbors = [seq]
        fracs_per_seq[seq] = []
        for _shell in xrange(0, 3):
            frac, new_neighbors = find_fraction_for_shell(new_neighbors, adj_list_cleaved, fracs_cleaved)
            fracs_per_seq[seq].append(frac)
    print("Found Fracs for Cleaved Sequences at: {0}".format(datetime.datetime.now()))
    with open("{0}_cleaved_{1}.csv".format(output_prefix, index), 'w') as f:
        f.write("Sequence,1,2,3\n")
        f.write("".join(["{0},{1},{2},{3}\n".format(k, str(v[0]), str(v[1]), str(v[2])) for k, v in fracs_per_seq.items()]))
def main(seq_file, canonical_file, output_prefix):
    """For each canonical sequence, build a transition matrix over the
    Hamming-distance-1 graph of the sequences in seq_file, repeatedly square
    it, and plot the fraction of cleaved variants reached vs. number of
    steps, saved as {output_prefix}fraction_func.

    seq_file -- text file, one sequence per line.
    canonical_file -- file of canonical sequences (read via seq_IO).
    output_prefix -- prefix for the saved figure.
    """
    series = []  # one [x, y, canonical] triple per canonical sequence
    canonical_list_seq = seq_IO.read_sequences(canonical_file)
    print "Beginning Script: {0}".format(datetime.datetime.now())
    for canonical in canonical_list_seq:
        # Re-read per canonical so each iteration starts from the raw list.
        with open(seq_file) as strings:
            seq_list = strings.read().splitlines()
        seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
        orig_len = len(seq_ind_list)  # count of original (cleaved) sequences
        if canonical not in seq_list:
            # Canonical absent: splice in its Hamming-distance-1 neighborhood
            # (canonical placed last) with indices continuing past orig_len,
            # so find_frac can distinguish original from added nodes.
            one_away = gsconv.gen_hamdist_one(canonical)
            one_away = [o for o in one_away if o != canonical] + [canonical]
            seq_ind_list = seq_ind_list[:] + [(o, ind) for (ind, o) in enumerate(one_away, len(seq_ind_list))]
        # Undirected edges between all pairs at Hamming distance < 2
        # (i.e. distance 0 or 1).
        edges = [(seq2, seq) for seq, seq2 in itertools.combinations(seq_ind_list, 2) if gsconv.hamdist(seq2[0], seq[0]) < 2]
        print len(seq_ind_list)
        print "Generated Edges: {0}".format(datetime.datetime.now())
        # NOTE(review): threshold='nan' is rejected by modern numpy
        # (expects a number); harmless on the numpy this was written for.
        numpy.set_printoptions(threshold='nan')
        canon_ind = [i for (s, i) in seq_ind_list if s == canonical][0]
        T_mat = trans_matrix(seq_ind_list, edges)
        #print raise_matrix(T_mat,1)
        #print raise_matrix(T_mat,3)
        #T = raise_matrix(T_mat,10)
        #T = raise_matrix(T_mat,20)
        x = [0]  # step counts
        y = [0]  # fraction reached after that many steps
        print "Transformed Matrix: {0}".format(datetime.datetime.now())
        x.append(1)
        y.append(find_frac(T_mat, canon_ind, orig_len))
        # Raise the matrix one power at a time, reusing the running product
        # T_mat_new instead of recomputing from scratch each step.
        T_mat_new = T_mat
        for i in range(2, 23):
            x.append(i)
            T_mat_new, frac = square_matrix(T_mat_new, T_mat, canon_ind, orig_len)
            y.append(frac)
            print "Raised Matrix {0}: {1}".format(i, datetime.datetime.now())
        series.append([x, y, canonical])
    fig, ax = conv.create_ax(1, 1)
    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']
    scatterplot.plot_series(ax[0, 0], series, title="", x_axis="Number of Steps", colors=color, y_axis="Fraction Cleaved Variants Reached", alpha=0.85, connect_dots=True, size=15, edgecolors='k', linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(xrange(1, 23, 3))
    lgd = conv.add_legend(ax[0, 0], location='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, size=8)
    conv.save_fig(fig, output_prefix, "fraction_func", 2.5, 3, size=9.5, extra_artists=lgd)
    print "Outputted Figure: {0}".format(datetime.datetime.now())
def main(data_file, output_prefix, degree_file, width, height): sequences = seq_IO.read_sequences(data_file, additional_params=True, header=True, list_vals=True) seq_degree = seq_IO.read_sequences(degree_file, additional_params=True, header=True) degree_frac = defaultdict(list) for seq, seq_dict in sequences.items(): degree_frac[seq_degree[seq]['Degree']].append(np.mean( seq_dict["Frac"])) data = [np.mean(seq_dict["Frac"]) for seq, seq_dict in sequences.items()] degree_frac_avg = [ np.mean(list_fracs) for degree, list_fracs in degree_frac.items() ] degree_frac_std = [ np.std(list_fracs) for degree, list_fracs in degree_frac.items() ] fig, axarr = pconv.create_ax(1, 1, shx=False, shy=False) hist.draw_actual_plot(axarr[0, 0], data, "", "", normed=False, nbins=30, edgecolor=None, log=False) #axarr[0,0].ticklabel_format(axis='x', style='sci', scilimits=(-2,2)) pconv.save_fig(fig, output_prefix, "hist", width, height, tight=True, size=10) fig2, axarr2 = pconv.create_ax(1, 1, shx=True, shy=True) bar.draw_actual_plot(axarr2[0, 0], degree_frac_avg, 'g', "", "Degree", "Fraction Shortest Path Uncleaved", tick_label=degree_frac.keys(), yerr=degree_frac_std) #axarr[0,0].set_ylim([0,1.3]) pconv.save_fig(fig2, output_prefix, "bar", width, height, tight=True, size=10)
def main(json_file, output_prefix, novel_seqs_file, canonical_file): print "Started Script: {0}".format(datetime.datetime.now()) with open(json_file) as data_file: data = json.load(data_file) G = json_graph.node_link_graph(data, directed=False) print "Finished Reading in Graph: {0}".format(datetime.datetime.now()) id_seq = networkx.get_node_attributes(G, "sequence") id_status = networkx.get_node_attributes(G, "status") seq_id = {seq: node_id for node_id, seq in id_seq.items()} print "Created inverse lookup table: {0}".format(datetime.datetime.now()) novel_seqs = seq_IO.read_sequences(novel_seqs_file) canonical_seqs = seq_IO.read_sequences(canonical_file) novel_fracs = {} print "Ready to enter loop: {0}".format(datetime.datetime.now()) for n in novel_seqs: novel_fracs[n] = {} hamm_dist = sorted([(conv.hamdist(n, c), c) for c in canonical_seqs]) min_hamm_dist = hamm_dist[0][0] print "Found hamming distances: {0}".format(datetime.datetime.now()) for hamm, c in hamm_dist: #only analyze min_dist canonical sequences if hamm != min_hamm_dist: continue novel_fracs[n][c] = [] #generate list of 5 paths #paths = itertools.islice(networkx.all_shortest_paths(G, seq_id[n], seq_id[c]), 5) paths = [networkx.shortest_path(G, seq_id[n], seq_id[c])] for path in paths: inter_nodes = path[1:-1] novel_fracs[n][c].append( float( sum([ 1 for node_id in inter_nodes if id_status[node_id] == "UNCLEAVED" ])) / len(inter_nodes)) base_n_file = os.path.basename(os.path.splitext(novel_seqs_file)[0]) base_c_file = os.path.basename(os.path.splitext(canonical_file)[0]) with open( "{0}_frac_paths_{1}_{2}.txt".format(output_prefix, base_n_file, base_c_file), 'w') as o: for n, c_dict in novel_fracs.items(): for c, fracs_list in c_dict.items(): o.write("{0},{1},".format(n, c)) o.write(",".join(map(str, fracs_list))) o.write("\n") print "Output paths: {0}".format(datetime.datetime.now())
def main(list_nodes, output_prefix, metric, create_keys=False): if not create_keys: sequences = seq_IO.read_sequences(list_nodes, additional_params=True, header=True) else: sequences = seq_IO.read_sequences(list_nodes, additional_params=True, header=True, create_keys=True) cleaved_seq = { key: val for key, val in sequences.items() if val["type"] == "CLEAVED" } middle_seq = { key: val for key, val in sequences.items() if val["type"] == "MIDDLE" } uncleaved_seq = { key: val for key, val in sequences.items() if val["type"] == "UNCLEAVED" } print len(cleaved_seq) if metric == "metrics": labels_non_plot = ["label", "fitness", "type", "canonical", "timeset"] #labels_to_plot = sorted([ key for key in sequences["YNYIN"].keys() if key not in labels_non_plot ] + ["Fraction_Cleaved"]) labels_to_plot = sorted([ key for key in sequences["YNYIN"].keys() if key not in labels_non_plot ]) else: labels_to_plot = [metric] n_to_plot = len(labels_to_plot) fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False) nbins = 10 for ind, key in enumerate(labels_to_plot): if key == "pageranks": log = True else: log = False if key == "Fraction_Cleaved": # data = [ conv.fraction_neighbors_cleaved(cleaved_seq.keys(), uncleaved_seq.keys(), middle_seq.keys(), cleaved_seq.keys()).values(), # conv.fraction_neighbors_cleaved(cleaved_seq.keys(), uncleaved_seq.keys(), middle_seq.keys(), middle_seq.keys()).values(), # conv.fraction_neighbors_cleaved(cleaved_seq.keys(), uncleaved_seq.keys(), middle_seq.keys(), uncleaved_seq.keys()).values()] normed = True else: data = [ get_data_from_dict(cleaved_seq, key), get_data_from_dict(middle_seq, key), get_data_from_dict(uncleaved_seq, key) ] normed = True print key hist.draw_actual_plot(axarr[0, ind], data, "", key.capitalize(), log=log, normed=normed, label=["Cleaved", "Middle", "Uncleaved"], nbins=nbins) axarr[0, ind].ticklabel_format(axis='x', style='sci', scilimits=(-2, 2)) #pconv.add_legend(axarr[0,ind], location="middle right") pconv.save_fig(fig, 
output_prefix, metric, n_to_plot * 2.5, 2.5, tight=True, size=9)