def main(argv=sys.argv): G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths") G_asm.load_sg_seq("preads4falcon.fasta") utg_out = open("utgs.fa", "w") for utg in G_asm.utg_data: s, t, v = utg type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)] if type_ == "simple": path_or_edges = path_or_edges.split("~") seq = G_asm.get_seq_from_path(path_or_edges) print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score) print >> utg_out, seq if type_ == "compound": c_graph = nx.DiGraph() all_alt_path = [] path_or_edges = [c.split("~") for c in path_or_edges.split("|")] for ss, vv, tt in path_or_edges: type_, length, score, sub_path = G_asm.utg_data[(ss, tt, vv)] sub_path = sub_path.split("~") v1 = sub_path[0] for v2 in sub_path[1:]: c_graph.add_edge(v1, v2, e_score=G_asm.sg_edges[(v1, v2)][1]) v1 = v2 shortest_path = nx.shortest_path(c_graph, s, t, "e_score") score = nx.shortest_path_length(c_graph, s, t, "e_score") all_alt_path.append((score, shortest_path)) # a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig while 1: if s == t: break n0 = shortest_path[0] for n1 in shortest_path[1:]: c_graph.remove_edge(n0, n1) n0 = n1 try: shortest_path = nx.shortest_path(c_graph, s, t, "e_score") score = nx.shortest_path_length(c_graph, s, t, "e_score") #a_ctg_data.append( (s, t, shortest_path) ) all_alt_path.append((score, shortest_path)) except nx.exception.NetworkXNoPath: break # if len(shortest_path) < 2: # break all_alt_path.sort() all_alt_path.reverse() shortest_path = all_alt_path[0][1] score, atig_path = all_alt_path[0] atig_output = [] atig_path_edges = zip(atig_path[:-1], atig_path[1:]) sub_seqs = [] total_length = 0 total_score = 0 for vv, ww in atig_path_edges: r, aln_score, idt, typs_ = G_asm.sg_edges[(vv, ww)] e_seq = G_asm.sg_edge_seqs[(vv, ww)] rid, ss, tt = r sub_seqs.append(e_seq) total_length += abs(ss - tt) total_score += aln_score base_seq = "".join(sub_seqs) atig_output.append((s, t, atig_path, total_length, total_score, base_seq, atig_path_edges, 1, 1)) duplicated = True for score, atig_path in all_alt_path[1:]: atig_path_edges = zip(atig_path[:-1], atig_path[1:]) sub_seqs = [] total_length = 0 total_score = 0 for vv, ww in atig_path_edges: r, aln_score, idt, type_ = G_asm.sg_edges[(vv, ww)] e_seq = G_asm.sg_edge_seqs[(vv, ww)] rid, ss, tt = r sub_seqs.append(e_seq) total_length += abs(ss - tt) total_score += aln_score seq = "".join(sub_seqs) aln_data, x, y = get_aln_data(base_seq, seq) if len(aln_data) != 0: idt = 1.0 - 1.0 * aln_data[-1][-1] / aln_data[-1][-2] cov = 1.0 * (aln_data[-1][3] - aln_data[-1][2]) / aln_data[-1][4] if idt < 0.96 or cov < 0.98: duplicated = False atig_output.append( (s, t, atig_path, total_length, total_score, seq, atig_path_edges, idt, cov)) else: duplicated = False atig_output.append( (s, t, atig_path, total_length, total_score, seq, atig_path_edges, 0, 0)) # if len(atig_output) == 1: # continue sub_id = 0 for data in atig_output: v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, a_idt, cov = data print >> utg_out, ">%s~%s~%s-%d %d %d" % ( v0, "NA", w0, sub_id, total_length, total_score) print >> utg_out, seq sub_id += 1
def main(argv=None): G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths") G_asm.load_sg_seq("preads4falcon.fasta") utg_out = open("utgs.fa","w") for utg in G_asm.utg_data: s,t,v = utg type_, length, score, path_or_edges = G_asm.utg_data[ (s,t,v) ] if type_ == "simple": path_or_edges = path_or_edges.split("~") seq = G_asm.get_seq_from_path( path_or_edges ) print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score ) print >> utg_out, seq if type_ == "compound": c_graph = nx.DiGraph() all_alt_path = [] path_or_edges = [ c.split("~") for c in path_or_edges.split("|")] for ss, vv, tt in path_or_edges: type_, length, score, sub_path = G_asm.utg_data[ (ss,tt,vv) ] sub_path = sub_path.split("~") v1 = sub_path[0] for v2 in sub_path[1:]: c_graph.add_edge( v1, v2, e_score = G_asm.sg_edges[ (v1, v2) ][1] ) v1 = v2 shortest_path = nx.shortest_path( c_graph, s, t, "e_score" ) score = nx.shortest_path_length( c_graph, s, t, "e_score" ) all_alt_path.append( (score, shortest_path) ) #a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig while 1: if s == t: break n0 = shortest_path[0] for n1 in shortest_path[1:]: c_graph.remove_edge(n0, n1) n0 = n1 try: shortest_path = nx.shortest_path( c_graph, s, t, "e_score" ) score = nx.shortest_path_length( c_graph, s, t, "e_score" ) #a_ctg_data.append( (s, t, shortest_path) ) all_alt_path.append( (score, shortest_path) ) except nx.exception.NetworkXNoPath: break #if len(shortest_path) < 2: # break all_alt_path.sort() all_alt_path.reverse() shortest_path = all_alt_path[0][1] score, atig_path = all_alt_path[0] atig_output = [] atig_path_edges = zip(atig_path[:-1], atig_path[1:]) sub_seqs = [] total_length = 0 total_score = 0 for vv, ww in atig_path_edges: r, aln_score, idt, typs_ = G_asm.sg_edges[ (vv, ww) ] e_seq = G_asm.sg_edge_seqs[ (vv, ww) ] rid, ss, tt = r sub_seqs.append( e_seq ) total_length += abs(ss-tt) total_score += aln_score base_seq = "".join(sub_seqs) atig_output.append( (s, t, atig_path, total_length, total_score, base_seq, atig_path_edges, 1, 1) ) duplicated = True for score, atig_path in all_alt_path[1:]: atig_path_edges = zip(atig_path[:-1], atig_path[1:]) sub_seqs = [] total_length = 0 total_score = 0 for vv, ww in atig_path_edges: r, aln_score, idt, type_ = G_asm.sg_edges[ (vv, ww) ] e_seq = G_asm.sg_edge_seqs[ (vv, ww) ] rid, ss, tt = r sub_seqs.append( e_seq ) total_length += abs(ss-tt) total_score += aln_score seq = "".join(sub_seqs) aln_data, x, y = get_aln_data(base_seq, seq) if len( aln_data ) != 0: idt = 1.0-1.0*aln_data[-1][-1] / aln_data[-1][-2] cov = 1.0*(aln_data[-1][3]-aln_data[-1][2])/aln_data[-1][4] if idt < 0.96 or cov < 0.98: duplicated = False atig_output.append( (s, t, atig_path, total_length, total_score, seq, atig_path_edges, idt, cov) ) else: duplicated = False atig_output.append( (s, t, atig_path, total_length, total_score, seq, atig_path_edges, 0, 0) ) #if len(atig_output) == 1: # continue sub_id = 0 for data in atig_output: v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, a_idt, cov = data print >> utg_out, ">%s~%s~%s-%d %d %d" % (v0, "NA", w0, sub_id, total_length, total_score ) print >> utg_out, seq sub_id += 1
if alignment[0].aln_str_size > 100: aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist)) aln_str1 = alignment[0].q_aln_str aln_str0 = alignment[0].t_aln_str DWA.free_alignment(alignment) kup.free_kmer_lookup(lk_ptr) kup.free_seq_array(sa_ptr) kup.free_seq_addr_array(sda_ptr) return aln_data, x, y G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths") G_asm.load_sg_seq("preads4falcon.fasta") utg_out = open("utgs.fa", "w") for utg in G_asm.utg_data: s, t, v = utg type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)] if type_ == "simple": path_or_edges = path_or_edges.split("~") seq = G_asm.get_seq_from_path(path_or_edges) print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score) print >> utg_out, seq if type_ == "compound": c_graph = nx.DiGraph()
aln_data.append( (q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist) ) aln_str1 = alignment[0].q_aln_str aln_str0 = alignment[0].t_aln_str DWA.free_alignment(alignment) kup.free_kmer_lookup(lk_ptr) kup.free_seq_array(sa_ptr) kup.free_seq_addr_array(sda_ptr) return aln_data, x, y G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths") G_asm.load_sg_seq("preads4falcon.fasta") utg_out = open("utgs.fa", "w") for utg in G_asm.utg_data: s, t, v = utg type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)] if type_ == "simple": path_or_edges = path_or_edges.split("~") seq = G_asm.get_seq_from_path(path_or_edges) print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score) print >> utg_out, seq if type_ == "compound":