def generate_read_to_ctg_map(self):
    """Write the p-read to contig membership table.

    Reads the p-read and raw-read id files named by ``self.pread_id_file``
    and ``self.rawread_id_file``, walks the string-graph nodes of every
    contig whose id does not end in "R", and writes one line per
    (p-read, contig) pair to ``self.read_to_contig_map`` in the form
    ``"%09d %09d %s %s" % (pid, rid, oid, ctg)``.
    """
    rawread_id_file = fn(self.rawread_id_file)
    pread_id_file = fn(self.pread_id_file)
    read_to_contig_map = fn(self.read_to_contig_map)

    # Both tables are indexed by line number. Read them inside context
    # managers so the handles are closed deterministically.
    with open(pread_id_file) as pread_ids:
        pread_did_to_rid = pread_ids.read().split("\n")
    with open(rawread_id_file) as rawread_ids:
        rid_to_oid = rawread_ids.read().split("\n")

    asm_G = AsmGraph(fn(self.sg_edges_list),
                     fn(self.utg_data),
                     fn(self.ctg_paths))

    pread_to_contigs = {}

    with open(read_to_contig_map, "w") as f:
        for ctg in asm_G.ctg_data:
            if ctg[-1] == "R":
                continue
            ctg_g = asm_G.get_sg_for_ctg(ctg)
            for n in ctg_g.nodes():
                # Node names look like "<pid>:<end>"; only the id part is used.
                pid = int(n.split(":")[0])
                rid = pread_did_to_rid[pid].split("/")[1]
                # Explicit floor division: independent of the interpreter's
                # "/" semantics and never routed through a float.
                rid = int(rid) // 10
                oid = rid_to_oid[rid]
                k = (pid, rid, oid)
                pread_to_contigs.setdefault(k, set()).add(ctg)

        for k in pread_to_contigs:
            pid, rid, oid = k
            for ctg in pread_to_contigs[k]:
                f.write("%09d %09d %s %s\n" % (pid, rid, oid, ctg))
def generate_read_to_ctg_map(self):
    """Dump which contigs each p-read belongs to.

    For every contig in the assembly graph whose id does not end in 'R',
    each string-graph node is resolved to a (pid, rid, oid) triple using
    the p-read and raw-read id files, and the contigs seen for that triple
    are collected. The result is written to ``self.read_to_contig_map``,
    one '%09d %09d %s %s' line per (triple, contig) pair.
    """
    rawread_id_file = fn(self.rawread_id_file)
    pread_id_file = fn(self.pread_id_file)
    read_to_contig_map = fn(self.read_to_contig_map)

    # Load the line-indexed lookup tables; "with" closes the files promptly
    # instead of leaving that to garbage collection.
    with open(pread_id_file) as id_stream:
        pread_did_to_rid = id_stream.read().split('\n')
    with open(rawread_id_file) as id_stream:
        rid_to_oid = id_stream.read().split('\n')

    asm_G = AsmGraph(fn(self.sg_edges_list),
                     fn(self.utg_data),
                     fn(self.ctg_paths))

    pread_to_contigs = {}

    with open(read_to_contig_map, 'w') as f:
        for ctg in asm_G.ctg_data:
            if ctg[-1] == 'R':
                continue
            ctg_g = asm_G.get_sg_for_ctg(ctg)
            for n in ctg_g.nodes():
                # Only the part of the node name before ':' carries the id.
                pid = int(n.split(':')[0])
                rid = pread_did_to_rid[pid].split('/')[1]
                # Floor division keeps this an exact integer operation on
                # any interpreter ("/" would yield a float under true
                # division).
                rid = int(rid) // 10
                oid = rid_to_oid[rid]
                k = (pid, rid, oid)
                pread_to_contigs.setdefault(k, set()).add(ctg)

        for k in pread_to_contigs:
            pid, rid, oid = k
            for ctg in pread_to_contigs[k]:
                f.write('%09d %09d %s %s\n' % (pid, rid, oid, ctg))
def test_add_nx_string_graph():
    """Smoke test for ``GFAGraph.add_nx_string_graph`` on the gfa-1 data.

    There are no explicit assertions: the test passes when building the
    graphs and adding the NetworkX string graph raises nothing.
    """
    def gfa1_path(name):
        # Resolve a file inside the 'gfa-1' test data directory.
        return os.path.join(helpers.get_test_data_dir(), 'gfa-1', name)

    def phase_tag(phase):
        # Render a (block, phase) pair the way node attributes expect it.
        return "%d_%d" % phase

    # Load the assembly graph.
    asm_graph = AsmGraph(gfa1_path('sg_edges_list'),
                         gfa1_path('utg_data'),
                         gfa1_path('ctg_paths'))

    # The following block is taken from Unzip, graphs_to_h_tigs.py.
    nx_sg = nx.DiGraph()
    arid_to_phase = {}  # never filled here, so every lookup gets the default
    for ctg_id in asm_graph.ctg_data.keys():
        ctg_G = asm_graph.get_sg_for_ctg(ctg_id)
        ctg_nodes = set(ctg_G.nodes())  # not used below
        for v, w in ctg_G.edges():
            v_read = v[:9]
            w_read = w[:9]
            if asm_graph.sg_edges[(v, w)][-1] != "G":
                continue

            vphase = arid_to_phase.get(v_read, (-1, 0))
            wphase = arid_to_phase.get(w_read, (-1, 0))
            same_block = vphase[0] == wphase[0]
            cross_phase = "Y" if (same_block and vphase[1] != wphase[1]) else "N"

            v_attrs = dict(label=phase_tag(vphase), phase=phase_tag(vphase), src="P")
            w_attrs = dict(label=phase_tag(wphase), phase=phase_tag(wphase), src="P")

            nx_sg.add_node(v, **v_attrs)
            nx_sg.add_node(w, **w_attrs)
            nx_sg.add_edge(v, w, src="OP", cross_phase=cross_phase)

            # The contig graph does not contain the dual edges, so add the
            # complementary edge explicitly.
            rv = reverse_end(v)
            rw = reverse_end(w)
            nx_sg.add_node(rv, **v_attrs)
            nx_sg.add_node(rw, **w_attrs)
            nx_sg.add_edge(rw, rv, src="OP", cross_phase=cross_phase)

    # Add the string graph to the GFA.
    gfa_graph = mod.GFAGraph()
    # NOTE(review): the graph built above is replaced here by the one loaded
    # from sg.gexf, so only the GEXF graph reaches add_nx_string_graph.
    nx_sg = nx.read_gexf(gfa1_path('sg.gexf'))
    gfa_graph.add_nx_string_graph(nx_sg)
def test_add_asm_graph():
    """Check the edges ``GFAGraph.add_asm_graph`` produces for the gfa-1 data.

    After loading the assembly graph, the GFA graph must contain no paths
    and exactly the six edges listed below, each with the expected record.
    """
    def gfa1_path(name):
        # Resolve a file inside the 'gfa-1' test data directory.
        return os.path.join(helpers.get_test_data_dir(), 'gfa-1', name)

    # Load the assembly graph.
    asm_graph = AsmGraph(gfa1_path('sg_edges_list'),
                         gfa1_path('utg_data'),
                         gfa1_path('ctg_paths'))

    # Add the graph to GFA.
    gfa_graph = mod.GFAGraph()
    gfa_graph.add_asm_graph(asm_graph)

    assert len(gfa_graph.paths.keys()) == 0

    # (v, w, field 3, field 4, field 5) of each expected edge record; the
    # remaining fields are identical for all edges.
    edge_rows = [
        ('000000016:B', '000000027:B', 1540, 99.94, 449),
        ('000000005:B', '000000016:B', 1487, 99.93, 502),
        ('000000016:B', '000000025:B', 1540, 99.94, 449),
        ('000000007:B', '000000005:B', 1980, 99.95, 9),
        ('000000018:B', '000000004:B', 1963, 99.95, 26),
        ('000000025:B', '000000018:B', 1978, 99.95, 11),
    ]
    expected = {}
    for v, w, field3, field4, field5 in edge_rows:
        expected[(v, w)] = [v, w, '*', field3, field4, field5, 0,
                            None, None, None, None]

    assert len(gfa_graph.edges.keys()) == len(expected.keys())

    for key, edge in gfa_graph.edges.iteritems():
        assert key in expected
        assert expected[key] == edge
def main(*argv): G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths") p_ctg_coor_map = {} for fn in ("p_ctg_tiling_path", "a_ctg_tiling_path"): f = open(fn) for row in f: row = row.strip().split() ctg_id, v, w, edge_rid, b, e = row[:6] if ctg_id not in p_ctg_coor_map: coor = 0 # the p_ctg_tiling_path should be sorted by contig the order of the edges in the tiling path p_ctg_coor_map[ctg_id] = {} p_ctg_coor_map[ctg_id][v] = 0 coor += abs(int(b) - int(e)) p_ctg_coor_map[ctg_id][w] = coor G_asm.node_to_ctg[w] print ctg_id, v, 0, " ".join(list(G_asm.node_to_ctg[v])) print ctg_id, w, coor, " ".join(list(G_asm.node_to_ctg[w])) continue else: coor += abs(int(b) - int(e)) p_ctg_coor_map[ctg_id][w] = coor print ctg_id, w, coor, " ".join(list(G_asm.node_to_ctg[w])) f.close()
def gfa_from_assembly(fp_out, p_ctg_tiling_path, a_ctg_tiling_path,
                      preads_fasta, p_ctg_fasta, a_ctg_fasta,
                      sg_edges_list, utg_data, ctg_paths,
                      add_string_graph, write_reads, write_contigs,
                      min_p_len, min_a_len):
    """
    Produce the GFA-1 formatted output of the FALCON assembly.

    The tiling paths are always added to the graph. When
    `add_string_graph` is true, the string graph loaded from
    `sg_edges_list`, `utg_data` and `ctg_paths` is added as well. The
    string graph can show the neighborhood of contig breaks, whereas the
    tiling-path-only output is more sparse.

    The result is written to `fp_out`.
    """
    graph = GFAGraph()

    add_tiling_paths_to_gfa(p_ctg_fasta, a_ctg_fasta,
                            p_ctg_tiling_path, a_ctg_tiling_path,
                            min_p_len, min_a_len,
                            graph)

    if add_string_graph:
        # Load the string graph and merge it into the GFA graph.
        graph.add_asm_graph(AsmGraph(sg_edges_list, utg_data, ctg_paths))

    contig_fastas = [p_ctg_fasta, a_ctg_fasta]
    graph.write_gfa_v1(fp_out, preads_fasta, contig_fastas,
                       write_reads, write_contigs)
def add_string_graph_to_gfa(gfa_graph, sg_edges_list, utg_data, ctg_paths,
                            preads_dict, preads_overlap_dict, sg_edges_dict):
    """Add the nodes and the 'G'-typed edges of the string graph to `gfa_graph`.

    The assembly graph is loaded from `sg_edges_list`, `utg_data` and
    `ctg_paths`. Both end nodes of every string-graph edge are added first
    (regardless of edge type); afterwards only edges whose last data field
    equals 'G' are added.
    """
    sg_edges = AsmGraph(sg_edges_list, utg_data, ctg_paths).sg_edges

    # First pass: register the end nodes of every edge.
    for src, dst in sg_edges:
        add_node(gfa_graph, src, preads_dict)
        add_node(gfa_graph, dst, preads_dict)

    # Second pass: add only the edges marked 'G'.
    for src, dst in sg_edges:
        data = sg_edges[(src, dst)]
        if data[-1] == 'G':
            add_edge(gfa_graph, src, dst, data,
                     preads_overlap_dict, sg_edges_dict)
def wrap_write_gfa_v1_test(use_sg, use_nx, use_tp, write_reads, write_contigs,
                           min_p_len, min_a_len, expected_path):
    """Build a GFA graph from the gfa-1 test data and compare its GFA-1 dump.

    `use_sg`, `use_tp` and `use_nx` select which sources are added to the
    graph (assembly string graph, tiling paths, NetworkX graph from
    sg.gexf). The text written by `write_gfa_v1` is compared against
    `expected_path` via `helpers.assert_filecmp`.
    """
    def gfa1_path(name):
        # Resolve a file inside the 'gfa-1' test data directory.
        return os.path.join(helpers.get_test_data_dir(), 'gfa-1', name)

    # Create a GFA graph.
    gfa_graph = mod.GFAGraph()

    # Paths to the sequence inputs.
    preads_file = gfa1_path('preads4falcon.fasta')
    p_ctg_fasta = gfa1_path('p_ctg.fa')
    a_ctg_fasta = gfa1_path('a_ctg.fa')

    if use_sg:
        # Load the assembly graph and add it to the GFA.
        asm_graph = AsmGraph(gfa1_path('sg_edges_list'),
                             gfa1_path('utg_data'),
                             gfa1_path('ctg_paths'))
        gfa_graph.add_asm_graph(asm_graph)

    if use_tp:
        p_tiling = gfa1_path('p_ctg_tiling_path')
        a_tiling = gfa1_path('a_ctg_tiling_path')
        gen_gfa_v1.add_tiling_paths_to_gfa(p_ctg_fasta, a_ctg_fasta,
                                           p_tiling, a_tiling,
                                           min_p_len, min_a_len,
                                           gfa_graph)

    if use_nx:
        gfa_graph.add_nx_string_graph(nx.read_gexf(gfa1_path('sg.gexf')))

    # Run the unit under test.
    fp_out = StringIO()
    gfa_graph.write_gfa_v1(fp_out, preads_file, [p_ctg_fasta, a_ctg_fasta],
                           write_reads, write_contigs)

    # Compare results.
    helpers.assert_filecmp(fp_out.getvalue(), expected_path)
def gfa_from_assembly(fp_out, p_ctg_tiling_path, a_ctg_tiling_path,
                      preads_fasta, p_ctg_fasta, a_ctg_fasta,
                      sg_edges_list, utg_data, ctg_paths,
                      tiling, write_reads, write_contigs,
                      min_p_len, min_a_len):
    """
    Produce the GFA-1 formatted output of the FALCON assembly.

    Primary ('P') and associate ('A') tiling paths are loaded, filtered by
    `min_p_len` / `min_a_len` and added to the graph. Unless `tiling` is
    true, the string graph is added as well; it can show the neighborhood
    of contig breaks, whereas the tiling-path-only output is more sparse.

    The result is written to `fp_out`.
    """
    gfa_graph = GFAGraph()

    # Same load -> measure -> filter -> add pipeline for both contig kinds,
    # primary first.
    tiling_sources = (
        (p_ctg_tiling_path, 'P', min_p_len),
        (a_ctg_tiling_path, 'A', min_a_len),
    )
    for tiling_path_fn, ctg_type, min_len in tiling_sources:
        paths, _ = load_tiling_paths(tiling_path_fn, ctg_type)
        _, ctg_len = calc_tiling_paths_len(paths)
        paths = filter_tiling_paths_by_len(paths, ctg_len, min_len)
        for ctg_id, path in paths.iteritems():
            gfa_graph.add_tiling_path(path, ctg_id)

    if not tiling:
        # Load the string graph.
        gfa_graph.add_asm_graph(AsmGraph(sg_edges_list, utg_data, ctg_paths))

    gfa_graph.write_gfa_v1(fp_out, preads_fasta, [p_ctg_fasta, a_ctg_fasta],
                           write_reads, write_contigs)
def main(argv=None):
    """Write the sequence of every unitig of the assembly graph to "utgs.fa".

    Inputs are read from fixed file names in the current directory
    ("sg_edges_list", "utg_data", "ctg_paths", "preads4falcon.fasta");
    `argv` is not used.

    * "simple" unitigs are written as one record built from their path.
    * "compound" unitigs are expanded into alternative s->t paths: the
      weighted shortest path is taken repeatedly, removing its edges each
      time, until no path is left. The paths are sorted by descending
      score; the first one is always written, a further one only when its
      alignment against the first is missing or has identity < 0.96 or
      coverage < 0.98.
    """
    G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
    G_asm.load_sg_seq("preads4falcon.fasta")

    # The context manager flushes and closes utgs.fa even if a unitig
    # fails half way through.
    with open("utgs.fa", "w") as utg_out:
        for utg in G_asm.utg_data:
            s, t, v = utg
            type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)]
            if type_ == "simple":
                path_or_edges = path_or_edges.split("~")
                seq = G_asm.get_seq_from_path(path_or_edges)
                print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score)
                print >> utg_out, seq

            if type_ == "compound":
                # Build the sub-graph spanned by all member unitigs.
                c_graph = nx.DiGraph()

                all_alt_path = []
                path_or_edges = [c.split("~") for c in path_or_edges.split("|")]
                for ss, vv, tt in path_or_edges:
                    type_, length, score, sub_path = G_asm.utg_data[(ss, tt, vv)]

                    sub_path = sub_path.split("~")
                    v1 = sub_path[0]
                    for v2 in sub_path[1:]:
                        c_graph.add_edge(v1, v2, e_score=G_asm.sg_edges[(v1, v2)][1])
                        v1 = v2

                shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
                score = nx.shortest_path_length(c_graph, s, t, "e_score")
                all_alt_path.append((score, shortest_path))

                # The first path is the same as the one used in the primary
                # contig. Peel off edge-disjoint alternatives until s and t
                # are disconnected.
                while 1:
                    if s == t:
                        break
                    n0 = shortest_path[0]
                    for n1 in shortest_path[1:]:
                        c_graph.remove_edge(n0, n1)
                        n0 = n1
                    try:
                        shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
                        score = nx.shortest_path_length(c_graph, s, t, "e_score")
                        all_alt_path.append((score, shortest_path))
                    except nx.exception.NetworkXNoPath:
                        break

                # Highest score first.
                all_alt_path.sort()
                all_alt_path.reverse()
                shortest_path = all_alt_path[0][1]

                score, atig_path = all_alt_path[0]

                atig_output = []

                atig_path_edges = zip(atig_path[:-1], atig_path[1:])
                sub_seqs = []
                total_length = 0
                total_score = 0
                for vv, ww in atig_path_edges:
                    r, aln_score, idt, typs_ = G_asm.sg_edges[(vv, ww)]
                    e_seq = G_asm.sg_edge_seqs[(vv, ww)]
                    rid, ss, tt = r
                    sub_seqs.append(e_seq)
                    total_length += abs(ss - tt)
                    total_score += aln_score

                base_seq = "".join(sub_seqs)
                atig_output.append((s, t, atig_path, total_length, total_score,
                                    base_seq, atig_path_edges, 1, 1))

                duplicated = True
                for score, atig_path in all_alt_path[1:]:
                    atig_path_edges = zip(atig_path[:-1], atig_path[1:])
                    sub_seqs = []
                    total_length = 0
                    total_score = 0
                    for vv, ww in atig_path_edges:
                        r, aln_score, idt, type_ = G_asm.sg_edges[(vv, ww)]
                        e_seq = G_asm.sg_edge_seqs[(vv, ww)]
                        rid, ss, tt = r
                        sub_seqs.append(e_seq)
                        total_length += abs(ss - tt)
                        total_score += aln_score

                    seq = "".join(sub_seqs)

                    # Keep the alternative only if it differs enough from
                    # the base path (or cannot be aligned to it at all).
                    aln_data, x, y = get_aln_data(base_seq, seq)
                    if len(aln_data) != 0:
                        idt = 1.0 - 1.0 * aln_data[-1][-1] / aln_data[-1][-2]
                        cov = 1.0 * (aln_data[-1][3] - aln_data[-1][2]) / aln_data[-1][4]
                        if idt < 0.96 or cov < 0.98:
                            duplicated = False
                            atig_output.append((s, t, atig_path, total_length, total_score,
                                                seq, atig_path_edges, idt, cov))
                    else:
                        duplicated = False
                        atig_output.append((s, t, atig_path, total_length, total_score,
                                            seq, atig_path_edges, 0, 0))

                sub_id = 0
                for data in atig_output:
                    v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, a_idt, cov = data
                    print >> utg_out, ">%s~%s~%s-%d %d %d" % (v0, "NA", w0, sub_id, total_length, total_score)
                    print >> utg_out, seq
                    sub_id += 1
if alignment[0].aln_str_size > 100: aln_data.append( (q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist) ) aln_str1 = alignment[0].q_aln_str aln_str0 = alignment[0].t_aln_str DWA.free_alignment(alignment) kup.free_kmer_lookup(lk_ptr) kup.free_seq_array(sa_ptr) kup.free_seq_addr_array(sda_ptr) return aln_data, x, y G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths") G_asm.load_sg_seq("preads4falcon.fasta") utg_out = open("utgs.fa", "w") for utg in G_asm.utg_data: s, t, v = utg type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)] if type_ == "simple": path_or_edges = path_or_edges.split("~") seq = G_asm.get_seq_from_path(path_or_edges) print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score) print >> utg_out, seq if type_ == "compound":
def main(argv=sys.argv):
    """Dump every unitig sequence of the assembly graph into "utgs.fa".

    All inputs come from fixed file names in the working directory
    ("sg_edges_list", "utg_data", "ctg_paths", "preads4falcon.fasta");
    `argv` is accepted but not read.

    A "simple" unitig yields a single record. A "compound" unitig is
    resolved into alternative paths from s to t by repeatedly taking the
    weighted shortest path and deleting its edges. The alternatives are
    ordered by descending score; the top one is always emitted, the others
    only if they cannot be aligned to it or the alignment shows
    identity < 0.96 or coverage < 0.98.
    """
    G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
    G_asm.load_sg_seq("preads4falcon.fasta")

    # Opened with "with" so the output is closed (and flushed) on every
    # exit path, including exceptions.
    with open("utgs.fa", "w") as utg_out:
        for utg in G_asm.utg_data:
            s, t, v = utg
            type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)]
            if type_ == "simple":
                path_or_edges = path_or_edges.split("~")
                seq = G_asm.get_seq_from_path(path_or_edges)
                print >> utg_out, ">%s~%s~%s-%d %d %d" % (
                    s, v, t, 0, length, score)
                print >> utg_out, seq

            if type_ == "compound":
                # Graph made of the edges of all member unitigs.
                c_graph = nx.DiGraph()

                all_alt_path = []
                path_or_edges = [c.split("~")
                                 for c in path_or_edges.split("|")]
                for ss, vv, tt in path_or_edges:
                    type_, length, score, sub_path = G_asm.utg_data[(ss, tt, vv)]

                    sub_path = sub_path.split("~")
                    v1 = sub_path[0]
                    for v2 in sub_path[1:]:
                        c_graph.add_edge(
                            v1, v2, e_score=G_asm.sg_edges[(v1, v2)][1])
                        v1 = v2

                shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
                score = nx.shortest_path_length(c_graph, s, t, "e_score")
                all_alt_path.append((score, shortest_path))

                # The first path is the same as the one used in the primary
                # contig; every further round removes the previous path's
                # edges and searches again.
                while 1:
                    if s == t:
                        break
                    n0 = shortest_path[0]
                    for n1 in shortest_path[1:]:
                        c_graph.remove_edge(n0, n1)
                        n0 = n1
                    try:
                        shortest_path = nx.shortest_path(
                            c_graph, s, t, "e_score")
                        score = nx.shortest_path_length(
                            c_graph, s, t, "e_score")
                        all_alt_path.append((score, shortest_path))
                    except nx.exception.NetworkXNoPath:
                        break

                # Sort descending by score.
                all_alt_path.sort()
                all_alt_path.reverse()
                shortest_path = all_alt_path[0][1]

                score, atig_path = all_alt_path[0]

                atig_output = []

                atig_path_edges = zip(atig_path[:-1], atig_path[1:])
                sub_seqs = []
                total_length = 0
                total_score = 0
                for vv, ww in atig_path_edges:
                    r, aln_score, idt, typs_ = G_asm.sg_edges[(vv, ww)]
                    e_seq = G_asm.sg_edge_seqs[(vv, ww)]
                    rid, ss, tt = r
                    sub_seqs.append(e_seq)
                    total_length += abs(ss - tt)
                    total_score += aln_score

                base_seq = "".join(sub_seqs)
                atig_output.append(
                    (s, t, atig_path, total_length, total_score,
                     base_seq, atig_path_edges, 1, 1))

                duplicated = True
                for score, atig_path in all_alt_path[1:]:
                    atig_path_edges = zip(atig_path[:-1], atig_path[1:])
                    sub_seqs = []
                    total_length = 0
                    total_score = 0
                    for vv, ww in atig_path_edges:
                        r, aln_score, idt, type_ = G_asm.sg_edges[(vv, ww)]
                        e_seq = G_asm.sg_edge_seqs[(vv, ww)]
                        rid, ss, tt = r
                        sub_seqs.append(e_seq)
                        total_length += abs(ss - tt)
                        total_score += aln_score

                    seq = "".join(sub_seqs)

                    # An alternative is reported only when it is not a
                    # near-duplicate of the base sequence.
                    aln_data, x, y = get_aln_data(base_seq, seq)
                    if len(aln_data) != 0:
                        idt = 1.0 - 1.0 * aln_data[-1][-1] / aln_data[-1][-2]
                        cov = 1.0 * \
                            (aln_data[-1][3] - aln_data[-1][2]) / aln_data[-1][4]
                        if idt < 0.96 or cov < 0.98:
                            duplicated = False
                            atig_output.append(
                                (s, t, atig_path, total_length, total_score,
                                 seq, atig_path_edges, idt, cov))
                    else:
                        duplicated = False
                        atig_output.append(
                            (s, t, atig_path, total_length, total_score,
                             seq, atig_path_edges, 0, 0))

                sub_id = 0
                for data in atig_output:
                    v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, a_idt, cov = data
                    print >> utg_out, ">%s~%s~%s-%d %d %d" % (
                        v0, "NA", w0, sub_id, total_length, total_score)
                    print >> utg_out, seq
                    sub_id += 1
def main(argv=sys.argv):
    """Generate haplotigs for one contig or for all of them.

    Loads the primary and the phased assembly graphs, the read-id to phase
    map and the p-read sequences into module globals (read by the worker
    function), then runs `generate_haplotigs_for_ctg` in a process pool
    for every selected contig whose id ends in "F" and which has phasing
    information.
    """
    # make life easier for now. will refactor it out if possible
    global all_rid_to_phase
    global p_asm_G
    global h_asm_G
    global seqs

    args = parse_args(argv)
    fc_asm_path = args.fc_asm_path
    fc_hasm_path = args.fc_hasm_path
    ctg_id = args.ctg_id
    fasta_fn = args.fasta

    p_asm_G = AsmGraph(os.path.join(fc_asm_path, "sg_edges_list"),
                       os.path.join(fc_asm_path, "utg_data"),
                       os.path.join(fc_asm_path, "ctg_paths"))

    h_asm_G = AsmGraph(os.path.join(fc_hasm_path, "sg_edges_list"),
                       os.path.join(fc_hasm_path, "utg_data"),
                       os.path.join(fc_hasm_path, "ctg_paths"))

    all_rid_to_phase = {}
    all_read_ids = set()

    # Each row is read as: column 0 = read id, column 1 = key of the outer
    # dict (checked against contig ids below), columns 2 and 3 = two ints.
    with open(args.rid_phase_map) as f:
        for row in f:
            row = row.strip().split()
            all_rid_to_phase.setdefault(row[1], {})
            all_rid_to_phase[row[1]][row[0]] = (int(row[2]), int(row[3]))
            all_read_ids.add(row[0])

    # Collect the reads on both ends of every "G" edge of either graph.
    for asm_G in (p_asm_G, h_asm_G):
        for v, w in asm_G.sg_edges:
            if asm_G.sg_edges[(v, w)][-1] != "G":
                continue
            all_read_ids.add(v.split(":")[0])
            all_read_ids.add(w.split(":")[0])

    seqs = load_sg_seq(all_read_ids, fasta_fn)

    if ctg_id == "all":
        ctg_id_list = p_asm_G.ctg_data.keys()
    else:
        ctg_id_list = [ctg_id]

    exe_list = []
    for ctg_id in ctg_id_list:
        if ctg_id[-1] != "F":
            continue
        if ctg_id not in all_rid_to_phase:
            continue
        exe_list.append((ctg_id, os.path.join(".", ctg_id)))

    exec_pool = Pool(4)  # TODO, make this configurable
    try:
        exec_pool.map(generate_haplotigs_for_ctg, exe_list)
    finally:
        # Release the worker processes instead of leaving them to be
        # reaped at interpreter exit.
        exec_pool.close()
        exec_pool.join()
if alignment[0].aln_str_size > 100: aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist)) aln_str1 = alignment[0].q_aln_str aln_str0 = alignment[0].t_aln_str DWA.free_alignment(alignment) kup.free_kmer_lookup(lk_ptr) kup.free_seq_array(sa_ptr) kup.free_seq_addr_array(sda_ptr) return aln_data, x, y G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths") G_asm.load_sg_seq("preads4falcon.fasta") utg_out = open("utgs.fa", "w") for utg in G_asm.utg_data: s, t, v = utg type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)] if type_ == "simple": path_or_edges = path_or_edges.split("~") seq = G_asm.get_seq_from_path(path_or_edges) print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score) print >> utg_out, seq if type_ == "compound":
def wrap_write_gfa_v1_test(use_sg, use_nx, use_tp, write_reads, write_contigs, min_p_len, min_a_len, expected_path):
    """Build a GFA graph from the gfa-1 test data and check its GFA-1 dump.

    `use_sg`, `use_tp` and `use_nx` select which sources are added to the
    graph (assembly string graph, tiling paths, NetworkX graph from
    sg.gexf). Tiling paths shorter than `min_p_len` / `min_a_len` are left
    out. The written output is compared line by line against the stripped
    lines of `expected_path`.
    """
    def gfa1_path(name):
        # Resolve a file inside the 'gfa-1' test data directory.
        return os.path.join(helpers.get_test_data_dir(), 'gfa-1', name)

    # Create a GFA graph.
    gfa_graph = mod.GFAGraph()

    if use_sg:
        # Load the assembly graph.
        asm_graph = AsmGraph(gfa1_path('sg_edges_list'),
                             gfa1_path('utg_data'),
                             gfa1_path('ctg_paths'))
        # Add the string graph to the GFA.
        gfa_graph.add_asm_graph(asm_graph)

    if use_tp:
        # Load the p_ctg tiling paths.
        p_ctg_tiling_path_file = gfa1_path('p_ctg_tiling_path')
        p_paths, p_edge_to_ctg = gen_gfa_v1.load_tiling_paths(
            p_ctg_tiling_path_file, 'P')
        # Add the tiling paths to the GFA.
        for ctg_id, path in p_paths.iteritems():
            _, contig_len = gen_gfa_v1.calc_node_coords(path)
            if contig_len >= min_p_len:
                gfa_graph.add_tiling_path(path, ctg_id)

        a_ctg_tiling_path_file = gfa1_path('a_ctg_tiling_path')
        # NOTE(review): the a_ctg paths are loaded with type 'P' as well;
        # kept as is because the expected outputs may depend on it.
        # Confirm whether 'A' was intended.
        a_paths, a_edge_to_ctg = gen_gfa_v1.load_tiling_paths(
            a_ctg_tiling_path_file, 'P')
        # Add the tiling paths to the GFA.
        for ctg_id, path in a_paths.iteritems():
            _, contig_len = gen_gfa_v1.calc_node_coords(path)
            if contig_len >= min_a_len:
                gfa_graph.add_tiling_path(path, ctg_id)

    if use_nx:
        gexf_file = gfa1_path('sg.gexf')
        nx_sg = nx.read_gexf(gexf_file)
        gfa_graph.add_nx_string_graph(nx_sg)

    # Init paths to other input files.
    preads_file = gfa1_path('preads4falcon.fasta')
    p_ctg_fasta = gfa1_path('p_ctg.fa')
    a_ctg_fasta = gfa1_path('a_ctg.fa')

    fp_out = StringIO()

    # Run the unit under test.
    gfa_graph.write_gfa_v1(fp_out, preads_file, [p_ctg_fasta, a_ctg_fasta],
                           write_reads, write_contigs)

    # Compare results.
    result = fp_out.getvalue().splitlines()
    # Read the expectation inside "with" so the handle does not leak.
    with open(expected_path) as fp_expected:
        expected = [line.strip() for line in fp_expected.readlines()]
    assert(result == expected)
#!/usr/local/packages/anaconda2/bin/python

## This is the `sg_edges_to_GFA.py` script
## (More) information at https://github.com/PacificBiosciences/FALCON/wiki/Convert-FALCON-assembly-graph-to-GFA-format

from falcon_kit.fc_asm_graph import AsmGraph
from falcon_kit.FastaReader import FastaReader

read_in_graph = set()
G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")

# (v, w) -> (contig id, "A" or "P"); filled from both tiling path files.
edge_to_ctg = {}

# Associate contig id -> list of (v, w) edges, in file order.
a_path = {}
with open("a_ctg_tiling_path") as f:
    for row in f:
        row = row.split()
        ctg_id, v, w, edge_rid, b, e = row[:6]
        a_path.setdefault(ctg_id, []).append((v, w))
        # Edges are attributed to the part of the id before the first "-"
        # (the primary contig id).
        ctg_id = ctg_id.split("-")[0]
        edge_to_ctg[(v, w)] = (ctg_id, "A")

# Primary contig id -> list of (v, w) edges, in file order.
p_path = {}
with open("p_ctg_tiling_path") as f:
    for row in f:
        row = row.split()
        ctg_id, v, w, edge_rid, b, e = row[:6]
        p_path.setdefault(ctg_id, []).append((v, w))
        edge_to_ctg[(v, w)] = (ctg_id, "P")

read_pairs = set()
link_lines = []
# Command-line options. `parser` is created before this point (not shown here).
parser.add_argument('--fc_asm_path', type=str, help='path to the primary Falcon assembly output directory', required=True)
parser.add_argument('--fc_hasm_path', type=str, help='path to the phased Falcon assembly output directory', required=True)
# NOTE(review): with required=True the default "all" can never take effect;
# confirm which of the two was intended.
parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', default = "all", required=True)
parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir, default to current working directory')
parser.add_argument('--rid_phase_map', type=str, help="path to the file that encode the relationship of the read id to phase blocks", required=True)
parser.add_argument('--fasta', type=str, help="sequence file of the p-reads", required=True)
args = parser.parse_args()

fc_asm_path = args.fc_asm_path
fc_hasm_path = args.fc_hasm_path
ctg_id = args.ctg_id
base_dir = args.base_dir
fasta_fn = args.fasta

# Assembly graph of the primary assembly.
p_asm_G = AsmGraph(os.path.join(fc_asm_path, "sg_edges_list"),
                   os.path.join(fc_asm_path, "utg_data"),
                   os.path.join(fc_asm_path, "ctg_paths") )

# Assembly graph of the phased assembly.
h_asm_G = AsmGraph( os.path.join(fc_hasm_path, "sg_edges_list"),
                    os.path.join(fc_hasm_path, "utg_data"),
                    os.path.join(fc_hasm_path, "ctg_paths") )

# all_rid_to_phase: column 1 -> {column 0 -> (int(column 2), int(column 3))}
# all_read_ids: every column-0 value seen in the map file
all_rid_to_phase = {}
all_read_ids = set()
with open(args.rid_phase_map) as f:
    for row in f:
        # Rows are whitespace-separated; only the first four columns are read.
        row = row.strip().split()
        all_rid_to_phase.setdefault( row[1], {} )
        all_rid_to_phase[row[1]][row[0]] = (int(row[2]), int(row[3]))
        all_read_ids.add(row[0])