def test_load_headers_2(tmpdir):
    """
    Duplicate-header case: the second record is written twice, and
    load_headers() is expected to return each header ID only once.
    """
    random.seed(1234567)
    bases = [random.choice('ACTG') for _ in range(500)]
    dummy_seq = ''.join(bases)

    first_header = '>000000F-001-01 000000123:E 000000078:E 46974 1268418 33 0 1.00 1.00'
    second_header = '>000000F-002-02 000000125:E 000000080:E 46974 1268418 33 0 1.00 1.00'
    fasta_text = '\n'.join([
        first_header, dummy_seq,
        second_header, dummy_seq,
        second_header, dummy_seq,  # deliberate duplicate
    ])

    # Materialize the records as a temporary FASTA file.
    fasta_path = tmpdir.join('a_ctg.fa')
    fasta_path.write(fasta_text)

    with open_fasta_reader(str(fasta_path)) as fp_in:
        out = mod.load_headers(fp_in)

    expected = {'000000F-001-01', '000000F-002-02'}
    assert out == expected
def test_load_headers_1(tmpdir):
    """
    Regular case: two distinct records, both header IDs are expected back.
    """
    random.seed(1234567)
    bases = [random.choice('ACTG') for _ in range(500)]
    dummy_seq = ''.join(bases)

    first_header = '>000000F-001-01 000000123:E 000000078:E 46974 1268418 33 0 1.00 1.00'
    second_header = '>000000F-002-02 000000125:E 000000080:E 46974 1268418 33 0 1.00 1.00'
    fasta_text = '\n'.join([
        first_header, dummy_seq,
        second_header, dummy_seq,
    ])

    # Materialize the records as a temporary FASTA file.
    fasta_path = tmpdir.join('a_ctg.fa')
    fasta_path.write(fasta_text)

    with open_fasta_reader(str(fasta_path)) as fp_in:
        out = mod.load_headers(fp_in)

    expected = {'000000F-001-01', '000000F-002-02'}
    assert out == expected
def test_load_headers_2(tmpdir):
    """
    There is a duplicate header in the input. This should be reduced to
    only the unique occurrences.
    """
    random.seed(1234567)
    # range() instead of xrange(): xrange does not exist on Python 3, and
    # range yields the same 500 iterations on Python 2.
    dummy_seq = ''.join([random.choice('ACTG') for _ in range(500)])

    fasta_lines = [
        '>000000F-001-01 000000123:E 000000078:E 46974 1268418 33 0 1.00 1.00',
        dummy_seq,
        '>000000F-002-02 000000125:E 000000080:E 46974 1268418 33 0 1.00 1.00',
        dummy_seq,
        # Same header again: the duplicate under test.
        '>000000F-002-02 000000125:E 000000080:E 46974 1268418 33 0 1.00 1.00',
        dummy_seq,
    ]

    # Create a temporary input file.
    test_out_a_ctg_file = tmpdir.join('a_ctg.fa')
    test_out_a_ctg_file.write('\n'.join(fasta_lines))

    with open_fasta_reader(str(test_out_a_ctg_file)) as fp_in:
        out = mod.load_headers(fp_in)

    expected = set(['000000F-001-01', '000000F-002-02'])
    assert out == expected
def test_load_headers_1(tmpdir):
    """
    Regular case.
    """
    random.seed(1234567)
    # range() instead of xrange(): xrange does not exist on Python 3, and
    # range yields the same 500 iterations on Python 2.
    dummy_seq = ''.join([random.choice('ACTG') for _ in range(500)])

    fasta_lines = [
        '>000000F-001-01 000000123:E 000000078:E 46974 1268418 33 0 1.00 1.00',
        dummy_seq,
        '>000000F-002-02 000000125:E 000000080:E 46974 1268418 33 0 1.00 1.00',
        dummy_seq,
    ]

    # Create a temporary input file.
    test_out_a_ctg_file = tmpdir.join('a_ctg.fa')
    test_out_a_ctg_file.write('\n'.join(fasta_lines))

    with open_fasta_reader(str(test_out_a_ctg_file)) as fp_in:
        out = mod.load_headers(fp_in)

    expected = set(['000000F-001-01', '000000F-002-02'])
    assert out == expected
def main(argv=sys.argv):
    """
    Copy records from "a_ctg_all.fa" to "a_ctg.fa", skipping filtered ones.

    Every record name must split into exactly nine whitespace-separated
    fields; the last three (delta_l, idt, cov) drive the filter. A record
    is skipped only when all three conditions hold:
      * 100 * idt > args.max_idt
      * 100 * cov > args.max_aln_cov
      * abs(delta_l) < args.min_len_diff

    Args:
        argv: command-line arguments, passed to parse_args().

    Raises:
        ValueError: if a record name does not have nine fields or one of
            the numeric fields that gets evaluated cannot be parsed.
    """
    args = parse_args(argv)
    with open_fasta_reader("a_ctg_all.fa") as reads:
        with open("a_ctg.fa", "w") as f:
            for r in reads:
                tig_id, v, w, len_, ovl, ne, delta_l, idt, cov = r.name.split()
                if 100 * float(idt) > args.max_idt and 100 * float(cov) > args.max_aln_cov and\
                        abs(int(delta_l)) < args.min_len_diff:
                    continue
                # The original used the Python-2-only ``print >> f, ...``
                # statement, which raises TypeError on Python 3.
                # f.write() produces the same bytes on both.
                f.write(">" + r.name + "\n")
                f.write("%s\n" % (r.sequence,))
def main(argv=sys.argv):
    """
    Filter "a_ctg_all.fa" into "a_ctg.fa".

    A record is dropped when its identity and coverage (each scaled by 100)
    exceed args.max_idt / args.max_aln_cov and its absolute length
    difference is below args.min_len_diff; every other record is written
    out unchanged as a header line followed by a sequence line.
    """
    args = parse_args(argv)
    with open_fasta_reader("a_ctg_all.fa") as reads, open("a_ctg.fa", "w") as f:
        for rec in reads:
            # Unpacking enforces the nine-field header layout; only the
            # last three fields are used here.
            (_tig_id, _v, _w, _len, _ovl, _ne,
             delta_l, idt, cov) = rec.name.split()
            # The chained ``and`` keeps the original short-circuit order.
            filtered_out = (
                100 * float(idt) > args.max_idt
                and 100 * float(cov) > args.max_aln_cov
                and abs(int(delta_l)) < args.min_len_diff
            )
            if not filtered_out:
                print(">" + rec.name, file=f)
                print(rec.sequence, file=f)
def run(fp_out, a_ctg, a_ctg_all_tiling_path):
    """
    Write to fp_out the tiling-path lines whose first column is a header
    found in the FASTA file a_ctg.

    Lines are stripped before being written, blank lines are ignored, and
    each emitted line is terminated with a single newline.
    """
    with open_fasta_reader(a_ctg) as fasta_in:
        a_ctg_ids = load_headers(fasta_in)

    with open(a_ctg_all_tiling_path, 'r') as tiling_in:
        for raw_line in tiling_in:
            entry = raw_line.strip()
            if not entry:  # pragma: no cover
                continue  # pragma: no cover
            if entry.split()[0] in a_ctg_ids:
                fp_out.write(entry + '\n')
def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    """
    Write one "<contig>_ref.fa" file per selected primary contig.

    Contigs are read from "<base_dir>/2-asm-falcon/p_ctg.fa". A contig is
    selected when ctg_id is 'all' or equals the contig's ID (first token of
    its header) and its sequence is at least min_ctg_lenth long.

    Args:
        base_dir: root directory of the assembly run.
        fofn: path to the read file-of-filenames (not used in the part of
            the function shown here).
        ctg_id: a contig ID, or 'all'.
        out_dir: output directory; defaults to "<base_dir>/3-unzip/reads"
            when None.
        min_ctg_lenth: minimum sequence length for a contig to be written.
    """
    read_fofn = fofn
    if out_dir is None:
        out_dir = os.path.join(base_dir, '3-unzip/reads')

    ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
    read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')

    rawread_id_file = os.path.join(read_map_dir, 'dump_rawread_ids',
                                   'rawread_ids')
    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')

    # Context managers close the id files promptly; the original left the
    # handles to the garbage collector.
    with open(rawread_id_file) as id_file:
        # daligner raw read id to the original ids
        rid_to_oid = id_file.read().split('\n')
    with open(pread_id_file) as id_file:
        # daligner pread id to the fake ids
        pid_to_fid = id_file.read().split('\n')

    def pid_to_oid(pid):
        fid = pid_to_fid[int(pid)]
        # Floor division keeps this an exact integer on Python 3 as well.
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[int(rid)]

    with open_fasta_reader(ctg_fa) as ref_fasta:
        all_ctg_ids = set()
        for s in ref_fasta:
            s_id = s.name.split()[0]
            if ctg_id != 'all' and s_id != ctg_id:
                continue

            if len(s.sequence) < min_ctg_lenth:
                continue

            # When ctg_id != 'all' we only get here with s_id == ctg_id, so
            # naming the file after s_id covers both original branches.
            ref_path = os.path.join(out_dir, '%s_ref.fa' % s_id)
            with open(ref_path, 'w') as ref_out:
                # f.write() replaces the Python-2-only ``print >>`` statement.
                ref_out.write('>%s\n' % s_id)
                ref_out.write('%s\n' % (s.sequence,))
            all_ctg_ids.add(s_id)
def test_load_headers_3(tmpdir):
    """
    Empty-input case: an empty FASTA file is expected to yield an empty set.
    """
    random.seed(1234567)
    # Generated as in the sibling tests, but never written to the file.
    dummy_seq = ''.join(random.choice('ACTG') for _ in range(500))

    no_lines = []

    # Materialize the (empty) content as a temporary FASTA file.
    fasta_path = tmpdir.join('a_ctg.fa')
    fasta_path.write('\n'.join(no_lines))

    with open_fasta_reader(str(fasta_path)) as fp_in:
        out = mod.load_headers(fp_in)

    assert out == set()
def test_load_headers_3(tmpdir):
    """
    Empty input.
    """
    random.seed(1234567)
    # range() instead of xrange(): xrange does not exist on Python 3, and
    # range yields the same 500 iterations on Python 2.
    # dummy_seq is not written to the file; it is kept so the RNG is
    # advanced exactly as before.
    dummy_seq = ''.join([random.choice('ACTG') for _ in range(500)])

    fasta_lines = []

    # Create a temporary (empty) input file.
    test_out_a_ctg_file = tmpdir.join('a_ctg.fa')
    test_out_a_ctg_file.write('\n'.join(fasta_lines))

    with open_fasta_reader(str(test_out_a_ctg_file)) as fp_in:
        out = mod.load_headers(fp_in)

    expected = set()
    assert out == expected
def main(argv=sys.argv):
    """
    Print, for every record in "a_ctg.fa", its ID and the coordinates of its
    two end nodes on the corresponding primary contig.

    Coordinates come from "p_ctg_tiling_path": for each contig the first
    row's start node gets coordinate 0, and each row's end node gets the
    running sum of abs(b - e) over that contig's rows so far.

    Args:
        argv: unused; kept for the common ``main(argv)`` interface.

    Raises:
        ValueError: if a tiling-path row has fewer than six columns.
        KeyError: if a record refers to a contig or node missing from the
            tiling path.
    """
    p_ctg_coor_map = {}
    # Running end coordinate per contig. The original kept one shared
    # counter and relied on the tiling path being grouped by contig; a
    # per-contig counter gives the same result for grouped input and stays
    # correct if a contig's rows are not contiguous.
    ctg_end = {}
    with open("p_ctg_tiling_path") as f:
        for row in f:
            row = row.strip().split()
            ctg_id, v, w, edge_rid, b, e = row[:6]
            if ctg_id not in p_ctg_coor_map:
                # First edge of this contig: its start node anchors at 0.
                p_ctg_coor_map[ctg_id] = {v: 0}
                ctg_end[ctg_id] = 0
            ctg_end[ctg_id] += abs(int(b) - int(e))
            p_ctg_coor_map[ctg_id][w] = ctg_end[ctg_id]

    with open_fasta_reader("a_ctg.fa") as a_ctg_fasta:
        for r in a_ctg_fasta:
            rid = r.name.split()
            rid, v, w = rid[:3]
            # The primary-contig ID is the part of the record ID before
            # the first '-'.
            pid = rid.split("-")[0]
            print(rid, p_ctg_coor_map[pid][v], p_ctg_coor_map[pid][w])
def run(improper_p_ctg, proper_a_ctg):
    """Build primary and associated contigs from the string-graph files.

    Inputs are the module-level paths ``edge_data_file``, ``read_fasta``,
    ``utg_data_file`` and ``ctg_data_file``. Six files are written to the
    current directory: "p_ctg.fa", "a_ctg_all.fa", "a_ctg_base.fa",
    "p_ctg_tiling_path", "a_ctg_tiling_path" and "a_ctg_base_tiling_path".

    Args:
        improper_p_ctg: when true, neglect the initial read of each primary
            contig (the sequence starts with the first edge sequence);
            otherwise it is prefixed with ``yield_first_seq(...)``.
            We used to need that for unzip.
        proper_a_ctg: when true, each associated contig is prefixed with
            ``yield_first_seq(...)``; otherwise it starts with the first
            edge sequence.
    """
    # Pass 1 over the edge file: collect the reads used by "G" edges.
    reads_in_layout = set()
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            # e.g. 001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G
            v, w, rid, s, t, aln_score, idt, type_ = l
            if type_ != "G":
                continue
            r1 = v.split(":")[0]
            reads_in_layout.add(r1)
            r2 = w.split(":")[0]
            reads_in_layout.add(r2)

    seqs = {}
    # load all p-read name into memory
    with open_fasta_reader(read_fasta) as f:
        for r in f:
            if r.name not in reads_in_layout:
                continue
            seqs[r.name] = r.sequence.upper()  # name == rid-string

    # Pass 2 over the edge file: attach the edge sequence to every "G" edge.
    edge_data = {}
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            # e.g. 001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G
            v, w, rid, s, t, aln_score, idt, type_ = l

            if type_ != "G":
                continue
            r1, dir1 = v.split(":")
            reads_in_layout.add(r1)  # redundant, but harmless
            r2, dir2 = w.split(":")
            reads_in_layout.add(r2)  # redundant, but harmless

            s = int(s)
            t = int(t)
            aln_score = int(aln_score)
            idt = float(idt)

            if s < t:
                e_seq = seqs[rid][s:t]
                assert 'E' == dir2
            else:
                # t and s were swapped for 'c' alignments in ovlp_to_graph.generate_string_graph():702
                # They were translated from reverse-dir to forward-dir coordinate system in LA4Falcon.
                e_seq = "".join([RCMAP[c] for c in seqs[rid][t:s][::-1]])
                assert 'B' == dir2
            edge_data[(v, w)] = (rid, s, t, aln_score, idt, e_seq)

    utg_data = {}
    with open(utg_data_file) as f:
        for l in f:
            l = l.strip().split()
            s, v, t, type_, length, score, path_or_edges = l
            if type_ not in ["compound", "simple", "contained"]:
                continue
            length = int(length)
            score = int(score)
            if type_ in ("simple", "contained"):
                # A node path: n0~n1~n2...
                path_or_edges = path_or_edges.split("~")
            else:
                # A compound unitig: '|'-separated (s, v, t) sub-unitig keys.
                path_or_edges = [tuple(e.split("~"))
                                 for e in path_or_edges.split("|")]
            utg_data[(s, v, t)] = type_, length, score, path_or_edges

    layout_ctg = set()

    # All outputs are managed by one ``with`` so they are flushed and closed
    # even if an exception interrupts the run (the original closed them by
    # hand at the very end, and closed a_ctg_t_out twice).
    with open("p_ctg.fa", "w") as p_ctg_out, \
            open("a_ctg_all.fa", "w") as a_ctg_out, \
            open("a_ctg_base.fa", "w") as a_ctg_base_out, \
            open("p_ctg_tiling_path", "w") as p_ctg_t_out, \
            open("a_ctg_tiling_path", "w") as a_ctg_t_out, \
            open("a_ctg_base_tiling_path", "w") as a_ctg_base_t_out, \
            open(ctg_data_file) as f:
        for l in f:
            l = l.strip().split()
            ctg_id, c_type_, i_utig, t0, length, score, utgs = l
            s0 = i_utig.split("~")[0]

            # Skip a contig whose reversed end pair was already laid out.
            if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
                continue
            else:
                layout_ctg.add((s0, t0))

            ctg_label = i_utig + "~" + t0
            length = int(length)
            utgs = utgs.split("|")
            one_path = []
            total_score = 0
            total_length = 0

            # (s, t) of each compound unitig -> its alternative paths.
            a_ctg_group = {}

            for utg in utgs:
                s, v, t = utg.split("~")
                type_, length, score, path_or_edges = utg_data[(s, v, t)]
                total_score += score
                total_length += length
                if type_ == "simple":
                    # Consecutive unitigs share a node; drop the repeat.
                    if len(one_path) != 0:
                        one_path.extend(path_or_edges[1:])
                    else:
                        one_path.extend(path_or_edges)
                if type_ == "compound":

                    c_graph = nx.DiGraph()

                    all_alt_path = []
                    for ss, vv, tt in path_or_edges:
                        # NOTE: rebinds type_/length/score; the totals above
                        # were already updated with the compound's values.
                        type_, length, score, sub_path = utg_data[(ss, vv, tt)]

                        v1 = sub_path[0]
                        for v2 in sub_path[1:]:
                            c_graph.add_edge(
                                v1, v2, e_score=edge_data[(v1, v2)][3])
                            v1 = v2

                    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
                    score = nx.shortest_path_length(c_graph, s, t, "e_score")
                    all_alt_path.append((score, shortest_path))

                    # Repeatedly remove the edges of the last path found and
                    # search again, until s and t are disconnected.
                    while 1:
                        n0 = shortest_path[0]
                        for n1 in shortest_path[1:]:
                            c_graph.remove_edge(n0, n1)
                            n0 = n1
                        try:
                            shortest_path = nx.shortest_path(
                                c_graph, s, t, "e_score")
                            score = nx.shortest_path_length(
                                c_graph, s, t, "e_score")
                            all_alt_path.append((score, shortest_path))

                        except nx.exception.NetworkXNoPath:
                            break
                    # Descending order: the first entry is the path used in
                    # the primary contig.
                    all_alt_path.sort()
                    all_alt_path.reverse()
                    shortest_path = all_alt_path[0][1]
                    if len(one_path) != 0:
                        one_path.extend(shortest_path[1:])
                    else:
                        one_path.extend(shortest_path)

                    a_ctg_group[(s, t)] = all_alt_path

            if len(one_path) == 0:
                continue

            one_path_edges = list(zip(one_path[:-1], one_path[1:]))

            if improper_p_ctg:
                sub_seqs = []
            else:
                sub_seqs = list(yield_first_seq(one_path_edges, seqs))
            for vv, ww in one_path_edges:
                rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                sub_seqs.append(e_seq)
                print("%s %s %s %s %d %d %d %0.2f" % (
                    ctg_id, vv, ww, rid, s, t, aln_score, idt), file=p_ctg_t_out)
            print(">%s %s %s %d %d" % (
                ctg_id, ctg_label, c_type_, total_length, total_score), file=p_ctg_out)
            print("".join(sub_seqs), file=p_ctg_out)

            a_id = 1
            for v, w, in a_ctg_group:
                # get the base sequence used in the primary contig
                atig_output = []

                score, atig_path = a_ctg_group[(v, w)][0]
                atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
                if not proper_a_ctg:
                    sub_seqs = []
                else:
                    sub_seqs = list(yield_first_seq(atig_path_edges, seqs))
                total_length = 0
                total_score = 0
                for vv, ww in atig_path_edges:
                    rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                    sub_seqs.append(e_seq)
                    total_length += abs(s - t)
                    total_score += aln_score

                base_seq = "".join(sub_seqs)
                # The base path is recorded with delta_len 0, idt 1, cov 1.
                atig_output.append(
                    (v, w, atig_path, total_length, total_score, base_seq, atig_path_edges, 0, 1, 1))

                for score, atig_path in a_ctg_group[(v, w)][1:]:
                    atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
                    if not proper_a_ctg:
                        sub_seqs = []
                    else:
                        sub_seqs = list(yield_first_seq(atig_path_edges, seqs))
                    total_length = 0
                    total_score = 0
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        sub_seqs.append(e_seq)
                        total_length += abs(s - t)
                        total_score += aln_score

                    seq = "".join(sub_seqs)

                    delta_len = len(seq) - len(base_seq)
                    idt = 0.0
                    cov = 0.0
                    # Only align when both sequences are longer than 2000.
                    if len(base_seq) > 2000 and len(seq) > 2000:
                        try:
                            aln_data = get_aln_data(base_seq, seq)
                            if len(aln_data) != 0:
                                idt = 1.0 - 1.0 * \
                                    aln_data[-1][-1] / aln_data[-1][-2]
                                cov = 1.0 * \
                                    (aln_data[-1][3] - aln_data[-1][2]) / aln_data[-1][4]
                        except TooLongError:
                            log('WARNING: Seqs were too long for get_aln_data(), so we set idt/cov low enough to prevent filtering by dedup_a_tigs, at atig_path[:-1] == {}'.format(atig_path[:-1]))
                            idt = -1.0
                            cov = -1.0

                    atig_output.append(
                        (v, w, atig_path, total_length, total_score, seq, atig_path_edges, delta_len, idt, cov))

                # Only the base path exists: nothing to report, and a_id is
                # deliberately not advanced.
                if len(atig_output) == 1:
                    continue

                sub_id = 0
                for data in atig_output:
                    v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, delta_len, a_idt, cov = data
                    # sub_id 0 is the base path (goes to the *_base outputs);
                    # the alternatives go to the a_ctg outputs.
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        if sub_id != 0:
                            print("%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (
                                ctg_id, a_id, sub_id, vv, ww, rid, s, t, aln_score, idt), file=a_ctg_t_out)
                        else:
                            print("%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (
                                ctg_id, a_id, sub_id, vv, ww, rid, s, t, aln_score, idt), file=a_ctg_base_t_out)

                    if sub_id != 0:
                        print(">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (
                            ctg_id, a_id, sub_id, v0, w0, total_length, total_score, len(atig_path_edges), delta_len, a_idt, cov), file=a_ctg_out)
                        print(seq, file=a_ctg_out)
                    else:
                        print(">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (
                            ctg_id, a_id, sub_id, v0, w0, total_length, total_score, len(atig_path_edges), delta_len, a_idt, cov), file=a_ctg_base_out)
                        print(seq, file=a_ctg_base_out)

                    sub_id += 1

                a_id += 1
def run(improper_p_ctg, proper_a_ctg):
    """Build primary and associated contigs from the string-graph files.

    Inputs are the module-level paths ``edge_data_file``, ``read_fasta``,
    ``utg_data_file`` and ``ctg_data_file``. Six files are written to the
    current directory: "p_ctg.fa", "a_ctg_all.fa", "a_ctg_base.fa",
    "p_ctg_tiling_path", "a_ctg_tiling_path" and "a_ctg_base_tiling_path".

    Args:
        improper_p_ctg: when true, neglect the initial read of each primary
            contig (the sequence starts with the first edge sequence);
            otherwise it is prefixed with ``yield_first_seq(...)``.
            We used to need that for unzip.
        proper_a_ctg: when true, each associated contig is prefixed with
            ``yield_first_seq(...)``; otherwise it starts with the first
            edge sequence.
    """
    # Pass 1 over the edge file: collect the reads used by "G" edges.
    reads_in_layout = set()
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            # e.g. 001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G
            v, w, rid, s, t, aln_score, idt, type_ = l
            if type_ != "G":
                continue
            r1 = v.split(":")[0]
            reads_in_layout.add(r1)
            r2 = w.split(":")[0]
            reads_in_layout.add(r2)

    seqs = {}
    # load all p-read name into memory
    with open_fasta_reader(read_fasta) as f:
        for r in f:
            if r.name not in reads_in_layout:
                continue
            seqs[r.name] = r.sequence.upper()  # name == rid-string

    # Pass 2 over the edge file: attach the edge sequence to every "G" edge.
    edge_data = {}
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            # e.g. 001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G
            v, w, rid, s, t, aln_score, idt, type_ = l

            if type_ != "G":
                continue
            r1, dir1 = v.split(":")
            reads_in_layout.add(r1)  # redundant, but harmless
            r2, dir2 = w.split(":")
            reads_in_layout.add(r2)  # redundant, but harmless

            s = int(s)
            t = int(t)
            aln_score = int(aln_score)
            idt = float(idt)

            if s < t:
                e_seq = seqs[rid][s:t]
                assert 'E' == dir2
            else:
                # t and s were swapped for 'c' alignments in ovlp_to_graph.generate_string_graph():702
                # They were translated from reverse-dir to forward-dir coordinate system in LA4Falcon.
                e_seq = "".join([RCMAP[c] for c in seqs[rid][t:s][::-1]])
                assert 'B' == dir2
            edge_data[(v, w)] = (rid, s, t, aln_score, idt, e_seq)

    utg_data = {}
    with open(utg_data_file) as f:
        for l in f:
            l = l.strip().split()
            s, v, t, type_, length, score, path_or_edges = l
            if type_ not in ["compound", "simple", "contained"]:
                continue
            length = int(length)
            score = int(score)
            if type_ in ("simple", "contained"):
                # A node path: n0~n1~n2...
                path_or_edges = path_or_edges.split("~")
            else:
                # A compound unitig: '|'-separated (s, v, t) sub-unitig keys.
                path_or_edges = [
                    tuple(e.split("~")) for e in path_or_edges.split("|")
                ]
            utg_data[(s, v, t)] = type_, length, score, path_or_edges

    layout_ctg = set()

    # All outputs are managed by one ``with`` so they are flushed and closed
    # even if an exception interrupts the run (the original closed them by
    # hand at the very end, and closed a_ctg_t_out twice).
    with open("p_ctg.fa", "w") as p_ctg_out, \
            open("a_ctg_all.fa", "w") as a_ctg_out, \
            open("a_ctg_base.fa", "w") as a_ctg_base_out, \
            open("p_ctg_tiling_path", "w") as p_ctg_t_out, \
            open("a_ctg_tiling_path", "w") as a_ctg_t_out, \
            open("a_ctg_base_tiling_path", "w") as a_ctg_base_t_out, \
            open(ctg_data_file) as f:
        for l in f:
            l = l.strip().split()
            ctg_id, c_type_, i_utig, t0, length, score, utgs = l
            s0 = i_utig.split("~")[0]

            # Skip a contig whose reversed end pair was already laid out.
            if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
                continue
            else:
                layout_ctg.add((s0, t0))

            ctg_label = i_utig + "~" + t0
            length = int(length)
            utgs = utgs.split("|")
            one_path = []
            total_score = 0
            total_length = 0

            # (s, t) of each compound unitig -> its alternative paths.
            a_ctg_group = {}

            for utg in utgs:
                s, v, t = utg.split("~")
                type_, length, score, path_or_edges = utg_data[(s, v, t)]
                total_score += score
                total_length += length
                if type_ == "simple":
                    # Consecutive unitigs share a node; drop the repeat.
                    if len(one_path) != 0:
                        one_path.extend(path_or_edges[1:])
                    else:
                        one_path.extend(path_or_edges)
                if type_ == "compound":

                    c_graph = nx.DiGraph()

                    all_alt_path = []
                    for ss, vv, tt in path_or_edges:
                        # NOTE: rebinds type_/length/score; the totals above
                        # were already updated with the compound's values.
                        type_, length, score, sub_path = utg_data[(ss, vv, tt)]

                        v1 = sub_path[0]
                        for v2 in sub_path[1:]:
                            c_graph.add_edge(v1,
                                             v2,
                                             e_score=edge_data[(v1, v2)][3])
                            v1 = v2

                    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
                    score = nx.shortest_path_length(c_graph, s, t, "e_score")
                    all_alt_path.append((score, shortest_path))

                    # Repeatedly remove the edges of the last path found and
                    # search again, until s and t are disconnected.
                    while 1:
                        n0 = shortest_path[0]
                        for n1 in shortest_path[1:]:
                            c_graph.remove_edge(n0, n1)
                            n0 = n1
                        try:
                            shortest_path = nx.shortest_path(
                                c_graph, s, t, "e_score")
                            score = nx.shortest_path_length(
                                c_graph, s, t, "e_score")
                            all_alt_path.append((score, shortest_path))

                        except nx.exception.NetworkXNoPath:
                            break
                    # Descending order: the first entry is the path used in
                    # the primary contig.
                    all_alt_path.sort()
                    all_alt_path.reverse()
                    shortest_path = all_alt_path[0][1]
                    if len(one_path) != 0:
                        one_path.extend(shortest_path[1:])
                    else:
                        one_path.extend(shortest_path)

                    a_ctg_group[(s, t)] = all_alt_path

            if len(one_path) == 0:
                continue

            one_path_edges = list(zip(one_path[:-1], one_path[1:]))

            if improper_p_ctg:
                sub_seqs = []
            else:
                sub_seqs = list(yield_first_seq(one_path_edges, seqs))
            for vv, ww in one_path_edges:
                rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                sub_seqs.append(e_seq)
                print("%s %s %s %s %d %d %d %0.2f" %
                      (ctg_id, vv, ww, rid, s, t, aln_score, idt),
                      file=p_ctg_t_out)
            print(">%s %s %s %d %d" %
                  (ctg_id, ctg_label, c_type_, total_length, total_score),
                  file=p_ctg_out)
            print("".join(sub_seqs), file=p_ctg_out)

            a_id = 1
            for v, w, in a_ctg_group:
                # get the base sequence used in the primary contig
                atig_output = []

                score, atig_path = a_ctg_group[(v, w)][0]
                atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
                if not proper_a_ctg:
                    sub_seqs = []
                else:
                    sub_seqs = list(yield_first_seq(atig_path_edges, seqs))
                total_length = 0
                total_score = 0
                for vv, ww in atig_path_edges:
                    rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                    sub_seqs.append(e_seq)
                    total_length += abs(s - t)
                    total_score += aln_score

                base_seq = "".join(sub_seqs)
                # The base path is recorded with delta_len 0, idt 1, cov 1.
                atig_output.append((v, w, atig_path, total_length,
                                    total_score, base_seq, atig_path_edges,
                                    0, 1, 1))

                for score, atig_path in a_ctg_group[(v, w)][1:]:
                    atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
                    if not proper_a_ctg:
                        sub_seqs = []
                    else:
                        sub_seqs = list(yield_first_seq(atig_path_edges, seqs))
                    total_length = 0
                    total_score = 0
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        sub_seqs.append(e_seq)
                        total_length += abs(s - t)
                        total_score += aln_score

                    seq = "".join(sub_seqs)

                    delta_len = len(seq) - len(base_seq)
                    idt = 0.0
                    cov = 0.0
                    # Only align when both sequences are longer than 2000.
                    if len(base_seq) > 2000 and len(seq) > 2000:
                        try:
                            aln_data = get_aln_data(base_seq, seq)
                            if len(aln_data) != 0:
                                idt = 1.0 - 1.0 * \
                                    aln_data[-1][-1] / aln_data[-1][-2]
                                cov = 1.0 * \
                                    (aln_data[-1][3] - aln_data[-1][2]) / aln_data[-1][4]
                        except TooLongError:
                            log('WARNING: Seqs were too long for get_aln_data(), so we set idt/cov low enough to prevent filtering by dedup_a_tigs, at atig_path[:-1] == {}'
                                .format(atig_path[:-1]))
                            idt = -1.0
                            cov = -1.0

                    atig_output.append(
                        (v, w, atig_path, total_length, total_score, seq,
                         atig_path_edges, delta_len, idt, cov))

                # Only the base path exists: nothing to report, and a_id is
                # deliberately not advanced.
                if len(atig_output) == 1:
                    continue

                sub_id = 0
                for data in atig_output:
                    v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, delta_len, a_idt, cov = data
                    # sub_id 0 is the base path (goes to the *_base outputs);
                    # the alternatives go to the a_ctg outputs.
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        if sub_id != 0:
                            print("%s-%03d-%02d %s %s %s %d %d %d %0.2f" %
                                  (ctg_id, a_id, sub_id, vv, ww, rid, s, t,
                                   aln_score, idt),
                                  file=a_ctg_t_out)
                        else:
                            print("%s-%03d-%02d %s %s %s %d %d %d %0.2f" %
                                  (ctg_id, a_id, sub_id, vv, ww, rid, s, t,
                                   aln_score, idt),
                                  file=a_ctg_base_t_out)

                    if sub_id != 0:
                        print(">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" %
                              (ctg_id, a_id, sub_id, v0, w0, total_length,
                               total_score, len(atig_path_edges), delta_len,
                               a_idt, cov),
                              file=a_ctg_out)
                        print(seq, file=a_ctg_out)
                    else:
                        print(">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" %
                              (ctg_id, a_id, sub_id, v0, w0, total_length,
                               total_score, len(atig_path_edges), delta_len,
                               a_idt, cov),
                              file=a_ctg_base_out)
                        print(seq, file=a_ctg_base_out)

                    sub_id += 1

                a_id += 1
def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    """
    Split references and reads into per-contig FASTA files.

    Written into out_dir:
      * "<contig>_ref.fa" for every selected primary contig,
      * "ctg_list" with the selected contigs that have at least 5 mapped
        reads and whose ID ends in 'F' or 'R',
      * "<contig>_reads.fa" with the reads mapped to each selected contig,
        and "unassigned_reads.fa" for all other reads.

    Args:
        base_dir: root directory of the assembly run; inputs are read from
            "<base_dir>/2-asm-falcon".
        fofn: path to a file listing one read FASTA file per line.
        ctg_id: a contig ID, or 'all' to process every contig.
        out_dir: output directory; defaults to "<base_dir>/3-unzip/reads"
            when None.
        min_ctg_lenth: minimum sequence length for a contig to be selected.
    """
    read_fofn = fofn
    if out_dir is None:
        out_dir = os.path.join(base_dir, '3-unzip/reads')

    ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
    read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')

    rawread_id_file = os.path.join(read_map_dir, 'dump_rawread_ids',
                                   'rawread_ids')
    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')

    # Context managers close the id files promptly; the original left the
    # handles to the garbage collector.
    with open(rawread_id_file) as id_file:
        # daligner raw read id to the original ids
        rid_to_oid = id_file.read().split('\n')
    with open(pread_id_file) as id_file:
        # daligner pread id to the fake ids
        pid_to_fid = id_file.read().split('\n')

    def pid_to_oid(pid):
        fid = pid_to_fid[int(pid)]
        # Floor division keeps this an exact integer on Python 3 as well.
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[int(rid)]

    with open_fasta_reader(ctg_fa) as ref_fasta:
        all_ctg_ids = set()
        for s in ref_fasta:
            s_id = s.name.split()[0]
            if ctg_id != 'all' and s_id != ctg_id:
                continue

            if len(s.sequence) < min_ctg_lenth:
                continue

            # When ctg_id != 'all' we only get here with s_id == ctg_id, so
            # naming the file after s_id covers both original branches.
            ref_path = os.path.join(out_dir, '%s_ref.fa' % s_id)
            with open(ref_path, 'w') as ref_out:
                # f.write() replaces the Python-2-only ``print >>`` statement.
                ref_out.write('>%s\n' % s_id)
                ref_out.write('%s\n' % (s.sequence,))
            all_ctg_ids.add(s_id)

    read_set = {}      # original read id -> contig id
    ctg_id_hits = {}   # contig id -> number of mapped reads

    map_fn = os.path.join(read_map_dir, 'rawread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1]
            hit_ctg = hit_ctg.split('-')[0]
            # Only rows whose fourth column is 0 are used.
            if int(row[3]) == 0:
                o_id = rid_to_oid[int(row[0])]
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1]
            hit_ctg = hit_ctg.split('-')[0]
            # NOTE(review): read_set is keyed by read id, yet this tests a
            # contig id; possibly ``o_id not in read_set`` was intended.
            # Left unchanged because fixing it would alter read assignment.
            if hit_ctg not in read_set and int(row[3]) == 0:
                o_id = pid_to_oid(row[0])
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    with open(os.path.join(out_dir, 'ctg_list'), 'w') as f:
        # A distinct loop name avoids rebinding the ``ctg_id`` parameter.
        for listed_id in sorted(all_ctg_ids):
            if ctg_id_hits.get(listed_id, 0) < 5:
                continue
            # ignore small circle contigs, they need different approach
            if listed_id[-1] not in ['F', 'R']:
                continue
            f.write('%s\n' % (listed_id,))

    read_out_files = {}

    @contextlib.contextmanager
    def reopened_fasta_out(ctg_id):
        # A convenient closure, with a contextmanager.
        # The first use truncates the file; later uses append to it.
        mode = 'a' if ctg_id in read_out_files else 'w'
        read_out = open(os.path.join(out_dir, '%s_reads.fa' % ctg_id), mode)
        read_out_files[ctg_id] = 1
        try:
            yield read_out
        finally:
            # Close even when the ``with`` body raises.
            read_out.close()

    with open(read_fofn, 'r') as f:
        for r_fn in f:
            r_fn = r_fn.strip()
            # will soon handle .dexta too
            with open_fasta_reader(r_fn) as read_fa_file:
                for r in read_fa_file:
                    rid = r.name.split()[0]
                    target_id = read_set.get(rid, 'unassigned')
                    if target_id == 'NA' or target_id not in all_ctg_ids:
                        target_id = 'unassigned'

                    with reopened_fasta_out(target_id) as read_out:
                        read_out.write('>' + rid + '\n')
                        read_out.write('%s\n' % (r.sequence,))
def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    """
    Split references and reads into per-contig FASTA files.

    Written into out_dir:
      * "<contig>_ref.fa" for every selected primary contig,
      * "ctg_list" with the selected contigs that have at least 5 mapped
        reads and whose ID ends in 'F' or 'R',
      * "<contig>_reads.fa" with the reads mapped to each selected contig,
        and "unassigned_reads.fa" for all other reads.

    Args:
        base_dir: root directory of the assembly run; inputs are read from
            "<base_dir>/2-asm-falcon".
        fofn: path to a file listing one read FASTA file per line.
        ctg_id: a contig ID, or 'all' to process every contig.
        out_dir: output directory; defaults to "<base_dir>/3-unzip/reads"
            when None.
        min_ctg_lenth: minimum sequence length for a contig to be selected.

    Raises:
        AssertionError: if no read was collected from 'rawread_to_contigs'.
    """
    read_fofn = fofn
    if out_dir is None:
        out_dir = os.path.join(base_dir, '3-unzip/reads')

    ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
    read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')

    rawread_id_file = os.path.join(
        read_map_dir, 'dump_rawread_ids', 'rawread_ids')
    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')

    # Context managers close the id files promptly; the original left the
    # handles to the garbage collector.
    with open(rawread_id_file) as id_file:
        # daligner raw read id to the original ids
        rid_to_oid = id_file.read().split('\n')
    with open(pread_id_file) as id_file:
        # daligner pread id to the fake ids
        pid_to_fid = id_file.read().split('\n')

    # NOTE(review): str.split('\n') always returns at least one element, so
    # these two asserts cannot fail even for an empty file. Kept as-is to
    # avoid changing behavior; tighten if empty files should be an error.
    assert rid_to_oid, 'Empty rid_to_oid. Maybe empty {!r}?'.format(
        rawread_id_file)
    assert pid_to_fid, 'Empty pid_to_fid. Maybe empty {!r}?'.format(
        pread_id_file)

    def pid_to_oid(pid):
        fid = pid_to_fid[int(pid)]
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[int(rid)]

    with open_fasta_reader(ctg_fa) as ref_fasta:
        all_ctg_ids = set()
        for s in ref_fasta:
            s_id = s.name.split()[0]
            if ctg_id != 'all' and s_id != ctg_id:
                continue

            if len(s.sequence) < min_ctg_lenth:
                continue

            # When ctg_id != 'all' we only get here with s_id == ctg_id, so
            # naming the file after s_id covers both original branches.
            ref_path = os.path.join(out_dir, '%s_ref.fa' % s_id)
            with open(ref_path, 'w') as ref_out:
                print('>%s' % s_id, file=ref_out)
                print(s.sequence, file=ref_out)
            all_ctg_ids.add(s_id)

    read_set = {}      # original read id -> contig id
    ctg_id_hits = {}   # contig id -> number of mapped reads

    map_fn = os.path.join(read_map_dir, 'rawread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1]
            hit_ctg = hit_ctg.split('-')[0]
            # Only rows whose fourth column is 0 are used.
            if int(row[3]) == 0:
                o_id = rid_to_oid[int(row[0])]
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1
    assert read_set, 'Empty read_set. Maybe empty {!r}?'.format(map_fn)

    map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1]
            hit_ctg = hit_ctg.split('-')[0]
            # NOTE(review): read_set is keyed by read id, yet this tests a
            # contig id; possibly ``o_id not in read_set`` was intended.
            # Left unchanged because fixing it would alter read assignment.
            if hit_ctg not in read_set and int(row[3]) == 0:
                o_id = pid_to_oid(row[0])
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    with open(os.path.join(out_dir, 'ctg_list'), 'w') as f:
        # A distinct loop name avoids rebinding the ``ctg_id`` parameter.
        for listed_id in sorted(all_ctg_ids):
            if ctg_id_hits.get(listed_id, 0) < 5:
                continue
            # ignore small circle contigs, they need different approach
            if listed_id[-1] not in ['F', 'R']:
                continue
            print(listed_id, file=f)

    read_out_files = {}

    @contextlib.contextmanager
    def reopened_fasta_out(ctg_id):
        # A convenient closure, with a contextmanager.
        # The first use truncates the file; later uses append to it.
        mode = 'a' if ctg_id in read_out_files else 'w'
        read_out = open(os.path.join(out_dir, '%s_reads.fa' % ctg_id), mode)
        read_out_files[ctg_id] = 1
        try:
            yield read_out
        finally:
            # Close even when the ``with`` body raises.
            read_out.close()

    with open(read_fofn, 'r') as f:
        for r_fn in f:
            r_fn = r_fn.strip()
            # will soon handle .dexta too
            with open_fasta_reader(r_fn) as read_fa_file:
                for r in read_fa_file:
                    rid = r.name.split()[0]
                    target_id = read_set.get(rid, 'unassigned')
                    if target_id == 'NA' or target_id not in all_ctg_ids:
                        target_id = 'unassigned'

                    with reopened_fasta_out(target_id) as read_out:
                        print('>' + rid, file=read_out)
                        print(r.sequence, file=read_out)
def main(argv=sys.argv):
    """
    Command-line entry point: parse argv, open args.a_ctg_all as FASTA and
    hand it to run() together with the filter settings, writing to stdout.
    """
    args = parse_args(argv)
    with open_fasta_reader(args.a_ctg_all) as fp_in:
        filter_settings = (
            args.max_idt,
            args.max_aln_cov,
            args.min_len_diff,
            args.min_seq_len,
            args.ploidy,
        )
        run(sys.stdout, fp_in, *filter_settings)
def main(argv=sys.argv):
    """Build primary and associated contigs from the string-graph files.

    Inputs are the module-level paths ``edge_data_file``, ``read_fasta``,
    ``utg_data_file`` and ``ctg_data_file``. Six files are written to the
    current directory: "p_ctg.fa", "a_ctg_all.fa", "a_ctg_base.fa",
    "p_ctg_tiling_path", "a_ctg_tiling_path" and "a_ctg_base_tiling_path".

    Args:
        argv: unused; kept for the common ``main(argv)`` interface.
    """
    # Pass 1 over the edge file: collect the reads used by "G" edges.
    reads_in_layout = set()
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            # e.g. 001039799:E 000333411:E 000333411 17524 20167 17524 99.62
            v, w, rid, s, t, aln_score, idt, type_ = l
            if type_ != "G":
                continue
            r1 = v.split(":")[0]
            reads_in_layout.add(r1)
            r2 = w.split(":")[0]
            reads_in_layout.add(r2)

    seqs = {}
    # load all p-read name into memory
    with open_fasta_reader(read_fasta) as f:
        for r in f:
            if r.name not in reads_in_layout:
                continue
            seqs[r.name] = r.sequence.upper()

    # Pass 2 over the edge file: attach the edge sequence to every "G" edge.
    edge_data = {}
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            # e.g. 001039799:E 000333411:E 000333411 17524 20167 17524 99.62
            v, w, rid, s, t, aln_score, idt, type_ = l

            if type_ != "G":
                continue
            r1 = v.split(":")[0]
            reads_in_layout.add(r1)
            r2 = w.split(":")[0]
            reads_in_layout.add(r2)

            s = int(s)
            t = int(t)
            aln_score = int(aln_score)
            idt = float(idt)

            if s < t:
                e_seq = seqs[rid][s:t]
            else:
                # t and s were swapped for 'c' alignments in ovlp_to_graph.generate_string_graph():702
                # They were translated from reverse-dir to forward-dir coordinate system in LA4Falcon.
                e_seq = "".join([RCMAP[c] for c in seqs[rid][t:s][::-1]])
            edge_data[(v, w)] = (rid, s, t, aln_score, idt, e_seq)

    utg_data = {}
    with open(utg_data_file) as f:
        for l in f:
            l = l.strip().split()
            s, v, t, type_, length, score, path_or_edges = l
            if type_ not in ["compound", "simple", "contained"]:
                continue
            length = int(length)
            score = int(score)
            if type_ in ("simple", "contained"):
                # A node path: n0~n1~n2...
                path_or_edges = path_or_edges.split("~")
            else:
                # A compound unitig: '|'-separated (s, v, t) sub-unitig keys.
                path_or_edges = [
                    tuple(e.split("~")) for e in path_or_edges.split("|")
                ]
            utg_data[(s, v, t)] = type_, length, score, path_or_edges

    layout_ctg = set()

    # All outputs are managed by one ``with`` so they are flushed and closed
    # even if an exception interrupts the run (the original closed them by
    # hand at the very end, and closed a_ctg_t_out twice).
    # Output uses file.write() rather than the Python-2-only ``print >>``
    # statement, so the function behaves the same on Python 2 and 3.
    with open("p_ctg.fa", "w") as p_ctg_out, \
            open("a_ctg_all.fa", "w") as a_ctg_out, \
            open("a_ctg_base.fa", "w") as a_ctg_base_out, \
            open("p_ctg_tiling_path", "w") as p_ctg_t_out, \
            open("a_ctg_tiling_path", "w") as a_ctg_t_out, \
            open("a_ctg_base_tiling_path", "w") as a_ctg_base_t_out, \
            open(ctg_data_file) as f:
        for l in f:
            l = l.strip().split()
            ctg_id, c_type_, i_utig, t0, length, score, utgs = l
            s0 = i_utig.split("~")[0]

            # Skip a contig whose reversed end pair was already laid out.
            if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
                continue
            else:
                layout_ctg.add((s0, t0))

            ctg_label = i_utig + "~" + t0
            length = int(length)
            utgs = utgs.split("|")
            one_path = []
            total_score = 0
            total_length = 0

            # (s, t) of each compound unitig -> its alternative paths.
            a_ctg_group = {}

            for utg in utgs:
                s, v, t = utg.split("~")
                type_, length, score, path_or_edges = utg_data[(s, v, t)]
                total_score += score
                total_length += length
                if type_ == "simple":
                    # Consecutive unitigs share a node; drop the repeat.
                    if len(one_path) != 0:
                        one_path.extend(path_or_edges[1:])
                    else:
                        one_path.extend(path_or_edges)
                if type_ == "compound":

                    c_graph = nx.DiGraph()

                    all_alt_path = []
                    for ss, vv, tt in path_or_edges:
                        # NOTE: rebinds type_/length/score; the totals above
                        # were already updated with the compound's values.
                        type_, length, score, sub_path = utg_data[(ss, vv, tt)]

                        v1 = sub_path[0]
                        for v2 in sub_path[1:]:
                            c_graph.add_edge(v1,
                                             v2,
                                             e_score=edge_data[(v1, v2)][3])
                            v1 = v2

                    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
                    score = nx.shortest_path_length(c_graph, s, t, "e_score")
                    all_alt_path.append((score, shortest_path))

                    # Repeatedly remove the edges of the last path found and
                    # search again, until s and t are disconnected.
                    while 1:
                        n0 = shortest_path[0]
                        for n1 in shortest_path[1:]:
                            c_graph.remove_edge(n0, n1)
                            n0 = n1
                        try:
                            shortest_path = nx.shortest_path(
                                c_graph, s, t, "e_score")
                            score = nx.shortest_path_length(
                                c_graph, s, t, "e_score")
                            all_alt_path.append((score, shortest_path))

                        except nx.exception.NetworkXNoPath:
                            break
                    # Descending order: the first entry is the path used in
                    # the primary contig.
                    all_alt_path.sort()
                    all_alt_path.reverse()
                    shortest_path = all_alt_path[0][1]
                    if len(one_path) != 0:
                        one_path.extend(shortest_path[1:])
                    else:
                        one_path.extend(shortest_path)

                    a_ctg_group[(s, t)] = all_alt_path

            if len(one_path) == 0:
                continue

            # list(...) so the pairs can be re-iterated and measured with
            # len() on Python 3, where zip() returns a one-shot iterator.
            one_path_edges = list(zip(one_path[:-1], one_path[1:]))

            sub_seqs = []
            for vv, ww in one_path_edges:
                rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                sub_seqs.append(e_seq)
                p_ctg_t_out.write("%s %s %s %s %d %d %d %0.2f\n" % (
                    ctg_id, vv, ww, rid, s, t, aln_score, idt))
            p_ctg_out.write(">%s %s %s %d %d\n" % (
                ctg_id, ctg_label, c_type_, total_length, total_score))
            p_ctg_out.write("".join(sub_seqs) + "\n")

            a_id = 1
            for v, w, in a_ctg_group:
                # get the base sequence used in the primary contig
                atig_output = []

                score, atig_path = a_ctg_group[(v, w)][0]
                atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
                sub_seqs = []
                total_length = 0
                total_score = 0
                for vv, ww in atig_path_edges:
                    rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                    sub_seqs.append(e_seq)
                    total_length += abs(s - t)
                    total_score += aln_score

                base_seq = "".join(sub_seqs)
                # The base path is recorded with delta_len 0, idt 1, cov 1.
                atig_output.append((v, w, atig_path, total_length,
                                    total_score, base_seq, atig_path_edges,
                                    0, 1, 1))

                for score, atig_path in a_ctg_group[(v, w)][1:]:
                    atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
                    sub_seqs = []
                    total_length = 0
                    total_score = 0
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        sub_seqs.append(e_seq)
                        total_length += abs(s - t)
                        total_score += aln_score

                    seq = "".join(sub_seqs)

                    delta_len = len(seq) - len(base_seq)
                    idt = 0.0
                    cov = 0.0
                    # Only align when both sequences are longer than 2000.
                    if len(base_seq) > 2000 and len(seq) > 2000:
                        # get_aln_data() returns a 3-tuple here; only the
                        # first element is used.
                        aln_data, _, _ = get_aln_data(base_seq, seq)
                        if len(aln_data) != 0:
                            idt = 1.0 - 1.0 * \
                                aln_data[-1][-1] / aln_data[-1][-2]
                            cov = 1.0 * \
                                (aln_data[-1][3] - aln_data[-1][2]) / aln_data[-1][4]

                    atig_output.append(
                        (v, w, atig_path, total_length, total_score, seq,
                         atig_path_edges, delta_len, idt, cov))

                # Only the base path exists: nothing to report, and a_id is
                # deliberately not advanced.
                if len(atig_output) == 1:
                    continue

                sub_id = 0
                for data in atig_output:
                    v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, delta_len, a_idt, cov = data
                    # sub_id 0 is the base path (goes to the *_base outputs);
                    # the alternatives go to the a_ctg outputs.
                    if sub_id != 0:
                        tiling_out = a_ctg_t_out
                        fasta_out = a_ctg_out
                    else:
                        tiling_out = a_ctg_base_t_out
                        fasta_out = a_ctg_base_out

                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        tiling_out.write(
                            "%s-%03d-%02d %s %s %s %d %d %d %0.2f\n" % (
                                ctg_id, a_id, sub_id, vv, ww, rid, s, t,
                                aln_score, idt))

                    fasta_out.write(
                        ">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f\n" % (
                            ctg_id, a_id, sub_id, v0, w0, total_length,
                            total_score, len(atig_path_edges), delta_len,
                            a_idt, cov))
                    fasta_out.write(seq + "\n")

                    sub_id += 1

                a_id += 1