def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Rank candidate contigs for every b-read and write them to disk.

    ``run_tr_stage1`` is dispatched once per entry of ``file_list`` through
    ``exe_pool.imap``.  The per-file hits are merged, keeping at most
    ``bestn`` hits per b-read, and each b-read is then scored against the
    contigs of the reads it hit (b-read -> a-read -> contig).

    The result goes to ``<base_dir>/2-asm-falcon/read_maps/rawread_to_contigs``
    with one line per (b-read, contig) pair holding the columns
    ``bread ctg count rank score in_ctg``.

    Args:
        exe_pool: object whose ``imap(func, iterable)`` yields
            ``(file_name, hits)`` pairs.
        base_dir: directory that contains ``2-asm-falcon``.
        file_list: inputs forwarded one by one to ``run_tr_stage1``
            (presumably LAS file names -- confirm against the caller).
        min_len: forwarded unchanged to ``run_tr_stage1``.
        bestn: maximum number of hits retained per b-read; also forwarded
            to ``run_tr_stage1``.
        db_fn: forwarded unchanged to ``run_tr_stage1``.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()

    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    read_map_fn = os.path.join(
        asm_dir, 'read_maps', 'get_ctg_read_map', 'read_to_contig_map')
    rid_to_ctg = get_rid_to_ctg(read_map_fn)

    inputs = [
        (run_tr_stage1, db_fn, las_fn, min_len, bestn, rid_to_ctg)
        for las_fn in file_list
    ]

    # Aggregate the hits of every individual input and keep the best n per
    # b-read.  This does not guarantee a globally best-n result, especially
    # when `bestn` is small: if one input holds more good hits than it was
    # allowed to report, some of them are missed.
    bread_to_areads = {}
    for _fn, res in exe_pool.imap(io.run_func, inputs):
        for k in res:
            heap = bread_to_areads.setdefault(k, [])
            for item in res[k]:
                if len(heap) >= bestn:
                    # Heap is full: push the new item, drop the smallest.
                    heappushpop(heap, item)
                else:
                    heappush(heap, item)

    # For each b-read, find the best contig through the b -> a -> contig map.
    out_fn = os.path.join(asm_dir, 'read_maps/rawread_to_contigs')
    with open(out_fn, 'w') as out_f:
        for bread, best_hits in bread_to_areads.items():
            # contig -> [accumulated negated score, number of hits]
            ctg_score = {}
            for s, rid in best_hits:
                if rid in rid_to_ctg:
                    for ctg in rid_to_ctg[rid]:
                        tally = ctg_score.setdefault(ctg, [0, 0])
                        tally[0] += -s
                        tally[1] += 1

            # Ascending by accumulated (negated) score: rank 0 is the contig
            # with the largest total hit score.
            ranked = sorted(ctg_score.items(), key=lambda kv: kv[1][0])
            for rank, (ctg, (score, count)) in enumerate(ranked):
                # Flag whether the b-read itself is mapped to this contig.
                in_ctg = int(bread in rid_to_ctg and ctg in rid_to_ctg[bread])
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Write the raw-read to contig ranking table.

    Steps:
      1. Load the read-to-contig map from
         ``<base_dir>/2-asm-falcon/read_maps/get_ctg_read_map/read_to_contig_map``.
      2. Run ``run_tr_stage1`` for each entry of ``file_list`` via
         ``exe_pool.imap`` and merge the returned hits, retaining no more
         than ``bestn`` per b-read.
      3. For each b-read, accumulate a score and a hit count per contig and
         write ``bread ctg count rank score in_ctg`` lines to
         ``<base_dir>/2-asm-falcon/read_maps/rawread_to_contigs``.

    Args:
        exe_pool: provides ``imap(func, iterable)``; each result is unpacked
            as ``(file_name, hits)``.
        base_dir: parent directory of ``2-asm-falcon``.
        file_list: iterable of inputs handed to ``run_tr_stage1``.
        min_len: passed through to ``run_tr_stage1``.
        bestn: cap on the hits kept per b-read (also passed through).
        db_fn: passed through to ``run_tr_stage1``.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()

    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    rid_to_ctg = get_rid_to_ctg(os.path.join(
        asm_dir, 'read_maps', 'get_ctg_read_map', 'read_to_contig_map'))

    inputs = [(run_tr_stage1, db_fn, path, min_len, bestn, rid_to_ctg)
              for path in file_list]

    # Merge the hits from every input, keeping the best n for each b-read.
    # Note: the merged result is not guaranteed to be the global best n,
    # especially for a small `bestn` -- when a single input contains more
    # good hits than it reported, those extra hits are lost.
    bread_to_areads = {}
    for _path, res in exe_pool.imap(io.run_func, inputs):
        for k in res:
            kept = bread_to_areads.setdefault(k, [])
            for item in res[k]:
                # Plain push while there is room; otherwise push and evict
                # the smallest entry so the heap never exceeds `bestn`.
                push = heappush if len(kept) < bestn else heappushpop
                push(kept, item)

    # For each b-read, pick the best contig via the b -> a -> contig map.
    with open(os.path.join(asm_dir, 'read_maps/rawread_to_contigs'), 'w') as out_f:
        for bread in bread_to_areads:
            # contig -> [sum of negated hit scores, hit count]
            ctg_score = {}
            for s, rid in bread_to_areads[bread]:
                if rid not in rid_to_ctg:
                    continue
                for ctg in rid_to_ctg[rid]:
                    acc = ctg_score.setdefault(ctg, [0, 0])
                    acc[0] += -s
                    acc[1] += 1

            # Lowest accumulated value first, i.e. largest total score first.
            ordered = sorted(ctg_score.items(), key=lambda pair: pair[1][0])
            for rank, (ctg, score_count) in enumerate(ordered):
                score, count = score_count
                # 1 when the b-read itself belongs to this contig, else 0.
                in_ctg = 1 if (bread in rid_to_ctg and ctg in rid_to_ctg[bread]) else 0
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Rank candidate contigs for every b-read (pread variant).

    ``run_tr_stage1`` is run for each entry of ``file_list`` through
    ``exe_pool.imap``; the hits are merged with at most ``bestn`` kept per
    b-read, scored per contig, and written to
    ``<base_dir>/2-asm-falcon/read_maps/pread_to_contigs`` as
    ``bread ctg count rank score in_ctg`` lines.

    Args:
        exe_pool: object whose ``imap(func, iterable)`` yields
            ``(file_name, hits)`` pairs.
        base_dir: directory that contains ``2-asm-falcon``.
        file_list: inputs forwarded one by one to ``run_tr_stage1``.
        min_len: forwarded unchanged to ``run_tr_stage1``.
        bestn: maximum number of hits retained per b-read; also forwarded.
        db_fn: forwarded unchanged to ``run_tr_stage1``.

    Raises:
        AssertionError: if the read-to-contig map is empty, or if no hits
            were collected from any input.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()

    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    read_map_fn = os.path.join(
        asm_dir, 'read_maps', 'get_ctg_read_map', 'read_to_contig_map')
    pid_to_ctg = get_pid_to_ctg(read_map_fn)
    io.LOG('len(pid_to_ctg) == {}'.format(len(pid_to_ctg)))
    assert pid_to_ctg, 'Empty pid_to_ctg. Maybe empty {!r}?'.format(file_list)

    inputs = [
        (run_tr_stage1, db_fn, las_fn, min_len, bestn, pid_to_ctg)
        for las_fn in file_list
    ]

    # Aggregate the hits of every individual input and keep the best n per
    # b-read.  This does not guarantee a globally best-n result, especially
    # when `bestn` is small: if one input holds more good hits than it was
    # allowed to report, some of them are missed.
    bread_to_areads = {}
    for _fn, res in exe_pool.imap(io.run_func, inputs):
        for k in res:
            heap = bread_to_areads.setdefault(k, [])
            for item in res[k]:
                if len(heap) >= bestn:
                    # Heap is full: push the new item, drop the smallest.
                    heappushpop(heap, item)
                else:
                    heappush(heap, item)
    assert bread_to_areads, 'No bread_to_areads found. Is there any point in continuing?'

    out_fn = os.path.join(asm_dir, "read_maps/pread_to_contigs")
    with open(out_fn, "w") as out_f:
        for bread, best_hits in bread_to_areads.items():
            # contig -> [accumulated negated score, number of hits]
            ctg_score = {}
            for s, pid in best_hits:
                if pid in pid_to_ctg:
                    for ctg in pid_to_ctg[pid]:
                        tally = ctg_score.setdefault(ctg, [0, 0])
                        tally[0] += -s
                        tally[1] += 1

            # Ascending by accumulated (negated) score: rank 0 is the contig
            # with the largest total hit score.
            ranked = sorted(ctg_score.items(), key=lambda kv: kv[1][0])
            for rank, (ctg, (score, count)) in enumerate(ranked):
                # Flag whether the b-read itself is mapped to this contig.
                in_ctg = int(bread in pid_to_ctg and ctg in pid_to_ctg[bread])
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Write the pread to contig ranking table.

    Steps:
      1. Load the read-to-contig map from
         ``<base_dir>/2-asm-falcon/read_maps/get_ctg_read_map/read_to_contig_map``
         and log its size.
      2. Run ``run_tr_stage1`` for each entry of ``file_list`` via
         ``exe_pool.imap`` and merge the returned hits, retaining no more
         than ``bestn`` per b-read.
      3. For each b-read, accumulate a score and a hit count per contig and
         write ``bread ctg count rank score in_ctg`` lines to
         ``<base_dir>/2-asm-falcon/read_maps/pread_to_contigs``.

    Args:
        exe_pool: provides ``imap(func, iterable)``; each result is unpacked
            as ``(file_name, hits)``.
        base_dir: parent directory of ``2-asm-falcon``.
        file_list: iterable of inputs handed to ``run_tr_stage1``.
        min_len: passed through to ``run_tr_stage1``.
        bestn: cap on the hits kept per b-read (also passed through).
        db_fn: passed through to ``run_tr_stage1``.

    Raises:
        AssertionError: when the loaded map is empty or when the merge
            produced no b-read at all.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()

    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    pid_to_ctg = get_pid_to_ctg(os.path.join(
        asm_dir, 'read_maps', 'get_ctg_read_map', 'read_to_contig_map'))
    io.LOG('len(pid_to_ctg) == {}'.format(len(pid_to_ctg)))
    assert pid_to_ctg, 'Empty pid_to_ctg. Maybe empty {!r}?'.format(file_list)

    inputs = [(run_tr_stage1, db_fn, path, min_len, bestn, pid_to_ctg)
              for path in file_list]

    # Merge the hits from every input, keeping the best n for each b-read.
    # Note: the merged result is not guaranteed to be the global best n,
    # especially for a small `bestn` -- when a single input contains more
    # good hits than it reported, those extra hits are lost.
    bread_to_areads = {}
    for _path, res in exe_pool.imap(io.run_func, inputs):
        for k in res:
            kept = bread_to_areads.setdefault(k, [])
            for item in res[k]:
                # Plain push while there is room; otherwise push and evict
                # the smallest entry so the heap never exceeds `bestn`.
                push = heappush if len(kept) < bestn else heappushpop
                push(kept, item)
    assert bread_to_areads, 'No bread_to_areads found. Is there any point in continuing?'

    with open(os.path.join(asm_dir, "read_maps/pread_to_contigs"), "w") as out_f:
        for bread in bread_to_areads:
            # contig -> [sum of negated hit scores, hit count]
            ctg_score = {}
            for s, pid in bread_to_areads[bread]:
                if pid not in pid_to_ctg:
                    continue
                for ctg in pid_to_ctg[pid]:
                    acc = ctg_score.setdefault(ctg, [0, 0])
                    acc[0] += -s
                    acc[1] += 1

            # Lowest accumulated value first, i.e. largest total score first.
            ordered = sorted(ctg_score.items(), key=lambda pair: pair[1][0])
            for rank, (ctg, score_count) in enumerate(ordered):
                score, count = score_count
                # 1 when the b-read itself belongs to this contig, else 0.
                in_ctg = 1 if (bread in pid_to_ctg and ctg in pid_to_ctg[bread]) else 0
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
def run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov, min_cov,
                    min_len, min_idt, ignore_indels, bestn, db_fn):
    """Run the two-stage overlap filter and write the kept overlaps to ``outs``.

    Stage 1 (``run_filter_stage1``) is run per input file and reports the
    reads to ignore and the contained reads; the results of all files are
    unioned.  Stage 2 (``run_filter_stage2``) is run per input file with
    those two sets and returns rows of string fields, each written to
    ``outs`` as one space-separated line.

    Args:
        outs: writable text stream (only ``write`` is used).
        exe_pool: provides ``imap(func, iterable)``; element ``[1]`` of each
            result is the stage payload.
        file_list: input file names; empty names are skipped.
        max_diff, max_cov, min_cov, min_len, min_idt: thresholds forwarded
            unchanged to both stages.
        ignore_indels: when true, ``"I"`` is appended to the ``"mo"`` flag
            string handed to the stages.
        bestn: forwarded to stage 2.
        db_fn: forwarded unchanged to both stages.
    """
    la4falcon_flags = "mo"
    if ignore_indels:
        la4falcon_flags += "I"

    io.LOG('preparing filter_stage1')
    io.logstats()
    stage1_inputs = [
        (run_filter_stage1, db_fn, fn, la4falcon_flags,
         max_diff, max_cov, min_cov, min_len, min_idt)
        for fn in file_list
        if len(fn) != 0
    ]

    ignore_all = set()
    contained = set()
    for res in exe_pool.imap(io.run_func, stage1_inputs):
        payload = res[1]
        ignore_all.update(payload["ignore"])
        contained.update(payload["contained"])
    # Do not count ignored reads as contained.
    contained = contained - ignore_all

    io.LOG('preparing filter_stage2')
    io.logstats()
    stage2_inputs = [
        (run_filter_stage2, db_fn, fn, la4falcon_flags,
         max_diff, max_cov, min_cov, min_len, min_idt,
         ignore_all, contained, bestn)
        for fn in file_list
        if len(fn) != 0
    ]

    for res in exe_pool.imap(io.run_func, stage2_inputs):
        for fields in res[1]:
            outs.write(" ".join(fields) + "\n")
    io.logstats()
def run_ovlp_filter(exe_pool, ovl, file_list, max_diff, max_cov, min_cov,
                    min_len, bestn, db_fn):
    """Run the three-stage overlap filter and print the kept overlaps.

    Stage 1 (``run_filter_stage1``) collects the reads to ignore, stage 2
    (``run_filter_stage2``) collects the contained reads given the ignore
    set, and stage 3 (``run_filter_stage3``) returns rows of string fields
    that are printed to standard output as space-separated lines.  Every
    stage is run once per non-empty entry of ``file_list`` through
    ``exe_pool.imap``.

    Args:
        exe_pool: provides ``imap(func, iterable)``; element ``[1]`` of each
            result is the stage payload.
        ovl: forwarded unchanged to every stage.
        file_list: input file names; empty names are skipped.
        max_diff, max_cov, min_cov, min_len: thresholds forwarded unchanged
            to every stage.
        bestn: forwarded to stage 3.
        db_fn: forwarded unchanged to every stage.
    """
    io.LOG('preparing filter_stage1')
    io.logstats()
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((run_filter_stage1, ovl, db_fn, fn,
                           max_diff, max_cov, min_cov, min_len))

    # Collect straight into a set; later stages only need membership, so
    # the intermediate list and repeated set() conversions are unnecessary.
    ignore_all = set()
    for res in exe_pool.imap(io.run_func, inputs):
        ignore_all.update(res[1])

    io.LOG('preparing filter_stage2')
    io.logstats()
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((run_filter_stage2, ovl, db_fn, fn,
                           max_diff, max_cov, min_cov, min_len, ignore_all))

    contained = set()
    for res in exe_pool.imap(io.run_func, inputs):
        contained.update(res[1])

    io.LOG('preparing filter_stage3')
    io.logstats()
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((run_filter_stage3, ovl, db_fn, fn,
                           max_diff, max_cov, min_cov, min_len,
                           ignore_all, contained, bestn))

    for res in exe_pool.imap(io.run_func, inputs):
        for l in res[1]:
            # Single-argument call form: valid on Python 3 and prints the
            # same text on Python 2 (with or without print_function).
            print(" ".join(l))
    io.logstats()
def run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov, min_cov,
                    min_len, bestn, db_fn):
    """Run the three-stage overlap filter and write the kept overlaps to ``outs``.

    Stage 1 (``run_filter_stage1``) collects the reads to ignore, stage 2
    (``run_filter_stage2``) collects the contained reads given the ignore
    set, and stage 3 (``run_filter_stage3``) returns rows of string fields,
    each written to ``outs`` as one space-separated line.  Every stage is
    run once per non-empty entry of ``file_list`` through ``exe_pool.imap``.

    Args:
        outs: writable text stream (only ``write`` is used).
        exe_pool: provides ``imap(func, iterable)``; element ``[1]`` of each
            result is the stage payload.
        file_list: input file names; empty names are skipped.
        max_diff, max_cov, min_cov, min_len: thresholds forwarded unchanged
            to every stage.
        bestn: forwarded to stage 3.
        db_fn: forwarded unchanged to every stage.
    """
    io.LOG('preparing filter_stage1')
    io.logstats()
    stage1_inputs = [
        (run_filter_stage1, db_fn, fn, max_diff, max_cov, min_cov, min_len)
        for fn in file_list
        if len(fn) != 0
    ]
    ignored = []
    for res in exe_pool.imap(io.run_func, stage1_inputs):
        ignored.extend(res[1])

    io.LOG('preparing filter_stage2')
    io.logstats()
    # De-duplicate once; stages 2 and 3 receive the same set object.
    ignore_all = set(ignored)
    stage2_inputs = [
        (run_filter_stage2, db_fn, fn,
         max_diff, max_cov, min_cov, min_len, ignore_all)
        for fn in file_list
        if len(fn) != 0
    ]
    contained = set()
    for res in exe_pool.imap(io.run_func, stage2_inputs):
        contained.update(res[1])

    io.LOG('preparing filter_stage3')
    io.logstats()
    stage3_inputs = [
        (run_filter_stage3, db_fn, fn,
         max_diff, max_cov, min_cov, min_len, ignore_all, contained, bestn)
        for fn in file_list
        if len(fn) != 0
    ]
    for res in exe_pool.imap(io.run_func, stage3_inputs):
        for fields in res[1]:
            outs.write(" ".join(fields) + "\n")
    io.logstats()
def run_track_reads(exe_pool, phased_read_file_fn, read_to_contig_map_fn,
                    rawread_ids_fn, file_list, min_len, bestn, db_fn,
                    rawread_to_contigs_fn):
    """Rank candidate contigs for every b-read, using phasing information.

    The phased-read file and the raw-read id file are combined into a
    per-read-index phase table that is handed to ``run_tr_stage1`` together
    with the read-to-contig map.  ``run_tr_stage1`` is run for each entry of
    ``file_list`` through ``exe_pool.imap``; the hits are merged with at most
    ``bestn`` kept per b-read, scored per contig, and written to
    ``rawread_to_contigs_fn`` as ``bread ctg count rank score in_ctg`` lines.

    Args:
        exe_pool: object whose ``imap(func, iterable)`` yields
            ``(file_name, hits)`` pairs.
        phased_read_file_fn: whitespace-separated text file; columns 1-3 are
            read as contig id, block and phase, column 6 as the read's
            original id (0-based column indices).
        read_to_contig_map_fn: path passed to ``get_rid_to_ctg``.
        rawread_ids_fn: text file with one original read id per line; the
            line index is used as the read id.
        file_list: inputs forwarded one by one to ``run_tr_stage1``.
        min_len: forwarded unchanged to ``run_tr_stage1``.
        bestn: maximum number of hits retained per b-read; also forwarded.
        db_fn: forwarded unchanged to ``run_tr_stage1``.
        rawread_to_contigs_fn: path of the output file.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    rid_to_ctg = get_rid_to_ctg(read_to_contig_map_fn)

    # original read id -> (contig id, block, phase)
    oid_to_phase = {}
    with open(phased_read_file_fn) as f:
        for row in f:
            row = row.strip().split()
            ctg_id, block, phase = row[1:4]
            oid = row[6]
            oid_to_phase[oid] = (ctg_id, int(block), int(phase))

    # Read the id list inside a context manager so the handle is closed
    # promptly instead of being left to the garbage collector.
    with open(rawread_ids_fn) as f:
        rid_to_oid = f.read().split('\n')
    # Indexed by read id; None for reads without phasing information.
    rid_to_phase = [oid_to_phase.get(oid) for oid in rid_to_oid]

    inputs = [
        (run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase)
        for fn in file_list
    ]

    # Aggregate the hits of every individual input and keep the best n per
    # b-read.  This does not guarantee a globally best-n result, especially
    # when `bestn` is small: if one input holds more good hits than it was
    # allowed to report, some of them are missed.
    bread_to_areads = {}
    for _fn, res in exe_pool.imap(io.run_func, inputs):
        for k in res:
            heap = bread_to_areads.setdefault(k, [])
            for item in res[k]:
                if len(heap) < bestn:
                    heappush(heap, item)
                else:
                    # Heap is full: push the new item, drop the smallest.
                    heappushpop(heap, item)

    # For each b-read, find the best contig through the b -> a -> contig map.
    with open(rawread_to_contigs_fn, 'w') as out_f:
        for bread, best_hits in bread_to_areads.items():
            # contig -> [accumulated negated score, number of hits]
            ctg_score = {}
            for s, rid in best_hits:
                if rid not in rid_to_ctg:
                    continue
                for ctg in rid_to_ctg[rid]:
                    tally = ctg_score.setdefault(ctg, [0, 0])
                    tally[0] += -s
                    tally[1] += 1

            # sorted() instead of items().sort(): dict views have no sort()
            # on Python 3.  Ascending order puts the contig with the largest
            # total hit score at rank 0.
            ranked = sorted(ctg_score.items(), key=lambda kv: kv[1][0])
            for rank, (ctg, (score, count)) in enumerate(ranked):
                # Flag whether the b-read itself is mapped to this contig.
                in_ctg = int(bread in rid_to_ctg and ctg in rid_to_ctg[bread])
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)