def run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov, min_cov,
                    min_len, min_idt, ignore_indels, bestn, db_fn):
    """Run the two-stage overlap filter and write the kept overlaps to `outs`.

    Stage 1 is dispatched once per non-empty entry of `file_list` and the
    "ignore" and "contained" collections found in each result are merged.
    Stage 2 is dispatched again per file with those merged sets; every record
    it yields is written to `outs` as one space-joined line.

    Args:
        outs: Writable text stream receiving the filtered overlaps.
        exe_pool: Pool-like object providing `imap`.
        file_list: Iterable of input file names; empty names are skipped.
        max_diff, max_cov, min_cov, min_len, min_idt: Thresholds forwarded
            unchanged to both stages.
        ignore_indels: When true, "I" is appended to the LA4Falcon flags.
        bestn: Forwarded to stage 2 only.
        db_fn: Database file name forwarded to both stages.
    """
    la4falcon_flags = "mo" + ("I" if ignore_indels else "")

    io.LOG('preparing filter_stage1')
    io.logstats()
    stage1_jobs = [
        (run_filter_stage1, db_fn, fn, la4falcon_flags,
         max_diff, max_cov, min_cov, min_len, min_idt)
        for fn in file_list
        if len(fn) > 0
    ]
    ignore_all = set()
    contained = set()
    for result in exe_pool.imap(io.run_func, stage1_jobs):
        stage1 = result[1]
        ignore_all.update(stage1["ignore"])
        contained.update(stage1["contained"])
    # Do not count ignored reads as contained.
    contained = contained.difference(ignore_all)

    io.LOG('preparing filter_stage2')
    io.logstats()
    stage2_jobs = [
        (run_filter_stage2, db_fn, fn, la4falcon_flags,
         max_diff, max_cov, min_cov, min_len, min_idt,
         ignore_all, contained, bestn)
        for fn in file_list
        if len(fn) > 0
    ]
    for result in exe_pool.imap(io.run_func, stage2_jobs):
        for fields in result[1]:
            outs.write(" ".join(fields) + "\n")
    io.logstats()
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Map each b-read to contigs via its best a-read hits and write the table.

    The read-to-contig map is loaded from
    `<base_dir>/2-asm-falcon/read_maps/get_ctg_read_map/read_to_contig_map`.
    One `run_tr_stage1` job is dispatched per file in `file_list`, and the
    output goes to `<base_dir>/2-asm-falcon/read_maps/pread_to_contigs`.
    Each output line holds: bread, ctg, count, rank, score, in_ctg.

    Raises:
        AssertionError: If the read-to-contig map is empty, or if no hits were
            collected from the workers.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    map_fn = os.path.join(
        asm_dir, 'read_maps', 'get_ctg_read_map', 'read_to_contig_map')
    pid_to_ctg = get_pid_to_ctg(map_fn)
    io.LOG('len(pid_to_ctg) == {}'.format(len(pid_to_ctg)))
    assert pid_to_ctg, 'Empty pid_to_ctg. Maybe empty {!r}?'.format(file_list)

    jobs = [
        (run_tr_stage1, db_fn, fn, min_len, bestn, pid_to_ctg)
        for fn in file_list
    ]

    # Aggregate hits from each individual LAS file and keep the best n hits.
    # This does not guarantee that the final result is globally the best n
    # hits, especially when `bestn` is small: if a single LAS file holds more
    # good hits than that, some of them are missed.
    bread_to_areads = {}
    for _fn, hits_by_bread in exe_pool.imap(io.run_func, jobs):
        for bread_key in hits_by_bread:
            heap = bread_to_areads.setdefault(bread_key, [])
            for hit in hits_by_bread[bread_key]:
                if len(heap) < bestn:
                    heappush(heap, hit)
                else:
                    # Heap is full: push the new hit and drop the smallest.
                    heappushpop(heap, hit)
    assert bread_to_areads, 'No bread_to_areads found. Is there any point in continuing?'

    out_path = os.path.join(asm_dir, "read_maps/pread_to_contigs")
    with open(out_path, "w") as out_f:
        for bread in bread_to_areads:
            # ctg -> [summed negated hit score, number of supporting hits]
            totals = {}
            for s, pid in bread_to_areads[bread]:
                if pid not in pid_to_ctg:
                    continue
                for ctg in pid_to_ctg[pid]:
                    entry = totals.setdefault(ctg, [0, 0])
                    entry[0] -= s
                    entry[1] += 1
            ranked = sorted(totals.items(), key=lambda kv: kv[1][0])
            for rank, (ctg, (score, count)) in enumerate(ranked):
                own_ctg = bread in pid_to_ctg and ctg in pid_to_ctg[bread]
                in_ctg = 1 if own_ctg else 0
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
def try_run_ovlp_stats(n_core, fofn, min_len):
    """Run `run_ovlp_stats` over the files listed in `fofn` using a worker pool.

    The pool size is the smaller of `n_core` and the number of validated
    files. A KeyboardInterrupt is logged, the workers are terminated and the
    function then returns normally; any other exception propagates.
    """
    io.LOG('starting ovlp_stats')
    las_fns = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, las_fns))
    n_workers = min(n_core, len(las_fns))
    pool = Pool(n_workers)
    try:
        run_ovlp_stats(pool, las_fns, min_len)
        io.LOG('finished ovlp_stats')
    except KeyboardInterrupt:
        # Only Ctrl-C is handled here, and it is not re-raised.
        io.LOG('terminating ovlp_stats workers...')
        pool.terminate()
def try_run_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
    """Run `run_ovlp_filter` over the files listed in `fofn` using a worker pool.

    The pool size is the smaller of `n_core` and the number of validated
    files. On any failure, including KeyboardInterrupt, the workers are
    terminated and the original exception is re-raised.
    """
    io.LOG('starting ovlp_filter')
    las_fns = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' %(fofn, las_fns))
    n_workers = min(n_core, len(las_fns))
    pool = Pool(n_workers)
    try:
        run_ovlp_filter(
            pool, las_fns, max_diff, max_cov, min_cov, min_len, bestn, db_fn)
        io.LOG('finished ovlp_filter')
    except:
        # Deliberately bare: workers must also be stopped on Ctrl-C or
        # SystemExit. The exception is always re-raised.
        io.LOG('terminating ovlp_filter workers...')
        pool.terminate()
        raise
def try_run_ovlp_stats(n_core, db_fn, fofn, min_len):
    """Run `run_ovlp_stats` against `db_fn` for the files listed in `fofn`.

    The pool size is the smaller of `n_core` and the number of validated
    files. A KeyboardInterrupt is logged, the workers are terminated and the
    function then returns normally; any other exception propagates.
    """
    io.LOG('starting ovlp_stats')
    las_fns = io.validated_fns(fofn)
    io.LOG('fofn {!r}: {}'.format(fofn, las_fns))
    # The requested core count is logged before it is capped below.
    io.LOG('db {!r}; n_core={}'.format(db_fn, n_core))
    n_workers = min(n_core, len(las_fns))
    pool = Pool(n_workers)
    try:
        run_ovlp_stats(pool, db_fn, las_fns, min_len)
        io.LOG('finished ovlp_stats')
    except KeyboardInterrupt:
        # Only Ctrl-C is handled here, and it is not re-raised.
        io.LOG('terminating ovlp_stats workers...')
        pool.terminate()
def try_run_track_reads(n_core, base_dir, min_len, bestn):
    """Run `run_track_reads` over the pread LAS files found under `base_dir`.

    LAS files are globbed from `<base_dir>/1-preads_ovl/m*/preads.*.las` and
    the database is taken to be `<base_dir>/1-preads_ovl/preads.db`. On any
    failure, including KeyboardInterrupt, the workers are terminated and the
    original exception is re-raised.
    """
    io.LOG('starting track_reads')
    pread_dir = os.path.abspath(os.path.join(base_dir, "1-preads_ovl"))
    las_pattern = os.path.join(pread_dir, "m*/preads.*.las")
    las_fns = glob.glob(las_pattern)
    io.LOG('file list: %r' % las_fns)
    db_fn = os.path.join(pread_dir, "preads.db")
    pool = Pool(min(n_core, len(las_fns)))
    try:
        run_track_reads(pool, base_dir, las_fns, min_len, bestn, db_fn)
        io.LOG('finished track_reads')
    except:
        # Deliberately bare: workers must also be stopped on Ctrl-C or
        # SystemExit. The exception is always re-raised.
        io.LOG('terminating track_reads workers...')
        pool.terminate()
        raise
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Map each b-read to contigs via its best a-read hits and write the table.

    The read-to-contig map is loaded from
    `<base_dir>/2-asm-falcon/read_maps/get_ctg_read_map/read_to_contig_map`.
    One `run_tr_stage1` job is dispatched per file in `file_list`, and the
    output goes to `<base_dir>/2-asm-falcon/read_maps/rawread_to_contigs`.
    Each output line holds: bread, ctg, count, rank, score, in_ctg.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    map_fn = os.path.join(
        asm_dir, 'read_maps', 'get_ctg_read_map', 'read_to_contig_map')
    rid_to_ctg = get_rid_to_ctg(map_fn)

    jobs = [
        (run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg)
        for fn in file_list
    ]

    # Aggregate hits from each individual LAS file and keep the best n hits.
    # This does not guarantee that the final result is globally the best n
    # hits, especially when `bestn` is small: if a single LAS file holds more
    # good hits than that, some of them are missed.
    bread_to_areads = {}
    for _fn, hits_by_bread in exe_pool.imap(io.run_func, jobs):
        for bread_key in hits_by_bread:
            heap = bread_to_areads.setdefault(bread_key, [])
            for hit in hits_by_bread[bread_key]:
                if len(heap) < bestn:
                    heappush(heap, hit)
                else:
                    # Heap is full: push the new hit and drop the smallest.
                    heappushpop(heap, hit)

    # For each b-read, find the best contig through the b -> a -> contig map.
    # (The original-read-id column is currently not emitted.)
    out_path = os.path.join(asm_dir, 'read_maps/rawread_to_contigs')
    with open(out_path, 'w') as out_f:
        for bread in bread_to_areads:
            # ctg -> [summed negated hit score, number of supporting hits]
            totals = {}
            for s, rid in bread_to_areads[bread]:
                if rid not in rid_to_ctg:
                    continue
                for ctg in rid_to_ctg[rid]:
                    entry = totals.setdefault(ctg, [0, 0])
                    entry[0] -= s
                    entry[1] += 1
            ranked = sorted(totals.items(), key=lambda kv: kv[1][0])
            for rank, (ctg, (score, count)) in enumerate(ranked):
                own_ctg = bread in rid_to_ctg and ctg in rid_to_ctg[bread]
                in_ctg = 1 if own_ctg else 0
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
def try_run_ovlp_filter(out_fn, n_core, fofn, max_diff, max_cov, min_cov,
                        min_len, min_idt, ignore_indels, bestn, db_fn):
    """Run `run_ovlp_filter` with a worker pool and publish the result at `out_fn`.

    Output is first written to `out_fn + '.tmp'` and renamed to `out_fn` only
    after the filter has finished. On any failure, including
    KeyboardInterrupt, the workers are terminated and the original exception
    is re-raised.
    """
    io.LOG('starting ovlp_filter')
    las_fns = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, las_fns))
    n_workers = min(n_core, len(las_fns))
    pool = Pool(n_workers)
    tmp_out_fn = out_fn + '.tmp'
    try:
        with open(tmp_out_fn, 'w') as outs:
            run_ovlp_filter(
                outs, pool, las_fns, max_diff, max_cov, min_cov,
                min_len, min_idt, ignore_indels, bestn, db_fn)
        # Rename only after the temp file has been closed.
        os.rename(tmp_out_fn, out_fn)
        io.LOG('finished ovlp_filter')
    except:
        # Deliberately bare: workers must also be stopped on Ctrl-C or
        # SystemExit. The exception is always re-raised.
        io.LOG('terminating ovlp_filter workers...')
        pool.terminate()
        raise
def try_run_track_reads(n_core, phased_read_file, read_to_contig_map,
                        rawread_ids, min_len, bestn, output):
    """Run the phased `run_track_reads` over raw-read LAS files in `0-rawreads`.

    LAS files are globbed from `0-rawreads/m*/raw_reads.*.las`, resolved
    against the current working directory, and the database is taken to be
    `0-rawreads/raw_reads.db`. On any failure, including KeyboardInterrupt,
    the workers are terminated and the original exception is re-raised.
    """
    io.LOG('starting track_reads')
    rawread_dir = os.path.abspath('0-rawreads')
    # TODO: better logic for finding the las files, or move that logic to the
    # caller (taking a --fofn option?). More inputs.
    las_pattern = os.path.join(rawread_dir, 'm*/raw_reads.*.las')
    las_fns = glob.glob(las_pattern)
    io.LOG('file list: %r' % las_fns)
    # TODO: Another input
    db_fn = os.path.join(rawread_dir, 'raw_reads.db')
    pool = Pool(min(n_core, len(las_fns)))
    try:
        run_track_reads(
            pool, phased_read_file, read_to_contig_map, rawread_ids,
            las_fns, min_len, bestn, db_fn, output)
        io.LOG('finished track_reads')
    except:
        # Deliberately bare: workers must also be stopped on Ctrl-C or
        # SystemExit. The exception is always re-raised.
        io.LOG('terminating track_reads workers...')
        pool.terminate()
        raise
def run_filter_stats(db_fn, fn, min_len):
    """Run `LA4Falcon -mo` on one file and return `(fn, filter_stats(...))`.

    The reader's `readlines` method itself (not its result) is handed to
    `filter_stats`. Any `Exception` is logged with its full traceback through
    `io.LOG` and then re-raised.
    """
    try:
        reader = Reader("LA4Falcon -mo {} {}".format(db_fn, fn))
        with reader:
            stats = filter_stats(reader.readlines, min_len)
            return fn, stats
    except Exception:
        io.LOG(traceback.format_exc())
        raise
def try_run_track_reads(n_core, base_dir, min_len, bestn):
    """Run `run_track_reads` over the raw-read LAS files found under `base_dir`.

    LAS files are globbed from `<base_dir>/0-rawreads/m*/raw_reads.*.las` and
    the database is taken to be `<base_dir>/0-rawreads/raw_reads.db`. On any
    failure, including KeyboardInterrupt, the workers are terminated and the
    original exception is re-raised.
    """
    io.LOG('starting track_reads')
    rawread_dir = os.path.abspath(os.path.join(base_dir, "0-rawreads"))
    # TODO: better logic for finding the las files, or move that logic to the
    # caller (taking a --fofn option?).
    las_pattern = os.path.join(rawread_dir, "m*/raw_reads.*.las")
    las_fns = glob.glob(las_pattern)
    io.LOG('file list: %r' % las_fns)
    # TODO: likewise, should the database path be a parameter?
    db_fn = os.path.join(rawread_dir, "raw_reads.db")
    pool = Pool(min(n_core, len(las_fns)))
    try:
        run_track_reads(pool, base_dir, las_fns, min_len, bestn, db_fn)
        io.LOG('finished track_reads')
    except:
        # Deliberately bare: workers must also be stopped on Ctrl-C or
        # SystemExit. The exception is always re-raised.
        io.LOG('terminating track_reads workers...')
        pool.terminate()
        raise
def run_ovlp_filter(exe_pool, ovl, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
    """Run the three-stage overlap filter and print the kept overlaps to stdout.

    Stage 1 collects the reads to ignore from every input file. Stage 2 uses
    that set to collect the contained reads. Stage 3 uses both sets plus
    `bestn` and yields the overlap records, each printed as one space-joined
    line.

    Args:
        exe_pool: Pool-like object providing `imap`.
        ovl: Forwarded unchanged as the first argument of every stage.
        file_list: Iterable of input file names; empty names are skipped.
        max_diff, max_cov, min_cov, min_len: Thresholds forwarded unchanged
            to every stage.
        bestn: Forwarded to stage 3 only.
        db_fn: Database file name forwarded to every stage.
    """
    io.LOG('preparing filter_stage1')
    io.logstats()
    stage1_jobs = [
        (run_filter_stage1, ovl, db_fn, fn, max_diff, max_cov, min_cov, min_len)
        for fn in file_list
        if len(fn) != 0
    ]
    # Accumulate straight into a set; the later stages only ever see a set.
    ignore_all = set()
    for res in exe_pool.imap(io.run_func, stage1_jobs):
        ignore_all.update(res[1])

    io.LOG('preparing filter_stage2')
    io.logstats()
    stage2_jobs = [
        (run_filter_stage2, ovl, db_fn, fn, max_diff, max_cov, min_cov, min_len,
         ignore_all)
        for fn in file_list
        if len(fn) != 0
    ]
    contained = set()
    for res in exe_pool.imap(io.run_func, stage2_jobs):
        contained.update(res[1])

    io.LOG('preparing filter_stage3')
    io.logstats()
    stage3_jobs = [
        (run_filter_stage3, ovl, db_fn, fn, max_diff, max_cov, min_cov, min_len,
         ignore_all, contained, bestn)
        for fn in file_list
        if len(fn) != 0
    ]
    for res in exe_pool.imap(io.run_func, stage3_jobs):
        for l in res[1]:
            # Single-argument call form: same output under the Python 2 print
            # statement and the Python 3 print function.
            print(" ".join(l))
    io.logstats()
def run_track_reads(exe_pool, phased_read_file_fn, read_to_contig_map_fn, rawread_ids_fn,
                    file_list, min_len, bestn, db_fn, rawread_to_contigs_fn):
    """Map each b-read to contigs via its best a-read hits, using phase info.

    Args:
        exe_pool: Pool-like object providing `imap`.
        phased_read_file_fn: Whitespace-separated text file. Columns 1-3 of
            each row are read as (ctg_id, block, phase) and column 6 as the
            original read id.
        read_to_contig_map_fn: File passed to `get_rid_to_ctg`.
        rawread_ids_fn: File with one original read id per line; the line
            index is used as the read id (rid).
        file_list: LAS files, one `run_tr_stage1` job each.
        min_len, bestn: Forwarded to `run_tr_stage1`; `bestn` also caps the
            number of hits kept per b-read.
        db_fn: Database file name forwarded to `run_tr_stage1`.
        rawread_to_contigs_fn: Output file. Each line holds
            bread, ctg, count, rank, score, in_ctg.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    rid_to_ctg = get_rid_to_ctg(read_to_contig_map_fn)

    oid_to_phase = {}
    with open(phased_read_file_fn) as f:
        for row in f:
            row = row.strip().split()
            ctg_id, block, phase = row[1:4]
            oid = row[6]
            oid_to_phase[oid] = (ctg_id, int(block), int(phase))

    # Close the id file deterministically instead of leaking the handle.
    with open(rawread_ids_fn) as f:
        rid_to_oid = f.read().split('\n')
    # Indexed by rid; None for reads without phase information.
    rid_to_phase = [oid_to_phase.get(oid, None) for oid in rid_to_oid]

    inputs = [
        (run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase)
        for fn in file_list
    ]

    # Aggregate hits from each individual LAS file and keep the best n hits.
    # This does not guarantee that the final result is globally the best n
    # hits, especially when `bestn` is small: if a single LAS file holds more
    # good hits than that, some of them are missed.
    bread_to_areads = {}
    for fn, res in exe_pool.imap(io.run_func, inputs):
        for k in res:
            heap = bread_to_areads.setdefault(k, [])
            for item in res[k]:
                if len(heap) < bestn:
                    heappush(heap, item)
                else:
                    # Heap is full: push the new hit and drop the smallest.
                    heappushpop(heap, item)

    # For each b-read, find the best contig through the b -> a -> contig map.
    with open(rawread_to_contigs_fn, 'w') as out_f:
        for bread in bread_to_areads:
            # ctg -> [summed negated hit score, number of supporting hits]
            ctg_score = {}
            for s, rid in bread_to_areads[bread]:
                if rid not in rid_to_ctg:
                    continue
                for ctg in rid_to_ctg[rid]:
                    entry = ctg_score.setdefault(ctg, [0, 0])
                    entry[0] += -s
                    entry[1] += 1
            # sorted() works on Python 2 and 3 alike; a Python 3 dict view
            # has no .sort() method.
            ranked = sorted(ctg_score.items(), key=lambda k: k[1][0])
            for rank, (ctg, score_count) in enumerate(ranked):
                if bread in rid_to_ctg and ctg in rid_to_ctg[bread]:
                    in_ctg = 1
                else:
                    in_ctg = 0
                score, count = score_count
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)