Exemple #1
0
def run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov, min_cov, min_len, min_idt, ignore_indels, bestn, db_fn):
    """Run the two-stage overlap filter and write the surviving records to ``outs``.

    Stage 1 unions, over every file, the "ignore" and "contained" collections
    found in the second element of each ``run_filter_stage1`` result.  Reads
    that are ignored are then removed from the contained set.  Stage 2 passes
    both sets to ``run_filter_stage2`` and writes each record of its result
    to ``outs`` as one space-joined line.

    Args:
        outs: Writable text stream; only ``write`` is used.
        exe_pool: Pool-like object; only ``imap`` is used.
        file_list: File names to process; zero-length names are skipped.
        max_diff, max_cov, min_cov, min_len, min_idt, bestn: Forwarded
            unchanged to the stage functions.
        ignore_indels: When truthy, "I" is appended to the "mo" flag string.
        db_fn: Database file name, forwarded to the stage functions.
    """
    la4falcon_flags = "mo"
    if ignore_indels:
        la4falcon_flags += "I"
    thresholds = (max_diff, max_cov, min_cov, min_len, min_idt)

    io.LOG('preparing filter_stage1')
    io.logstats()
    stage1_jobs = [
        (run_filter_stage1, db_fn, fn, la4falcon_flags) + thresholds
        for fn in file_list
        if len(fn) != 0
    ]

    ignore_all = set()
    contained = set()
    for res in exe_pool.imap(io.run_func, stage1_jobs):
        found = res[1]
        ignore_all.update(found["ignore"])
        contained.update(found["contained"])
    # Ignored reads must not be counted as contained.
    contained -= ignore_all

    io.LOG('preparing filter_stage2')
    io.logstats()
    stage2_jobs = [
        (run_filter_stage2, db_fn, fn, la4falcon_flags) + thresholds
        + (ignore_all, contained, bestn)
        for fn in file_list
        if len(fn) != 0
    ]
    for res in exe_pool.imap(io.run_func, stage2_jobs):
        for fields in res[1]:
            outs.write(" ".join(fields) + "\n")
    io.logstats()
Exemple #2
0
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Map each b-read to contigs via its best a-read hits and write the table.

    The read-to-contig map is loaded from
    ``<base_dir>/2-asm-falcon/read_maps/get_ctg_read_map/read_to_contig_map``
    and the result is written to
    ``<base_dir>/2-asm-falcon/read_maps/pread_to_contigs``.  Each output line
    holds ``bread ctg count rank score in_ctg``.

    Args:
        exe_pool: Pool-like object; only ``imap`` is used.
        base_dir: Directory containing ``2-asm-falcon``.
        file_list: File names, one ``run_tr_stage1`` job per entry.
        min_len: Forwarded to ``run_tr_stage1``.
        bestn: Maximum number of hits kept per b-read; also forwarded.
        db_fn: Database file name, forwarded to ``run_tr_stage1``.

    Raises:
        AssertionError: If the contig map or the aggregated hits are empty.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    map_fn = os.path.join(asm_dir, 'read_maps', 'get_ctg_read_map',
                          'read_to_contig_map')
    pid_to_ctg = get_pid_to_ctg(map_fn)
    io.LOG('len(pid_to_ctg) == {}'.format(len(pid_to_ctg)))
    assert pid_to_ctg, 'Empty pid_to_ctg. Maybe empty {!r}?'.format(file_list)
    inputs = [(run_tr_stage1, db_fn, fn, min_len, bestn, pid_to_ctg)
              for fn in file_list]

    # Aggregate hits from each individual LAS and keep the best n hits.
    # Note that this does not guarantee that the final result is globally the
    # best n hits, especially when `bestn` is too small.  In that case, if
    # there are more hits from a single LAS file, some good hits are missed.
    bread_to_areads = {}
    for _, res in exe_pool.imap(io.run_func, inputs):
        for bread in res:
            heap = bread_to_areads.setdefault(bread, [])
            for hit in res[bread]:
                # Bounded min-heap: once full, push the new hit and drop the
                # smallest, so the `bestn` largest items survive.
                if len(heap) < bestn:
                    heappush(heap, hit)
                else:
                    heappushpop(heap, hit)
    assert bread_to_areads, 'No bread_to_areads found. Is there any point in continuing?'

    out_fn = os.path.join(asm_dir, "read_maps/pread_to_contigs")
    with open(out_fn, "w") as out_f:
        for bread, hits in bread_to_areads.items():
            # ctg -> [accumulated negated hit value, number of hits]
            ctg_score = {}
            for s, pid in hits:
                if pid not in pid_to_ctg:
                    continue
                for ctg in pid_to_ctg[pid]:
                    tally = ctg_score.setdefault(ctg, [0, 0])
                    tally[0] -= s
                    tally[1] += 1

            # Ascending by accumulated score; rank 0 is the lowest score.
            ranked = sorted(ctg_score.items(), key=lambda kv: kv[1][0])
            for rank, (ctg, (score, count)) in enumerate(ranked):
                in_ctg = 1 if (bread in pid_to_ctg and ctg in pid_to_ctg[bread]) else 0
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
Exemple #3
0
def try_run_ovlp_stats(n_core, fofn, min_len):
    """Run ``run_ovlp_stats`` on a worker pool, cleaning the pool up on failure.

    Args:
        n_core: Upper bound on the number of workers; capped at the number
            of validated input files.
        fofn: File-of-file-names, resolved through ``io.validated_fns``.
        min_len: Forwarded to ``run_ovlp_stats``.

    A ``KeyboardInterrupt`` terminates the workers and is swallowed, so the
    function returns normally.  Any other exception also terminates the
    workers (so they are not left running) and is then re-raised.
    """
    io.LOG('starting ovlp_stats')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_stats(exe_pool, file_list, min_len)
        io.LOG('finished ovlp_stats')
    except KeyboardInterrupt:
        # A user interrupt is treated as a normal stop: kill workers, return.
        io.LOG('terminating ovlp_stats workers...')
        exe_pool.terminate()
    except Exception:
        # Do not leave workers alive behind a propagating error.
        io.LOG('terminating ovlp_stats workers...')
        exe_pool.terminate()
        raise
Exemple #4
0
def try_run_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
    """Run ``run_ovlp_filter`` on a worker pool and terminate it on any failure.

    Args:
        n_core: Upper bound on the number of workers; capped at the number
            of validated input files.
        fofn: File-of-file-names, resolved through ``io.validated_fns``.
        max_diff, max_cov, min_cov, min_len, bestn, db_fn: Forwarded
            unchanged to ``run_ovlp_filter``.

    Any exception (including an interrupt) terminates the workers and is
    re-raised to the caller.
    """
    io.LOG('starting ovlp_filter')
    las_fns = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' %(fofn, las_fns))
    workers = Pool(min(n_core, len(las_fns)))
    try:
        run_ovlp_filter(workers, las_fns, max_diff, max_cov, min_cov,
                        min_len, bestn, db_fn)
        io.LOG('finished ovlp_filter')
    except:
        # Bare except on purpose: workers are killed for every failure kind,
        # then the original exception continues upward.
        io.LOG('terminating ovlp_filter workers...')
        workers.terminate()
        raise
Exemple #5
0
def try_run_ovlp_stats(n_core, db_fn, fofn, min_len):
    """Run ``run_ovlp_stats`` on a worker pool, cleaning the pool up on failure.

    Args:
        n_core: Upper bound on the number of workers; capped at the number
            of validated input files.
        db_fn: Database file name, forwarded to ``run_ovlp_stats``.
        fofn: File-of-file-names, resolved through ``io.validated_fns``.
        min_len: Forwarded to ``run_ovlp_stats``.

    A ``KeyboardInterrupt`` terminates the workers and is swallowed, so the
    function returns normally.  Any other exception also terminates the
    workers (so they are not left running) and is then re-raised.
    """
    io.LOG('starting ovlp_stats')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn {!r}: {}'.format(fofn, file_list))
    # Logged before capping, so this shows the requested core count.
    io.LOG('db {!r}; n_core={}'.format(db_fn, n_core))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_stats(exe_pool, db_fn, file_list, min_len)
        io.LOG('finished ovlp_stats')
    except KeyboardInterrupt:
        # A user interrupt is treated as a normal stop: kill workers, return.
        io.LOG('terminating ovlp_stats workers...')
        exe_pool.terminate()
    except Exception:
        # Do not leave workers alive behind a propagating error.
        io.LOG('terminating ovlp_stats workers...')
        exe_pool.terminate()
        raise
Exemple #6
0
def try_run_track_reads(n_core, base_dir, min_len, bestn):
    """Run ``run_track_reads`` over the pread LAS files found under ``base_dir``.

    Inputs are globbed from ``<base_dir>/1-preads_ovl/m*/preads.*.las`` and
    the database is ``<base_dir>/1-preads_ovl/preads.db``.

    Args:
        n_core: Upper bound on the number of workers; capped at the number
            of LAS files found.
        base_dir: Directory containing ``1-preads_ovl``.
        min_len, bestn: Forwarded unchanged to ``run_track_reads``.

    Any exception terminates the workers and is re-raised.
    """
    io.LOG('starting track_reads')
    pread_dir = os.path.abspath(os.path.join(base_dir, "1-preads_ovl"))
    las_pattern = os.path.join(pread_dir, "m*/preads.*.las")
    las_fns = glob.glob(las_pattern)
    io.LOG('file list: %r' % las_fns)
    preads_db = os.path.join(pread_dir, "preads.db")
    workers = Pool(min(n_core, len(las_fns)))
    try:
        run_track_reads(workers, base_dir, las_fns, min_len, bestn, preads_db)
        io.LOG('finished track_reads')
    except:
        # Kill workers for every failure kind, then let the error propagate.
        io.LOG('terminating track_reads workers...')
        workers.terminate()
        raise
Exemple #7
0
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Map each b-read to contigs through its best a-read hits.

    The read-to-contig map is loaded from
    ``<base_dir>/2-asm-falcon/read_maps/get_ctg_read_map/read_to_contig_map``
    and the result is written to
    ``<base_dir>/2-asm-falcon/read_maps/rawread_to_contigs``, one line per
    (b-read, contig) pair: ``bread ctg count rank score in_ctg``.

    Args:
        exe_pool: Pool-like object; only ``imap`` is used.
        base_dir: Directory containing ``2-asm-falcon``.
        file_list: File names, one ``run_tr_stage1`` job per entry.
        min_len: Forwarded to ``run_tr_stage1``.
        bestn: Maximum number of hits kept per b-read; also forwarded.
        db_fn: Database file name, forwarded to ``run_tr_stage1``.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    map_fn = os.path.join(asm_dir, 'read_maps', 'get_ctg_read_map',
                          'read_to_contig_map')
    rid_to_ctg = get_rid_to_ctg(map_fn)
    jobs = [(run_tr_stage1, db_fn, las_fn, min_len, bestn, rid_to_ctg)
            for las_fn in file_list]

    # Aggregate hits from each individual LAS and keep the best n hits.
    # Note that this does not guarantee that the final result is globally the
    # best n hits, especially when `bestn` is too small.  In that case, if
    # there are more hits from a single LAS file, some good hits are missed.
    bread_to_areads = {}
    for _, per_file in exe_pool.imap(io.run_func, jobs):
        for bread in per_file:
            kept = bread_to_areads.setdefault(bread, [])
            for candidate in per_file[bread]:
                # Once `bestn` entries are held, each push evicts the
                # smallest entry of the heap.
                if len(kept) >= bestn:
                    heappushpop(kept, candidate)
                else:
                    heappush(kept, candidate)

    # For each b-read, find the best contig through the b->a->contig map.
    out_path = os.path.join(asm_dir, 'read_maps/rawread_to_contigs')
    with open(out_path, 'w') as out_f:
        for bread, kept in bread_to_areads.items():
            # ctg -> [accumulated negated hit value, number of hits]
            ctg_score = {}
            for s, rid in kept:
                if rid in rid_to_ctg:
                    for ctg in rid_to_ctg[rid]:
                        totals = ctg_score.setdefault(ctg, [0, 0])
                        totals[0] -= s
                        totals[1] += 1

            # Ascending by accumulated score; rank 0 is the lowest score.
            ordered = sorted(ctg_score.items(), key=lambda pair: pair[1][0])
            for rank, (ctg, (score, count)) in enumerate(ordered):
                in_ctg = int(bread in rid_to_ctg and ctg in rid_to_ctg[bread])
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
Exemple #8
0
def try_run_ovlp_filter(out_fn, n_core, fofn, max_diff, max_cov, min_cov, min_len, min_idt, ignore_indels, bestn, db_fn):
    """Run ``run_ovlp_filter`` on a worker pool, publishing the result via rename.

    Output is first written to ``out_fn + '.tmp'`` and renamed to ``out_fn``
    only after the filter has finished and the file is closed, so ``out_fn``
    never holds a partial result.

    Args:
        out_fn: Final output path.
        n_core: Upper bound on the number of workers; capped at the number
            of validated input files.
        fofn: File-of-file-names, resolved through ``io.validated_fns``.
        max_diff, max_cov, min_cov, min_len, min_idt, ignore_indels, bestn,
        db_fn: Forwarded unchanged to ``run_ovlp_filter``.

    Any exception terminates the workers and is re-raised; the temporary
    file is left in place in that case.
    """
    io.LOG('starting ovlp_filter')
    las_fns = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, las_fns))
    workers = Pool(min(n_core, len(las_fns)))
    scratch_fn = out_fn + '.tmp'
    try:
        with open(scratch_fn, 'w') as outs:
            run_ovlp_filter(
                outs, workers, las_fns, max_diff, max_cov, min_cov,
                min_len, min_idt, ignore_indels, bestn, db_fn)
        os.rename(scratch_fn, out_fn)
        io.LOG('finished ovlp_filter')
    except:
        # Kill workers for every failure kind, then let the error propagate.
        io.LOG('terminating ovlp_filter workers...')
        workers.terminate()
        raise
Exemple #9
0
def try_run_track_reads(n_core, phased_read_file, read_to_contig_map, rawread_ids, min_len, bestn, output):
    """Run ``run_track_reads`` over the raw-read LAS files in ``./0-rawreads``.

    Inputs are globbed from ``0-rawreads/m*/raw_reads.*.las`` relative to the
    current working directory; the database is ``0-rawreads/raw_reads.db``.

    Args:
        n_core: Upper bound on the number of workers; capped at the number
            of LAS files found.
        phased_read_file, read_to_contig_map, rawread_ids, min_len, bestn,
        output: Forwarded unchanged to ``run_track_reads``.

    Any exception terminates the workers and is re-raised.
    """
    io.LOG('starting track_reads')

    rawread_dir = os.path.abspath('0-rawreads')

    # TODO: More inputs -- find the LAS paths with better logic, or move the
    # discovery to the caller (e.g. a --fofn option).
    las_pattern = os.path.join(rawread_dir, 'm*/raw_reads.*.las')
    las_fns = glob.glob(las_pattern)
    io.LOG('file list: %r' % las_fns)

    # TODO: Another input
    raw_db = os.path.join(rawread_dir, 'raw_reads.db')
    workers = Pool(min(n_core, len(las_fns)))
    try:
        run_track_reads(
            workers, phased_read_file, read_to_contig_map, rawread_ids,
            las_fns, min_len, bestn, raw_db, output)
        io.LOG('finished track_reads')
    except:
        # Kill workers for every failure kind, then let the error propagate.
        io.LOG('terminating track_reads workers...')
        workers.terminate()
        raise
Exemple #10
0
def run_filter_stats(db_fn, fn, min_len):
    """Compute filter statistics for one file and return them with its name.

    Args:
        db_fn: Database file name, placed first on the LA4Falcon command line.
        fn: Input file name, placed second on the command line.
        min_len: Forwarded to ``filter_stats``.

    Returns:
        A ``(fn, stats)`` tuple, where ``stats`` is whatever ``filter_stats``
        returns.

    Any ``Exception`` is logged with its traceback via ``io.LOG`` and then
    re-raised.
    """
    try:
        # NOTE(review): Reader presumably runs the command and exposes its
        # output through `readlines` -- confirm against its definition.
        las_dump = Reader("LA4Falcon -mo {} {}".format(db_fn, fn))
        with las_dump:
            stats = filter_stats(las_dump.readlines, min_len)
            return fn, stats
    except Exception:
        io.LOG(traceback.format_exc())
        raise
Exemple #11
0
def try_run_track_reads(n_core, base_dir, min_len, bestn):
    """Run ``run_track_reads`` over the raw-read LAS files found under ``base_dir``.

    Inputs are globbed from ``<base_dir>/0-rawreads/m*/raw_reads.*.las`` and
    the database is ``<base_dir>/0-rawreads/raw_reads.db``.

    Args:
        n_core: Upper bound on the number of workers; capped at the number
            of LAS files found.
        base_dir: Directory containing ``0-rawreads``.
        min_len, bestn: Forwarded unchanged to ``run_track_reads``.

    Any exception terminates the workers and is re-raised.
    """
    io.LOG('starting track_reads')

    rawread_dir = os.path.abspath(os.path.join(base_dir, "0-rawreads"))

    # TODO: find the LAS paths with better logic, or move the discovery to
    # the caller (e.g. a --fofn option).
    las_pattern = os.path.join(rawread_dir, "m*/raw_reads.*.las")
    las_fns = glob.glob(las_pattern)
    io.LOG('file list: %r' % las_fns)

    # TODO: likewise, decide whether the database path should be a parameter.
    raw_db = os.path.join(rawread_dir, "raw_reads.db")
    workers = Pool(min(n_core, len(las_fns)))
    try:
        run_track_reads(workers, base_dir, las_fns, min_len, bestn, raw_db)
        io.LOG('finished track_reads')
    except:
        # Kill workers for every failure kind, then let the error propagate.
        io.LOG('terminating track_reads workers...')
        workers.terminate()
        raise
Exemple #12
0
def run_ovlp_filter(exe_pool, ovl, file_list, max_diff, max_cov, min_cov,
                    min_len, bestn, db_fn):
    """Run the three-stage overlap filter and print the surviving records.

    Stage 1 collects the reads to ignore, stage 2 collects the contained
    reads given that ignore set, and stage 3 produces the final records,
    each printed to standard output as one space-joined line.

    Args:
        exe_pool: Pool-like object; only ``imap`` is used.
        ovl: Forwarded unchanged to every stage function.
        file_list: File names to process; zero-length names are skipped.
        max_diff, max_cov, min_cov, min_len: Forwarded to every stage.
        bestn: Forwarded to stage 3 only.
        db_fn: Database file name, forwarded to every stage.
    """
    io.LOG('preparing filter_stage1')
    io.logstats()
    inputs = [(run_filter_stage1, ovl, db_fn, fn, max_diff,
               max_cov, min_cov, min_len)
              for fn in file_list if len(fn) != 0]

    # Accumulate straight into a set; duplicates across files collapse here.
    ignore_all = set()
    for res in exe_pool.imap(io.run_func, inputs):
        ignore_all.update(res[1])

    io.LOG('preparing filter_stage2')
    io.logstats()
    inputs = [(run_filter_stage2, ovl, db_fn, fn, max_diff,
               max_cov, min_cov, min_len, ignore_all)
              for fn in file_list if len(fn) != 0]
    contained = set()
    for res in exe_pool.imap(io.run_func, inputs):
        contained.update(res[1])

    io.LOG('preparing filter_stage3')
    io.logstats()
    inputs = [(run_filter_stage3, ovl, db_fn, fn, max_diff, max_cov, min_cov,
               min_len, ignore_all, contained, bestn)
              for fn in file_list if len(fn) != 0]
    for res in exe_pool.imap(io.run_func, inputs):
        for fields in res[1]:
            # Single-argument parenthesised form: same output under the
            # Python 2 print statement and the Python 3 print function.
            print(" ".join(fields))
    io.logstats()
Exemple #13
0
def run_track_reads(exe_pool, phased_read_file_fn, read_to_contig_map_fn, rawread_ids_fn, file_list, min_len, bestn, db_fn, rawread_to_contigs_fn):
    """Map each b-read to contigs through its best a-read hits, with phasing.

    Each output line in ``rawread_to_contigs_fn`` holds
    ``bread ctg count rank score in_ctg``.

    Args:
        exe_pool: Pool-like object; only ``imap`` is used.
        phased_read_file_fn: Whitespace-separated text file; columns 1-3 are
            read as (contig id, block, phase) and column 6 as the read's oid.
        read_to_contig_map_fn: Path handed to ``get_rid_to_ctg``.
        rawread_ids_fn: Text file with one oid per line; the line index is
            used as the rid.
        file_list: File names, one ``run_tr_stage1`` job per entry.
        min_len: Forwarded to ``run_tr_stage1``.
        bestn: Maximum number of hits kept per b-read; also forwarded.
        db_fn: Database file name, forwarded to ``run_tr_stage1``.
        rawread_to_contigs_fn: Output path.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    rid_to_ctg = get_rid_to_ctg(read_to_contig_map_fn)

    oid_to_phase = {}
    with open(phased_read_file_fn) as f:
        for row in f:
            row = row.strip().split()
            ctg_id, block, phase = row[1:4]
            oid = row[6]
            oid_to_phase[oid] = (ctg_id, int(block), int(phase))

    # Read through a context manager so the handle is closed promptly.
    with open(rawread_ids_fn) as f:
        rid_to_oid = f.read().split('\n')
    # Indexed by rid; None for reads without phasing information.
    rid_to_phase = [oid_to_phase.get(oid) for oid in rid_to_oid]

    inputs = [(run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase)
              for fn in file_list]

    # Aggregate hits from each individual LAS and keep the best n hits.
    # Note that this does not guarantee that the final result is globally the
    # best n hits, especially when `bestn` is too small.  In that case, if
    # there are more hits from a single LAS file, some good hits are missed.
    bread_to_areads = {}
    for fn, res in exe_pool.imap(io.run_func, inputs):
        for k in res:
            heap = bread_to_areads.setdefault(k, [])
            for item in res[k]:
                # Bounded min-heap: once full, each push evicts the smallest.
                if len(heap) < bestn:
                    heappush(heap, item)
                else:
                    heappushpop(heap, item)

    # For each b-read, find the best contig through the b->a->contig map.
    with open(rawread_to_contigs_fn, 'w') as out_f:
        for bread in bread_to_areads:
            # ctg -> [accumulated negated hit value, number of hits]
            ctg_score = {}
            for s, rid in bread_to_areads[bread]:
                if rid not in rid_to_ctg:
                    continue
                for ctg in rid_to_ctg[rid]:
                    ctg_score.setdefault(ctg, [0, 0])
                    ctg_score[ctg][0] += -s
                    ctg_score[ctg][1] += 1

            # sorted() works on both a Python 2 list and a Python 3 dict
            # view; calling .sort() on dict.items() fails on Python 3.
            ranked = sorted(ctg_score.items(), key=lambda k: k[1][0])

            for rank, (ctg, score_count) in enumerate(ranked):
                if bread in rid_to_ctg and ctg in rid_to_ctg[bread]:
                    in_ctg = 1
                else:
                    in_ctg = 0
                score, count = score_count
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)