Example #1
0
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Score raw-read-to-contig mappings and write 'rawread_to_contigs'.

    Farms out per-LAS scanning (run_tr_stage1) over exe_pool, keeps the
    best-n hits per b-read in a heap, then scores candidate contigs via
    the b->a->contig map and writes one ranked line per (b-read, contig).
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    rid_to_ctg = get_rid_to_ctg(
        os.path.join(asm_dir, 'read_maps', 'get_ctg_read_map',
                     'read_to_contig_map'))

    inputs = [(run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg)
              for fn in file_list]

    # Aggregate hits from each individual LAS and keep the best n hits.
    # Note: this does not guarantee the globally best n hits, especially
    # when `bestn` is small; extra good hits within one LAS may be missed.
    bread_to_areads = {}
    for fn, res in exe_pool.imap(io.run_func, inputs):
        for key, items in res.items():
            heap = bread_to_areads.setdefault(key, [])
            for item in items:
                if len(heap) < bestn:
                    heappush(heap, item)
                else:
                    heappushpop(heap, item)

    # For each b-read, find the best contig through the b->a->contig map.
    out_fn = os.path.join(asm_dir, 'read_maps/rawread_to_contigs')
    with open(out_fn, 'w') as out_f:
        for bread, areads in bread_to_areads.items():
            ctg_score = {}
            for s, rid in areads:
                if rid not in rid_to_ctg:
                    continue
                for ctg in rid_to_ctg[rid]:
                    entry = ctg_score.setdefault(ctg, [0, 0])
                    entry[0] -= s   # accumulate negated score
                    entry[1] += 1   # hit count

            # Rank contigs by accumulated (negated) score, ascending.
            ranked = sorted(ctg_score.items(), key=lambda kv: kv[1][0])
            for rank, (ctg, (score, count)) in enumerate(ranked):
                if bread in rid_to_ctg and ctg in rid_to_ctg[bread]:
                    in_ctg = 1
                else:
                    in_ctg = 0
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Score raw-read-to-contig mappings and write 'rawread_to_contigs'.

    Runs run_tr_stage1 over each LAS file in `file_list` via `exe_pool`,
    keeps the best-n hits per b-read in a heap, then scores candidate
    contigs through the b->a->contig map and writes one ranked line per
    (b-read, contig) pair under `base_dir`/2-asm-falcon/read_maps/.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    # rid -> contig(s) mapping produced by an earlier stage.
    rid_to_ctg = get_rid_to_ctg(os.path.join(
        asm_dir, 'read_maps', 'get_ctg_read_map', 'read_to_contig_map'))
    inputs = []
    for fn in file_list:
        inputs.append((run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg))
    """
    Aggregate hits from each individual LAS and keep the best n hit.
    Note that this does not guarantee that the final results is globally the best n hits espcially
    when the number of `bestn` is too small.  In those case, if there is more hits from single LAS
    file, then we will miss some good  hits.
    """
    # bread -> heap of up to `bestn` (s, rid) items; heappushpop evicts
    # the smallest item once the heap is full.
    bread_to_areads = {}
    for fn, res in exe_pool.imap(io.run_func, inputs):
        for k in res:
            bread_to_areads.setdefault(k, [])
            for item in res[k]:
                if len(bread_to_areads[k]) < bestn:
                    heappush(bread_to_areads[k], item)
                else:
                    heappushpop(bread_to_areads[k], item)

    #rid_to_oid = open(os.path.join(rawread_dir, 'dump_rawread_ids', 'raw_read_ids')).read().split('\n')

    """
    For each b-read, we find the best contig map throgh the b->a->contig map.
    """
    with open(os.path.join(asm_dir, 'read_maps/rawread_to_contigs'), 'w') as out_f:
        for bread in bread_to_areads:

            # ctg -> [accumulated negated score, hit count]
            ctg_score = {}
            for s, rid in bread_to_areads[bread]:
                if rid not in rid_to_ctg:
                    continue

                ctgs = rid_to_ctg[rid]
                for ctg in ctgs:
                    ctg_score.setdefault(ctg, [0, 0])
                    ctg_score[ctg][0] += -s
                    ctg_score[ctg][1] += 1

            #oid = rid_to_oid[int(bread)]
            # Sort ascending on the accumulated negated score, so the
            # contig with the largest total raw score comes first.
            ctg_score = list(ctg_score.items())
            ctg_score.sort(key=lambda k: k[1][0])
            rank = 0

            for ctg, score_count in ctg_score:
                # in_ctg flags whether the b-read itself maps to this contig.
                if bread in rid_to_ctg and ctg in rid_to_ctg[bread]:
                    in_ctg = 1
                else:
                    in_ctg = 0
                score, count = score_count
                #print(bread, oid, ctg, count, rank, score, in_ctg, file=out_f)
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
                rank += 1
Example #3
0
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Score pread-to-contig mappings and write 'pread_to_contigs'.

    Farms out per-LAS scanning (run_tr_stage1) over exe_pool, keeps the
    best-n hits per b-read in a heap, then scores candidate contigs via
    the b->a->contig map and writes one ranked line per (b-read, contig).
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    pid_to_ctg = get_pid_to_ctg(
        os.path.join(asm_dir, 'read_maps', 'get_ctg_read_map',
                     'read_to_contig_map'))
    io.LOG('len(pid_to_ctg) == {}'.format(len(pid_to_ctg)))
    assert pid_to_ctg, 'Empty pid_to_ctg. Maybe empty {!r}?'.format(file_list)

    inputs = [(run_tr_stage1, db_fn, fn, min_len, bestn, pid_to_ctg)
              for fn in file_list]

    # Aggregate hits from each individual LAS and keep the best n hits.
    # Note: this does not guarantee the globally best n hits, especially
    # when `bestn` is small; extra good hits within one LAS may be missed.
    bread_to_areads = {}
    for fn, res in exe_pool.imap(io.run_func, inputs):
        for key, items in res.items():
            heap = bread_to_areads.setdefault(key, [])
            for item in items:
                if len(heap) < bestn:
                    heappush(heap, item)
                else:
                    heappushpop(heap, item)
    assert bread_to_areads, 'No bread_to_areads found. Is there any point in continuing?'

    # For each b-read, find the best contig through the b->a->contig map.
    out_fn = os.path.join(asm_dir, "read_maps/pread_to_contigs")
    with open(out_fn, "w") as out_f:
        for bread, areads in bread_to_areads.items():
            ctg_score = {}
            for s, pid in areads:
                if pid not in pid_to_ctg:
                    continue
                for ctg in pid_to_ctg[pid]:
                    entry = ctg_score.setdefault(ctg, [0, 0])
                    entry[0] -= s   # accumulate negated score
                    entry[1] += 1   # hit count

            # Rank contigs by accumulated (negated) score, ascending.
            ranked = sorted(ctg_score.items(), key=lambda kv: kv[1][0])
            for rank, (ctg, (score, count)) in enumerate(ranked):
                if bread in pid_to_ctg and ctg in pid_to_ctg[bread]:
                    in_ctg = 1
                else:
                    in_ctg = 0
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
    """Score pread-to-contig mappings and write 'pread_to_contigs'.

    Runs run_tr_stage1 over each LAS file in `file_list` via `exe_pool`,
    keeps the best-n hits per b-read in a heap, then scores candidate
    contigs through the b->a->contig map and writes one ranked line per
    (b-read, contig) pair under `base_dir`/2-asm-falcon/read_maps/.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
    # pid -> contig(s) mapping produced by an earlier stage.
    pid_to_ctg = get_pid_to_ctg(os.path.join(
        asm_dir, 'read_maps', 'get_ctg_read_map', 'read_to_contig_map'))
    io.LOG('len(pid_to_ctg) == {}'.format(len(pid_to_ctg)))
    assert pid_to_ctg, 'Empty pid_to_ctg. Maybe empty {!r}?'.format(file_list)
    inputs = []
    for fn in file_list:
        inputs.append((run_tr_stage1, db_fn, fn, min_len, bestn, pid_to_ctg))

    """
    Aggregate hits from each individual LAS and keep the best n hit.
    Note that this does not guarantee that the final results is globally the best n hits espcially
    when the number of `bestn` is too small.  In those case, if there is more hits from single LAS
    file, then we will miss some good  hits.
    """

    # bread -> heap of up to `bestn` (s, pid) items; heappushpop evicts
    # the smallest item once the heap is full.
    bread_to_areads = {}
    for fn, res in exe_pool.imap(io.run_func, inputs):
        for k in res:
            bread_to_areads.setdefault(k, [])
            for item in res[k]:
                if len(bread_to_areads[k]) < bestn:
                    heappush(bread_to_areads[k], item)
                else:
                    heappushpop(bread_to_areads[k], item)
    assert bread_to_areads, 'No bread_to_areads found. Is there any point in continuing?'

    # For each b-read, find the best contig through the b->a->contig map.
    with open(os.path.join(asm_dir, "read_maps/pread_to_contigs"), "w") as out_f:
        for bread in bread_to_areads:

            # ctg -> [accumulated negated score, hit count]
            ctg_score = {}
            for s, pid in bread_to_areads[bread]:
                if pid not in pid_to_ctg:
                    continue

                ctgs = pid_to_ctg[pid]
                for ctg in ctgs:
                    ctg_score.setdefault(ctg, [0, 0])
                    ctg_score[ctg][0] += -s
                    ctg_score[ctg][1] += 1

            # Sort ascending on the accumulated negated score, so the
            # contig with the largest total raw score comes first.
            ctg_score = list(ctg_score.items())
            ctg_score.sort(key=lambda k: k[1][0])
            rank = 0

            for ctg, score_count in ctg_score:
                # in_ctg flags whether the b-read itself maps to this contig.
                if bread in pid_to_ctg and ctg in pid_to_ctg[bread]:
                    in_ctg = 1
                else:
                    in_ctg = 0
                score, count = score_count
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
                rank += 1
Example #5
0
def run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov, min_cov, min_len, min_idt, ignore_indels, bestn, db_fn):
    """Two-stage overlap filter; writes surviving overlap lines to `outs`.

    Stage 1 collects the 'ignore' and 'contained' read sets from each LAS
    file; stage 2 re-scans the files with those sets and emits the kept
    overlap records, one space-joined line per record.
    """
    la4falcon_flags = "mo" + ("I" if ignore_indels else "")

    io.LOG('preparing filter_stage1')
    io.logstats()
    inputs = [(run_filter_stage1, db_fn, fn, la4falcon_flags,
               max_diff, max_cov, min_cov, min_len, min_idt)
              for fn in file_list if len(fn) != 0]

    ignore_all = set()
    contained = set()
    for res in exe_pool.imap(io.run_func, inputs):
        ignore_all.update(res[1]["ignore"])
        contained.update(res[1]["contained"])
    # Do not count ignored reads as contained.
    contained.difference_update(ignore_all)

    io.LOG('preparing filter_stage2')
    io.logstats()
    inputs = [(run_filter_stage2, db_fn, fn, la4falcon_flags,
               max_diff, max_cov, min_cov, min_len, min_idt,
               ignore_all, contained, bestn)
              for fn in file_list if len(fn) != 0]
    for res in exe_pool.imap(io.run_func, inputs):
        for fields in res[1]:
            outs.write(" ".join(fields) + "\n")
    io.logstats()
Example #6
0
def run_ovlp_filter(exe_pool, ovl, file_list, max_diff, max_cov, min_cov,
                    min_len, bestn, db_fn):
    """Three-stage overlap filter; prints surviving overlap lines to stdout.

    Stage 1 collects the read ids to ignore, stage 2 collects contained
    reads (given the ignore set), and stage 3 emits the kept overlap
    records, one space-joined line per record.
    """
    io.LOG('preparing filter_stage1')
    io.logstats()
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((run_filter_stage1, ovl, db_fn, fn, max_diff,
                           max_cov, min_cov, min_len))

    ignore_all = []
    for res in exe_pool.imap(io.run_func, inputs):
        ignore_all.extend(res[1])

    io.LOG('preparing filter_stage2')
    io.logstats()
    inputs = []
    # Convert once to a set for O(1) membership; reused by stage 3.
    # (A redundant second set() conversion before stage 3 was removed.)
    ignore_all = set(ignore_all)
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((run_filter_stage2, ovl, db_fn, fn, max_diff,
                           max_cov, min_cov, min_len, ignore_all))
    contained = set()
    for res in exe_pool.imap(io.run_func, inputs):
        contained.update(res[1])

    io.LOG('preparing filter_stage3')
    io.logstats()
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append(
                (run_filter_stage3, ovl, db_fn, fn, max_diff, max_cov, min_cov,
                 min_len, ignore_all, contained, bestn))
    for res in exe_pool.imap(io.run_func, inputs):
        for l in res[1]:
            # BUG FIX: was a Python 2 print statement (`print " ".join(l)`),
            # which is a SyntaxError under the Python 3 used elsewhere here.
            print(" ".join(l))
    io.logstats()
Example #7
0
def run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
    """Three-stage overlap filter; writes surviving overlap lines to `outs`.

    Stage 1 collects the read ids to ignore, stage 2 collects contained
    reads (given the ignore set), and stage 3 emits the kept overlap
    records, one space-joined line per record.
    """
    io.LOG('preparing filter_stage1')
    io.logstats()
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((run_filter_stage1, db_fn, fn,
                           max_diff, max_cov, min_cov, min_len))

    ignore_all = []
    for res in exe_pool.imap(io.run_func, inputs):
        ignore_all.extend(res[1])

    io.LOG('preparing filter_stage2')
    io.logstats()
    inputs = []
    # Convert once to a set for O(1) membership tests.
    ignore_all = set(ignore_all)
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((run_filter_stage2, db_fn, fn, max_diff,
                           max_cov, min_cov, min_len, ignore_all))
    contained = set()
    for res in exe_pool.imap(io.run_func, inputs):
        contained.update(res[1])
        # print res[0], len(res[1]), len(contained)

    # print "all", len(contained)
    io.LOG('preparing filter_stage3')
    io.logstats()
    inputs = []
    # NOTE(review): ignore_all is already a set here; this re-conversion
    # is a harmless no-op.
    ignore_all = set(ignore_all)
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((run_filter_stage3, db_fn, fn, max_diff,
                           max_cov, min_cov, min_len, ignore_all, contained, bestn))
    for res in exe_pool.imap(io.run_func, inputs):
        for l in res[1]:
            outs.write(" ".join(l) + "\n")
    io.logstats()
Example #8
0
def run_track_reads(exe_pool, phased_read_file_fn, read_to_contig_map_fn, rawread_ids_fn, file_list, min_len, bestn, db_fn, rawread_to_contigs_fn):
    """Map raw reads to contigs, carrying phasing info, and write the result.

    Builds rid->phase from the phased-read file and the raw-read id list,
    farms out per-LAS scanning (run_tr_stage1) over exe_pool, keeps the
    best-n hits per b-read in a heap, then scores candidate contigs via
    the b->a->contig map and writes one ranked line per (b-read, contig)
    pair to `rawread_to_contigs_fn`.
    """
    io.LOG('preparing tr_stage1')
    io.logstats()
    rid_to_ctg = get_rid_to_ctg(read_to_contig_map_fn)

    # Phased-read file columns used: [1]=ctg_id, [2]=block, [3]=phase, [6]=oid.
    oid_to_phase = {}
    with open(phased_read_file_fn) as f:
        for row in f:
            row = row.strip().split()
            ctg_id, block, phase = row[1:4]
            oid = row[6]
            oid_to_phase[oid] = (ctg_id, int(block), int(phase))

    # rid is the line index into the raw-read id file; oids with no phase
    # entry map to None.  (FIX: read via a with-block so the file handle is
    # closed; dropped a dead `rid_to_phase = {}` that was overwritten.)
    with open(rawread_ids_fn) as f:
        rid_to_oid = f.read().split('\n')
    rid_to_phase = [oid_to_phase.get(oid) for oid in rid_to_oid]

    inputs = []
    for fn in file_list:
        inputs.append((run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase))

    # Aggregate hits from each individual LAS and keep the best n hits.
    # Note: this does not guarantee the globally best n hits, especially
    # when `bestn` is small; extra good hits within one LAS may be missed.
    bread_to_areads = {}
    for fn, res in exe_pool.imap(io.run_func, inputs):
        for k in res:
            bread_to_areads.setdefault(k, [])
            for item in res[k]:
                if len(bread_to_areads[k]) < bestn:
                    heappush(bread_to_areads[k], item)
                else:
                    heappushpop(bread_to_areads[k], item)

    # For each b-read, find the best contig through the b->a->contig map.
    with open(rawread_to_contigs_fn, 'w') as out_f:
        for bread in bread_to_areads:

            # ctg -> [accumulated negated score, hit count]
            ctg_score = {}
            for s, rid in bread_to_areads[bread]:
                if rid not in rid_to_ctg:
                    continue
                for ctg in rid_to_ctg[rid]:
                    ctg_score.setdefault(ctg, [0, 0])
                    ctg_score[ctg][0] += -s
                    ctg_score[ctg][1] += 1

            # BUG FIX: dict.items() returns a view in Python 3, which has no
            # .sort(); materialize to a list first (as the sibling
            # implementations in this file already do).
            ctg_score = list(ctg_score.items())
            ctg_score.sort(key=lambda k: k[1][0])
            rank = 0

            for ctg, score_count in ctg_score:
                # in_ctg flags whether the b-read itself maps to this contig.
                if bread in rid_to_ctg and ctg in rid_to_ctg[bread]:
                    in_ctg = 1
                else:
                    in_ctg = 0
                score, count = score_count
                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
                rank += 1