コード例 #1
0
def test_load_headers_2(tmpdir):
    """
    A header that occurs twice in the FASTA input must appear only once in
    the set returned by load_headers.
    """
    random.seed(1234567)

    bases = ''.join(random.choice('ACTG') for _ in range(500))

    header_1 = '>000000F-001-01 000000123:E 000000078:E 46974 1268418 33 0 1.00 1.00'
    header_2 = '>000000F-002-02 000000125:E 000000080:E 46974 1268418 33 0 1.00 1.00'
    # header_2 is deliberately listed twice.
    records = [header_1, bases, header_2, bases, header_2, bases]

    # Materialise the records as a temporary FASTA file.
    fasta_path = tmpdir.join('a_ctg.fa')
    fasta_path.write('\n'.join(records))

    with open_fasta_reader(str(fasta_path)) as reader:
        observed = mod.load_headers(reader)

    assert observed == {'000000F-001-01', '000000F-002-02'}
コード例 #2
0
def test_load_headers_1(tmpdir):
    """
    Two records with distinct headers: both ids are expected in the result.
    """
    random.seed(1234567)

    bases = ''.join(random.choice('ACTG') for _ in range(500))

    headers = (
        '>000000F-001-01 000000123:E 000000078:E 46974 1268418 33 0 1.00 1.00',
        '>000000F-002-02 000000125:E 000000080:E 46974 1268418 33 0 1.00 1.00',
    )
    # Interleave each header with the shared dummy sequence.
    records = []
    for header in headers:
        records.append(header)
        records.append(bases)

    # Materialise the records as a temporary FASTA file.
    fasta_path = tmpdir.join('a_ctg.fa')
    fasta_path.write('\n'.join(records))

    with open_fasta_reader(str(fasta_path)) as reader:
        observed = mod.load_headers(reader)

    assert observed == {'000000F-001-01', '000000F-002-02'}
コード例 #3
0
ファイル: test_dedup_a_tp.py プロジェクト: pb-cdunn/FALCON
def test_load_headers_2(tmpdir):
    """
    There is a duplicate header in the input. This should be reduced to
    only the unique occurrences.
    """
    random.seed(1234567)

    # range() instead of xrange(): xrange does not exist on Python 3, and for
    # 500 items range() behaves the same on Python 2.
    dummy_seq = ''.join([random.choice('ACTG') for i in range(500)])

    fasta_lines = [
        '>000000F-001-01 000000123:E 000000078:E 46974 1268418 33 0 1.00 1.00',
        dummy_seq,
        '>000000F-002-02 000000125:E 000000080:E 46974 1268418 33 0 1.00 1.00',
        dummy_seq,
        # Same header as the previous record, on purpose.
        '>000000F-002-02 000000125:E 000000080:E 46974 1268418 33 0 1.00 1.00',
        dummy_seq,
    ]

    # Create a temporary input file.
    test_out_a_ctg_file = tmpdir.join('a_ctg.fa')
    test_out_a_ctg_file.write('\n'.join(fasta_lines))

    with open_fasta_reader(str(test_out_a_ctg_file)) as fp_in:
        out = mod.load_headers(fp_in)

    expected = set(['000000F-001-01', '000000F-002-02'])

    assert out == expected
コード例 #4
0
ファイル: test_dedup_a_tp.py プロジェクト: pb-cdunn/FALCON
def test_load_headers_1(tmpdir):
    """
    Regular case: two records with distinct headers.
    """
    random.seed(1234567)

    # range() instead of xrange(): xrange does not exist on Python 3, and for
    # 500 items range() behaves the same on Python 2.
    dummy_seq = ''.join([random.choice('ACTG') for i in range(500)])

    fasta_lines = [
        '>000000F-001-01 000000123:E 000000078:E 46974 1268418 33 0 1.00 1.00',
        dummy_seq,
        '>000000F-002-02 000000125:E 000000080:E 46974 1268418 33 0 1.00 1.00',
        dummy_seq,
    ]

    # Create a temporary input file.
    test_out_a_ctg_file = tmpdir.join('a_ctg.fa')
    test_out_a_ctg_file.write('\n'.join(fasta_lines))

    with open_fasta_reader(str(test_out_a_ctg_file)) as fp_in:
        out = mod.load_headers(fp_in)

    expected = set(['000000F-001-01', '000000F-002-02'])

    assert out == expected
コード例 #5
0
def main(argv=sys.argv):
    """Copy ``a_ctg_all.fa`` to ``a_ctg.fa``, skipping filtered records.

    Each record name is expected to split into exactly nine whitespace
    separated fields; the last three are read as ``delta_l``, ``idt`` and
    ``cov``. A record is skipped when all of the following hold:

    * ``100 * idt > args.max_idt``
    * ``100 * cov > args.max_aln_cov``
    * ``abs(delta_l) < args.min_len_diff``

    Both file names are relative to the current working directory.

    argv: argument vector handed to ``parse_args`` (defined elsewhere).
    """
    args = parse_args(argv)
    with open_fasta_reader("a_ctg_all.fa") as reads:
        with open("a_ctg.fa", "w") as f:
            for r in reads:
                tig_id, v, w, len_, ovl, ne, delta_l, idt, cov = r.name.split()
                if 100 * float(idt) > args.max_idt and 100 * float(cov) > args.max_aln_cov and\
                   abs(int(delta_l)) < args.min_len_diff:
                    continue
                # f.write() instead of the Python-2-only ``print >> f`` form,
                # which raises TypeError when executed on Python 3.
                f.write(">%s\n" % r.name)
                f.write("%s\n" % r.sequence)
コード例 #6
0
def main(argv=sys.argv):
    """Filter ``a_ctg_all.fa`` into ``a_ctg.fa``.

    A record is left out when its identity (x100) exceeds ``args.max_idt``,
    its coverage (x100) exceeds ``args.max_aln_cov`` and the absolute length
    difference is below ``args.min_len_diff``; every other record is written
    unchanged.
    """
    args = parse_args(argv)
    with open_fasta_reader("a_ctg_all.fa") as reads, open("a_ctg.fa", "w") as out:
        for record in reads:
            # The record name carries nine whitespace-separated fields.
            tig_id, v, w, len_, ovl, ne, delta_l, idt, cov = record.name.split()
            filtered = (
                100 * float(idt) > args.max_idt
                and 100 * float(cov) > args.max_aln_cov
                and abs(int(delta_l)) < args.min_len_diff
            )
            if not filtered:
                print(">" + record.name, file=out)
                print(record.sequence, file=out)
コード例 #7
0
ファイル: dedup_a_tp.py プロジェクト: pb-cdunn/FALCON
def run(fp_out, a_ctg, a_ctg_all_tiling_path):
    """Write the tiling-path lines whose first column is a header in a_ctg.

    fp_out: writable file-like object receiving the retained lines.
    a_ctg: path of the FASTA file passed to ``load_headers``.
    a_ctg_all_tiling_path: path of the tiling-path text file to filter.
    """
    with open_fasta_reader(a_ctg) as fasta_in:
        kept_ids = load_headers(fasta_in)

    with open(a_ctg_all_tiling_path, 'r') as tiling_in:
        for raw in tiling_in:
            row = raw.strip()
            if not row:  # pragma: no cover
                continue  # pragma: no cover
            # Keep the row only when its first field is a known header.
            if row.split()[0] in kept_ids:
                fp_out.write('%s\n' % (row))
コード例 #8
0
def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    """Write one ``<contig>_ref.fa`` file per selected primary contig.

    Contigs are read from ``<base_dir>/2-asm-falcon/p_ctg.fa``. A contig is
    written when ``ctg_id`` is ``'all'`` or equals the contig's id (first
    token of its FASTA name) and its sequence is at least ``min_ctg_lenth``
    long.

    base_dir: root directory containing ``2-asm-falcon``.
    fofn: stored in ``read_fofn``; not used in the portion shown here.
    ctg_id: a contig id, or ``'all'`` to select every contig.
    out_dir: output directory; ``None`` selects ``<base_dir>/3-unzip/reads``.
    min_ctg_lenth: minimum sequence length for a contig to be written.
    """
    read_fofn = fofn
    if out_dir is None:
        out_dir = os.path.join(base_dir, '3-unzip/reads')

    ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
    read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')

    rawread_id_file = os.path.join(read_map_dir, 'dump_rawread_ids',
                                   'rawread_ids')
    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')

    # Context managers so the id files are closed deterministically instead
    # of relying on garbage collection.
    with open(rawread_id_file) as id_file:
        # daligner raw read id to the original ids
        rid_to_oid = id_file.read().split('\n')
    with open(pread_id_file) as id_file:
        # daligner pread id to the fake ids
        pid_to_fid = id_file.read().split('\n')

    def pid_to_oid(pid):
        """Map a pread id to an original read id via the two id tables."""
        fid = pid_to_fid[int(pid)]
        # Floor division keeps the index an int on both Python 2 and 3.
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[rid]

    with open_fasta_reader(ctg_fa) as ref_fasta:
        all_ctg_ids = set()
        for s in ref_fasta:
            s_id = s.name.split()[0]
            if ctg_id != 'all' and s_id != ctg_id:
                continue

            if len(s.sequence) < min_ctg_lenth:
                continue

            # Past the filter above, s_id == ctg_id whenever ctg_id != 'all',
            # so a single file name expression covers both original branches.
            ref_path = os.path.join(out_dir, '%s_ref.fa' % s_id)
            # ``with`` closes the file even if a write fails; write() replaces
            # the Python-2-only ``print >> ref_out`` form.
            with open(ref_path, 'w') as ref_out:
                ref_out.write('>%s\n' % s_id)
                ref_out.write('%s\n' % s.sequence)
            all_ctg_ids.add(s_id)
コード例 #9
0
def test_load_headers_3(tmpdir):
    """
    An empty FASTA file must produce an empty set of headers.
    """
    random.seed(1234567)

    # Generated as in the sibling tests; not referenced below.
    dummy_seq = ''.join(random.choice('ACTG') for _ in range(500))

    records = []

    # Materialise the (empty) input as a temporary FASTA file.
    fasta_path = tmpdir.join('a_ctg.fa')
    fasta_path.write('\n'.join(records))

    with open_fasta_reader(str(fasta_path)) as reader:
        observed = mod.load_headers(reader)

    assert observed == set()
コード例 #10
0
ファイル: test_dedup_a_tp.py プロジェクト: pb-cdunn/FALCON
def test_load_headers_3(tmpdir):
    """
    Empty input: load_headers is expected to return an empty set.
    """
    random.seed(1234567)

    # range() instead of xrange(): xrange does not exist on Python 3.
    # The sequence is not referenced by this test.
    dummy_seq = ''.join([random.choice('ACTG') for i in range(500)])

    fasta_lines = []

    # Create a temporary (empty) input file.
    test_out_a_ctg_file = tmpdir.join('a_ctg.fa')
    test_out_a_ctg_file.write('\n'.join(fasta_lines))

    with open_fasta_reader(str(test_out_a_ctg_file)) as fp_in:
        out = mod.load_headers(fp_in)

    expected = set()

    assert out == expected
コード例 #11
0
def main(argv=sys.argv):
    """Print, for every record of ``a_ctg.fa``, its id and two coordinates.

    Coordinates come from ``p_ctg_tiling_path``: per contig, the first edge's
    start node is at 0 and each edge's end node is at the running sum of
    ``abs(b - e)``. The record's second and third name fields are looked up
    under the contig id taken from the text before the first ``-`` of the
    record id.
    """
    coords_by_ctg = {}
    with open("p_ctg_tiling_path") as tiling:
        for line in tiling:
            fields = line.strip().split()
            ctg_id, v, w, edge_rid, b, e = fields[:6]
            if ctg_id not in coords_by_ctg:
                # First edge seen for this contig: restart the running offset.
                # The tiling path is expected to be grouped by contig, with
                # edges in path order.
                offset = 0
                coords_by_ctg[ctg_id] = {v: 0}
            offset += abs(int(b) - int(e))
            coords_by_ctg[ctg_id][w] = offset

    with open_fasta_reader("a_ctg.fa") as a_ctg_fasta:
        for record in a_ctg_fasta:
            name_fields = record.name.split()
            rid, v, w = name_fields[:3]
            pid = rid.split("-")[0]
            node_coords = coords_by_ctg[pid]
            print(rid, node_coords[v], node_coords[w])
コード例 #12
0
def run(improper_p_ctg, proper_a_ctg):
    """Write primary and associated contig FASTA files and tiling paths.

    Reads the edge, unitig and contig tables named by ``edge_data_file``,
    ``utg_data_file`` and ``ctg_data_file`` plus the reads in ``read_fasta``
    (all defined outside this function), and writes ``p_ctg.fa``,
    ``a_ctg_all.fa``, ``a_ctg_base.fa``, ``p_ctg_tiling_path``,
    ``a_ctg_tiling_path`` and ``a_ctg_base_tiling_path`` in the current
    working directory.

    improper_p_ctg: when true, the primary contig sequence is built from the
        edge sequences only; otherwise the output of ``yield_first_seq`` is
        placed in front. (Original note: improper==True => neglect the
        initial read. We used to need that for unzip.)
    proper_a_ctg: when true, the output of ``yield_first_seq`` is placed in
        front of each associated contig sequence; otherwise only the edge
        sequences are used.
    """
    # Pass 1 over the edge table: collect the read ids at both ends of every
    # "G"-type edge so that only those reads are kept in memory below.
    reads_in_layout = set()
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            # The bare string below is an example row (a no-op expression).
            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G"""
            v, w, rid, s, t, aln_score, idt, type_ = l
            if type_ != "G":
                continue
            r1 = v.split(":")[0]
            reads_in_layout.add(r1)
            r2 = w.split(":")[0]
            reads_in_layout.add(r2)

    seqs = {}
    # Load the upper-cased sequence of every read that is part of the layout.
    with open_fasta_reader(read_fasta) as f:
        for r in f:
            if r.name not in reads_in_layout:
                continue
            seqs[r.name] = r.sequence.upper() # name == rid-string

    # Pass 2 over the edge table: build (v, w) -> edge record, including the
    # sequence slice contributed by the edge.
    edge_data = {}
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G"""
            v, w, rid, s, t, aln_score, idt, type_ = l

            if type_ != "G":
                continue
            r1, dir1 = v.split(":")
            reads_in_layout.add(r1) # redundant, but harmless
            r2, dir2 = w.split(":")
            reads_in_layout.add(r2) # redundant, but harmless

            s = int(s)
            t = int(t)
            aln_score = int(aln_score)
            idt = float(idt)

            if s < t:
                # Forward slice of the read.
                e_seq = seqs[rid][s:t]
                assert 'E' == dir2
            else:
                # t and s were swapped for 'c' alignments in ovlp_to_graph.generate_string_graph():702
                # They were translated from reverse-dir to forward-dir coordinate system in LA4Falcon.
                # Slice t..s, reverse it and map each base through RCMAP
                # (defined elsewhere; presumably a complement table).
                e_seq = "".join([RCMAP[c] for c in seqs[rid][t:s][::-1]])
                assert 'B' == dir2
            edge_data[(v, w)] = (rid, s, t, aln_score, idt, e_seq)

    # Unitig table: (s, v, t) -> (type, length, score, path_or_edges).
    # "simple"/"contained" rows hold a "~"-separated node path; "compound"
    # rows hold "|"-separated unitig keys, each a "~"-separated tuple.
    utg_data = {}
    with open(utg_data_file) as f:
        for l in f:
            l = l.strip().split()
            s, v, t, type_, length, score, path_or_edges = l
            if type_ not in ["compound", "simple", "contained"]:
                continue
            length = int(length)
            score = int(score)
            if type_ in ("simple", "contained"):
                path_or_edges = path_or_edges.split("~")
            else:
                path_or_edges = [tuple(e.split("~"))
                                 for e in path_or_edges.split("|")]
            utg_data[(s, v, t)] = type_, length, score, path_or_edges

    # Output files; they are closed explicitly at the end of the function.
    p_ctg_out = open("p_ctg.fa", "w")
    a_ctg_out = open("a_ctg_all.fa", "w")
    a_ctg_base_out = open("a_ctg_base.fa", "w")
    p_ctg_t_out = open("p_ctg_tiling_path", "w")
    a_ctg_t_out = open("a_ctg_tiling_path", "w")
    a_ctg_base_t_out = open("a_ctg_base_tiling_path", "w")
    # (s0, t0) end pairs of the contigs processed so far.
    layout_ctg = set()

    with open(ctg_data_file) as f:
        for l in f:
            l = l.strip().split()
            ctg_id, c_type_, i_utig, t0, length, score, utgs = l
            # No-op self-assignment.
            ctg_id = ctg_id
            s0 = i_utig.split("~")[0]

            # Skip a contig whose reversed end pair was already processed
            # (reverse_end is defined elsewhere; presumably this drops the
            # reverse-complement twin of an earlier contig).
            if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
                continue
            else:
                layout_ctg.add((s0, t0))

            ctg_label = i_utig + "~" + t0
            length = int(length)
            utgs = utgs.split("|")
            # Node path of the primary contig, grown unitig by unitig.
            one_path = []
            total_score = 0
            total_length = 0

            #a_ctg_data = []
            # (s, t) of each compound unitig -> list of (score, path).
            a_ctg_group = {}

            for utg in utgs:
                s, v, t = utg.split("~")
                type_, length, score, path_or_edges = utg_data[(s, v, t)]
                total_score += score
                total_length += length
                if type_ == "simple":
                    # Drop the first node when appending: it repeats the last
                    # node already on the path.
                    if len(one_path) != 0:
                        one_path.extend(path_or_edges[1:])
                    else:
                        one_path.extend(path_or_edges)
                if type_ == "compound":

                    # Graph of all edges of the member unitigs, weighted by
                    # the edge alignment score.
                    c_graph = nx.DiGraph()

                    all_alt_path = []
                    for ss, vv, tt in path_or_edges:
                        # NOTE: rebinds type_, length and score; the totals
                        # above were already updated with the outer values.
                        type_, length, score, sub_path = utg_data[(ss, vv, tt)]

                        v1 = sub_path[0]
                        for v2 in sub_path[1:]:
                            c_graph.add_edge(
                                v1, v2, e_score=edge_data[(v1, v2)][3])
                            v1 = v2

                    # Path from s to t with the lowest summed e_score.
                    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
                    score = nx.shortest_path_length(c_graph, s, t, "e_score")
                    all_alt_path.append((score, shortest_path))

                    # a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
                    # Repeatedly remove the edges of the path just found and
                    # search again, until s and t are no longer connected.
                    while 1:
                        n0 = shortest_path[0]
                        for n1 in shortest_path[1:]:
                            c_graph.remove_edge(n0, n1)
                            n0 = n1
                        try:
                            shortest_path = nx.shortest_path(
                                c_graph, s, t, "e_score")
                            score = nx.shortest_path_length(
                                c_graph, s, t, "e_score")
                            #a_ctg_data.append( (s, t, shortest_path) )
                            all_alt_path.append((score, shortest_path))

                        except nx.exception.NetworkXNoPath:
                            break
                        # if len(shortest_path) < 2:
                        #    break
                    # Sort ascending, then reverse: entry 0 is the path with
                    # the largest score, and that one joins the primary path
                    # (despite the variable name "shortest_path").
                    all_alt_path.sort()
                    all_alt_path.reverse()
                    shortest_path = all_alt_path[0][1]
                    if len(one_path) != 0:
                        one_path.extend(shortest_path[1:])
                    else:
                        one_path.extend(shortest_path)

                    a_ctg_group[(s, t)] = all_alt_path

            # Nothing to write when no simple/compound unitig contributed.
            if len(one_path) == 0:
                continue

            # Consecutive node pairs, i.e. the edges of the primary path.
            one_path_edges = list(zip(one_path[:-1], one_path[1:]))

            if improper_p_ctg:
                sub_seqs = []
            else:
                sub_seqs = list(yield_first_seq(one_path_edges, seqs))
            # One tiling-path row per edge, then the contig record itself.
            for vv, ww in one_path_edges:
                rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                sub_seqs.append(e_seq)
                print("%s %s %s %s %d %d %d %0.2f" % (
                    ctg_id, vv, ww, rid, s, t, aln_score, idt), file=p_ctg_t_out)
            print(">%s %s %s %d %d" % (
                ctg_id, ctg_label, c_type_, total_length, total_score), file=p_ctg_out)
            print("".join(sub_seqs), file=p_ctg_out)

            # a_id numbers the groups that are actually written: it is only
            # incremented after output, so skipped groups do not consume one.
            a_id = 1
            for v, w, in a_ctg_group:
                # get the base sequence used in the primary contig
                atig_output = []

                # Entry 0 is the path that was used in the primary contig.
                score, atig_path = a_ctg_group[(v, w)][0]
                atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
                if not proper_a_ctg:
                    sub_seqs = []
                else:
                    sub_seqs = list(yield_first_seq(atig_path_edges, seqs))
                total_length = 0
                total_score = 0
                for vv, ww in atig_path_edges:
                    rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                    sub_seqs.append(e_seq)
                    total_length += abs(s - t)
                    total_score += aln_score

                base_seq = "".join(sub_seqs)
                # Trailing 0, 1, 1 fill the delta_len, idt and cov slots for
                # the base path.
                atig_output.append(
                    (v, w, atig_path, total_length, total_score, base_seq, atig_path_edges, 0, 1, 1))

                # Remaining entries are the alternative paths.
                for score, atig_path in a_ctg_group[(v, w)][1:]:
                    atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
                    if not proper_a_ctg:
                        sub_seqs = []
                    else:
                        sub_seqs = list(yield_first_seq(atig_path_edges, seqs))
                    total_length = 0
                    total_score = 0
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        sub_seqs.append(e_seq)
                        total_length += abs(s - t)
                        total_score += aln_score

                    seq = "".join(sub_seqs)

                    delta_len = len(seq) - len(base_seq)
                    # Defaults used when no alignment is attempted or found.
                    idt = 0.0
                    cov = 0.0
                    # Align only when both sequences are longer than 2000.
                    if len(base_seq) > 2000 and len(seq) > 2000:
                        try:
                          aln_data = get_aln_data(base_seq, seq)
                          if len(aln_data) != 0:
                            # Both values come from the last row of aln_data;
                            # the column meanings are defined by get_aln_data
                            # (not visible here) -- TODO confirm.
                            idt = 1.0 - 1.0 * \
                                aln_data[-1][-1] / aln_data[-1][-2]
                            cov = 1.0 * \
                                (aln_data[-1][3] - aln_data[-1]
                                 [2]) / aln_data[-1][4]
                        except TooLongError:
                            log('WARNING: Seqs were too long for get_aln_data(), so we set idt/cov low enough to prevent filtering by dedup_a_tigs, at atig_path[:-1] == {}'.format(atig_path[:-1]))
                            idt = -1.0
                            cov = -1.0

                    atig_output.append(
                        (v, w, atig_path, total_length, total_score, seq, atig_path_edges, delta_len, idt, cov))

                # Only the base path exists: no alternative, nothing to write.
                if len(atig_output) == 1:
                    continue

                # sub_id 0 is the base path and goes to the a_ctg_base* files;
                # every other sub_id goes to a_ctg_all.fa / a_ctg_tiling_path.
                sub_id = 0
                for data in atig_output:
                    # a_idt is the alignment identity stored in the tuple;
                    # idt in the loop below is the per-edge identity.
                    v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, delta_len, a_idt, cov = data
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        if sub_id != 0:
                            print("%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (
                                ctg_id, a_id, sub_id, vv, ww, rid, s, t, aln_score, idt), file=a_ctg_t_out)
                        else:
                            print("%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (
                                ctg_id, a_id, sub_id, vv, ww, rid, s, t, aln_score, idt), file=a_ctg_base_t_out)

                    if sub_id != 0:
                        print(">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (
                            ctg_id, a_id, sub_id, v0, w0, total_length, total_score, len(atig_path_edges), delta_len, a_idt, cov), file=a_ctg_out)
                        print(seq, file=a_ctg_out)
                    else:
                        print(">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (
                            ctg_id, a_id, sub_id, v0, w0, total_length, total_score, len(atig_path_edges), delta_len, a_idt, cov), file=a_ctg_base_out)
                        print(seq, file=a_ctg_base_out)

                    sub_id += 1

                a_id += 1

    # NOTE(review): a_ctg_t_out is closed twice below; the second call is
    # redundant.
    a_ctg_out.close()
    a_ctg_base_out.close()
    p_ctg_out.close()
    a_ctg_t_out.close()
    a_ctg_base_t_out.close()
    a_ctg_t_out.close()
    p_ctg_t_out.close()
コード例 #13
0
ファイル: graph_to_contig.py プロジェクト: pythseq/FALCON-1
def run(improper_p_ctg, proper_a_ctg):
    """Write primary and associated contig FASTA files and tiling paths.

    Reads the edge, unitig and contig tables named by ``edge_data_file``,
    ``utg_data_file`` and ``ctg_data_file`` plus the reads in ``read_fasta``
    (all defined outside this function), and writes ``p_ctg.fa``,
    ``a_ctg_all.fa``, ``a_ctg_base.fa``, ``p_ctg_tiling_path``,
    ``a_ctg_tiling_path`` and ``a_ctg_base_tiling_path`` in the current
    working directory.

    improper_p_ctg: when true, the primary contig sequence is built from the
        edge sequences only; otherwise the output of ``yield_first_seq`` is
        placed in front. (Original note: improper==True => neglect the
        initial read. We used to need that for unzip.)
    proper_a_ctg: when true, the output of ``yield_first_seq`` is placed in
        front of each associated contig sequence; otherwise only the edge
        sequences are used.
    """
    # Pass 1 over the edge table: collect the read ids at both ends of every
    # "G"-type edge so that only those reads are kept in memory below.
    reads_in_layout = set()
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            # The bare string below is an example row (a no-op expression).
            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G"""
            v, w, rid, s, t, aln_score, idt, type_ = l
            if type_ != "G":
                continue
            r1 = v.split(":")[0]
            reads_in_layout.add(r1)
            r2 = w.split(":")[0]
            reads_in_layout.add(r2)

    seqs = {}
    # Load the upper-cased sequence of every read that is part of the layout.
    with open_fasta_reader(read_fasta) as f:
        for r in f:
            if r.name not in reads_in_layout:
                continue
            seqs[r.name] = r.sequence.upper()  # name == rid-string

    # Pass 2 over the edge table: build (v, w) -> edge record, including the
    # sequence slice contributed by the edge.
    edge_data = {}
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G"""
            v, w, rid, s, t, aln_score, idt, type_ = l

            if type_ != "G":
                continue
            r1, dir1 = v.split(":")
            reads_in_layout.add(r1)  # redundant, but harmless
            r2, dir2 = w.split(":")
            reads_in_layout.add(r2)  # redundant, but harmless

            s = int(s)
            t = int(t)
            aln_score = int(aln_score)
            idt = float(idt)

            if s < t:
                # Forward slice of the read.
                e_seq = seqs[rid][s:t]
                assert 'E' == dir2
            else:
                # t and s were swapped for 'c' alignments in ovlp_to_graph.generate_string_graph():702
                # They were translated from reverse-dir to forward-dir coordinate system in LA4Falcon.
                # Slice t..s, reverse it and map each base through RCMAP
                # (defined elsewhere; presumably a complement table).
                e_seq = "".join([RCMAP[c] for c in seqs[rid][t:s][::-1]])
                assert 'B' == dir2
            edge_data[(v, w)] = (rid, s, t, aln_score, idt, e_seq)

    # Unitig table: (s, v, t) -> (type, length, score, path_or_edges).
    # "simple"/"contained" rows hold a "~"-separated node path; "compound"
    # rows hold "|"-separated unitig keys, each a "~"-separated tuple.
    utg_data = {}
    with open(utg_data_file) as f:
        for l in f:
            l = l.strip().split()
            s, v, t, type_, length, score, path_or_edges = l
            if type_ not in ["compound", "simple", "contained"]:
                continue
            length = int(length)
            score = int(score)
            if type_ in ("simple", "contained"):
                path_or_edges = path_or_edges.split("~")
            else:
                path_or_edges = [
                    tuple(e.split("~")) for e in path_or_edges.split("|")
                ]
            utg_data[(s, v, t)] = type_, length, score, path_or_edges

    # Output files; they are closed explicitly at the end of the function.
    p_ctg_out = open("p_ctg.fa", "w")
    a_ctg_out = open("a_ctg_all.fa", "w")
    a_ctg_base_out = open("a_ctg_base.fa", "w")
    p_ctg_t_out = open("p_ctg_tiling_path", "w")
    a_ctg_t_out = open("a_ctg_tiling_path", "w")
    a_ctg_base_t_out = open("a_ctg_base_tiling_path", "w")
    # (s0, t0) end pairs of the contigs processed so far.
    layout_ctg = set()

    with open(ctg_data_file) as f:
        for l in f:
            l = l.strip().split()
            ctg_id, c_type_, i_utig, t0, length, score, utgs = l
            # No-op self-assignment.
            ctg_id = ctg_id
            s0 = i_utig.split("~")[0]

            # Skip a contig whose reversed end pair was already processed
            # (reverse_end is defined elsewhere; presumably this drops the
            # reverse-complement twin of an earlier contig).
            if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
                continue
            else:
                layout_ctg.add((s0, t0))

            ctg_label = i_utig + "~" + t0
            length = int(length)
            utgs = utgs.split("|")
            # Node path of the primary contig, grown unitig by unitig.
            one_path = []
            total_score = 0
            total_length = 0

            #a_ctg_data = []
            # (s, t) of each compound unitig -> list of (score, path).
            a_ctg_group = {}

            for utg in utgs:
                s, v, t = utg.split("~")
                type_, length, score, path_or_edges = utg_data[(s, v, t)]
                total_score += score
                total_length += length
                if type_ == "simple":
                    # Drop the first node when appending: it repeats the last
                    # node already on the path.
                    if len(one_path) != 0:
                        one_path.extend(path_or_edges[1:])
                    else:
                        one_path.extend(path_or_edges)
                if type_ == "compound":

                    # Graph of all edges of the member unitigs, weighted by
                    # the edge alignment score.
                    c_graph = nx.DiGraph()

                    all_alt_path = []
                    for ss, vv, tt in path_or_edges:
                        # NOTE: rebinds type_, length and score; the totals
                        # above were already updated with the outer values.
                        type_, length, score, sub_path = utg_data[(ss, vv, tt)]

                        v1 = sub_path[0]
                        for v2 in sub_path[1:]:
                            c_graph.add_edge(v1,
                                             v2,
                                             e_score=edge_data[(v1, v2)][3])
                            v1 = v2

                    # Path from s to t with the lowest summed e_score.
                    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
                    score = nx.shortest_path_length(c_graph, s, t, "e_score")
                    all_alt_path.append((score, shortest_path))

                    # a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
                    # Repeatedly remove the edges of the path just found and
                    # search again, until s and t are no longer connected.
                    while 1:
                        n0 = shortest_path[0]
                        for n1 in shortest_path[1:]:
                            c_graph.remove_edge(n0, n1)
                            n0 = n1
                        try:
                            shortest_path = nx.shortest_path(
                                c_graph, s, t, "e_score")
                            score = nx.shortest_path_length(
                                c_graph, s, t, "e_score")
                            #a_ctg_data.append( (s, t, shortest_path) )
                            all_alt_path.append((score, shortest_path))

                        except nx.exception.NetworkXNoPath:
                            break
                        # if len(shortest_path) < 2:
                        #    break
                    # Sort ascending, then reverse: entry 0 is the path with
                    # the largest score, and that one joins the primary path
                    # (despite the variable name "shortest_path").
                    all_alt_path.sort()
                    all_alt_path.reverse()
                    shortest_path = all_alt_path[0][1]
                    if len(one_path) != 0:
                        one_path.extend(shortest_path[1:])
                    else:
                        one_path.extend(shortest_path)

                    a_ctg_group[(s, t)] = all_alt_path

            # Nothing to write when no simple/compound unitig contributed.
            if len(one_path) == 0:
                continue

            # Consecutive node pairs, i.e. the edges of the primary path.
            one_path_edges = list(zip(one_path[:-1], one_path[1:]))

            if improper_p_ctg:
                sub_seqs = []
            else:
                sub_seqs = list(yield_first_seq(one_path_edges, seqs))
            # One tiling-path row per edge, then the contig record itself.
            for vv, ww in one_path_edges:
                rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                sub_seqs.append(e_seq)
                print("%s %s %s %s %d %d %d %0.2f" %
                      (ctg_id, vv, ww, rid, s, t, aln_score, idt),
                      file=p_ctg_t_out)
            print(">%s %s %s %d %d" %
                  (ctg_id, ctg_label, c_type_, total_length, total_score),
                  file=p_ctg_out)
            print("".join(sub_seqs), file=p_ctg_out)

            # a_id numbers the groups that are actually written: it is only
            # incremented after output, so skipped groups do not consume one.
            a_id = 1
            for v, w, in a_ctg_group:
                # get the base sequence used in the primary contig
                atig_output = []

                # Entry 0 is the path that was used in the primary contig.
                score, atig_path = a_ctg_group[(v, w)][0]
                atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
                if not proper_a_ctg:
                    sub_seqs = []
                else:
                    sub_seqs = list(yield_first_seq(atig_path_edges, seqs))
                total_length = 0
                total_score = 0
                for vv, ww in atig_path_edges:
                    rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                    sub_seqs.append(e_seq)
                    total_length += abs(s - t)
                    total_score += aln_score

                base_seq = "".join(sub_seqs)
                # Trailing 0, 1, 1 fill the delta_len, idt and cov slots for
                # the base path.
                atig_output.append((v, w, atig_path, total_length, total_score,
                                    base_seq, atig_path_edges, 0, 1, 1))

                # Remaining entries are the alternative paths.
                for score, atig_path in a_ctg_group[(v, w)][1:]:
                    atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
                    if not proper_a_ctg:
                        sub_seqs = []
                    else:
                        sub_seqs = list(yield_first_seq(atig_path_edges, seqs))
                    total_length = 0
                    total_score = 0
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        sub_seqs.append(e_seq)
                        total_length += abs(s - t)
                        total_score += aln_score

                    seq = "".join(sub_seqs)

                    delta_len = len(seq) - len(base_seq)
                    # Defaults used when no alignment is attempted or found.
                    idt = 0.0
                    cov = 0.0
                    # Align only when both sequences are longer than 2000.
                    if len(base_seq) > 2000 and len(seq) > 2000:
                        try:
                            aln_data = get_aln_data(base_seq, seq)
                            if len(aln_data) != 0:
                                # Both values come from the last row of
                                # aln_data; the column meanings are defined by
                                # get_aln_data (not visible here) -- TODO
                                # confirm.
                                idt = 1.0 - 1.0 * \
                                    aln_data[-1][-1] / aln_data[-1][-2]
                                cov = 1.0 * \
                                    (aln_data[-1][3] - aln_data[-1]
                                     [2]) / aln_data[-1][4]
                        except TooLongError:
                            log('WARNING: Seqs were too long for get_aln_data(), so we set idt/cov low enough to prevent filtering by dedup_a_tigs, at atig_path[:-1] == {}'
                                .format(atig_path[:-1]))
                            idt = -1.0
                            cov = -1.0

                    atig_output.append(
                        (v, w, atig_path, total_length, total_score, seq,
                         atig_path_edges, delta_len, idt, cov))

                # Only the base path exists: no alternative, nothing to write.
                if len(atig_output) == 1:
                    continue

                # sub_id 0 is the base path and goes to the a_ctg_base* files;
                # every other sub_id goes to a_ctg_all.fa / a_ctg_tiling_path.
                sub_id = 0
                for data in atig_output:
                    # a_idt is the alignment identity stored in the tuple;
                    # idt in the loop below is the per-edge identity.
                    v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, delta_len, a_idt, cov = data
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        if sub_id != 0:
                            print("%s-%03d-%02d %s %s %s %d %d %d %0.2f" %
                                  (ctg_id, a_id, sub_id, vv, ww, rid, s, t,
                                   aln_score, idt),
                                  file=a_ctg_t_out)
                        else:
                            print("%s-%03d-%02d %s %s %s %d %d %d %0.2f" %
                                  (ctg_id, a_id, sub_id, vv, ww, rid, s, t,
                                   aln_score, idt),
                                  file=a_ctg_base_t_out)

                    if sub_id != 0:
                        print(">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" %
                              (ctg_id, a_id, sub_id, v0,
                               w0, total_length, total_score,
                               len(atig_path_edges), delta_len, a_idt, cov),
                              file=a_ctg_out)
                        print(seq, file=a_ctg_out)
                    else:
                        print(">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" %
                              (ctg_id, a_id, sub_id, v0,
                               w0, total_length, total_score,
                               len(atig_path_edges), delta_len, a_idt, cov),
                              file=a_ctg_base_out)
                        print(seq, file=a_ctg_base_out)

                    sub_id += 1

                a_id += 1

    # NOTE(review): a_ctg_t_out is closed twice below; the second call is
    # redundant.
    a_ctg_out.close()
    a_ctg_base_out.close()
    p_ctg_out.close()
    a_ctg_t_out.close()
    a_ctg_base_t_out.close()
    a_ctg_t_out.close()
    p_ctg_t_out.close()
コード例 #14
0
ファイル: fetch_reads.py プロジェクト: pbnjay/FALCON
def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    """Write per-contig reference and read FASTA files into ``out_dir``.

    Inputs are ``2-asm-falcon/p_ctg.fa`` and the tables in
    ``2-asm-falcon/read_maps`` below ``base_dir``.  Outputs, all inside
    ``out_dir``:

    * ``<contig>_ref.fa`` for every selected contig,
    * ``ctg_list`` with the selected contigs that collected at least 5 hits
      and whose id ends in ``F`` or ``R``,
    * ``<contig>_reads.fa`` / ``unassigned_reads.fa`` with the reads found in
      the FASTA files listed in ``fofn``.

    Args:
        base_dir: Root directory of the assembly run.
        fofn: Path of a text file naming one read FASTA file per line.
        ctg_id: Contig id to restrict the run to, or ``'all'``.
        out_dir: Output directory; ``None`` means ``<base_dir>/3-unzip/reads``.
        min_ctg_lenth: Contigs with a shorter sequence are skipped.

    All output is produced with ``file.write`` so the function behaves the
    same under Python 2 and Python 3.
    """
    read_fofn = fofn
    if out_dir is None:
        out_dir = os.path.join(base_dir, '3-unzip/reads')

    ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
    read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')

    rawread_id_file = os.path.join(read_map_dir, 'dump_rawread_ids',
                                   'rawread_ids')
    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')

    # Both tables are indexed by line number, so keep the plain '\n' split
    # (a trailing newline leaves a harmless empty last entry).
    with open(rawread_id_file) as id_file:
        rid_to_oid = id_file.read().split('\n')  # raw read index -> original id
    with open(pread_id_file) as id_file:
        pid_to_fid = id_file.read().split('\n')  # pread index -> fake id

    def pid_to_oid(pid):
        # The second '/'-separated field of the fake id, floor-divided by 10,
        # indexes the raw read table.  '//' keeps this an int on Python 3.
        fid = pid_to_fid[int(pid)]
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[rid]

    all_ctg_ids = set()
    with open_fasta_reader(ctg_fa) as ref_fasta:
        for s in ref_fasta:
            s_id = s.name.split()[0]
            if ctg_id != 'all' and s_id != ctg_id:
                continue
            if len(s.sequence) < min_ctg_lenth:
                continue

            # When a single contig was requested, s_id == ctg_id at this
            # point, so the file name is always derived from s_id.
            with open(os.path.join(out_dir, '%s_ref.fa' % s_id),
                      'w') as ref_out:
                ref_out.write('>%s\n' % (s_id,))
                ref_out.write('%s\n' % (s.sequence,))
            all_ctg_ids.add(s_id)

    read_set = {}      # original read id -> contig id
    ctg_id_hits = {}   # contig id -> number of accepted rows

    map_fn = os.path.join(read_map_dir, 'rawread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split('-')[0]
            # Only rows whose fourth column is 0 are used.
            if int(row[3]) == 0:
                o_id = rid_to_oid[int(row[0])]
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split('-')[0]
            # NOTE(review): read_set is keyed by read ids, yet a contig id is
            # tested here -- possibly o_id was intended.  Behaviour is kept
            # as-is; confirm before changing.
            if hit_ctg not in read_set and int(row[3]) == 0:
                o_id = pid_to_oid(row[0])
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    with open(os.path.join(out_dir, 'ctg_list'), 'w') as f:
        for listed_ctg in sorted(all_ctg_ids):
            if ctg_id_hits.get(listed_ctg, 0) < 5:
                continue
            # ignore small circle contigs, they need different approach
            if not listed_ctg.endswith(('F', 'R')):
                continue
            f.write('%s\n' % (listed_ctg,))

    started = set()  # contigs whose read file was already created in this call

    @contextlib.contextmanager
    def reopened_fasta_out(name):
        # Truncate the file on first use, append afterwards.  The handle is
        # closed even if the caller's block raises.
        mode = 'a' if name in started else 'w'
        read_out = open(os.path.join(out_dir, '%s_reads.fa' % name), mode)
        started.add(name)
        try:
            yield read_out
        finally:
            read_out.close()

    with open(read_fofn, 'r') as f:
        for r_fn in f:
            r_fn = r_fn.strip()
            # will soon handle .dexta too
            with open_fasta_reader(r_fn) as read_fa_file:
                for r in read_fa_file:
                    rid = r.name.split()[0]
                    target = read_set.get(rid, 'unassigned')
                    if target == 'NA' or target not in all_ctg_ids:
                        target = 'unassigned'

                    with reopened_fasta_out(target) as read_out:
                        read_out.write('>' + rid + '\n')
                        read_out.write('%s\n' % (r.sequence,))
コード例 #15
0
def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    """Write per-contig reference and read FASTA files into ``out_dir``.

    Inputs are ``2-asm-falcon/p_ctg.fa`` and the tables in
    ``2-asm-falcon/read_maps`` below ``base_dir``.  Outputs, all inside
    ``out_dir``:

    * ``<contig>_ref.fa`` for every selected contig,
    * ``ctg_list`` with the selected contigs that collected at least 5 hits
      and whose id ends in ``F`` or ``R``,
    * ``<contig>_reads.fa`` / ``unassigned_reads.fa`` with the reads found in
      the FASTA files listed in ``fofn``.

    Args:
        base_dir: Root directory of the assembly run.
        fofn: Path of a text file naming one read FASTA file per line.
        ctg_id: Contig id to restrict the run to, or ``'all'``.
        out_dir: Output directory; ``None`` means ``<base_dir>/3-unzip/reads``.
        min_ctg_lenth: Contigs with a shorter sequence are skipped.

    Raises:
        AssertionError: If either id table has no non-empty line, or if no
            read was collected from ``rawread_to_contigs``.
    """
    read_fofn = fofn
    if out_dir is None:
        out_dir = os.path.join(base_dir, '3-unzip/reads')

    ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
    read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')

    rawread_id_file = os.path.join(
        read_map_dir, 'dump_rawread_ids', 'rawread_ids')
    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')

    # Both tables are indexed by line number, so keep the plain '\n' split.
    with open(rawread_id_file) as id_file:
        # daligner raw read id to the original ids
        rid_to_oid = id_file.read().split('\n')
    with open(pread_id_file) as id_file:
        # daligner pread id to the fake ids
        pid_to_fid = id_file.read().split('\n')

    # split('\n') yields [''] for an empty file, so testing the list itself
    # can never fail; any() checks for at least one non-empty entry.
    assert any(rid_to_oid), 'Empty rid_to_oid. Maybe empty {!r}?'.format(
        rawread_id_file)
    assert any(pid_to_fid), 'Empty pid_to_fid. Maybe empty {!r}?'.format(
        pread_id_file)

    def pid_to_oid(pid):
        # The second '/'-separated field of the fake id, floor-divided by 10,
        # indexes the raw read table.
        fid = pid_to_fid[int(pid)]
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[rid]

    all_ctg_ids = set()
    with open_fasta_reader(ctg_fa) as ref_fasta:
        for s in ref_fasta:
            s_id = s.name.split()[0]
            if ctg_id != 'all' and s_id != ctg_id:
                continue
            if len(s.sequence) < min_ctg_lenth:
                continue

            # When a single contig was requested, s_id == ctg_id at this
            # point, so the file name is always derived from s_id.
            with open(os.path.join(out_dir, '%s_ref.fa' % s_id),
                      'w') as ref_out:
                print('>%s' % s_id, file=ref_out)
                print(s.sequence, file=ref_out)
            all_ctg_ids.add(s_id)

    read_set = {}      # original read id -> contig id
    ctg_id_hits = {}   # contig id -> number of accepted rows

    map_fn = os.path.join(read_map_dir, 'rawread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split('-')[0]
            # Only rows whose fourth column is 0 are used.
            if int(row[3]) == 0:
                o_id = rid_to_oid[int(row[0])]
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1
    assert read_set, 'Empty read_set. Maybe empty {!r}?'.format(map_fn)

    map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split('-')[0]
            # NOTE(review): read_set is keyed by read ids, yet a contig id is
            # tested here -- possibly o_id was intended.  Behaviour is kept
            # as-is; confirm before changing.
            if hit_ctg not in read_set and int(row[3]) == 0:
                o_id = pid_to_oid(row[0])
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    with open(os.path.join(out_dir, 'ctg_list'), 'w') as f:
        for listed_ctg in sorted(all_ctg_ids):
            if ctg_id_hits.get(listed_ctg, 0) < 5:
                continue
            # ignore small circle contigs, they need different approach
            if not listed_ctg.endswith(('F', 'R')):
                continue
            print(listed_ctg, file=f)

    started = set()  # contigs whose read file was already created in this call

    @contextlib.contextmanager
    def reopened_fasta_out(name):
        # A convenient closure, with a contextmanager: truncate the file on
        # first use, append afterwards, and always close the handle.
        mode = 'a' if name in started else 'w'
        read_out = open(os.path.join(out_dir, '%s_reads.fa' % name), mode)
        started.add(name)
        try:
            yield read_out
        finally:
            read_out.close()

    with open(read_fofn, 'r') as f:
        for r_fn in f:
            r_fn = r_fn.strip()
            # will soon handle .dexta too
            with open_fasta_reader(r_fn) as read_fa_file:
                for r in read_fa_file:
                    rid = r.name.split()[0]
                    target = read_set.get(rid, 'unassigned')
                    if target == 'NA' or target not in all_ctg_ids:
                        target = 'unassigned'

                    with reopened_fasta_out(target) as read_out:
                        print('>' + rid, file=read_out)
                        print(r.sequence, file=read_out)
コード例 #16
0
ファイル: dedup_a_tigs.py プロジェクト: pb-cdunn/FALCON
def main(argv=sys.argv):
    """Command-line entry point.

    Parses ``argv`` with ``parse_args``, opens the FASTA file named by the
    ``a_ctg_all`` option and passes the reader to ``run`` together with
    ``sys.stdout`` and the five remaining options, in the order shown below.
    """
    options = parse_args(argv)

    with open_fasta_reader(options.a_ctg_all) as fp_in:
        run(
            sys.stdout,
            fp_in,
            options.max_idt,
            options.max_aln_cov,
            options.min_len_diff,
            options.min_seq_len,
            options.ploidy,
        )
コード例 #17
0
def main(argv=sys.argv):
    """Write primary/associated contig sequences and their tiling paths.

    Input paths come from the module-level names ``edge_data_file``,
    ``utg_data_file``, ``ctg_data_file`` and ``read_fasta``; ``argv`` is
    accepted but not consulted in this function.

    Six files are created in the current working directory:
    ``p_ctg.fa``, ``a_ctg_all.fa``, ``a_ctg_base.fa``, ``p_ctg_tiling_path``,
    ``a_ctg_tiling_path`` and ``a_ctg_base_tiling_path``.

    The function runs under both Python 2 and Python 3: lines are written
    with ``file.write`` and every ``zip`` result that is reused is stored as
    a list.
    """

    def emit(handle, text):
        # Portable equivalent of the Python 2 ``print >> handle, text``.
        handle.write("%s\n" % (text,))

    def summarize_path(path):
        # Return (edges, sequence, summed |s - t|, summed alignment score)
        # for a node path.  ``edges`` must be a real list: callers iterate it
        # again later and take its len().
        edges = list(zip(path[:-1], path[1:]))
        pieces = []
        span = 0
        score_sum = 0
        for edge in edges:
            _rid, beg, end, e_score, _idt, e_seq = edge_data[edge]
            pieces.append(e_seq)
            span += abs(beg - end)
            score_sum += e_score
        return edges, "".join(pieces), span, score_sum

    # Pass 1 over the edge table: collect the reads that take part in "G"
    # edges so that only their sequences are kept in memory.
    # Example row (the trailing type column is not shown):
    #   001039799:E 000333411:E 000333411 17524 20167 17524 99.62
    reads_in_layout = set()
    with open(edge_data_file) as f:
        for line in f:
            v, w, rid, s, t, aln_score, idt, type_ = line.strip().split()
            if type_ != "G":
                continue
            reads_in_layout.add(v.split(":")[0])
            reads_in_layout.add(w.split(":")[0])

    seqs = {}
    # load all p-read name into memory
    with open_fasta_reader(read_fasta) as f:
        for r in f:
            if r.name not in reads_in_layout:
                continue
            seqs[r.name] = r.sequence.upper()

    # Pass 2 over the edge table: attach the sequence slice to every "G" edge.
    edge_data = {}
    with open(edge_data_file) as f:
        for line in f:
            v, w, rid, s, t, aln_score, idt, type_ = line.strip().split()
            if type_ != "G":
                continue

            s = int(s)
            t = int(t)
            aln_score = int(aln_score)
            idt = float(idt)

            if s < t:
                e_seq = seqs[rid][s:t]
            else:
                # t and s were swapped for 'c' alignments in ovlp_to_graph.generate_string_graph():702
                # They were translated from reverse-dir to forward-dir coordinate system in LA4Falcon.
                e_seq = "".join([RCMAP[c] for c in seqs[rid][t:s][::-1]])
            edge_data[(v, w)] = (rid, s, t, aln_score, idt, e_seq)

    utg_data = {}
    with open(utg_data_file) as f:
        for line in f:
            s, v, t, type_, length, score, path_or_edges = line.strip().split()
            if type_ not in ["compound", "simple", "contained"]:
                continue
            length = int(length)
            score = int(score)
            if type_ in ("simple", "contained"):
                # a node path: n0~n1~...~nk
                path_or_edges = path_or_edges.split("~")
            else:
                # a list of (s, v, t) keys of other unitigs
                path_or_edges = [
                    tuple(e.split("~")) for e in path_or_edges.split("|")
                ]
            utg_data[(s, v, t)] = type_, length, score, path_or_edges

    layout_ctg = set()

    # One ``with`` owns every handle, so all files are closed exactly once
    # even when processing fails part-way through.
    with open("p_ctg.fa", "w") as p_ctg_out, \
            open("a_ctg_all.fa", "w") as a_ctg_out, \
            open("a_ctg_base.fa", "w") as a_ctg_base_out, \
            open("p_ctg_tiling_path", "w") as p_ctg_t_out, \
            open("a_ctg_tiling_path", "w") as a_ctg_t_out, \
            open("a_ctg_base_tiling_path", "w") as a_ctg_base_t_out, \
            open(ctg_data_file) as f:
        for line in f:
            ctg_id, c_type_, i_utig, t0, _length, _score, utgs = \
                line.strip().split()
            s0 = i_utig.split("~")[0]

            # Skip a contig whose reversed end pair was already laid out.
            if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
                continue
            layout_ctg.add((s0, t0))

            ctg_label = i_utig + "~" + t0
            one_path = []
            total_score = 0
            total_length = 0
            a_ctg_group = {}

            for utg in utgs.split("|"):
                s, v, t = utg.split("~")
                type_, length, score, path_or_edges = utg_data[(s, v, t)]
                total_score += score
                total_length += length
                if type_ == "simple":
                    # Consecutive unitigs share their junction node.
                    if len(one_path) != 0:
                        one_path.extend(path_or_edges[1:])
                    else:
                        one_path.extend(path_or_edges)
                elif type_ == "compound":
                    # Rebuild the sub-graph spanned by the member unitigs.
                    c_graph = nx.DiGraph()
                    for ss, vv, tt in path_or_edges:
                        sub_path = utg_data[(ss, vv, tt)][3]
                        for v1, v2 in zip(sub_path[:-1], sub_path[1:]):
                            c_graph.add_edge(v1,
                                             v2,
                                             e_score=edge_data[(v1, v2)][3])

                    # Repeatedly take the e_score-weighted shortest s->t path
                    # and delete its edges until s and t are disconnected.
                    all_alt_path = []
                    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
                    score = nx.shortest_path_length(c_graph, s, t, "e_score")
                    all_alt_path.append((score, shortest_path))
                    while 1:
                        n0 = shortest_path[0]
                        for n1 in shortest_path[1:]:
                            c_graph.remove_edge(n0, n1)
                            n0 = n1
                        try:
                            shortest_path = nx.shortest_path(
                                c_graph, s, t, "e_score")
                            score = nx.shortest_path_length(
                                c_graph, s, t, "e_score")
                            all_alt_path.append((score, shortest_path))
                        except nx.exception.NetworkXNoPath:
                            break

                    # Highest summed score first; that path joins the
                    # primary contig.
                    all_alt_path.sort()
                    all_alt_path.reverse()
                    shortest_path = all_alt_path[0][1]
                    if len(one_path) != 0:
                        one_path.extend(shortest_path[1:])
                    else:
                        one_path.extend(shortest_path)

                    a_ctg_group[(s, t)] = all_alt_path

            if len(one_path) == 0:
                continue

            # ---- primary contig -------------------------------------------
            sub_seqs = []
            for vv, ww in zip(one_path[:-1], one_path[1:]):
                rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                sub_seqs.append(e_seq)
                emit(p_ctg_t_out, "%s %s %s %s %d %d %d %0.2f" % (
                    ctg_id, vv, ww, rid, s, t, aln_score, idt))
            emit(p_ctg_out, ">%s %s %s %d %d" % (
                ctg_id, ctg_label, c_type_, total_length, total_score))
            emit(p_ctg_out, "".join(sub_seqs))

            # ---- associated contigs ---------------------------------------
            a_id = 1
            for (v, w), alt_paths in a_ctg_group.items():
                # get the base sequence used in the primary contig
                base_path = alt_paths[0][1]
                base_edges, base_seq, base_len, base_score = \
                    summarize_path(base_path)
                atig_output = [(v, w, base_path, base_len, base_score,
                                base_seq, base_edges, 0, 1, 1)]

                for _alt_score, atig_path in alt_paths[1:]:
                    atig_edges, seq, alt_len, alt_total = \
                        summarize_path(atig_path)

                    delta_len = len(seq) - len(base_seq)
                    idt = 0.0
                    cov = 0.0
                    # Alignment is attempted only when both sequences are
                    # longer than 2000 bases.
                    if len(base_seq) > 2000 and len(seq) > 2000:
                        aln_data, _x, _y = get_aln_data(base_seq, seq)
                        if len(aln_data) != 0:
                            last = aln_data[-1]
                            idt = 1.0 - 1.0 * last[-1] / last[-2]
                            cov = 1.0 * (last[3] - last[2]) / last[4]

                    atig_output.append(
                        (v, w, atig_path, alt_len, alt_total, seq,
                         atig_edges, delta_len, idt, cov))

                # Only the base path exists: nothing to report, and a_id is
                # deliberately not advanced.
                if len(atig_output) == 1:
                    continue

                for sub_id, record in enumerate(atig_output):
                    (v0, w0, _tig_path, rec_length, rec_score, seq,
                     atig_path_edges, delta_len, a_idt, cov) = record

                    # Entry 0 is the base path and goes to the *_base files.
                    if sub_id == 0:
                        tiling_out, fasta_out = a_ctg_base_t_out, a_ctg_base_out
                    else:
                        tiling_out, fasta_out = a_ctg_t_out, a_ctg_out

                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        emit(tiling_out,
                             "%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (
                                 ctg_id, a_id, sub_id, vv, ww, rid, s, t,
                                 aln_score, idt))

                    emit(fasta_out,
                         ">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (
                             ctg_id, a_id, sub_id,
                             v0, w0, rec_length, rec_score,
                             len(atig_path_edges), delta_len, a_idt, cov))
                    emit(fasta_out, seq)

                a_id += 1
コード例 #18
0
ファイル: dedup_a_tigs.py プロジェクト: fangzhiyu/falcon3
def main(argv=sys.argv):
    """Command-line entry point.

    Parses ``argv`` with ``parse_args``, opens the FASTA file given by the
    ``a_ctg_all`` option and calls ``run`` with ``sys.stdout``, the opened
    reader and the five remaining options as positional arguments.
    """
    opts = parse_args(argv)

    with open_fasta_reader(opts.a_ctg_all) as fp_in:
        # Positional order expected by run(): max_idt, max_aln_cov,
        # min_len_diff, min_seq_len, ploidy.
        limits = (
            opts.max_idt,
            opts.max_aln_cov,
            opts.min_len_diff,
            opts.min_seq_len,
            opts.ploidy,
        )
        run(sys.stdout, fp_in, *limits)