Esempio n. 1
0
    def generate_read_to_ctg_map(self):
        """Write the table that maps p-reads to the contigs containing them.

        Inputs are the files named by ``self.rawread_id_file``,
        ``self.pread_id_file``, ``self.sg_edges_list``, ``self.utg_data`` and
        ``self.ctg_paths`` (each resolved through ``fn``).  For every contig
        whose id does not end in "R", each node of the contig's string graph
        is resolved to a ``(pid, rid, oid)`` triple.  One line
        ``"%09d %09d %s %s" % (pid, rid, oid, ctg)`` is written to
        ``self.read_to_contig_map`` per (read, contig) pair.
        """
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        # Read both id tables inside ``with`` blocks so the handles are
        # closed deterministically instead of being left to the GC.
        with open(pread_id_file) as id_stream:
            pread_did_to_rid = id_stream.read().split("\n")
        with open(rawread_id_file) as id_stream:
            rid_to_oid = id_stream.read().split("\n")

        asm_G = AsmGraph(fn(self.sg_edges_list), fn(self.utg_data), fn(self.ctg_paths))

        pread_to_contigs = {}

        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    # Node names look like "<pid>:<end>"; only the id is used.
                    pid = int(n.split(":")[0])

                    # The raw-read index is the second "/"-separated field of
                    # the p-read name, floor-divided by 10.
                    # NOTE(review): presumably the last digit is a sub-read
                    # counter -- confirm against the p-read naming scheme.
                    rid_field = pread_did_to_rid[pid].split("/")[1]
                    rid = int(rid_field) // 10
                    oid = rid_to_oid[rid]
                    pread_to_contigs.setdefault((pid, rid, oid), set()).add(ctg)

            for (pid, rid, oid), ctgs in pread_to_contigs.items():
                for ctg in ctgs:
                    f.write("%09d %09d %s %s\n" % (pid, rid, oid, ctg))
Esempio n. 2
0
def generate_read_to_ctg_map(self):
    """Write the table that maps p-reads to the contigs containing them.

    ``self`` supplies the file names (``rawread_id_file``, ``pread_id_file``,
    ``sg_edges_list``, ``utg_data``, ``ctg_paths``, ``read_to_contig_map``),
    each resolved through ``fn``.  Contigs whose id ends in 'R' are skipped.
    For every string-graph node of the remaining contigs a line
    ``'%09d %09d %s %s' % (pid, rid, oid, ctg)`` is written to the output
    file, once per distinct (read, contig) pair.
    """
    rawread_id_file = fn(self.rawread_id_file)
    pread_id_file = fn(self.pread_id_file)
    read_to_contig_map = fn(self.read_to_contig_map)

    # ``with`` closes the id tables right after reading; the previous
    # ``open(...).read()`` form left both handles open.
    with open(pread_id_file) as id_stream:
        pread_did_to_rid = id_stream.read().split('\n')
    with open(rawread_id_file) as id_stream:
        rid_to_oid = id_stream.read().split('\n')

    asm_G = AsmGraph(fn(self.sg_edges_list), fn(self.utg_data),
                     fn(self.ctg_paths))

    pread_to_contigs = {}

    with open(read_to_contig_map, 'w') as f:
        for ctg in asm_G.ctg_data:
            if ctg[-1] == 'R':
                continue
            ctg_g = asm_G.get_sg_for_ctg(ctg)
            for n in ctg_g.nodes():
                # Only the part of the node name before ':' is the p-read id.
                pid = int(n.split(':')[0])

                # Second '/'-separated field of the p-read name, floor-divided
                # by 10, indexes the raw-read table.
                # NOTE(review): the meaning of the dropped digit is not
                # visible here -- confirm against the p-read naming scheme.
                rid_field = pread_did_to_rid[pid].split('/')[1]
                rid = int(rid_field) // 10
                oid = rid_to_oid[rid]
                pread_to_contigs.setdefault((pid, rid, oid), set()).add(ctg)

        for (pid, rid, oid), ctgs in pread_to_contigs.items():
            for ctg in ctgs:
                f.write('%09d %09d %s %s\n' % (pid, rid, oid, ctg))
Esempio n. 3
0
def test_add_nx_string_graph():
    """Smoke-test ``GFAGraph.add_nx_string_graph`` on the 'gfa-1' test data.

    The test makes no assertions; it passes when nothing raises.
    """
    def _data(name):
        # Every fixture lives in the 'gfa-1' folder of the test data dir.
        return os.path.join(helpers.get_test_data_dir(), 'gfa-1', name)

    # Load the assembly graph.
    sg_edges_list = _data('sg_edges_list')
    utg_data = _data('utg_data')
    ctg_paths = _data('ctg_paths')
    asm_graph = AsmGraph(sg_edges_list, utg_data, ctg_paths)

    # The following block is taken from Unzip, graphs_to_h_tigs.py.
    nx_sg = nx.DiGraph()
    arid_to_phase = {}  # never filled here, so every read gets the default
    default_phase = (-1, 0)
    for ctg_id in asm_graph.ctg_data.keys():
        ctg_G = asm_graph.get_sg_for_ctg(ctg_id)
        ctg_nodes = set(ctg_G.nodes())  # kept from the Unzip snippet; unused
        for v, w in ctg_G.edges():
            # Only edges whose last field is "G" are transferred.
            if asm_graph.sg_edges[(v, w)][-1] != "G":
                continue

            # The first nine characters of a node name are the read id.
            vphase = arid_to_phase.get(v[:9], default_phase)
            wphase = arid_to_phase.get(w[:9], default_phase)
            same_block = vphase[0] == wphase[0]
            cross_phase = "Y" if same_block and vphase[1] != wphase[1] else "N"

            v_tag = "%d_%d" % vphase
            w_tag = "%d_%d" % wphase

            nx_sg.add_node(v, label=v_tag, phase=v_tag, src="P")
            nx_sg.add_node(w, label=w_tag, phase=w_tag, src="P")
            nx_sg.add_edge(v, w, src="OP", cross_phase=cross_phase)

            # The contig graph lacks the dual edges, so add the complementary
            # edge (reversed direction, reversed ends) explicitly.
            rv = reverse_end(v)
            rw = reverse_end(w)
            nx_sg.add_node(rv, label=v_tag, phase=v_tag, src="P")
            nx_sg.add_node(rw, label=w_tag, phase=w_tag, src="P")
            nx_sg.add_edge(rw, rv, src="OP", cross_phase=cross_phase)

    # Add the string graph to the GFA.
    # NOTE(review): the graph built above is replaced by the one read from
    # sg.gexf before it is used -- confirm that this is intended.
    gfa_graph = mod.GFAGraph()
    gexf_file = _data('sg.gexf')
    nx_sg = nx.read_gexf(gexf_file)
    gfa_graph.add_nx_string_graph(nx_sg)
Esempio n. 4
0
def test_add_asm_graph():
    """Check the edges ``GFAGraph.add_asm_graph`` produces for 'gfa-1'.

    After adding the assembly graph, the GFA graph must have no paths and
    exactly the six edges listed below.
    """
    def _data(name):
        return os.path.join(helpers.get_test_data_dir(), 'gfa-1', name)

    # Load the assembly graph.
    asm_graph = AsmGraph(_data('sg_edges_list'), _data('utg_data'),
                         _data('ctg_paths'))

    # Add the graph to GFA.
    gfa_graph = mod.GFAGraph()
    gfa_graph.add_asm_graph(asm_graph)

    assert len(gfa_graph.paths.keys()) == 0

    # (v, w, field 3, field 4, field 5) of each expected edge record; the
    # remaining fields are identical for all of them.
    rows = [
        ('000000016:B', '000000027:B', 1540, 99.94, 449),
        ('000000005:B', '000000016:B', 1487, 99.93, 502),
        ('000000016:B', '000000025:B', 1540, 99.94, 449),
        ('000000007:B', '000000005:B', 1980, 99.95, 9),
        ('000000018:B', '000000004:B', 1963, 99.95, 26),
        ('000000025:B', '000000018:B', 1978, 99.95, 11),
    ]
    expected = {
        (v, w): [v, w, '*', f3, f4, f5, 0, None, None, None, None]
        for v, w, f3, f4, f5 in rows
    }

    assert len(gfa_graph.edges.keys()) == len(expected.keys())

    for key, edge in gfa_graph.edges.iteritems():
        assert key in expected
        assert expected[key] == edge
Esempio n. 5
0
def main(*argv):
    """Print the running coordinate of every node on the contig tiling paths.

    Reads ``p_ctg_tiling_path`` and then ``a_ctg_tiling_path`` from the
    current directory.  For each row (``ctg_id v w edge_rid b e ...``) the
    edge length ``abs(b - e)`` is added to the contig's running coordinate
    and a line ``"<ctg_id> <node> <coordinate> <contigs of node>"`` is
    printed for ``w``.  For the first row of a contig, a line for ``v`` at
    coordinate 0 is printed first.  ``argv`` is accepted but not used.
    """
    G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")

    p_ctg_coor_map = {}
    for path_fn in ("p_ctg_tiling_path", "a_ctg_tiling_path"):
        # ``with`` guarantees the file is closed even if a row is malformed.
        with open(path_fn) as f:
            for row in f:
                ctg_id, v, w, edge_rid, b, e = row.strip().split()[:6]

                # The running coordinate is reset whenever a contig id is seen
                # for the first time, so the tiling path files must be grouped
                # by contig with the edges in path order.
                first_edge = ctg_id not in p_ctg_coor_map
                if first_edge:
                    coor = 0
                    p_ctg_coor_map[ctg_id] = {v: 0}

                coor += abs(int(b) - int(e))
                p_ctg_coor_map[ctg_id][w] = coor

                w_ctgs = " ".join(G_asm.node_to_ctg[w])
                if first_edge:
                    v_ctgs = " ".join(G_asm.node_to_ctg[v])
                    print("%s %s %s %s" % (ctg_id, v, 0, v_ctgs))
                print("%s %s %s %s" % (ctg_id, w, coor, w_ctgs))
Esempio n. 6
0
def gfa_from_assembly(fp_out, p_ctg_tiling_path, a_ctg_tiling_path,
                      preads_fasta, p_ctg_fasta, a_ctg_fasta,
                      sg_edges_list, utg_data, ctg_paths,
                      add_string_graph, write_reads, write_contigs,
                      min_p_len, min_a_len):
    """
    Produce the GFA-1 formatted output of the FALCON assembly.

    The tiling paths are always added to the graph.  When
    ``add_string_graph`` is true the assembly (string) graph built from
    ``sg_edges_list``, ``utg_data`` and ``ctg_paths`` is added as well.
    String graph can show the neighborhood of contig breaks, whereas the
    tiling path output is more sparse.
    Output is written to ``fp_out``.
    """
    graph = GFAGraph()

    add_tiling_paths_to_gfa(
        p_ctg_fasta, a_ctg_fasta,
        p_ctg_tiling_path, a_ctg_tiling_path,
        min_p_len, min_a_len,
        graph,
    )

    if add_string_graph:
        # Load the string graph and merge it into the GFA graph.
        graph.add_asm_graph(AsmGraph(sg_edges_list, utg_data, ctg_paths))

    contig_fastas = [p_ctg_fasta, a_ctg_fasta]
    graph.write_gfa_v1(fp_out, preads_fasta, contig_fastas,
                       write_reads, write_contigs)
Esempio n. 7
0
def add_string_graph_to_gfa(gfa_graph, sg_edges_list, utg_data, ctg_paths, preads_dict, preads_overlap_dict, sg_edges_dict):
    """Add the nodes and the 'G' edges of the assembly string graph to ``gfa_graph``.

    Both end nodes of every string-graph edge are added first, whatever the
    edge's flag.  Afterwards only edges whose last field is 'G' are added.
    """
    sg_edges = AsmGraph(sg_edges_list, utg_data, ctg_paths).sg_edges

    # Pass 1: register both end nodes of every edge.
    for edge_key in sg_edges:
        for node in edge_key:
            add_node(gfa_graph, node, preads_dict)

    # Pass 2: add the edges flagged 'G'.
    for v, w in sg_edges:
        record = sg_edges[(v, w)]
        if record[-1] == 'G':
            add_edge(gfa_graph, v, w, record, preads_overlap_dict, sg_edges_dict)
Esempio n. 8
0
def wrap_write_gfa_v1_test(use_sg, use_nx, use_tp, write_reads, write_contigs,
                           min_p_len, min_a_len, expected_path):
    """Build a GFA graph from the 'gfa-1' fixtures and compare its GFA-1 dump.

    ``use_sg``, ``use_tp`` and ``use_nx`` select which sources are added to
    the graph (assembly graph, tiling paths, GEXF string graph).  The text
    produced by ``write_gfa_v1`` is compared with the file at
    ``expected_path`` via ``helpers.assert_filecmp``.
    """
    def fixture(name):
        # All inputs live in the 'gfa-1' folder of the test data directory.
        return os.path.join(helpers.get_test_data_dir(), 'gfa-1', name)

    # Create a GFA graph.
    gfa_graph = mod.GFAGraph()

    # Init paths to other input files.
    preads_file = fixture('preads4falcon.fasta')
    p_ctg_fasta = fixture('p_ctg.fa')
    a_ctg_fasta = fixture('a_ctg.fa')

    if use_sg:
        # Load the assembly graph and add it to the GFA.
        sg_edges_list = fixture('sg_edges_list')
        utg_data = fixture('utg_data')
        ctg_paths = fixture('ctg_paths')
        gfa_graph.add_asm_graph(AsmGraph(sg_edges_list, utg_data, ctg_paths))

    if use_tp:
        p_ctg_tiling_path_file = fixture('p_ctg_tiling_path')
        a_ctg_tiling_path_file = fixture('a_ctg_tiling_path')
        gen_gfa_v1.add_tiling_paths_to_gfa(
            p_ctg_fasta, a_ctg_fasta,
            p_ctg_tiling_path_file, a_ctg_tiling_path_file,
            min_p_len, min_a_len, gfa_graph)

    if use_nx:
        gfa_graph.add_nx_string_graph(nx.read_gexf(fixture('sg.gexf')))

    # Run the unit under test.
    fp_out = StringIO()
    contig_fastas = [p_ctg_fasta, a_ctg_fasta]
    gfa_graph.write_gfa_v1(fp_out, preads_file, contig_fastas,
                           write_reads, write_contigs)

    # Compare results.
    helpers.assert_filecmp(fp_out.getvalue(), expected_path)
Esempio n. 9
0
def gfa_from_assembly(fp_out, p_ctg_tiling_path, a_ctg_tiling_path,
                      preads_fasta, p_ctg_fasta, a_ctg_fasta, sg_edges_list,
                      utg_data, ctg_paths, tiling, write_reads, write_contigs,
                      min_p_len, min_a_len):
    """
    Produce the GFA-1 formatted output of the FALCON assembly.

    Primary and associate tiling paths are loaded, filtered with
    ``min_p_len`` / ``min_a_len`` and added to the graph.  Unless ``tiling``
    is true, the assembly (string) graph is added as well.  String graph can
    show the neighborhood of contig breaks, whereas the tiling path output
    is more sparse.
    Output is written to ``fp_out``.
    """
    gfa_graph = GFAGraph()

    # Primary paths first, then associate paths: load, filter, add.
    tiling_sources = (
        (p_ctg_tiling_path, 'P', min_p_len),
        (a_ctg_tiling_path, 'A', min_a_len),
    )
    for tp_file, tp_type, min_len in tiling_sources:
        paths, _ = load_tiling_paths(tp_file, tp_type)
        _, ctg_len = calc_tiling_paths_len(paths)
        kept_paths = filter_tiling_paths_by_len(paths, ctg_len, min_len)
        for ctg_id, path in kept_paths.iteritems():
            gfa_graph.add_tiling_path(path, ctg_id)

    if not tiling:
        # Load the string graph.
        gfa_graph.add_asm_graph(AsmGraph(sg_edges_list, utg_data, ctg_paths))

    contig_fastas = [p_ctg_fasta, a_ctg_fasta]
    gfa_graph.write_gfa_v1(fp_out, preads_fasta, contig_fastas,
                           write_reads, write_contigs)
Esempio n. 10
0
def main(argv=None):
  """Write the sequences of all unitigs to ``utgs.fa``.

  The assembly graph is loaded from ``sg_edges_list``, ``utg_data`` and
  ``ctg_paths`` and the read sequences from ``preads4falcon.fasta``, all in
  the current directory.  A "simple" unitig is written as one FASTA record.
  For a "compound" unitig the s->t paths through its sub-unitigs are
  enumerated; the path with the highest summed ``e_score`` is written, plus
  every other path that does not align to it as a near-duplicate.

  ``argv`` is accepted but not used.
  """
  G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
  G_asm.load_sg_seq("preads4falcon.fasta")

  # NOTE(review): this handle is never closed explicitly.
  utg_out = open("utgs.fa","w")


  for utg in G_asm.utg_data:
    # utg_data keys are (s, t, v); the FASTA header prints them as s~v~t.
    s,t,v  = utg
    type_, length, score, path_or_edges = G_asm.utg_data[ (s,t,v) ]
    if type_ == "simple":
        # For a simple unitig the last field is a "~"-joined node path.
        path_or_edges = path_or_edges.split("~")
        seq = G_asm.get_seq_from_path( path_or_edges )
        print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score )
        print >> utg_out, seq

    if type_ == "compound":

        # Graph of all string-graph edges of the sub-unitigs, weighted by
        # field 1 of the sg_edges record.
        c_graph = nx.DiGraph()

        all_alt_path = []
        # For a compound unitig the last field is a "|"-joined list of
        # "s~v~t" sub-unitig ids.
        path_or_edges = [ c.split("~") for c in path_or_edges.split("|")]
        for ss, vv, tt in path_or_edges:
            # Note: this rebinds type_, length and score.
            type_, length, score, sub_path = G_asm.utg_data[ (ss,tt,vv) ]

            sub_path = sub_path.split("~")
            v1 = sub_path[0]
            for v2 in sub_path[1:]:
                c_graph.add_edge( v1, v2, e_score = G_asm.sg_edges[ (v1, v2) ][1]  )
                v1 = v2

        shortest_path = nx.shortest_path( c_graph, s, t, "e_score" )
        score = nx.shortest_path_length( c_graph, s, t, "e_score" )
        all_alt_path.append( (score, shortest_path) )


        #a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
        # Repeatedly remove the edges of the path just found and search again,
        # until s and t are disconnected.
        while 1:
            if s == t:
                break
            n0 = shortest_path[0]
            for n1 in shortest_path[1:]:
                c_graph.remove_edge(n0, n1)
                n0 = n1
            try:
                shortest_path = nx.shortest_path( c_graph, s, t, "e_score" )
                score = nx.shortest_path_length( c_graph, s, t, "e_score" )
                #a_ctg_data.append( (s, t, shortest_path) )
                all_alt_path.append( (score, shortest_path) )

            except nx.exception.NetworkXNoPath:
                break
            #if len(shortest_path) < 2:
            #    break

        # Sort descending by (score, path): the first entry becomes the base.
        all_alt_path.sort()
        all_alt_path.reverse()
        shortest_path = all_alt_path[0][1]


        score, atig_path = all_alt_path[0]

        atig_output = []

        # Concatenate the edge sequences of the base path and sum its edge
        # lengths and alignment scores.
        atig_path_edges = zip(atig_path[:-1], atig_path[1:])
        sub_seqs = []
        total_length = 0
        total_score = 0
        for vv, ww in atig_path_edges:
            r, aln_score, idt, typs_  = G_asm.sg_edges[ (vv, ww) ]
            e_seq  = G_asm.sg_edge_seqs[ (vv, ww) ]
            rid, ss, tt = r
            sub_seqs.append( e_seq )
            total_length += abs(ss-tt)
            total_score += aln_score

        base_seq = "".join(sub_seqs)
        # The base path is recorded with identity 1 and coverage 1.
        atig_output.append( (s, t, atig_path, total_length, total_score, base_seq, atig_path_edges, 1, 1) )


        # NOTE(review): 'duplicated' is assigned below but never read.
        duplicated = True
        for score, atig_path in all_alt_path[1:]:
            atig_path_edges = zip(atig_path[:-1], atig_path[1:])
            sub_seqs = []
            total_length = 0
            total_score = 0
            for vv, ww in atig_path_edges:
                r, aln_score, idt, type_ = G_asm.sg_edges[ (vv, ww) ]
                e_seq  = G_asm.sg_edge_seqs[ (vv, ww) ]
                rid, ss, tt = r
                sub_seqs.append( e_seq )
                total_length += abs(ss-tt)
                total_score += aln_score

            seq = "".join(sub_seqs)

            # Align the alternative against the base sequence; keep it only
            # if it is different enough, or if no alignment was reported.
            aln_data, x, y = get_aln_data(base_seq, seq)
            if len( aln_data ) != 0:
                # Computed from the last alignment record only.
                idt =  1.0-1.0*aln_data[-1][-1] / aln_data[-1][-2]
                cov = 1.0*(aln_data[-1][3]-aln_data[-1][2])/aln_data[-1][4]
                if idt < 0.96 or cov < 0.98:
                    duplicated = False
                    atig_output.append( (s, t, atig_path, total_length, total_score, seq, atig_path_edges, idt, cov) )
            else:
                duplicated = False
                atig_output.append( (s, t, atig_path, total_length, total_score, seq, atig_path_edges, 0, 0) )

        #if len(atig_output) == 1:
        #    continue

        # One FASTA record per kept path; the middle header field is "NA" and
        # sub_id numbers the records of this unitig from 0.
        sub_id = 0
        for data in atig_output:
            v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, a_idt, cov = data
            print >> utg_out, ">%s~%s~%s-%d %d %d" % (v0, "NA", w0, sub_id,  total_length, total_score )
            print >> utg_out, seq
            sub_id += 1
Esempio n. 11
0
        if alignment[0].aln_str_size > 100:
            aln_data.append(
                (q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist)
            )
            aln_str1 = alignment[0].q_aln_str
            aln_str0 = alignment[0].t_aln_str

        DWA.free_alignment(alignment)

    kup.free_kmer_lookup(lk_ptr)
    kup.free_seq_array(sa_ptr)
    kup.free_seq_addr_array(sda_ptr)
    return aln_data, x, y


G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
G_asm.load_sg_seq("preads4falcon.fasta")

utg_out = open("utgs.fa", "w")


for utg in G_asm.utg_data:
    s, t, v = utg
    type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)]
    if type_ == "simple":
        path_or_edges = path_or_edges.split("~")
        seq = G_asm.get_seq_from_path(path_or_edges)
        print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score)
        print >> utg_out, seq

    if type_ == "compound":
Esempio n. 12
0
def _summarize_path(G_asm, node_path):
    """Return ``(edges, total_length, total_score, seq)`` for a node path.

    ``edges`` is the list of consecutive node pairs, ``total_length`` the sum
    of ``abs(ss - tt)`` over the edges' sg_edges records, ``total_score`` the
    sum of their alignment scores, and ``seq`` the concatenated edge
    sequences.
    """
    edges = list(zip(node_path[:-1], node_path[1:]))
    sub_seqs = []
    total_length = 0
    total_score = 0
    for vv, ww in edges:
        (_rid, ss, tt), aln_score, _idt, _type = G_asm.sg_edges[(vv, ww)]
        sub_seqs.append(G_asm.sg_edge_seqs[(vv, ww)])
        total_length += abs(ss - tt)
        total_score += aln_score
    return edges, total_length, total_score, "".join(sub_seqs)


def _ranked_alt_paths(c_graph, s, t):
    """Return all ``(score, path)`` pairs from s to t, highest score first.

    After each path is found its edges are removed from ``c_graph`` (the
    graph is modified in place) and the search is repeated until no path is
    left.  ``nx.exception.NetworkXNoPath`` from the very first search is not
    caught.
    """
    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
    score = nx.shortest_path_length(c_graph, s, t, "e_score")
    all_alt_path = [(score, shortest_path)]

    # When s == t the only path is the single node; there is nothing to remove.
    while s != t:
        for n0, n1 in zip(shortest_path[:-1], shortest_path[1:]):
            c_graph.remove_edge(n0, n1)
        try:
            shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
            score = nx.shortest_path_length(c_graph, s, t, "e_score")
            all_alt_path.append((score, shortest_path))
        except nx.exception.NetworkXNoPath:
            break

    all_alt_path.sort()
    all_alt_path.reverse()
    return all_alt_path


def main(argv=sys.argv):
    """Write the sequences of all unitigs to ``utgs.fa``.

    The assembly graph is loaded from ``sg_edges_list``, ``utg_data`` and
    ``ctg_paths`` and the read sequences from ``preads4falcon.fasta``, all
    in the current directory.  A "simple" unitig is written as one FASTA
    record.  For a "compound" unitig the s->t paths through its sub-unitigs
    are enumerated; the path with the highest summed ``e_score`` is written,
    plus every other path that does not align to it as a near-duplicate
    (identity >= 0.96 and coverage >= 0.98).

    ``argv`` is accepted but not used.
    """
    G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
    G_asm.load_sg_seq("preads4falcon.fasta")

    header_fmt = ">%s~%s~%s-%d %d %d\n"

    # ``with`` makes sure the output is flushed and closed, also on errors.
    with open("utgs.fa", "w") as utg_out:
        for utg in G_asm.utg_data:
            s, t, v = utg
            type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)]

            if type_ == "simple":
                # The last field is a "~"-joined node path.
                seq = G_asm.get_seq_from_path(path_or_edges.split("~"))
                utg_out.write(header_fmt % (s, v, t, 0, length, score))
                utg_out.write("%s\n" % (seq,))

            elif type_ == "compound":
                # The last field is a "|"-joined list of "s~v~t" sub-unitig
                # ids; collect all their string-graph edges in one graph.
                sub_utgs = [c.split("~") for c in path_or_edges.split("|")]
                c_graph = nx.DiGraph()
                for ss, vv, tt in sub_utgs:
                    _, _, _, sub_path = G_asm.utg_data[(ss, tt, vv)]
                    sub_path = sub_path.split("~")
                    for v1, v2 in zip(sub_path[:-1], sub_path[1:]):
                        c_graph.add_edge(v1, v2,
                                         e_score=G_asm.sg_edges[(v1, v2)][1])

                all_alt_path = _ranked_alt_paths(c_graph, s, t)

                # The top-ranked path is the base every other path is
                # compared against; it is recorded with identity/coverage 1.
                _, base_path = all_alt_path[0]
                edges, total_length, total_score, base_seq = _summarize_path(
                    G_asm, base_path)
                atig_output = [(s, t, base_path, total_length, total_score,
                                base_seq, edges, 1, 1)]

                for _, atig_path in all_alt_path[1:]:
                    edges, total_length, total_score, seq = _summarize_path(
                        G_asm, atig_path)

                    aln_data, x, y = get_aln_data(base_seq, seq)
                    if len(aln_data) != 0:
                        # Only the last alignment record is used.
                        last = aln_data[-1]
                        idt = 1.0 - 1.0 * last[-1] / last[-2]
                        cov = 1.0 * (last[3] - last[2]) / last[4]
                        if not (idt < 0.96 or cov < 0.98):
                            continue  # near-duplicate of the base path
                    else:
                        idt, cov = 0, 0
                    atig_output.append((s, t, atig_path, total_length,
                                        total_score, seq, edges, idt, cov))

                # One record per kept path, numbered from 0.
                for sub_id, data in enumerate(atig_output):
                    v0, w0, _, total_length, total_score, seq = data[:6]
                    utg_out.write(header_fmt % (v0, "NA", w0, sub_id,
                                                total_length, total_score))
                    utg_out.write("%s\n" % (seq,))
Esempio n. 13
0
def main(argv=sys.argv):
    """Generate haplotigs for the requested contig(s) with a worker pool.

    Loads the primary and the phased assembly graphs, the read-to-phase
    table (``args.rid_phase_map``) and the sequences of every read that is
    either in that table or an end of a "G" edge of one of the graphs.
    ``generate_haplotigs_for_ctg`` is then mapped over every selected contig
    whose id ends in "F" and that has phasing information.
    """
    # make life easier for now. will refactor it out if possible
    # NOTE(review): these are module globals, presumably so the pool workers
    # can read them -- they must be assigned before the Pool is created.
    global all_rid_to_phase
    global p_asm_G
    global h_asm_G
    global seqs

    args = parse_args(argv)
    fc_asm_path = args.fc_asm_path
    fc_hasm_path = args.fc_hasm_path
    ctg_id = args.ctg_id
    fasta_fn = args.fasta
    # NOTE(review): args.base_dir was read here but never used; the second
    # element of each work item below is always rooted at ".".

    p_asm_G = AsmGraph(os.path.join(fc_asm_path, "sg_edges_list"),
                       os.path.join(fc_asm_path, "utg_data"),
                       os.path.join(fc_asm_path, "ctg_paths"))

    h_asm_G = AsmGraph(os.path.join(fc_hasm_path, "sg_edges_list"),
                       os.path.join(fc_hasm_path, "utg_data"),
                       os.path.join(fc_hasm_path, "ctg_paths"))

    all_rid_to_phase = {}

    all_read_ids = set()
    with open(args.rid_phase_map) as f:
        for row in f:
            # Columns: read id, contig id, two integer phase fields.
            row = row.strip().split()
            all_rid_to_phase.setdefault(row[1], {})[row[0]] = (int(row[2]),
                                                               int(row[3]))
            all_read_ids.add(row[0])

    # Collect the read ids at both ends of every "G" edge of either graph.
    for asm_G in (p_asm_G, h_asm_G):
        for v, w in asm_G.sg_edges:
            if asm_G.sg_edges[(v, w)][-1] != "G":
                continue
            all_read_ids.add(v.split(":")[0])
            all_read_ids.add(w.split(":")[0])

    seqs = load_sg_seq(all_read_ids, fasta_fn)

    if ctg_id == "all":
        ctg_id_list = p_asm_G.ctg_data.keys()
    else:
        ctg_id_list = [ctg_id]

    # Only contigs ending in "F" that appear in the phase table are processed.
    exe_list = [(cid, os.path.join(".", cid))
                for cid in ctg_id_list
                if cid[-1] == "F" and cid in all_rid_to_phase]

    exec_pool = Pool(4)  # TODO, make this configurable
    exec_pool.map(generate_haplotigs_for_ctg, exe_list)
    # map() has returned, so all work is done; shut the workers down cleanly.
    exec_pool.close()
    exec_pool.join()
Esempio n. 14
0
        if alignment[0].aln_str_size > 100:
            aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0),
                             alignment[0].aln_str_size, alignment[0].dist))
            aln_str1 = alignment[0].q_aln_str
            aln_str0 = alignment[0].t_aln_str

        DWA.free_alignment(alignment)

    kup.free_kmer_lookup(lk_ptr)
    kup.free_seq_array(sa_ptr)
    kup.free_seq_addr_array(sda_ptr)
    return aln_data, x, y


G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
G_asm.load_sg_seq("preads4falcon.fasta")

utg_out = open("utgs.fa", "w")

for utg in G_asm.utg_data:
    s, t, v = utg
    type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)]
    if type_ == "simple":
        path_or_edges = path_or_edges.split("~")
        seq = G_asm.get_seq_from_path(path_or_edges)
        print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score)
        print >> utg_out, seq

    if type_ == "compound":
Esempio n. 15
0
def wrap_write_gfa_v1_test(use_sg, use_nx, use_tp, write_reads, write_contigs, min_p_len, min_a_len, expected_path):
    """Build a GFA graph from the 'gfa-1' fixtures and compare its GFA-1 dump.

    ``use_sg``, ``use_tp`` and ``use_nx`` select which sources are added to
    the graph (assembly graph, tiling paths, GEXF string graph).  Tiling
    paths shorter than ``min_p_len`` / ``min_a_len`` are skipped.  The lines
    written by ``write_gfa_v1`` must equal the stripped lines of the file at
    ``expected_path``.
    """
    def fixture(name):
        # All inputs live in the 'gfa-1' folder of the test data directory.
        return os.path.join(helpers.get_test_data_dir(), 'gfa-1', name)

    # Create a GFA graph.
    gfa_graph = mod.GFAGraph()

    if use_sg:
        # Load the assembly graph and add it to the GFA.
        sg_edges_list = fixture('sg_edges_list')
        utg_data = fixture('utg_data')
        ctg_paths = fixture('ctg_paths')
        asm_graph = AsmGraph(sg_edges_list, utg_data, ctg_paths)
        gfa_graph.add_asm_graph(asm_graph)

    if use_tp:
        # Primary tiling paths first, then associate ones.
        # NOTE(review): both files are loaded with type 'P', exactly as the
        # previous code did; confirm whether the a_ctg file should use 'A'.
        tiling_inputs = (
            ('p_ctg_tiling_path', min_p_len),
            ('a_ctg_tiling_path', min_a_len),
        )
        for tp_name, min_len in tiling_inputs:
            paths, _ = gen_gfa_v1.load_tiling_paths(fixture(tp_name), 'P')
            # Add the tiling paths that are long enough to the GFA.
            for ctg_id, path in paths.iteritems():
                _, contig_len = gen_gfa_v1.calc_node_coords(path)
                if contig_len >= min_len:
                    gfa_graph.add_tiling_path(path, ctg_id)

    if use_nx:
        nx_sg = nx.read_gexf(fixture('sg.gexf'))
        gfa_graph.add_nx_string_graph(nx_sg)

    # Init paths to other input files.
    preads_file = fixture('preads4falcon.fasta')
    p_ctg_fasta = fixture('p_ctg.fa')
    a_ctg_fasta = fixture('a_ctg.fa')

    fp_out = StringIO()
    # Run the unit under test.
    gfa_graph.write_gfa_v1(fp_out, preads_file, [p_ctg_fasta, a_ctg_fasta],
                           write_reads, write_contigs)

    # Compare results; the expected file is closed as soon as it is read.
    result = fp_out.getvalue().splitlines()
    with open(expected_path) as fp_expected:
        expected = [line.strip() for line in fp_expected]
    assert result == expected
#!/usr/local/packages/anaconda2/bin/python
## This is the `sg_edges_to_GFA.py` script
## (More) information at https://github.com/PacificBiosciences/FALCON/wiki/Convert-FALCON-assembly-graph-to-GFA-format

from falcon_kit.fc_asm_graph import AsmGraph
from falcon_kit.FastaReader import FastaReader

# Module-level state of the conversion script.  Inputs are read from the
# current working directory.
read_in_graph = set()
G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
# (v, w) -> (contig id, "A" or "P"), filled from both tiling path files below.
edge_to_ctg = {}
# Associate contig id -> list of (v, w) edges in file order.
a_path = {}
with open("a_ctg_tiling_path") as f:
    for row in f:
        row = row.strip().split()
        # Only the first six whitespace-separated columns are used.
        ctg_id, v, w, edge_rid, b, e = row[:6]
        a_path.setdefault(ctg_id, [])
        a_path[ctg_id].append((v, w))
        # a_path keeps the full id; edge_to_ctg stores only the part before
        # the first "-", i.e. the primary contig id.
        ctg_id = ctg_id.split("-")[0]  # get the primary contig id
        edge_to_ctg[(v, w)] = ctg_id, "A"

# Primary contig id -> list of (v, w) edges in file order.
p_path = {}
with open("p_ctg_tiling_path") as f:
    for row in f:
        row = row.strip().split()
        ctg_id, v, w, edge_rid, b, e = row[:6]
        p_path.setdefault(ctg_id, [])
        p_path[ctg_id].append((v, w))
        # Processed after the associate file, so an edge present in both
        # files ends up tagged "P".
        edge_to_ctg[(v, w)] = ctg_id, "P"

# Initialised empty here; nothing in the visible part of the script fills them.
read_pairs = set()
link_lines = []
Esempio n. 17
0
    parser.add_argument('--fc_asm_path', type=str, help='path to the primary Falcon assembly output directory', required=True)
    parser.add_argument('--fc_hasm_path', type=str, help='path to the phased Falcon assembly output directory', required=True)
    parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', default = "all", required=True)
    parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir, default to current working directory')
    parser.add_argument('--rid_phase_map', type=str, help="path to the file that encode the relationship of the read id to phase blocks", required=True)
    parser.add_argument('--fasta', type=str, help="sequence file of the p-reads", required=True)

    args = parser.parse_args()
    fc_asm_path = args.fc_asm_path
    fc_hasm_path = args.fc_hasm_path
    ctg_id = args.ctg_id
    base_dir = args.base_dir
    fasta_fn = args.fasta
    
    p_asm_G = AsmGraph(os.path.join(fc_asm_path, "sg_edges_list"), 
                       os.path.join(fc_asm_path, "utg_data"),
                       os.path.join(fc_asm_path, "ctg_paths") )

    h_asm_G = AsmGraph( os.path.join(fc_hasm_path, "sg_edges_list"), 
                        os.path.join(fc_hasm_path, "utg_data"), 
                        os.path.join(fc_hasm_path, "ctg_paths") )

    all_rid_to_phase = {}

    all_read_ids = set()
    with open(args.rid_phase_map) as f:
        for row in f:
            row = row.strip().split()
            all_rid_to_phase.setdefault( row[1], {} )
            all_rid_to_phase[row[1]][row[0]] = (int(row[2]), int(row[3]))
            all_read_ids.add(row[0])