Esempio n. 1
0
def find_bubbles(G, path_d, mermap):
    """
    Collapse "bubbles" (two alternative branches that leave a common predecessor
    and re-join at a common successor). G, path_d and mermap are all modified in place.

    We find all cases where n' -> n1 -> n3
                            n' -> n2 -> n3
    (that is, n3 has > 1 incoming) and n1, n2 each have only one incoming and one outgoing
    <i> make sure that n1 and n2 is not used in the same path  (which indicates in-gene repeat?)
    <ii> retrace n1, n2 to make sure that they are largely similar

    A second shape is also handled, where one branch is a single node and the
    other is a short path returned by path_finder:
                            n' -> n1 -> n3
                            n' -> some_node -> n2 -> n3

    NOTE(review): the code below checks the "only one incoming" condition
    (has_common_unique_pred / G.in_degree(...) == 1) but never checks the
    out-degree of n1 or n2 -- confirm whether "one outgoing" is guaranteed elsewhere.

    G      -- directed graph; nodes are removed from it as bubbles are collapsed.
              Edge data is read through the 'weight' key.
    path_d -- dict mapping a key to a list of nodes (a path); the lists are
              replaced with rewritten copies when a node/path is collapsed.
    mermap -- dict keyed by node; entries of removed nodes are deleted and the
              surviving node's entry may be overwritten.
              (values are presumably sequence strings -- TODO confirm)

    Returns None. Diagnostic messages go to sys.stderr (and one to stdout).

    Relies on names defined outside this function: make_in_same_path,
    path_finder, stitch_string_from_path, splice_align, KMER_SIZE, pdb, sys.
    """
    def has_common_unique_pred(n1, n2):
        """
        Return True iff n1 and n2 each have exactly one predecessor and that
        predecessor is the same node.

        Case:
         pred -> n1 -> common succ
         pred -> n2 -> common succ
        """
        # len() and [0] below require G.predecessors to return a list
        preds1 = G.predecessors(n1)
        preds2 = G.predecessors(n2)
        return len(preds1) == 1  and len(preds2) == 1 and preds1[0] == preds2[0]

    def traceback_path(n1, n2):
        """
        Find a common pred where
         pred -> n1
         pred -> some_node -> n2

        n1 must have exactly one predecessor (asserted). The search itself is
        delegated to path_finder, called with n2, n1's predecessor, the
        initial list [n2] and the constant 2. Callers treat a None result as
        "no such path".
        (the constant 2 is presumably a search-depth limit -- TODO confirm
        against path_finder)
        """
        assert G.in_degree(n1) == 1
        pred = G.predecessors(n1)[0]
        return path_finder(G, n2, pred, [n2], 2)

    def replace_node(n_to_del, n_to_replace_with):
        """
        Remove n_to_del from G and from mermap, then substitute
        n_to_replace_with for it in every path of path_d that contains it.
        """
        #pdb.set_trace()
        G.remove_node(n_to_del)
        del mermap[n_to_del]
        for k in path_d:
            if n_to_del in path_d[k]:
                # .index() finds the first occurrence only, so a path that
                # contains n_to_del more than once keeps the later occurrences
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i+1:]

    def replace_path_w_node(path_to_del, n_to_replace_with):
        """
        In every path of path_d, replace a contiguous run equal to path_to_del
        with the single node n_to_replace_with; then remove from G and mermap
        those nodes of path_to_del that have at most one incoming and at most
        one outgoing edge.

        Only the first occurrence of path_to_del[0] in each path is examined.
        """
        path_len = len(path_to_del)
        for k in path_d:
            if path_to_del[0] in path_d[k]:
                i = path_d[k].index(path_to_del[0])
                # only rewrite when the whole run matches, not just its first node
                if path_d[k][i:i+path_len] == path_to_del:
                    path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i+path_len:]
        # now delete all non branching nodes in path_to_del
        # note: this filter must be done simultaneously because G.remove_node will dynamically change the degrees!
        # (i.e. filter() must produce the full list before the removal loop
        #  starts, which is what Python 2's list-returning filter() does)
        safe_to_remove = filter(lambda x: G.in_degree(x)<=1 and G.out_degree(x)<=1, path_to_del)
        for node in safe_to_remove:
            print "safe to delete from G:", node
            G.remove_node(node)
            del mermap[node]
    

    # Computed once, before any collapsing; it is NOT refreshed as path_d is
    # rewritten below. It is indexed by node and tested with `in`.
    # (presumably node -> collection of nodes that co-occur in some path --
    #  TODO confirm against make_in_same_path)
    in_same_path = make_in_same_path(path_d)
    # Snapshot of the nodes with two or more incoming edges, taken before G is mutated.
    cands = filter(lambda n: G.in_degree(n)>=2, G.nodes_iter())
    for n in cands:
        if n not in G: continue # deleted in loop below
        _pred = G.predecessors(n)
        if len(_pred) >= 2:
            # examine every unordered pair (n1, n2) of predecessors of n
            for i, n1 in enumerate(_pred):
                if n1 not in G: continue
                for n2 in _pred[i+1:]:
                    # skip pairs where either node was already collapsed away,
                    # or where the two nodes are recorded as sharing a path
                    if n1 not in G or n2 not in G or n1 in in_same_path[n2]: continue

                    if has_common_unique_pred(n1, n2):
                        # what is known: common pred -> n1 -> common succ
                        #                common pred -> n2 -> common succ
                        # so they must share the same first (KMER_SIZE-1) and the last (KMER_SIZE-1)
                        # NOTE(review): live debugger breakpoint -- __debug__ is
                        # True unless Python is started with -O
                        if __debug__:
                            pdb.set_trace()
                        if splice_align.node_is_similar(mermap[n1], mermap[n2]):
                            # similar: n1 keeps a consensus built from both
                            # sequences and their edge weights into n; n2 is dropped
                            mermap[n1] = splice_align.get_consensus_through_pbdagcon(mermap[n1],\
                                                                                     G.get_edge_data(n1, n)['weight'],\
                                                                                     mermap[n2],\
                                                                                     G.get_edge_data(n2, n)['weight'])
                            replace_node(n_to_del=n2, n_to_replace_with=n1)
                        else:
                            # not similar: check whether one is a "skipping" version of the other
                            flag, is_skipped = splice_align.node_is_skipping(mermap[n1], mermap[n2], KMER_SIZE)
                            if is_skipped:
                                if flag == "SEQ1":  # seq1 is the one with retained exon
                                    replace_node(n_to_del=n2, n_to_replace_with=n1)
                                else:
                                    replace_node(n_to_del=n1, n_to_replace_with=n2)
                            else:
                                print >> sys.stderr, "should NOT collapse", n1, n2
                    else:
                        # no shared unique predecessor: try the "node vs. short path" shape,
                        # using whichever of n1 / n2 has a single predecessor as the single-node branch
                        if G.in_degree(n1) == 1:
                            p2 = traceback_path(n1, n2)
                            if p2 is not None:
                                # common pred -> n1 -> common succ
                                # common pred -> another node -> n2 -> common succ
                                s1 = mermap[n1]
                                s2 = stitch_string_from_path(p2, mermap)
                                # NOTE(review): live debugger breakpoint (see above)
                                if __debug__:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n1] = splice_align.get_consensus_through_pbdagcon(s1,\
                                                                        G.get_edge_data(n1, n)['weight'],\
                                                                        s2,
                                                                        G.get_edge_data(n2, n)['weight'])
                                    replace_path_w_node(p2, n1)

                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, KMER_SIZE)
                                    if is_skipped:
                                        print >> sys.stderr, "path collapse possible", n1, p2
                                        # n1 survives and takes whichever sequence the flag selects
                                        mermap[n1] = s1 if flag == 'SEQ1' else s2
                                        replace_path_w_node(p2, n1)
                                    else:
                                        print >> sys.stderr, "should NOT collapse", n1, n2
                            else:
                                print >> sys.stderr, "should NOT collapse", n1, n2
                        elif G.in_degree(n2) == 1:
                            # mirror image of the branch above: n2 is the single node, n1 ends the path
                            p1 = traceback_path(n2, n1)
                            if p1 is not None:
                                s1 = stitch_string_from_path(p1, mermap)
                                s2 = mermap[n2]
                                # NOTE(review): live debugger breakpoint (see above)
                                if __debug__:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n2] = splice_align.get_consensus_through_pbdagcon(s1,\
                                                                                             G.get_edge_data(n1, n)['weight'],\
                                                                                             s2,
                                                                                             G.get_edge_data(n2, n)['weight'])
                                    print >> sys.stderr, "path collapse possible", p1, n2
                                    replace_path_w_node(p1, n2)
                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, KMER_SIZE)
                                    if is_skipped:
                                        # n2 survives and takes whichever sequence the flag selects
                                        mermap[n2] = s1 if flag=='SEQ1' else s2
                                        replace_path_w_node(p1, n2)
                                    else:
                                        print >> sys.stderr, "should NOT collapse", n1, n2
                            else:
                                print >> sys.stderr, "should NOT collapse", n1, n2
                        else:
                            print >> sys.stderr, "should NOT collapse", n1, n2
Esempio n. 2
0
def find_source_bubbles(G, path_d, mermap):
    """
    Collapse a source node into another branch that feeds the same successor.
    G, path_d and mermap are all modified in place.

    Find all cases where
       src1 --> n3
       ...> path1 --> n3
    and that
    <i> src1 has exactly one outgoing edge (to n3), and the node of path1 that
        points at n3 is either itself a source or has exactly one outgoing edge
    <ii> src1 and path1 are similar

    path1: can also be a source

    Similarity is judged by splice_align.node_is_similar on the two sequences
    reversed and cut to the length of the shorter one, i.e. on their
    equal-length tails.

    When the two are similar:
      * both are sources  -> the one with the longer mermap entry is kept
                             (src1 is kept on a tie);
      * path1 is a path   -> src1 is folded into it only if the stitched path
                             sequence is strictly longer; otherwise nothing is
                             collapsed.

    G      -- directed graph; collapsed nodes are removed from it.
    path_d -- dict mapping a key to a list of nodes (a path); lists containing
              a collapsed node are replaced with rewritten copies.
    mermap -- dict mapping node -> sequence (len() and slicing are applied to
              the values); entries of collapsed nodes are deleted.

    Returns None. Diagnostic messages go to sys.stderr.

    Relies on names defined outside this function: make_in_same_path,
    stitch_string_from_path, splice_align, pdb, sys.
    """
    def traceback(cur):
        """
        Retrace path of n1 -> n2 ... -> cur
        where n1, n2....all have exactly one outgoing edge

        The walk stops at a node that has no predecessor, has several
        predecessors, or whose single predecessor has more than one outgoing
        edge. The result is ordered upstream -> downstream and ends with <cur>.
        """
        acc = []
        while True:
            acc.append(cur)
            preds = G.predecessors(cur)
            if len(preds) != 1 or G.out_degree(preds[0]) > 1:
                break
            cur = preds[0]
        return acc[::-1]

    def replace_node(n_to_del, path_to_replace_with):
        """
        Splice the list path_to_replace_with in place of n_to_del in every
        path of path_d that contains it (first occurrence only), then remove
        n_to_del from G and mermap.
        """
        for k in path_d:
            path = path_d[k]
            if n_to_del in path:
                i = path.index(n_to_del)
                path_d[k] = path[:i] + path_to_replace_with + path[i+1:]
        G.remove_node(n_to_del)
        del mermap[n_to_del]

    # Computed once up front; not refreshed as path_d is rewritten below.
    in_same_path = make_in_same_path(path_d)
    # Snapshot of the sources taken before G is mutated. The list fixes the
    # iteration order; the set gives O(1) membership tests inside the loops.
    sources = [node for node in G.nodes_iter() if G.in_degree(node) == 0]
    source_set = set(sources)

    for src1 in sources:
        if src1 not in G: continue # deleted in the loop below
        succ = G.successors(src1)
        if len(succ) != 1:
            continue
        n3 = succ[0]
        cands = G.predecessors(n3)
        for n in cands:
            if src1 not in G: break  # deleted, jump out of this
            if n not in G: continue # deleted in the loop below
            if n == src1 or n in in_same_path[src1]:
                continue
            if n not in source_set and G.out_degree(n) != 1:
                continue

            t = traceback(n)
            seq1 = mermap[src1]
            seq2 = stitch_string_from_path(t, mermap)
            minlen = min(len(seq1), len(seq2))
            # we know that seq1 and seq2 both have the same successor so they must share the same last (KMER_SIZE-1) suffix
            # NOTE(review): live debugger breakpoint -- __debug__ is True
            # unless Python is started with -O
            if __debug__:
                pdb.set_trace()
            if not splice_align.node_is_similar(seq1[::-1][:minlen], seq2[::-1][:minlen]):
                # src1 and <t> are not similar enough, DO NOT collapse
                print >> sys.stderr, "should NOT collapse", src1, t
                continue

            # should collapse src1 into n
            # to do so:
            # (1) if both are sources, replace the shorter src with the longer src
            # (2) if one is src other is path, replace the src with path
            # -- delete the replaced node from G
            # -- for all path in path_d that uses the deleted node, update with replacement
            print >> sys.stderr, "should collapse", src1, t
            if len(t) == 1 and t[0] in source_set: # both are sources
                if len(mermap[t[0]]) > len(mermap[src1]):
                    replace_node(n_to_del=src1, path_to_replace_with=t)
                else: # src1 is longer (or equal), use src1
                    replace_node(t[0], [src1])
            elif len(seq2) > len(seq1):
                # src1 is a source, <t> is a path: let's just "tuck" src1 into <t>
                replace_node(src1, t)
            else:
                # we don't know if nodes in <t> branch out so we can't collapse them
                print >> sys.stderr, "should NOT collapse", src1, t