Ejemplo n.º 1
0
def find_bubbles(G, path_d, mermap):
    """
    Collapse "bubbles" in G, editing G, path_d and mermap in place.

    For every node n with >= 2 incoming edges, each pair (n1, n2) of its
    predecessors is examined for one of two shapes:

      simple bubble:   n' -> n1 -> n
                       n' -> n2 -> n
        (n1 and n2 each have exactly one predecessor and it is the same node)

      path bubble:     n' -> n1 -> n
                       n' -> some_node -> n2 -> n
        (one side has a single predecessor n'; the other side is connected to
         n' by a short path returned by path_finder)

    <i> pairs where n1 is in in_same_path[n2] are skipped
        (which indicates in-gene repeat?)
    <ii> the two sides are merged only when splice_align.node_is_similar
         accepts their sequences, or splice_align.node_is_skipping reports a skip

    Note: only the in-degree of n1/n2 is checked; their out-degree is not.

    G       --- directed graph whose edges carry a "weight" attribute
                (NetworkX 1.x style API: predecessors() is indexed as a list)
    path_d  --- dict of key --> list of nodes; rewritten when nodes are merged
    mermap  --- dict of node --> sequence; entries of removed nodes are deleted

    Returns None.
    """

    def has_common_unique_pred(n1, n2):
        """
        True iff n1 and n2 each have exactly one predecessor and it is the same node.

        Case:
         pred -> n1 -> common succ
         pred -> n2 -> common succ
        """
        preds1 = G.predecessors(n1)
        preds2 = G.predecessors(n2)
        return len(preds1) == 1 and len(preds2) == 1 and preds1[0] == preds2[0]

    def traceback_path(n1, n2):
        """
        Find a common pred where
         pred -> n1
         pred -> some_node -> n2

        n1 must have exactly one predecessor. Returns whatever path_finder
        returns; callers treat None as "no such path".
        """
        assert G.in_degree(n1) == 1
        pred = G.predecessors(n1)[0]
        # NOTE(review): the trailing 2 is presumably a step/depth limit -- confirm against path_finder
        return path_finder(G, n2, pred, [n2], 2)

    def replace_node(n_to_del, n_to_replace_with):
        """
        Replace <n_to_del> with <n_to_replace_with>:
        1. copy every outgoing edge of <n_to_del> that <n_to_replace_with>
           does not already have

           ex: n' -> n1 -> n3
               n' -> n2 -> n3
               n' -> n2 -> n4
           (deleting n2 in favor of n1 must add n1 -> n4, otherwise n4 is
            cut off and rewritten paths would use an edge missing from G)
        2. remove <n_to_del> from G and from mermap
        3. substitute the first occurrence of <n_to_del> in each path of path_d
        """
        # out_edges is evaluated before the loop body adds edges; the edges added
        # start at <n_to_replace_with>, never at <n_to_del>
        for _, succ, data in G.out_edges(n_to_del, data=True):
            if not G.has_edge(n_to_replace_with, succ):
                G.add_edge(n_to_replace_with, succ, weight=data["weight"])
        G.remove_node(n_to_del)
        del mermap[n_to_del]
        for k in path_d:
            if n_to_del in path_d[k]:
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i + 1 :]

    def replace_path_w_node(path_to_del, n_to_replace_with, common_succ):
        """
        Replace the node list <path_to_del> with the single node <n_to_replace_with>:
        1. outgoing edges of the last node of the path are added to
           <n_to_replace_with> (weights are summed if the edge already exists)
        2. every predecessor of the first node of the path gets an edge to
           <n_to_replace_with>
        3. occurrences of the path in path_d are substituted
        4. path nodes that are no longer used by any path and do not branch
           are removed from G and mermap

        <common_succ> is accepted for the caller's readability; it is not used here.
        """
        # first, it's possible that the last node in <path_to_del> has other successors
        # ex: path_to_del = x1 -> x2 -> common_succ
        #          also has       x2 -> another node x3
        # so must change to n_to_replace_with -> x3
        last_n_in_path = path_to_del[-1]
        for s, t, data in G.out_edges(last_n_in_path, data=True):
            if G.has_edge(n_to_replace_with, t):
                G[n_to_replace_with][t]["weight"] += data["weight"]
            else:
                G.add_edge(n_to_replace_with, t, weight=data["weight"])

        # for every predecessor of path_to_del, replace with n_to_replace_with
        # ex:        pred -> x1 -> x2 -> ...
        # becomes    pred -> n_to_replace_with -> ...
        for pred in G.predecessors(path_to_del[0]):
            G.add_edge(pred, n_to_replace_with, weight=G.get_edge_data(pred, path_to_del[0])["weight"])

        path_len = len(path_to_del)
        for k in path_d:
            if path_to_del[0] in path_d[k]:
                i = path_d[k].index(path_to_del[0])
                # the stored path may end before the whole of <path_to_del> is matched,
                # so only the overlapping prefix is compared
                m = min(i + path_len, len(path_d[k]))
                if path_d[k][i:m] == path_to_del[: (m - i)]:
                    path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i + path_len :]

        # now delete all non branching nodes in path_to_del
        # note: the selection must be fully computed BEFORE any removal because
        #       G.remove_node will dynamically change the degrees! Hence an explicit list.
        nodes_in_path = set()
        for k in path_d:
            nodes_in_path.update(path_d[k])
        safe_to_remove = [x for x in path_to_del if G.out_degree(x) <= 1 and x not in nodes_in_path]
        for node in safe_to_remove:
            log.debug("safe to delete from G: {0}".format(node))
            G.remove_node(node)
            del mermap[node]

    # NOTE(review): in_same_path is computed once, before any path in path_d is
    # rewritten -- presumably node --> collection of nodes sharing a path; confirm
    in_same_path = make_in_same_path(path_d)
    # snapshot of the candidates; the graph is mutated inside the loop
    cands = [n for n in G.nodes_iter() if G.in_degree(n) >= 2]
    for n in cands:
        if n not in G:
            continue  # deleted in loop below
        _pred = G.predecessors(n)
        if len(_pred) >= 2:
            for i, n1 in enumerate(_pred):
                if n1 not in G:
                    continue
                for n2 in _pred[i + 1 :]:
                    # either node may have been removed by an earlier merge in this loop
                    if n1 not in G or n2 not in G or n1 in in_same_path[n2]:
                        continue

                    if has_common_unique_pred(n1, n2):
                        # what is known: common pred -> n1 -> common succ
                        #                common pred -> n2 -> common succ
                        # so they must share the same first (KMER_SIZE-1) and the last (KMER_SIZE-1)
                        if DEBUG_FLAG:
                            pdb.set_trace()
                        if splice_align.node_is_similar(mermap[n1], mermap[n2]):
                            # consensus is weighted by the edge weights into n; n1 survives
                            mermap[n1] = splice_align.get_consensus_through_voting(
                                mermap[n1],
                                G.get_edge_data(n1, n)["weight"],
                                mermap[n2],
                                G.get_edge_data(n2, n)["weight"],
                            )
                            replace_node(n_to_del=n2, n_to_replace_with=n1)
                        else:
                            flag, is_skipped = splice_align.node_is_skipping(
                                mermap[n1], mermap[n2], cc_settings.KMER_SIZE
                            )
                            if is_skipped:
                                if flag == "SEQ1":  # seq1 is the one with retained exon
                                    replace_node(n_to_del=n2, n_to_replace_with=n1)
                                else:
                                    replace_node(n_to_del=n1, n_to_replace_with=n2)
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                    else:
                        if G.in_degree(n1) == 1:
                            p2 = traceback_path(n1, n2)
                            if p2 is not None:
                                # common pred -> n1 -> common succ
                                # common pred -> another node -> n2 -> common succ
                                s1 = mermap[n1]
                                s2 = stitch_string_from_path(p2, mermap)
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n1] = splice_align.get_consensus_through_voting(
                                        s1, G.get_edge_data(n1, n)["weight"], s2, G.get_edge_data(n2, n)["weight"]
                                    )
                                    replace_path_w_node(p2, n1, common_succ=n)

                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        log.debug("path collapse possible {0},{1}".format(n1, p2))
                                        # n1 survives and takes the sequence selected by flag
                                        mermap[n1] = s1 if flag == "SEQ1" else s2
                                        replace_path_w_node(p2, n1, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        elif G.in_degree(n2) == 1:
                            # mirror image of the branch above: n2 is the single node,
                            # p1 is the path on n1's side
                            p1 = traceback_path(n2, n1)
                            if p1 is not None:
                                s1 = stitch_string_from_path(p1, mermap)
                                s2 = mermap[n2]
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n2] = splice_align.get_consensus_through_voting(
                                        s1, G.get_edge_data(n1, n)["weight"], s2, G.get_edge_data(n2, n)["weight"]
                                    )
                                    log.debug("path collapse possible: {0},{1}".format(p1, n2))
                                    replace_path_w_node(p1, n2, common_succ=n)
                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        mermap[n2] = s1 if flag == "SEQ1" else s2
                                        replace_path_w_node(p1, n2, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        else:
                            log.debug("should NOT collapse {0},{1}".format(n1, n2))
Ejemplo n.º 2
0
def find_bubbles(G, path_d, mermap):
    """
    Collapse "bubbles" in G, editing G, path_d and mermap in place.

    For every node n with >= 2 incoming edges, each pair (n1, n2) of its
    predecessors is examined for one of two shapes:

      simple bubble:   n' -> n1 -> n
                       n' -> n2 -> n
        (n1 and n2 each have exactly one predecessor and it is the same node)

      path bubble:     n' -> n1 -> n
                       n' -> some_node -> n2 -> n
        (one side has a single predecessor n'; the other side is connected to
         n' by a short path returned by path_finder)

    <i> pairs where n1 is in in_same_path[n2] are skipped
        (which indicates in-gene repeat?)
    <ii> the two sides are merged only when splice_align.node_is_similar
         accepts their sequences, or splice_align.node_is_skipping reports a skip

    Note: only the in-degree of n1/n2 is checked; their out-degree is not.

    G       --- directed graph whose edges carry a 'weight' attribute
                (predecessors() is indexed as a list here)
    path_d  --- dict of key --> list of nodes; rewritten when nodes are merged
    mermap  --- dict of node --> sequence; entries of removed nodes are deleted

    Returns None.
    """
    def has_common_unique_pred(n1, n2):
        """
        True iff n1 and n2 each have exactly one predecessor and it is the same node.

        Case:
         pred -> n1 -> common succ
         pred -> n2 -> common succ
        """
        preds1 = G.predecessors(n1)
        preds2 = G.predecessors(n2)
        return len(preds1) == 1  and len(preds2) == 1 and preds1[0] == preds2[0]

    def traceback_path(n1, n2):
        """
        Find a common pred where
         pred -> n1
         pred -> some_node -> n2

        n1 must have exactly one predecessor. Returns whatever path_finder
        returns; callers treat None as "no such path".
        """
        assert G.in_degree(n1) == 1
        pred = G.predecessors(n1)[0]
        # NOTE(review): the trailing 2 is presumably a step/depth limit -- confirm against path_finder
        return path_finder(G, n2, pred, [n2], 2)

    def replace_node(n_to_del, n_to_replace_with):
        """
        Replacing <n_to_del> with <n_to_replace_with>
        1. add successors of <n_to_del> to successors of <n_to_replace_with>

           ex: n' -> n1 -> n3
               n' -> n2 -> n3
               n' -> n2 -> n4
           (when deleting n2 in favor of n1, add n1 -> n4 if not already exists;
            an existing edge keeps its weight unchanged)
        2. remove <n_to_del> from graph G and from mermap
        3. replace the first occurrence of <n_to_del> in each path of path_d
        """
        #pdb.set_trace()
        for n in G.successors_iter(n_to_del):
            if not G.has_edge(n_to_replace_with, n):
                G.add_edge(n_to_replace_with, n, weight=G.get_edge_data(n_to_del, n)['weight'])
        G.remove_node(n_to_del)
        del mermap[n_to_del]
        for k in path_d:
            if n_to_del in path_d[k]:
                # list.index only finds the first occurrence in the path
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i+1:]

    def replace_path_w_node(path_to_del, n_to_replace_with, common_succ):
        """
        Replace the node list <path_to_del> with the single node <n_to_replace_with>:
        1. outgoing edges of the last node of the path are added to
           <n_to_replace_with> (weights are summed if the edge already exists)
        2. every predecessor of the first node of the path gets an edge to
           <n_to_replace_with>
        3. occurrences of the path in path_d are substituted
        4. path nodes that are no longer used by any path and do not branch
           are removed from G and mermap

        <common_succ> is not used in the body.
        """
        # first, it's possible that the last node in <path_to_del> has other successors
        # ex: path_to_del = x1 -> x2 -> common_succ
        #          also has       x2 -> another node x3
        # so must change to n_to_replace_with -> x3
        last_n_in_path = path_to_del[-1]
        for s,t,data in G.out_edges(last_n_in_path, data=True):
            if G.has_edge(n_to_replace_with, t):
                G[n_to_replace_with][t]['weight'] += data['weight']
            else:
                G.add_edge(n_to_replace_with, t, weight=data['weight'])

        # for every predecessor of path_to_del, replace with n_to_replace_with
        # ex:        pred -> x1 -> x2 -> ...
        # becomes    pred -> n_to_replace_with -> ...
        # (add_edge sets the weight to that of pred -> x1; it is not summed here)
        for pred in G.predecessors(path_to_del[0]):
            G.add_edge(pred, n_to_replace_with, weight=G.get_edge_data(pred, path_to_del[0])['weight'])

        path_len = len(path_to_del)
        for k in path_d:
            if path_to_del[0] in path_d[k]:
                i = path_d[k].index(path_to_del[0])
                # the stored path may end before the whole of <path_to_del> is matched,
                # so only the overlapping prefix is compared
                m = min(i+path_len, len(path_d[k]))
                if path_d[k][i:m] == path_to_del[:(m-i)]:
                    path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i+path_len:]
        # now delete all non branching nodes in path_to_del
        # note: this filter must be done simultaneously because G.remove_node will dynamically change the degrees!
        # (i.e. safe_to_remove has to be fully evaluated before the removal loop starts)

        nodes_in_path = set()
        for path in path_d.itervalues():
            nodes_in_path = nodes_in_path.union(path)
        safe_to_remove = filter(lambda x: G.out_degree(x)<=1 and x not in nodes_in_path, path_to_del)
        #safe_to_remove = filter(lambda x: G.in_degree(x)<=1 and G.out_degree(x)<=1 and x not in nodes_in_path, path_to_del)
        for node in safe_to_remove:
            log.debug("safe to delete from G: {0}".format(node))
            G.remove_node(node)
            del mermap[node]


    # NOTE(review): in_same_path is computed once, before any path in path_d is
    # rewritten -- presumably node --> collection of nodes sharing a path; confirm
    in_same_path = make_in_same_path(path_d)
    # candidates are selected up front; the graph is mutated inside the loop
    cands = filter(lambda n: G.in_degree(n)>=2, G.nodes_iter())
    for n in cands:
        if n not in G: continue # deleted in loop below
        _pred = G.predecessors(n)
        if len(_pred) >= 2:
            for i, n1 in enumerate(_pred):
                if n1 not in G: continue
                for n2 in _pred[i+1:]:
                    # either node may have been removed by an earlier merge in this loop
                    if n1 not in G or n2 not in G or n1 in in_same_path[n2]: continue

                    if has_common_unique_pred(n1, n2):
                        # what is known: common pred -> n1 -> common succ
                        #                common pred -> n2 -> common succ
                        # so they must share the same first (KMER_SIZE-1) and the last (KMER_SIZE-1)
                        if DEBUG_FLAG:
                            pdb.set_trace()
                        if splice_align.node_is_similar(mermap[n1], mermap[n2]):
                            # consensus is weighted by the edge weights into n; n1 survives
                            mermap[n1] = splice_align.get_consensus_through_voting(mermap[n1],\
                                                                                     G.get_edge_data(n1, n)['weight'],\
                                                                                     mermap[n2],\
                                                                                     G.get_edge_data(n2, n)['weight'])
                            replace_node(n_to_del=n2, n_to_replace_with=n1)
                        else:
                            flag, is_skipped = splice_align.node_is_skipping(mermap[n1], mermap[n2], cc_settings.KMER_SIZE)
                            if is_skipped:
                                if flag == "SEQ1":  # seq1 is the one with retained exon
                                    replace_node(n_to_del=n2, n_to_replace_with=n1)
                                else:
                                    replace_node(n_to_del=n1, n_to_replace_with=n2)
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                    else:
                        if G.in_degree(n1) == 1:
                            p2 = traceback_path(n1, n2)
                            if p2 is not None:
                                # common pred -> n1 -> common succ
                                # common pred -> another node -> n2 -> common succ
                                s1 = mermap[n1]
                                s2 = stitch_string_from_path(p2, mermap)
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n1] = splice_align.get_consensus_through_voting(s1,\
                                                                        G.get_edge_data(n1, n)['weight'],\
                                                                        s2,
                                                                        G.get_edge_data(n2, n)['weight'])
                                    replace_path_w_node(p2, n1, common_succ=n)

                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        log.debug("path collapse possible {0},{1}".format(n1, p2))
                                        # n1 survives and takes the sequence selected by flag
                                        mermap[n1] = s1 if flag == 'SEQ1' else s2
                                        replace_path_w_node(p2, n1, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        elif G.in_degree(n2) == 1:
                            # mirror image of the branch above: n2 is the single node,
                            # p1 is the path on n1's side
                            p1 = traceback_path(n2, n1)
                            if p1 is not None:
                                s1 = stitch_string_from_path(p1, mermap)
                                s2 = mermap[n2]
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n2] = splice_align.get_consensus_through_voting(s1,\
                                                                                             G.get_edge_data(n1, n)['weight'],\
                                                                                             s2,
                                                                                             G.get_edge_data(n2, n)['weight'])
                                    log.debug("path collapse possible: {0},{1}".format(p1, n2))
                                    replace_path_w_node(p1, n2, common_succ=n)
                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        mermap[n2] = s1 if flag=='SEQ1' else s2
                                        replace_path_w_node(p1, n2, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        else:
                            log.debug("should NOT collapse {0},{1}".format(n1, n2))