Beispiel #1
0
def find_dangling_sinks(G, path_d, mermap):
    """
    Fold "dangling" sink nodes into a sibling branch that already covers them.

    A dangling sink is a node with exactly one incoming edge and no outgoing
    edge.  This typically shows up for alternative 3' ends:

        ... -> pred -> sink
        ... -> pred -> sibling -> ...rest of the path...

    For every such sink, the other successors of its single predecessor are
    inspected.  When splice_align.node_is_similar() accepts the sink sequence
    against the equally long prefix of a sibling's sequence, the sink is
    treated as a truncated copy of that sibling:

      * every path in <path_d> containing the sink (it must be the last node)
        gets that last node swapped for the sibling,
      * the sink is dropped from <mermap> and from <G>.

    Only the first matching sibling is used.  All three arguments are updated
    in place; nothing is returned.

    G       -- directed graph
    path_d  -- dict of path id --> list of nodes
    mermap  -- dict of node --> sequence of that node
    """
    # Collect the candidates before touching G: nodes are removed below.
    dangling = []
    for node in G.nodes():
        if G.out_degree(node) != 0 or G.in_degree(node) != 1:
            continue
        dangling.append(node)

    for sink in dangling:
        parent = next(G.predecessors(sink))
        for sibling in G.successors(parent):
            if sibling == sink or sibling not in G:
                continue

            sink_seq = mermap[sink]
            sibling_prefix = mermap[sibling][:len(sink_seq)]
            if not splice_align.node_is_similar(sink_seq, sibling_prefix):
                continue

            log.debug(
                "tugging dangling sink: {0}->{1}(sink), {0}->{2}".format(
                    parent, sink, sibling))
            # <sink> is a shortened version of <sibling>: redirect every path
            # that ends in <sink>, then the node can be dropped safely.
            for key in path_d:
                nodes = path_d[key]
                if sink in nodes:
                    assert nodes[-1] == sink
                    path_d[key] = nodes[:-1] + [sibling]
            del mermap[sink]
            G.remove_node(sink)
            # Stop here: the successor view of <parent> was just modified.
            break
Beispiel #2
0
def find_dangling_sinks(G, path_d, mermap):
    """
    Merge dangling sink nodes into a sibling node that already contains them.

    For isoforms with alternative 3' ends, the shorter end shows up as a
    *branch* off the path:

        ... -> pred -> sink           (sink: in-degree 1, out-degree 0)
        ... -> pred -> n -> ...other path...

    For each such sink, every other successor <n> of its only predecessor is
    tested: if splice_align.node_is_similar() accepts mermap[sink] against the
    prefix of mermap[n] of the same length, <sink> is considered a shortened
    version of <n>.  Then

      * every path in <path_d> that contains <sink> (it must be the path's
        last node) has that last node replaced with <n>,
      * <sink> is deleted from <mermap> and removed from <G>.

    Only the first matching <n> is used for a given sink.

    The function modifies G, path_d and mermap in place and returns None.

    Works with both list-returning (NetworkX 1.x) and iterator-returning
    (NetworkX 2.x and later) predecessors()/successors().
    """
    # The candidate list must be fully built before the loop, because nodes are
    # removed from G inside it.  A list comprehension guarantees that on both
    # Python 2 and Python 3 (where filter() would be lazy).
    cand_sinks = [
        n for n in G.nodes()
        if G.out_degree(n) == 0 and G.in_degree(n) == 1
    ]
    for sink in cand_sinks:
        # in-degree is 1, so there is exactly one predecessor to take
        pred = next(iter(G.predecessors(sink)))
        # snapshot the successors: G is modified when a match is found
        for n in list(G.successors(pred)):
            if n == sink or n not in G:
                continue
            if splice_align.node_is_similar(mermap[sink], mermap[n][: len(mermap[sink])]):
                log.debug("tugging dangling sink: {0}->{1}(sink), {0}->{2}".format(pred, sink, n))
                # sink is just a shortened version of <n>:
                # point every path that uses <sink> at <n> instead,
                # then <sink> can safely be removed from G
                for k in path_d:
                    if sink in path_d[k]:
                        # a sink has no successors, so it can only end a path
                        assert path_d[k][-1] == sink
                        path_d[k] = path_d[k][:-1] + [n]
                del mermap[sink]
                G.remove_node(sink)
                break
Beispiel #3
0
def find_bubbles(G, path_d, mermap):
    """
    Collapse bubbles: alternative branches that split from one predecessor and
    merge again at a common successor.

    For every node <n> with two or more incoming edges, every pair (n1, n2) of
    its predecessors that never appear together in one path is examined:

      case 1:  pred -> n1 -> n          n1 and n2 each have exactly one
               pred -> n2 -> n          predecessor, and it is the same node

      case 2:  pred -> n1 -> n          one side is a single node, the other
               pred -> x -> n2 -> n     a short path returned by path_finder()

    The two sides are merged when splice_align.node_is_similar() accepts them
    (the surviving node's sequence becomes the weighted consensus), or when
    splice_align.node_is_skipping() reports that one side skips part of the
    other.  Otherwise the pair is left untouched and a debug message is logged.

    G, path_d and mermap are all modified in place; nothing is returned.

    NOTE: relies on G.predecessors() returning an indexable list and on
    G.nodes_iter(), i.e. the NetworkX 1.x graph API.
    """

    def has_common_unique_pred(n1, n2):
        """
        True when n1 and n2 each have exactly one predecessor and it is the
        same node:
         pred -> n1 -> common succ
         pred -> n2 -> common succ
        """
        preds1 = G.predecessors(n1)
        preds2 = G.predecessors(n2)
        return len(preds1) == 1 and len(preds2) == 1 and preds1[0] == preds2[0]

    def traceback_path(n1, n2):
        """
        Look for a common pred where
         pred -> n1
         pred -> some_node -> n2

        Returns whatever path_finder() returns; callers treat None as
        "no such path" and anything else as a list of nodes.
        """
        assert G.in_degree(n1) == 1
        pred = G.predecessors(n1)[0]
        return path_finder(G, n2, pred, [n2], 2)

    def replace_node(n_to_del, n_to_replace_with):
        """
        Replace <n_to_del> with <n_to_replace_with>:
        1. copy every outgoing edge of <n_to_del> onto <n_to_replace_with>
           unless that edge already exists (an existing edge is left as is);
           without this, a path rewritten in step 3 could step along an edge
           that is no longer in G
        2. remove <n_to_del> from G and mermap
        3. swap the first occurrence of <n_to_del> in each path of path_d
        """
        for succ in list(G.successors(n_to_del)):
            if not G.has_edge(n_to_replace_with, succ):
                G.add_edge(n_to_replace_with, succ, weight=G.get_edge_data(n_to_del, succ)["weight"])
        G.remove_node(n_to_del)
        del mermap[n_to_del]
        for k in path_d:
            if n_to_del in path_d[k]:
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i + 1:]

    def replace_path_w_node(path_to_del, n_to_replace_with, common_succ):
        """
        Replace the node path <path_to_del> with the single node
        <n_to_replace_with> in G and in every path of path_d.

        <common_succ> is accepted for readability at the call sites but is not
        used here.
        """
        # first, it's possible that the last node in <path_to_del> has other successors
        # ex: path_to_del = x1 -> x2 -> common_succ
        #          also has       x2 -> another node x3
        # so must change to n_to_replace_with -> x3
        last_n_in_path = path_to_del[-1]
        for _, t, data in G.out_edges(last_n_in_path, data=True):
            if G.has_edge(n_to_replace_with, t):
                G[n_to_replace_with][t]["weight"] += data["weight"]
            else:
                G.add_edge(n_to_replace_with, t, weight=data["weight"])

        # for every predecessor of path_to_del, replace with n_to_replace_with
        # ex:        pred -> x1 -> x2 -> ...
        # becomes    pred -> n_to_replace_with -> ...
        # NOTE(review): unlike the outgoing edges above, an already existing
        # pred -> n_to_replace_with edge gets its weight overwritten here,
        # not summed -- confirm this is intended.
        for pred in G.predecessors(path_to_del[0]):
            G.add_edge(pred, n_to_replace_with, weight=G.get_edge_data(pred, path_to_del[0])["weight"])

        # rewrite every path that runs through (a prefix of) path_to_del
        path_len = len(path_to_del)
        for k in path_d:
            if path_to_del[0] in path_d[k]:
                i = path_d[k].index(path_to_del[0])
                m = min(i + path_len, len(path_d[k]))
                if path_d[k][i:m] == path_to_del[:(m - i)]:
                    path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i + path_len:]

        # now delete all non branching nodes in path_to_del that no path uses any more
        # note: the list must be complete BEFORE the first removal, because
        # G.remove_node dynamically changes the degrees!
        nodes_in_path = set()
        for path in path_d.itervalues():
            nodes_in_path.update(path)
        safe_to_remove = [x for x in path_to_del if G.out_degree(x) <= 1 and x not in nodes_in_path]
        for node in safe_to_remove:
            log.debug("safe to delete from G: {0}".format(node))
            G.remove_node(node)
            del mermap[node]

    in_same_path = make_in_same_path(path_d)
    # snapshot of merge points; G is modified while we walk this list
    cands = [n for n in G.nodes_iter() if G.in_degree(n) >= 2]
    for n in cands:
        if n not in G:
            continue  # deleted in loop below
        _pred = G.predecessors(n)
        if len(_pred) >= 2:
            for i, n1 in enumerate(_pred):
                if n1 not in G:
                    continue
                for n2 in _pred[i + 1:]:
                    # skip pairs already collapsed, or used together in one path
                    if n1 not in G or n2 not in G or n1 in in_same_path[n2]:
                        continue

                    if has_common_unique_pred(n1, n2):
                        # what is known: common pred -> n1 -> common succ
                        #                common pred -> n2 -> common succ
                        # so they must share the same first (KMER_SIZE-1) and the last (KMER_SIZE-1)
                        if DEBUG_FLAG:
                            pdb.set_trace()
                        if splice_align.node_is_similar(mermap[n1], mermap[n2]):
                            # keep n1, with a consensus weighted by edge support
                            mermap[n1] = splice_align.get_consensus_through_voting(
                                mermap[n1],
                                G.get_edge_data(n1, n)["weight"],
                                mermap[n2],
                                G.get_edge_data(n2, n)["weight"],
                            )
                            replace_node(n_to_del=n2, n_to_replace_with=n1)
                        else:
                            flag, is_skipped = splice_align.node_is_skipping(
                                mermap[n1], mermap[n2], cc_settings.KMER_SIZE
                            )
                            if is_skipped:
                                if flag == "SEQ1":  # seq1 is the one with retained exon
                                    replace_node(n_to_del=n2, n_to_replace_with=n1)
                                else:
                                    replace_node(n_to_del=n1, n_to_replace_with=n2)
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                    else:
                        if G.in_degree(n1) == 1:
                            p2 = traceback_path(n1, n2)
                            if p2 is not None:
                                # common pred -> n1 -> common succ
                                # common pred -> another node -> n2 -> common succ
                                s1 = mermap[n1]
                                s2 = stitch_string_from_path(p2, mermap)
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n1] = splice_align.get_consensus_through_voting(
                                        s1, G.get_edge_data(n1, n)["weight"], s2, G.get_edge_data(n2, n)["weight"]
                                    )
                                    replace_path_w_node(p2, n1, common_succ=n)

                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        log.debug("path collapse possible {0},{1}".format(n1, p2))
                                        mermap[n1] = s1 if flag == "SEQ1" else s2
                                        replace_path_w_node(p2, n1, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        elif G.in_degree(n2) == 1:
                            # mirror image of the branch above: n2 is the single node,
                            # the path leading to n1 is what gets replaced
                            p1 = traceback_path(n2, n1)
                            if p1 is not None:
                                s1 = stitch_string_from_path(p1, mermap)
                                s2 = mermap[n2]
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n2] = splice_align.get_consensus_through_voting(
                                        s1, G.get_edge_data(n1, n)["weight"], s2, G.get_edge_data(n2, n)["weight"]
                                    )
                                    log.debug("path collapse possible: {0},{1}".format(p1, n2))
                                    replace_path_w_node(p1, n2, common_succ=n)
                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        mermap[n2] = s1 if flag == "SEQ1" else s2
                                        replace_path_w_node(p1, n2, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        else:
                            log.debug("should NOT collapse {0},{1}".format(n1, n2))
Beispiel #4
0
def find_source_bubbles(G, path_d, mermap):
    """
    Collapse a source node into a similar branch that joins the same node.

    Find all cases where
       src1 --> n3
       ...> path1 --> n3
    and that
    <i> src1 has n3 as its only successor, and the last node <n> of path1 is
        either itself a source or has exactly one outgoing edge
    <ii> src1 and <n> are never used in the same path
    <iii> src1 and path1 are similar when compared from their 3' end (both
        sequences are reversed and trimmed to the shorter length before
        splice_align.node_is_similar() is called)

    path1 is obtained by walking back from <n> for as long as the current node
    has exactly one predecessor and that predecessor has at most one outgoing
    edge; it may consist of a single source node.

    When the two are similar:
      * both are single sources: the one with the shorter sequence is
        replaced by the other (src1 wins ties)
      * path1 is a real path and its stitched sequence is longer than src1:
        src1 is replaced by path1
      * otherwise nothing is collapsed

    G, path_d and mermap are modified in place; nothing is returned.

    NOTE: relies on G.predecessors()/G.successors() returning lists and on
    G.nodes_iter(), i.e. the NetworkX 1.x graph API.
    """

    def traceback(cur):
        """
        Retrace path of n1 -> n2 ... -> cur
        where n1, n2....all have exactly one outgoing edge
        and every node after n1 has exactly one predecessor.
        Returned in forward order, ending with <cur>.
        """
        acc = []
        while True:
            acc.append(cur)
            preds = G.predecessors(cur)
            if len(preds) == 0 or len(preds) > 1 or G.out_degree(preds[0]) > 1:
                break
            cur = preds[0]
        return acc[::-1]

    def replace_node(n_to_del, path_to_replace_with):
        """
        Substitute the first occurrence of <n_to_del> in every path of path_d
        with the nodes of <path_to_replace_with>, then drop <n_to_del> from G
        and mermap.
        """
        for k in path_d:
            if n_to_del in path_d[k]:
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + path_to_replace_with + path_d[k][i + 1:]
        G.remove_node(n_to_del)
        del mermap[n_to_del]

    in_same_path = make_in_same_path(path_d)
    # snapshot of the sources as they are now; nodes get removed in the loop
    sources = [n for n in G.nodes_iter() if G.in_degree(n) == 0]
    # set copy for the membership tests inside the nested loop
    source_set = set(sources)
    for src1 in sources:
        if src1 not in G:
            continue  # deleted in the loop below
        succ = G.successors(src1)
        if len(succ) == 1:
            n3 = succ[0]
            cands = G.predecessors(n3)
            for n in cands:
                if src1 not in G:
                    break  # deleted, jump out of this
                if n not in G:
                    continue  # deleted in the loop below
                if n != src1 and n not in in_same_path[src1] and (n in source_set or G.out_degree(n) == 1):
                    t = traceback(n)
                    seq1 = mermap[src1]
                    seq2 = stitch_string_from_path(t, mermap)
                    minlen = min(len(seq1), len(seq2))
                    # we know that seq1 and seq2 both have the same successor so they must share the same last (KMER_SIZE-1) suffix
                    if DEBUG_FLAG:
                        pdb.set_trace()
                    # compare from the shared end backwards, over the shorter length
                    if splice_align.node_is_similar(seq1[::-1][:minlen], seq2[::-1][:minlen]):
                        # should collapse src1 into n
                        # to do so:
                        # (1) if both are sources, replace the shorter src with the longer src
                        # (2) if one is src other is path, replace the src with path
                        # -- delete the replaced node from G
                        # -- for all path in path_d that uses the deleted node, update with replacement
                        log.debug("should collapse {0},{1}".format(src1, t))
                        if len(t) == 1 and t[0] in source_set:  # both are sources
                            if len(mermap[t[0]]) > len(mermap[src1]):
                                replace_node(n_to_del=src1, path_to_replace_with=t)
                            else:  # src1 is at least as long, use src1
                                replace_node(t[0], [src1])
                        else:  # src1 is a source, <t> is not a source node but a path
                            if len(seq2) > len(seq1):  # let's just "tuck" src1 into <t>
                                replace_node(src1, t)
                            else:  # we don't know if nodes in <t> branch out so we can't collapse them
                                log.debug("should NOT collapse {0},{1}".format(src1, t))
                    else:  # src1 and <t> are not similar enough, DO NOT collapse
                        log.debug("should NOT collapse {0},{1}".format(src1, t))
Beispiel #5
0
def find_bubbles(G, path_d, mermap):
    """
    Collapse bubbles in G, updating G, path_d and mermap in place.

    We find all cases where n' -> n1 -> n3
                            n' -> n2 -> n3
    (that is, n3 has > 1 incoming) and additionally the case where one side
    is a short path found by path_finder():
                            n' -> n1 -> n3
                            n' -> x -> n2 -> n3
    <i> make sure that n1 and n2 is not used in the same path  (which indicates in-gene repeat?)
    <ii> retrace n1, n2 to make sure that they are largely similar
         (splice_align.node_is_similar), or that one skips part of the
         other (splice_align.node_is_skipping)

    Nothing is returned.
    """
    def has_common_unique_pred(n1, n2):
        """
        True when n1 and n2 each have exactly one predecessor and it is the
        same node.

        Case:
         pred -> n1 -> common succ
         pred -> n2 -> common succ
        """
        preds1 = G.predecessors(n1)
        preds2 = G.predecessors(n2)
        return len(preds1) == 1  and len(preds2) == 1 and preds1[0] == preds2[0]

    def traceback_path(n1, n2):
        """
        Find a common pred where
         pred -> n1
         pred -> some_node -> n2

        Returns the result of path_finder(); the callers below treat None as
        "not found" and any other value as a list of nodes.
        """
        assert G.in_degree(n1) == 1
        pred = G.predecessors(n1)[0]
        return path_finder(G, n2, pred, [n2], 2)

    def replace_node(n_to_del, n_to_replace_with):
        """
        Replacing <n_to_del> with <n_to_replace_with>
        1. copy each outgoing edge of <n_to_del> (with its weight) onto
           <n_to_replace_with>, unless <n_to_replace_with> already has an
           edge to that successor (an existing edge is left unchanged)

           ex: n' -> n1 -> n3
               n' -> n2 -> n3
               n' -> n2 -> n4
           (when n2 is deleted in favour of n1, add n1 -> n4 if not already exists)
        2. remove <n_to_del> from graph G and from mermap
        3. replace the first occurrence of <n_to_del> in each path of path_d
        """
        #pdb.set_trace()
        for n in G.successors_iter(n_to_del):
            if not G.has_edge(n_to_replace_with, n):
                G.add_edge(n_to_replace_with, n, weight=G.get_edge_data(n_to_del, n)['weight'])
        G.remove_node(n_to_del)
        del mermap[n_to_del]
        for k in path_d:
            if n_to_del in path_d[k]:
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i+1:]

    def replace_path_w_node(path_to_del, n_to_replace_with, common_succ):
        """
        Replace the node path <path_to_del> with the single node
        <n_to_replace_with>, in G and in every path of path_d.

        <common_succ> is not used inside this helper.
        """
        # first, it's possible that the last node in <path_to_del> has other successors
        # ex: path_to_del = x1 -> x2 -> common_succ
        #          also has       x2 -> another node x3
        # so must change to n_to_replace_with -> x3
        # (weights are summed when n_to_replace_with already has that edge)
        last_n_in_path = path_to_del[-1]
        for s,t,data in G.out_edges(last_n_in_path, data=True):
            if G.has_edge(n_to_replace_with, t):
                G[n_to_replace_with][t]['weight'] += data['weight']
            else:
                G.add_edge(n_to_replace_with, t, weight=data['weight'])

        # for every predecessor of path_to_del, replace with n_to_replace_with
        # ex:        pred -> x1 -> x2 -> ...
        # becomes    pred -> n_to_replace_with -> ...
        # NOTE(review): if pred -> n_to_replace_with already exists, its weight
        # appears to be overwritten here rather than summed as above -- confirm.
        for pred in G.predecessors(path_to_del[0]):
            G.add_edge(pred, n_to_replace_with, weight=G.get_edge_data(pred, path_to_del[0])['weight'])

        # rewrite every path whose nodes from the first occurrence of
        # path_to_del[0] onwards match path_to_del (or a prefix of it, when
        # the path ends early)
        path_len = len(path_to_del)
        for k in path_d:
            if path_to_del[0] in path_d[k]:
                i = path_d[k].index(path_to_del[0])
                m = min(i+path_len, len(path_d[k]))
                if path_d[k][i:m] == path_to_del[:(m-i)]:
                    path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i+path_len:]
        # now delete all non branching nodes in path_to_del
        # note: this filter must be done simultaneously because G.remove_node will dynamically change the degrees!
        # NOTE(review): this depends on filter() returning a fully built list
        # (Python 2 behaviour); a lazy filter would interleave with remove_node.

        # every node still referenced by some path must be kept
        nodes_in_path = set()
        for path in path_d.itervalues():
            nodes_in_path = nodes_in_path.union(path)
        safe_to_remove = filter(lambda x: G.out_degree(x)<=1 and x not in nodes_in_path, path_to_del)
        #safe_to_remove = filter(lambda x: G.in_degree(x)<=1 and G.out_degree(x)<=1 and x not in nodes_in_path, path_to_del)
        for node in safe_to_remove:
            log.debug("safe to delete from G: {0}".format(node))
            G.remove_node(node)
            del mermap[node]
    

    # in_same_path is built once from the initial path_d and not refreshed
    # as paths are rewritten below
    in_same_path = make_in_same_path(path_d)
    # merge points: nodes with at least two incoming edges
    cands = filter(lambda n: G.in_degree(n)>=2, G.nodes_iter())
    for n in cands:
        if n not in G: continue # deleted in loop below
        _pred = G.predecessors(n)
        if len(_pred) >= 2:
            # look at every unordered pair (n1, n2) of predecessors of n
            for i, n1 in enumerate(_pred):
                if n1 not in G: continue
                for n2 in _pred[i+1:]:
                    # skip pairs already collapsed, or used together in one path
                    if n1 not in G or n2 not in G or n1 in in_same_path[n2]: continue

                    if has_common_unique_pred(n1, n2):
                        # what is known: common pred -> n1 -> common succ
                        #                common pred -> n2 -> common succ
                        # so they must share the same first (KMER_SIZE-1) and the last (KMER_SIZE-1)
                        if DEBUG_FLAG:
                            pdb.set_trace()
                        if splice_align.node_is_similar(mermap[n1], mermap[n2]):
                            # keep n1; its sequence becomes the consensus, with
                            # the edge weights into n passed in as vote weights
                            mermap[n1] = splice_align.get_consensus_through_voting(mermap[n1],\
                                                                                     G.get_edge_data(n1, n)['weight'],\
                                                                                     mermap[n2],\
                                                                                     G.get_edge_data(n2, n)['weight'])
                            replace_node(n_to_del=n2, n_to_replace_with=n1)
                        else:
                            flag, is_skipped = splice_align.node_is_skipping(mermap[n1], mermap[n2], cc_settings.KMER_SIZE)
                            if is_skipped:
                                if flag == "SEQ1":  # seq1 is the one with retained exon
                                    replace_node(n_to_del=n2, n_to_replace_with=n1)
                                else:
                                    replace_node(n_to_del=n1, n_to_replace_with=n2)
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                    else:
                        # no shared unique pred: try "single node vs. short path"
                        if G.in_degree(n1) == 1:
                            p2 = traceback_path(n1, n2)
                            if p2 is not None:
                                # common pred -> n1 -> common succ
                                # common pred -> another node -> n2 -> common succ
                                s1 = mermap[n1]
                                s2 = stitch_string_from_path(p2, mermap)
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n1] = splice_align.get_consensus_through_voting(s1,\
                                                                        G.get_edge_data(n1, n)['weight'],\
                                                                        s2,
                                                                        G.get_edge_data(n2, n)['weight'])
                                    replace_path_w_node(p2, n1, common_succ=n)

                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        log.debug("path collapse possible {0},{1}".format(n1, p2))
                                        # n1 survives and takes s1 when flag is SEQ1, s2 otherwise
                                        mermap[n1] = s1 if flag == 'SEQ1' else s2
                                        replace_path_w_node(p2, n1, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        elif G.in_degree(n2) == 1:
                            # mirror image of the branch above: n2 is the single
                            # node and the path ending in n1 is what gets replaced
                            p1 = traceback_path(n2, n1)
                            if p1 is not None:
                                s1 = stitch_string_from_path(p1, mermap)
                                s2 = mermap[n2]
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n2] = splice_align.get_consensus_through_voting(s1,\
                                                                                             G.get_edge_data(n1, n)['weight'],\
                                                                                             s2,
                                                                                             G.get_edge_data(n2, n)['weight'])
                                    log.debug("path collapse possible: {0},{1}".format(p1, n2))
                                    replace_path_w_node(p1, n2, common_succ=n)
                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        # n2 survives and takes s1 when flag is SEQ1, s2 otherwise
                                        mermap[n2] = s1 if flag=='SEQ1' else s2
                                        replace_path_w_node(p1, n2, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        else:
                            log.debug("should NOT collapse {0},{1}".format(n1, n2))
Beispiel #6
0
def find_source_bubbles(G, path_d, mermap):
    """
    Tuck source nodes into a similar branch that joins the same successor.

    Find all cases where
       src1 --> n3
       ...> path1 --> n3
    and that
    <i> src1 has exactly one outgoing edge (to n3); the node <n> at the end of
        path1 is another predecessor of n3 that is either a source itself or
        has exactly one outgoing edge
    <ii> src1 and <n> never occur in the same path
    <iii> src1 and path1 are similar: both sequences are reversed, cut to the
        length of the shorter one and handed to splice_align.node_is_similar()

    path1: can also be a source

    Outcome when <iii> holds:
      * path1 is a single source node -> the node with the shorter sequence
        is replaced by the other one (src1 is kept on equal length)
      * path1 is a longer path whose stitched sequence is longer than src1
        -> src1 is replaced by path1
      * anything else -> no change

    G, path_d and mermap are modified in place; the function returns None.

    NOTE: written against list-returning G.predecessors()/G.successors() and
    G.nodes_iter(), i.e. the NetworkX 1.x graph API.
    """
    def traceback(cur):
        """
        Retrace path of n1 -> n2 ... -> cur
        where n1, n2....all have exactly one outgoing edge

        The walk stops as soon as the current node has zero or several
        predecessors, or its predecessor branches out.
        """
        acc = []
        while True:
            acc.append(cur)
            preds = G.predecessors(cur)
            if len(preds) == 0 or len(preds) > 1 or G.out_degree(preds[0]) > 1:
                break
            cur = preds[0]
        return acc[::-1]

    def replace_node(n_to_del, path_to_replace_with):
        """
        In every path of path_d, swap the first occurrence of <n_to_del> for
        the nodes in <path_to_replace_with>; then delete <n_to_del> from G
        and from mermap.
        """
        for k in path_d:
            if n_to_del in path_d[k]:
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + path_to_replace_with + path_d[k][i+1:]
        G.remove_node(n_to_del)
        del mermap[n_to_del]

    in_same_path = make_in_same_path(path_d)
    # Fixed snapshot of the initial sources (G shrinks during the loop), plus a
    # set of the same nodes so the membership tests below are not list scans.
    sources = [n for n in G.nodes_iter() if G.in_degree(n) == 0]
    source_set = set(sources)

    for src1 in sources:
        if src1 not in G:
            continue  # deleted in the loop below
        succ = G.successors(src1)
        if len(succ) != 1:
            continue
        n3 = succ[0]

        for n in G.predecessors(n3):
            if src1 not in G:
                break  # deleted, jump out of this
            if n not in G:
                continue  # deleted in the loop below
            if n == src1 or n in in_same_path[src1]:
                continue
            if not (n in source_set or G.out_degree(n) == 1):
                continue

            t = traceback(n)
            seq1 = mermap[src1]
            seq2 = stitch_string_from_path(t, mermap)
            minlen = min(len(seq1), len(seq2))
            # we know that seq1 and seq2 both have the same successor so they must share the same last (KMER_SIZE-1) suffix
            if DEBUG_FLAG:
                pdb.set_trace()
            if not splice_align.node_is_similar(seq1[::-1][:minlen], seq2[::-1][:minlen]):
                # src1 and <t> are not similar enough, DO NOT collapse
                log.debug("should NOT collapse {0},{1}".format(src1, t))
                continue

            # should collapse src1 into n
            # to do so:
            # (1) if both are sources, replace the shorter src with the longer src
            # (2) if one is src other is path, replace the src with path
            # -- delete the replaced node from G
            # -- for all path in path_d that uses the deleted node, update with replacement
            log.debug("should collapse {0},{1}".format(src1, t))
            if len(t) == 1 and t[0] in source_set:  # both are sources
                if len(mermap[t[0]]) > len(mermap[src1]):
                    replace_node(n_to_del=src1, path_to_replace_with=t)
                else:  # src1 is at least as long, use src1
                    replace_node(t[0], [src1])
            elif len(seq2) > len(seq1):
                # src1 is a source, <t> is a path: let's just "tuck" src1 into <t>
                replace_node(src1, t)
            else:
                # we don't know if nodes in <t> branch out so we can't collapse them
                log.debug("should NOT collapse {0},{1}".format(src1, t))