Esempio n. 1
0
def combine_nodes(graph: GraphContainer):
    """
    Combine adjacent nodes with the same sequence labels

    A pair (n1, n2) is replaced by one new node when all of these hold:

    * n1 has exactly one out-edge, it leads to n2, and n2 has exactly one
      in-edge
    * both are on the same chromosome and n2 starts right after n1 ends
    * both carry equal "sequences" labels
    * both are reference nodes, or both are alternate nodes

    The in-edges of n1 and the out-edges of n2 are re-attached to the new
    node, then n1 and n2 are deleted. Candidates come from a snapshot of the
    node list taken on entry, so a freshly merged node is never visited as
    n1 in the same call (it can still be picked up as n2 of a later pair).

    :param graph: graph to work on (modified in place)
    """
    # Identities of nodes deleted by an earlier merge in this call. They are
    # still in the snapshot below but no longer belong to the graph, so they
    # must not be queried or merged again.
    merged_away = set()
    for n1 in list(graph.nodes.values()):
        if id(n1) in merged_away:
            continue
        out_edges = list(graph.outEdges(n1))
        if len(out_edges) != 1:
            continue  # Pair of nodes with no other in/out edges
        n2 = graph.nodes[out_edges[0]["to"]]
        if len(list(graph.inEdges(n2))) != 1:
            continue
        if not (n1["chrom"] == n2["chrom"] and n1["end"] + 1 == n2["start"]):
            continue  # nodes must be adjacent
        haplos = n1["sequences"]
        if n2["sequences"] != haplos:
            continue  # only collapse nodes with same haplotypes
        if ("reference" in n1) != ("reference" in n2):
            continue  # nodes must be of same type
        if "reference" in n1:
            node = graph.add_refNode(n1["chrom"], n1["start"], n2["end"], haplos)
        else:
            # Alternate nodes carry their own bases; concatenate them in order.
            node = graph.add_altNode(
                n1["chrom"], n1["start"], n2["end"], n1["sequence"] + n2["sequence"], haplos)
        # NOTE(review): "Combinding" is a typo; kept as-is in case log
        # consumers match on the existing message.
        logging.info("Combinding %s and %s", n1['name'], n2['name'])
        # Snapshot the edge lists because edges are added while looping.
        for e in list(graph.inEdges(n1)):
            graph.add_edge(graph.nodes[e["from"]], node, e["sequences"])
        for e in list(graph.outEdges(n2)):
            graph.add_edge(node, graph.nodes[e["to"]], e["sequences"])
        graph.del_node(n1)
        graph.del_node(n2)
        merged_away.update((id(n1), id(n2)))
Esempio n. 2
0
def split_alt_nodes(graph: GraphContainer, max_len=300, padding_len=150):
    """
    Split long alternate nodes

    Every alternate node whose sequence is longer than max_len is replaced
    by two alternate nodes that hold only the first and the last padding_len
    bases of that sequence; the bases in between are dropped. Both new nodes
    are created with the chrom/start/end and sequence labels of the original
    node. In-edges of the original go to the first new node, out-edges leave
    from the second, and no edge is added between the two new nodes.

    :param graph: graph to work on (modified in place)
    :param max_len: max sequence length of an alternate node that is left unsplit
    :param padding_len: length of sequence to keep at each end
    """
    assert max_len >= 2 * padding_len
    for node in list(graph.altNodes()):
        sequence = node["sequence"]
        if len(sequence) <= max_len:
            continue
        logging.info("Splitting long ALT node: %s", node['name'])

        head = sequence[:padding_len]
        # Use an explicit start index: sequence[-padding_len:] would return
        # the whole sequence (instead of nothing) when padding_len == 0.
        tail = sequence[len(sequence) - padding_len:]

        n1 = graph.add_altNode(node["chrom"], node["start"], node["end"],
                               head, node["sequences"])
        n2 = graph.add_altNode(node["chrom"], node["start"], node["end"],
                               tail, node["sequences"])

        # Snapshot the edge lists because edges are added while looping.
        for e in list(graph.inEdges(node)):
            graph.add_edge(graph.nodes[e["from"]], n1, e["sequences"])
        for e in list(graph.outEdges(node)):
            graph.add_edge(n2, graph.nodes[e["to"]], e["sequences"])
        graph.del_node(node)
Esempio n. 3
0
def split_node(graph: GraphContainer, node, breakpoints):
    """
    Split a node at a set of breakpoints and link new (sub-)nodes
    Used to link to new variant nodes later
    Modifies graph and deletes node after splitting

    The sub-nodes are chained with edges in order; in-edges of the original
    node are re-attached to the first sub-node and out-edges to the last.
    The original node is deleted unless one of the sub-nodes has the same
    name (i.e. no real split happened).

    :param graph: graph to work on (modified in place)
    :param node: node to split
    :param breakpoints: offsets relative to node["start"] at which a new
                        sub-node begins; each must lie in
                        [0, node["end"] - node["start"] + 1]. Duplicates
                        are ignored and order does not matter.
    :returns Created sub-nodes as a list in positional order. If breakpoints
             is empty, the graph is untouched and the input node itself is
             returned (not wrapped in a list).
    """
    if not breakpoints:
        return node
    breakpoints = sorted(set(breakpoints))
    logging.debug("Splitting %s at %s", node['name'], breakpoints)

    # Validate every breakpoint before touching the graph, so that a bad
    # value cannot leave a half-split node behind.
    length = node["end"] - node["start"] + 1
    assert all(0 <= p <= length for p in breakpoints)

    is_ref = "reference" in node
    # Pair each segment's start offset with its (exclusive) end offset.
    # None marks the last segment, which runs to the end of the node.
    segments = zip([0] + breakpoints, breakpoints + [None])
    nodes = []
    for lo, hi in segments:
        seg_start = node["start"] + lo
        seg_end = node["end"] if hi is None else node["start"] + hi - 1
        if is_ref:
            nodes.append(
                graph.add_refNode(node["chrom"], seg_start, seg_end,
                                  node["sequences"]))
        else:
            # Slicing with hi=None keeps everything after the last breakpoint,
            # independent of the node's coordinate span.
            nodes.append(
                graph.add_altNode(node["chrom"], seg_start, seg_end,
                                  node["sequence"][lo:hi], node["sequences"]))

    # Connect nodes. Snapshot the edge lists because edges are added while
    # looping over them.
    for e in list(graph.inEdges(node)):
        graph.add_edge(graph.nodes[e["from"]], nodes[0], e["sequences"])
    for e in list(graph.outEdges(node)):
        graph.add_edge(nodes[-1], graph.nodes[e["to"]], e["sequences"])
    for (n1, n2) in zip(nodes[:-1], nodes[1:]):
        graph.add_edge(n1, n2)
    # Delete original node, unless identical to new node (no split)
    if all(n['name'] != node['name'] for n in nodes):
        graph.del_node(node)
    return nodes
Esempio n. 4
0
def split_ref_nodes(graph: GraphContainer, max_len=300, padding_len=150):
    """
    Shorten long reference nodes by keeping only their two flanks

    A reference node that spans more than max_len positions is replaced by
    two reference nodes covering its first and its last padding_len
    positions. Edges that entered the original node enter the leading flank,
    edges that left it leave the trailing flank; no edge is added between
    the two flanks.

    :param graph: graph to work on (modified in place)
    :param max_len: longest reference node span that is left untouched
    :param padding_len: number of positions kept at each end of a split node
    """
    assert max_len >= 2 * padding_len
    for long_ref in list(graph.refNodes()):
        span = long_ref["end"] - long_ref["start"] + 1
        if span > max_len:
            logging.info("Splitting long REF node: %s", long_ref['name'])
            chrom = long_ref["chrom"]
            labels = long_ref["sequences"]

            # Leading flank: first padding_len positions of the node.
            head = graph.add_refNode(
                chrom, long_ref["start"], long_ref["start"] + padding_len - 1, labels)
            # Trailing flank: last padding_len positions of the node.
            tail = graph.add_refNode(
                chrom, long_ref["end"] - padding_len + 1, long_ref["end"], labels)

            # Re-route the original node's edges to the flanks, then drop it.
            for incoming in list(graph.inEdges(long_ref)):
                graph.add_edge(graph.nodes[incoming["from"]], head, incoming["sequences"])
            for outgoing in list(graph.outEdges(long_ref)):
                graph.add_edge(tail, graph.nodes[outgoing["to"]], outgoing["sequences"])
            graph.del_node(long_ref)
Esempio n. 5
0
def remove_empty_nodes(graph: GraphContainer):
    """
    Drop nodes that carry no sequence and bridge their neighbours

    A node is kept if it is a reference node with start <= end, or if it has
    a non-empty "sequence". Every other node is deleted, after each of its
    in-edges has been paired with each of its out-edges and replaced by a
    direct edge from the predecessor to the successor.

    :param graph: graph to work on (modified in place)
    """
    for node in list(graph.nodes.values()):
        spans_reference = "reference" in node and node["start"] <= node["end"]
        has_bases = node.get("sequence", "") != ""
        if spans_reference or has_bases:
            continue
        logging.info("Removing empty node %s", node['name'])

        # All labels seen on any edge entering / leaving the node.
        labels_entering = []
        for edge in graph.inEdges(node):
            labels_entering.extend(edge["sequences"])
        labels_leaving = []
        for edge in graph.outEdges(node):
            labels_leaving.extend(edge["sequences"])

        for incoming in list(graph.inEdges(node)):
            for outgoing in list(graph.outEdges(node)):
                in_labels = incoming["sequences"]
                out_labels = outgoing["sequences"]
                # A label goes on the bridging edge if it is on both edges,
                # or if it is on one of them and appears on no edge at all
                # on the opposite side of the node (direction undetermined).
                on_both = in_labels.intersection(out_labels)
                in_only = in_labels.difference(labels_leaving)
                out_only = out_labels.difference(labels_entering)
                bridged = on_both.union(in_only.union(out_only))
                graph.add_edge(
                    graph.nodes[incoming["from"]], graph.nodes[outgoing["to"]], bridged)
        graph.del_node(node)