Esempi in Python per GraphContainer, esempi in Python per grm.vcfgraph.graphContainer.GraphContainer

Esempio n. 1

0

Mostra file

def get_path(graph: GraphContainer, sequence):
    """
    Enumerate node-name paths that together cover every edge of one haplotype.

    Edges are tagged with a temporary "mark" key once they have been walked;
    marks left over from a previous call are cleared first. The marks set by
    this call stay on the edges after it returns.

    :param graph: graph to work on
    :param sequence: haplotype (sequence label) whose edges should be covered
    :return: list of paths, each path being a list of node names
    """
    ordered_nodes, all_edges = graph.topological_sort()
    # Start from a clean slate: forget marks from earlier traversals
    for known_edge in all_edges:
        known_edge.pop("mark", None)

    def walk(edge, prefix):
        """Follow `edge` depth-first and return every maximal path below it."""
        target = graph.nodes[edge["to"]]
        extended = prefix + [target["name"]]
        edge["mark"] = True
        found = []
        for following in graph.outEdges(target, sequence):
            if "mark" in following:
                continue
            found.extend(walk(following, extended))
        # No unvisited continuation: the path ends at this node
        return found if found else [extended]

    covering = []
    for origin in ordered_nodes:
        for first_edge in graph.outEdges(origin, sequence):
            if "mark" in first_edge:
                continue
            covering += walk(first_edge, [origin["name"]])
    return covering

Esempio n. 2

0

Mostra file

def add_source_sink(graph: GraphContainer,
                    source_name="source",
                    sink_name="sink"):
    """
    Make sure the graph has a source and a sink node and hook up loose ends.

    Missing source/sink nodes are created with a 10-base "N" sequence. Every
    other node without incoming edges is linked from the source, and every
    other node without outgoing edges is linked to the sink.

    :param graph: graph to work on (modified in place)
    :param source_name: name of source node
    :param sink_name: name of sink node
    """
    terminals = (source_name, sink_name)
    for terminal in terminals:
        if terminal not in graph.nodes:
            graph.nodes[terminal] = {
                "name": terminal,
                "sequence": "N" * 10
            }

    # Attach nodes lacking incoming/outgoing edges to source/sink
    for candidate in graph.nodes.values():
        if candidate["name"] in terminals:
            continue
        has_incoming = any(graph.inEdges(candidate))
        if not has_incoming:
            logging.info("Linking %s from source", candidate['name'])
            graph.add_edge(graph.nodes[source_name], candidate)
        has_outgoing = any(graph.outEdges(candidate))
        if not has_outgoing:
            logging.info("Linking %s to sink", candidate['name'])
            graph.add_edge(candidate, graph.nodes[sink_name])

Esempio n. 3

0

Mostra file

def remove_redundant_edge_labels(graph: GraphContainer):
    """
    Strip haplotype labels from edges where they carry no extra information.

    A label is dropped from all out-edges of a node when the node itself
    carries that label, every out-edge carries it, and none of the out-edges
    leads to a node that carries the label.

    :param graph: graph to work on (edge labels are modified in place)
    """
    for origin in graph.nodes.values():
        for label in origin["sequences"]:
            label_is_redundant = all(
                label in edge["sequences"]
                and label not in graph.nodes[edge["to"]]["sequences"]
                for edge in graph.outEdges(origin)
            )
            if not label_is_redundant:
                continue
            for edge in graph.outEdges(origin):
                edge["sequences"].remove(label)

Esempio n. 4

0

Mostra file

def add_variants_node(graph: GraphContainer, node, variants):
    """
    Add variants to one node in the graph.

    The node is split at the boundaries of every variant, then one alt node
    per variant is added and linked between the sub-node ending just before
    the variant and the sub-node starting just after it.

    :param graph: graph to work on (modified in place)
    :param node: node the variants apply to
    :param variants: re-iterable collection of variants; `start`/`end` are
                     offsets relative to the node start and `alt` is the
                     replacement sequence. `start == end + 1` denotes a
                     zero-length variant, which must carry a non-empty `alt`.
    """
    bps = []
    for var in variants:
        assert var.start <= var.end + 1, "variant start lies behind its end"
        assert not (var.start == var.end + 1 and var.alt == ""), \
            "zero-length variant without alt sequence"
        logging.info("%s: %s", node, var)
        bps.extend((var.start, var.end + 1))
    if not bps:
        # No variants: nothing to split or link. Returning here also avoids
        # slicing the unsplit node that split_node hands back in this case.
        return
    nodes = split_node(graph, node, bps)
    # The last sub-node is never followed by a variant, so it is not a
    # candidate for "ends right before a variant".
    nodesEnding = {sub["end"]: sub for sub in nodes[:-1]}
    nodesStarting = {sub["start"]: sub for sub in nodes}
    for var in variants:
        vStart = node["start"] + var.start
        vEnd = node["start"] + var.end
        alt = graph.add_altNode(node["chrom"], vStart, vEnd, var.alt)
        graph.add_edge(nodesEnding[vStart - 1], alt)
        graph.add_edge(alt, nodesStarting[vEnd + 1])

Esempio n. 5

0

Mostra file

def add_graph(graph1: GraphContainer, graph2: GraphContainer):
    """
    Merge graph2 into graph1 in place.

    Reference nodes are copied first, then alt nodes, then edges (re-created
    between the nodes of graph1 with matching names), and finally the paths
    of graph2 are appended to those of graph1.

    :param graph1: graph receiving the content (modified in place)
    :param graph2: graph whose nodes, edges and paths are added
    """
    for ref in graph2.refNodes():
        graph1.add_refNode(
            ref["chrom"], ref["start"], ref["end"], ref["sequences"])

    for alt in graph2.altNodes():
        graph1.add_altNode(
            alt["chrom"], alt["start"], alt["end"], alt["sequence"], alt["sequences"])

    for link in graph2.edges.values():
        origin = graph1.nodes[link["from"]]
        target = graph1.nodes[link["to"]]
        graph1.add_edge(origin, target, link["sequences"])

    graph1.paths += graph2.paths

Esempio n. 6

0

Mostra file

def add_variants_node(graph: GraphContainer, node, variants):
    """
    Add variants to one node in the graph.

    The node is split at the boundaries of every variant, then one alt node
    per variant is added and linked between the sub-node ending just before
    the variant and the sub-node starting just after it.

    :param graph: graph to work on (modified in place)
    :param node: node the variants apply to
    :param variants: re-iterable collection of variants; `start`/`end` are
                     offsets relative to the node start and `alt` is the
                     replacement sequence. `start == end + 1` denotes an
                     insertion, which must carry a non-empty `alt`.
    :raises Exception: if a variant starts behind its end, or is an insertion
                       without insertion sequence
    """
    bps = []
    for var in variants:
        if var.start > var.end + 1:
            raise Exception("Variant start({}) > variane end({})!".format(
                var.start, var.end))
        if var.start == var.end + 1 and not var.alt:
            raise Exception(
                "Variant start({}) == end but no insertion sequence is specified."
                .format(var.start))
        bps.extend((var.start, var.end + 1))
    if not bps:
        # No variants: nothing to split or link. Returning here also avoids
        # slicing the unsplit node that split_node hands back in this case.
        return
    nodes = split_node(graph, node, bps)
    # The last sub-node is never followed by a variant, so it is not a
    # candidate for "ends right before a variant".
    nodesEnding = {sub["end"]: sub for sub in nodes[:-1]}
    nodesStarting = {sub["start"]: sub for sub in nodes}
    for var in variants:
        vStart = node["start"] + var.start
        vEnd = node["start"] + var.end
        alt = graph.add_altNode(node["chrom"], vStart, vEnd, var.alt)
        graph.add_edge(nodesEnding[vStart - 1], alt)
        graph.add_edge(alt, nodesStarting[vEnd + 1])

Esempio n. 7

0

Mostra file

def remove_empty_nodes(graph: GraphContainer):
    """
    Drop nodes that hold no sequence and bridge the gap they leave.

    A node is kept if it is a reference node spanning at least one base
    (start <= end) or if it has a non-empty "sequence". Every other node is
    deleted after each of its in-edges has been joined with each of its
    out-edges into a direct edge.

    :param graph: graph to work on (modified in place)
    """
    for candidate in list(graph.nodes.values()):
        spans_reference = "reference" in candidate and candidate["start"] <= candidate["end"]
        if spans_reference or candidate.get("sequence", "") != "":
            continue
        logging.info("Removing empty node %s", candidate['name'])

        # All labels seen on any in-edge / any out-edge of the node
        labels_in = [
            label
            for edge in graph.inEdges(candidate)
            for label in edge["sequences"]
        ]
        labels_out = [
            label
            for edge in graph.outEdges(candidate)
            for label in edge["sequences"]
        ]

        for incoming in list(graph.inEdges(candidate)):
            for outgoing in list(graph.outEdges(candidate)):
                # A merged edge keeps a label when it is present on both
                # halves, or present on one half while the other side of
                # the node does not mention it at all (undetermined there).
                on_both = incoming["sequences"].intersection(outgoing["sequences"])
                in_only = incoming["sequences"].difference(labels_out)
                out_only = outgoing["sequences"].difference(labels_in)
                haplos = on_both.union(in_only.union(out_only))
                graph.add_edge(
                    graph.nodes[incoming["from"]],
                    graph.nodes[outgoing["to"]],
                    haplos)
        graph.del_node(candidate)

Esempio n. 8

0

Mostra file

def split_alt_nodes(graph: GraphContainer, max_len=300, padding_len=150):
    """
    Replace long alternate nodes by their two flanking pieces.

    An alt node whose sequence is longer than max_len is replaced by two alt
    nodes holding the first and the last padding_len bases of its sequence.
    In-edges are moved to the first piece, out-edges to the second piece; this
    function adds no edge between the two pieces.

    :param graph: graph to work on (modified in place)
    :param max_len: longest alt sequence that is left untouched
    :param padding_len: number of bases kept at each end of a split node
    """
    assert max_len >= 2 * padding_len
    for long_node in list(graph.altNodes()):
        bases = long_node["sequence"]
        if len(bases) <= max_len:
            continue
        logging.info("Splitting long ALT node: %s", long_node['name'])

        chrom = long_node["chrom"]
        begin = long_node["start"]
        finish = long_node["end"]
        head = graph.add_altNode(
            chrom, begin, finish, bases[:padding_len], long_node["sequences"])
        tail = graph.add_altNode(
            chrom, begin, finish, bases[-padding_len:], long_node["sequences"])

        for incoming in list(graph.inEdges(long_node)):
            graph.add_edge(graph.nodes[incoming["from"]], head, incoming["sequences"])
        for outgoing in list(graph.outEdges(long_node)):
            graph.add_edge(tail, graph.nodes[outgoing["to"]], outgoing["sequences"])
        graph.del_node(long_node)

Esempio n. 9

0

Mostra file

def split_ref_nodes(graph: GraphContainer, max_len=300, padding_len=150):
    """
    Replace long reference nodes by their two flanking pieces.

    A reference node spanning more than max_len bases is replaced by two
    reference nodes covering its first and its last padding_len bases.
    In-edges are moved to the first piece, out-edges to the second piece; this
    function adds no edge between the two pieces.

    :param graph: graph to work on (modified in place)
    :param max_len: longest reference span that is left untouched
    :param padding_len: number of bases kept at each end of a split node
    """
    assert max_len >= 2 * padding_len
    for long_node in list(graph.refNodes()):
        span = long_node["end"] - long_node["start"] + 1
        if span <= max_len:
            continue
        logging.info("Splitting long REF node: %s", long_node['name'])

        chrom = long_node["chrom"]
        head_end = long_node["start"] + padding_len - 1
        head = graph.add_refNode(
            chrom, long_node["start"], head_end, long_node["sequences"])
        tail_start = long_node["end"] - padding_len + 1
        tail = graph.add_refNode(
            chrom, tail_start, long_node["end"], long_node["sequences"])

        for incoming in list(graph.inEdges(long_node)):
            graph.add_edge(graph.nodes[incoming["from"]], head, incoming["sequences"])
        for outgoing in list(graph.outEdges(long_node)):
            graph.add_edge(tail, graph.nodes[outgoing["to"]], outgoing["sequences"])
        graph.del_node(long_node)

Esempio n. 10

0

Mostra file

def load_json(json) -> GraphContainer:
    """
    Build a graph object from its JSON representation.

    Nodes with a "reference" entry become reference nodes, nodes with a
    "position" entry become alt nodes, and any other node dictionary is
    stored as is under its name. The graph is checked before it is returned.

    :param json: dictionary of JSON file contents; must provide "nodes",
                 "edges" and "model_name", may provide "paths" and
                 "target_regions"
    :return: the populated GraphContainer
    """
    graph = GraphContainer()

    for entry in json["nodes"]:
        labels = entry.get("sequences", ())
        if "reference" in entry:
            ref_chrom, ref_start, ref_end = parse_region(entry["reference"])
            graph.add_refNode(ref_chrom, ref_start, ref_end, labels, entry["name"])
            continue
        if "position" in entry:
            alt_chrom, alt_start, alt_end = parse_region(entry["position"])
            graph.add_altNode(
                alt_chrom, alt_start, alt_end, entry["sequence"], labels, entry["name"])
            continue
        # Neither reference nor positioned alt node: keep the raw dictionary
        graph.nodes[entry["name"]] = entry

    for link in json["edges"]:
        origin = graph.nodes[link["from"]]
        target = graph.nodes[link["to"]]
        graph.add_edge(origin, target, link.get("sequences", ()))

    graph.name = json["model_name"]
    graph.paths = json.get("paths", [])
    graph.target_regions = json.get("target_regions", [])
    graph.check()
    return graph

Esempio n. 11

0

Mostra file

def combine_nodes(graph: GraphContainer):
    """
    Merge pairs of adjacent nodes that carry the same sequence labels.

    Two nodes are merged when the first has exactly one out-edge, the second
    (the target of that edge) has exactly one in-edge, both lie on the same
    chromosome with the second starting right after the first ends, both
    carry equal sequence labels and both are of the same kind (reference or
    alt). The merged node inherits the in-edges of the first and the
    out-edges of the second node; both originals are deleted.

    :param graph: graph to work on (modified in place)
    """
    for first in list(graph.nodes.values()):
        successors = list(graph.outEdges(first))
        if len(successors) != 1:
            continue  # the pair must not have any other edge between/around it
        second = graph.nodes[successors[0]["to"]]
        if len(list(graph.inEdges(second))) != 1:
            continue

        contiguous = (first["chrom"] == second["chrom"]
                      and first["end"] + 1 == second["start"])
        if not contiguous:
            continue  # nodes must be adjacent

        haplos = first["sequences"]
        if second["sequences"] != haplos:
            continue  # only collapse nodes with same haplotypes

        first_is_ref = "reference" in first
        if first_is_ref != ("reference" in second):
            continue  # nodes must be of same type

        if first_is_ref:
            merged = graph.add_refNode(
                first["chrom"], first["start"], second["end"], haplos)
        else:
            merged = graph.add_altNode(
                first["chrom"], first["start"], second["end"],
                first["sequence"] + second["sequence"], haplos)

        logging.info("Combinding %s and %s", first['name'], second['name'])
        for incoming in list(graph.inEdges(first)):
            graph.add_edge(graph.nodes[incoming["from"]], merged, incoming["sequences"])
        for outgoing in list(graph.outEdges(second)):
            graph.add_edge(merged, graph.nodes[outgoing["to"]], outgoing["sequences"])
        graph.del_node(first)
        graph.del_node(second)

Esempio n. 12

0

Mostra file

File: __init__.py Progetto: pkrusche/paragraph

def convert_vcf(vcf,
                ref,
                target_regions=None,
                ref_node_padding=150,
                ref_node_max_length=1000,
                allele_graph=False,
                simplify=True,
                alt_paths=False,
                alt_splitting=False):
    """
    Convert a single VCF file to a graph dictionary.

    The VCF is first re-written as a bgzip-compressed, indexed temporary file.
    One sub-graph is then built per target region (or a single one for the
    whole file when no regions are given) and merged into the result graph,
    which finally gets source/sink nodes and a reference path.

    :param vcf: file name of the VCF file
    :param ref: reference FASTA file name
    :param target_regions: target region list
    :param ref_node_padding: padding / read length
    :param ref_node_max_length: maximum length before splitting a reference node;
                                a falsy value disables all node splitting
    :param allele_graph: add edges between any compatible allele pair, not just haplotypes from input
    :param simplify: simplify the graph
    :param alt_paths: Add all possible non-reference paths to the graph
    :param alt_splitting: also split long alt nodes (e.g. long insertions);
                          only honoured when ref_node_max_length is set
    :return: dictionary containing JSON graph
    """
    graph = GraphContainer("Graph from %s" % vcf)
    # delete=False: the file is closed right away, filled by bcftools through
    # its name and removed explicitly in the finally clause below.
    indexed_vcf = tempfile.NamedTemporaryFile(delete=False, suffix=".vcf.gz")
    try:
        indexed_vcf.close()
        # noinspection PyUnresolvedReferences
        pysam.bcftools.view(vcf, "-o", indexed_vcf.name, "-O", "z", catch_stdout=False)  # pylint: disable=no-member
        # noinspection PyUnresolvedReferences
        pysam.bcftools.index(indexed_vcf.name)  # pylint: disable=no-member

        # Without target regions process everything in a single pass
        regions = map(parse_region, target_regions) if target_regions else [(None,)*3]
        for (chrom, start, end) in regions:
            if chrom is not None:
                logging.info("Starting work on region: %s:%s-%s", chrom, start, end)
            try:
                vcfGraph = VCFGraph.create_from_vcf(
                    ref, indexed_vcf.name, chrom, start, end, ref_node_padding, allele_graph)
            except NoVCFRecordsException:
                logging.info("Region %s:%s-%s has no VCF records, skipping.", chrom, start, end)
                continue
            # Lazy %-formatting: the graph is only rendered to text when
            # INFO logging is actually enabled.
            logging.info("CONSTRUCTED VCF GRAPH:\n%s", vcfGraph)
            chromGraph = vcfGraph.get_graph(allele_graph)
            if ref_node_max_length:
                graphUtils.split_ref_nodes(chromGraph, ref_node_max_length, ref_node_padding)
                if alt_splitting:
                    graphUtils.split_alt_nodes(chromGraph, ref_node_max_length, ref_node_padding)

            if simplify:
                graphUtils.remove_empty_nodes(chromGraph)
                graphUtils.combine_nodes(chromGraph)
                # Edge label simplification (remove_redundant_edge_labels) is
                # disabled for now. May use node-label short-cut later
            chromGraph.check()

            graphUtils.add_graph(graph, chromGraph)
    finally:
        # bcftools index writes its index next to the compressed VCF
        # (".csi" by default, ".tbi" in tabix mode). Remove it together with
        # the VCF so no temporary file is left behind; files that were never
        # created are skipped so cleanup cannot mask an earlier error.
        for leftover in (indexed_vcf.name,
                         indexed_vcf.name + ".csi",
                         indexed_vcf.name + ".tbi"):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass

    graph.target_regions = target_regions or graph.get_reference_regions()
    graphUtils.add_source_sink(graph)
    graphUtils.add_ref_path(graph)
    if alt_paths:
        graphUtils.add_alt_paths(graph)
    graph.check()
    return graph.json_dict()

Esempio n. 13

0

Mostra file

def split_node(graph: GraphContainer, node, breakpoints):
    """
    Split a node at a set of breakpoints and link new (sub-)nodes
    Used to link to new variant nodes later
    Modifies graph and deletes node after splitting

    Breakpoints are offsets relative to the node start; a breakpoint p ends
    one sub-node at offset p - 1 and starts the next one at offset p.
    Consecutive sub-nodes are linked by unlabeled edges, the in-edges of the
    original node are moved to the first and its out-edges to the last
    sub-node.

    :param graph: graph to work on (modified in place)
    :param node: node to split
    :param breakpoints: iterable of offsets in [0, node length]
    :returns Created sub-nodes as a list; [node] if there are no breakpoints
    """
    if not breakpoints:
        # Always hand back a list so callers can index/slice the result
        # without special-casing the "nothing to split" situation.
        return [node]
    breakpoints = sorted(set(breakpoints))
    logging.debug("Splitting %s at %s", node['name'], breakpoints)

    # Validate everything up front so a bad breakpoint cannot leave the
    # graph with only some of the sub-nodes added.
    node_len = node["end"] - node["start"] + 1
    assert all(0 <= p <= node_len for p in breakpoints)

    is_ref = "reference" in node

    def add_piece(lo, hi=None):
        """Add the sub-node for offsets [lo, hi); hi=None runs to the node end."""
        piece_start = node["start"] + lo
        piece_end = node["end"] if hi is None else node["start"] + hi - 1
        if is_ref:
            return graph.add_refNode(node["chrom"], piece_start, piece_end,
                                     node["sequences"])
        return graph.add_altNode(node["chrom"], piece_start, piece_end,
                                 node["sequence"][lo:hi], node["sequences"])

    nodes = []
    previous = 0
    for p in breakpoints:
        nodes.append(add_piece(previous, p))
        previous = p
    # Add last node
    nodes.append(add_piece(previous))

    # Connect nodes. The edge lists are snapshotted first because add_edge
    # modifies the graph while we walk over its edges.
    for e in list(graph.inEdges(node)):
        graph.add_edge(graph.nodes[e["from"]], nodes[0], e["sequences"])
    for e in list(graph.outEdges(node)):
        graph.add_edge(nodes[-1], graph.nodes[e["to"]], e["sequences"])
    for (n1, n2) in zip(nodes[:-1], nodes[1:]):
        graph.add_edge(n1, n2)
    # Delete original node, unless identical to new node (no split)
    if node['name'] not in [n['name'] for n in nodes]:
        graph.del_node(node)
    return nodes

Esempio n. 14

0

Mostra file

    def get_graph(self, allele_graph=False):
        """ Create the paragraph representation of nodes and edges for this graph

        Steps, in order: add one reference node per reference allele and chain
        them, add one alt node per alt allele, link the nodes of every
        haplotype, attach alt nodes to the reference where they still lack
        in/out edges (or to all neighbors in allele-graph mode), and finally
        label edges of nodes whose in/out edges are undetermined for a
        haplotype.

        :param allele_graph: create edges between any compatible allele pair (rather
                             than just following reference and given haplotypes)
        :return GraphContainer object
        :raises Exception: if two consecutive nodes of a haplotype are inconsistent
        """
        logging.info("Creating output graph")
        graph = GraphContainer()
        # create ref nodes
        pnode = None
        for ref in self.get_ref_alleles():
            # NOTE(review): "ref.end - 1" suggests ref.end is exclusive while
            # node ends are inclusive - confirm against get_ref_alleles().
            node = graph.add_refNode(self.chrom, ref.begin, ref.end - 1,
                                     ref.data.haplotypes)
            if pnode:
                # Consecutive reference alleles must be contiguous
                assert pnode["end"] + 1 == node["start"]
                graph.add_edge(pnode, node)
            pnode = node
        # Create alt nodes
        for alt in self.alts.values():
            graph.add_altNode(self.chrom, alt.start, alt.end, alt.sequence,
                              alt.haplotypes)

        # Create edges connecting nodes along a haplotype (or allele in alleleGraph mode)
        for haplo in self.get_haplotypes():
            nodes = graph.nodes_by_haplo(haplo)
            logging.info(
                f"Linking nodes in sequence {haplo}:\t{', '.join(n['name'] for n in nodes)}"
            )
            pnode = None
            for node in nodes:
                if pnode:
                    # Only directly adjacent nodes get an edge labeled with the haplotype
                    if pnode["end"] == node["start"] - 1:
                        graph.add_edge(pnode, node, [haplo])
                    # Zero-length previous node without sequence; "sequence" is
                    # only read when the coordinate test already succeeded.
                    pnode_is_ref_dummy = pnode[
                        "end"] == pnode["start"] - 1 and not pnode["sequence"]
                    # Previous node both starts and ends before the current node starts
                    pnode_ends_before_node = pnode["end"] < node[
                        "start"] and pnode["start"] < node["start"]
                    # Anything else means the haplotype's nodes overlap or are out of order
                    if not pnode_is_ref_dummy and not pnode_ends_before_node:
                        raise Exception(
                            f"Inconsistent nodes for haplotype {haplo}: {pnode['name']}, {node['name']}"
                        )
                pnode = node

        # In alleleGraph mode link each alt node to all neighboring nodes
        # In haplotype mode link nodes without in/out edges to reference
        for node in graph.altNodes():
            # Incoming side: from the reference node ending right before the alt node
            if allele_graph or not any(graph.inEdges(node)):
                graph.add_edge(
                    graph.refNode_ending_at[node["chrom"], node["start"] - 1],
                    node)
            # Outgoing side: to the reference node starting right after the alt node
            if not any(graph.outEdges(node)):
                graph.add_edge(
                    node, graph.refNode_starting_at[node["chrom"],
                                                    node["end"] + 1])
            if allele_graph:
                # end < start: the node occupies no reference bases
                isInsertion = node["end"] < node["start"]
                for n in graph.nodes_starting_at[node["end"] + 1]:
                    # Don't loop by connecting multiple insertions at the same position
                    if not (isInsertion and n["end"] < n["start"]):
                        graph.add_edge(node, n)

        # For nodes that do not have determined in/out edges for a given haplotype
        # label all in/out edges as compatible with that haplotype
        # excluding edges that connect to another allele at the same vcfVariant (e.g. insertions)
        for haplo in self.get_haplotypes():
            for node in graph.nodes_by_haplo(haplo):
                if not any(graph.inEdges(node, haplo)):
                    # Re-add every in-edge, this time labeled with the haplotype
                    for e in graph.inEdges(node):
                        graph.add_edge(graph.nodes[e["from"]], node, [haplo])
                # Every node of the haplotype must now be reachable on it
                assert any(graph.inEdges(node, haplo))
                if not any(graph.outEdges(node, haplo)):
                    # Re-add every out-edge, this time labeled with the haplotype
                    for e in graph.outEdges(node):
                        graph.add_edge(node, graph.nodes[e["to"]], [haplo])
        return graph