def get_path(graph: GraphContainer, sequence):
    """
    Enumerate node-name paths that together cover every edge of one sequence (haplotype).

    Edges are flagged with a temporary ``"mark"`` key while they are walked; marks
    left over from an earlier call are removed first. The marks set by this call
    stay on the edge dictionaries afterwards.

    :param graph: graph to work on
    :param sequence: haplotype whose edges should be covered
    :return: list of paths, each path being a list of node names
    """
    ordered_nodes, all_edges = graph.topological_sort()

    # Start from a clean slate: forget marks from any previous traversal.
    for edge in all_edges:
        edge.pop("mark", None)

    def walk(edge, prefix):
        """Follow `edge`, then every still-unmarked continuation of it."""
        target = graph.nodes[edge["to"]]
        extended = prefix + [target["name"]]
        edge["mark"] = True
        continuations = []
        for following in graph.outEdges(target, sequence):
            if "mark" not in following:
                continuations.extend(walk(following, extended))
        # No unmarked continuation: the path ends at this node.
        return continuations or [extended]

    covered = []
    for start in ordered_nodes:
        for edge in graph.outEdges(start, sequence):
            if "mark" not in edge:
                covered += walk(edge, [start["name"]])
    return covered
def add_source_sink(graph: GraphContainer, source_name="source", sink_name="sink"):
    """
    Add source and sink nodes if necessary and link them to existing nodes
    that have no incoming / outgoing edges.

    :param graph: graph to work on
    :param source_name: name of source node
    :param sink_name: name of sink node
    """
    # Create whichever terminal node does not exist yet; both carry a
    # placeholder sequence of ten "N" characters.
    for terminal in (source_name, sink_name):
        if terminal not in graph.nodes:
            graph.nodes[terminal] = {
                "name": terminal,
                "sequence": "N" * 10
            }

    terminals = [source_name, sink_name]
    for candidate in graph.nodes.values():
        if candidate["name"] in terminals:
            continue
        has_incoming = any(graph.inEdges(candidate))
        if not has_incoming:
            logging.info("Linking %s from source", candidate['name'])
            graph.add_edge(graph.nodes[source_name], candidate)
        has_outgoing = any(graph.outEdges(candidate))
        if not has_outgoing:
            logging.info("Linking %s to sink", candidate['name'])
            graph.add_edge(candidate, graph.nodes[sink_name])
def remove_redundant_edge_labels(graph: GraphContainer):
    """
    Remove edge sequence labels that don't add information about haplotypes.

    For every haplotype label on a node, the label is stripped from all of the
    node's out-edges when both of the following hold for every out-edge:

    * the edge carries the label, and
    * the node the edge leads to is not itself labelled with it.

    Nodes stored without a ``"sequences"`` entry (e.g. plain source / sink
    dictionaries placed directly into ``graph.nodes``) are treated as having
    no labels instead of raising ``KeyError``.

    :param graph: graph to work on; edge label collections are modified in place
    """
    for node in graph.nodes.values():
        for haplo in node.get("sequences", ()):
            out_edges = list(graph.outEdges(node))
            label_is_redundant = all(
                haplo in edge["sequences"]
                and haplo not in graph.nodes[edge["to"]].get("sequences", ())
                for edge in out_edges)
            if label_is_redundant:
                for edge in out_edges:
                    edge["sequences"].remove(haplo)
def add_variants_node(graph: GraphContainer, node, variants):
    """
    Add variants to one node in the graph.

    The node is split at the boundaries of every variant, then one alternate
    node per variant is created and linked between the piece ending just
    before the variant and the piece starting just after it.

    :param graph: graph to work on (modified in place)
    :param node: node the variants apply to
    :param variants: variants with ``start``, ``end`` and ``alt`` attributes;
                     ``start`` / ``end`` are added to ``node["start"]``, i.e.
                     they are offsets relative to the node
    :raises ValueError: if a variant starts after its end + 1, or if it is a
                        pure insertion (start == end + 1) without ALT sequence
    """
    if not variants:
        # Nothing to add; avoids splitting the node at zero breakpoints.
        return

    breakpoints = []
    for var in variants:
        # Explicit checks instead of assert, so validation survives `python -O`.
        if var.start > var.end + 1:
            raise ValueError(
                f"Variant start ({var.start}) > variant end ({var.end})")
        if var.start == var.end + 1 and not var.alt:
            raise ValueError(
                f"Variant start ({var.start}) == end + 1 but no insertion sequence is specified")
        logging.info("%s: %s", node, var)
        breakpoints.extend((var.start, var.end + 1))

    pieces = split_node(graph, node, breakpoints)
    # The last piece is left out here: a variant is never linked from it.
    ending_at = {piece["end"]: piece for piece in pieces[:-1]}
    starting_at = {piece["start"]: piece for piece in pieces}

    for var in variants:
        alt_start = node["start"] + var.start
        alt_end = node["start"] + var.end
        alt = graph.add_altNode(node["chrom"], alt_start, alt_end, var.alt)
        graph.add_edge(ending_at[alt_start - 1], alt)
        graph.add_edge(alt, starting_at[alt_end + 1])
def add_graph(graph1: GraphContainer, graph2: GraphContainer):
    """
    Copy all nodes, edges and paths of graph2 into graph1 (in place).

    Reference and alternate nodes are re-created through graph1's own
    ``add_refNode`` / ``add_altNode``; edges are then re-created by looking up
    their end points in graph1 by name.

    :param graph1: graph that receives the content
    :param graph2: graph whose content is copied
    """
    ref_fields = ("chrom", "start", "end", "sequences")
    alt_fields = ("chrom", "start", "end", "sequence", "sequences")

    for ref in graph2.refNodes():
        graph1.add_refNode(*(ref[field] for field in ref_fields))

    for alt in graph2.altNodes():
        graph1.add_altNode(*(alt[field] for field in alt_fields))

    for link in graph2.edges.values():
        origin = graph1.nodes[link["from"]]
        target = graph1.nodes[link["to"]]
        graph1.add_edge(origin, target, link["sequences"])

    graph1.paths += graph2.paths
def add_variants_node(graph: GraphContainer, node, variants):
    """
    Add variants to one node in the graph.

    The node is split at the boundaries of every variant, then one alternate
    node per variant is created and linked between the piece ending just
    before the variant and the piece starting just after it.

    :param graph: graph to work on (modified in place)
    :param node: node the variants apply to
    :param variants: variants with ``start``, ``end`` and ``alt`` attributes;
                     ``start`` / ``end`` are added to ``node["start"]``, i.e.
                     they are offsets relative to the node
    :raises ValueError: if a variant starts after its end + 1, or if it is a
                        pure insertion (start == end + 1) without ALT sequence
    """
    if not variants:
        # Nothing to add; avoids splitting the node at zero breakpoints.
        return

    breakpoints = []
    for var in variants:
        if var.start > var.end + 1:
            raise ValueError("Variant start({}) > variane end({})!".format(
                var.start, var.end))
        if var.start == var.end + 1 and not var.alt:
            raise ValueError(
                "Variant start({}) == end but no insertion sequence is specified."
                .format(var.start))
        breakpoints.extend((var.start, var.end + 1))

    pieces = split_node(graph, node, breakpoints)
    # The last piece is left out here: a variant is never linked from it.
    ending_at = {piece["end"]: piece for piece in pieces[:-1]}
    starting_at = {piece["start"]: piece for piece in pieces}

    for var in variants:
        alt_start = node["start"] + var.start
        alt_end = node["start"] + var.end
        alt = graph.add_altNode(node["chrom"], alt_start, alt_end, var.alt)
        graph.add_edge(ending_at[alt_start - 1], alt)
        graph.add_edge(alt, starting_at[alt_end + 1])
def remove_empty_nodes(graph: GraphContainer):
    """
    Remove nodes without sequence (from deletions / skipped insertions or
    split ref nodes).

    Every (in-edge, out-edge) pair of a removed node is replaced by one direct
    edge, so connections through the node are kept.

    :param graph: graph to work on (modified in place)
    """
    for node in list(graph.nodes.values()):
        spans_reference = "reference" in node and node["start"] <= node["end"]
        if spans_reference or node.get("sequence", "") != "":
            continue

        logging.info("Removing empty node %s", node['name'])
        labels_entering = [
            label for edge in graph.inEdges(node) for label in edge["sequences"]
        ]
        labels_leaving = [
            label for edge in graph.outEdges(node) for label in edge["sequences"]
        ]

        for incoming in list(graph.inEdges(node)):
            for outgoing in list(graph.outEdges(node)):
                # A label is kept on the merged edge when it is seen on both
                # halves, or on one half only while being absent from every
                # edge on the other side of the node (i.e. undetermined there).
                on_both = incoming["sequences"].intersection(outgoing["sequences"])
                entering_only = incoming["sequences"].difference(labels_leaving)
                leaving_only = outgoing["sequences"].difference(labels_entering)
                merged_labels = on_both.union(entering_only.union(leaving_only))
                graph.add_edge(graph.nodes[incoming["from"]],
                               graph.nodes[outgoing["to"]],
                               merged_labels)

        graph.del_node(node)
def split_alt_nodes(graph: GraphContainer, max_len=300, padding_len=150):
    """
    Split long alternate nodes.

    An alternate node whose sequence is longer than max_len is replaced by two
    alternate nodes holding the first and the last padding_len bases. Both keep
    the coordinates and labels of the original node; in-edges are re-attached
    to the first node and out-edges to the second. The two new nodes are not
    linked to each other.

    :param graph: graph to work on
    :param max_len: longest alternate sequence that is left untouched
    :param padding_len: length of sequence to keep at each end
    :raises AssertionError: if max_len < 2 * padding_len
    """
    assert max_len >= 2 * padding_len
    for original in list(graph.altNodes()):
        bases = original["sequence"]
        if len(bases) <= max_len:
            continue
        logging.info("Splitting long ALT node: %s", original['name'])

        chrom, start, end = original["chrom"], original["start"], original["end"]
        labels = original["sequences"]
        head = graph.add_altNode(chrom, start, end, bases[:padding_len], labels)
        # NOTE(review): with padding_len == 0 this slice is the whole sequence.
        tail = graph.add_altNode(chrom, start, end, bases[-padding_len:], labels)

        for incoming in list(graph.inEdges(original)):
            graph.add_edge(graph.nodes[incoming["from"]], head, incoming["sequences"])
        for outgoing in list(graph.outEdges(original)):
            graph.add_edge(tail, graph.nodes[outgoing["to"]], outgoing["sequences"])
        graph.del_node(original)
def split_ref_nodes(graph: GraphContainer, max_len=300, padding_len=150):
    """
    Split long reference nodes.

    A reference node spanning more than max_len positions is replaced by two
    reference nodes covering its first and its last padding_len positions.
    In-edges are re-attached to the first node and out-edges to the second;
    the two new nodes are not linked to each other.

    :param graph: graph to work on
    :param max_len: longest reference node that is left untouched
    :param padding_len: number of positions to keep at each end
    :raises AssertionError: if max_len < 2 * padding_len
    """
    assert max_len >= 2 * padding_len
    for original in list(graph.refNodes()):
        start, end = original["start"], original["end"]
        span = end - start + 1
        if span <= max_len:
            continue
        logging.info("Splitting long REF node: %s", original['name'])

        chrom, labels = original["chrom"], original["sequences"]
        head = graph.add_refNode(chrom, start, start + padding_len - 1, labels)
        tail = graph.add_refNode(chrom, end - padding_len + 1, end, labels)

        for incoming in list(graph.inEdges(original)):
            graph.add_edge(graph.nodes[incoming["from"]], head, incoming["sequences"])
        for outgoing in list(graph.outEdges(original)):
            graph.add_edge(tail, graph.nodes[outgoing["to"]], outgoing["sequences"])
        graph.del_node(original)
def load_json(json) -> GraphContainer:
    """
    Construct graph object from JSON representation.

    Nodes with a "reference" entry become reference nodes, nodes with a
    "position" entry become alternate nodes, and any other node dictionary is
    stored in the graph unchanged under its name.

    :param json: Dictionary of JSON file contents; must provide "nodes",
                 "edges" and "model_name", may provide "paths" and
                 "target_regions"
    :return: the populated graph, after graph.check() has been run on it
    """
    graph = GraphContainer()

    for entry in json["nodes"]:
        labels = entry.get("sequences", ())
        if "reference" in entry:
            chrom, start, end = parse_region(entry["reference"])
            graph.add_refNode(chrom, start, end, labels, entry["name"])
            continue
        if "position" in entry:
            chrom, start, end = parse_region(entry["position"])
            graph.add_altNode(chrom, start, end, entry["sequence"], labels,
                              entry["name"])
            continue
        # Neither reference nor positioned alt node: keep the dictionary as is.
        graph.nodes[entry["name"]] = entry

    for link in json["edges"]:
        labels = link.get("sequences", ())
        origin = graph.nodes[link["from"]]
        target = graph.nodes[link["to"]]
        graph.add_edge(origin, target, labels)

    graph.name = json["model_name"]
    graph.paths = json.get("paths", [])
    graph.target_regions = json.get("target_regions", [])
    graph.check()
    return graph
def combine_nodes(graph: GraphContainer):
    """
    Combine adjacent nodes with the same sequence labels.

    Two nodes n1 -> n2 are merged into one node when all of these hold:

    * n1 has exactly one out-edge and it leads to n2,
    * n2 has exactly one in-edge,
    * both are on the same chromosome and n2 starts right after n1 ends,
    * both carry the same haplotype labels,
    * both are reference nodes, or both are alternate nodes.

    Nodes without coordinates (no "chrom" entry, e.g. plain source / sink
    dictionaries) are never merged.

    NOTE: nodes are visited in a single pass over a snapshot, and merged nodes
    are not revisited, so whether a chain of three or more mergeable nodes is
    fully collapsed depends on the visiting order.

    :param graph: graph to work on (modified in place)
    """
    for n1 in list(graph.nodes.values()):
        # The snapshot may still contain nodes deleted by an earlier merge.
        if n1["name"] not in graph.nodes or graph.nodes[n1["name"]] is not n1:
            continue

        out_edges = list(graph.outEdges(n1))
        if len(out_edges) != 1:
            continue  # Pair of nodes with no other in/out edges
        n2 = graph.nodes[out_edges[0]["to"]]
        if len(list(graph.inEdges(n2))) != 1:
            continue

        if "chrom" not in n1 or "chrom" not in n2:
            continue  # nodes without coordinates cannot be adjacent
        if not (n1["chrom"] == n2["chrom"] and n1["end"] + 1 == n2["start"]):
            continue  # nodes must be adjacent

        haplos = n1["sequences"]
        if n2["sequences"] != haplos:
            continue  # only collapse nodes with same haplotypes

        if ("reference" in n1) != ("reference" in n2):
            continue  # nodes must be of same type
        if "reference" in n1:
            merged = graph.add_refNode(n1["chrom"], n1["start"], n2["end"], haplos)
        else:
            merged = graph.add_altNode(
                n1["chrom"], n1["start"], n2["end"],
                n1["sequence"] + n2["sequence"], haplos)

        logging.info("Combinding %s and %s", n1['name'], n2['name'])
        for edge in list(graph.inEdges(n1)):
            graph.add_edge(graph.nodes[edge["from"]], merged, edge["sequences"])
        for edge in list(graph.outEdges(n2)):
            graph.add_edge(merged, graph.nodes[edge["to"]], edge["sequences"])
        graph.del_node(n1)
        graph.del_node(n2)
def convert_vcf(vcf,
                ref,
                target_regions=None,
                ref_node_padding=150,
                ref_node_max_length=1000,
                allele_graph=False,
                simplify=True,
                alt_paths=False,
                alt_splitting=False):
    """
    Convert a single VCF file to a graph dictionary.

    The VCF is first re-written as a compressed, indexed temporary copy. One
    graph is then built per target region (or a single graph for the whole
    file when no regions are given) and merged into the result. The temporary
    copy and its index are removed again before returning, also on failure.

    :param vcf: file name of the VCF file
    :param ref: reference FASTA file name
    :param target_regions: target region list
    :param ref_node_padding: padding / read length
    :param ref_node_max_length: maximum length before splitting a reference node;
                                a falsy value disables all node splitting
    :param allele_graph: add edges between any compatible allele pair, not just haplotypes from input
    :param simplify: simplify the graph
    :param alt_paths: Add all possible non-reference paths to the graph
    :param alt_splitting: also split long alt nodes (e.g. long insertions)
    :return: dictionary containing JSON graph
    """
    graph = GraphContainer("Graph from %s" % vcf)
    indexed_vcf = tempfile.NamedTemporaryFile(delete=False, suffix=".vcf.gz")
    try:
        # Only the name is needed; bcftools writes the file itself.
        indexed_vcf.close()
        # noinspection PyUnresolvedReferences
        pysam.bcftools.view(vcf, "-o", indexed_vcf.name, "-O", "z", catch_stdout=False)  # pylint: disable=no-member
        # noinspection PyUnresolvedReferences
        pysam.bcftools.index(indexed_vcf.name)  # pylint: disable=no-member

        # Without target regions, run once with chrom/start/end all None.
        regions = map(parse_region, target_regions) if target_regions else [(None,)*3]
        for (chrom, start, end) in regions:
            if chrom is not None:
                logging.info("Starting work on region: %s:%s-%s", chrom, start, end)
            try:
                vcfGraph = VCFGraph.create_from_vcf(
                    ref, indexed_vcf.name, chrom, start, end, ref_node_padding, allele_graph)
            except NoVCFRecordsException:
                logging.info("Region %s:%s-%s has no VCF records, skipping.", chrom, start, end)
                continue
            logging.info("CONSTRUCTED VCF GRAPH:\n%s", vcfGraph)
            chromGraph = vcfGraph.get_graph(allele_graph)
            if ref_node_max_length:
                graphUtils.split_ref_nodes(chromGraph, ref_node_max_length, ref_node_padding)
                if alt_splitting:
                    graphUtils.split_alt_nodes(chromGraph, ref_node_max_length, ref_node_padding)

            if simplify:
                graphUtils.remove_empty_nodes(chromGraph)
                graphUtils.combine_nodes(chromGraph)
                # Disable edge label simplification for now. May use node-label short-cut later
                # graphUtils.remove_redundant_edge_labels(graph)
            chromGraph.check()

            graphUtils.add_graph(graph, chromGraph)
    finally:
        # bcftools index writes its index next to the compressed VCF
        # (".csi" by default, ".tbi" for tabix-style); remove it as well so
        # no temporary file is left behind.
        for index_suffix in (".csi", ".tbi"):
            index_path = indexed_vcf.name + index_suffix
            if os.path.exists(index_path):
                os.remove(index_path)
        os.remove(indexed_vcf.name)

    graph.target_regions = target_regions or graph.get_reference_regions()
    graphUtils.add_source_sink(graph)
    graphUtils.add_ref_path(graph)
    if alt_paths:
        graphUtils.add_alt_paths(graph)
    graph.check()
    return graph.json_dict()
def split_node(graph: GraphContainer, node, breakpoints):
    """
    Split a node at a set of breakpoints and link new (sub-)nodes.

    Used to link to new variant nodes later. Modifies graph and deletes node
    after splitting, unless one of the new sub-nodes has the same name as the
    original node.

    Breakpoints are offsets relative to ``node["start"]``; duplicates are
    ignored. Each breakpoint p ends one sub-node at offset p - 1 and starts
    the next one at offset p, so a breakpoint of 0 or of the node length
    yields an empty first / last sub-node. In-edges of the original node are
    copied to the first sub-node, out-edges to the last one, and consecutive
    sub-nodes are chained with unlabelled edges.

    :param graph: graph to work on
    :param node: node to split
    :param breakpoints: offsets at which to split
    :returns: list of created sub-nodes in positional order; ``[node]`` when
              there are no breakpoints
    :raises AssertionError: if a breakpoint lies outside the node
    """
    if not breakpoints:
        # Always hand back a list so callers can index / slice the result.
        return [node]

    breakpoints = sorted(set(breakpoints))
    logging.debug("Splitting %s at %s", node['name'], breakpoints)
    is_ref = "reference" in node

    def make_piece(first, last, lo, hi):
        """Create the sub-node for positions first..last (sequence[lo:hi])."""
        if is_ref:
            return graph.add_refNode(node["chrom"], first, last, node["sequences"])
        return graph.add_altNode(node["chrom"], first, last,
                                 node["sequence"][lo:hi], node["sequences"])

    pieces = []
    previous = 0
    for point in breakpoints:
        assert 0 <= point <= node["end"] - node["start"] + 1
        pieces.append(make_piece(node["start"] + previous,
                                 node["start"] + point - 1,
                                 previous, point))
        previous = point
    # Add last node: everything from the final breakpoint to the node end
    pieces.append(make_piece(node["start"] + previous, node["end"], previous, None))

    # Connect nodes. Edge lists are snapshotted because edges are added
    # while looping.
    for edge in list(graph.inEdges(node)):
        graph.add_edge(graph.nodes[edge["from"]], pieces[0], edge["sequences"])
    for edge in list(graph.outEdges(node)):
        graph.add_edge(pieces[-1], graph.nodes[edge["to"]], edge["sequences"])
    for left, right in zip(pieces[:-1], pieces[1:]):
        graph.add_edge(left, right)

    # Delete original node, unless identical to new node (no split)
    if node['name'] not in [piece['name'] for piece in pieces]:
        graph.del_node(node)
    return pieces
def get_graph(self, allele_graph=False):
    """
    Create the paragraph representation of nodes and edges for this graph.

    :param allele_graph: create edges between any compatible allele pair (rather
                         than just following reference and given haplotypes)
    :return: GraphContainer object
    """
    logging.info("Creating output graph")
    graph = GraphContainer()

    # Reference backbone: one ref node per reference allele, chained in order.
    previous = None
    for ref_allele in self.get_ref_alleles():
        current = graph.add_refNode(self.chrom, ref_allele.begin, ref_allele.end - 1,
                                    ref_allele.data.haplotypes)
        if previous:
            assert previous["end"] + 1 == current["start"]
            graph.add_edge(previous, current)
        previous = current

    # One alt node per alternate allele.
    for alt_allele in self.alts.values():
        graph.add_altNode(self.chrom, alt_allele.start, alt_allele.end,
                          alt_allele.sequence, alt_allele.haplotypes)

    # Create edges connecting nodes along a haplotype (or allele in alleleGraph mode)
    for haplo in self.get_haplotypes():
        nodes = graph.nodes_by_haplo(haplo)
        logging.info(
            f"Linking nodes in sequence {haplo}:\t{', '.join(n['name'] for n in nodes)}"
        )
        previous = None
        for current in nodes:
            if previous:
                directly_adjacent = previous["end"] == current["start"] - 1
                if directly_adjacent:
                    graph.add_edge(previous, current, [haplo])
                # An empty node without sequence, or a node lying entirely
                # before the current one, is acceptable; anything else means
                # the haplotype's nodes overlap.
                previous_is_ref_dummy = (previous["end"] == previous["start"] - 1
                                         and not previous["sequence"])
                previous_ends_before_current = (previous["end"] < current["start"]
                                                and previous["start"] < current["start"])
                if not (previous_is_ref_dummy or previous_ends_before_current):
                    raise Exception(
                        f"Inconsistent nodes for haplotype {haplo}: {previous['name']}, {current['name']}"
                    )
            previous = current

    # In alleleGraph mode link each alt node to all neighboring nodes
    # In haplotype mode link nodes without in/out edges to reference
    for alt_node in graph.altNodes():
        chrom = alt_node["chrom"]
        if allele_graph or not any(graph.inEdges(alt_node)):
            graph.add_edge(
                graph.refNode_ending_at[chrom, alt_node["start"] - 1], alt_node)
        if not any(graph.outEdges(alt_node)):
            graph.add_edge(
                alt_node, graph.refNode_starting_at[chrom, alt_node["end"] + 1])
        if allele_graph:
            is_insertion = alt_node["end"] < alt_node["start"]
            for neighbour in graph.nodes_starting_at[alt_node["end"] + 1]:
                # Don't loop by connecting multiple insertions at the same position
                neighbour_is_insertion = neighbour["end"] < neighbour["start"]
                if not (is_insertion and neighbour_is_insertion):
                    graph.add_edge(alt_node, neighbour)

    # For nodes that do not have determined in/out edges for a given haplotype
    # label all in/out edges as compatible with that haplotype
    # excluding edges that connect to another allele at the same vcfVariant (e.g. insertions)
    for haplo in self.get_haplotypes():
        for labelled in graph.nodes_by_haplo(haplo):
            if not any(graph.inEdges(labelled, haplo)):
                for edge in graph.inEdges(labelled):
                    graph.add_edge(graph.nodes[edge["from"]], labelled, [haplo])
            assert any(graph.inEdges(labelled, haplo))
            if not any(graph.outEdges(labelled, haplo)):
                for edge in graph.outEdges(labelled):
                    graph.add_edge(labelled, graph.nodes[edge["to"]], [haplo])
    return graph