def simple_test():
    graph = Graph(
        {1: Block(10), 2: Block(1), 3: Block(1), 4: Block(10)},
        {1: [2, 3], 2: [4], 3: [4]})
    graph.convert_to_numpy_backend()
    sequence_graph = SequenceGraph.create_empty_from_ob_graph(graph)
    sequence_graph.set_sequence(1, "GGGTTTATAC")
    sequence_graph.set_sequence(2, "A")
    sequence_graph.set_sequence(3, "C")
    sequence_graph.set_sequence(4, "GTACATTGTA")
    linear_ref = Interval(0, 10, [1, 2, 3], graph)
    linear_ref = linear_ref.to_numpy_indexed_interval()
    critical_nodes = set([4])
    finder = MinimizerFinder(graph, sequence_graph, critical_nodes,
                             linear_ref, k=3, w=3)
    minimizers = finder.find_minimizers()
    assert minimizers.has_minimizer(2, 0)
    assert minimizers.has_minimizer(3, 0)
    assert minimizers.has_minimizer(4, 4)
def setUp(self):
    self.linear_graph = Graph({i: Block(5) for i in range(1, 4)},
                              {i: [i + 1] for i in range(1, 3)})
    self.scores = DensePileup.from_intervals(
        self.linear_graph,
        [Interval(0, 5, [i]) for i in range(1, 4)])
    self.graph = Graph({i: Block(5) for i in range(1, 4)},
                       {1: [3], 2: [3], 3: [4]})
def test_find_max_path_on_start_and_end_node(self):
    graph = Graph(
        {1: Block(10), 2: Block(10), 3: Block(10), 4: Block(10)},
        {1: [2, 3], 2: [4], 3: [4]})
    peak = ConnectedAreas(graph, {2: [0, 10], 4: [0, 10]})
    binary_peak = BinaryContinousAreas.from_old_areas(peak)
    qvalues = DensePileup.from_intervals(graph, [Interval(7, 2, [1, 2, 4])])
    scored_peak = ScoredPeak.from_peak_and_pileup(binary_peak, qvalues)
    max_path = scored_peak.get_max_path()
    self.assertEqual(max_path, Interval(0, 10, [2, 4]))
def test_find_max_path_through_subgraph_multiple_paths(self):
    graph = Graph(
        {1: Block(10), 2: Block(10), 3: Block(10), 4: Block(10)},
        {1: [2, 3], 2: [4], 3: [4]})
    peak = ConnectedAreas(graph, {2: [0, 10], 3: [0, 10], 1: [5, 10], 4: [0, 3]})
    binary_peak = BinaryContinousAreas.from_old_areas(peak)
    qvalues = DensePileup.from_intervals(
        graph,
        [Interval(7, 2, [1, 3, 4])])  # Giving higher qvalue through this path
    print(qvalues)
    scored_peak = ScoredPeak.from_peak_and_pileup(binary_peak, qvalues)
    print(scored_peak)
    max_path = scored_peak.get_max_path()
    self.assertEqual(max_path, Interval(5, 3, [1, 3, 4]))
def test_create_from_nongraphpeakcollection(self):
    graph = Graph(
        {1: Block(10), 2: Block(10), 3: Block(10)},
        {1: [2], 2: [3]})
    graph.convert_to_numpy_backend()
    linear_path = Interval(0, 10, [1, 2, 3], graph)
    linear_path = linear_path.to_numpy_indexed_interval()
    nongraph_peaks = NonGraphPeakCollection([
        NonGraphPeak("chr1", 3, 10, 5),
        NonGraphPeak("chr1", 13, 15, 7),
    ])

    peaks = PeakCollection.create_from_nongraph_peak_collection(
        graph, nongraph_peaks, linear_path, None)
    self.assertEqual(peaks.intervals[0], Interval(3, 10, [1]))
    self.assertEqual(peaks.intervals[1], Interval(3, 5, [2]))

    peaks = PeakCollection.create_from_nongraph_peak_collection(
        graph, nongraph_peaks, linear_path, LinearRegion("chr1", 3, 20))
    self.assertEqual(peaks.intervals[0], Interval(0, 7, [1]))
    self.assertEqual(peaks.intervals[1], Interval(0, 2, [2]))
def test_three_nodes_in(self):
    graph = Graph({i: Block(5) for i in range(1, 5)},
                  {1: [4], 2: [4], 3: [4]})
    intervals = [
        Interval(2, 5, [1]),
        Interval(2, 5, [2]),
        Interval(2, 5, [3]),
        Interval(0, 3, [4])
    ]
    pileup = DensePileup.from_intervals(graph, intervals)
    subgraphs = SubgraphCollectionPartiallyOrderedGraph.create_from_pileup(
        graph, pileup)
    print(subgraphs)
    correct1 = BinaryContinousAreas(graph)
    correct1.add_start(-1, 3)
    correct1.add_start(-2, 3)
    correct1.add_start(-3, 3)
    correct1.add_start(4, 3)
    self.assertTrue(correct1 in subgraphs)
def test_simple3(self):
    graph = Graph({i: Block(5) for i in range(1, 6)},
                  {1: [3], 2: [3], 3: [4, 5]})
    scores = DensePileup.from_intervals(
        graph, [Interval(0, 5, [i]) for i in range(1, 6)])
    intervals = [
        Interval(0, 5, [1]),
        Interval(0, 5, [3]),
        Interval(0, 5, [4]),
        Interval(0, 3, [5])
    ]
    pileup = DensePileup.from_intervals(graph, intervals)
    subgraphs = SubgraphCollectionPartiallyOrderedGraph.create_from_pileup(
        graph, pileup)
    scored_peaks = (ScoredPeak.from_peak_and_pileup(peak, scores)
                    for peak in subgraphs)
    max_paths = [peak.get_max_path() for peak in scored_peaks]
    self.assertTrue(
        Interval(0, 5, [1, 3, 4]) in max_paths or
        Interval(0, 3, [1, 3, 5]) in max_paths)
def __init__(self, tf_experiment_dir, data_dir):
    self.experiment_dir = tf_experiment_dir
    self.data_dir = data_dir
    self.bam_file = pysam.AlignmentFile(
        self.experiment_dir + "/linear_alignments.bam", "rb")
    self.linear_path = NumpyIndexedInterval.from_file(
        self.data_dir + "/5_linear_pathv2.interval")
    self.graph = Graph.from_file(self.data_dir + "/5.nobg")
    self.alignment_collection = AlignmentCollection.from_file(
        self.experiment_dir + "/5_alignments.pickle", self.graph)
    self.check_peaks()
def test_reverse():
    graph = Graph(
        {1: Block(10), 2: Block(5), 3: Block(10), 4: Block(5)},
        {1: [2, 3], 2: [4], 3: [4]})
    graph.convert_to_numpy_backend()
    linear_path = NumpyIndexedInterval.from_interval(
        Interval(0, 10, [1, 2, 4], graph))
    alignments = [Interval(4, 5, [-3, -1], graph)]
    projected = project_alignments(alignments, linear_path)
    projected = list(projected)
    assert projected[0] == (5, 16, "-")
def set_graph(self):
    self.graph = Graph(
        {1: Block(5), 2: Block(5), 3: Block(5)},
        {1: [2], 2: [3]})
def test_simple():
    graph = Graph(
        {1: Block(10), 2: Block(5), 3: Block(10), 4: Block(5)},
        {1: [2, 3], 2: [4], 3: [4]})
    graph.convert_to_numpy_backend()
    linear_path = NumpyIndexedInterval.from_interval(
        Interval(0, 10, [1, 2, 4], graph))
    alignments = [Interval(5, 5, [1, 3], graph),
                  Interval(5, 5, [3, 4], graph)]
    projected = project_alignments(alignments, linear_path)
    projected = list(projected)
    assert projected[0] == (5, 15, "+")
    assert projected[1] == (15, 25, "+")
def setUp(self):
    self.graph = Graph({i: Block(3) for i in range(1, 7)},
                       {i: [i + 1] for i in range(1, 6)})
    self.peaks = PeakCollection([
        Peak(3, 3, [1, 2, 3, 4], self.graph),
        Peak(3, 3, [5, 6], self.graph)
    ])
def test_many_nodes():
    nodes = {i: Block(1) for i in range(2, 10)}
    nodes[1] = Block(10)
    nodes[10] = Block(10)
    graph = Graph(
        nodes,
        {1: [2, 3], 2: [4], 3: [4], 4: [5, 6], 5: [7],
         6: [7], 7: [8, 9], 8: [10], 9: [10]})
    graph.convert_to_numpy_backend()
    sequence_graph = SequenceGraph.create_empty_from_ob_graph(graph)
    sequence_graph.set_sequence(1, "ACTGACTGAC")
    sequence_graph.set_sequence(10, "ACTGACTGAC")
    sequence_graph.set_sequence(2, "A")
    sequence_graph.set_sequence(3, "C")
    sequence_graph.set_sequence(4, "A")
    sequence_graph.set_sequence(5, "G")
    sequence_graph.set_sequence(6, "C")
    sequence_graph.set_sequence(7, "T")
    sequence_graph.set_sequence(8, "A")
    sequence_graph.set_sequence(9, "A")
    linear_ref = Interval(0, 10, [1, 2, 4, 6, 7, 8, 10], graph)
    linear_ref = linear_ref.to_numpy_indexed_interval()
    critical_nodes = {1, 4, 7, 10}
    finder = MinimizerFinder(graph, sequence_graph, critical_nodes,
                             linear_ref, k=3, w=3)
    minimizers = finder.find_minimizers()
    print(len(minimizers.minimizers))
def setUp(self):
    self.simple_graph = Graph(
        {i: Block(3) for i in range(1, 9)},
        {1: [2, 3], 2: [4], 3: [4], 4: [5], 5: [6, 7], 6: [8], 7: [8]})
    print(self.simple_graph.get_first_blocks())
    print(self.simple_graph.reverse_adj_list)
    self.simple_snarls = {
        20: SimpleSnarl(1, 4, 20),
        21: SimpleSnarl(5, 8, 21),
        22: SimpleSnarl(4, 5, 22)
    }
def set_graph(self):
    self.graph = Graph(
        {1: Block(5), 2: Block(5), 3: Block(5), 4: Block(5)},
        {1: [2, 3], 2: [4], 3: [4]})
def macs_to_graph_peaks(folder):
    for chrom in ["1", "2", "3", "4", "5"]:
        path = NumpyIndexedInterval.from_file(
            "/data/bioinf/tair2/" + chrom + "_linear_pathv2.interval")
        graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")
        macs_peaks = PeakCollection.from_fasta_file(
            folder + "/macs_sequences_chr%s_summits_unique.fasta" % chrom, graph)
        macs_peaks.to_file(
            folder + "/%s_macs_unique_graph_summits.intervalcollection" % chrom,
            True)
def test_overlapping_alt_loci(self):
    chrom_file = "data/chrom.sizes.test"
    alt_loci = "data/alt_loci_test"
    graph = create_initial_grch38_graph(chrom_file)
    numeric_graph, name_translation = convert_to_numeric_graph(graph)
    self.assertEqual(len(graph.blocks), 3)
    self.assertEqual(len([a for a, v in graph.adj_list.items() if v]), 0)
    new_numeric_graph, numeric_translation = \
        connect_without_flanks(numeric_graph, alt_loci, name_translation)
    correct_graph_structure = Graph(
        {
            1: Block(1), 2: Block(1), 3: Block(1),
            4: Block(1), 5: Block(1), 6: Block(1),
            7: Block(1), 8: Block(1), 9: Block(1),
        },
        {
            1: [2, 8], 2: [3, 9], 3: [4], 4: [5],
            5: [6], 6: [7], 9: [5], 8: [6]
        })
    self.assertTrue(
        correct_graph_structure.has_identical_structure(new_numeric_graph))
def run_predict_path(args):
    chromosomes = args.chromosomes.split(",")
    processes = []
    if not os.path.isfile(args.alignments):
        logging.error("Input alignments file %s does not exist" % args.alignments)
        sys.exit()

    for chromosome in chromosomes:
        logging.info("Starting process for chromosome %s " % chromosome)
        process = Process(target=run_predict_path_single_chromosome,
                          args=(args.alignments, chromosome, args.data_dir,
                                args.linear_ref_bonus, args.out_file_name,
                                args.max_nodes_to_traverse))
        process.start()
        processes.append(process)

    for process in processes:
        process.join()

    # Merge all fasta files that were produced
    out_fasta = open(args.out_file_name + ".fa", "w")
    logging.info("Merging fasta files")
    for chromosome in tqdm(chromosomes):
        with open(args.out_file_name + "_" + chromosome + ".fasta") as f:
            out_fasta.write(f.read())
    logging.info("Wrote resulting linear reference to %s" % (args.out_file_name + ".fa"))

    # Create indexed intervals for each interval file that was produced
    logging.info("Creating indexed interval for all chromosomes")
    for chromosome in chromosomes:
        file_name = args.out_file_name + "_" + chromosome + ".intervalcollection"
        graph = Graph.from_file(args.data_dir + chromosome + ".nobg")
        intervals = IntervalCollection.from_file(file_name, text_file=True, graph=graph)
        intervals = list(intervals.intervals)
        assert len(intervals) == 1, "Only a single interval in file is supported"
        interval = intervals[0]
        indexed = interval.to_numpy_indexed_interval()
        indexed.to_file(file_name + ".indexed")
        logging.info("Wrote indexed interval to file %s" % file_name + ".indexed")

    if not args.skip_bwa_index:
        logging.info("Running bwa index")
        run_bwa_index(args.out_file_name + ".fa")
    else:
        logging.info("Not creating bwa index")
def test_many_nodes():
    nodes = {i: Block(1) for i in range(2, 10)}
    nodes[1] = Block(10)
    nodes[10] = Block(10)
    graph = Graph(
        nodes,
        {1: [2, 3], 2: [4], 3: [4], 4: [5, 6], 5: [7],
         6: [7], 7: [8, 9], 8: [10], 9: [10]})
    graph.convert_to_numpy_backend()
    sequence_graph = SequenceGraph.create_empty_from_ob_graph(graph)
    sequence_graph.set_sequence(1, "ACTGACTGAC")
    sequence_graph.set_sequence(10, "ACTGACTGAC")
    sequence_graph.set_sequence(2, "A")
    sequence_graph.set_sequence(3, "C")
    sequence_graph.set_sequence(4, "A")
    sequence_graph.set_sequence(5, "G")
    sequence_graph.set_sequence(6, "C")
    sequence_graph.set_sequence(7, "T")
    sequence_graph.set_sequence(8, "T")
    sequence_graph.set_sequence(9, "A")
    linear_ref_nodes = {1, 2, 4, 6, 7, 8, 10}
    read_sequence = "ACTGACCAGTAACTGAC"
    start_node = 1
    start_offset = 4
    aligner = LocalGraphAligner(graph, sequence_graph, read_sequence,
                                linear_ref_nodes, start_node, start_offset)
    alignment, score = aligner.align()
    assert alignment == [1, 3, 4, 5, 7, 9, 10]
def visualize_alt_locus(args, skip_wrapping=False, quiet=False):
    from offsetbasedgraph.graphutils import GeneList, \
        create_gene_dicts, create_subgraph_around_alt_locus

    if not isinstance(args.translation_file_name, Translation):
        trans = Translation.from_file(args.translation_file_name)
    else:
        trans = args.translation_file_name

    graph = trans.graph2
    orig_trans = trans.copy()

    # Find all genes on this graph
    genes = GeneList(get_gene_objects_as_intervals(args.genes)).gene_list
    alt_loci_genes, gene_name_dict, main_genes = create_gene_dicts(
        genes, alt_loci_fn=args.alt_locations_file_name)
    genes = main_genes[args.alt_locus] + alt_loci_genes[args.alt_locus]
    genes = [g.translate(trans) for g in genes]

    subgraph, trans, start_position = create_subgraph_around_alt_locus(
        graph, trans, args.alt_locus, 200000,
        alt_loci_fn=args.alt_locations_file_name)
    start_position = orig_trans.translate_position(start_position, True)[0]

    genes = [g for g in genes
             if not g.multiple_alt_loci() and g.transcription_region.length() > 100]
    if len(genes) > 40:
        genes = genes[0:40]

    levels = Graph.level_dict(subgraph.blocks)

    # Find the start block by choosing a block with no incoming edges
    start = None
    for b in subgraph.blocks:
        if len(subgraph.reverse_adj_list[b]) == 0:
            start = b
            break
    assert start is not None

    from visualizehtml import VisualizeHtml
    subgraph.start_block = start
    max_offset = sum([subgraph.blocks[b].length() for b in subgraph.blocks])
    v = VisualizeHtml(subgraph, 0, max_offset, 0, levels, "", 800,
                      genes, start_position)

    if quiet:
        return

    if skip_wrapping:
        print(str(v))
    else:
        print(v.get_wrapped_html())
def setUp(self):
    self.graph = Graph({i: Block(10) for i in range(1, 4)},
                       {i: [i + 1] for i in range(1, 3)})
    self.index = GraphIndex({
        1: [(2, 10), (3, 20)],
        2: [(3, 10)],
        3: [],
        -1: [],
        -2: [(-1, 10)],
        -3: [(-2, 10), (-1, 20)]
    })
    self.extender = GraphExtender(self.index)
def setUp(self):
    self.complex_graph = Graph(
        {i: Block(3) for i in range(1, 13)},
        {
            1: [2, 3],
            2: [7, 8],
            3: [4, 5],
            4: [6],
            5: [6],
            6: [10],
            7: [9],
            8: [9],
            9: [10],
            10: [12]
        })
    self.complex_graph.convert_to_numpy_backend()
def test_find_max_path_through_subgraph_two_node_graph(self):
    graph = Graph({1: Block(10), 2: Block(10)}, {1: [2]})
    peak = ConnectedAreas(graph, {2: [0, 4], 1: [5, 10]})
    binary_peak = BinaryContinousAreas.from_old_areas(peak)
    qvalues = DensePileup.from_base_value(graph, 10)
    print("q values")
    print(qvalues)
    print(qvalues.data._values)
    scored_peak = ScoredPeak.from_peak_and_pileup(binary_peak, qvalues)
    print(scored_peak)
    max_path = scored_peak.get_max_path()
    self.assertEqual(max_path, Interval(5, 4, [1, 2]))
def run_predict_path_single_chromosome(alignment_file_name, chromosome,
                                       graph_dir, linear_ref_bonus,
                                       out_file_base_name, max_nodes_to_traverse):
    sequence_graph = SequenceGraph.from_file(graph_dir + chromosome + ".nobg.sequences")
    graph = Graph.from_file(graph_dir + chromosome + ".nobg")
    linear_path = NumpyIndexedInterval.from_file(
        graph_dir + "/%s_linear_pathv2.interval" % chromosome)
    PathPredicter(alignment_file_name, graph, sequence_graph, chromosome,
                  linear_path, out_file_base_name,
                  linear_ref_bonus=linear_ref_bonus,
                  max_nodes_to_traverse=max_nodes_to_traverse)
def read_graphs(graph_dir, chromosomes):
    logging.info("Reading graphs")
    graphs = {}
    sequence_graphs = {}
    linear_ref_nodes = {}
    for chromosome in chromosomes:
        chromosome_name = chromosome
        if chromosome == "X":
            chromosome_name = "23"

        logging.info("Reading graphs for chromosome %s" % chromosome)
        graphs[chromosome_name] = Graph.from_file(graph_dir + chromosome + ".nobg")
        sequence_graphs[chromosome_name] = SequenceGraph.from_file(
            graph_dir + chromosome + ".nobg.sequencesv2")
        linear_ref_nodes[chromosome_name] = None
        # NumpyIndexedInterval.from_file(graph_dir + chromosome + "_linear_pathv2.interval").nodes_in_interval()

    return graphs, sequence_graphs, linear_ref_nodes
def setUp(self):
    self.graph = Graph({i: Block(10) for i in range(1, 5)},
                       {1: [2, 3], 2: [4], 3: [4]})
    self.index = GraphIndex({
        1: [(2, 10), (3, 10), (4, 20)],
        2: [(4, 10)],
        3: [(4, 10)],
        4: [],
        -1: [],
        -2: [(-1, 10)],
        -3: [(-1, 10)],
        -4: [(-2, 10), (-3, 10), (-1, 20)]
    })
    self.extender = GraphExtender(self.index)
def _coordinate(self, rp):
    """
    Returns the hierarchical and sequential coordinates of a region path
    """
    length = self.graph.blocks[rp].length()

    # Translate rp back to get GRCh38 hierarchical coordinates
    from offsetbasedgraph import Interval, Graph
    hier_id = str(rp)
    hier_of = 0
    origin = Graph.block_origin(rp)
    if origin == "main" or origin == "merged":
        dist_back = self._distance_to_start(rp)
        hier_id = self.start_position.region_path_id
        hier_of = dist_back + self.start_position.offset

    return (str(rp), "0", str(hier_id), str(hier_of), str(length))
def test_find_max_path_through_subgraph_with_illegal_paths(self):
    graph = Graph(
        {1: Block(10), 2: Block(10), 3: Block(10), 4: Block(10)},
        {
            1: [2, 3],
            2: [4],
            -4: [-3]  # Making 3 => 4 not an allowed path
        })
    peak = ConnectedAreas(graph, {2: [0, 10], 3: [0, 10], 1: [5, 10], 4: [0, 8]})
    binary_peak = BinaryContinousAreas.from_old_areas(peak)
    qvalues = DensePileup.from_intervals(
        graph,
        [
            Interval(0, 10, [3]),  # Higher value on 3 than 2
            Interval(0, 10, [3]),
            Interval(0, 10, [4]),  # Highest value if ending on 4
            Interval(0, 10, [4]),
            Interval(0, 10, [1]),  # Highest value if including 1
            Interval(0, 10, [1]),  # Highest value if including 1
            Interval(0, 10, [1, 2, 4])
        ])
    scored_peak = ScoredPeak.from_peak_and_pileup(binary_peak, qvalues)
    max_path = scored_peak.get_max_path()
    print(max_path)
    self.assertEqual(max_path, Interval(5, 8, [1, 2, 4]))
def _create_data(self):
    node_offset = 1
    for chrom_number, chromosome in enumerate(self.chromosomes):
        graph = Graph(
            {i + node_offset: Block(10) for i in range(0, 3)},
            {i + node_offset: [i + 1 + node_offset] for i in range(0, 2)})
        linear_map = LinearMap.from_graph(graph)
        linear_map_file_name = "linear_map_%s.npz" % chromosome
        linear_map.to_file(linear_map_file_name)
        self.linear_maps.append(linear_map_file_name)
        self.sequence_retrievers.append(
            SequenceRetriever({i + node_offset: "A" * 10 for i in range(0, 3)}))
        self._create_reads(chrom_number, chromosome, graph)
        node_offset += 3
        graph.convert_to_numpy_backend()
        SequenceGraph.create_empty_from_ob_graph(graph).to_file(
            chromosome + ".nobg.sequences")
        graph.to_file(chromosome + ".nobg")
def test_convert_to_approx_linear_peaks(self):
    graph = Graph({i: Block(3) for i in range(1, 10)},
                  {
                      1: [2],
                      2: [3],
                      3: [4],
                      4: [5],
                      5: [6],
                      6: [7, 8],
                      7: [9],
                      9: [9]
                  })
    graph.convert_to_numpy_backend()
    linear_interval = Interval(0, 3, [2, 4, 8, 9], graph)
    linear_interval = linear_interval.to_numpy_indexed_interval()
    peaks = PeakCollection([Peak(2, 2, [2, 3, 4]), Peak(1, 1, [3, 4, 5])])
    linear_peaks = peaks.to_approx_linear_peaks(linear_interval, "chr4")
    linear_peaks = linear_peaks.peaks
    print(linear_peaks)
    self.assertEqual(linear_peaks[0], NonGraphPeak("chr4", 2, 5))
    self.assertEqual(linear_peaks[1], NonGraphPeak("chr4", 3, 3))
def test_simple(self):
    graph = Graph(
        {i: Block(3) for i in range(1, 5)},
        {1: [2, 3], 2: [4], 3: [4]})
    graph.convert_to_numpy_backend()
    intervals = IntervalCollection([
        Interval(0, 3, [1, 3])
    ])
    haplotyper = HaploTyper(graph, intervals)
    haplotyper.build()
    max_interval = haplotyper.get_maximum_interval_through_graph()
    self.assertEqual(
        max_interval,
        Interval(0, 3, [1, 3, 4])
    )
def make_haplotype_paths(graph_file_name, linear_ref_path_file_name,
                         haplotype0_file_name, haplotype1_file_name,
                         out_base_name, chromosome):
    # Make a linear reference fasta and interval, and haplotype fastas and intervals
    chrom = chromosome
    graph = Graph.from_file(graph_file_name)
    sequence_graph = SequenceGraph.from_file(graph_file_name + ".sequences")
    linear_ref = IntervalCollection.from_file(linear_ref_path_file_name, text_file=True)
    linear_ref = list(linear_ref.intervals)[0]
    linear_ref_nodes = set(linear_ref.region_paths)

    # Write linear ref fasta to file
    linear_ref_seq = sequence_graph.get_interval_sequence(linear_ref)
    out_file = open("linear_ref_" + chrom + ".fasta", "w")
    out_file.writelines([">%s\n" % chrom])
    out_file.writelines([linear_ref_seq + "\n"])
    out_file.close()
    logging.info("Wrote linear ref sequence. N nodes in linear ref: %d" % len(linear_ref_nodes))

    haplotype_nodes = [set(), set()]  # For haplotype 0 and 1
    for haplotype in [0, 1]:
        haplotype_file_name = haplotype0_file_name
        if haplotype == 1:
            haplotype_file_name = haplotype1_file_name

        intervals = vg_json_file_to_intervals(haplotype_file_name, graph)
        for interval in intervals:
            for node in interval.region_paths:
                haplotype_nodes[haplotype].add(node)

    logging.info("N nodes in haplotype 0: %d" % len(haplotype_nodes[0]))
    logging.info("N nodes in haplotype 0 that are also in linear ref: %d" %
                 len(haplotype_nodes[0].intersection(linear_ref_nodes)))
    logging.info("N nodes in haplotype 1: %d" % len(haplotype_nodes[1]))

    # Traverse graph to get full correct haplotype intervals
    first_nodes = graph.get_first_blocks()
    assert len(first_nodes) == 1
    logging.info("N nodes in graph: %d" % len(graph.blocks))

    for haplotype in [0, 1]:
        logging.info("Traversing haplotype %d" % haplotype)
        nodes = []
        node = first_nodes[0]
        nodes_in_haplotype = haplotype_nodes[haplotype]
        nodes_in_haplotype = set(range(0, max(linear_ref_nodes))).difference(linear_ref_nodes)
        logging.info("There are %d haplotype nodes" % len(nodes_in_haplotype))
        assert len(nodes_in_haplotype) > 0, \
            "There are no haplotype nodes. Check that haplotype json files are not empty"

        n_haplotype_nodes = 0
        i = 0
        while True:
            nodes.append(node)
            if i % 50000 == 0:
                logging.info("#%d nodes traversed. On node %d" % (i, node))
            i += 1

            next_nodes = set(graph.adj_list[node])
            if len(next_nodes) == 0:
                logging.info("Reached end node %d with 0 edges" % node)
                break

            next_on_haplotype = next_nodes.intersection(nodes_in_haplotype)
            if len(next_on_haplotype) == 1:
                n_haplotype_nodes += 1
                next_node = list(next_on_haplotype)[0]
                assert next_node != node
                node = next_node
            elif len(next_on_haplotype) == 0:
                logging.debug("No new haplotype node from %d. Will follow reference" % node)
                # Choose the reference node with the lowest id to avoid a deletion
                node = min(list(next_nodes.intersection(linear_ref_nodes)))
            else:
                # logging.warning("There is a deletion from node %d. Choosing lowest node id as next to avoid deletion." % node)
                # More than one next node is on the haplotype.
                # Choose the one with the lowest id to avoid taking a deletion.
                node = min(list(next_on_haplotype))

        logging.info("Found %d nodes. %d on haplotype" % (len(nodes), n_haplotype_nodes))
        haplotype_interval = Interval(0, graph.blocks[nodes[-1]].length(), nodes, graph)
        print("Path length: %d" % haplotype_interval.length())

        file_base_name = out_base_name + "_" + str(haplotype)
        IntervalCollection([haplotype_interval]).to_file(
            file_base_name + ".intervalcollection", text_file=True)

        sequence = sequence_graph.get_interval_sequence(haplotype_interval)
        out_file = open(file_base_name + ".fasta", "w")
        out_file.writelines([">%s\n" % chrom])
        out_file.writelines([sequence + "\n"])
        out_file.close()
        logging.info("Wrote fasta sequence to %s" % file_base_name + ".fasta")
def count_variants_in_graph(graph, linear_path):
    reference_nodes = linear_path.nodes_in_interval()
    n_variants = 0
    i = 0
    for node in graph.blocks:
        if i % 1000000 == 0:
            print("Node #%d" % i)
        i += 1

        if node not in reference_nodes:
            continue

        n_variants += max(0, len(graph.adj_list[node]) - 1)

    print("Variants: %d" % n_variants)
    return n_variants


if __name__ == "__main__":
    n_variants = 0
    for chromosome in sys.argv[2].split(","):
        print("Chromosome %s" % chromosome)
        graph = Graph.from_file(sys.argv[1] + "/" + chromosome + "_pruned.nobg")
        linear_path = NumpyIndexedInterval.from_file(
            sys.argv[1] + "/" + chromosome + "_linear_pathv2.interval")
        n_variants += count_variants_in_graph(graph, linear_path)
    print("Total: %d" % n_variants)
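# Illustrative sketch, not part of the original module: using only the Graph/Block
# API shown in the tests above, the variant count on the reference path is the
# number of "extra" outgoing edges summed over the reference nodes, so a single
# bubble contributes exactly one variant.
def _toy_variant_count_example():
    graph = Graph({1: Block(10), 2: Block(1), 3: Block(1), 4: Block(10)},
                  {1: [2, 3], 2: [4], 3: [4]})
    reference_nodes = {1, 2, 4}  # nodes assumed to lie on the linear reference path
    n_variants = 0
    for node in reference_nodes:
        out_edges = graph.adj_list[node] if node in graph.adj_list else []
        n_variants += max(0, len(out_edges) - 1)
    assert n_variants == 1  # the bubble opened at node 1 counts as one variant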