def fill_small_wholes(self, max_size, write_holes_to_file=None, touched_nodes=None):
    """Fill the holes reported by ``HolesCleaner`` and then sanitize.

    Every (start, end) hole returned by the cleaner is set to ``True`` in
    ``self.data`` for its node.  When ``touched_nodes`` is given, holes on
    nodes outside that container are skipped.

    :param max_size: Passed to ``HolesCleaner``; each filled hole is asserted
        to be no longer than this.
    :param write_holes_to_file: Optional file name; when not ``None`` the
        filled holes are written there as a text interval collection.
    :param touched_nodes: Optional container of node ids to restrict to.
    """
    hole_areas = HolesCleaner(self, max_size, touched_nodes=touched_nodes).run()
    restrict_to_touched = touched_nodes is not None

    filled_count = 0
    filled_holes = []
    for node_id in hole_areas.areas:
        if restrict_to_touched and node_id not in touched_nodes:
            continue

        hole_starts = hole_areas.get_starts(node_id)
        hole_ends = hole_areas.get_ends(node_id)
        for hole_start, hole_end in zip(hole_starts, hole_ends):
            self.data[node_id].set_interval_value(hole_start, hole_end, True)
            logging.debug("Filling hole %s, %d, %d" % (node_id, hole_start, hole_end))
            filled_count += 1
            assert hole_end - hole_start <= max_size
            filled_holes.append(Interval(hole_start, hole_end, [node_id]))

    logging.info("Filled %d small holes (splitted into holes per node)" % filled_count)

    if write_holes_to_file is not None:
        IntervalCollection(filled_holes).to_file(write_holes_to_file, text_file=True)

    self.sanitize()
def get_intersecting_intervals(args):
    """Write the intervals of file 1 that intersect any interval of file 2.

    Reads two text interval collections (``args.file1`` and ``args.file2``,
    both resolved against ``args.graph``) and writes every interval from the
    first that intersects at least one interval from the second to
    ``args.out_file_name``.

    Each matching interval is written exactly once.  The previous version
    ended the inner loop body with ``continue`` (a no-op there), so an
    interval intersecting several others was written several times.

    :param args: Namespace with ``file1``, ``file2``, ``graph`` and
        ``out_file_name`` attributes.
    """
    from offsetbasedgraph import IntervalCollection

    intervals1 = IntervalCollection.from_file(args.file1, text_file=True,
                                              graph=args.graph)
    intervals2 = IntervalCollection.from_file(args.file2, text_file=True,
                                              graph=args.graph)

    # The second collection is scanned once per interval of the first, so it
    # is materialised up front; `.intervals` may be a one-shot iterator.
    candidates = list(intervals2.intervals)

    out = []
    for interval1 in intervals1.intervals:
        for interval2 in candidates:
            if interval1.intersects(interval2):
                out.append(interval1)
                logging.info("Found match between %s and %s" % (interval1, interval2))
                # One match is enough; stop so interval1 is not added again.
                break

    IntervalCollection(out).to_file(args.out_file_name, text_file=True)
    logging.info("Wrote intersecting intervals to %s" % args.out_file_name)
def create_linear_peaks_from_bed(linear_sequence_fasta_file, peaks_bed_file,
                                 obg_graph_file_name, vg_graph_file_name,
                                 start_node, region):
    """Locate a linear sequence in the graph and project BED peaks onto it.

    The content of ``linear_sequence_fasta_file`` is searched for in the graph
    starting at ``start_node``.  The interval found is written to
    ``linear_path.intervalcollection``, and the peaks created from
    ``peaks_bed_file`` along that interval are written to
    ``linear_peaks.intervalcollection``.

    :param linear_sequence_fasta_file: File whose whole content is used as
        the search sequence.
    :param peaks_bed_file: BED file with the linear peaks.
    :param obg_graph_file_name: Offset based graph file.
    :param vg_graph_file_name: vg graph file used to build the sequence
        retriever.
    :param start_node: Node id the sequence search starts from.
    :param region: Object with ``start`` and ``end`` attributes.
    """
    ob_graph = obg.GraphWithReversals.from_file(obg_graph_file_name)

    # Context manager so the handle is closed even if reading fails.
    with open(linear_sequence_fasta_file) as sequence_file:
        search_sequence = sequence_file.read()

    sequence_retriever = SequenceRetriever.from_vg_graph(vg_graph_file_name)
    traverser = GraphTraverserUsingSequence(ob_graph, search_sequence,
                                            sequence_retriever)
    traverser.search_from_node(start_node)
    linear_path_interval = traverser.get_interval_found()
    IntervalCollection([linear_path_interval]).to_file(
        "linear_path.intervalcollection", text_file=True)

    print("Length")
    print(linear_path_interval.length())
    print(linear_path_interval.region_paths[0])
    print(linear_path_interval.start_position)
    print(linear_path_interval.end_position)

    linear_peaks = PeakCollection.create_from_linear_intervals_in_bed_file(
        obg_graph_file_name, linear_path_interval, peaks_bed_file,
        region.start, region.end)
    linear_peaks.to_file("linear_peaks.intervalcollection", text_file=True)
def vg_path_to_obg_interval(path_file_name, out_file_name):
    """Convert each vg path in a JSON-lines file to an obg interval file.

    For every alignment read from ``path_file_name`` this writes:

    * ``chr<name>_start_node.txt`` containing the id of the first node of
      the path, and
    * a text interval collection holding the path as a single interval, in a
      file named from ``out_file_name`` with ``_<name>`` inserted before the
      extension.

    Alignments are processed one at a time; nothing is accumulated between
    iterations.

    :param path_file_name: File with one JSON alignment per line.
    :param out_file_name: Template for the per-chromosome output file names.
    """
    json_objects = get_json_lines(path_file_name)
    alignments = (Alignment.from_json(json_object)
                  for json_object in json_objects)

    # NOTE(review): splitting on "." uses the text before the FIRST dot as
    # the base name, so a directory part containing a dot (e.g. "./out.txt")
    # gives an unexpected name. Kept as-is for compatibility.
    name_parts = out_file_name.split(".")
    base_name, extension = name_parts[0], name_parts[-1]

    for alignment in alignments:
        path = alignment.path
        interval = path.to_obg()
        chrom = path.name
        start_node = path.mappings[0].node_id()
        logging.info("Processing chromosome %s with start node %d"
                     % (chrom, start_node))

        with open("chr%s_start_node.txt" % chrom, "w") as f:
            f.write(str(start_node))

        file_name = base_name + "_" + chrom + "." + extension
        IntervalCollection([interval]).to_file(file_name, text_file=True)
        # The value logged is the number of region paths (nodes), not files.
        logging.info("Number of nodes in interval for chrom %s: %d"
                     % (chrom, len(interval.region_paths)))
        logging.info("Wrote path as obg interval to %s" % file_name)
def call_peaks(self):
    """Run the peak caller on the simulated data and keep it on ``self``.

    Builds the experiment info and a linear snarl map (written to
    ``simulated_snarl_map.tmp``), injects the pre-computed sample pileup into
    a ``CallPeaks`` instance, runs the remaining steps, and writes the max
    path sequences to ``simulated_peak_sequences.fasta``.  The caller is
    stored as ``self.caller``.
    """
    total_length = sum(block.length() for block in self.graph.blocks.values())
    info = ExperimentInfo(total_length, 50, 20)
    info.n_sample_reads = self.n_sample_reads
    info.n_control_reads = self.n_control_reads

    # Snarl graph -> linear map, stored on disk and handed over by file name.
    graph_copy = self.graph.copy()
    first_free_id = self.graph.max_block_id() + 1
    builder = SnarlGraphBuilder(graph_copy, self.snarls,
                                id_counter=first_free_id)
    snarl_graph = builder.build_snarl_graphs()
    snarl_map = LinearSnarlMap(snarl_graph, self.graph)
    snarl_map.to_file("simulated_snarl_map.tmp")

    control = IntervalCollection(self.control_reads)
    peak_caller = CallPeaks(self.graph,
                            sample_intervals="dummy",
                            control_intervals=control,
                            experiment_info=info,
                            has_control=self.with_control,
                            linear_map="simulated_snarl_map.tmp")

    # The sample pileup already exists, so it is set directly on the caller.
    peak_caller._sample_pileup = self.sample_pileup
    self.sample_pileup.to_bed_graph("sample.bdg")

    peak_caller.create_control(True)
    peak_caller.scale_tracks(True)
    peak_caller.get_score()
    peak_caller.call_peaks()

    retriever = DummySequenceRetriever()
    peak_caller.save_max_path_sequences_to_fasta_file(
        "simulated_peak_sequences.fasta", retriever)
    self.caller = peak_caller
def assert_final_peaks_equals_input_peaks(self):
    """Assert the peaks in ``test_max_paths.intervalcollection`` match ``self.peaks``.

    Every input peak must be among the final peaks, and both collections
    must have the same number of entries.
    """
    final_peaks = IntervalCollection.create_list_from_file(
        "test_max_paths.intervalcollection")

    for expected_peak in self.peaks:
        is_present = expected_peak in final_peaks.intervals
        failure_text = "Peak %s not in final peaks. Final peaks: \n%s" % (
            expected_peak, final_peaks.intervals)
        self.assertTrue(is_present, failure_text)

    self.assertEqual(len(self.peaks), len(final_peaks.intervals))
def from_file(cls, file_name, graph):
    """Build an instance from a pickled node dict and its interval file.

    :param file_name: Path of the pickle file; the intervals are read from
        ``file_name + ".intervals"``.
    :param graph: Graph passed on to the interval reader and the constructor.
    :return: ``cls(node_dict, graph, intervals)`` with the intervals as a list.
    """
    logging.info("Reading from file")

    logging.info("Reading dict structure")
    with open(file_name, "rb") as pickle_file:
        node_dict = pickle.load(pickle_file)

    logging.info("Reading intervals")
    interval_file_name = file_name + ".intervals"
    stored_intervals = IntervalCollection.from_file(interval_file_name,
                                                    graph=graph)
    interval_list = list(stored_intervals)
    return cls(node_dict, graph, interval_list)
def _read_alignments(self):
    """Set ``self.alignments`` from ``self.alignment_file_name``.

    The reader is chosen by file extension:

    * ``.json``: intervals from ``vg_json_file_to_interval_collection``.
    * ``.graphnodes``: one ``Interval(0, 1, nodes)`` per line, where the
      nodes are the comma separated integers in the second
      whitespace-separated column.
    * ``.graphalignments``: ``Interval.from_file_line`` applied to the second
      tab-separated column; lines where that column is ``"."`` are skipped.
    * anything else: intervals from ``IntervalCollection.from_file``.

    For the two line based formats ``self.alignments`` is a lazy generator.
    The file is opened immediately, so a missing file still fails here, and
    it is closed once the generator is exhausted or discarded.
    """
    def lines_then_close(handle):
        # Yield the lines of an already opened file, then always close it.
        try:
            for line in handle:
                yield line
        finally:
            handle.close()

    file_name = self.alignment_file_name
    if file_name.endswith(".json"):
        self.alignments = vg_json_file_to_interval_collection(file_name).intervals
    elif file_name.endswith(".graphnodes"):
        lines = lines_then_close(open(file_name))
        self.alignments = (
            Interval(0, 1, [int(n) for n in line.strip().split()[1].split(",")])
            for line in lines)
    elif file_name.endswith(".graphalignments"):
        lines = lines_then_close(open(file_name))
        # Parse the column once per line instead of once for the filter and
        # once more for the conversion.
        columns = (line.strip().split("\t")[1] for line in lines)
        self.alignments = (Interval.from_file_line(column)
                           for column in columns if column != ".")
    else:
        self.alignments = IntervalCollection.from_file(file_name).intervals
def test_count_unique_reads(self):
    """Four reads in a single collection are expected to count as 3 unique."""
    read_intervals = [
        Interval(4, 10, [1, 2, 3]),
        Interval(4, 5, [1]),
        Interval(5, 5, [1]),
        Interval(6, 2, [-3, -2, -1]),
    ]
    collections = [IntervalCollection(read_intervals)]

    n_unique = MultipleGraphsCallpeaks.count_number_of_unique_reads(collections)

    self.assertEqual(n_unique, 3)
def run_predict_path(args):
    """Predict a path per chromosome in parallel, then merge and index.

    Steps:

    1. Start one process per chromosome running
       ``run_predict_path_single_chromosome`` and wait for all of them.
    2. Concatenate the per-chromosome ``<out>_<chrom>.fasta`` files into
       ``<out>.fa``.
    3. Convert each ``<out>_<chrom>.intervalcollection`` (which must hold
       exactly one interval) to an indexed interval saved with the suffix
       ``.indexed``.
    4. Run ``bwa index`` on ``<out>.fa`` unless ``args.skip_bwa_index`` is set.

    :param args: Namespace with ``chromosomes`` (comma separated),
        ``alignments``, ``data_dir``, ``linear_ref_bonus``, ``out_file_name``,
        ``max_nodes_to_traverse`` and ``skip_bwa_index``.
    :raises SystemExit: With a non-zero status if ``args.alignments`` is not
        an existing file.
    """
    chromosomes = args.chromosomes.split(",")
    processes = []

    if not os.path.isfile(args.alignments):
        logging.error("Input alignments file %s does not exist" % args.alignments)
        # Non-zero status so callers and shells can see this was a failure.
        sys.exit(1)

    for chromosome in chromosomes:
        logging.info("Starting process for chromosome %s " % chromosome)
        process = Process(target=run_predict_path_single_chromosome,
                          args=(args.alignments, chromosome, args.data_dir,
                                args.linear_ref_bonus, args.out_file_name,
                                args.max_nodes_to_traverse))
        process.start()
        processes.append(process)

    for process in processes:
        process.join()

    # Merge all fasta files that were produced. The merged file must be
    # closed (and so flushed) before bwa index reads it below.
    merged_fasta_name = args.out_file_name + ".fa"
    logging.info("Merging fasta files")
    with open(merged_fasta_name, "w") as out_fasta:
        for chromosome in tqdm(chromosomes):
            with open(args.out_file_name + "_" + chromosome + ".fasta") as f:
                out_fasta.write(f.read())
    logging.info("Wrote resulting linear reference to %s" % merged_fasta_name)

    # Create indexed intervals for each interval file that was produced
    logging.info("Creating indexed interval for all chromosomes")
    for chromosome in chromosomes:
        file_name = args.out_file_name + "_" + chromosome + ".intervalcollection"
        graph = Graph.from_file(args.data_dir + chromosome + ".nobg")
        intervals = IntervalCollection.from_file(file_name, text_file=True,
                                                 graph=graph)
        intervals = list(intervals.intervals)
        assert len(intervals) == 1, "Only a single interval in file is supported"
        indexed = intervals[0].to_numpy_indexed_interval()
        indexed_file_name = file_name + ".indexed"
        indexed.to_file(indexed_file_name)
        logging.info("Wrote indexed interval to file %s" % indexed_file_name)

    if not args.skip_bwa_index:
        logging.info("Running bwa index")
        run_bwa_index(merged_fasta_name)
    else:
        logging.info("Not creating bwa index")
def test_filter_duplicates(self):
    """``UniqueIntervals`` drops the repeated interval and keeps input order."""
    first = Interval(0, 10, [1, 2, 3])
    second = Interval(1, 10, [1, 2, 3])
    repeat_of_first = Interval(0, 10, [1, 2, 3])
    originals = [first, second, repeat_of_first]

    unique = list(UniqueIntervals(IntervalCollection(originals)))

    self.assertEqual(len(unique), len(originals) - 1)
    self.assertEqual(unique[0], originals[0])
    self.assertEqual(unique[1], originals[1])
def check_similarity(self, analyse_first_n_peaks=10000000):
    """Match peaks of set 1 against set 2 and record the outcome.

    The peaks of ``self.peaks1`` are visited by descending score (at most
    ``analyse_first_n_peaks`` of them).  Each one is looked up in
    ``self.peaks2``; the first touching peak returned is marked as visited
    and the pair is recorded as matching.  Peaks of set 2 that were never
    marked are recorded as not matching.  Counts are stored on
    ``self.results`` and the unmatched peaks of each set are written to
    ``gpc_not_matching_macs_chr<chrom>.intervals`` and
    ``macs_not_matching_gpc_chr<chrom>.intervals``.

    :param analyse_first_n_peaks: Upper bound on the number of set-1 peaks
        that are checked.
    """
    print("Number of peaks in main set: %d" % len(self.peaks1.intervals))
    self.results.tot_peaks1 = len(self.peaks1.intervals)
    self.results.tot_peaks2 = len(self.peaks2.intervals)

    matched_ids = set()
    by_score = sorted(self.peaks1, key=lambda p: p.score, reverse=True)
    for n_checked, peak in enumerate(by_score[:analyse_first_n_peaks], start=1):
        assert peak.unique_id is not None
        if n_checked % 500 == 0:
            logging.info("Checked %d peaks" % n_checked)

        touching = self.peaks2.approx_contains_part_of_interval(peak, matched_ids)
        if not touching:
            self.peaks1_not_in_peaks2.append(peak)
            continue

        partner = touching[0]
        matched_ids.add(partner.unique_id)
        self.peaks2_in_peaks1.append(partner)
        self.peaks1_in_peaks2.append(peak)

    for other_peak in self.peaks2:
        if other_peak.unique_id not in matched_ids:
            self.peaks2_not_in_peaks1.append(other_peak)

    self.results.peaks1_in_peaks2 = len(self.peaks1_in_peaks2)
    self.results.peaks2_in_peaks1 = len(self.peaks2_in_peaks1)
    self.results.peaks1_not_in_peaks2 = len(self.peaks1_not_in_peaks2)
    self.results.peaks2_not_in_peaks1 = len(self.peaks2_not_in_peaks1)

    chromosome = "unknown" if self.chromosome is None else self.chromosome

    IntervalCollection(self.peaks1_not_in_peaks2).to_file(
        "gpc_not_matching_macs_chr%s.intervals" % chromosome, text_file=True)
    logging.info(
        "Wrote peaks not matching to file gpc_not_matching_macs_chr%s.intervals"
        % chromosome)

    IntervalCollection(self.peaks2_not_in_peaks1).to_file(
        "macs_not_matching_gpc_chr%s.intervals" % chromosome, text_file=True)
    logging.info(
        "Wrote peaks not matching to file macs_not_matching_gpc_chr%s.intervals"
        % chromosome)
def __init__(self, graph, sequence_retriever, linear_path_file_name,
             peaks1_file_name, peaks2_file_name):
    """Load the two peak sets and, optionally, the linear path.

    :param graph: Graph the peaks and the linear path refer to.
    :param sequence_retriever: Stored as ``self.sequence_retriever``.
    :param linear_path_file_name: Interval collection file whose first
        interval becomes ``self.linear_path``, or ``None`` to skip it.
    :param peaks1_file_name: File with the first peak set.
    :param peaks2_file_name: File with the second peak set.
    """
    self.graph = graph
    self.sequence_retriever = sequence_retriever
    self.peaks1 = PeakCollection.create_list_from_file(peaks1_file_name,
                                                       graph=graph)
    self.peaks2 = PeakCollection.create_list_from_file(peaks2_file_name,
                                                       graph=graph)
    print("Number of intervals in set 1/2: %d / %d"
          % (len(self.peaks1.intervals), len(self.peaks2.intervals)))

    # Always define the attribute so later reads see None instead of raising
    # AttributeError when no linear path file was supplied.
    self.linear_path = None
    if linear_path_file_name is not None:
        self.linear_path = IntervalCollection.create_list_from_file(
            linear_path_file_name, self.graph).intervals[0]
def find_linear_path_through_chromosome(chromosome, chromend, fasta_file_name,
                                        ob_graph_file_name, vg_graph_file_name):
    """Find the graph path spelling a chromosome's sequence and save it.

    The lower-cased chromosome sequence from ``fasta_file_name`` is searched
    for in the graph, starting at the graph's single start node.  The
    interval found is written to ``22_path.intervalcollection``.

    :param chromosome: Name of the record to read from the fasta file.
    :param chromend: Currently unused.
    :param fasta_file_name: Fasta file holding the chromosome sequence.
    :param ob_graph_file_name: Numpy-backed offset based graph file.
    :param vg_graph_file_name: vg JSON graph used for the sequence retriever.
    """
    genome = Fasta(fasta_file_name)
    # NOTE(review): the slice end and the output file name below are
    # hard-coded while `chromend` is ignored; presumably this was written for
    # one specific chromosome -- confirm before using it for others.
    seq = str(genome[chromosome][0:50818468]).lower()

    logging.info("Creating sequence retriever")
    sequence_retriever = SequenceRetriever.from_vg_json_graph(
        vg_graph_file_name)
    graph = GraphWithReversals.from_numpy_file(ob_graph_file_name)

    start_nodes = graph.get_first_blocks()
    # Format the count, not the collection: "%d" % start_nodes raised a
    # TypeError that hid the real assertion message.
    assert len(start_nodes) == 1, "Found %d start nodes" % len(start_nodes)
    start_node = start_nodes[0]

    traverser = GraphTraverserUsingSequence(graph, seq, sequence_retriever)
    traverser.search_from_node(start_node)
    path = traverser.get_interval_found()

    # IntervalCollection is given a list of intervals, as at the other call
    # sites in this file that store a single interval.
    IntervalCollection([path]).to_file("22_path.intervalcollection",
                                       text_file=True)
    logging.info("Done")
def test_complex_graph(self):
    """The maximum interval through the complex graph is 1-2-7-9-10-12."""
    reads = [
        Interval(0, 3, [1, 3, 4, 6, 10]),
        Interval(1, 2, [2]),
        Interval(2, 3, [2]),
        Interval(0, 3, [7, 9]),
    ]
    typer = HaploTyper(self.complex_graph, IntervalCollection(reads))
    typer.build()

    found = typer.get_maximum_interval_through_graph()

    expected = Interval(0, 3, [1, 2, 7, 9, 10, 12])
    self.assertEqual(found, expected)
def test_all_steps(self):
    """Run graph creation, linear map creation and peak calling end to end."""
    graph_file = "tests/testgraph.obg"
    sample_file = "tests/sample.intervalcollection"

    create_graph_args = [
        "create_ob_graph", "-o", graph_file, "tests/vg_test_graph.json",
    ]
    run_argument_parser(create_graph_args)

    create_map_args = ['create_linear_map', "--graph", graph_file]
    run_argument_parser(create_map_args)

    # A single sample read is enough to drive the peak caller.
    sample_reads = IntervalCollection([Interval(1, 1, [1, 2])])
    sample_reads.to_file(sample_file)

    callpeaks_args = [
        "callpeaks",
        "--graph", graph_file,
        "-s", sample_file,
        "-n", "tests/test_experiment_",
        "-f", "10",
        "-r", "7",
    ]
    run_argument_parser(callpeaks_args)
def analyse_pileups_on_peaks(ob_graph, pileups_file_names,
                             peak_intervals_file_name):
    """Print, for every peak, the summed value of each pileup over its nodes.

    :param ob_graph: Graph the pileups are defined on.
    :param pileups_file_names: Mapping from a display name to a bedgraph file
        that is loaded with ``SparsePileup.from_bed_graph``.
    :param peak_intervals_file_name: Text interval collection with the peaks.
    """
    print("Analysing peaks")
    pileups = {
        name: SparsePileup.from_bed_graph(ob_graph, pileup)
        for name, pileup in pileups_file_names.items()
    }
    peaks = IntervalCollection.from_file(peak_intervals_file_name,
                                         text_file=True)

    for peak in peaks:
        print()
        print("Peak %s" % peak)
        # The unused `rp = peak.region_paths[0]` lookup was removed: it had
        # no effect and raised IndexError for a peak without region paths.
        for name, pileup in pileups.items():
            pileup_sum = sum(pileup.data[rp].sum() for rp in peak.region_paths)
            print("Pileup %s: %d" % (name, pileup_sum))
def make_haplotype_fasta(chromosome, haplotype, data_dir):
    """Write the sequence of a stored haplotype interval as a fasta file.

    Reads the first interval of
    ``<data_dir>haplotype_<chromosome>__<haplotype>.intervalcollection``,
    fetches its sequence from
    ``<data_dir>giab_chr<chromosome>.nobg.sequences`` and writes it as a
    single record named ``seq`` to
    ``<data_dir>giab_chr<chromosome>_haplotype<haplotype>.fasta``.

    :param chromosome: Chromosome name (string) used in the file names.
    :param haplotype: Haplotype label (string) used in the file names.
    :param data_dir: Directory prefix, concatenated as-is with the file names.
    """
    sequence_graph = SequenceGraph.from_file(
        data_dir + "giab_chr" + chromosome + ".nobg.sequences")

    print("Getting interval")
    interval_file_name = (data_dir + "haplotype_" + chromosome + "__"
                          + haplotype + ".intervalcollection")
    interval = list(IntervalCollection.from_file(
        interval_file_name, text_file=True).intervals)[0]

    print("Getting sequence")
    sequence = sequence_graph.get_interval_sequence(interval)

    print("Writing to file")
    fasta_file_name = (data_dir + "giab_chr" + chromosome + "_haplotype"
                       + haplotype + ".fasta")
    # The context manager closes the file exactly once, also on write errors.
    with open(fasta_file_name, "w") as f:
        f.write(">seq\n%s\n" % sequence)
def test_simple(self):
    """One read on nodes 1 and 3 of a diamond graph gives the path 1-3-4."""
    blocks = {node_id: Block(3) for node_id in range(1, 5)}
    edges = {1: [2, 3], 2: [4], 3: [4]}
    graph = Graph(blocks, edges)
    graph.convert_to_numpy_backend()

    reads = IntervalCollection([Interval(0, 3, [1, 3])])
    typer = HaploTyper(graph, reads)
    typer.build()

    found = typer.get_maximum_interval_through_graph()

    self.assertEqual(found, Interval(0, 3, [1, 3, 4]))
def vg_alignments_to_linear():
    """Project stored graph reads onto the linear MHC path and write a BED.

    Reads intervals from ``graph_reads_on_linear2.intervals``, converts each
    read's start and end position to an offset on the linear path (shifted by
    ``MHC_REGION.start``) and writes one ``chr6`` line per read to
    ``graph_reads_on_linear.bed``.  Reads whose first node id is negative are
    reversed first and written with strand ``-``.
    """
    ob_graph = obg.GraphWithReversals.from_file("haplo1kg50-mhc.obg")
    vg_graph = pyvg.vg.Graph.create_from_file("haplo1kg50-mhc.json")
    path = create_linear_path(ob_graph, vg_graph)

    # The analyser is still constructed as before, but its result is not used
    # here: the intervals file read below is presumably the saved output of
    # analyser.to_linear_alignments() -- verify before removing this.
    analyser = AlignmentsAnalyser(
        vg_graph, "ENCFF001HNI_haplo1kg50-mhc_filtered_q50.gam", ob_graph,
        path)  # sample reads

    linear = IntervalCollection.from_file(
        "graph_reads_on_linear2.intervals").intervals

    # The context manager closes the BED file even if a read fails.
    with open("graph_reads_on_linear.bed", "w") as bed_file:
        indexed_path = path.to_indexed_interval()
        for read in linear:
            read.graph = ob_graph
            node_ids = np.array(read.region_paths)
            # A read must lie entirely on one strand.
            assert np.all(node_ids > 0) or np.all(node_ids < 0)

            strand = "+"
            if read.region_paths[0] < 0:
                strand = "-"
                read = read.get_reverse()

            linear_start = MHC_REGION.start + indexed_path.get_offset_at_position(
                read.start_position)
            linear_end = MHC_REGION.start + indexed_path.get_offset_at_position(
                read.end_position)
            bed_file.write("chr6\t%d\t%d\t.\t0\t%s\n"
                           % (linear_start, linear_end, strand))
def to_file(self, file_name):
    """Pickle the node dict to ``file_name`` and store the intervals beside it.

    The intervals are written to ``file_name + ".intervals"``.

    :param file_name: Path of the pickle file to write.
    """
    logging.info("Writing to file")

    with open(file_name, "wb") as pickle_file:
        pickle.dump(self._node_dict, pickle_file)

    interval_file_name = file_name + ".intervals"
    stored_intervals = IntervalCollection(self.intervals)
    stored_intervals.to_file(interval_file_name)
def make_haplotype_paths(graph_file_name, linear_ref_path_file_name,
                         haplotype0_file_name, haplotype1_file_name,
                         out_base_name, chromosome):
    """Write fasta and interval files for the linear reference and two haplotypes.

    Outputs:

    * ``linear_ref_<chromosome>.fasta``: sequence of the first interval in
      ``linear_ref_path_file_name``.
    * ``<out_base_name>_<h>.intervalcollection`` and
      ``<out_base_name>_<h>.fasta`` for ``h`` in 0 and 1: a full path through
      the graph found by walking from the single start node to a node
      without outgoing edges.

    During the walk, a successor from the haplotype node set is preferred;
    when there is none the lowest-id successor on the linear reference is
    taken, and when there are several haplotype successors the lowest id is
    taken.

    :param graph_file_name: Graph file; sequences are read from
        ``graph_file_name + ".sequences"``.
    :param linear_ref_path_file_name: Text interval collection holding the
        linear reference path.
    :param haplotype0_file_name: vg JSON file with haplotype 0 intervals.
    :param haplotype1_file_name: vg JSON file with haplotype 1 intervals.
    :param out_base_name: Prefix of the per-haplotype output files.
    :param chromosome: Chromosome name (string) used as fasta header.
    """
    chrom = chromosome
    graph = Graph.from_file(graph_file_name)
    sequence_graph = SequenceGraph.from_file(graph_file_name + ".sequences")

    linear_ref = IntervalCollection.from_file(linear_ref_path_file_name,
                                              text_file=True)
    linear_ref = list(linear_ref.intervals)[0]
    linear_ref_nodes = set(linear_ref.region_paths)

    # Write linear ref fasta to file
    linear_ref_seq = sequence_graph.get_interval_sequence(linear_ref)
    with open("linear_ref_" + chrom + ".fasta", "w") as out_file:
        out_file.write(">%s\n" % chrom)
        out_file.write(linear_ref_seq + "\n")
    logging.info("Wrote linear ref sequence. N nodes in linear ref: %d"
                 % len(linear_ref_nodes))

    # Collect the nodes seen in each haplotype's json intervals.
    haplotype_nodes = [set(), set()]  # For haplotype 0 and 1
    for haplotype in [0, 1]:
        haplotype_file_name = haplotype0_file_name
        if haplotype == 1:
            haplotype_file_name = haplotype1_file_name

        intervals = vg_json_file_to_intervals(haplotype_file_name, graph)
        for interval in intervals:
            for node in interval.region_paths:
                haplotype_nodes[haplotype].add(node)

    logging.info("N nodes in haplotype 0: %d" % len(haplotype_nodes[0]))
    logging.info("N nodes in haplotype 0 that are also in linear ref: %d"
                 % len(haplotype_nodes[0].intersection(linear_ref_nodes)))
    logging.info("N nodes in haplotype 1: %d" % len(haplotype_nodes[1]))

    # Traverse graph to get full correct haplotype intervals
    first_nodes = graph.get_first_blocks()
    assert len(first_nodes) == 1
    logging.info("N nodes in graph: %d" % len(graph.blocks))

    for haplotype in [0, 1]:
        logging.info("Traversing haplotype %d" % haplotype)
        nodes = []
        node = first_nodes[0]

        nodes_in_haplotype = haplotype_nodes[haplotype]
        # NOTE(review): the assignment above is immediately overridden, so
        # the set actually used is "every id below the largest linear ref
        # node that is not on the linear ref" and is identical for both
        # haplotypes. Kept unchanged; confirm whether this is intended.
        nodes_in_haplotype = set(range(
            0, max(linear_ref_nodes))).difference(linear_ref_nodes)
        logging.info("There are %d haplotype nodes" % len(nodes_in_haplotype))
        assert len(nodes_in_haplotype) > 0, \
            "There are no haplotype nodes. Check that haplotype json files are not empty"

        n_haplotype_nodes = 0
        i = 0
        while True:
            nodes.append(node)
            if i % 50000 == 0:
                logging.info("#%d nodes traversed. On node %d" % (i, node))
            i += 1

            next_nodes = set(graph.adj_list[node])
            if len(next_nodes) == 0:
                logging.info("Reached end node %d with 0 edges" % node)
                break

            next_on_haplotype = next_nodes.intersection(nodes_in_haplotype)
            if len(next_on_haplotype) == 1:
                n_haplotype_nodes += 1
                next_node = list(next_on_haplotype)[0]
                assert next_node != node
                node = next_node
            elif len(next_on_haplotype) == 0:
                logging.debug(
                    "No new haplotype node from %d. Will follow reference"
                    % node)
                # Choose reference with lowest id to avoid deletion.
                # min() raises ValueError if no successor is on the
                # linear reference.
                node = min(list(next_nodes.intersection(linear_ref_nodes)))
            else:
                # More than one next node is on the haplotype. Choose the one
                # with lowest id to avoid taking a deletion.
                node = min(list(next_on_haplotype))

        logging.info("Found %d nodes. %d on haplotype"
                     % (len(nodes), n_haplotype_nodes))

        haplotype_interval = Interval(0, graph.blocks[nodes[-1]].length(),
                                      nodes, graph)
        print("Path length: %d" % haplotype_interval.length())

        file_base_name = out_base_name + "_" + str(haplotype)
        IntervalCollection([haplotype_interval]).to_file(
            file_base_name + ".intervalcollection", text_file=True)

        sequence = sequence_graph.get_interval_sequence(haplotype_interval)
        fasta_file_name = file_base_name + ".fasta"
        with open(fasta_file_name, "w") as out_file:
            out_file.write(">%s\n" % chrom)
            out_file.write(sequence + "\n")
        logging.info("Wrote fasta sequence to %s" % fasta_file_name)
def check_similarity_old(self, analyse_first_n_peaks=10000000):
    """Compare the two peak sets in both directions (older implementation).

    For each direction the main set is visited by descending score (at most
    ``analyse_first_n_peaks`` peaks).  A peak counts as matching when the
    other set reports a touching peak that is not already marked as visited.
    Results are appended to the ``self.peaksX_(not_)in_peaksY`` lists,
    summarised on ``self.results`` and written to
    ``matching_set<i>.intervals`` / ``not_matching_set<i>.intervals``.

    :param analyse_first_n_peaks: Upper bound on the number of peaks checked
        per direction.
    """
    directions = [(self.peaks1, self.peaks2), (self.peaks2, self.peaks1)]
    for set_number, (main_set, other_set) in enumerate(directions, start=1):
        print("\n-- Comparing set %d against set %d ---"
              % (set_number, set_number % 2 + 1))
        print("Number of peaks in main set: %d" % len(main_set.intervals))

        main_is_first = set_number == 1
        if main_is_first:
            self.results.tot_peaks1 = len(main_set.intervals)
        else:
            self.results.tot_peaks2 = len(main_set.intervals)

        matched = []
        unmatched = []
        seen_ids = set()
        ranked = sorted(main_set, key=lambda p: p.score,
                        reverse=True)[:analyse_first_n_peaks]

        for n_checked, peak in enumerate(ranked, start=1):
            assert peak.unique_id is not None
            if n_checked % 500 == 0:
                logging.info("Checked %d peaks" % n_checked)

            touching = other_set.approx_contains_part_of_interval(peak, seen_ids)
            if touching:
                seen_ids.add(touching[0].unique_id)
                if main_is_first:
                    self.peaks1_in_peaks2.append(peak)
                else:
                    self.peaks2_in_peaks1.append(peak)
                matched.append(peak)
            else:
                unmatched.append(peak)
                if main_is_first:
                    self.peaks1_not_in_peaks2.append(peak)
                else:
                    self.peaks2_not_in_peaks1.append(peak)

        n_tot = len(ranked)
        n_similar = len(matched)

        self.results.peaks1_in_peaks2 = len(self.peaks1_in_peaks2)
        self.results.peaks2_in_peaks1 = len(self.peaks2_in_peaks1)
        self.results.peaks1_not_in_peaks2 = len(self.peaks1_not_in_peaks2)
        self.results.peaks2_not_in_peaks1 = len(self.peaks2_not_in_peaks1)

        unmatched_collection = IntervalCollection(unmatched)
        unmatched_collection.to_file(
            "not_matching_set%d.intervals" % set_number, text_file=True)
        logging.info(
            "Wrote peaks not matching to file not_matching_set%d.intervals"
            % set_number)

        matched_collection = IntervalCollection(matched)
        matched_collection.to_file(
            "matching_set%d.intervals" % set_number, text_file=True)

        logging.info("Total peaks in main set: %d" % n_tot)
        logging.info("N similar to peak in other set: %d " % n_similar)
        logging.info("N not matching other set: %d "
                     % len(unmatched_collection.intervals))
def do_asserts(self):
    """Assert every expected peak is in the max paths file of its chromosome.

    For the chromosome at position ``i`` the expected peaks are
    ``self.peaks[i]`` and the file is
    ``multigraphs_<chromosome>_max_paths.intervalcollection``.
    """
    for position, chromosome in enumerate(self.chromosomes):
        max_paths_file = "multigraphs_" + chromosome \
            + "_max_paths.intervalcollection"
        final_peaks = IntervalCollection.create_list_from_file(max_paths_file)

        expected_peaks = self.peaks[position]
        for expected_peak in expected_peaks:
            assert expected_peak in final_peaks
def _create_reads(self, *args):
    """Create the reads via the parent class and save them per chromosome.

    After the parent implementation has run, each entry of
    ``self.sample_reads`` is written to
    ``test_sample_<chrom>.intervalcollection`` as a text interval collection.
    """
    super(TestMultipleGraphsCallPeaksCommandLine, self)._create_reads(*args)

    reads_with_chromosome = zip(self.sample_reads, self.chromosomes)
    for sample_intervals, chrom in reads_with_chromosome:
        out_name = "test_sample_" + chrom + ".intervalcollection"
        collection = IntervalCollection(sample_intervals._intervals)
        collection.to_file(out_name, text_file=True)
def predict_path(self):
    """Walk the graph from its single start node and save the chosen path.

    At every node with several outgoing edges, the successor with the highest
    read count in ``self.edge_counts`` is chosen, where successors on the
    linear reference get ``self.linear_ref_bonus`` added to their count.
    The walk stops at a node without outgoing edges, or once more than
    ``self.max_nodes_to_traverse`` nodes have been visited (when that limit
    is set).

    Outputs:

    * ``<out_file_base_name>_<chromosome>.intervalcollection``: the path as
      a single interval (text format).
    * ``<out_file_base_name>_<chromosome>.fasta``: the sequence of that path.

    Raises ``Exception`` when no successor has any (bonus-adjusted) reads,
    some successor is on the linear reference, and the chosen one is not.

    NOTE(review): the fasta file is opened before the traversal and closed on
    the last line, so it stays open (and empty) if anything in between raises.
    NOTE(review): the original indentation of this method was lost; the
    nesting below is a reconstruction -- verify against version control.
    """
    logging.info("Using linear bonus %d on chromosome %s" % (self.linear_ref_bonus, self.chromosome))
    logging.info("Using linear out base name %s" % self.out_file_base_name)
    out_file = open("%s_%s.fasta" % (self.out_file_base_name, self.chromosome), "w")

    # Traverse
    first_nodes = self.graph.get_first_blocks()
    assert len(first_nodes) == 1
    logging.info("N nodes in graph: %d" % len(self.graph.blocks))
    node = first_nodes[0]
    assert node in self.linear_path_nodes, "Start node should be in linear ref"

    path = []  # Nodes visited, in order
    n_ambigious = 0  # Branch points where the best candidate had 0 reads
    edges_chosen = set()  # Filled below but not read again in this method
    i = 0
    n_special_case = 0  # Only reported in the statistics at the end
    while True:
        # Progress message every million iterations
        if i % 1000000 == 0:
            logging.info("%d nodes in graph traversed on chrom %s" % (i, self.chromosome))
        i += 1

        if self.max_nodes_to_traverse is not None and i > self.max_nodes_to_traverse:
            logging.warning("Stopped traversing before end because max node to traverse was set")
            break

        path.append(node)
        next_nodes = self.graph.adj_list[node]
        if len(next_nodes) == 0:
            logging.info("Done on node %d" % node)
            break
        elif len(next_nodes) == 1:
            # Only one way forward, nothing to decide
            node = next_nodes[0]
        else:
            # Several successors: default to the first one, then look for a
            # better candidate.
            most_reads = 0
            most_reads_node = next_nodes[0]
            has_found_candidate_on_linear_ref = False

            for next_node in next_nodes:
                # Edge counts are keyed by the string "<from>-<to>"
                n_reads = self.edge_counts["%s-%s" % (node, next_node)]
                if next_node in self.linear_path_nodes:
                    n_reads += self.linear_ref_bonus

                # Strictly more reads wins; a tie is only accepted for a
                # successor that is on the linear reference.
                if n_reads > most_reads or (n_reads >= most_reads and next_node in self.linear_path_nodes):
                    if node not in self.linear_path_nodes:
                        n_special_case += 1

                    # If already found something on linear ref, and this does not have more reads or lower id (not insertion), ignore
                    if has_found_candidate_on_linear_ref and n_reads == most_reads and next_node > most_reads_node:
                        continue  # Ignore this alternative

                    most_reads_node = next_node
                    most_reads = n_reads
                    if next_node in self.linear_path_nodes:
                        has_found_candidate_on_linear_ref = True

            if most_reads == 0:
                n_ambigious += 1

            assert most_reads_node is not None
            edges_chosen.add("%d-%d" % (node, most_reads_node))
            node = most_reads_node

            if most_reads == 0:
                # Assert we have taken linear ref path if exists
                if any([n in self.linear_path_nodes for n in next_nodes]):
                    if node not in self.linear_path_nodes:
                        logging.error("Chose node %d as next, but it is not in linear ref." % node)
                        logging.error("Next nodes are: %s" % next_nodes)
                        for next_node in next_nodes:
                            if next_node in self.linear_path_nodes:
                                logging.error(" Node %d is in linear ref" % next_node)
                            else:
                                logging.error(" Node %d is not in linear ref" % next_node)
                        raise Exception("Could not traverse correctly")

    # Find statistics of chosen nodes
    nodes_chosen = set(path)
    n_on_linear = len(nodes_chosen.intersection(self.linear_path_nodes))
    n_not_on_linear = len(nodes_chosen) - n_on_linear

    # The interval starts at offset 0 of the first node and ends at the full
    # length of the last node in the path.
    linear_ref_interval = Interval(0, self.graph.blocks[path[-1]].length(), path, self.graph)
    IntervalCollection([linear_ref_interval]).to_file("%s_%s.intervalcollection" % (self.out_file_base_name, self.chromosome), text_file=True)

    logging.info("=== STATS FOR CHROMOSOME %s ===" % self.chromosome)
    logging.info("N ambigious choices: %d" % n_ambigious)
    logging.info("Total nodes in linear ref: %d" % len(self.linear_path_nodes))
    logging.info("N nodes chosen that are not in linear ref: %d " % n_not_on_linear)
    logging.info("N nodes chosen that are in linear ref: %d " % n_on_linear)
    logging.info("N special case: %d" % n_special_case)
    logging.info("N nodes in path: %d" % len(path))
    logging.info("Linear path length: %d" % linear_ref_interval.length())

    # Write the path sequence as a single fasta record named after the chromosome
    sequence = self.sequence_graph.get_interval_sequence(linear_ref_interval)
    out_file.writelines([">%s\n" % self.chromosome])
    out_file.writelines([sequence + "\n"])
    out_file.close()