def fill_small_wholes(self, max_size, write_holes_to_file=None, touched_nodes=None):
    # Fill every "hole" (gap between covered areas) of length <= max_size by
    # setting its interval value to True in self.data, node by node.
    #
    # Parameters:
    #   max_size: maximum hole length (in base pairs) that will be filled;
    #             asserted per hole below.
    #   write_holes_to_file: optional file name; when given, all filled holes
    #             are written there as a text IntervalCollection.
    #   touched_nodes: optional set of node ids; when given, only holes on
    #             these nodes are filled (also forwarded to HolesCleaner,
    #             which presumably restricts its search the same way —
    #             the re-check in the loop below looks redundant but is
    #             kept as a guard; TODO confirm against HolesCleaner).
    cleaner = HolesCleaner(self, max_size, touched_nodes=touched_nodes)
    areas = cleaner.run()
    n_filled = 0
    hole_intervals = []
    for node_id in areas.areas:
        # Skip nodes outside the touched set (belt-and-braces; see note above).
        if touched_nodes is not None:
            if node_id not in touched_nodes:
                continue
        starts = areas.get_starts(node_id)
        ends = areas.get_ends(node_id)
        for start, end in zip(starts, ends):
            # Mark the hole as covered on this node's value array.
            self.data[node_id].set_interval_value(start, end, True)
            logging.debug("Filling hole %s, %d, %d" % (node_id, start, end))
            n_filled += 1
            # HolesCleaner should only return holes within max_size.
            assert end - start <= max_size
            hole_intervals.append(Interval(start, end, [node_id]))
    logging.info("Filled %d small holes (splitted into holes per node)" % n_filled)
    if write_holes_to_file is not None:
        intervals = IntervalCollection(hole_intervals)
        intervals.to_file(write_holes_to_file, text_file=True)
    # Re-normalize internal state after mutating the value arrays.
    self.sanitize()
def check_similarity(self, analyse_first_n_peaks=10000000):
    """Match peaks1 against peaks2 and record overlap statistics.

    Peaks from self.peaks1 are visited in descending score order (at most
    analyse_first_n_peaks of them). Each is matched against self.peaks2;
    matched pairs and unmatched peaks from both sets are appended to the
    corresponding self.* lists, counts are stored on self.results, and the
    unmatched peaks of each set are written to per-chromosome interval files.
    """
    n_main = len(self.peaks1.intervals)
    print("Number of peaks in main set: %d" % n_main)
    self.results.tot_peaks1 = n_main
    self.results.tot_peaks2 = len(self.peaks2.intervals)

    matched_ids = set([])  # unique_ids of peaks2 already claimed by a match
    ranked = sorted(self.peaks1, key=lambda p: p.score, reverse=True)
    for n_checked, peak in enumerate(ranked[0:analyse_first_n_peaks], 1):
        assert peak.unique_id is not None
        if n_checked % 500 == 0:
            logging.info("Checked %d peaks" % n_checked)
        # Already-matched peaks2 ids are passed so each can match only once.
        touching = self.peaks2.approx_contains_part_of_interval(
            peak, matched_ids)
        if touching:
            matched_ids.add(touching[0].unique_id)
            self.peaks2_in_peaks1.append(touching[0])
            self.peaks1_in_peaks2.append(peak)
        else:
            self.peaks1_not_in_peaks2.append(peak)

    # Any peaks2 entry never claimed above has no counterpart in peaks1.
    for peak in self.peaks2:
        if peak.unique_id not in matched_ids:
            self.peaks2_not_in_peaks1.append(peak)

    self.results.peaks1_in_peaks2 = len(self.peaks1_in_peaks2)
    self.results.peaks2_in_peaks1 = len(self.peaks2_in_peaks1)
    self.results.peaks1_not_in_peaks2 = len(self.peaks1_not_in_peaks2)
    self.results.peaks2_not_in_peaks1 = len(self.peaks2_not_in_peaks1)

    chromosome = "unknown" if self.chromosome is None else self.chromosome

    gpc_not_matching_macs = IntervalCollection(self.peaks1_not_in_peaks2)
    gpc_not_matching_macs.to_file(
        "gpc_not_matching_macs_chr%s.intervals" % chromosome, text_file=True)
    logging.info(
        "Wrote peaks not matching to file gpc_not_matching_macs_chr%s.intervals"
        % chromosome)

    macs_not_matching_gpc = IntervalCollection(self.peaks2_not_in_peaks1)
    macs_not_matching_gpc.to_file(
        "macs_not_matching_gpc_chr%s.intervals" % chromosome, text_file=True)
    logging.info(
        "Wrote peaks not matching to file macs_not_matching_gpc_chr%s.intervals"
        % chromosome)
def find_linear_path_through_chromosome(chromosome, chromend, fasta_file_name,
                                        ob_graph_file_name,
                                        vg_graph_file_name):
    """Find the linear reference path through a chromosome's graph.

    Traverses the graph from its single start node, guided by the linear
    reference sequence, and writes the resulting path to
    "<chromosome>_path.intervalcollection" as a text file.

    Parameters:
        chromosome: chromosome name as it appears in the fasta file.
        chromend: end coordinate; the reference sequence [0, chromend)
            is used to guide the traversal.
        fasta_file_name: fasta file containing the reference genome.
        ob_graph_file_name: numpy-serialized offset-based graph.
        vg_graph_file_name: vg json graph (for node sequences).
    """
    genome = Fasta(fasta_file_name)
    # Bug fix: the slice end was hard-coded to 50818468 (chr22 length),
    # silently ignoring the chromend parameter for any other chromosome.
    seq = str(genome[chromosome][0:chromend]).lower()
    logging.info("Creating sequence retriever")
    sequence_retriever = SequenceRetriever.from_vg_json_graph(
        vg_graph_file_name)
    graph = GraphWithReversals.from_numpy_file(ob_graph_file_name)
    start_nodes = graph.get_first_blocks()
    # Bug fix: "%d" % start_nodes formatted the list itself and raised
    # TypeError instead of producing the message; format the count.
    assert len(start_nodes) == 1, "Found %d start nodes" % len(start_nodes)
    start_node = start_nodes[0]
    traverser = GraphTraverserUsingSequence(graph, seq, sequence_retriever)
    traverser.search_from_node(start_node)
    path = traverser.get_interval_found()
    path = IntervalCollection(path)
    # Bug fix: output file name was hard-coded to "22_path..."; name it
    # after the chromosome actually processed.
    path.to_file("%s_path.intervalcollection" % chromosome, text_file=True)
    logging.info("Done")
def check_similarity_old(self, analyse_first_n_peaks=10000000):
    # Older, symmetric variant of check_similarity: compares peaks1 against
    # peaks2 AND peaks2 against peaks1 in two passes, filling the same
    # self.peaks*_in_* / self.peaks*_not_in_* lists and self.results counters,
    # and writing matching/not-matching interval files per pass.
    #
    # i == 1: main set is peaks1 (results go to peaks1_* lists);
    # i == 2: main set is peaks2 (results go to peaks2_* lists).
    i = 1
    for peak_datasets in [(self.peaks1, self.peaks2),
                          (self.peaks2, self.peaks1)]:
        n_identical = 0     # NOTE(review): never incremented — appears vestigial
        tot_n_similar = 0   # NOTE(review): never incremented — appears vestigial
        n_similar = 0
        n_tot = 0
        print("\n-- Comparing set %d against set %d ---" % (i, i % 2 + 1))
        peaks1, peaks2 = peak_datasets
        print("Number of peaks in main set: %d" % len(peaks1.intervals))
        if i == 1:
            self.results.tot_peaks1 = len(peaks1.intervals)
        else:
            self.results.tot_peaks2 = len(peaks1.intervals)
        not_matching = []
        matching = []
        counter = 0
        # unique_ids in the other set already claimed by a match this pass
        visited = set([])
        # Visit highest-scoring peaks first, capped at analyse_first_n_peaks.
        for peak in sorted(peaks1, key=lambda x: x.score,
                           reverse=True)[0:analyse_first_n_peaks]:
            assert peak.unique_id is not None
            counter += 1
            if counter % 500 == 0:
                logging.info("Checked %d peaks" % counter)
            # visited is passed so each peak in the other set matches once.
            touching = peaks2.approx_contains_part_of_interval(
                peak, visited)
            if touching:
                visited.add(touching[0].unique_id)
                n_similar += 1
                if i == 1:
                    self.peaks1_in_peaks2.append(peak)
                else:
                    self.peaks2_in_peaks1.append(peak)
                matching.append(peak)
            else:
                not_matching.append(peak)
                if i == 1:
                    self.peaks1_not_in_peaks2.append(peak)
                else:
                    self.peaks2_not_in_peaks1.append(peak)
            n_tot += 1
        # Refresh all four counters each pass (cheap, and correct after pass 2).
        self.results.peaks1_in_peaks2 = len(self.peaks1_in_peaks2)
        self.results.peaks2_in_peaks1 = len(self.peaks2_in_peaks1)
        self.results.peaks1_not_in_peaks2 = len(self.peaks1_not_in_peaks2)
        self.results.peaks2_not_in_peaks1 = len(self.peaks2_not_in_peaks1)
        not_matching = IntervalCollection(not_matching)
        not_matching.to_file("not_matching_set%d.intervals" % i,
                             text_file=True)
        logging.info(
            "Wrote peaks not matching to file not_matching_set%d.intervals"
            % i)
        matching = IntervalCollection(matching)
        matching.to_file("matching_set%d.intervals" % i, text_file=True)
        logging.info("Total peaks in main set: %d" % n_tot)
        logging.info("N similar to peak in other set: %d " % n_similar)
        logging.info("N not matching other set: %d "
                     % len(not_matching.intervals))
        i += 1