def fill_small_wholes(self,
                      max_size,
                      write_holes_to_file=None,
                      touched_nodes=None):
        """Fill all holes of length <= max_size by setting their values to True.

        Runs HolesCleaner over this pileup, marks every reported hole interval
        in self.data, optionally writes the filled holes to a text file, and
        finally sanitizes the pileup.
        """
        areas = HolesCleaner(self, max_size, touched_nodes=touched_nodes).run()
        filled_intervals = []

        for node_id in areas.areas:
            # Restrict filling to the touched set when one was supplied
            if touched_nodes is not None and node_id not in touched_nodes:
                continue

            for start, end in zip(areas.get_starts(node_id),
                                  areas.get_ends(node_id)):
                self.data[node_id].set_interval_value(start, end, True)
                logging.debug("Filling hole %s, %d, %d" %
                              (node_id, start, end))
                # HolesCleaner should never report a hole larger than max_size
                assert end - start <= max_size
                filled_intervals.append(Interval(start, end, [node_id]))

        logging.info("Filled %d small holes (splitted into holes per node)" %
                     len(filled_intervals))

        if write_holes_to_file is not None:
            IntervalCollection(filled_intervals).to_file(
                write_holes_to_file, text_file=True)

        self.sanitize()
    def _write_unmatched_peaks(self, peaks, file_base_name, chromosome):
        """Write unmatched peaks to '<file_base_name>_chr<chromosome>.intervals'.

        Shared helper for check_similarity; the gpc/macs write+log code was
        previously duplicated.
        """
        file_name = "%s_chr%s.intervals" % (file_base_name, chromosome)
        IntervalCollection(peaks).to_file(file_name, text_file=True)
        logging.info("Wrote peaks not matching to file %s" % file_name)

    def check_similarity(self, analyse_first_n_peaks=10000000):
        """Compare peak set 1 against peak set 2 and record overlap statistics.

        Iterates self.peaks1 by descending score (at most
        analyse_first_n_peaks of them) and, for each, asks self.peaks2 for a
        touching peak not already matched. Populates the four
        peaks{1,2}_(not_)in_peaks{2,1} lists, stores their sizes on
        self.results, and writes the two unmatched sets to per-chromosome
        interval files.
        """
        print("Number of peaks in main set: %d" % len(self.peaks1.intervals))
        self.results.tot_peaks1 = len(self.peaks1.intervals)
        self.results.tot_peaks2 = len(self.peaks2.intervals)
        counter = 0
        visited = set()  # unique_ids of peaks2 entries already matched
        for peak in sorted(self.peaks1, key=lambda x: x.score,
                           reverse=True)[0:analyse_first_n_peaks]:
            assert peak.unique_id is not None
            counter += 1
            if counter % 500 == 0:
                logging.info("Checked %d peaks" % counter)
            touching = self.peaks2.approx_contains_part_of_interval(
                peak, visited)
            if touching:
                visited.add(touching[0].unique_id)
                self.peaks2_in_peaks1.append(touching[0])
                self.peaks1_in_peaks2.append(peak)
            else:
                self.peaks1_not_in_peaks2.append(peak)
        # Anything in peaks2 never matched above is unmatched by definition
        for peak in self.peaks2:
            if peak.unique_id not in visited:
                self.peaks2_not_in_peaks1.append(peak)

        self.results.peaks1_in_peaks2 = len(self.peaks1_in_peaks2)
        self.results.peaks2_in_peaks1 = len(self.peaks2_in_peaks1)

        self.results.peaks1_not_in_peaks2 = len(self.peaks1_not_in_peaks2)
        self.results.peaks2_not_in_peaks1 = len(self.peaks2_not_in_peaks1)

        chromosome = self.chromosome
        if chromosome is None:
            chromosome = "unknown"

        self._write_unmatched_peaks(self.peaks1_not_in_peaks2,
                                    "gpc_not_matching_macs", chromosome)
        self._write_unmatched_peaks(self.peaks2_not_in_peaks1,
                                    "macs_not_matching_gpc", chromosome)
# Beispiel #3 (NOTE: scrape artifact — example-separator text from the original listing; kept as a comment so the module stays parseable)
def find_linear_path_through_chromosome(chromosome, chromend, fasta_file_name,
                                        ob_graph_file_name,
                                        vg_graph_file_name):
    """Find the linear reference path for *chromosome* through the graph.

    Reads the first *chromend* bases of the chromosome from the FASTA file,
    traverses the obgraph from its single start node following that sequence,
    and writes the resulting path to '22_path.intervalcollection'.
    """
    genome = Fasta(fasta_file_name)
    # Bug fix: the original ignored the chromend parameter and hard-coded
    # the chr22 length 50818468; use the caller-supplied end instead.
    seq = str(genome[chromosome][0:chromend]).lower()

    logging.info("Creating sequence retriever")
    sequence_retriever = SequenceRetriever.from_vg_json_graph(
        vg_graph_file_name)

    graph = GraphWithReversals.from_numpy_file(ob_graph_file_name)

    start_nodes = graph.get_first_blocks()
    # Bug fix: "%d" % start_nodes formatted the collection itself and raised
    # TypeError whenever the assert actually fired; format the count instead.
    assert len(start_nodes) == 1, "Found %d start nodes" % len(start_nodes)
    start_node = start_nodes[0]

    traverser = GraphTraverserUsingSequence(graph, seq, sequence_retriever)
    traverser.search_from_node(start_node)
    path = traverser.get_interval_found()
    path = IntervalCollection(path)
    # NOTE(review): output name is still hard-coded for chr22 — TODO derive
    # from the chromosome argument once downstream consumers are updated.
    path.to_file("22_path.intervalcollection", text_file=True)
    logging.info("Done")
    def check_similarity_old(self, analyse_first_n_peaks=10000000):
        """Compare the two peak sets against each other, in both directions.

        Pass 1 (i == 1) compares peaks1 against peaks2; pass 2 the reverse.
        For each pass, peaks are visited by descending score (at most
        analyse_first_n_peaks of them), matched peaks are accumulated on the
        peaks{1,2}_in_peaks{2,1} / *_not_in_* lists, counts are stored on
        self.results, and matching/non-matching peaks are written to
        per-set interval files.
        """
        i = 1
        for peak_datasets in [(self.peaks1, self.peaks2),
                              (self.peaks2, self.peaks1)]:
            # Removed the unused n_identical / tot_n_similar counters that
            # were initialised but never read.
            n_similar = 0
            n_tot = 0
            print("\n-- Comparing set %d against set %d ---" % (i, i % 2 + 1))
            peaks1, peaks2 = peak_datasets
            print("Number of peaks in main set: %d" % len(peaks1.intervals))
            if i == 1:
                self.results.tot_peaks1 = len(peaks1.intervals)
            else:
                self.results.tot_peaks2 = len(peaks1.intervals)

            not_matching = []
            matching = []
            counter = 0
            visited = set()  # unique_ids in peaks2 already matched this pass
            for peak in sorted(peaks1, key=lambda x: x.score,
                               reverse=True)[0:analyse_first_n_peaks]:
                assert peak.unique_id is not None
                counter += 1
                if counter % 500 == 0:
                    logging.info("Checked %d peaks" % counter)
                touching = peaks2.approx_contains_part_of_interval(
                    peak, visited)
                if touching:
                    visited.add(touching[0].unique_id)
                    n_similar += 1
                    if i == 1:
                        self.peaks1_in_peaks2.append(peak)
                    else:
                        self.peaks2_in_peaks1.append(peak)
                    matching.append(peak)
                else:
                    not_matching.append(peak)
                    if i == 1:
                        self.peaks1_not_in_peaks2.append(peak)
                    else:
                        self.peaks2_not_in_peaks1.append(peak)

                n_tot += 1
            self.results.peaks1_in_peaks2 = len(self.peaks1_in_peaks2)
            self.results.peaks2_in_peaks1 = len(self.peaks2_in_peaks1)

            self.results.peaks1_not_in_peaks2 = len(self.peaks1_not_in_peaks2)
            self.results.peaks2_not_in_peaks1 = len(self.peaks2_not_in_peaks1)

            not_matching = IntervalCollection(not_matching)
            not_matching.to_file("not_matching_set%d.intervals" % i,
                                 text_file=True)
            logging.info(
                "Wrote peaks not matching to file not_matching_set%d.intervals"
                % i)
            matching = IntervalCollection(matching)
            matching.to_file("matching_set%d.intervals" % i, text_file=True)

            logging.info("Total peaks in main set: %d" % n_tot)
            logging.info("N similar to peak in other set: %d " % n_similar)
            logging.info("N not matching other set: %d " %
                         len(not_matching.intervals))

            i += 1