コード例 #1
0
    def find_loop_anchor_points(self, bedgraph: BedGraph):
        """
        Finds the exact loop anchor points and weighs every loop.

        For every start/end anchor interval the bedgraph is queried for the
        index of its maximum ('max_index') and for the maximum itself
        ('max'). The maxima of each side are then normalised by their sum
        and each loop value is multiplied by the sum of its two normalised
        peaks.

        Sets `start_list`, `end_list`, `start_list_peaks` and
        `end_list_peaks` (the un-normalised maxima) and `max_loop_value`;
        scales the first `numb_loops` entries of `value_list` in place.

        Parameters
        ----------
        bedgraph : BedGraph
            Used to find the anchor points of each loop
        """

        log.info(f"Finding anchor points for {self.sample_name}'s {self.name}"
                 f" from {bedgraph.name}")

        def anchor_stat(anchors, stat):
            # anchors is a (starts, ends) pair describing the intervals
            return bedgraph.stats(start_list=anchors[0],
                                  end_list=anchors[1],
                                  chrom_name=self.name,
                                  stat=stat)

        bedgraph.load_chrom_data(self.name)
        try:
            # Get index of peaks in every anchor interval
            self.start_list = anchor_stat(self.start_anchor_list, 'max_index')
            self.end_list = anchor_stat(self.end_anchor_list, 'max_index')

            # Get peak value for every anchor interval
            start_list_peaks = anchor_stat(self.start_anchor_list, 'max')
            end_list_peaks = anchor_stat(self.end_anchor_list, 'max')
        finally:
            # Release the chromosome data even if a stats call failed
            bedgraph.free_chrom_data(self.name)

        # Keep the raw peaks; only the local copies below are normalised
        self.start_list_peaks = start_list_peaks
        self.end_list_peaks = end_list_peaks

        weights = []
        for side, peaks in (('start', start_list_peaks),
                            ('end', end_list_peaks)):
            total = peaks.sum()
            if peaks.size and total == 0:
                # 0 / 0 would turn every weight (and loop value) into NaN
                log.warning(f'All {side} anchor peaks are 0 for '
                            f'{self.sample_name} {self.name}; '
                            f'using 0 as their weight')
                weights.append(np.zeros(peaks.shape))
            else:
                weights.append(peaks / total)
        start_weights, end_weights = weights

        # Weigh each loop based on its corresponding bedgraph peaks
        for i in range(self.numb_loops):
            self.value_list[i] *= start_weights[i] + end_weights[i]

        self.max_loop_value = np.max(self.value_list)

        # Should be very small due to peaks being weighted earlier
        log.debug(f"Max loop weighted value: {self.max_loop_value}")
コード例 #2
0
    def __init__(self,
                 chrom_size_file: str,
                 loop_file: str,
                 bedgraph: BedGraph,
                 peak_dict: Dict[str, list],
                 chroms_to_load: List[str] = None,
                 min_loop_value: int = 0):
        """
        Initializes all chromosomes and adds loops to them from given file.

        The max and mean bedgraph value of every peak are looked up and
        appended to that peak's list. Chromosomes are then created from
        `chrom_size_file`, filled with the loops from `loop_file` and
        finalized; chromosomes whose `finish_init` returns a falsy value are
        dropped.

        Parameters
        ----------
        chrom_size_file : str
            File containing the base pair size of each chromosome to use.
            The part of its base name before the first '.' becomes
            `species_name`.
        loop_file : str
            File containing loops in format:
            chrom1  start1   end1 chrom2  start2   end2 pet_count
            The part of its base name before the first '.' becomes
            `sample_name`.
        bedgraph : BedGraph
            The bedgraph file for this sample (from pyBedGraph)
        peak_dict : dict[str, list]
            Key: Name of chromosome (chr1, chr2, ...)
            Value: List of peaks in chromosome
            Peak format: [start, end, length]
            NOTE: the peak lists are modified in place; the max and mean
            bedgraph value of each peak are appended to it.
        chroms_to_load : list, optional
             List of names of chromosome to load (default is None)
        min_loop_value : int, optional
            Minimum loop value (PET count) to include (default is 0)
        """

        # Prints peak_dict which is too large to be meaningful
        # log.debug(locals())

        self.species_name = os.path.basename(chrom_size_file).split('.')[0]
        self.sample_name = os.path.basename(loop_file).split('.')[0]

        self.total_samples = 0

        self.peak_dict = {}

        # Find values for each peak since peak caller is not accurate sometimes
        for chrom_name, peak_chrom in peak_dict.items():
            if not bedgraph.has_chrom(chrom_name):
                log.warning(f'{bedgraph.name} does not have {chrom_name}')
                continue

            start_list = [peak[0] for peak in peak_chrom]
            end_list = [peak[1] for peak in peak_chrom]

            bedgraph.load_chrom_data(chrom_name)
            try:
                max_list = \
                    bedgraph.stats(start_list=start_list, end_list=end_list,
                                   chrom_name=chrom_name, stat='max')
                mean_list = \
                    bedgraph.stats(start_list=start_list, end_list=end_list,
                                   chrom_name=chrom_name, stat='mean')
            finally:
                # Release the chromosome data even if a stats call failed
                bedgraph.free_chrom_data(chrom_name)

            for peak, peak_max, peak_mean in zip(peak_chrom, max_list,
                                                 mean_list):
                peak.extend((peak_max, peak_mean))

            self.peak_dict[chrom_name] = peak_chrom

        # Initialize all chromosomes to be loaded
        self.chrom_dict = {}
        with open(chrom_size_file) as in_file:
            for line in in_file:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline at end of file)
                    continue

                chrom_name = fields[0]
                if chroms_to_load and chrom_name not in chroms_to_load:
                    continue

                if chrom_name in CHROMS_TO_IGNORE:
                    continue

                # Only chromosomes whose peaks were actually processed above;
                # checking the raw peak_dict argument would also let through
                # chromosomes the bedgraph does not have.
                if chrom_name not in self.peak_dict:
                    continue

                chrom_size = int(fields[1])

                self.chrom_dict[chrom_name] = \
                    ChromLoopData(chrom_name, chrom_size, self.sample_name)

        anchor_widths = []
        with open(loop_file) as in_file:
            for line in in_file:
                fields = line.split()
                if not fields:
                    continue

                chrom_name = fields[0]
                if chrom_name not in self.chrom_dict:
                    continue

                loop_value = int(fields[6])
                if loop_value < min_loop_value:
                    continue

                # head anchor
                loop_start1 = int(fields[1])
                loop_end1 = int(fields[2])

                # tail anchor
                loop_start2 = int(fields[4])
                loop_end2 = int(fields[5])

                self.chrom_dict[chrom_name].add_loop(loop_start1, loop_end1,
                                                     loop_start2, loop_end2,
                                                     loop_value)

                anchor_widths.append(loop_end1 - loop_start1)
                anchor_widths.append(loop_end2 - loop_start2)

        # np.mean of an empty list warns and yields nan, so only log when
        # at least one loop was added
        if anchor_widths:
            log.debug(f'Anchor mean width: {np.mean(anchor_widths)}')

        # Get rid of chroms that had problems initializing
        to_remove = []
        for chrom_name, chrom_data in self.chrom_dict.items():
            if chrom_data.finish_init(bedgraph):
                self.total_samples += np.sum(chrom_data.value_list)
            else:
                to_remove.append(chrom_name)

        # Chromosomes with no loops or other random problems
        for chrom_name in to_remove:
            del self.chrom_dict[chrom_name]