def find_loop_anchor_points(self, bedgraph: BedGraph):
    """
    Finds the peak of every loop anchor and weighs each loop by it.

    For every start and end anchor interval the bedgraph is queried for
    the index of its maximum (stored in ``self.start_list`` and
    ``self.end_list``) and for the maximum value itself (stored in
    ``self.start_list_peaks`` and ``self.end_list_peaks``).

    The start peaks and the end peaks are each divided by their own sum,
    and every entry of ``self.value_list`` is multiplied by the sum of its
    normalized start and end peak. The largest weighted value is stored
    in ``self.max_loop_value``.

    Parameters
    ----------
    bedgraph : BedGraph
        Used to find the anchor points of each loop
    """

    log.info(f'Finding anchor points for {self.sample_name}\'s {self.name}'
             f' from {bedgraph.name}')

    start_anchor_begin = self.start_anchor_list[0]
    start_anchor_end = self.start_anchor_list[1]
    end_anchor_begin = self.end_anchor_list[0]
    end_anchor_end = self.end_anchor_list[1]

    bedgraph.load_chrom_data(self.name)
    try:
        # Get index of peaks in every anchor interval
        self.start_list = bedgraph.stats(start_list=start_anchor_begin,
                                         end_list=start_anchor_end,
                                         chrom_name=self.name,
                                         stat='max_index')
        self.end_list = bedgraph.stats(start_list=end_anchor_begin,
                                       end_list=end_anchor_end,
                                       chrom_name=self.name,
                                       stat='max_index')

        # Get peak value for every anchor interval
        self.start_list_peaks = bedgraph.stats(start_list=start_anchor_begin,
                                               end_list=start_anchor_end,
                                               chrom_name=self.name,
                                               stat='max')
        self.end_list_peaks = bedgraph.stats(start_list=end_anchor_begin,
                                             end_list=end_anchor_end,
                                             chrom_name=self.name,
                                             stat='max')
    finally:
        # Release the chromosome data even when a stats() call raises
        bedgraph.free_chrom_data(self.name)

    # The raw peaks stay on self; only the local copies are normalized.
    # NOTE(review): if every peak in a list is 0 its sum is 0 and this
    # division yields NaN weights - confirm callers never reach that case.
    start_weights = self.start_list_peaks / self.start_list_peaks.sum()
    end_weights = self.end_list_peaks / self.end_list_peaks.sum()

    # Weigh each loop based on its corresponding bedgraph peaks
    for i in range(self.numb_loops):
        self.value_list[i] *= start_weights[i] + end_weights[i]

    self.max_loop_value = np.max(self.value_list)

    # Should be very small due to peaks being weighted earlier
    log.debug(f"Max loop weighted value: {self.max_loop_value}")
def __init__(self, chrom_size_file: str, loop_file: str, bedgraph: BedGraph,
             peak_dict: Dict[str, list], chroms_to_load: List[str] = None,
             min_loop_value: int = 0):
    """
    Initializes all chromosomes and adds loops to them from given file.

    Finds peak max and mean from bedgraph. Blank lines in either input
    file are skipped.

    Parameters
    ----------
    chrom_size_file : str
        File containing the base pair size of each chromosome to use
    loop_file : str
        File containing loops in format:
        chrom1  start1   end1 chrom2  start2   end2 pet_count
    bedgraph : BedGraph
        The bedgraph file for this sample (from pyBedGraph)
    peak_dict : dict[str, list]
        Key: Name of chromosome (chr1, chr2, ...)
        Value: List of peaks in chromosome
        Peak format: [start, end, length]
        Every peak of a chromosome present in the bedgraph is modified in
        place: its bedgraph max and mean are appended to it.
    chroms_to_load : list, optional
        List of names of chromosome to load (default is None)
    min_loop_value : int, optional
        Minimum loop value (PET count) to include (default is 0)
    """

    # locals() is deliberately not logged here: peak_dict is too large for
    # the output to be meaningful.

    self.species_name = os.path.basename(chrom_size_file).split('.')[0]
    self.sample_name = os.path.basename(loop_file).split('.')[0]

    self.total_samples = 0

    self.peak_dict = {}

    # Find values for each peak since peak caller is not accurate sometimes
    for chrom_name, peak_chrom in peak_dict.items():
        if not bedgraph.has_chrom(chrom_name):
            log.warning(f'{bedgraph.name} does not have {chrom_name}')
            continue

        start_list = [x[0] for x in peak_chrom]
        end_list = [x[1] for x in peak_chrom]

        bedgraph.load_chrom_data(chrom_name)
        try:
            max_list = bedgraph.stats(start_list=start_list,
                                      end_list=end_list,
                                      chrom_name=chrom_name, stat='max')
            mean_list = bedgraph.stats(start_list=start_list,
                                       end_list=end_list,
                                       chrom_name=chrom_name, stat='mean')
        finally:
            # Release the chromosome data even when a stats() call raises
            bedgraph.free_chrom_data(chrom_name)

        # Extends the caller's peak entries in place
        for i in range(max_list.size):
            peak_chrom[i].append(max_list[i])
            peak_chrom[i].append(mean_list[i])

        self.peak_dict[chrom_name] = peak_chrom

    # Initialize all chromosomes to be loaded
    self.chrom_dict = {}
    with open(chrom_size_file) as in_file:
        for line in in_file:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file)
                continue

            chrom_name = fields[0]
            if chroms_to_load and chrom_name not in chroms_to_load:
                continue

            if chrom_name in CHROMS_TO_IGNORE:
                continue

            if chrom_name not in peak_dict:
                continue

            chrom_size = int(fields[1])

            self.chrom_dict[chrom_name] = \
                ChromLoopData(chrom_name, chrom_size, self.sample_name)

    # Widths of every head and tail anchor that was kept, for the debug log
    loop_anchor_list = []
    with open(loop_file) as in_file:
        for line in in_file:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file)
                continue

            chrom_name = fields[0]
            if chrom_name not in self.chrom_dict:
                continue

            loop_value = int(fields[6])
            if loop_value < min_loop_value:
                continue

            # head interval
            loop_start1 = int(fields[1])
            loop_end1 = int(fields[2])

            # tail anchor
            loop_start2 = int(fields[4])
            loop_end2 = int(fields[5])

            self.chrom_dict[chrom_name].add_loop(loop_start1, loop_end1,
                                                 loop_start2, loop_end2,
                                                 loop_value)

            loop_anchor_list.append(loop_end1 - loop_start1)
            loop_anchor_list.append(loop_end2 - loop_start2)

    # np.mean([]) also yields nan but emits a RuntimeWarning; keep the same
    # log text without the warning when no loop was kept.
    if loop_anchor_list:
        anchor_mean_width = np.mean(loop_anchor_list)
    else:
        anchor_mean_width = float('nan')
    log.debug(f'Anchor mean width: {anchor_mean_width}')

    # Get rid of chroms that had problems initializing
    to_remove = []
    for chrom_name, chrom_data in self.chrom_dict.items():
        if chrom_data.finish_init(bedgraph):
            self.total_samples += np.sum(chrom_data.value_list)
        else:
            to_remove.append(chrom_name)

    # Chromosomes with no loops or other random problems
    for chrom_name in to_remove:
        del self.chrom_dict[chrom_name]