def __getitem__(self, idx):
    """Return the example stored at position ``idx``.

    The FASTA extractor is created on first access and then reused. The
    interval read from ``self.bed`` is optionally resized around its
    center to ``self.auto_resize_len`` before the sequence is extracted.

    Returns:
        dict with keys ``"inputs"`` (``np.array`` of the extracted
        sequence), ``"targets"`` (the labels read from ``self.bed``) and
        ``"metadata"`` (a dict holding the ``GenomicRanges`` of the
        interval that was actually extracted, with ``str(idx)`` as id).
    """
    extractor = self.fasta_extractors
    if extractor is None:
        # Strand handling is deliberately switched off here
        # (use_strand is hard-coded to False rather than self.use_strand).
        extractor = FastaStringExtractor(self.fasta_file,
                                         use_strand=False,
                                         force_upper=self.force_upper)
        self.fasta_extractors = extractor

    region, labels = self.bed[idx]

    resize_to = self.auto_resize_len
    if resize_to:
        # Bring the interval to a fixed width, anchored at its center.
        region = resize_interval(region, resize_to, anchor='center')

    # NOTE: an upper bound on the interval length (max_seq_len) is not
    # enforced here; the original check was left disabled pending a
    # question about why it is needed.

    # Run the fasta extractor on the (possibly resized) interval.
    sequence = extractor.extract(region)

    ranges = GenomicRanges(region.chrom, region.start, region.stop, str(idx))
    return {
        "inputs": np.array(sequence),
        "targets": labels,
        "metadata": {"ranges": ranges},
    }
def test_resize_interval(anchor, ilen):
    """Check that ``resize_interval`` yields an interval of width ``ilen``.

    Verifies that the input interval is not modified, that chromosome and
    name are carried over to the result, that the result has the requested
    length, and that the requested anchor point stays where it was.
    """
    import pybedtools

    orig_start, orig_end = 10, 20
    orig_center = int((orig_start + orig_end) / 2)
    source = pybedtools.create_interval_from_list(
        ['chr2', orig_start, orig_end, 'intname'])

    resized = resize_interval(source, ilen, anchor)

    # The original interval was left intact.
    assert source.chrom == 'chr2'
    assert source.start == orig_start
    assert source.end == orig_end
    assert source.name == 'intname'

    # Metadata is kept on the resized interval.
    assert resized.chrom == source.chrom
    assert resized.name == 'intname'

    # Desired output width.
    assert resized.length == ilen

    # The anchor point did not move.
    if anchor == "start":
        assert resized.start == orig_start
    elif anchor == "end":
        assert resized.end == orig_end
    elif anchor == "center":
        resized_center = int((resized.start + resized.end) / 2)
        assert resized_center == orig_center
def extract(self, interval):
    """Extract the coverage of ``interval`` from every file in ``self.bbi_files``.

    Without a sampling mode the interval is first resized to
    ``self.window`` around its center. With a sampling mode the full
    interval is read and then reduced, either by keeping every
    ``len(interval) // window``-th position (``'downsampling'``) or via
    ``self._calculate_rolling_mean`` (``'mean'``).

    Args:
        interval: object exposing ``chrom``, ``start`` and ``stop``.

    Returns:
        np.array. When ``self.nb_annotation_type`` is set, the result is
        reshaped to (window, number of files per annotation, number of
        annotation); otherwise the transposed stack of per-file arrays
        is returned as is.

    Raises:
        AssertionError: if the window is larger than the interval (no
            sampling mode) or does not divide the interval length
            (sampling mode).
        NameError: if ``self.sampling_mode`` is set to an unknown value.
    """
    if not self.sampling_mode:
        assert self.window <= abs(interval.stop - interval.start),\
            """The target window must be smaller than the input length"""
        interval = resize_interval(interval, self.window, anchor='center')

    seq = []
    for bbi_file in self.bbi_files:
        bw = pyBigWig.open(bbi_file)
        try:
            array = bw.values(interval.chrom,
                              interval.start,
                              interval.stop,
                              numpy=True)
        finally:
            # Release the file handle: it used to be left open, leaking
            # one handle per file on every call.
            bw.close()

        # Positions without coverage come back as NaN; treat them as 0.
        array[np.isnan(array)] = 0
        seq.append(self.norm_dico[bbi_file](array))

    # One row per file -> one column per file.
    seq = np.array(seq).T

    if self.sampling_mode:
        assert abs(interval.stop - interval.start) % self.window == 0,\
            """Window must divide the input length to use downsampling"""
        sampling_length = abs(interval.stop - interval.start) // self.window

        if self.sampling_mode == 'downsampling':
            seq = seq[::sampling_length]
        elif self.sampling_mode == 'mean':
            seq = self._calculate_rolling_mean(seq)
        else:
            raise NameError('sampling_mode must be None, "mean" or "downsampling"')

    if self.nb_annotation_type:
        nb_files_per_ann = len(self.bbi_files) // self.nb_annotation_type
        return seq.reshape((self.window, nb_files_per_ann, self.nb_annotation_type))
    else:
        return seq
def __getitem__(self, idx):
    """Return the model input for the interval at position ``idx``.

    The interval read from ``self.bt`` is resized to 500bp around its
    center, its sequence is extracted from ``self.fasta_file`` and passed
    through ``self.transform``.

    Returns:
        dict with ``"inputs"`` (a one-element list holding the transformed
        sequence) and ``"metadata"`` (a one-element list holding the
        ``GenomicRanges`` of the resized interval).
    """
    # Create the extractor lazily and reuse it afterwards; previously a
    # new extractor was constructed for every single item.
    if getattr(self, "fasta_extractor", None) is None:
        self.fasta_extractor = FastaStringExtractor(self.fasta_file)

    # get the intervals (the targets stored next to the interval are not
    # part of the returned example)
    interval, _targets = self.bt[idx]

    # resize to 500bp
    interval = resize_interval(interval, 500, anchor='center')

    # extract the sequence
    seq = self.fasta_extractor.extract(interval)

    # transform the sequence (presumably one-hot encoding, judging by the
    # original variable name; verify against self.transform)
    seq_onehot = self.transform(seq)

    ranges = GenomicRanges.from_interval(interval)

    return {"inputs": [seq_onehot], "metadata": [ranges]}
def __call__(self, interval):
    """Resize ``interval`` to ``self.width`` using ``self.anchor``.

    Delegates to ``F.resize_interval`` and returns its result unchanged.
    """
    resized = F.resize_interval(interval, self.width, self.anchor)
    return resized