Esempio n. 1
0
    def __getitem__(self, idx):
        """Return the sequence, labels and genomic range for entry ``idx``.

        The FASTA extractor is created on the first call and stored on the
        instance so later calls reuse it.

        Args:
            idx: index into ``self.bed``.

        Returns:
            dict with the keys ``"inputs"`` (the extracted sequence wrapped
            in ``np.array``), ``"targets"`` (the labels stored alongside the
            interval) and ``"metadata"`` (a dict whose ``"ranges"`` entry is
            the ``GenomicRanges`` of the possibly resized interval, with
            ``str(idx)`` as its last argument).
        """
        # Build the extractor lazily; strand handling is hard-wired to False
        # here rather than taken from self.use_strand.
        if self.fasta_extractors is None:
            self.fasta_extractors = FastaStringExtractor(
                self.fasta_file,
                use_strand=False,
                force_upper=self.force_upper)

        region, targets = self.bed[idx]

        # Optionally force a fixed width, keeping the interval's center.
        target_len = self.auto_resize_len
        if target_len:
            region = resize_interval(region, target_len, anchor='center')

        # Open question (@kromme): is a max_seq_len check needed? The check
        # `interval.stop - interval.start <= self.max_seq_len` is disabled.

        sequence = self.fasta_extractors.extract(region)

        inputs = np.array(sequence)
        ranges = GenomicRanges(region.chrom, region.start, region.stop,
                               str(idx))
        return {
            "inputs": inputs,
            "targets": targets,
            "metadata": {"ranges": ranges},
        }
Esempio n. 2
0
def test_resize_interval(anchor, ilen):
    """Check that ``resize_interval`` returns a new interval of width ``ilen``.

    The source interval must be left untouched, the chromosome and name must
    carry over to the result, and the point named by ``anchor`` ("start",
    "end" or "center") must stay where it was.
    """
    from pybedtools import create_interval_from_list

    orig_start, orig_end = 10, 20
    orig_mid = int((orig_start + orig_end) / 2)

    source = create_interval_from_list(
        ['chr2', orig_start, orig_end, 'intname'])
    resized = resize_interval(source, ilen, anchor)

    # The input interval must not have been modified.
    assert source.chrom == 'chr2'
    assert source.start == orig_start
    assert source.end == orig_end
    assert source.name == 'intname'

    # Chromosome and name are carried over to the result.
    assert resized.chrom == source.chrom
    assert resized.name == 'intname'

    # The result has the requested width.
    assert resized.length == ilen

    # The anchored coordinate did not move.
    if anchor == "start":
        assert resized.start == orig_start
    elif anchor == "end":
        assert resized.end == orig_end
    elif anchor == "center":
        resized_mid = int((resized.start + resized.end) / 2)
        assert resized_mid == orig_mid
Esempio n. 3
0
    def extract(self, interval):
        """
        Extract the coverage corresponding to the interval from the bbi_files.

        Without a sampling mode the interval is first resized to
        ``self.window`` around its center and the values are read as is.
        With a sampling mode the full interval is read and then reduced:
        ``'downsampling'`` keeps every ``len(interval) // window``-th position,
        ``'mean'`` delegates to ``self._calculate_rolling_mean``.

        Every bigWig file is closed again as soon as its values are read, so
        repeated calls do not accumulate open file handles.

        args:
            interval: object exposing ``chrom``, ``start`` and ``stop``.

        returns:
            np.array of shape (window,
                               number of files per annotation,
                               number of annotation)
            when ``self.nb_annotation_type`` is set. Otherwise the array is
            returned without reshaping (one column per bbi file, assuming
            every normaliser returns an array as long as its input).

        raises:
            AssertionError: if no sampling mode is set and the window is
                longer than the interval, or if a sampling mode is set and
                the window does not divide the interval length.
            NameError: if ``self.sampling_mode`` is set to something other
                than "mean" or "downsampling".
        """
        if not self.sampling_mode:
            assert self.window <= abs(interval.stop - interval.start), \
                "The target window must be smaller than the input length"

            interval = resize_interval(interval,
                                       self.window,
                                       anchor='center')

        seq = list()
        for bbi_file in self.bbi_files:
            bw = pyBigWig.open(bbi_file)
            try:
                array = bw.values(interval.chrom,
                                  interval.start,
                                  interval.stop, numpy=True)
            finally:
                # Always release the file handle, even if the read fails.
                bw.close()
            # Positions without coverage come back as NaN; treat them as 0.
            array[np.isnan(array)] = 0
            seq.append(self.norm_dico[bbi_file](array))
        # Stack per-file tracks, then put positions on the first axis.
        seq = np.array(seq).T

        if self.sampling_mode:
            length = abs(interval.stop - interval.start)
            assert length % self.window == 0, \
                "Window must divide the input length to use downsampling"
            sampling_length = length // self.window

            if self.sampling_mode == 'downsampling':
                seq = seq[::sampling_length]
            elif self.sampling_mode == 'mean':
                seq = self._calculate_rolling_mean(seq)
            else:
                raise NameError('sampling_mode must be None, "mean" or "downsampling"')

        if self.nb_annotation_type:
            nb_files_per_ann = len(self.bbi_files) // self.nb_annotation_type
            return seq.reshape((self.window,
                                nb_files_per_ann,
                                self.nb_annotation_type))

        return seq
Esempio n. 4
0
    def __getitem__(self, idx):
        """Return the transformed, fixed-width sequence for entry ``idx``.

        Args:
            idx: index into ``self.bt``; each entry unpacks into an interval
                and its targets (the targets are not used here).

        Returns:
            dict with ``"inputs"``, a one-element list holding the output of
            ``self.transform`` (a one-hot encoding, per the original
            comment), and ``"metadata"``, a one-element list holding the
            ``GenomicRanges`` built from the resized interval.
        """
        # NOTE: a new extractor is created and stored on every call.
        self.fasta_extractor = FastaStringExtractor(self.fasta_file)

        # Look up the interval; the accompanying targets are discarded.
        region, _ = self.bt[idx]

        # Fix the width to 500 around the interval's center.
        region = resize_interval(region, 500, anchor='center')

        # Pull the sequence and run it through the configured transform.
        encoded = self.transform(self.fasta_extractor.extract(region))

        return {
            "inputs": [encoded],
            "metadata": [GenomicRanges.from_interval(region)],
        }
Esempio n. 5
0
 def __call__(self, interval):
     """Resize ``interval`` to ``self.width`` using ``self.anchor``.

     Delegates to ``F.resize_interval`` and returns its result unchanged.
     """
     resized = F.resize_interval(interval, self.width, self.anchor)
     return resized