コード例 #1
0
ファイル: genomicarray.py プロジェクト: Annalaura94/janggu
    def __setitem__(self, index, value):
        """Store the positive entries of ``value`` for one interval/condition.

        Parameters
        ----------
        index : tuple
            ``(GenomicInterval, int)`` pair: the target interval and the
            condition index.
        value : scalar or sequence
            Either a single value applied to every bin of the interval, or
            an indexable object (anything exposing ``__len__``) that holds
            one entry per bin.

        Raises
        ------
        IndexError
            If ``index`` is not a ``(GenomicInterval, int)`` pair.

        Only entries strictly greater than zero are written; everything else
        is left untouched in the underlying storage.
        """
        interval = index[0]
        condition = index[1]
        if not (isinstance(interval, GenomicInterval)
                and isinstance(condition, int)):
            raise IndexError("Index must be a GenomicInterval and a condition index")

        chrom = interval.chrom
        # Convert base-pair coordinates into bin coordinates.
        first_bin = interval.start // self.resolution
        last_bin = int(numpy.ceil(interval.end / self.resolution))
        strand = interval.strand
        # The second block of columns is used for the minus strand when
        # the array is stranded.
        sind = 1 if self.stranded and strand == '-' else 0

        for offset, bin_pos in enumerate(range(first_bin, last_bin)):
            # A sized object is indexed per bin, a scalar is broadcast.
            val = value[offset] if hasattr(value, '__len__') else value

            if not val > 0:
                continue

            if self._full_genome_stored:
                # Whole chromosomes are stored: address by absolute bin.
                self.handle[chrom][bin_pos,
                                   sind * len(self.condition)
                                   + condition] = val
            else:
                # Only selected regions are stored: address by the region
                # key and the offset within the region.
                region_key = _iv_to_str(chrom, interval.start, interval.end)
                self.handle[region_key][offset,
                                        sind * len(self.condition)
                                        + condition] = val
コード例 #2
0
 def _setitem(self, interval, condition, length, value):
     """Write the positive entries of ``value`` into the storage.

     Parameters
     ----------
     interval
         Object exposing ``chrom``, ``start`` and ``end``.
     condition : int or other
         When an ``int``, it is used as the condition column for every
         entry; otherwise the third index of each positive entry is used.
     length
         Not used by this implementation.
     value : numpy array
         Entries that are ``> 0`` are written, all others are skipped.
         Presumably laid out as (position, strand, condition) -- TODO
         confirm against the callers.
     """
     if self._full_genome_stored:
         ref_start, ref_end, array_start, _ = self._get_indices(
             interval, value.shape[0])
         positive = np.where(value > 0)
         ref_positions = np.arange(ref_start, ref_end)
         # NOTE(review): positions come from the unclipped ``value`` while
         # the value is read at ``pos[0] + array_start``; confirm that this
         # offset handling is intended.
         for pos in zip(*positive):
             if isinstance(condition, int):
                 cond = condition
             else:
                 cond = pos[2]
             self.handle[interval.chrom][ref_positions[pos[0]],
                                         pos[1] * len(self.condition) +
                                         cond] = value[pos[0] +
                                                       array_start][pos[1:]]
         return

     # Region-wise storage: every region is one row of the 'data' matrix
     # and (position, strand, condition) is flattened into the column.
     region_key = _iv_to_str(interval.chrom, interval.start, interval.end)
     regidx = self.region2index[region_key]
     nconditions = len(self.condition)
     ncondstrand = nconditions * value.shape[-1]
     for pos in zip(*np.where(value > 0)):
         position_offset = pos[0] * ncondstrand
         strand_offset = pos[1] * nconditions
         if isinstance(condition, int):
             cond = condition
         else:
             cond = pos[2]
         self.handle['data'][regidx,
                             position_offset + strand_offset + cond] = \
             value[pos]
コード例 #3
0
    def __getitem__(self, index):
        """Return the data stored for a genomic interval.

        Only ``chrom``, ``start`` and ``end`` of the interval are used.

        Parameters
        ----------
        index : Interval
            Interval to fetch.

        Returns
        -------
        Array produced by ``self._reshape`` (or freshly allocated padding)
        with target shape ``(length, 2 if stranded else 1, n_conditions)``.

        Raises
        ------
        IndexError
            If ``index`` is not an ``Interval``.
        """
        if not isinstance(index, Interval):
            raise IndexError("Cannot interpret interval: {}".format(index))

        chrom = index.chrom
        start = self.get_iv_start(index.start)
        end = self.get_iv_end(index.end)

        # Length of the requested window after accounting for the order.
        length = end - start - self.order + 1
        nstrands = 2 if self.stranded else 1
        nconds = len(self.condition)
        window_shape = (length, nstrands, nconds)

        if not self._full_genome_stored:
            # Region-wise storage: no correction for overshooting starts
            # and ends is necessary, the row is returned as stored.
            row = self.region2index[_iv_to_str(chrom, index.start,
                                               index.end)]
            return self._reshape(self.handle['data'][row], window_shape)

        if chrom not in self.handle:
            # Unknown chromosome: answer with pure padding.
            return np.ones(window_shape,
                           dtype=self.typecode) * self.padding_value

        if start >= 0 and end <= self.handle[chrom].shape[0]:
            # Fast path: the window lies fully inside the chromosome, so
            # no padding is required.
            end = end - self.order + 1
            return self._reshape(self.handle[chrom][start:end],
                                 (end - start, nstrands, nconds))

        # The window reaches beyond the chromosome: pre-fill with the
        # padding value and copy the overlapping part into it.
        if self.padding_value == 0.0:
            data = np.zeros(window_shape, dtype=self.typecode)
        else:
            data = np.ones(window_shape,
                           dtype=self.typecode) * self.padding_value

        ref_start, ref_end, array_start, array_end = self._get_indices(
            index, data.shape[0])

        data[array_start:array_end, :, :] = self._reshape(
            self.handle[chrom][ref_start:ref_end],
            (ref_end - ref_start, nstrands, nconds))
        return data
コード例 #4
0
    def __getitem__(self, index):
        """Return the data stored for a genomic interval, zero-padded.

        Only ``chrom``, ``start`` and ``end`` of the interval are used.

        Parameters
        ----------
        index : GenomicInterval
            Interval to fetch.

        Returns
        -------
        Array with target shape
        ``(length, 2 if stranded else 1, n_conditions)``. Positions that
        fall outside the stored chromosome are filled with zeros.

        Raises
        ------
        IndexError
            If ``index`` is not a ``GenomicInterval``.
        """
        if not isinstance(index, GenomicInterval):
            raise IndexError("Index must be a GenomicInterval")

        chrom = index.chrom
        start = self.get_iv_start(index.start)
        end = self.get_iv_end(index.end)

        # Length of the requested window before any clipping.
        length = end - start
        nstrands = 2 if self.stranded else 1
        nconds = len(self.condition)

        if not self._full_genome_stored:
            # Region-wise storage: no correction for overshooting starts
            # and ends is necessary.
            region_key = _iv_to_str(chrom, index.start, index.end)
            return self._reshape(self.handle[region_key][:length],
                                 (length, nstrands, nconds))

        if start >= 0 and end <= self.handle[chrom].shape[0]:
            # Fast path: window fully inside the chromosome.
            return self._reshape(self.handle[chrom][start:end],
                                 (end - start, nstrands, nconds))

        # The window reaches out of the chromosome: allocate zeros and copy
        # the overlapping part to the matching offset.
        chrom_data = self.handle[chrom]
        chrom_len = chrom_data.shape[0]
        data = np.zeros((length, nstrands, nconds), dtype=chrom_data.dtype)

        # Number of positions missing on the left / right of the window.
        dstart = max(-start, 0)
        dend = length - max(end - chrom_len, 0)
        start = max(start, 0)
        end = min(end, chrom_len)

        data[dstart:dend, :, :] = self._reshape(
            chrom_data[start:end], (end - start, nstrands, nconds))
        return data
コード例 #5
0
    def _setitem(self, interval, condition, length, value):
        """Assign ``value`` to the storage slot of ``interval``/``condition``.

        Parameters
        ----------
        interval
            Object exposing ``chrom``, ``start`` and ``end``.
        condition
            Index along the last axis of the stored array.
        length
            Number of leading positions to overwrite; only used when
            regions (not whole chromosomes) are stored.
        value
            Array-like to store. For whole-genome storage it is clipped
            with the offsets returned by ``self._get_indices``.
        """
        if self._full_genome_stored:
            ref_start, ref_end, array_start, array_end = self._get_indices(
                interval, value.shape[0])
            clipped = value[array_start:array_end]
            self.handle[interval.chrom][ref_start:ref_end, :, condition] = \
                clipped
            return

        # Region-wise storage: correcting for overshooting starts and ends
        # is not necessary, the region owns a full row.
        region_key = _iv_to_str(interval.chrom, interval.start, interval.end)
        row = self.region2index[region_key]
        self.handle['data'][row, :length, :, condition] = value
コード例 #6
0
ファイル: genomicarray.py プロジェクト: Annalaura94/janggu
    def __setitem__(self, index, value):
        """Assign ``value`` to the bins covered by an interval.

        Parameters
        ----------
        index : tuple
            ``(GenomicInterval, int)`` pair: the target interval and the
            condition index.
        value
            Value(s) assigned to the selected bins of the interval's strand
            and condition.

        Raises
        ------
        IndexError
            If ``index`` is not a ``(GenomicInterval, int)`` pair.

        A region or chromosome missing from the storage does not raise:
        the ``KeyError`` is caught and a notice is printed instead.
        """
        interval = index[0]
        condition = index[1]
        if not (isinstance(interval, GenomicInterval)
                and isinstance(condition, int)):
            raise IndexError("Index must be a GenomicInterval and a condition index")

        chrom = interval.chrom
        # Convert base-pair coordinates into bin coordinates.
        start = interval.start // self.resolution
        end = int(numpy.ceil(interval.end / self.resolution))
        strand = interval.strand
        # Index 1 holds the minus strand when the array is stranded.
        strand_idx = 1 if self.stranded and strand == '-' else 0

        try:
            if not self._full_genome_stored:
                length = end - start
                # Correcting for the overshooting starts and ends is not
                # necessary for partially loaded data.
                self.handle[_iv_to_str(chrom, interval.start,
                                       interval.end)][:length,
                                                      strand_idx,
                                                      condition] = value
            else:
                self.handle[chrom][start:end, strand_idx, condition] = value
        except KeyError:
            # Best effort: report the missing region and carry on.
            print('Skipping region {} - not in genomic array. '
                  'Consider using store_whole_genome=True or '
                  'adjusting the regions, binsize, stepsize and '
                  'flank.'.format(_iv_to_str(chrom, interval.start,
                                             interval.end)))
コード例 #7
0
ファイル: dna.py プロジェクト: MMesbahU/janggu
    def load_sequence(self):
        """Load the sequences and store them on the instance.

        Reads ``self.fastafile`` (a file name, or an already loaded list of
        sequence records) and sets two attributes:

        * ``self.seqs_`` -- the loaded records; when only regions are kept
          these are the per-region sub-sequences, padded with ``'N'`` where
          a region reaches beyond the reference sequence.
        * ``self.gsize_`` -- ``self.gindexer`` when only regions are kept,
          otherwise an indexer built from the sequence lengths.

        Raises
        ------
        ValueError
            If ``store_whole_genome`` is False and no ``gindexer`` is set,
            because then neither the regions nor the genome size are known.
        """
        print('loading from lazy loader')
        store_whole_genome = self.store_whole_genome
        gindexer = self.gindexer

        if not store_whole_genome and gindexer is None:
            # Without this guard the method would fail further down with an
            # UnboundLocalError on ``gsize``.
            raise ValueError('Either a gindexer must be available or '
                             'store_whole_genome must be True.')

        if isinstance(self.fastafile, str):
            seqs = sequences_from_fasta(self.fastafile, self.seqtype)
        else:
            # This is already a list of SeqRecords
            seqs = self.fastafile

        if store_whole_genome:
            gsize = OrderedDict(((seq.id, len(seq)) for seq in seqs))
            gsize = GenomicIndexer.create_from_genomesize(gsize)
        else:
            # The genome is loaded with a bed file: only the specific
            # subset is kept to keep the memory overhead low.
            rgen = OrderedDict(((seq.id, seq) for seq in seqs))
            subseqs = []
            for giv in gindexer:
                chrom_seq = rgen[giv.chrom]
                subseq = chrom_seq[max(giv.start, 0):
                                   min(giv.end, len(chrom_seq))]
                # Pad with N's where the region overshoots the reference.
                if giv.start < 0:
                    subseq = 'N' * (-giv.start) + subseq
                if len(subseq) < giv.length:
                    subseq = subseq + 'N' * (giv.length - len(subseq))
                subseq.id = _iv_to_str(giv.chrom, giv.start, giv.end)
                subseq.name = subseq.id
                subseq.description = subseq.id
                subseqs.append(subseq)
            seqs = subseqs
            gsize = gindexer

        self.gsize_ = gsize
        self.seqs_ = seqs
コード例 #8
0
ファイル: genomicarray.py プロジェクト: MMesbahU/janggu
    def __init__(
            self,
            gsize,  # pylint: disable=too-many-locals
            stranded=True,
            conditions=None,
            typecode='d',
            datatags=None,
            resolution=1,
            order=1,
            padding_value=0.0,
            store_whole_genome=True,
            cache=None,
            overwrite=False,
            loader=None,
            normalizer=None,
            collapser=None):
        """Create a numpy-backed genomic array.

        Parameters
        ----------
        gsize
            Iterable of regions (objects with ``chrom``, ``start``, ``end``
            and ``length``), or a callable returning one. It is only
            evaluated when needed.
        stranded, conditions, typecode, resolution, order, padding_value,
        store_whole_genome, collapser
            Forwarded to the parent class constructor.
        datatags, cache
            Passed to ``_get_cachefile`` / ``_load_data`` to determine the
            cache file and whether the data must be (re-)built.
        overwrite
            Not used by this constructor.
        loader : callable or None
            Called with ``self`` after the padded arrays were allocated.
        normalizer : iterable or None
            Each entry is resolved with ``get_normalizer`` and applied to
            ``self`` at the end.
        """
        super(NPGenomicArray,
              self).__init__(stranded,
                             conditions,
                             typecode,
                             resolution,
                             order=order,
                             padding_value=padding_value,
                             store_whole_genome=store_whole_genome,
                             collapser=collapser)

        gsize_ = None

        if not store_whole_genome:
            # Region-wise storage needs the region -> row lookup table.
            gsize_ = gsize() if callable(gsize) else gsize

            self.region2index = {
                _iv_to_str(region.chrom, region.start, region.end): i
                for i, region in enumerate(gsize_)
            }

        cachefile = _get_cachefile(cache, datatags, '.npz')
        load_from_file = _load_data(cache, datatags, '.npz')

        if load_from_file:
            if gsize_ is None:
                gsize_ = gsize() if callable(gsize) else gsize

            nstrands = 2 if stranded else 1
            if store_whole_genome:
                # One padded array per chromosome.
                data = {
                    str(region.chrom): init_with_padding_value(
                        padding_value,
                        shape=(_get_iv_length(region.length - self.order + 1,
                                              self.resolution),
                               nstrands, len(self.condition)),
                        dtype=self.typecode)
                    for region in gsize_
                }
                names = [str(region.chrom) for region in gsize_]
            else:
                # One padded array holding all regions, one row per region.
                data = {
                    'data':
                    init_with_padding_value(
                        padding_value,
                        shape=(len(gsize_),
                               _get_iv_length(
                                   gsize_.binsize + 2 * gsize_.flank -
                                   self.order + 1, self.resolution)
                               if self.resolution is not None else 1,
                               nstrands, len(self.condition)),
                        dtype=self.typecode)
                }
                names = ['data']
            self.handle = data

            # invoke the loader
            if loader:
                loader(self)

            if cachefile is not None:
                np.savez(cachefile, **data)

        if cachefile is not None:
            print('reload {}'.format(cachefile))
            # np.load returns an NpzFile that keeps the archive open; read
            # all arrays into memory and close it to avoid leaking the file
            # descriptor.
            with np.load(cachefile) as npzfile:
                data = {key: npzfile[key] for key in npzfile}
            names = list(data)

        # here we get either the freshly loaded data or the reloaded
        # data from np.load.
        self.handle = {key: data[key] for key in names}

        for norm in normalizer or []:
            get_normalizer(norm)(self)
コード例 #9
0
ファイル: genomicarray.py プロジェクト: MMesbahU/janggu
    def __init__(
            self,
            gsize,  # pylint: disable=too-many-locals
            stranded=True,
            conditions=None,
            typecode='d',
            datatags=None,
            resolution=1,
            order=1,
            padding_value=0.,
            store_whole_genome=True,
            cache=None,
            overwrite=False,
            loader=None,
            normalizer=None,
            collapser=None):
        """Create an HDF5-backed genomic array.

        Parameters
        ----------
        gsize
            Iterable of regions (objects with ``chrom``, ``start``, ``end``
            and ``length``), or a callable returning one. It is only
            evaluated when needed.
        stranded, conditions, typecode, resolution, order, padding_value,
        store_whole_genome, collapser
            Forwarded to the parent class constructor.
        datatags, cache
            Passed to ``_get_cachefile`` / ``_load_data`` to determine the
            HDF5 file and whether it must be (re-)built.
        overwrite
            Not used by this constructor.
        loader : callable or None
            Called with ``self`` after the datasets were created.
        normalizer : iterable or None
            Each entry is resolved with ``get_normalizer`` and applied to
            ``self`` while the file is still open for writing.

        Raises
        ------
        ValueError
            If ``cache`` is None.
        """
        super(HDF5GenomicArray,
              self).__init__(stranded,
                             conditions,
                             typecode,
                             resolution,
                             order=order,
                             padding_value=padding_value,
                             store_whole_genome=store_whole_genome,
                             collapser=collapser)

        if cache is None:
            raise ValueError('HDF5 format requires cache=True')

        gsize_ = None

        if not store_whole_genome:
            # Region-wise storage needs the region -> row lookup table.
            gsize_ = gsize() if callable(gsize) else gsize
            self.region2index = {
                _iv_to_str(region.chrom, region.start, region.end): i
                for i, region in enumerate(gsize_)
            }

        cachefile = _get_cachefile(cache, datatags, '.h5')
        load_from_file = _load_data(cache, datatags, '.h5')

        if load_from_file:
            if gsize_ is None:
                gsize_ = gsize() if callable(gsize) else gsize

            nstrands = 2 if stranded else 1
            # The context manager closes the file even if the loader or a
            # normalizer raises, so no write handle is leaked.
            with h5py.File(cachefile, 'w') as h5file:
                if store_whole_genome:
                    # One padded dataset per chromosome.
                    for region in gsize_:
                        shape = (_get_iv_length(
                            region.length - self.order + 1,
                            self.resolution), nstrands, len(self.condition))
                        h5file.create_dataset(str(region.chrom),
                                              shape,
                                              dtype=self.typecode,
                                              data=init_with_padding_value(
                                                  padding_value, shape,
                                                  self.typecode))
                else:
                    # One padded dataset holding all regions.
                    shape = (len(gsize_),
                             _get_iv_length(
                                 gsize_.binsize + 2 * gsize_.flank -
                                 self.order + 1, self.resolution), nstrands,
                             len(self.condition))
                    h5file.create_dataset('data',
                                          shape,
                                          dtype=self.typecode,
                                          data=init_with_padding_value(
                                              padding_value, shape,
                                              self.typecode))

                # Expose the writable file to the loader and normalizers.
                self.handle = h5file

                # invoke the loader
                if loader:
                    loader(self)

                for norm in normalizer or []:
                    get_normalizer(norm)(self)

        # Always continue with a freshly (re-)opened file.
        print('reload {}'.format(cachefile))
        h5file = h5py.File(cachefile, 'a', driver='stdio')

        self.handle = h5file
コード例 #10
0
    def __setitem__(self, index, value):
        """Assign a coverage array to an interval for one condition.

        Parameters
        ----------
        index : tuple
            ``(GenomicInterval, int)`` pair: the target interval and the
            condition index.
        value : numpy array
            Two-dimensional array whose last axis is the strand axis. For
            an unstranded array any other width is summed over axis 1.

        Raises
        ------
        ValueError
            If the array is stranded and ``value.shape[-1] != 2``.
        IndexError
            If ``index`` is not a ``(GenomicInterval, int)`` pair.

        A region or chromosome missing from the storage is skipped
        silently (the ``KeyError`` is swallowed).
        """
        interval = index[0]
        condition = index[1]

        if self.stranded:
            if value.shape[-1] != 2:
                raise ValueError(
                    'If genomic array is in stranded mode, shape[-1] == 2 is expected'
                )
        elif value.shape[-1] != 1:
            # Unstranded storage: merge the strands into a single column.
            value = value.sum(axis=1).reshape(-1, 1)

        if not (isinstance(interval, GenomicInterval)
                and isinstance(condition, int)):
            raise IndexError(
                "Index must be a GenomicInterval and a condition index")

        chrom = interval.chrom
        start = self.get_iv_start(interval.start)
        end = self.get_iv_end(interval.end)

        if self.collapser is not None:
            # Reshape the 2D value into (bins, positions per bin, strands);
            # the collapser is then applied to that array.
            if self.resolution is None:
                # collapse along the entire interval
                binned_shape = (1, len(value), value.shape[-1])
            else:
                # collapse in bins of size resolution
                binned_shape = (len(value) // self.resolution,
                                self.resolution, value.shape[-1])

            value = self.collapser(value.reshape(binned_shape))

        try:
            if self._full_genome_stored:
                chrom_len = self.handle[chrom].shape[0]
                # Clip the part of the interval hanging over either end of
                # the chromosome, both in the storage and in ``value``.
                overshoot = max(end - chrom_len, 0)
                tmp_start = max(-start, 0)
                ref_start = max(start, 0)
                tmp_end = value.shape[0] - overshoot
                ref_end = end - overshoot

                self.handle[chrom][ref_start:ref_end, :, condition] = \
                    value[tmp_start:tmp_end, :]
            else:
                # Correcting for the overshooting starts and ends is not
                # necessary for partially loaded data.
                region_key = _iv_to_str(chrom, interval.start, interval.end)
                self.handle[region_key][:end - start, :, condition] = value
        except KeyError:
            # We end up here if the peak regions are not a subset of the
            # regions of interest, e.g. when peaks from the holdout part of
            # the genome are added. It can also mean that
            # store_whole_genome=False and the peaks and regions of
            # interest are not synchronized, in which case nothing (or too
            # few peaks) is added; the two cases cannot be told apart
            # here, so the region is skipped.
            pass
コード例 #11
0
    def __setitem__(self, index, value):
        """Store the positive entries of a coverage array for one condition.

        Parameters
        ----------
        index : tuple
            ``(GenomicInterval, int)`` pair: the target interval and the
            condition index.
        value : numpy array
            Two-dimensional array whose last axis is the strand axis. For
            an unstranded array any other width is summed over axis 1.

        Raises
        ------
        ValueError
            If the array is stranded and ``value.shape[-1] != 2``.
        IndexError
            If ``index`` is not a ``(GenomicInterval, int)`` pair.

        Only entries greater than zero are written. A region or chromosome
        missing from the storage is skipped silently.
        """
        interval = index[0]
        condition = index[1]
        if not (isinstance(interval, GenomicInterval)
                and isinstance(condition, int)):
            raise IndexError(
                "Index must be a GenomicInterval and a condition index")

        chrom = interval.chrom
        start = self.get_iv_start(interval.start)
        end = self.get_iv_end(interval.end)

        if self.stranded:
            if value.shape[-1] != 2:
                raise ValueError(
                    'If genomic array is in stranded mode, shape[-1] == 2 is expected'
                )
        elif value.shape[-1] != 1:
            # Unstranded storage: merge the strands into a single column.
            value = value.sum(axis=1).reshape(-1, 1)

        if self.collapser is not None:
            # Reshape the 2D value into (bins, positions per bin, strands);
            # the collapser is then applied to that array.
            if self.resolution is None:
                # collapse along the entire interval
                binned_shape = (1, len(value), value.shape[-1])
            else:
                # collapse in bins of size resolution
                binned_shape = (len(value) // self.resolution,
                                self.resolution, value.shape[-1])

            value = self.collapser(value.reshape(binned_shape))

        try:
            if self._full_genome_stored:
                chrom_len = self.handle[chrom].shape[0]
                # Clip the part of the interval hanging over either end of
                # the chromosome.
                tmp_start = max(-start, 0)
                ref_start = max(start, 0)
                ref_end = end - max(end - chrom_len, 0)

                for sind in range(value.shape[-1]):
                    # Strands occupy consecutive blocks of condition
                    # columns.
                    column = sind * len(self.condition) + condition
                    for idx, iarray in enumerate(range(ref_start, ref_end)):
                        val = value[idx + tmp_start, sind]
                        if val > 0:
                            self.handle[chrom][iarray, column] = val
            else:
                region_key = _iv_to_str(chrom, interval.start, interval.end)
                for sind in range(value.shape[-1]):
                    column = sind * len(self.condition) + condition
                    for idx in range(end - start):
                        val = value[idx, sind]
                        if val > 0:
                            self.handle[region_key][idx, column] = val
        except KeyError:
            # We end up here if the peak regions are not a subset of the
            # regions of interest, e.g. when peaks from the holdout part of
            # the genome are added. It can also mean that
            # store_whole_genome=False and the peaks and regions of
            # interest are not synchronized, in which case nothing (or too
            # few peaks) is added; the two cases cannot be told apart
            # here, so the region is skipped.
            pass
コード例 #12
0
ファイル: coverage.py プロジェクト: Annalaura94/janggu
    def create_from_bed(
            cls,
            name,  # pylint: disable=too-many-locals
            bedfiles,
            regions=None,
            genomesize=None,
            conditions=None,
            binsize=None,
            stepsize=None,
            resolution=1,
            flank=0,
            storage='ndarray',
            dtype='int',
            dimmode='all',
            mode='binary',
            store_whole_genome=False,
            overwrite=False,
            channel_last=True,
            datatags=None,
            cache=False):
        """Create a Cover class from a bed-file (or files).

        Parameters
        -----------
        name : str
            Name of the dataset
        bedfiles : str or list
            bed-file or list of bed files.
        regions : str or None
            Bed-file defining the region of interest.
            If set to None a genomesize must be supplied and
            a genomic indexer must be attached later.
        genomesize : dict or None
            Dictionary containing the genome size to fetch the coverage from.
            If `genomesize=None`, the genome size
            is fetched from the region of interest.
        conditions : list(str) or None
            List of conditions.
            If `conditions=None`,
            the conditions are obtained from
            the filenames (without the directories
            and file-ending).
        binsize : int or None
            Binsize in basepairs. For binsize=None,
            the binsize will be determined from the bed-file directly
            which requires that all intervals in the bed-file are of equal
            length. Otherwise, the intervals in the bed-file will be
            split to subintervals of length binsize in conjunction with
            stepsize. Default: None.
        stepsize : int or None
            stepsize in basepairs for traversing the genome.
            If stepsize is None, it will be set equal to binsize.
            Default: None.
        resolution : int
            Resolution in base pairs divides the region of interest
            in windows of length resolution.
            This effectively reduces the storage for coverage data.
            The resolution must be selected such that min(stepsize, binsize)
            is a multiple of resolution.
            Default: 1.
        flank : int
            Flanking size increases the interval size at both ends by
            flank bins. Note that the binsize is defined by the resolution parameter.
            Default: 0.
        storage : str
            Storage mode for storing the coverage data can be
            'ndarray', 'hdf5' or 'sparse'. Default: 'ndarray'.
        dtype : str
            Typecode to define the datatype to be used for storage.
            Default: 'int'.
        dimmode : str
            Dimension mode can be 'first' or 'all'. If 'first', only
            the first element of size resolution is returned. Otherwise,
            all elements of size resolution spanning the interval are returned.
            Default: 'all'.
        mode : str
            Mode of the dataset may be 'binary', 'score' or 'categorical'.
            Default: 'binary'.
        overwrite : boolean
            Overwrite cachefiles. Default: False.
        datatags : list(str) or None
            List of datatags. Together with the dataset name,
            the datatags are used to construct a cache file.
            If :code:`cache=False`, this option does not have an effect.
            Default: None.
        store_whole_genome : boolean
            Indicates whether the whole genome or only selected regions
            should be loaded. If False, a bed-file with regions of interest
            must be specified. Default: False.
        channel_last : boolean
            Indicates whether the condition axis should be the last dimension
            or the first. For example, tensorflow expects the channel at the
            last position. Default: True.
        cache : boolean
            Indicates whether to cache the dataset. Default: False.
        """

        if regions is None and genomesize is None:
            raise ValueError('Either regions or genomesize must be specified.')

        if regions is not None:
            gindexer = GenomicIndexer.create_from_file(regions, binsize,
                                                       stepsize, flank)
        else:
            gindexer = None

        if not store_whole_genome:
            # if whole genome should not be loaded
            gsize = {
                _iv_to_str(iv.chrom, iv.start, iv.end): iv.end - iv.start
                for iv in gindexer
            }

        else:
            # otherwise the whole genome will be fetched, or at least
            # a set of full length chromosomes
            if genomesize is not None:
                # if a genome size has specifically been given, use it.
                gsize = genomesize.copy()
            else:
                gsize = get_genome_size_from_regions(regions)

        if isinstance(bedfiles, str):
            bedfiles = [bedfiles]

        if mode == 'categorical':
            if len(bedfiles) > 1:
                raise ValueError('Only one bed-file is '
                                 'allowed with mode=categorical')
            sample_file = bedfiles[0]
            regions_ = _get_genomic_reader(sample_file)

            max_class = 0
            for reg in regions_:
                if reg.score > max_class:
                    max_class = reg.score
            if conditions is None:
                conditions = [str(i) for i in range(int(max_class + 1))]
        if conditions is None:
            conditions = [
                os.path.splitext(os.path.basename(f))[0] for f in bedfiles
            ]

        def _bed_loader(garray, bedfiles, genomesize, mode):
            print("load from bed")
            for i, sample_file in enumerate(bedfiles):
                regions_ = _get_genomic_reader(sample_file)

                for region in regions_:
                    gidx = GenomicIndexer.create_from_region(
                        region.iv.chrom, region.iv.start, region.iv.end,
                        region.iv.strand, binsize, stepsize, flank)
                    for greg in gidx:

                        if region.score is None and mode in [
                                'score', 'categorical'
                        ]:
                            raise ValueError(
                                'No Score available. Score field must '
                                'present in {}'.format(sample_file) + \
                                'for mode="{}"'.format(mode))
                        # if region score is not defined, take the mere
                        # presence of a range as positive label.
                        if mode == 'score':
                            garray[greg,
                                   i] = np.dtype(dtype).type(region.score)
                        elif mode == 'categorical':
                            garray[greg,
                                   int(region.score)] = np.dtype(dtype).type(1)
                        elif mode == 'binary':
                            garray[greg, i] = np.dtype(dtype).type(1)
            return garray

        # At the moment, we treat the information contained
        # in each bed-file as unstranded

        datatags = [name] + datatags if datatags else [name]
        datatags += ['resolution{}'.format(resolution)]

        cover = create_genomic_array(gsize,
                                     stranded=False,
                                     storage=storage,
                                     datatags=datatags,
                                     cache=cache,
                                     conditions=conditions,
                                     resolution=resolution,
                                     overwrite=overwrite,
                                     typecode=dtype,
                                     store_whole_genome=store_whole_genome,
                                     loader=_bed_loader,
                                     loader_args=(bedfiles, gsize, mode))

        return cls(name,
                   cover,
                   gindexer,
                   padding_value=0,
                   dimmode=dimmode,
                   channel_last=channel_last)
コード例 #13
0
ファイル: coverage.py プロジェクト: Annalaura94/janggu
    def create_from_bam(
            cls,
            name,  # pylint: disable=too-many-locals
            bamfiles,
            regions=None,
            genomesize=None,
            conditions=None,
            min_mapq=None,
            binsize=None,
            stepsize=None,
            flank=0,
            resolution=1,
            storage='ndarray',
            dtype='int',
            stranded=True,
            overwrite=False,
            pairedend='5prime',
            template_extension=0,
            aggregate=None,
            datatags=None,
            cache=False,
            channel_last=True,
            store_whole_genome=False):
        """Create a Cover class from a bam-file (or files).

        This constructor can be used to obtain coverage from BAM files.
        For single-end reads the read will be counted at the 5 prime end.
        Paired-end reads can be counted relative to the 5 prime ends of the read
        (default) or with respect to the midpoint. For paired-end data only
        the first mate of a proper pair whose mates map to the same reference
        is counted, so that each fragment contributes once.

        Parameters
        -----------
        name : str
            Name of the dataset
        bamfiles : str or list
            bam-file or list of bam files.
        regions : str or None
            Bed-file defining the region of interest.
            If set to None, the coverage will be
            fetched from the entire genome and a
            genomic indexer must be attached later.
        genomesize : dict or None
            Dictionary containing the genome size.
            If `genomesize=None`, the genome size
            is determined from the header of the first bam-file.
            If `store_whole_genome=False`, this option does not have an effect.
        conditions : list(str) or None
            List of conditions.
            If `conditions=None`,
            the conditions are obtained from
            the filenames (without the directories
            and file-ending).
        min_mapq : int or None
            Minimal mapping quality.
            Reads with lower mapping quality are
            filtered out. If None, all reads are used.
        binsize : int or None
            Binsize in basepairs. For binsize=None,
            the binsize will be determined from the bed-file directly
            which requires that all intervals in the bed-file are of equal
            length. Otherwise, the intervals in the bed-file will be
            split to subintervals of length binsize in conjunction with
            stepsize. Default: None.
        stepsize : int or None
            stepsize in basepairs for traversing the genome.
            If stepsize is None, it will be set equal to binsize.
            Default: None.
        flank : int
            Flanking size increases the interval size at both ends by
            flank base pairs. Default: 0
        resolution : int
            Resolution in base pairs divides the region of interest
            in windows of length resolution.
            This effectively reduces the storage for coverage data.
            The resolution must be selected such that min(stepsize, binsize)
            is a multiple of resolution.
            Default: 1.
        storage : str
            Storage mode for storing the coverage data can be
            'ndarray', 'hdf5' or 'sparse'. Default: 'ndarray'.
        dtype : str
            Typecode to be used for storing the data.
            Default: 'int'.
        stranded : boolean
            Indicates whether to extract stranded or
            unstranded coverage. For unstranded
            coverage, reads aligning to both strands will be aggregated.
            Default: True.
        overwrite : boolean
            Overwrite cachefiles. Default: False.
        pairedend : str
            Indicates whether to count reads at the '5prime' end or at
            the 'midpoint' for paired-end reads. Any value other than
            'midpoint' is treated like '5prime'. Default: '5prime'.
        template_extension : int
            Elongates intervals by template_extension which allows to properly count
            template mid-points whose reads lie outside of the interval.
            This option is only relevant for paired-end reads counted at the
            'midpoint' and if the coverage is not obtained from the
            whole genome, e.g. regions is not None.
        aggregate : callable or None
            Aggregation operation for loading genomic array. If given, it
            is applied to the per-locus count array (positions x 2 strands)
            before the counts are stored. If None,
            the coverage amounts to the raw counts.
            Default: None
        datatags : list(str) or None
            List of datatags. Together with the dataset name,
            the datatags are used to construct a cache file.
            If :code:`cache=False`, this option does not have an effect.
            Default: None.
        cache : boolean
            Indicates whether to cache the dataset. Default: False.
        channel_last : boolean
            Indicates whether the condition axis should be the last dimension
            or the first. For example, tensorflow expects the channel at the
            last position. Default: True.
        store_whole_genome : boolean
            Indicates whether the whole genome or only selected regions
            should be loaded. If False, a bed-file with regions of interest
            must be specified. Default: False

        Returns
        -------
        An instance of the class the constructor was invoked on.

        Raises
        ------
        Exception
            If pysam is not installed.
        ValueError
            If no regions are available while `store_whole_genome=False`.
        """

        if pysam is None:  # pragma: no cover
            raise Exception(
                'pysam not available. '
                '`create_from_bam` requires pysam to be installed.')

        if regions is not None:
            gindexer = GenomicIndexer.create_from_file(regions, binsize,
                                                       stepsize, flank)
        else:
            gindexer = None

        if isinstance(bamfiles, str):
            bamfiles = [bamfiles]

        if conditions is None:
            conditions = [
                os.path.splitext(os.path.basename(f))[0] for f in bamfiles
            ]

        if min_mapq is None:
            min_mapq = 0

        if not store_whole_genome and not gindexer:
            raise ValueError(
                'Either regions must be supplied or store_whole_genome must be True'
            )

        if not store_whole_genome:
            # only the regions of interest are stored, keyed by interval
            gsize = {
                _iv_to_str(iv.chrom, iv.start, iv.end): iv.end - iv.start
                for iv in gindexer
            }
        elif genomesize is not None:
            # if a genome size has specifically been given, use it.
            gsize = genomesize.copy()
        else:
            # otherwise take the chromosome lengths from the bam header;
            # the context manager releases the file handle right away.
            with pysam.AlignmentFile(bamfiles[0], 'r') as header:  # pylint: disable=no-member
                gsize = dict(zip(header.references, header.lengths))

        def _is_countable(aln):
            """Tell whether an alignment contributes to the coverage."""
            if aln.is_unmapped or aln.mapq < min_mapq:
                return False
            if aln.is_read2:
                # only consider read1 so as not to double count
                # fragments for paired end reads.
                # is_read2 is also False for single end reads.
                return False
            if aln.is_paired:
                # only consider paired end reads if both mates
                # are properly mapped and they map to the
                # same reference_name
                return (aln.is_proper_pair
                        and aln.reference_name == aln.next_reference_name)
            return True

        def _count_position(aln):
            """Determine the reference position at which to count the read."""
            if aln.is_paired:
                if pairedend == 'midpoint':
                    return min(aln.reference_start,
                               aln.next_reference_start) + \
                        abs(aln.template_length) // 2
                if aln.is_reverse:
                    # last position of the downstream read
                    return max(aln.reference_end,
                               aln.next_reference_start + aln.query_length)
                # first position of the upstream read
                return min(aln.reference_start, aln.next_reference_start)
            # single end reads: the 5 prime end is strand specific
            if aln.is_reverse:
                return aln.reference_end
            return aln.reference_start

        def _bam_loader(garray, files):
            print("load from bam")
            for i, sample_file in enumerate(files):
                print('Counting from {}'.format(sample_file))
                # close each alignment file as soon as it has been counted
                with pysam.AlignmentFile(sample_file, 'rb') as aln_file:  # pylint: disable=no-member
                    for chrom in gsize:

                        # column 0: forward strand, column 1: reverse strand
                        array = np.zeros(
                            (get_chrom_length(gsize[chrom], resolution), 2),
                            dtype=dtype)

                        locus = _str_to_iv(
                            chrom, template_extension=template_extension)
                        if len(locus) == 1:
                            locus = (locus[0], 0, gsize[chrom])
                        # locus = (chr, start, end)
                        # or locus = (chr, )

                        for aln in aln_file.fetch(*locus):

                            if not _is_countable(aln):
                                continue

                            pos = _count_position(aln)

                            if not garray._full_genome_stored:
                                # if we get here, a region was given,
                                # otherwise, the entire chromosome is read.
                                pos -= locus[1] + template_extension

                                if pos < 0 or pos >= locus[2] - locus[1]:
                                    # if the read 5 p end or mid point is
                                    # outside of the region of interest,
                                    # the read is discarded
                                    continue

                            # convert the position to the bin index
                            pos //= resolution

                            if not 0 <= pos < array.shape[0]:
                                # e.g. a reverse read ending flush with the
                                # end of the locus yields a position one
                                # past the last bin; skip it instead of
                                # failing with an IndexError
                                continue

                            # fill up the read strand specifically
                            array[pos, 1 if aln.is_reverse else 0] += 1

                        # apply the aggregation
                        if aggregate is not None:
                            array = aggregate(array)

                        if stranded:
                            lp = locus + ('+', )
                            garray[GenomicInterval(*lp), i] = array[:, 0]
                            lm = locus + ('-', )
                            garray[GenomicInterval(*lm), i] = array[:, 1]
                        else:
                            # if unstranded, aggregate the reads from
                            # both strands
                            garray[GenomicInterval(*locus),
                                   i] = array.sum(axis=1)

            return garray

        datatags = [name] + datatags if datatags else [name]

        cover = create_genomic_array(gsize,
                                     stranded=stranded,
                                     storage=storage,
                                     datatags=datatags,
                                     cache=cache,
                                     conditions=conditions,
                                     overwrite=overwrite,
                                     typecode=dtype,
                                     store_whole_genome=store_whole_genome,
                                     resolution=resolution,
                                     loader=_bam_loader,
                                     loader_args=(bamfiles, ))

        return cls(name,
                   cover,
                   gindexer,
                   padding_value=0,
                   dimmode='all',
                   channel_last=channel_last)
コード例 #14
0
ファイル: coverage.py プロジェクト: Annalaura94/janggu
    def create_from_array(
            cls,
            name,  # pylint: disable=too-many-locals
            array,
            gindexer,
            genomesize=None,
            conditions=None,
            resolution=1,
            storage='ndarray',
            overwrite=False,
            datatags=None,
            cache=False,
            channel_last=True,
            store_whole_genome=False):
        """Create a Cover class from a numpy.array.

        The purpose of this function is to convert output prediction from
        keras which are in numpy.array format into a Cover object.

        Parameters
        -----------
        name : str
            Name of the dataset
        array : numpy.array
            A 4D numpy array that will be re-interpreted as genomic array.
            With `channel_last=True` the axes are read as
            (datapoint, position, strand, condition).
        gindexer : GenomicIndexer
            Genomic indices associated with the values contained in array.
        genomesize : dict or None
            Dictionary containing the genome size to fetch the coverage from.
            If `genomesize=None`, the genome size is automatically determined
            from the GenomicIndexer. If `store_whole_genome=False` this
            option does not have an effect.
        conditions : list(str) or None
            List of conditions.
            If `conditions=None`, the conditions are named
            'Cond_0', 'Cond_1', ... according to the size of the
            condition axis.
        resolution : int
            Resolution in base pairs divides the region of interest
            in windows of length resolution.
            NOTE: the value passed here is currently not used. The
            resolution is derived from the length of the first interval
            in gindexer divided by the length of the position axis.
            Default: 1.
        storage : str
            Storage mode for storing the coverage data can be
            'ndarray', 'hdf5' or 'sparse'. Default: 'ndarray'.
        overwrite : boolean
            Overwrite cachefiles. Default: False.
        datatags : list(str) or None
            List of datatags. Together with the dataset name,
            the datatags are used to construct a cache file.
            If :code:`cache=False`, this option does not have an effect.
            Default: None.
        cache : boolean
            Indicates whether to cache the dataset. Default: False.
        channel_last : boolean
            This tells the constructor how to interpret the array dimensions.
            It indicates whether the condition axis is the last dimension
            or the first one following the datapoint axis. For example,
            tensorflow expects the channel at the last position.
            Default: True.
        store_whole_genome : boolean
            Indicates whether the whole genome or only selected regions
            should be loaded. Default: False.

        Returns
        -------
        An instance of the class the constructor was invoked on.

        Raises
        ------
        ValueError
            If the number of intervals in gindexer differs from the
            number of datapoints in array, or if `store_whole_genome=True`
            is used with overlapping intervals (binsize > stepsize).
        """

        if not store_whole_genome:
            # if whole genome should not be loaded
            gsize = {
                _iv_to_str(iv.chrom, iv.start, iv.end): iv.end - iv.start
                for iv in gindexer
            }
        elif genomesize:
            gsize = genomesize.copy()
        else:
            # if not supplied, determine the genome size automatically
            # based on the gindexer intervals.
            gsize = get_genome_size_from_regions(gindexer)

        if not channel_last:
            # Move the condition axis from position 1 to the end so that
            # the remaining code can rely on the layout
            # (datapoint, position, strand, condition).
            # (0, 3, 1, 2) would be the opposite conversion
            # (channel-last -> channel-first).
            array = np.transpose(array, (0, 2, 3, 1))

        if conditions is None:
            conditions = ["Cond_{}".format(i) for i in range(array.shape[-1])]

        # check if dimensions of gindexer and array match
        if len(gindexer) != array.shape[0]:
            raise ValueError(
                "Data incompatible: "
                "The number intervals in gindexer"
                " must match the number of datapoints in the array "
                "(len(gindexer) != array.shape[0])")

        # with the whole genome stored, the intervals must be
        # non-overlapping in order to obtain unambiguous data.
        if store_whole_genome and gindexer.binsize > gindexer.stepsize:
            raise ValueError(
                "Overlapping intervals: "
                "With overlapping intervals the mapping between "
                "the array and genomic-array values is ambiguous. "
                "Please ensure that binsize <= stepsize.")

        # determine the resolution from the data; this deliberately
        # replaces the value of the `resolution` argument.
        resolution = gindexer[0].length // array.shape[1]

        # a strand axis of size two is interpreted as stranded data
        stranded = array.shape[2] == 2

        def _array_loader(garray, array, gindexer):
            print("load from array")

            for i, region in enumerate(gindexer):
                iv = region
                for cond in range(array.shape[-1]):
                    if stranded:
                        # No dtype conversion is needed here: the genomic
                        # array is created with typecode=array.dtype.
                        iv.strand = '+'
                        garray[iv, cond] = array[i, :, 0, cond]
                        iv.strand = '-'
                        garray[iv, cond] = array[i, :, 1, cond]
                    else:
                        garray[iv, cond] = array[i, :, 0, cond]

            return garray

        datatags = [name] + datatags if datatags else [name]
        datatags += ['resolution{}'.format(resolution)]

        cover = create_genomic_array(gsize,
                                     stranded=stranded,
                                     storage=storage,
                                     datatags=datatags,
                                     cache=cache,
                                     conditions=conditions,
                                     resolution=resolution,
                                     overwrite=overwrite,
                                     typecode=array.dtype,
                                     store_whole_genome=store_whole_genome,
                                     loader=_array_loader,
                                     loader_args=(array, gindexer))

        return cls(name,
                   cover,
                   gindexer,
                   padding_value=0,
                   dimmode='all',
                   channel_last=channel_last)
コード例 #15
0
ファイル: genomicarray.py プロジェクト: MMesbahU/janggu
    def __init__(
            self,
            gsize,  # pylint: disable=too-many-locals
            stranded=True,
            conditions=None,
            typecode='d',
            datatags=None,
            resolution=1,
            order=1,
            store_whole_genome=True,
            cache=None,
            padding_value=0.0,
            overwrite=False,
            loader=None,
            collapser=None):
        """Set up a sparse genomic array, optionally backed by an npz cache.

        The data is first collected in scipy dok matrices via `loader`,
        converted into (value, row, column) triplets and finally exposed
        through ``self.handle`` as a dict of csr matrices.

        Parameters
        -----------
        gsize : indexer-like object or callable
            Iterable of regions (exposing chrom, start, end and length) or
            a callable returning it. ``len()``, ``binsize`` and ``flank``
            are additionally read if `store_whole_genome=False`. A callable
            is only invoked if the regions are actually needed.
        stranded : boolean
            Whether columns for two strands are allocated. Default: True.
        conditions : list(str) or None
            Conditions, forwarded to the base class.
        typecode : str
            Datatype, forwarded to the base class. Default: 'd'.
        datatags : list(str) or None
            Tags that are passed on to determine the cache file.
        resolution : int or None
            Resolution, forwarded to the base class and used to determine
            the number of rows/columns per region. Default: 1.
        order : int
            Order, forwarded to the base class. Default: 1.
        store_whole_genome : boolean
            If True, one matrix per chromosome is kept. Otherwise, a single
            matrix 'data' with one row per region is kept. Default: True.
        cache : object
            Cache setting that is passed on to determine the cache file.
        padding_value : float
            Padding value, forwarded to the base class. Default: 0.0.
        overwrite : boolean
            Accepted for interface compatibility; not used by this method.
        loader : callable or None
            If given, it is invoked as ``loader(self)`` to fill
            ``self.handle`` when the data is built from scratch.
        collapser : object
            Collapser, forwarded to the base class.
        """
        super(SparseGenomicArray,
              self).__init__(stranded,
                             conditions,
                             typecode,
                             resolution,
                             order=order,
                             padding_value=padding_value,
                             store_whole_genome=store_whole_genome,
                             collapser=collapser)

        cachefile = _get_cachefile(cache, datatags, '.npz')
        # NOTE(review): presumably True if the data has to be built with
        # the loader rather than taken from an existing cache file.
        load_from_file = _load_data(cache, datatags, '.npz')

        # number of columns that belong to a single position
        nstrandcond = (2 if stranded else 1) * len(self.condition)

        # resolve the regions only if they are needed, since gsize
        # may be a callable.
        gsize_ = None
        if not store_whole_genome or load_from_file:
            gsize_ = gsize() if callable(gsize) else gsize

        if not store_whole_genome:
            # map each region to its row in the 'data' matrix
            self.region2index = {
                _iv_to_str(region.chrom, region.start, region.end): i
                for i, region in enumerate(gsize_)
            }

        if load_from_file:
            if store_whole_genome:
                data = {
                    str(region.chrom): sparse.dok_matrix(
                        (_get_iv_length(region.length - self.order + 1,
                                        resolution), nstrandcond),
                        dtype=self.typecode)
                    for region in gsize_
                }
            else:
                if self.resolution is not None:
                    nbins = _get_iv_length(
                        gsize_.binsize + 2 * gsize_.flank - self.order + 1,
                        self.resolution)
                else:
                    nbins = 1
                data = {
                    'data': sparse.dok_matrix(
                        (len(gsize_), nbins * nstrandcond),
                        dtype=self.typecode)
                }
            self.handle = data

            # invoke the loader
            if loader:
                loader(self)

            # the loader operates on self.handle, which it may have replaced
            data = {chrom: self.handle[chrom].tocoo() for chrom in self.handle}

            # store each matrix as (value, row, column) triplets
            storage = {
                chrom: np.column_stack([data[chrom].data,
                                        data[chrom].row,
                                        data[chrom].col])
                for chrom in data
            }
            if store_whole_genome:
                # keep the chromosome lengths to restore the matrix shapes
                for region in gsize_:
                    storage[region.chrom + '__length__'] = region.length

            if cachefile is not None:
                np.savez(cachefile, **storage)

        if cachefile is not None:
            print('reload {}'.format(cachefile))
            # read all arrays at once and close the archive afterwards
            # instead of leaving the npz file handle open.
            with np.load(cachefile) as npz:
                storage = {key: npz[key] for key in npz.files}

        names = [name for name in storage if '__length__' not in name]

        def _restore(name, shape):
            """Rebuild a csr matrix from the stored triplets."""
            triplets = storage[name]
            return sparse.coo_matrix(
                (triplets[:, 0], (triplets[:, 1].astype('int'),
                                  triplets[:, 2].astype('int'))),
                shape=shape).tocsr()

        if store_whole_genome:
            self.handle = {
                name: _restore(
                    name,
                    (_get_iv_length(storage[str(name) + '__length__'],
                                    resolution), nstrandcond))
                for name in names
            }
        else:
            # gsize_ is always available for store_whole_genome=False
            if self.resolution is not None:
                nbins = _get_iv_length(gsize_.binsize + 2 * gsize_.flank,
                                       resolution)
            else:
                nbins = 1
            shape = (len(gsize_), nbins * nstrandcond)
            self.handle = {name: _restore(name, shape) for name in names}
コード例 #16
0
ファイル: dna.py プロジェクト: bigdataguru/janggu
    def create_from_refgenome(cls, name, refgenome, roi=None,
                              binsize=None, stepsize=None,
                              flank=0, order=1,
                              storage='ndarray',
                              datatags=None,
                              cache=False,
                              overwrite=False,
                              channel_last=True,
                              store_whole_genome=False):
        """Create a Bioseq class from a reference genome.

        Nucleotide sequences are taken from a reference genome. If regions
        of interest (ROI) are given and `store_whole_genome=False`, only the
        sequences of these regions are kept. Otherwise, all sequences of the
        reference genome are handed over to the genomic array.

        Parameters
        -----------
        name : str
            Name of the dataset
        refgenome : str or list
            Fasta file name or an already loaded list of sequence records.
        roi : str or None
            Bed-file defining the region of interest.
            If set to None, the sequence will be
            fetched from the entire genome and a
            genomic indexer must be attached later.
            Otherwise, the sequence is only determined
            for the region of interest.
        binsize : int or None
            Binsize in basepairs. For binsize=None,
            the binsize will be determined from the bed-file directly
            which requires that all intervals in the bed-file are of equal
            length. Otherwise, the intervals in the bed-file will be
            split to subintervals of length binsize in conjunction with
            stepsize. Default: None.
        stepsize : int or None
            stepsize in basepairs for traversing the genome.
            If stepsize is None, it will be set equal to binsize.
            Default: None.
        flank : int
            Flanking region in basepairs to be extended up and downstream
            of each interval. Default: 0.
        order : int
            Order for the one-hot representation. Default: 1.
        storage : str
            Storage mode for storing the sequence may be 'ndarray', 'hdf5' or
            'sparse'. Default: 'ndarray'.
        datatags : list(str) or None
            List of datatags. Together with the dataset name,
            the datatags are used to construct a cache file.
            If :code:`cache=False`, this option does not have an effect.
            Default: None.
        cache : boolean
            Indicates whether to cache the dataset. Default: False.
        overwrite : boolean
            Overwrite the cachefiles. Default: False.
        channel_last : boolean
            Passed on to the constructor of the class. Default: True.
        store_whole_genome : boolean
            Indicates whether the whole genome or only ROI
            should be loaded. If False, a bed-file with regions of interest
            must be specified. Default: False.

        Returns
        -------
        An instance of the class the constructor was invoked on.

        Raises
        ------
        ValueError
            If neither roi is supplied nor `store_whole_genome=True`.
        """
        gindexer = None
        if roi is not None:
            gindexer = GenomicIndexer.create_from_file(roi, binsize,
                                                       stepsize, flank)

        if gindexer is None and not store_whole_genome:
            raise ValueError('Either roi must be supplied or store_whole_genome must be True')

        # a string is taken as fasta file name, anything else is
        # expected to be a list of SeqRecords already
        seqs = sequences_from_fasta(refgenome, 'dna') \
            if isinstance(refgenome, str) else refgenome

        if not store_whole_genome:
            # gindexer is guaranteed to be available at this point.
            # Only the sequences of the regions of interest are kept
            # to keep the memory overhead low.
            records_by_id = {record.id: record for record in seqs}
            regional = []
            for region in gindexer:
                record = records_by_id[region.chrom][region.start:region.end]
                # label the sub-sequence by its interval; the end is
                # shortened by order - 1
                label = _iv_to_str(region.chrom, region.start,
                                   region.end - order + 1)
                record.id = label
                record.name = record.id
                record.description = record.id
                regional.append(record)
            seqs = regional

        garray = cls._make_genomic_array(
            name, seqs, order, storage,
            datatags=datatags,
            cache=cache,
            overwrite=overwrite,
            store_whole_genome=store_whole_genome)

        alphabetsize = len(seqs[0].seq.alphabet.letters)
        return cls(name, garray, gindexer,
                   alphabetsize=alphabetsize,
                   channel_last=channel_last)
コード例 #17
0
ファイル: genomicarray.py プロジェクト: MMesbahU/janggu
    def __setitem__(self, index, value):
        """Write the strictly positive entries of ``value`` into the array.

        Parameters
        ----------
        index : tuple
            ``(interval, condition)``. ``interval`` must be an ``Interval``
            and ``condition`` an int or a slice. A slice is always replaced
            by ``slice(None, value.shape[-1])``.
        value : numpy.ndarray
            Array with positions on axis 0 and strands on axis 1. With a
            condition slice it must be 3D, and the last axis is then used
            as the condition index. In stranded mode axis 1 must have
            length 2; otherwise axis 1 is summed (``keepdims=True``) unless
            it already has length 1.

        Raises
        ------
        ValueError
            If a condition slice is combined with a non-3D ``value``, or if
            the array is stranded and ``value.shape[1] != 2``.
        IndexError
            If ``interval`` is not an ``Interval`` or ``condition`` is
            neither an int nor a slice.

        Notes
        -----
        A ``KeyError`` raised while storing is swallowed on purpose
        (see the comment at the ``except`` clause).
        """
        interval = index[0]
        condition = index[1]

        if isinstance(condition, slice):
            if value.ndim != 3:
                raise ValueError('Expected 3D array with condition slice.')
            condition = slice(None, value.shape[-1], None)

        if self.stranded:
            if value.shape[1] != 2:
                raise ValueError(
                    'If genomic array is in stranded mode, shape[1] == 2 is expected'
                )
        elif value.shape[1] != 1:
            # unstranded storage: merge the strand axis into a single column
            value = value.sum(axis=1, keepdims=True)

        if not (isinstance(interval, Interval)
                and isinstance(condition, (int, slice))):
            raise IndexError("Index must be a Interval and a condition index")

        chrom = interval.chrom

        if self.collapser is not None:
            # Collapsing is skipped when value already has the length it
            # would have after collapsing.
            if self.resolution is None:
                already_collapsed = value.shape[0] == 1
            else:
                already_collapsed = \
                    value.shape[0] == interval.length // self.resolution

            if not already_collapsed:
                if self.resolution is None:
                    # collapse along the entire interval
                    value = value.reshape((1, ) + value.shape)
                else:
                    # collapse in bins of size resolution
                    binsize = min(self.resolution, value.shape[0])
                    value = value.reshape(
                        (value.shape[0] // binsize, binsize) + value.shape[1:])

                value = self.collapser(value)

        single_condition = isinstance(condition, int)

        try:
            if not self._full_genome_stored:
                regidx = self.region2index[_iv_to_str(
                    chrom, interval.start, interval.end)]
                nconditions = len(self.condition)
                # NOTE(review): the per-position stride uses value.shape[-1],
                # which is the condition axis for 3D input rather than the
                # strand axis -- confirm this matches the layout expected by
                # the reader of handle['data'].
                ncondstrand = nconditions * value.shape[-1]
                for idx in zip(*np.where(value > 0)):
                    cond = condition if single_condition else idx[2]
                    flatpos = idx[0] * ncondstrand + idx[1] * nconditions + cond
                    self.handle['data'][regidx, flatpos] = value[idx]
            else:
                ref_start, ref_end, array_start, _ = \
                    self._get_indices(interval, value.shape[0])
                nconditions = len(self.condition)
                iarray = np.arange(ref_start, ref_end)
                # NOTE(review): positions are located over the whole of
                # value but read back with an array_start offset; this is
                # only consistent if array_start is 0 -- confirm against
                # _get_indices.
                for idx in zip(*np.where(value > 0)):
                    cond = condition if single_condition else idx[2]
                    self.handle[chrom][iarray[idx[0]],
                                       idx[1] * nconditions + cond] = \
                        value[idx[0] + array_start][idx[1:]]

        except KeyError:
            # We end up here if the peak regions are not a subset of the
            # regions of interest, e.g. when peaks from the holdout
            # proportion of the genome are being added. It can also happen
            # with store_whole_genome=False when peaks and regions of
            # interest are simply not synchronized, in which case nothing
            # (or too few peaks) is added. An error would help in the latter
            # case, but the two situations cannot be told apart here.
            pass
コード例 #18
0
ファイル: coverage.py プロジェクト: Annalaura94/janggu
    def create_from_bigwig(
            cls,
            name,  # pylint: disable=too-many-locals
            bigwigfiles,
            regions=None,
            genomesize=None,
            conditions=None,
            binsize=None,
            stepsize=None,
            resolution=1,
            flank=0,
            storage='ndarray',
            dtype='float32',
            overwrite=False,
            dimmode='all',
            aggregate=np.mean,
            datatags=None,
            cache=False,
            store_whole_genome=False,
            channel_last=True,
            nan_to_num=True):
        """Create a Cover class from a bigwig-file (or files).

        Parameters
        ----------
        name : str
            Name of the dataset
        bigwigfiles : str or list
            bigwig-file or list of bigwig files.
        regions : str or None
            Bed-file defining the region of interest.
            If set to None, the coverage will be
            fetched from the entire genome and a
            genomic indexer must be attached later.
            Otherwise, the coverage is only determined
            for the region of interest.
        genomesize : dict or None
            Dictionary containing the genome size.
            If `genomesize=None`, the genome size
            is determined from the first bigwig file.
            If `store_whole_genome=False`, this option does not have an effect.
        conditions : list(str) or None
            List of conditions.
            If `conditions=None`,
            the conditions are obtained from
            the filenames (without the directories
            and file-ending).
        binsize : int or None
            Binsize in basepairs. For binsize=None,
            the binsize will be determined from the bed-file directly
            which requires that all intervals in the bed-file are of equal
            length. Otherwise, the intervals in the bed-file will be
            split to subintervals of length binsize in conjunction with
            stepsize. Default: None.
        stepsize : int or None
            stepsize in basepairs for traversing the genome.
            If stepsize is None, it will be set equal to binsize.
            Default: None.
        resolution : int
            Resolution in base pairs divides the region of interest
            in windows of length resolution.
            This effectively reduces the storage for coverage data.
            The resolution must be selected such that min(stepsize, binsize)
            is a multiple of resolution.
            Default: 1.
        flank : int
            Flanking size increases the interval size at both ends by
            flank bins. Note that the binsize is defined by the resolution parameter.
            Default: 0.
        storage : str
            Storage mode for storing the coverage data can be
            'ndarray', 'hdf5' or 'sparse'. Default: 'ndarray'.
        dtype : str
            Typecode to define the datatype to be used for storage.
            Default: 'float32'.
        overwrite : boolean
            Overwrite cachefiles. Default: False.
        dimmode : str
            Dimension mode can be 'first' or 'all'. If 'first', only
            the first element of size resolution is returned. Otherwise,
            all elements of size resolution spanning the interval are returned.
            Default: 'all'.
        aggregate : callable
            Aggregation operation applied to the values of each window of
            length resolution while loading. Default: numpy.mean
        datatags : list(str) or None
            List of datatags. Together with the dataset name,
            the datatags are used to construct a cache file.
            If :code:`cache=False`, this option does not have an effect.
            Default: None.
        cache : boolean
            Indicates whether to cache the dataset. Default: False.
        store_whole_genome : boolean
            Indicates whether the whole genome or only selected regions
            should be loaded. If False, a bed-file with regions of interest
            must be specified. Default: False.
        channel_last : boolean
            Indicates whether the condition axis should be the last dimension
            or the first. For example, tensorflow expects the channel at the
            last position. Default: True.
        nan_to_num : boolean
            Indicates whether NaN values contained in the bigwig files should
            be interpreted as zeros. Default: True

        Returns
        -------
        An instance of ``cls`` wrapping the loaded genomic array and the
        genomic indexer (``None`` if no regions were given).

        Raises
        ------
        Exception
            If pyBigWig is not installed.
        ValueError
            If neither usable regions nor ``store_whole_genome=True``
            were supplied.
        """
        if pyBigWig is None:  # pragma: no cover
            raise Exception(
                'pyBigWig not available. '
                '`create_from_bigwig` requires pyBigWig to be installed.')
        if regions is not None:
            gindexer = GenomicIndexer.create_from_file(regions, binsize,
                                                       stepsize, flank)
        else:
            gindexer = None

        if isinstance(bigwigfiles, str):
            bigwigfiles = [bigwigfiles]

        if not store_whole_genome and not gindexer:
            raise ValueError(
                'Either regions must be supplied or store_whole_genome must be True'
            )

        if not store_whole_genome:
            # if whole genome should not be loaded, every region of
            # interest becomes its own entry keyed by its interval string
            gsize = {
                _iv_to_str(iv.chrom, iv.start, iv.end): iv.end - iv.start
                for iv in gindexer
            }

        else:
            # otherwise the whole genome will be fetched, or at least
            # a set of full length chromosomes
            if genomesize is not None:
                # if a genome size has specifically been given, use it.
                gsize = genomesize.copy()
            else:
                bwfile = pyBigWig.open(bigwigfiles[0], 'r')
                try:
                    gsize = bwfile.chroms()
                finally:
                    # only the chromosome sizes are needed from this handle
                    bwfile.close()

        if conditions is None:
            conditions = [
                os.path.splitext(os.path.basename(f))[0] for f in bigwigfiles
            ]

        def _bigwig_loader(garray, aggregate):
            """Fill garray with one condition per bigwig file."""
            print("load from bigwig")
            for i, sample_file in enumerate(bigwigfiles):
                bwfile = pyBigWig.open(sample_file)
                try:
                    for chrom in gsize:

                        vals = np.zeros(
                            (get_chrom_length(gsize[chrom], resolution), ),
                            dtype=dtype)

                        locus = _str_to_iv(chrom, template_extension=0)
                        if len(locus) == 1:
                            # plain chromosome name: span the full chromosome
                            locus = locus + (0, gsize[chrom])

                        # when only to load parts of the genome
                        for start in range(locus[1], locus[2], resolution):

                            if garray._full_genome_stored:
                                # be careful not to overshoot at the
                                # chromosome end
                                end = min(start + resolution, gsize[chrom])
                            else:
                                end = start + resolution

                            x = np.asarray(
                                bwfile.values(locus[0], int(start), int(end)))
                            if nan_to_num:
                                x = np.nan_to_num(x, copy=False)
                            vals[(start - locus[1]) // resolution] = \
                                aggregate(x)

                        garray[GenomicInterval(*locus), i] = vals
                finally:
                    # release the file handle even if loading fails midway
                    bwfile.close()
            return garray

        datatags = ([name] + datatags) if datatags else [name]
        datatags += ['resolution{}'.format(resolution)]

        cover = create_genomic_array(gsize,
                                     stranded=False,
                                     storage=storage,
                                     datatags=datatags,
                                     cache=cache,
                                     conditions=conditions,
                                     overwrite=overwrite,
                                     resolution=resolution,
                                     store_whole_genome=store_whole_genome,
                                     typecode=dtype,
                                     loader=_bigwig_loader,
                                     loader_args=(aggregate, ))

        return cls(name,
                   cover,
                   gindexer,
                   padding_value=0,
                   dimmode=dimmode,
                   channel_last=channel_last)