def view(dataset, use_regions):
    """Creates a new view on the dataset.

    It may be used to utilize the same dataset for reading out a
    training, validation and test set without creating an additional
    memory overhead. When using this method, consider using the
    `store_whole_genome=True` option with the datasets.

    Parameters
    ----------
    dataset : Cover or Bioseq object
        Original Dataset containing a union of training and test set.
    use_regions : str
        BED file name defining the regions to use for the new view.
    """
    if not hasattr(dataset, 'gindexer'):
        raise ValueError("Unknown dataset type: {}".format(type(dataset)))

    gind = GenomicIndexer.create_from_file(
        use_regions,
        dataset.gindexer.binsize,
        dataset.gindexer.stepsize,
        dataset.gindexer.flank,
        zero_padding=dataset.gindexer.zero_padding,
        collapse=dataset.gindexer.collapse,
        random_state=dataset.gindexer.random_state)

    check_gindexer_compatibility(gind, dataset.garray.resolution,
                                 dataset.garray._full_genome_stored)

    subdata = copy(dataset)
    subdata.gindexer = gind

    return subdata
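# A minimal usage sketch for `view` (all file names are placeholders):
# reusing one dataset, loaded with `store_whole_genome=True`, for a
# training and a validation view without duplicating the coverage data.
#
#     cover = Cover.create_from_bigwig('coverage',
#                                      bigwigfiles='sample.bw',
#                                      regions='roi.bed',
#                                      store_whole_genome=True)
#     train_set = view(cover, use_regions='train.bed')
#     val_set = view(cover, use_regions='val.bed')
#
# Since `view` only shallow-copies the dataset and swaps the genomic
# indexer, both views share the same underlying genomic array in memory.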
def load_sequence(self):
    print('loading from lazy loader')
    store_whole_genome = self.store_whole_genome
    gindexer = self.gindexer

    if isinstance(self.fastafile, str):
        seqs = sequences_from_fasta(self.fastafile, self.seqtype)
    else:
        # This is already a list of SeqRecords
        seqs = self.fastafile

    if not store_whole_genome and gindexer is not None:
        # the genome is loaded with a bed file,
        # only the specific subset is loaded
        # to keep the memory overhead low.
        # Otherwise the entire reference genome is loaded.
        rgen = OrderedDict(((seq.id, seq) for seq in seqs))
        subseqs = []
        for giv in gindexer:
            subseq = rgen[giv.chrom][
                max(giv.start, 0):min(giv.end, len(rgen[giv.chrom]))]
            if giv.start < 0:
                subseq = 'N' * (-giv.start) + subseq
            if len(subseq) < giv.length:
                subseq = subseq + 'N' * (giv.length - len(subseq))

            subseq.id = _iv_to_str(giv.chrom, giv.start, giv.end)
            subseq.name = subseq.id
            subseq.description = subseq.id
            subseqs.append(subseq)
        seqs = subseqs

    gsize = gindexer

    if store_whole_genome:
        gsize = OrderedDict(((seq.id, len(seq)) for seq in seqs))
        gsize = GenomicIndexer.create_from_genomesize(gsize)

    self.gsize_ = gsize
    self.seqs_ = seqs
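# Sketch of the 'N'-padding rule used by load_sequence above, with plain
# strings instead of SeqRecords (all values are made up for illustration):
#
#     seq = 'ACGTACGT'        # chromosome of length 8
#     start, end = -2, 10     # interval reaching past both chromosome ends
#     sub = seq[max(start, 0):min(end, len(seq))]    # 'ACGTACGT'
#     sub = 'N' * -start + sub                       # left-pad: 'NNACGTACGT'
#     sub = sub + 'N' * ((end - start) - len(sub))   # right-pad to length 12
#     assert sub == 'NNACGTACGTNN'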
def input_attribution(model, inputs,  # pylint: disable=too-many-locals
                      chrom=None, start=None, end=None):
    """Evaluates the integrated gradients method on the input coverage tracks.

    This allows to attribute feature importance values to the prediction
    scores. Integrated gradients have been introduced in
    Sundararajan, Taly and Yan, Axiomatic Attribution for Deep Networks.
    PMLR 70, 2017.

    The method is called by specifying the region of interest directly
    via chrom, start and end. Alternatively, it is possible to specify the
    region by index, e.g. the n-th region of the dataset.

    Parameters
    ----------
    model : Janggu
        Janggu model wrapper
    inputs : :code:`Dataset`, list(Dataset)
        Input Dataset.
    chrom : str or None
        Chromosome name.
    start : int or None
        Region start.
    end : int or None
        Region end.

    Examples
    --------

    .. code-block:: python

      # Suppose DATA is a Bioseq or Cover object
      # To query the input feature importance of a specific genomic region
      # use
      input_attribution(model, DATA, chrom='chr1', start=start, end=end)

    """
    output_chrom, output_start, output_end = chrom, start, end

    inputs = _to_list(inputs)

    # store original gindexer
    gindexers_save = [ip.gindexer for ip in inputs]

    # create new indexers ranging only over the selected region
    # if chrom, start, end was supplied retrieve the respective indices
    index_list = [gi.idx_by_region(include=output_chrom,
                                   start=output_start,
                                   end=output_end) for gi in gindexers_save]

    # first construct the union of indices
    index_set = set()
    for idx_list_el in index_list:
        index_set = index_set | set(idx_list_el)

    # only keep the indices that remain across all inputs;
    # indices that are only present in some of the inputs are discarded.
    for idx_list_el in index_list:
        index_set = index_set & set(idx_list_el)

    idxs = list(index_set)
    idxs.sort()

    subgindexers = [copy.copy(gi) for gi in gindexers_save]
    for subgi in subgindexers:
        subgi.chrs = [subgi.chrs[i] for i in idxs]
        subgi.starts = [subgi.starts[i] for i in idxs]
        subgi.ends = [subgi.ends[i] for i in idxs]
        subgi.strand = [subgi.strand[i] for i in idxs]

    # assign it to the input datasets temporarily
    for inp, _ in enumerate(inputs):
        inputs[inp].gindexer = subgindexers[inp]

    try:
        # allocate arrays
        output = [np.zeros((1, output_end - output_start,
                            inp.shape[-2], inp.shape[-1])) for inp in inputs]
        resols = [inp.garray.resolution for inp in inputs]

        for igi in range(len(inputs[0])):

            # current influence
            influence = [np.zeros((1,) + inp.shape[1:]) for inp in inputs]

            # get influence for current window with integrated gradient
            x_in = [inp[igi] for inp in inputs]
            for step in range(1, 51):
                grad = model._influence([x * step / 50 for x in x_in])
                for iinp, inp in enumerate(x_in):
                    for idim, _ in np.ndenumerate(inp):
                        influence[iinp][idim] += (x_in[iinp][idim] / 50) * \
                            grad[iinp][idim]

            # scale length to nucleotide resolution
            influence = [np.repeat(influence[i], resols[i], axis=1)
                         for i, _ in enumerate(inputs)]

            for iout in range(len(output)):
                if influence[iout].shape[1] < inputs[iout].gindexer[igi].length:
                    order = inputs[iout].gindexer[igi].length - \
                        influence[iout].shape[1]
                else:
                    order = 0

                # incrementally add the influence results into the output
                # array for all subwindows in the genomic indexer
                if output_start < inputs[iout].gindexer[igi].start:
                    ostart = inputs[iout].gindexer[igi].start - output_start
                    lstart = 0
                else:
                    ostart = 0
                    lstart = output_start - inputs[iout].gindexer[igi].start

                if output_end > inputs[iout].gindexer[igi].end:
                    oend = inputs[iout].gindexer[igi].end - output_start
                    lend = inputs[iout].gindexer[igi].end - \
                        inputs[iout].gindexer[igi].start
                else:
                    oend = output_end - output_start
                    lend = output_end - inputs[iout].gindexer[igi].start

                # for mutually overlapping positions, we employ a heuristic
                # that keeps the maximum influence over the overlapping
                # intervals spanning the position
                m = np.zeros((2,) +
                             (1, inputs[iout].gindexer[igi].length, ) +
                             influence[iout].shape[2:],
                             dtype=influence[iout].dtype)

                m[0][:, lstart:lend, :, :] = output[iout][:, ostart:oend, :, :]
                m[1][:, lstart:(lend - order), :, :] = \
                    influence[iout][:, lstart:(lend - order), :, :]
                m = np.abs(m).max(axis=0)
                m = m[:, lstart:lend, :, :]
                output[iout][:, ostart:oend, :, :] = m

        for iout in range(len(output)):
            # finally wrap the output up as coverage track
            output[iout] = Cover.create_from_array(
                'attr_' + inputs[iout].name,
                output[iout],
                GenomicIndexer.create_from_region(
                    chrom, start, end, '.',
                    binsize=end - start,
                    stepsize=1, flank=0),
                conditions=inputs[iout].conditions)

        for inp, _ in enumerate(inputs):
            # restore the initial genomic indexers
            inputs[inp].gindexer = gindexers_save[inp]

    except Exception:  # pragma: no cover
        model.logger.exception('_influence failed:')
        raise

    return output
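# The inner loop of input_attribution approximates integrated gradients with
# a 50-step Riemann sum along the straight path from the zero baseline to the
# input. A stand-alone sketch of the same estimator, where `grad_fn` is a
# hypothetical stand-in for `model._influence` applied to a single input:
#
#     def integrated_gradients(grad_fn, x, steps=50):
#         total = np.zeros_like(x)
#         for step in range(1, steps + 1):
#             total += grad_fn(x * step / steps)
#         return total * x / steps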
def create_from_seq(cls, name,  # pylint: disable=too-many-locals
                    fastafile,
                    storage='ndarray',
                    seqtype='dna',
                    order=1,
                    fixedlen=None,
                    datatags=None,
                    cache=False,
                    channel_last=True,
                    overwrite=False):
    """Create a Bioseq class from biological sequences.

    This constructor loads a set of nucleotide or amino acid sequences.
    By default, the sequences are assumed to be of equal length.
    Alternatively, sequences can be truncated and padded to a fixed length.

    Parameters
    ----------
    name : str
        Name of the dataset
    fastafile : str or list(str) or list(Bio.SeqRecord)
        Fasta file or list of fasta files from which the sequences
        are loaded or a list of Bio.SeqRecord.SeqRecord.
    seqtype : str
        Indicates whether a nucleotide or peptide sequence is loaded
        using 'dna' or 'protein' respectively. Default: 'dna'.
    order : int
        Order for the one-hot representation. Default: 1.
    fixedlen : int or None
        Forces the sequences to be of equal length by truncation or
        zero-padding. If set to None, it will be assumed that the
        sequences are already of equal length. An exception is raised
        if this is not the case. Default: None.
    storage : str
        Storage mode for storing the sequence may be 'ndarray' or 'hdf5'.
        Default: 'ndarray'.
    datatags : list(str) or None
        List of datatags. Together with the dataset name,
        the datatags are used to construct a cache file.
        If :code:`cache=False`, this option does not have an effect.
        Default: None.
    cache : boolean
        Indicates whether to cache the dataset. Default: False.
    channel_last : boolean
        Indicates whether the channel axis should be the last dimension.
        Default: True.
    overwrite : boolean
        Overwrite the cachefiles. Default: False.
    """
    if storage not in ['ndarray', 'hdf5']:
        raise ValueError(
            'Available storage options for Bioseq are: ndarray or hdf5')

    seqs = []
    if isinstance(fastafile, str):
        fastafile = [fastafile]

    if not isinstance(fastafile[0], Bio.SeqRecord.SeqRecord):
        for fasta in fastafile:
            # += is necessary since sequences_from_fasta
            # returns a list
            seqs += sequences_from_fasta(fasta, seqtype)
    else:
        # This is already a list of SeqRecords
        seqs = fastafile

    if fixedlen is not None:
        seqs = sequence_padding(seqs, fixedlen)

    # Check if sequences are equally long
    lens = [len(seq) for seq in seqs]
    assert lens == [len(seqs[0])] * len(seqs), "Input sequences must " + \
        "be of equal length."

    # Chromnames are required to be unique
    chroms = [seq.id for seq in seqs]
    assert len(set(chroms)) == len(seqs), "Sequence IDs must be unique."

    # now mimic a dataframe representing a bed file
    reglen = lens[0]
    flank = 0
    stepsize = 1

    gindexer = GenomicIndexer(reglen, stepsize, flank, zero_padding=False)
    for chrom in chroms:
        gindexer.add_interval(chrom, 0, reglen, '.')

    garray = cls._make_genomic_array(name, gindexer, seqs, order, storage,
                                     cache=cache,
                                     datatags=datatags,
                                     overwrite=overwrite,
                                     store_whole_genome=False)

    return cls(name, garray, gindexer,
               alphabet=seqs[0].seq.alphabet.letters,
               channel_last=channel_last)
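# Usage sketch for create_from_seq, assuming 'sequences.fa' is a FASTA file
# (the file name is a placeholder):
#
#     dna = Bioseq.create_from_seq('dna', fastafile='sequences.fa')
#
#     # sequences of unequal length can be truncated/zero-padded to 200 bp:
#     dna = Bioseq.create_from_seq('dna', fastafile='sequences.fa',
#                                  fixedlen=200)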
def create_from_refgenome(cls, name, refgenome, roi=None,
                          binsize=None,
                          stepsize=None,
                          flank=0,
                          order=1,
                          storage='ndarray',
                          datatags=None,
                          cache=False,
                          overwrite=False,
                          channel_last=True,
                          random_state=None,
                          store_whole_genome=False):
    """Create a Bioseq class from a reference genome.

    This constructor loads nucleotide sequences from a reference genome.
    If a region of interest (ROI) is supplied, only the respective sequences
    are loaded, otherwise the entire genome is fetched.

    Parameters
    ----------
    name : str
        Name of the dataset
    refgenome : str or Bio.SeqRecord.SeqRecord
        Reference genome location pointing to a fasta file
        or a SeqRecord object from Biopython that contains the sequences.
    roi : str or None
        Bed-file defining the region of interest.
        If set to None, the sequence will be fetched from the entire genome
        and a genomic indexer must be attached later.
        Otherwise, the coverage is only determined for
        the region of interest.
    binsize : int or None
        Binsize in basepairs. For binsize=None, the binsize will be
        determined from the bed-file directly which requires that all
        intervals in the bed-file are of equal length.
        Otherwise, the intervals in the bed-file will be split to
        subintervals of length binsize in conjunction with stepsize.
        Default: None.
    stepsize : int or None
        stepsize in basepairs for traversing the genome.
        If stepsize is None, it will be set equal to binsize.
        Default: None.
    flank : int
        Flanking region in basepairs to be extended up and downstream of
        each interval. Default: 0.
    order : int
        Order for the one-hot representation. Default: 1.
    storage : str
        Storage mode for storing the sequence may be 'ndarray' or 'hdf5'.
        Default: 'ndarray'.
    datatags : list(str) or None
        List of datatags. Together with the dataset name,
        the datatags are used to construct a cache file.
        If :code:`cache=False`, this option does not have an effect.
        Default: None.
    cache : boolean
        Indicates whether to cache the dataset. Default: False.
    overwrite : boolean
        Overwrite the cachefiles. Default: False.
    store_whole_genome : boolean
        Indicates whether the whole genome or only ROI
        should be loaded. If False, a bed-file with regions of interest
        must be specified. Default: False.
    random_state : None or int
        random_state used to internally randomize the dataset.
        This option is best used when consuming data for training
        from an HDF5 file. Since random data access from HDF5
        may be prohibitively slow, this option allows to randomize
        the dataset during loading.
        In case an integer-valued random_state seed is supplied,
        make sure that all training datasets (e.g. input and output
        datasets) use the same random_state value so that the datasets
        are synchronized. Default: None means that no randomization
        is used.
    """
    # fill up int8 rep of DNA
    # load bioseq, region index, and within region index
    if storage not in ['ndarray', 'hdf5']:
        raise ValueError(
            'Available storage options for Bioseq are: ndarray or hdf5')

    if roi is not None:
        gindexer = GenomicIndexer.create_from_file(
            roi, binsize, stepsize, flank, random_state=random_state)
    else:
        gindexer = None

    if not store_whole_genome and gindexer is None:
        raise ValueError(
            'Either roi must be supplied or store_whole_genome must be True')

    gsize = GenomicSizeLazyLoader(refgenome, 'dna',
                                  store_whole_genome, gindexer)

    garray = cls._make_genomic_array(
        name, gsize, [refgenome], order, storage,
        datatags=datatags,
        cache=cache,
        overwrite=overwrite,
        store_whole_genome=store_whole_genome,
        random_state=random_state)

    return cls(name, garray, gindexer,
               alphabet='ACGT',
               channel_last=channel_last)
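# Usage sketch for create_from_refgenome, assuming 'hg38.fa' and 'roi.bed'
# exist (both file names are placeholders):
#
#     dna = Bioseq.create_from_refgenome('dna',
#                                        refgenome='hg38.fa',
#                                        roi='roi.bed',
#                                        binsize=200,
#                                        order=2)
#
# With store_whole_genome=True, roi may be omitted, in which case a genomic
# indexer must be attached later.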
def create_from_refgenome(cls, name, refgenome, roi=None,
                          binsize=None,
                          stepsize=None,
                          flank=0,
                          order=1,
                          storage='ndarray',
                          datatags=None,
                          cache=False,
                          overwrite=False,
                          channel_last=True,
                          store_whole_genome=False):
    """Create a Bioseq class from a reference genome.

    This constructor loads nucleotide sequences from a reference genome.
    If a region of interest (ROI) is supplied, only the respective sequences
    are loaded, otherwise the entire genome is fetched.

    Parameters
    ----------
    name : str
        Name of the dataset
    refgenome : str
        Fasta file.
    roi : str or None
        Bed-file defining the region of interest.
        If set to None, the sequence will be fetched from the entire genome
        and a genomic indexer must be attached later.
        Otherwise, the coverage is only determined for
        the region of interest.
    binsize : int or None
        Binsize in basepairs. For binsize=None, the binsize will be
        determined from the bed-file directly which requires that all
        intervals in the bed-file are of equal length.
        Otherwise, the intervals in the bed-file will be split to
        subintervals of length binsize in conjunction with stepsize.
        Default: None.
    stepsize : int or None
        stepsize in basepairs for traversing the genome.
        If stepsize is None, it will be set equal to binsize.
        Default: None.
    flank : int
        Flanking region in basepairs to be extended up and downstream of
        each interval. Default: 0.
    order : int
        Order for the one-hot representation. Default: 1.
    storage : str
        Storage mode for storing the sequence may be 'ndarray', 'hdf5' or
        'sparse'. Default: 'ndarray'.
    datatags : list(str) or None
        List of datatags. Together with the dataset name,
        the datatags are used to construct a cache file.
        If :code:`cache=False`, this option does not have an effect.
        Default: None.
    cache : boolean
        Indicates whether to cache the dataset. Default: False.
    overwrite : boolean
        Overwrite the cachefiles. Default: False.
    store_whole_genome : boolean
        Indicates whether the whole genome or only ROI
        should be loaded. If False, a bed-file with regions of interest
        must be specified. Default: False.
    """
    # fill up int8 rep of DNA
    # load bioseq, region index, and within region index
    if roi is not None:
        gindexer = GenomicIndexer.create_from_file(roi, binsize,
                                                   stepsize, flank)
    else:
        gindexer = None

    if not store_whole_genome and gindexer is None:
        raise ValueError(
            'Either roi must be supplied or store_whole_genome must be True')

    if isinstance(refgenome, str):
        seqs = sequences_from_fasta(refgenome, 'dna')
    else:
        # This is already a list of SeqRecords
        seqs = refgenome

    if not store_whole_genome and gindexer is not None:
        # the genome is loaded with a bed file,
        # only the specific subset is loaded
        # to keep the memory overhead low.
        # Otherwise the entire reference genome is loaded.
        rgen = {seq.id: seq for seq in seqs}
        subseqs = []
        for giv in gindexer:
            subseq = rgen[giv.chrom][giv.start:giv.end]
            subseq.id = _iv_to_str(giv.chrom, giv.start,
                                   giv.end - order + 1)
            subseq.name = subseq.id
            subseq.description = subseq.id
            subseqs.append(subseq)
        seqs = subseqs

    garray = cls._make_genomic_array(name, seqs, order, storage,
                                     datatags=datatags,
                                     cache=cache,
                                     overwrite=overwrite,
                                     store_whole_genome=store_whole_genome)

    return cls(name, garray, gindexer,
               alphabetsize=len(seqs[0].seq.alphabet.letters),
               channel_last=channel_last)
def create_from_bam(cls, name,  # pylint: disable=too-many-locals
                    bamfiles,
                    regions=None,
                    genomesize=None,
                    conditions=None,
                    min_mapq=None,
                    binsize=None,
                    stepsize=None,
                    flank=0,
                    resolution=1,
                    storage='ndarray',
                    dtype='int',
                    stranded=True,
                    overwrite=False,
                    pairedend='5prime',
                    template_extension=0,
                    aggregate=None,
                    datatags=None,
                    cache=False,
                    channel_last=True,
                    store_whole_genome=False):
    """Create a Cover class from a bam-file (or files).

    This constructor can be used to obtain coverage from BAM files.
    For single-end reads the read will be counted at the 5 prime end.
    Paired-end reads can be counted relative to the 5 prime ends of the
    reads (default) or with respect to the midpoint.

    Parameters
    ----------
    name : str
        Name of the dataset
    bamfiles : str or list
        bam-file or list of bam files.
    regions : str or None
        Bed-file defining the region of interest.
        If set to None, the coverage will be fetched from the entire
        genome and a genomic indexer must be attached later.
    genomesize : dict or None
        Dictionary containing the genome size.
        If `genomesize=None`, the genome size is determined from the
        bam header. If `store_whole_genome=False`, this option does
        not have an effect.
    conditions : list(str) or None
        List of conditions.
        If `conditions=None`, the conditions are obtained from
        the filenames (without the directories and file-ending).
    min_mapq : int
        Minimal mapping quality. Reads with lower mapping quality are
        filtered out. If None, all reads are used.
    binsize : int or None
        Binsize in basepairs. For binsize=None, the binsize will be
        determined from the bed-file directly which requires that all
        intervals in the bed-file are of equal length.
        Otherwise, the intervals in the bed-file will be split to
        subintervals of length binsize in conjunction with stepsize.
        Default: None.
    stepsize : int or None
        stepsize in basepairs for traversing the genome.
        If stepsize is None, it will be set equal to binsize.
        Default: None.
    flank : int
        Flanking size increases the interval size at both ends by
        flank base pairs. Default: 0
    resolution : int
        Resolution in base pairs divides the region of interest
        in windows of length resolution.
        This effectively reduces the storage for coverage data.
        The resolution must be selected such that min(stepsize, binsize)
        is a multiple of resolution. Default: 1.
    storage : str
        Storage mode for storing the coverage data can be
        'ndarray', 'hdf5' or 'sparse'. Default: 'ndarray'.
    dtype : str
        Typecode to be used for storing the data. Default: 'int'.
    stranded : boolean
        Indicates whether to extract stranded or unstranded coverage.
        For unstranded coverage, reads aligning to both strands will be
        aggregated.
    overwrite : boolean
        Overwrite cachefiles. Default: False.
    datatags : list(str) or None
        List of datatags. Together with the dataset name,
        the datatags are used to construct a cache file.
        If :code:`cache=False`, this option does not have an effect.
        Default: None.
    pairedend : str
        Indicates whether to count reads at the '5prime' end or at
        the 'midpoint' for paired-end reads. Default: '5prime'.
    template_extension : int
        Elongates intervals by template_extension which allows to properly
        count template mid-points whose reads lie outside of the interval.
        This option is only relevant for paired-end reads counted at the
        'midpoint' and if the coverage is not obtained from the
        whole genome, e.g. regions is not None.
    aggregate : callable or None
        Aggregation operation for loading genomic array. If None,
        the coverage amounts to the raw counts. Default: None
    cache : boolean
        Indicates whether to cache the dataset. Default: False.
    channel_last : boolean
        Indicates whether the condition axis should be the last dimension
        or the first. For example, tensorflow expects the channel at the
        last position. Default: True.
    store_whole_genome : boolean
        Indicates whether the whole genome or only selected regions
        should be loaded. If False, a bed-file with regions of
        interest must be specified. Default: False
    """
    if pysam is None:  # pragma: no cover
        raise Exception('pysam not available. '
                        '`create_from_bam` requires pysam to be installed.')

    if regions is not None:
        gindexer = GenomicIndexer.create_from_file(regions, binsize,
                                                   stepsize, flank)
    else:
        gindexer = None

    if isinstance(bamfiles, str):
        bamfiles = [bamfiles]

    if conditions is None:
        conditions = [os.path.splitext(os.path.basename(f))[0]
                      for f in bamfiles]

    if min_mapq is None:
        min_mapq = 0

    full_genome_index = store_whole_genome

    if not full_genome_index and not gindexer:
        raise ValueError(
            'Either regions must be supplied or store_whole_genome must be True')

    if not full_genome_index:
        # if whole genome should not be loaded
        gsize = {_iv_to_str(iv.chrom, iv.start, iv.end): iv.end - iv.start
                 for iv in gindexer}
    else:
        # otherwise the whole genome will be fetched, or at least
        # a set of full length chromosomes
        if genomesize is not None:
            # if a genome size has specifically been given, use it.
            gsize = genomesize.copy()
        else:
            header = pysam.AlignmentFile(bamfiles[0], 'r')  # pylint: disable=no-member
            gsize = {}
            for chrom, length in zip(header.references, header.lengths):
                gsize[chrom] = length

    def _bam_loader(garray, files):
        print("load from bam")
        for i, sample_file in enumerate(files):
            print('Counting from {}'.format(sample_file))
            aln_file = pysam.AlignmentFile(sample_file, 'rb')  # pylint: disable=no-member
            for chrom in gsize:

                array = np.zeros(
                    (get_chrom_length(gsize[chrom], resolution), 2),
                    dtype=dtype)

                locus = _str_to_iv(chrom,
                                   template_extension=template_extension)

                if len(locus) == 1:
                    locus = (locus[0], 0, gsize[chrom])
                # locus = (chr, start, end)
                # or locus = (chr, )

                for aln in aln_file.fetch(*locus):

                    if aln.is_unmapped:
                        continue

                    if aln.mapq < min_mapq:
                        continue

                    if aln.is_read2:
                        # only consider read1 so as not to double count
                        # fragments for paired end reads
                        # read2 will also be false for single end reads.
                        continue

                    if aln.is_paired:
                        # if paired end read, consider the midpoint
                        if not (aln.is_proper_pair and
                                aln.reference_name == aln.next_reference_name):
                            # only consider paired end reads if both mates
                            # are properly mapped and they map to the
                            # same reference_name
                            continue
                        # if the next reference start >= 0,
                        # the read is considered as a paired end read
                        # in this case we consider the mid point
                        if pairedend == 'midpoint':
                            pos = min(aln.reference_start,
                                      aln.next_reference_start) + \
                                abs(aln.template_length) // 2
                        else:
                            if aln.is_reverse:
                                # last position of the downstream read
                                pos = max(aln.reference_end,
                                          aln.next_reference_start +
                                          aln.query_length)
                            else:
                                # first position of the upstream read
                                pos = min(aln.reference_start,
                                          aln.next_reference_start)
                    else:
                        # here we consider single end reads
                        # whose 5 prime end is determined strand specifically
                        if aln.is_reverse:
                            pos = aln.reference_end
                        else:
                            pos = aln.reference_start

                    if not garray._full_genome_stored:
                        # if we get here, a region was given,
                        # otherwise, the entire chromosome is read.
                        pos -= locus[1] + template_extension

                        if pos < 0 or pos >= locus[2] - locus[1]:
                            # if the read 5 p end or mid point is outside
                            # of the region of interest, the read is
                            # discarded
                            continue

                    # divide the position by the resolution
                    pos //= resolution

                    # fill up the read strand specifically
                    if aln.is_reverse:
                        array[pos, 1] += 1
                    else:
                        array[pos, 0] += 1

                # apply the aggregation
                if aggregate is not None:
                    array = aggregate(array)

                if stranded:
                    lp = locus + ('+', )
                    garray[GenomicInterval(*lp), i] = array[:, 0]
                    lm = locus + ('-', )
                    garray[GenomicInterval(*lm), i] = array[:, 1]
                else:
                    # if unstranded, aggregate the reads from
                    # both strands
                    garray[GenomicInterval(*locus), i] = array.sum(axis=1)

        return garray

    datatags = [name] + datatags if datatags else [name]

    # At the moment, we treat the information contained
    # in each bam-file as unstranded
    cover = create_genomic_array(gsize, stranded=stranded,
                                 storage=storage, datatags=datatags,
                                 cache=cache,
                                 conditions=conditions,
                                 overwrite=overwrite,
                                 typecode=dtype,
                                 store_whole_genome=store_whole_genome,
                                 resolution=resolution,
                                 loader=_bam_loader,
                                 loader_args=(bamfiles, ))

    return cls(name, cover, gindexer,
               padding_value=0, dimmode='all',
               channel_last=channel_last)
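# Usage sketch for create_from_bam, assuming 'sample.bam' is an indexed BAM
# file and 'roi.bed' defines the regions of interest (both placeholders):
#
#     cover = Cover.create_from_bam('readcov',
#                                   bamfiles='sample.bam',
#                                   regions='roi.bed',
#                                   binsize=200,
#                                   stranded=False,
#                                   pairedend='midpoint')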
def create_from_bed(cls, name,  # pylint: disable=too-many-locals
                    bedfiles,
                    regions=None,
                    genomesize=None,
                    conditions=None,
                    binsize=None,
                    stepsize=None,
                    resolution=1,
                    flank=0,
                    storage='ndarray',
                    dtype='int',
                    dimmode='all',
                    mode='binary',
                    store_whole_genome=False,
                    overwrite=False,
                    channel_last=True,
                    datatags=None,
                    cache=False):
    """Create a Cover class from a bed-file (or files).

    Parameters
    ----------
    name : str
        Name of the dataset
    bedfiles : str or list
        bed-file or list of bed files.
    regions : str or None
        Bed-file defining the region of interest.
        If set to None a genomesize must be supplied and
        a genomic indexer must be attached later.
    genomesize : dict or None
        Dictionary containing the genome size to fetch the coverage from.
        If `genomesize=None`, the genome size is fetched from the
        region of interest.
    conditions : list(str) or None
        List of conditions.
        If `conditions=None`, the conditions are obtained from
        the filenames (without the directories and file-ending).
    binsize : int or None
        Binsize in basepairs. For binsize=None, the binsize will be
        determined from the bed-file directly which requires that all
        intervals in the bed-file are of equal length.
        Otherwise, the intervals in the bed-file will be split to
        subintervals of length binsize in conjunction with stepsize.
        Default: None.
    stepsize : int or None
        stepsize in basepairs for traversing the genome.
        If stepsize is None, it will be set equal to binsize.
        Default: None.
    resolution : int
        Resolution in base pairs divides the region of interest
        in windows of length resolution.
        This effectively reduces the storage for coverage data.
        The resolution must be selected such that min(stepsize, binsize)
        is a multiple of resolution. Default: 1.
    flank : int
        Flanking size increases the interval size at both ends by
        flank bins. Note that the bin size is defined by the resolution
        parameter. Default: 0.
    storage : str
        Storage mode for storing the coverage data can be
        'ndarray', 'hdf5' or 'sparse'. Default: 'ndarray'.
    dtype : str
        Typecode to define the datatype to be used for storage.
        Default: 'int'.
    dimmode : str
        Dimension mode can be 'first' or 'all'. If 'first', only
        the first element of size resolution is returned. Otherwise,
        all elements of size resolution spanning the interval are
        returned. Default: 'all'.
    mode : str
        Mode of the dataset may be 'binary', 'score' or 'categorical'.
        Default: 'binary'.
    overwrite : boolean
        Overwrite cachefiles. Default: False.
    datatags : list(str) or None
        List of datatags. Together with the dataset name,
        the datatags are used to construct a cache file.
        If :code:`cache=False`, this option does not have an effect.
        Default: None.
    store_whole_genome : boolean
        Indicates whether the whole genome or only selected regions
        should be loaded. If False, a bed-file with regions of
        interest must be specified. Default: False.
    channel_last : boolean
        Indicates whether the condition axis should be the last dimension
        or the first. For example, tensorflow expects the channel at the
        last position. Default: True.
    cache : boolean
        Indicates whether to cache the dataset. Default: False.
    """
    if regions is None and genomesize is None:
        raise ValueError('Either regions or genomesize must be specified.')

    if regions is not None:
        gindexer = GenomicIndexer.create_from_file(regions, binsize,
                                                   stepsize, flank)
    else:
        gindexer = None

    if not store_whole_genome:
        # if whole genome should not be loaded
        gsize = {_iv_to_str(iv.chrom, iv.start, iv.end): iv.end - iv.start
                 for iv in gindexer}
    else:
        # otherwise the whole genome will be fetched, or at least
        # a set of full length chromosomes
        if genomesize is not None:
            # if a genome size has specifically been given, use it.
            gsize = genomesize.copy()
        else:
            gsize = get_genome_size_from_regions(regions)

    if isinstance(bedfiles, str):
        bedfiles = [bedfiles]

    if mode == 'categorical':
        if len(bedfiles) > 1:
            raise ValueError('Only one bed-file is '
                             'allowed with mode=categorical')
        sample_file = bedfiles[0]
        regions_ = _get_genomic_reader(sample_file)

        max_class = 0
        for reg in regions_:
            if reg.score > max_class:
                max_class = reg.score
        if conditions is None:
            conditions = [str(i) for i in range(int(max_class + 1))]
    if conditions is None:
        conditions = [os.path.splitext(os.path.basename(f))[0]
                      for f in bedfiles]

    def _bed_loader(garray, bedfiles, genomesize, mode):
        print("load from bed")
        for i, sample_file in enumerate(bedfiles):
            regions_ = _get_genomic_reader(sample_file)

            for region in regions_:
                gidx = GenomicIndexer.create_from_region(
                    region.iv.chrom,
                    region.iv.start,
                    region.iv.end,
                    region.iv.strand,
                    binsize, stepsize, flank)
                for greg in gidx:

                    if region.score is None and mode in ['score',
                                                         'categorical']:
                        raise ValueError(
                            'No score available. The score field must be '
                            'present in {} '.format(sample_file) +
                            'for mode="{}"'.format(mode))
                    # if region score is not defined, take the mere
                    # presence of a range as positive label.
                    if mode == 'score':
                        garray[greg, i] = np.dtype(dtype).type(region.score)
                    elif mode == 'categorical':
                        garray[greg, int(region.score)] = \
                            np.dtype(dtype).type(1)
                    elif mode == 'binary':
                        garray[greg, i] = np.dtype(dtype).type(1)
        return garray

    # At the moment, we treat the information contained
    # in each bed-file as unstranded

    datatags = [name] + datatags if datatags else [name]
    datatags += ['resolution{}'.format(resolution)]

    cover = create_genomic_array(gsize, stranded=False,
                                 storage=storage, datatags=datatags,
                                 cache=cache,
                                 conditions=conditions,
                                 resolution=resolution,
                                 overwrite=overwrite,
                                 typecode=dtype,
                                 store_whole_genome=store_whole_genome,
                                 loader=_bed_loader,
                                 loader_args=(bedfiles, gsize, mode))

    return cls(name, cover, gindexer,
               padding_value=0, dimmode=dimmode,
               channel_last=channel_last)
def create_from_bigwig(cls, name,  # pylint: disable=too-many-locals
                       bigwigfiles,
                       regions=None,
                       genomesize=None,
                       conditions=None,
                       binsize=None,
                       stepsize=None,
                       resolution=1,
                       flank=0,
                       storage='ndarray',
                       dtype='float32',
                       overwrite=False,
                       dimmode='all',
                       aggregate=np.mean,
                       datatags=None,
                       cache=False,
                       store_whole_genome=False,
                       channel_last=True,
                       nan_to_num=True):
    """Create a Cover class from a bigwig-file (or files).

    Parameters
    ----------
    name : str
        Name of the dataset
    bigwigfiles : str or list
        bigwig-file or list of bigwig files.
    regions : str or None
        Bed-file defining the region of interest.
        If set to None, the coverage will be fetched from the entire
        genome and a genomic indexer must be attached later.
        Otherwise, the coverage is only determined for the region of
        interest.
    genomesize : dict or None
        Dictionary containing the genome size.
        If `genomesize=None`, the genome size is determined from the
        bigwig file. If `store_whole_genome=False`, this option does
        not have an effect.
    conditions : list(str) or None
        List of conditions.
        If `conditions=None`, the conditions are obtained from
        the filenames (without the directories and file-ending).
    binsize : int or None
        Binsize in basepairs. For binsize=None, the binsize will be
        determined from the bed-file directly which requires that all
        intervals in the bed-file are of equal length.
        Otherwise, the intervals in the bed-file will be split to
        subintervals of length binsize in conjunction with stepsize.
        Default: None.
    stepsize : int or None
        stepsize in basepairs for traversing the genome.
        If stepsize is None, it will be set equal to binsize.
        Default: None.
    resolution : int
        Resolution in base pairs divides the region of interest
        in windows of length resolution.
        This effectively reduces the storage for coverage data.
        The resolution must be selected such that min(stepsize, binsize)
        is a multiple of resolution. Default: 1.
    flank : int
        Flanking size increases the interval size at both ends by
        flank bins. Note that the bin size is defined by the resolution
        parameter. Default: 0.
    storage : str
        Storage mode for storing the coverage data can be
        'ndarray', 'hdf5' or 'sparse'. Default: 'ndarray'.
    dtype : str
        Typecode to define the datatype to be used for storage.
        Default: 'float32'.
    dimmode : str
        Dimension mode can be 'first' or 'all'. If 'first', only
        the first element of size resolution is returned. Otherwise,
        all elements of size resolution spanning the interval are
        returned. Default: 'all'.
    overwrite : boolean
        Overwrite cachefiles. Default: False.
    datatags : list(str) or None
        List of datatags. Together with the dataset name,
        the datatags are used to construct a cache file.
        If :code:`cache=False`, this option does not have an effect.
        Default: None.
    aggregate : callable
        Aggregation operation for loading genomic array.
        Default: numpy.mean
    cache : boolean
        Indicates whether to cache the dataset. Default: False.
    store_whole_genome : boolean
        Indicates whether the whole genome or only selected regions
        should be loaded. If False, a bed-file with regions of
        interest must be specified. Default: False.
    channel_last : boolean
        Indicates whether the condition axis should be the last dimension
        or the first. For example, tensorflow expects the channel at the
        last position. Default: True.
    nan_to_num : boolean
        Indicates whether NaN values contained in the bigwig files should
        be interpreted as zeros. Default: True
    """
    if pyBigWig is None:  # pragma: no cover
        raise Exception('pyBigWig not available. '
                        '`create_from_bigwig` requires pyBigWig to be installed.')

    if regions is not None:
        gindexer = GenomicIndexer.create_from_file(regions, binsize,
                                                   stepsize, flank)
    else:
        gindexer = None

    if isinstance(bigwigfiles, str):
        bigwigfiles = [bigwigfiles]

    if not store_whole_genome and not gindexer:
        raise ValueError(
            'Either regions must be supplied or store_whole_genome must be True')

    if not store_whole_genome:
        # if whole genome should not be loaded
        gsize = {_iv_to_str(iv.chrom, iv.start, iv.end): iv.end - iv.start
                 for iv in gindexer}
    else:
        # otherwise the whole genome will be fetched, or at least
        # a set of full length chromosomes
        if genomesize is not None:
            # if a genome size has specifically been given, use it.
            gsize = genomesize.copy()
        else:
            bwfile = pyBigWig.open(bigwigfiles[0], 'r')
            gsize = bwfile.chroms()

    if conditions is None:
        conditions = [os.path.splitext(os.path.basename(f))[0]
                      for f in bigwigfiles]

    def _bigwig_loader(garray, aggregate):
        print("load from bigwig")
        for i, sample_file in enumerate(bigwigfiles):
            bwfile = pyBigWig.open(sample_file)

            for chrom in gsize:

                vals = np.zeros(
                    (get_chrom_length(gsize[chrom], resolution), ),
                    dtype=dtype)

                locus = _str_to_iv(chrom, template_extension=0)
                if len(locus) == 1:
                    locus = locus + (0, gsize[chrom])

                # when only to load parts of the genome
                for start in range(locus[1], locus[2], resolution):

                    if garray._full_genome_stored:
                        # be careful not to overshoot at the chromosome end
                        end = min(start + resolution, gsize[chrom])
                    else:
                        end = start + resolution

                    x = np.asarray(bwfile.values(locus[0],
                                                 int(start),
                                                 int(end)))
                    if nan_to_num:
                        x = np.nan_to_num(x, copy=False)
                    vals[(start - locus[1]) // resolution] = aggregate(x)

                garray[GenomicInterval(*locus), i] = vals
        return garray

    datatags = [name] + datatags if datatags else [name]
    datatags += ['resolution{}'.format(resolution)]

    cover = create_genomic_array(gsize, stranded=False,
                                 storage=storage, datatags=datatags,
                                 cache=cache,
                                 conditions=conditions,
                                 overwrite=overwrite,
                                 resolution=resolution,
                                 store_whole_genome=store_whole_genome,
                                 typecode=dtype,
                                 loader=_bigwig_loader,
                                 loader_args=(aggregate, ))

    return cls(name, cover, gindexer,
               padding_value=0, dimmode=dimmode,
               channel_last=channel_last)
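# Usage sketch for create_from_bigwig, assuming 'signal.bw' and 'roi.bed'
# exist (both file names are placeholders):
#
#     signal = Cover.create_from_bigwig('signal',
#                                       bigwigfiles='signal.bw',
#                                       regions='roi.bed',
#                                       binsize=200,
#                                       resolution=25,
#                                       aggregate=np.mean)
#
# Here each 25 bp window is summarized with `aggregate`, which reduces the
# stored array size by a factor of the resolution.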