def __getitem__(self, idxs):
    """Return the encoded sequence data selected by ``idxs``.

    Parameters
    ----------
    idxs : int, slice, iterable, GenomicInterval or tuple
        * ``int`` -- a single index; handled as a one-element list.
        * ``slice`` -- resolved against ``len(self)`` using standard
          Python slice semantics (open-ended, negative and out-of-range
          bounds are supported).
        * iterable -- forwarded unchanged to ``self.iseq4idx``.
        * ``GenomicInterval`` -- fetched directly with
          ``self._getsingleitem``.
        * tuple of length 3 or 4 -- converted with
          ``GenomicInterval(*idxs)``, i.e. ``(chr, start, end)`` or
          ``(chr, start, end, strand)``.

    Returns
    -------
    The result of ``as_onehot`` after every callable in
    ``self.transformations`` has been applied in order.  When
    ``self._channel_last`` is false the axes are permuted with
    ``(0, 3, 1, 2)``.

    Raises
    ------
    ValueError
        If a tuple does not have 3 or 4 elements, if a genomic interval
        is requested while ``self.garray._full_genome_stored`` is false,
        or if a slice with step 0 is supplied.
    IndexError
        If ``idxs`` is none of the supported types and is not iterable.
    """
    if isinstance(idxs, tuple):
        if len(idxs) not in (3, 4):
            raise ValueError(
                'idxs cannot be interpreted as genomic interval.'
                ' use (chr, start, end) or (chr, start, end, strand)')
        # interpret idxs as genomic interval
        idxs = GenomicInterval(*idxs)

    if isinstance(idxs, int):
        idxs = [idxs]
    elif isinstance(idxs, slice):
        # slice.indices() resolves None, negative and out-of-range bounds,
        # so e.g. obj[0:0] is empty and obj[-2:] addresses the last two items.
        idxs = range(*idxs.indices(len(self)))

    if isinstance(idxs, GenomicInterval):
        if not self.garray._full_genome_stored:
            raise ValueError('Indexing with GenomicInterval only possible '
                             'when the whole genome (or chromosome) was loaded')
        # accept a genomic interval directly
        indices = np.zeros((1, idxs.length - self.garray.order + 1))
        indices[0] = self._getsingleitem(idxs)
    else:
        try:
            iter(idxs)
        except TypeError:
            raise IndexError('Bioseq.__getitem__: index must be iterable')
        indices = self.iseq4idx(idxs)

    # Shared post-processing for both access paths.
    data = as_onehot(indices, self.garray.order, self._alphabetsize)

    for transform in self.transformations:
        data = transform(data)

    if not self._channel_last:
        data = np.transpose(data, (0, 3, 1, 2))

    return data
def __getitem__(self, idxs):
    """Return the ``as_onehot`` encoding of the data selected by ``idxs``.

    Parameters
    ----------
    idxs : int, slice, iterable, Interval or tuple
        * ``int`` -- a single index; handled as a one-element list.
        * ``slice`` -- resolved against ``len(self)`` using standard
          Python slice semantics (open-ended, negative and out-of-range
          bounds are supported).
        * iterable -- forwarded unchanged to ``self.iseq4idx``.
        * ``Interval`` -- fetched directly with ``self._getsingleitem``.
        * tuple of length 3 or 4 -- converted with ``Interval(*idxs)``,
          i.e. ``(chr, start, end)`` or ``(chr, start, end, strand)``.

    Returns
    -------
    The value returned by ``as_onehot`` for the selected indices.

    Raises
    ------
    ValueError
        If a tuple does not have 3 or 4 elements, if an interval is
        requested while ``self.garray._full_genome_stored`` is false,
        or if a slice with step 0 is supplied.
    IndexError
        If ``idxs`` is none of the supported types and is not iterable.
    """
    if isinstance(idxs, tuple):
        if len(idxs) not in (3, 4):
            raise ValueError(
                'Cannot interpret genomic interval.'
                ' Use (chr, start, end) or (chr, start, end, strand)')
        # interpret idxs as genomic interval
        idxs = Interval(*idxs)

    if isinstance(idxs, int):
        idxs = [idxs]
    elif isinstance(idxs, slice):
        # slice.indices() resolves None, negative and out-of-range bounds,
        # so e.g. obj[0:0] is empty and obj[-2:] addresses the last two items.
        idxs = range(*idxs.indices(len(self)))

    if isinstance(idxs, Interval):
        if not self.garray._full_genome_stored:
            raise ValueError('Indexing with Interval '
                             'requires store_whole_genome=True.')
        # accept a genomic interval directly
        indices = np.zeros((1, idxs.length - self.garray.order + 1))
        indices[0] = self._getsingleitem(idxs)
    else:
        try:
            iter(idxs)
        except TypeError:
            raise IndexError('Bioseq.__getitem__: index must be iterable')
        indices = self.iseq4idx(idxs)

    return as_onehot(indices, self.garray.order, self._alphabetsize)
def flow(self):
    """Data flow generator.

    Iterates over the records of ``self.variants`` and yields mini-batches
    of up to ``self.batch_size`` variants as the tuple
    ``(names, chroms, poss, rallele, aallele, refs, alts)``:

    * ``names`` -- ``rec.id`` per variant (``''`` when it is ``None``).
    * ``chroms`` -- ``rec.chrom`` per variant.
    * ``poss`` -- ``rec.pos - 1`` per variant.
    * ``rallele`` / ``aallele`` -- upper-cased ``rec.ref`` and first
      alternative allele.
    * ``refs`` / ``alts`` -- ``as_onehot`` encodings of the window of
      ``self.binsize`` positions around the variant, without and with
      the alternative allele substituted.

    Records rejected by ``self.is_compatible`` and records whose window
    would start before position 0 are skipped.  If the VCF reference
    allele disagrees with the stored sequence, this is logged and the
    alternative encoding is left equal to the reference encoding; the
    record is still part of the batch.

    Notes
    -----
    The ``refs`` and ``alts`` buffers are allocated once and overwritten
    for every batch; copy them if a batch must outlive the next iteration.
    Once the input is exhausted, a final batch with the remaining records
    (possibly none) is yielded with both arrays trimmed accordingly.
    """
    order = self.bioseq.garray.order
    alphabetsize = self.bioseq._alphabetsize

    refs = np.zeros((self.batch_size,
                     self.binsize - order + 1,
                     1,
                     pow(alphabetsize, order)))
    alts = np.zeros_like(refs)

    # Index (within the window) of the first k-mer overlapping the variant;
    # the remaining overlapping k-mers follow at consecutive positions.
    first_position = self.binsize // 2 - order + \
        (0 if self.binsize % 2 == 0 else 1)

    variant_file = VariantFile(self.variants)
    try:
        vcf = variant_file.fetch()

        while True:
            # construct genomic region
            names = []
            chroms = []
            poss = []
            rallele = []
            aallele = []

            ibatch = 0

            while ibatch < self.batch_size:
                rec = next(vcf)

                if not self.is_compatible(rec):
                    continue

                start = rec.pos - self.binsize // 2 + \
                    (1 if self.binsize % 2 == 0 else 0) - 1
                end = rec.pos + self.binsize // 2

                if start < 0:
                    continue

                ref_allele = rec.ref.upper()
                alt_allele = rec.alts[0].upper()

                names.append(rec.id if rec.id is not None else '')
                chroms.append(rec.chrom)
                poss.append(rec.pos - 1)
                rallele.append(ref_allele)
                aallele.append(alt_allele)

                iref = self.bioseq._getsingleitem(
                    Interval(rec.chrom, start, end)).copy()

                refs[ibatch] = as_onehot(iref[None, :], order, alphabetsize)

                # mutate the position with the variant
                # only support single nucleotide variants at the moment
                ref_code = NMAP[ref_allele]
                for o in range(order):
                    position = first_position + o
                    weight = pow(alphabetsize, o)

                    # o-th base-`alphabetsize` digit of the stored index,
                    # i.e. the nucleotide at the variant position.
                    irefbase = iref[position] // weight % alphabetsize

                    if ref_code != irefbase:
                        self.logger.info(
                            'VCF reference and reference genome not '
                            'compatible. '
                            'Expected reference {}, but VCF indicates {}. '
                            'VCF-Record: {}:{}-{}>{};{}. Skipped.'.format(
                                irefbase, ref_code,
                                rec.chrom, rec.pos, rec.ref,
                                rec.alts[0], rec.id))
                    else:
                        iref[position] += (NMAP[alt_allele] - ref_code) * weight

                alts[ibatch] = as_onehot(iref[None, :], order, alphabetsize)

                ibatch += 1

            yield names, chroms, poss, rallele, aallele, refs, alts

    except StopIteration:
        # input exhausted: emit what was collected for the last batch
        refs = refs[:ibatch]
        alts = alts[:ibatch]

        yield names, chroms, poss, rallele, aallele, refs, alts

    finally:
        # release the file handle even if the consumer stops early
        variant_file.close()
def flow(self):
    """Data flow generator.

    Iterates over the records of ``self.variants`` and yields mini-batches
    of up to ``self.batch_size`` variants as the tuple
    ``(names, chroms, poss, rallele, aallele, refs, alts)``:

    * ``names`` -- ``rec.id`` per variant (``''`` when it is ``None``).
    * ``chroms`` -- ``rec.chrom`` per variant.
    * ``poss`` -- ``rec.pos - 1`` per variant.
    * ``rallele`` / ``aallele`` -- upper-cased ``rec.ref`` and first
      alternative allele.
    * ``refs`` / ``alts`` -- ``as_onehot`` encodings of the window
      returned by ``self.get_interval(rec)``, carrying the reference and
      the alternative allele respectively.

    Records rejected by ``self.is_compatible`` are skipped.

    With ``self.ignore_reference_match`` set, the VCF reference and
    alternative alleles are written into the respective sequences
    regardless of what the stored sequence holds.  Otherwise the
    alternative allele is only applied when the VCF reference allele
    agrees with the stored sequence; a disagreement is logged and the
    alternative encoding stays equal to the reference encoding.

    If ``self.annotation`` is given and the record is associated with
    the ``'-'`` strand, both sequences are passed through
    ``self.bioseq._revcomp`` before encoding.

    Notes
    -----
    The ``refs`` and ``alts`` buffers are allocated once and overwritten
    for every batch; copy them if a batch must outlive the next iteration.
    Once the input is exhausted, a final batch with the remaining records
    (possibly none) is yielded with both arrays trimmed accordingly.
    """
    order = self.bioseq.garray.order
    alphabetsize = self.bioseq._alphabetsize

    refs = np.zeros((self.batch_size,
                     self.binsize - order + 1,
                     1,
                     pow(alphabetsize, order)))
    alts = np.zeros_like(refs)

    def _get_replacement(new_nucleotide, previous_nucleotide, o):
        # helper function for replacing old with new nucleotides
        return (new_nucleotide - previous_nucleotide) * pow(alphabetsize, o)

    # this is the first of the positions at which to change the nucleotide;
    # the others follow consecutively (one per order)
    first_position = self.binsize // 2 - order + \
        (0 if self.binsize % 2 == 0 else 1)

    # annotation is used to inform about the strandedness
    # to evaluate the variant
    if self.annotation is not None:
        varbed = BedTool(self.variants)
        n_vcf_fields = len(varbed[0].fields)
        # NOTE(review): consumed in lockstep with the VCF iterator below;
        # assumes the intersection yields exactly one row per VCF record,
        # in file order -- TODO confirm.
        vcf_strand_augment = iter(
            varbed.intersect(self.annotation, loj=True))

    # get variants
    variant_file = VariantFile(self.variants)
    try:
        vcf = variant_file.fetch()

        while True:
            # construct genomic region
            names = []
            chroms = []
            poss = []
            rallele = []
            aallele = []

            ibatch = 0

            # prepare mini-batches of variants
            while ibatch < self.batch_size:
                rec = next(vcf)

                rec_strandedness = '+'
                if self.annotation is not None:
                    rec_aug = next(vcf_strand_augment)
                    # '-' among the fields appended after the VCF columns
                    if '-' in rec_aug[n_vcf_fields:]:
                        rec_strandedness = '-'

                if not self.is_compatible(rec):
                    continue

                start, end = self.get_interval(rec)

                ref_allele = rec.ref.upper()
                alt_allele = rec.alts[0].upper()

                names.append(rec.id if rec.id is not None else '')
                chroms.append(rec.chrom)
                poss.append(rec.pos - 1)
                rallele.append(ref_allele)
                aallele.append(alt_allele)

                # obtain the nucleotide indices around the variant
                iref = self.bioseq._getsingleitem(
                    Interval(rec.chrom, start, end)).copy()
                ialt = iref.copy()

                # in the loop we adjust the original DNA sequence
                # by using the alternative allele instead
                #
                # the loop is required for the higher-order nucleotide
                # representation in which a single variant position affects
                # multiple mutually overlapping positions in the one-hot
                # encoding
                #
                # furthermore, the alternative allele is only set if
                # the reference allele matches with the reference genome,
                # unless the ignore_reference_match option was used.
                for o in range(order):
                    position_to_change = first_position + o

                    # determine the reference nucleotide
                    # this would be just irefbase itself for order=1
                    # but for higher-order representation it needs to
                    # be determined. e.g. TT for order=2 would be
                    # irefbase==15 which should give the nucleotides 3, 3
                    irefbase = iref[position_to_change]
                    irefbase = irefbase // pow(alphabetsize, o)
                    irefbase = irefbase % alphabetsize

                    if self.ignore_reference_match:
                        # process the variant even if
                        # it does not match with the reference base:
                        # replace nucleotides in the reference
                        # and in the alternative allele
                        iref[position_to_change] += _get_replacement(
                            NMAP[ref_allele], irefbase, o)
                        ialt[position_to_change] += _get_replacement(
                            NMAP[alt_allele], irefbase, o)
                        continue

                    if NMAP[ref_allele] != irefbase:
                        self.logger.info(
                            'VCF reference and reference genome not '
                            'compatible. '
                            'Expected reference {}, but VCF indicates {}. '
                            'VCF-Record: {}:{}-{}>{};{}. Skipped.'.format(
                                irefbase, NMAP[ref_allele],
                                rec.chrom, rec.pos, rec.ref,
                                rec.alts[0], rec.id))
                    else:
                        # at this point, it is ensured that the VCF
                        # reference agrees with the reference genome.
                        # keep the reference as it is, only change
                        # the alternative allele
                        ialt[position_to_change] += _get_replacement(
                            NMAP[alt_allele], NMAP[ref_allele], o)

                # if the strandedness is negative (from the annotation)
                # the DNA sequences are reverse complemented
                if rec_strandedness == '-':
                    ialt = self.bioseq._revcomp(ialt)
                    iref = self.bioseq._revcomp(iref)

                alts[ibatch] = as_onehot(ialt[None, :], order, alphabetsize)
                refs[ibatch] = as_onehot(iref[None, :], order, alphabetsize)

                ibatch += 1

            yield names, chroms, poss, rallele, aallele, refs, alts

    except StopIteration:
        # input exhausted: emit what was collected for the last batch
        refs = refs[:ibatch]
        alts = alts[:ibatch]

        yield names, chroms, poss, rallele, aallele, refs, alts

    finally:
        # release the file handle even if the consumer stops early
        variant_file.close()
def flow(self):
    """Data flow generator.

    Iterates over the records of ``self.variants`` and yields mini-batches
    of up to ``self.batch_size`` variants as the tuple
    ``(names, chroms, poss, rallele, aallele, refs, alts)``:

    * ``names`` -- ``rec.id`` per variant (``''`` when it is ``None``).
    * ``chroms`` -- ``rec.chrom`` per variant.
    * ``poss`` -- ``rec.pos - 1`` per variant.
    * ``rallele`` / ``aallele`` -- upper-cased ``rec.ref`` and first
      alternative allele.
    * ``refs`` / ``alts`` -- ``as_onehot`` encodings of the window
      returned by ``self.get_interval(rec)``, carrying the reference and
      the alternative allele respectively.

    Records rejected by ``self.is_compatible`` are skipped.

    With ``self.ignore_reference_match`` set, the VCF reference and
    alternative alleles are written into the respective sequences
    regardless of what the stored sequence holds.  Otherwise the
    alternative allele is only applied when the VCF reference allele
    agrees with the stored sequence; a disagreement is logged and the
    alternative encoding stays equal to the reference encoding.

    If ``self.annotation`` is given and the record is associated with
    the ``'-'`` strand, both sequences are passed through
    ``self.bioseq._revcomp`` before encoding.

    Notes
    -----
    The ``refs`` and ``alts`` buffers are allocated once and overwritten
    for every batch; copy them if a batch must outlive the next iteration.
    Once the input is exhausted, a final batch with the remaining records
    (possibly none) is yielded with both arrays trimmed accordingly.
    """
    order = self.bioseq.garray.order
    alphabetsize = self.bioseq._alphabetsize

    refs = np.zeros((self.batch_size,
                     self.binsize - order + 1,
                     1,
                     pow(alphabetsize, order)))
    alts = np.zeros_like(refs)

    # First window position whose encoded value is affected by the variant;
    # one consecutive position is touched per order.
    first_position = self.binsize // 2 - order + \
        (0 if self.binsize % 2 == 0 else 1)

    # The annotation, if any, supplies the strand used for each variant.
    if self.annotation is not None:
        varbed = BedTool(self.variants)
        n_vcf_fields = len(varbed[0].fields)
        # NOTE(review): consumed in lockstep with the VCF iterator below;
        # assumes the intersection yields exactly one row per VCF record,
        # in file order -- TODO confirm.
        vcf_strand_augment = iter(
            varbed.intersect(self.annotation, loj=True))

    variant_file = VariantFile(self.variants)
    try:
        vcf = variant_file.fetch()

        while True:
            # construct genomic region
            names = []
            chroms = []
            poss = []
            rallele = []
            aallele = []

            ibatch = 0

            while ibatch < self.batch_size:
                rec = next(vcf)

                rec_strandedness = '+'
                if self.annotation is not None:
                    rec_aug = next(vcf_strand_augment)
                    # '-' among the fields appended after the VCF columns
                    if '-' in rec_aug[n_vcf_fields:]:
                        rec_strandedness = '-'

                if not self.is_compatible(rec):
                    continue

                start, end = self.get_interval(rec)

                ref_allele = rec.ref.upper()
                alt_allele = rec.alts[0].upper()

                names.append(rec.id if rec.id is not None else '')
                chroms.append(rec.chrom)
                poss.append(rec.pos - 1)
                rallele.append(ref_allele)
                aallele.append(alt_allele)

                iref = self.bioseq._getsingleitem(
                    Interval(rec.chrom, start, end)).copy()
                ialt = iref.copy()

                for o in range(order):
                    position = first_position + o
                    weight = pow(alphabetsize, o)

                    # o-th base-`alphabetsize` digit of the stored value
                    irefbase = iref[position] // weight % alphabetsize

                    if self.ignore_reference_match:
                        # process the variant even if
                        # it does not match with the reference base
                        iref[position] += \
                            (NMAP[ref_allele] - irefbase) * weight
                        ialt[position] += \
                            (NMAP[alt_allele] - irefbase) * weight
                        continue

                    if NMAP[ref_allele] != irefbase:
                        self.logger.info(
                            'VCF reference and reference genome not '
                            'compatible. '
                            'Expected reference {}, but VCF indicates {}. '
                            'VCF-Record: {}:{}-{}>{};{}. Skipped.'.format(
                                irefbase, NMAP[ref_allele],
                                rec.chrom, rec.pos, rec.ref,
                                rec.alts[0], rec.id))
                    else:
                        # at this point, it is ensured that the VCF
                        # reference agrees with the reference genome.
                        ialt[position] += \
                            (NMAP[alt_allele] - NMAP[ref_allele]) * weight

                if rec_strandedness == '-':
                    ialt = self.bioseq._revcomp(ialt)
                    iref = self.bioseq._revcomp(iref)

                alts[ibatch] = as_onehot(ialt[None, :], order, alphabetsize)
                refs[ibatch] = as_onehot(iref[None, :], order, alphabetsize)

                ibatch += 1

            yield names, chroms, poss, rallele, aallele, refs, alts

    except StopIteration:
        # input exhausted: emit what was collected for the last batch
        refs = refs[:ibatch]
        alts = alts[:ibatch]

        yield names, chroms, poss, rallele, aallele, refs, alts

    finally:
        # release the file handle even if the consumer stops early
        variant_file.close()