Exemple #1
0
    def __getitem__(self, idxs):
        """Return the one-hot encoded sequences selected by ``idxs``.

        Parameters
        ----------
        idxs : int, slice, iterable, GenomicInterval or tuple
            * ``int``: a single region index.
            * ``slice``: resolved against ``len(self)`` with the usual Python
              slicing rules (``None``, negative and out-of-range bounds).
            * iterable: passed on to ``self.iseq4idx`` as is.
            * ``GenomicInterval``: fetched directly through
              ``self._getsingleitem``; only permitted if
              ``self.garray._full_genome_stored`` is truthy.
            * ``tuple`` of length 3 or 4: converted via
              ``GenomicInterval(*idxs)``, i.e. ``(chr, start, end)`` or
              ``(chr, start, end, strand)``.

        Returns
        -------
        The result of ``as_onehot`` after every callable in
        ``self.transformations`` has been applied in order. If
        ``self._channel_last`` is falsy the axes are additionally permuted
        with ``np.transpose(data, (0, 3, 1, 2))``.

        Raises
        ------
        ValueError
            If a tuple does not have 3 or 4 elements, if a genomic interval
            is used although the full genome is not stored, or if a slice
            has a step of zero.
        IndexError
            If ``idxs`` is none of the above and is not iterable.
        """
        if isinstance(idxs, tuple):
            if len(idxs) not in (3, 4):
                raise ValueError('idxs cannot be interpreted as genomic interval.'
                                 ' use (chr, start, end) or (chr, start, end, strand)')
            # interpret idxs as genomic interval
            idxs = GenomicInterval(*idxs)

        if isinstance(idxs, GenomicInterval):
            # accept a genomic interval directly
            if not self.garray._full_genome_stored:
                raise ValueError('Indexing with GenomicInterval only possible '
                                 'when the whole genome (or chromosome) was loaded')

            # a batch holding exactly one sequence
            iseq = np.zeros((1, idxs.length - self.garray.order + 1))
            iseq[0] = self._getsingleitem(idxs)
        else:
            if isinstance(idxs, int):
                idxs = [idxs]
            elif isinstance(idxs, slice):
                # slice.indices() normalises None, negative and out-of-range
                # bounds (and negative steps) exactly like built-in sequences.
                idxs = range(*idxs.indices(len(self)))

            try:
                iter(idxs)
            except TypeError:
                raise IndexError('Bioseq.__getitem__: index must be iterable')

            iseq = self.iseq4idx(idxs)

        data = as_onehot(iseq, self.garray.order, self._alphabetsize)

        for transform in self.transformations:
            data = transform(data)

        if not self._channel_last:
            data = np.transpose(data, (0, 3, 1, 2))

        return data
Exemple #2
0
    def __getitem__(self, idxs):
        """Return the one-hot encoded sequences selected by ``idxs``.

        Parameters
        ----------
        idxs : int, slice, iterable, Interval or tuple
            * ``int``: a single region index.
            * ``slice``: resolved against ``len(self)`` with the usual Python
              slicing rules (``None``, negative and out-of-range bounds).
            * iterable: passed on to ``self.iseq4idx`` as is.
            * ``Interval``: fetched directly through ``self._getsingleitem``;
              only permitted if ``self.garray._full_genome_stored`` is truthy.
            * ``tuple`` of length 3 or 4: converted via ``Interval(*idxs)``,
              i.e. ``(chr, start, end)`` or ``(chr, start, end, strand)``.

        Returns
        -------
        The result of ``as_onehot`` for the selected sequence(s).

        Raises
        ------
        ValueError
            If a tuple does not have 3 or 4 elements, if an interval is used
            although the full genome is not stored, or if a slice has a step
            of zero.
        IndexError
            If ``idxs`` is none of the above and is not iterable.
        """
        if isinstance(idxs, tuple):
            if len(idxs) not in (3, 4):
                raise ValueError(
                    'Cannot interpret genomic interval.'
                    ' Use (chr, start, end) or (chr, start, end, strand)')
            # interpret idxs as genomic interval
            idxs = Interval(*idxs)

        if isinstance(idxs, Interval):
            # accept a genomic interval directly
            if not self.garray._full_genome_stored:
                raise ValueError('Indexing with Interval '
                                 'requires store_whole_genome=True.')

            # a batch holding exactly one sequence
            iseq = np.zeros((1, idxs.length - self.garray.order + 1))
            iseq[0] = self._getsingleitem(idxs)
            return as_onehot(iseq, self.garray.order, self._alphabetsize)

        if isinstance(idxs, int):
            idxs = [idxs]
        elif isinstance(idxs, slice):
            # slice.indices() normalises None, negative and out-of-range
            # bounds (and negative steps) exactly like built-in sequences.
            idxs = range(*idxs.indices(len(self)))

        try:
            iter(idxs)
        except TypeError:
            raise IndexError('Bioseq.__getitem__: index must be iterable')

        return as_onehot(self.iseq4idx(idxs), self.garray.order,
                         self._alphabetsize)
Exemple #3
0
    def flow(self):
        """Data flow generator.

        Iterates over the records of the VCF file ``self.variants`` and
        yields them in mini-batches of at most ``self.batch_size`` variants.
        Records rejected by ``self.is_compatible`` or whose window would
        start before position 0 are ignored.

        For every accepted record the ``self.binsize`` long window around the
        variant is fetched from ``self.bioseq`` and one-hot encoded twice:
        once as fetched (reference) and once with the alternative allele
        substituted. The substitution only happens if the VCF reference
        allele agrees with the fetched sequence; otherwise a message is
        logged and the alternative stays identical to the reference, while
        the record remains part of the batch.

        Yields
        ------
        tuple
            ``(names, chroms, poss, rallele, aallele, refs, alts)``. The first
            five items are lists with one entry per variant (record id or
            ``''``, chromosome, ``rec.pos - 1``, upper-cased reference allele,
            upper-cased first alternative allele); ``refs`` and ``alts`` are
            the one-hot encoded reference and alternative sequences.

        Notes
        -----
        ``refs`` and ``alts`` are allocated once and overwritten for every
        batch, so a consumer that keeps batches around has to copy them.
        Once the VCF is exhausted, one last batch with the remaining
        (possibly zero) variants is yielded.
        """
        order = self.bioseq.garray.order
        alphabetsize = self.bioseq._alphabetsize
        half = self.binsize // 2
        even_binsize = self.binsize % 2 == 0

        # index (within the fetched window) of the first k-mer that overlaps
        # the variant; the following order - 1 k-mers overlap it as well
        first_kmer = half - order + (0 if even_binsize else 1)

        refs = np.zeros(
            (self.batch_size, self.binsize - order + 1, 1,
             pow(alphabetsize, order)))
        alts = np.zeros_like(refs)

        # the context manager closes the VCF handle once the generator is
        # exhausted, closed or garbage collected
        with VariantFile(self.variants) as variantfile:
            vcf = variantfile.fetch()

            try:
                while True:
                    # construct genomic region
                    names = []
                    chroms = []
                    poss = []
                    rallele = []
                    aallele = []

                    ibatch = 0

                    while ibatch < self.batch_size:
                        # raises StopIteration when the VCF is exhausted,
                        # which is handled by the except clause below
                        rec = next(vcf)

                        if not self.is_compatible(rec):
                            continue

                        start = rec.pos - half + (1 if even_binsize else 0) - 1
                        end = rec.pos + half

                        if start < 0:
                            continue

                        ref_allele = rec.ref.upper()
                        alt_allele = rec.alts[0].upper()

                        names.append(rec.id if rec.id is not None else '')
                        chroms.append(rec.chrom)
                        poss.append(rec.pos - 1)
                        rallele.append(ref_allele)
                        aallele.append(alt_allele)

                        iref = self.bioseq._getsingleitem(
                            Interval(rec.chrom, start, end)).copy()

                        refs[ibatch] = as_onehot(iref[None, :], order,
                                                 alphabetsize)

                        # mutate the position with the variant
                        # only support single nucleotide variants at the moment
                        for o in range(order):
                            position = first_kmer + o
                            weight = pow(alphabetsize, o)

                            # extract the base-`alphabetsize` digit that
                            # belongs to the variant position in this k-mer
                            irefbase = (iref[position] // weight) % alphabetsize

                            if NMAP[ref_allele] != irefbase:
                                self.logger.info(
                                    'VCF reference and reference genome not compatible.'
                                    'Expected reference {}, but VCF indicates {}.'.
                                    format(irefbase, NMAP[ref_allele]) +
                                    'VCF-Record: {}:{}-{}>{};{}. Skipped.'.format(
                                        rec.chrom, rec.pos, rec.ref,
                                        rec.alts[0], rec.id))
                            else:
                                iref[position] += \
                                    (NMAP[alt_allele] - NMAP[ref_allele]) * weight

                        alts[ibatch] = as_onehot(iref[None, :], order,
                                                 alphabetsize)

                        ibatch += 1
                    yield names, chroms, poss, rallele, aallele, refs, alts

            except StopIteration:
                # emit the remaining, partially filled batch
                refs = refs[:ibatch]
                alts = alts[:ibatch]

                yield names, chroms, poss, rallele, aallele, refs, alts
Exemple #4
0
    def flow(self):
        """Data flow generator.

        Iterates over the records of the VCF file ``self.variants`` and
        yields them in mini-batches of at most ``self.batch_size`` variants.
        Records rejected by ``self.is_compatible`` are ignored.

        For every accepted record the window returned by
        ``self.get_interval`` is fetched from ``self.bioseq`` and one-hot
        encoded twice, as reference and as alternative sequence:

        * If ``self.ignore_reference_match`` is truthy, the variant position
          is overwritten with the VCF reference allele in the reference
          sequence and with the first alternative allele in the alternative
          sequence, whatever the fetched sequence contained.
        * Otherwise the alternative allele is only inserted if the VCF
          reference allele agrees with the fetched sequence. On disagreement
          a message is logged and the alternative stays identical to the
          reference, while the record remains part of the batch.

        If ``self.annotation`` is given and the row of the variant/annotation
        intersection contains a ``'-'`` field after the VCF columns, both
        sequences are passed through ``self.bioseq._revcomp``.

        Yields
        ------
        tuple
            ``(names, chroms, poss, rallele, aallele, refs, alts)``. The first
            five items are lists with one entry per variant (record id or
            ``''``, chromosome, ``rec.pos - 1``, upper-cased reference allele,
            upper-cased first alternative allele); ``refs`` and ``alts`` are
            the one-hot encoded reference and alternative sequences.

        Notes
        -----
        ``refs`` and ``alts`` are allocated once and overwritten for every
        batch, so a consumer that keeps batches around has to copy them.
        Once the VCF is exhausted, one last batch with the remaining
        (possibly zero) variants is yielded.
        """
        order = self.bioseq.garray.order
        alphabetsize = self.bioseq._alphabetsize

        # index (within the fetched window) of the first k-mer that overlaps
        # the variant; the following order - 1 k-mers overlap it as well
        first_kmer = self.binsize // 2 - order + \
            (0 if self.binsize % 2 == 0 else 1)

        refs = np.zeros(
            (self.batch_size, self.binsize - order + 1, 1,
             pow(alphabetsize, order)))
        alts = np.zeros_like(refs)

        def _get_replacement(new_nucleotide, previous_nucleotide, o):
            # helper function for replacing old with new nucleotides
            return (new_nucleotide - previous_nucleotide) * \
                   pow(alphabetsize, o)

        # get variants; the context manager closes the VCF handle once the
        # generator is exhausted, closed or garbage collected
        with VariantFile(self.variants) as variantfile:
            vcf = variantfile.fetch()

            # annotation is used to inform about the strandedness
            # to evaluate the variant
            if self.annotation is not None:
                varbed = BedTool(self.variants)
                n_vcf_fields = len(varbed[0].fields)
                vcf_strand_augment = iter(
                    varbed.intersect(self.annotation, loj=True))

            try:
                while True:
                    # construct genomic region
                    names = []
                    chroms = []
                    poss = []
                    rallele = []
                    aallele = []

                    ibatch = 0

                    # prepare mini-batches of variants
                    while ibatch < self.batch_size:
                        # raises StopIteration when the VCF is exhausted,
                        # which is handled by the except clause below
                        rec = next(vcf)
                        rec_strandedness = '+'
                        if self.annotation is not None:
                            # NOTE(review): both iterators are advanced in
                            # lockstep, which assumes that the intersection
                            # has exactly one row per VCF record in the same
                            # order -- TODO confirm for variants overlapping
                            # several annotation features.
                            rec_aug = next(vcf_strand_augment)
                            rec_strandedness = '-' if '-' in rec_aug[
                                n_vcf_fields:] else '+'

                        if not self.is_compatible(rec):
                            continue

                        start, end = self.get_interval(rec)

                        ref_allele = rec.ref.upper()
                        alt_allele = rec.alts[0].upper()

                        names.append(rec.id if rec.id is not None else '')
                        chroms.append(rec.chrom)
                        poss.append(rec.pos - 1)
                        rallele.append(ref_allele)
                        aallele.append(alt_allele)

                        # obtain the nucleotide indices around the variant
                        iref = self.bioseq._getsingleitem(
                            Interval(rec.chrom, start, end)).copy()
                        ialt = iref.copy()

                        for o in range(order):
                            # in the loop we adjust the original DNA sequence
                            # by using the alternative allele instead
                            #
                            # the loop is required for the higher-order
                            # nucleotide representation in which a single
                            # variant position affects multiple mutually
                            # overlapping positions in the one-hot encoding
                            #
                            # furthermore, the alternative allele is only set
                            # if the reference allele matches with the
                            # reference genome, unless the
                            # ignore_reference_match option was used.

                            # this is the position at which to change the
                            # nucleotide
                            position_to_change = first_kmer + o

                            # determine the reference nucleotide
                            # this would be just irefbase itself for order=1
                            # but for higher-order representation it needs to
                            # be determined. e.g. TT for order=2 would be
                            # irefbase==15 which should give the
                            # nucleotides 3, 3
                            irefbase = iref[position_to_change]
                            irefbase = irefbase // pow(alphabetsize, o)
                            irefbase = irefbase % alphabetsize

                            if self.ignore_reference_match:
                                # process the variant even if
                                # it does not match with the reference base

                                # replace nucleotides in the reference
                                # and in the alternative allele
                                iref[position_to_change] += _get_replacement(
                                    NMAP[ref_allele], irefbase, o)

                                ialt[position_to_change] += _get_replacement(
                                    NMAP[alt_allele], irefbase, o)
                            elif NMAP[ref_allele] != irefbase:
                                self.logger.info(
                                    'VCF reference and reference genome not compatible.'
                                    'Expected reference {}, but VCF indicates {}.'.
                                    format(irefbase, NMAP[ref_allele]) +
                                    'VCF-Record: {}:{}-{}>{};{}. Skipped.'.format(
                                        rec.chrom, rec.pos, rec.ref,
                                        rec.alts[0], rec.id))
                            else:
                                # at this point, it is ensured that the VCF
                                # reference agrees with the reference genome.
                                # keep the reference as it is, only change
                                # the alternative allele
                                ialt[position_to_change] += _get_replacement(
                                    NMAP[alt_allele], NMAP[ref_allele], o)

                        # if the strandedness is negative (from the
                        # annotation) the DNA sequences are reverse
                        # complemented
                        if rec_strandedness == '-':
                            ialt = self.bioseq._revcomp(ialt)
                            iref = self.bioseq._revcomp(iref)

                        alts[ibatch] = as_onehot(ialt[None, :], order,
                                                 alphabetsize)
                        refs[ibatch] = as_onehot(iref[None, :], order,
                                                 alphabetsize)

                        ibatch += 1
                    yield names, chroms, poss, rallele, aallele, refs, alts

            except StopIteration:
                # emit the remaining, partially filled batch
                refs = refs[:ibatch]
                alts = alts[:ibatch]

                yield names, chroms, poss, rallele, aallele, refs, alts
Exemple #5
0
    def flow(self):
        """Data flow generator.

        Iterates over the records of the VCF file ``self.variants`` and
        yields them in mini-batches of at most ``self.batch_size`` variants.
        Records rejected by ``self.is_compatible`` are ignored.

        For every accepted record the window returned by
        ``self.get_interval`` is fetched from ``self.bioseq`` and one-hot
        encoded twice, as reference and as alternative sequence:

        * If ``self.ignore_reference_match`` is truthy, the variant position
          is overwritten with the VCF reference allele in the reference
          sequence and with the first alternative allele in the alternative
          sequence, whatever the fetched sequence contained.
        * Otherwise the alternative allele is only inserted if the VCF
          reference allele agrees with the fetched sequence. On disagreement
          a message is logged and the alternative stays identical to the
          reference, while the record remains part of the batch.

        If ``self.annotation`` is given and the row of the variant/annotation
        intersection contains a ``'-'`` field after the VCF columns, both
        sequences are passed through ``self.bioseq._revcomp``.

        Yields
        ------
        tuple
            ``(names, chroms, poss, rallele, aallele, refs, alts)``. The first
            five items are lists with one entry per variant (record id or
            ``''``, chromosome, ``rec.pos - 1``, upper-cased reference allele,
            upper-cased first alternative allele); ``refs`` and ``alts`` are
            the one-hot encoded reference and alternative sequences.

        Notes
        -----
        ``refs`` and ``alts`` are allocated once and overwritten for every
        batch, so a consumer that keeps batches around has to copy them.
        Once the VCF is exhausted, one last batch with the remaining
        (possibly zero) variants is yielded.
        """
        order = self.bioseq.garray.order
        alphabetsize = self.bioseq._alphabetsize

        # index (within the fetched window) of the first k-mer that overlaps
        # the variant; the following order - 1 k-mers overlap it as well
        first_kmer = self.binsize // 2 - order + \
            (0 if self.binsize % 2 == 0 else 1)

        refs = np.zeros(
            (self.batch_size, self.binsize - order + 1, 1,
             pow(alphabetsize, order)))
        alts = np.zeros_like(refs)

        # the context manager closes the VCF handle once the generator is
        # exhausted, closed or garbage collected
        with VariantFile(self.variants) as variantfile:
            vcf = variantfile.fetch()

            # the annotation provides the strand on which to evaluate
            # each variant
            if self.annotation is not None:
                varbed = BedTool(self.variants)
                n_vcf_fields = len(varbed[0].fields)
                vcf_strand_augment = iter(
                    varbed.intersect(self.annotation, loj=True))

            try:
                while True:
                    # construct genomic region
                    names = []
                    chroms = []
                    poss = []
                    rallele = []
                    aallele = []

                    ibatch = 0

                    while ibatch < self.batch_size:
                        # raises StopIteration when the VCF is exhausted,
                        # which is handled by the except clause below
                        rec = next(vcf)
                        rec_strandedness = '+'
                        if self.annotation is not None:
                            # NOTE(review): both iterators are advanced in
                            # lockstep, which assumes that the intersection
                            # has exactly one row per VCF record in the same
                            # order -- TODO confirm for variants overlapping
                            # several annotation features.
                            rec_aug = next(vcf_strand_augment)
                            rec_strandedness = '-' if '-' in rec_aug[
                                n_vcf_fields:] else '+'

                        if not self.is_compatible(rec):
                            continue

                        start, end = self.get_interval(rec)

                        ref_allele = rec.ref.upper()
                        alt_allele = rec.alts[0].upper()

                        names.append(rec.id if rec.id is not None else '')
                        chroms.append(rec.chrom)
                        poss.append(rec.pos - 1)
                        rallele.append(ref_allele)
                        aallele.append(alt_allele)

                        iref = self.bioseq._getsingleitem(
                            Interval(rec.chrom, start, end)).copy()
                        ialt = iref.copy()

                        # a single variant position affects `order` mutually
                        # overlapping k-mers, each of which is adjusted in turn
                        for o in range(order):
                            position = first_kmer + o
                            weight = pow(alphabetsize, o)

                            # extract the base-`alphabetsize` digit that
                            # belongs to the variant position in this k-mer
                            irefbase = (iref[position] // weight) % alphabetsize

                            if self.ignore_reference_match:
                                # process the variant even if
                                # it does not match with the reference base
                                iref[position] += \
                                    (NMAP[ref_allele] - irefbase) * weight
                                ialt[position] += \
                                    (NMAP[alt_allele] - irefbase) * weight
                            elif NMAP[ref_allele] != irefbase:
                                self.logger.info(
                                    'VCF reference and reference genome not compatible.'
                                    'Expected reference {}, but VCF indicates {}.'.
                                    format(irefbase, NMAP[ref_allele]) +
                                    'VCF-Record: {}:{}-{}>{};{}. Skipped.'.format(
                                        rec.chrom, rec.pos, rec.ref,
                                        rec.alts[0], rec.id))
                            else:
                                # at this point, it is ensured that the VCF
                                # reference agrees with the reference genome.
                                ialt[position] += \
                                    (NMAP[alt_allele] - NMAP[ref_allele]) * weight

                        # variants annotated on the negative strand are
                        # evaluated on the reverse complement
                        if rec_strandedness == '-':
                            ialt = self.bioseq._revcomp(ialt)
                            iref = self.bioseq._revcomp(iref)

                        alts[ibatch] = as_onehot(ialt[None, :], order,
                                                 alphabetsize)
                        refs[ibatch] = as_onehot(iref[None, :], order,
                                                 alphabetsize)

                        ibatch += 1
                    yield names, chroms, poss, rallele, aallele, refs, alts

            except StopIteration:
                # emit the remaining, partially filled batch
                refs = refs[:ibatch]
                alts = alts[:ibatch]

                yield names, chroms, poss, rallele, aallele, refs, alts