Example #1
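The examples below assume imports along the following lines; the janggu module paths are assumptions inferred from the names used in the code:

import os

import Bio
import numpy as np
import pkg_resources
import pytest

from janggu.data import Bioseq                  # assumed module path
from janggu.utils import sequences_from_fasta   # assumed module path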
def test_read_dna_from_biostring_order_1():

    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1
    filename = os.path.join(data_path, 'sample.fa')
    seqs = sequences_from_fasta(filename)
    # 'sparse' storage is not supported by create_from_seq, so this must raise
    with pytest.raises(ValueError):
        data = Bioseq.create_from_seq('train',
                                      fastafile=seqs,
                                      storage='sparse',
                                      order=order,
                                      cache=False)

    data = Bioseq.create_from_seq('train',
                                  fastafile=seqs,
                                  order=order,
                                  cache=False)

    np.testing.assert_equal(len(data), 3897)
    np.testing.assert_equal(data.shape, (3897, 200, 1, 4))
    np.testing.assert_equal(
        data[0][0, :10, 0, :],
        np.asarray([[0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0],
                    [0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0],
                    [1, 0, 0, 0], [0, 0, 1, 0]],
                   dtype='int8'))
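Assuming the one-hot columns follow the ACGT order of the DNA alphabet, the asserted block above corresponds to the first ten bases 'CACAGCAGAG'. A minimal, janggu-independent sketch of how such an (N, length, 1, 4) one-hot tensor is built:

import numpy as np

ALPHABET = 'ACGT'

def one_hot(seq):
    # One-hot encode a DNA string into shape (len(seq), 1, 4) with dtype int8.
    arr = np.zeros((len(seq), 1, len(ALPHABET)), dtype='int8')
    for i, base in enumerate(seq):
        arr[i, 0, ALPHABET.index(base)] = 1
    return arr

# Reproduces the ten rows asserted above.
print(one_hot('CACAGCAGAG')[:, 0, :])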
Example #2
def test_dna_loading_from_seqrecord(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 2
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    seqs = sequences_from_fasta(refgenome)

    data = Bioseq.create_from_refgenome('train',
                                        refgenome=seqs,
                                        roi=bed_merged,
                                        storage='ndarray',
                                        order=order)
Example #3
def test_dna_loading_from_seqrecord(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 2
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    seqs = sequences_from_fasta(refgenome)

    data = Bioseq.create_from_refgenome('train',
                                        refgenome=seqs,
                                        roi=bed_merged,
                                        storage='ndarray',
                                        store_whole_genome=True,
                                        order=order)

    # All three indexing variants below should return the same window
    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    chrom = data.gindexer[0].chrom
    start = data.gindexer[0].start
    end = data.gindexer[0].end
    np.testing.assert_equal(data[0], data[(chrom, start, end)])
    np.testing.assert_equal(data[0], data[chrom, start, end])
Example #4
    def load_sequence(self):
        print('loading from lazy loader')
        store_whole_genome = self.store_whole_genome
        gindexer = self.gindexer

        if isinstance(self.fastafile, str):
            seqs = sequences_from_fasta(self.fastafile, self.seqtype)
        else:
            # This is already a list of SeqRecords
            seqs = self.fastafile

        if not store_whole_genome and gindexer is not None:
            # the genome is loaded with a bed file,
            # only the specific subset is loaded
            # to keep the memory overhead low.
            # Otherwise the entire reference genome is loaded.
            rgen = OrderedDict(((seq.id, seq) for seq in seqs))
            subseqs = []
            for giv in gindexer:
                subseq = rgen[giv.chrom][
                    max(giv.start, 0):min(giv.end, len(rgen[giv.chrom]))]
                if giv.start < 0:
                    subseq = 'N' * (-giv.start) + subseq
                if len(subseq) < giv.length:
                    subseq = subseq + 'N' * (giv.length - len(subseq))
                subseq.id = _iv_to_str(giv.chrom, giv.start, giv.end)
                subseq.name = subseq.id
                subseq.description = subseq.id
                subseqs.append(subseq)
            seqs = subseqs
            gsize = gindexer

        if store_whole_genome:
            gsize = OrderedDict(((seq.id, len(seq)) for seq in seqs))
            gsize = GenomicIndexer.create_from_genomesize(gsize)

        self.gsize_ = gsize
        self.seqs_ = seqs
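The boundary handling in load_sequence pads intervals that reach beyond the chromosome ends with 'N'. A stand-alone sketch of the same arithmetic on a plain string (the chromosome sequence and coordinates are made up):

def fetch_padded(chrom_seq, start, end):
    # Extract chrom_seq[start:end], padding with 'N' wherever the interval
    # extends beyond the chromosome boundaries.
    length = end - start
    sub = chrom_seq[max(start, 0):min(end, len(chrom_seq))]
    if start < 0:
        sub = 'N' * (-start) + sub               # pad upstream overhang
    if len(sub) < length:
        sub = sub + 'N' * (length - len(sub))    # pad downstream overhang
    return sub

assert fetch_padded('ACGTACGT', -2, 4) == 'NNACGT'
assert fetch_padded('ACGTACGT', 5, 11) == 'CGTNNN'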
Example #5
    def create_from_seq(
            cls,
            name,  # pylint: disable=too-many-locals
            fastafile,
            storage='ndarray',
            seqtype='dna',
            order=1,
            fixedlen=None,
            datatags=None,
            cache=False,
            channel_last=True,
            overwrite=False):
        """Create a Bioseq class from a biological sequences.

        This constructor loads a set of nucleotide or amino acid sequences.
        By default, the sequence are assumed to be of equal length.
        Alternatively, sequences can be truncated and padded to a fixed length.


        Parameters
        -----------
        name : str
            Name of the dataset
        fastafile : str or list(str) or list(Bio.SeqRecord)
            Fasta file or list of fasta files from which the sequences
            are loaded or a list of Bio.SeqRecord.SeqRecord.
        seqtype : str
            Indicates whether a nucleotide or peptide sequence is loaded
            using 'dna' or 'protein' respectively. Default: 'dna'.
        order : int
            Order for the one-hot representation. Default: 1.
        fixedlen : int or None
            Forces the sequences to be of equal length by truncation or
            zero-padding. If set to None, it will be assumed that the sequences
            are already of equal length. An exception is raised if this is
            not the case. Default: None.
        storage : str
            Storage mode for storing the sequences; may be 'ndarray' or 'hdf5'.
            Default: 'ndarray'.
        datatags : list(str) or None
            List of datatags. Together with the dataset name,
            the datatags are used to construct a cache file.
            If :code:`cache=False`, this option does not have an effect.
            Default: None.
        cache : boolean
            Indicates whether to cache the dataset. Default: False.
        overwrite : boolean
            Overwrite the cachefiles. Default: False.
        """
        if storage not in ['ndarray', 'hdf5']:
            raise ValueError(
                'Available storage options for Bioseq are: ndarray or hdf5')

        seqs = []
        if isinstance(fastafile, str):
            fastafile = [fastafile]

        if not isinstance(fastafile[0], Bio.SeqRecord.SeqRecord):
            for fasta in fastafile:
                # += is necessary since sequences_from_fasta
                # returns a list
                seqs += sequences_from_fasta(fasta, seqtype)
        else:
            # This is already a list of SeqRecords
            seqs = fastafile

        if fixedlen is not None:
            seqs = sequence_padding(seqs, fixedlen)

        # Check if sequences are equally long
        lens = [len(seq) for seq in seqs]
        assert lens == [len(seqs[0])] * len(seqs), "Input sequences must " + \
            "be of equal length."

        # Chromosome names are required to be unique
        chroms = [seq.id for seq in seqs]
        assert len(set(chroms)) == len(seqs), "Sequence IDs must be unique."
        # now mimic a dataframe representing a bed file

        reglen = lens[0]
        flank = 0
        stepsize = 1

        gindexer = GenomicIndexer(reglen, stepsize, flank, zero_padding=False)
        for chrom in chroms:
            gindexer.add_interval(chrom, 0, reglen, '.')

        garray = cls._make_genomic_array(name,
                                         gindexer,
                                         seqs,
                                         order,
                                         storage,
                                         cache=cache,
                                         datatags=datatags,
                                         overwrite=overwrite,
                                         store_whole_genome=False)

        return cls(name,
                   garray,
                   gindexer,
                   alphabet=seqs[0].seq.alphabet.letters,
                   channel_last=channel_last)
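A hedged usage sketch for create_from_seq; the FASTA filename is a placeholder and only parameters documented above are used:

from janggu.data import Bioseq   # assumed module path

# Load peptide sequences and truncate/pad them to a common length of
# 50 residues so they can be stacked into one array.
peptides = Bioseq.create_from_seq('peptides',
                                  fastafile='peptides.fa',   # placeholder file
                                  seqtype='protein',
                                  order=1,
                                  fixedlen=50,
                                  storage='ndarray',
                                  cache=False)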
Example #6
    def create_from_refgenome(cls, name, refgenome, roi=None,
                              binsize=None,
                              stepsize=None,
                              flank=0, order=1,
                              storage='ndarray',
                              datatags=None,
                              cache=False,
                              overwrite=False,
                              channel_last=True,
                              store_whole_genome=False):
        """Create a Bioseq class from a reference genome.

        This constructor loads nucleotide sequences from a reference genome.
        If a region of interest (ROI) is supplied, only the respective sequences
        are loaded; otherwise, the entire genome is fetched.

        Parameters
        -----------
        name : str
            Name of the dataset
        refgenome : str or list(Bio.SeqRecord)
            Fasta file or list of Bio.SeqRecord.SeqRecord objects.
        roi : str or None
            Bed-file defining the regions of interest.
            If set to None, the sequences will be
            fetched from the entire genome and a
            genomic indexer must be attached later.
            Otherwise, sequences are only extracted
            for the regions of interest.
        binsize : int or None
            Binsize in basepairs. For binsize=None,
            the binsize will be determined from the bed-file directly
            which requires that all intervals in the bed-file are of equal
            length. Otherwise, the intervals in the bed-file will be
            split to subintervals of length binsize in conjunction with
            stepsize. Default: None.
        stepsize : int or None
            stepsize in basepairs for traversing the genome.
            If stepsize is None, it will be set equal to binsize.
            Default: None.
        flank : int
            Flanking region in base pairs by which each interval is extended upstream and downstream.
            Default: 0.
        order : int
            Order for the one-hot representation. Default: 1.
        storage : str
            Storage mode for storing the sequences; may be 'ndarray', 'hdf5' or
            'sparse'. Default: 'ndarray'.
        datatags : list(str) or None
            List of datatags. Together with the dataset name,
            the datatags are used to construct a cache file.
            If :code:`cache=False`, this option does not have an effect.
            Default: None.
        cache : boolean
            Indicates whether to cache the dataset. Default: False.
        overwrite : boolean
            Overwrite the cachefiles. Default: False.
        store_whole_genome : boolean
            Indicates whether the whole genome or only ROI
            should be loaded. If False, a bed-file with regions of interest
            must be specified. Default: False.
        """
        # fill up int8 rep of DNA
        # load bioseq, region index, and within region index


        if roi is not None:
            gindexer = GenomicIndexer.create_from_file(roi, binsize,
                                                       stepsize, flank)
        else:
            gindexer = None

        if not store_whole_genome and gindexer is None:
            raise ValueError('Either roi must be supplied or store_whole_genome must be True')

        if isinstance(refgenome, str):
            seqs = sequences_from_fasta(refgenome, 'dna')
        else:
            # This is already a list of SeqRecords
            seqs = refgenome

        if not store_whole_genome and gindexer is not None:
            # the genome is loaded with a bed file,
            # only the specific subset is loaded
            # to keep the memory overhead low.
            # Otherwise the entire reference genome is loaded.
            rgen = {seq.id: seq for seq in seqs}
            subseqs = []
            for giv in gindexer:
                subseq = rgen[giv.chrom][giv.start:(giv.end)]
                subseq.id = _iv_to_str(giv.chrom, giv.start, giv.end - order + 1)
                subseq.name = subseq.id
                subseq.description = subseq.id

                subseqs.append(subseq)
            seqs = subseqs

        garray = cls._make_genomic_array(name, seqs, order, storage,
                                         datatags=datatags,
                                         cache=cache,
                                         overwrite=overwrite,
                                         store_whole_genome=store_whole_genome)

        return cls(name, garray, gindexer,
                   alphabetsize=len(seqs[0].seq.alphabet.letters),
                   channel_last=channel_last)
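A hedged usage sketch for create_from_refgenome; the file names are placeholders and only parameters documented above are used:

from janggu.data import Bioseq   # assumed module path

# One-hot encode 200 bp windows (extended by 50 bp flanks) over the regions
# of interest as dinucleotides (order=2), keeping only the ROI in memory.
dna = Bioseq.create_from_refgenome('dna',
                                   refgenome='hg38.fa',   # placeholder FASTA
                                   roi='peaks.bed',       # placeholder BED
                                   binsize=200,
                                   stepsize=200,
                                   flank=50,
                                   order=2,
                                   storage='ndarray',
                                   store_whole_genome=False)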
Example #7
    def _make_genomic_array(name,
                            fastafile,
                            order,
                            storage,
                            seqtype,
                            cache=True,
                            datatags=None,
                            overwrite=False,
                            store_whole_genome=True):
        """Create a genomic array or reload an existing one."""

        # always use int16 to store bioseq indices
        # do not use int8 at the moment, because 'N' is encoded
        # as -1024, which causes an underflow with int8.
        dtype = 'int16'

        # Load sequences from refgenome
        seqs = []
        if isinstance(fastafile, str):
            fastafile = [fastafile]

        if not isinstance(fastafile[0], Bio.SeqRecord.SeqRecord):
            for fasta in fastafile:
                # += is necessary since sequences_from_fasta
                # returns a list
                seqs += sequences_from_fasta(fasta, seqtype)
        else:
            # This is already a list of SeqRecords
            seqs = fastafile

        # Extract chromosome lengths
        chromlens = {}

        for seq in seqs:
            chromlens[seq.id] = len(seq) - order + 1

        def _seq_loader(cover, seqs, order):
            print('Convert sequences to index array')
            for seq in seqs:
                if cover._full_genome_stored:
                    interval = GenomicInterval(seq.id, 0,
                                               len(seq) - order + 1, '.')
                else:
                    interval = GenomicInterval(
                        *_str_to_iv(seq.id, template_extension=0))

                indarray = np.asarray(seq2ind(seq), dtype=dtype)

                if order > 1:
                    # for higher order motifs, this part is used
                    filter_ = np.asarray([
                        pow(len(seq.seq.alphabet.letters), i)
                        for i in range(order)
                    ])
                    indarray = np.convolve(indarray, filter_, mode='valid')

                cover[interval, 0] = indarray

        # At the moment, the sequence information
        # is treated as unstranded
        datatags = [name] + datatags if datatags else [name]
        datatags += ['order{}'.format(order)]

        cover = create_genomic_array(chromlens,
                                     stranded=False,
                                     storage=storage,
                                     datatags=datatags,
                                     cache=cache,
                                     store_whole_genome=store_whole_genome,
                                     order=order,
                                     conditions=['idx'],
                                     overwrite=overwrite,
                                     typecode=dtype,
                                     loader=_seq_loader,
                                     loader_args=(seqs, order))

        return cover
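The order > 1 branch in _seq_loader turns per-base indices into k-mer indices by convolving with powers of the alphabet size. A small stand-alone illustration with made-up base indices:

import numpy as np

# Per-base indices for the sequence 'ACGTA' with A=0, C=1, G=2, T=3.
indarray = np.asarray([0, 1, 2, 3, 0], dtype='int16')

order = 2
filter_ = np.asarray([pow(4, i) for i in range(order)])   # [1, 4] for DNA

# 'valid' convolution collapses each adjacent pair of base indices into a
# single dinucleotide index (first_base * 4 + second_base).
dinuc = np.convolve(indarray, filter_, mode='valid')
print(dinuc)   # [ 1  6 11 12] -> 'AC', 'CG', 'GT', 'TA'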