Example 1
 def test_write_dataset_datachunkiterator_data_and_time(self):
     a = np.arange(30).reshape(5, 2, 3)
     aiter = iter(a)
     daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
     tstamps = np.arange(5)
     tsiter = DataChunkIterator.from_iterable(tstamps)
     ts = TimeSeries('ts_name', daiter, 'A', timestamps=tsiter)
     self.nwbfile.add_acquisition(ts)
     with NWBHDF5IO(self.path, 'w') as io:
         io.write(self.nwbfile, cache_spec=False)
     with File(self.path, 'r') as f:
         dset = f['/acquisition/ts_name/data']
         self.assertListEqual(dset[:].tolist(), a.tolist())
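All of these tests lean on DataChunkIterator.from_iterable inferring dtype, shape, and chunking from the first buffered chunk. A minimal standalone sketch of that behavior (only numpy and hdmf are assumed; the array is arbitrary):

import numpy as np
from hdmf.data_utils import DataChunkIterator

a = np.arange(30).reshape(5, 2, 3)
daiter = DataChunkIterator.from_iterable(iter(a), buffer_size=2)
print(daiter.maxshape)   # (None, 2, 3): the iteration axis is unlimited
print(daiter.dtype)      # matches a.dtype
chunk = next(daiter)     # a DataChunk holding the first two buffered elements
print(chunk.data.shape)  # (2, 2, 3)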
Example 2
 def test_write_dataset_iterable_multidimensional_array(self):
     a = np.arange(30).reshape(5, 2, 3)
     aiter = iter(a)
     daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
     self.io.write_dataset(self.f, DatasetBuilder('test_dataset', daiter, attributes={}))
     dset = self.f['test_dataset']
     self.assertListEqual(dset[:].tolist(), a.tolist())
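Because the iterator reports maxshape (None, 2, 3), the backend presumably creates the dataset resizable along the first axis. A quick h5py check ('test.h5' is a placeholder for the file behind self.f):

from h5py import File

with File('test.h5', 'r') as f:
    dset = f['test_dataset']
    print(dset.maxshape)  # expected (None, 2, 3): first axis unlimited
    print(dset.chunks)    # chunked layout is required for resizable datasets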
Example 3
 def test_write_dataset_iterable_multidimensional_array_compression(self):
     a = np.arange(30).reshape(5, 2, 3)
     aiter = iter(a)
     daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
     wrapped_daiter = H5DataIO(data=daiter,
                               compression='gzip',
                               compression_opts=5,
                               shuffle=True,
                               fletcher32=True)
     self.io.write_dataset(self.f, DatasetBuilder('test_dataset', wrapped_daiter, attributes={}))
     dset = self.f['test_dataset']
     self.assertEqual(dset.shape, a.shape)
     self.assertListEqual(dset[:].tolist(), a.tolist())
     self.assertEqual(dset.compression, 'gzip')
     self.assertEqual(dset.compression_opts, 5)
     self.assertEqual(dset.shuffle, True)
     self.assertEqual(dset.fletcher32, True)
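H5DataIO essentially forwards these keyword arguments to h5py's create_dataset, so other dataset-creation options can be passed the same way. A sketch with a plain array (the values and chunk shape here are arbitrary, not taken from the tests):

import numpy as np
from pynwb import H5DataIO

wrapped = H5DataIO(data=np.arange(100),
                   chunks=(10, ),       # explicit HDF5 chunk shape
                   maxshape=(None, ),   # let the dataset grow along axis 0
                   compression='gzip',
                   compression_opts=4)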
Example 4
 def test_write_dataset_datachunkiterator_with_compression(self):
     a = np.arange(30).reshape(5, 2, 3)
     aiter = iter(a)
     daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
     wrapped_daiter = H5DataIO(data=daiter,
                               compression='gzip',
                               compression_opts=5,
                               shuffle=True,
                               fletcher32=True)
     ts = TimeSeries(name='ts_name', data=wrapped_daiter, unit='A', timestamps=np.arange(5.))
     self.nwbfile.add_acquisition(ts)
     with NWBHDF5IO(self.path, 'w') as io:
         io.write(self.nwbfile, cache_spec=False)
     with File(self.path, 'r') as f:
         dset = f['/acquisition/ts_name/data']
         self.assertEqual(dset.shape, a.shape)
         self.assertListEqual(dset[:].tolist(), a.tolist())
         self.assertEqual(dset.compression, 'gzip')
         self.assertEqual(dset.compression_opts, 5)
         self.assertEqual(dset.shuffle, True)
         self.assertEqual(dset.fletcher32, True)
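The same round trip can be verified through the PyNWB API instead of raw h5py; decompression is transparent to the reader. A sketch, with 'ts.nwb' standing in for the path written above:

from pynwb import NWBHDF5IO

with NWBHDF5IO('ts.nwb', 'r') as io:
    nwbfile = io.read()
    data = nwbfile.acquisition['ts_name'].data
    print(data.shape)  # (5, 2, 3)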
Example 5
    embeddings[target_indices] = emb_file['embedding'][to_get]
finally:
    emb_file.close()

h5path = args.out

print("reading %d Fasta files" % len(fapaths))
print("Total size:", sum(os.path.getsize(f) for f in fapaths))

if args.faa:
    seqit = AASeqIterator(fapaths, verbose=True)
else:
    seqit = DNASeqIterator(fapaths, verbose=True)

packed = DataChunkIterator.from_iterable(iter(seqit),
                                         maxshape=(None, ),
                                         buffer_size=2**15,
                                         dtype=np.dtype('uint8'))
seqindex = DataChunkIterator.from_iterable(seqit.index_iter,
                                           maxshape=(None, ),
                                           buffer_size=2**0,
                                           dtype=np.dtype('int'))
names = DataChunkIterator.from_iterable(seqit.names_iter,
                                        maxshape=(None, ),
                                        buffer_size=2**0,
                                        dtype=np.dtype('U'))
ids = DataChunkIterator.from_iterable(seqit.id_iter,
                                      maxshape=(None, ),
                                      buffer_size=2**0,
                                      dtype=np.dtype('int'))
taxa = DataChunkIterator.from_iterable(seqit.taxon_iter,
                                       maxshape=(None, ),
                                       buffer_size=2**0,
                                       dtype=np.dtype('uint16'))
Example 6
def prepare_data(argv=None):
    '''Aggregate sequence data from GTDB using a file-of-files'''
    import argparse
    import io
    import os
    import sys
    import logging
    import h5py
    import numpy as np
    import pandas as pd

    from skbio import TreeNode

    from hdmf.common import get_hdf5io
    from hdmf.data_utils import DataChunkIterator

    from ..utils import get_faa_path, get_fna_path, get_genomic_path
    from exabiome.sequence.convert import AASeqIterator, DNASeqIterator, AAVocabIterator, DNAVocabIterator, DNAVocabGeneIterator
    from exabiome.sequence.dna_table import AATable, DNATable, SequenceTable, TaxaTable, DeepIndexFile, NewickString, CondensedDistanceMatrix

    parser = argparse.ArgumentParser()
    parser.add_argument(
        'accessions',
        type=str,
        help='file of the NCBI accessions of the genomes to convert')
    parser.add_argument('fadir',
                        type=str,
                        help='directory with NCBI sequence files')
    parser.add_argument('metadata', type=str, help='metadata file from GTDB')
    parser.add_argument('tree', type=str, help='the input tree file (Newick format)')
    parser.add_argument('out', type=str, help='output HDF5')
    grp = parser.add_mutually_exclusive_group()
    parser.add_argument('-e',
                        '--emb',
                        type=str,
                        help='embedding file',
                        default=None)
    grp.add_argument('-p',
                     '--protein',
                     action='store_true',
                     default=False,
                     help='get paths for protein files')
    grp.add_argument('-c',
                     '--cds',
                     action='store_true',
                     default=False,
                     help='get paths for CDS files')
    grp.add_argument('-g',
                     '--genomic',
                     action='store_true',
                     default=False,
                     help='get paths for genomic files (default)')
    parser.add_argument('-D',
                        '--dist_h5',
                        type=str,
                        help='the distances file',
                        default=None)
    parser.add_argument(
        '-d',
        '--max_deg',
        type=float,
        default=None,
        help='max number of degenerate characters in protein sequences')
    parser.add_argument('-l',
                        '--min_len',
                        type=float,
                        default=None,
                        help='min length of sequences')
    parser.add_argument('-V',
                        '--vocab',
                        action='store_true',
                        default=False,
                        help='store sequences as vocabulary data')

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(args=argv)

    if not any([args.protein, args.cds, args.genomic]):
        args.genomic = True

    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format='%(asctime)s - %(message)s')
    logger = logging.getLogger()

    # read accessions
    logger.info('reading accessions %s' % args.accessions)
    with open(args.accessions, 'r') as f:
        taxa_ids = [l[:-1] for l in f.readlines()]

    # get paths to Fasta Files
    fa_path_func = get_genomic_path
    if args.cds:
        fa_path_func = get_fna_path
    elif args.protein:
        fa_path_func = get_faa_path
    fapaths = [fa_path_func(acc, args.fadir) for acc in taxa_ids]

    di_kwargs = dict()
    # if a distance matrix file has been given, read and select relevant distances
    if args.dist_h5:
        #############################
        # read and filter distances
        #############################
        logger.info('reading distances from %s' % args.dist_h5)
        with h5py.File(args.dist_h5, 'r') as f:
            dist = f['distances'][:]
            dist_taxa = f['leaf_names'][:].astype('U')
        logger.info('selecting distances for taxa found in %s' %
                    args.accessions)
        dist = select_distances(taxa_ids, dist_taxa, dist)
        dist = CondensedDistanceMatrix('distances', data=dist)
        di_kwargs['distances'] = dist

    #############################
    # read and filter taxonomies
    #############################
    logger.info('reading taxonomies from %s' % args.metadata)
    taxlevels = [
        'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species'
    ]

    def func(row):
        dat = dict(zip(taxlevels, row['gtdb_taxonomy'].split(';')))
        dat['species'] = dat['species'].split(' ')[1]
        dat['gtdb_genome_representative'] = row['gtdb_genome_representative'][3:]
        dat['accession'] = row['accession'][3:]
        return pd.Series(data=dat)

    logger.info('selecting GTDB taxonomy for taxa found in %s' %
                args.accessions)
    taxdf = pd.read_csv(args.metadata, header=0, sep='\t')[['accession', 'gtdb_taxonomy', 'gtdb_genome_representative']]\
                        .apply(func, axis=1)\
                        .set_index('accession')\
                        .filter(items=taxa_ids, axis=0)

    #############################
    # read and filter embeddings
    #############################
    emb = None
    if args.emb is not None:
        logger.info('reading embeddings from %s' % args.emb)
        with h5py.File(args.emb, 'r') as f:
            emb = f['embedding'][:]
            emb_taxa = f['leaf_names'][:]
        logger.info('selecting embeddings for taxa found in %s' %
                    args.accessions)
        emb = select_embeddings(taxa_ids, emb_taxa, emb)

    #############################
    # read and trim tree
    #############################
    logger.info('reading tree from %s' % args.tree)
    root = TreeNode.read(args.tree, format='newick')

    logger.info('transforming leaf names for shearing')
    for tip in root.tips():
        tip.name = tip.name[3:].replace(' ', '_')

    logger.info('shearing taxa not found in %s' % args.accessions)
    rep_ids = taxdf['gtdb_genome_representative'].values
    root = root.shear(rep_ids)

    logger.info('converting tree to Newick string')
    bytes_io = io.BytesIO()
    root.write(bytes_io, format='newick')
    tree_str = bytes_io.getvalue()
    tree = NewickString('tree', data=tree_str)

    if di_kwargs.get('distances') is None:
        from scipy.spatial.distance import squareform
        tt_dmat = root.tip_tip_distances()
        if (rep_ids != taxa_ids).any():
            tt_dmat = get_nonrep_matrix(taxa_ids, rep_ids, tt_dmat)
        dmat = tt_dmat.data
        di_kwargs['distances'] = CondensedDistanceMatrix('distances',
                                                         data=dmat)

    h5path = args.out

    logger.info("reading %d Fasta files" % len(fapaths))
    logger.info("Total size: %d", sum(os.path.getsize(f) for f in fapaths))

    if args.vocab:
        if args.protein:
            SeqTable = SequenceTable
            seqit = AAVocabIterator(fapaths,
                                    logger=logger,
                                    min_seq_len=args.min_len)
        else:
            SeqTable = DNATable
            if args.cds:
                logger.info("reading and writing CDS sequences")
                seqit = DNAVocabGeneIterator(fapaths,
                                             logger=logger,
                                             min_seq_len=args.min_len)
            else:
                seqit = DNAVocabIterator(fapaths,
                                         logger=logger,
                                         min_seq_len=args.min_len)
    else:
        if args.protein:
            logger.info("reading and writing protein sequences")
            seqit = AASeqIterator(fapaths,
                                  logger=logger,
                                  max_degenerate=args.max_deg,
                                  min_seq_len=args.min_len)
            SeqTable = AATable
        else:
            logger.info("reading and writing DNA sequences")
            seqit = DNASeqIterator(fapaths,
                                   logger=logger,
                                   min_seq_len=args.min_len)
            SeqTable = DNATable

    seqit_bsize = 2**25
    if args.protein:
        seqit_bsize = 2**15
    elif args.cds:
        seqit_bsize = 2**18

    # set up DataChunkIterators
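    # `packed` streams the concatenated sequence data and uses the large buffer
    # chosen above; the per-sequence streams below use buffer_size=2**0, i.e.
    # one element per chunk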
    packed = DataChunkIterator.from_iterable(iter(seqit),
                                             maxshape=(None, ),
                                             buffer_size=seqit_bsize,
                                             dtype=np.dtype('uint8'))
    seqindex = DataChunkIterator.from_iterable(seqit.index_iter,
                                               maxshape=(None, ),
                                               buffer_size=2**0,
                                               dtype=np.dtype('int'))
    names = DataChunkIterator.from_iterable(seqit.names_iter,
                                            maxshape=(None, ),
                                            buffer_size=2**0,
                                            dtype=np.dtype('U'))
    ids = DataChunkIterator.from_iterable(seqit.id_iter,
                                          maxshape=(None, ),
                                          buffer_size=2**0,
                                          dtype=np.dtype('int'))
    taxa = DataChunkIterator.from_iterable(seqit.taxon_iter,
                                           maxshape=(None, ),
                                           buffer_size=2**0,
                                           dtype=np.dtype('uint16'))
    seqlens = DataChunkIterator.from_iterable(seqit.seqlens_iter,
                                              maxshape=(None, ),
                                              buffer_size=2**0,
                                              dtype=np.dtype('uint32'))

    io = get_hdf5io(h5path, 'w')

    tt_args = ['taxa_table', 'a table for storing taxa data', taxa_ids]
    tt_kwargs = dict()
    for t in taxlevels[1:]:
        tt_args.append(taxdf[t].values)
    if emb is not None:
        tt_kwargs['embedding'] = emb
    tt_kwargs['rep_taxon_id'] = rep_ids

    taxa_table = TaxaTable(*tt_args, **tt_kwargs)

    seq_table = SeqTable(
        'seq_table',
        'a table storing sequences for computing sequence embedding',
        io.set_dataio(names, compression='gzip', chunks=(2**15, )),
        io.set_dataio(packed,
                      compression='gzip',
                      maxshape=(None, ),
                      chunks=(2**15, )),
        io.set_dataio(seqindex,
                      compression='gzip',
                      maxshape=(None, ),
                      chunks=(2**15, )),
        io.set_dataio(seqlens,
                      compression='gzip',
                      maxshape=(None, ),
                      chunks=(2**15, )),
        io.set_dataio(taxa,
                      compression='gzip',
                      maxshape=(None, ),
                      chunks=(2**15, )),
        taxon_table=taxa_table,
        id=io.set_dataio(ids,
                         compression='gzip',
                         maxshape=(None, ),
                         chunks=(2**15, )))

    difile = DeepIndexFile(seq_table, taxa_table, tree, **di_kwargs)

    io.write(difile, exhaust_dci=False)
    io.close()

    logger.info("reading %s" % (h5path))
    h5size = os.path.getsize(h5path)
    logger.info("HDF5 size: %d", h5size)
Example 7
 def test_dtype(self):
     a = np.arange(30, dtype='int32').reshape(5, 2, 3)
     aiter = iter(a)
     daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
     self.assertEqual(daiter.dtype, a.dtype)
Example 8
 def test_maxshape(self):
     a = np.arange(30).reshape(5, 2, 3)
     aiter = iter(a)
     daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
     self.assertEqual(daiter.maxshape, (None, 2, 3))