Example #1
from skbio.io import read, write


def mv_seq(seq, opath, name_dict):
    """Rename the sequences in a FASTA file using an old-to-new ID mapping."""
    seqs = read(seq, format='fasta')
    with open(opath, 'w') as f1:
        for i in seqs:
            # Look up the replacement ID and drop the description line.
            pre_name = i.metadata['id']
            i.metadata['id'] = name_dict[pre_name]
            i.metadata['description'] = ''
            write(i, 'fasta', f1)
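A minimal usage sketch; the file names and the ID mapping below are invented for illustration:

name_dict = {'seq_1|raw': 'seq_1', 'seq_2|raw': 'seq_2'}
mv_seq('old_ids.fasta', 'renamed.fasta', name_dict)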
Example #2
def test_compute_gene_score(self):
    seqs = get_data_path('pfam.faa')
    for number, exp in [(1, 0.1), (102, 1)]:
        with NamedTemporaryFile() as faa:
            # Write the first `number` sequences to a temporary FASTA file.
            for i, seq in enumerate(read(seqs, format='fasta')):
                if i == number:
                    break
                write(seq, into=faa, format='fasta')
            faa.flush()
            obs = compute_gene_score(faa.name)
            self.assertEqual(obs, exp)
Example #3
def write_sequences(seqs, outfile):
    """Write sequence(s) into a multi-sequence FASTA file.

    Parameters
    ----------
    seqs : list of skbio.Sequence
        sequences to write
    outfile : str
        file path to output multi-sequence FASTA file
    """
    # skbio.io.write expects a generator when writing multiple sequences.
    def outseqs():
        for seq in seqs:
            yield seq

    io.write(outseqs(), format='fasta', into=outfile)
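A short usage sketch, assuming `io` is `skbio.io`; the sequences and output path are invented:

from skbio import DNA

seqs = [DNA('ACGT', metadata={'id': 'seq1'}),
        DNA('GGCC', metadata={'id': 'seq2'})]
write_sequences(seqs, 'combined.fasta')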
Example #4
def parse_inputs(inp_fp=None,
                 inp_from=None,
                 inp_to=None,
                 microprot_inp=None,
                 microprot_out=None):
    """Parse a multi-sequence FASTA file into single-sequence files, remove
    any problematic characters from the names and append the sequences to the
    `processed_sequences.fasta` file.

    Parameters
    ----------
    inp_fp : str
        file path to a multi-sequence FASTA file
    inp_from : int
        number of the first sequence in the input file
    inp_to : int
        number of the last sequence in the input file
    microprot_inp : str
        input directory where individual files from inp_fp will be placed
    microprot_out : str
        output directory path where the processed_sequences.fasta file will
        be created

    Returns
    -------
    SEQ_ids : list of str
        list of sequence ids picked from the inp_fp
    """
    for _dir in [microprot_inp, microprot_out]:
        if not os.path.exists(_dir):
            os.makedirs(_dir)

    SEQS = process_fasta.extract_sequences(inp_fp,
                                           identifiers=(inp_from, inp_to))
    SEQ_ids = []
    processed_fh = open('%s/%s' % (microprot_out, 'processed_sequences.fasta'),
                        'a')
    for SEQ in SEQS:
        # Replace path- and shell-unfriendly characters in the sequence ID.
        _seq = SEQ.metadata['id']
        _seq = _seq.replace('/', '_')
        _seq = _seq.replace('\\', '_')
        _seq = _seq.replace('|', '_')
        SEQ_ids.append(_seq)
        SEQ.metadata['id'] = _seq
        io.write(SEQ,
                 format='fasta',
                 into='%s/%s.fasta' % (microprot_inp, _seq))
        io.write(SEQ, format='fasta', into=processed_fh)
    processed_fh.close()
    return SEQ_ids
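A hypothetical invocation (file paths and the sequence range are invented; `process_fasta` and `io` come from the surrounding microprot code):

seq_ids = parse_inputs(inp_fp='proteins.fasta', inp_from=1, inp_to=10,
                       microprot_inp='inp_dir', microprot_out='out_dir')
print(seq_ids)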
Example #5
def align_sequences(seqs):
    """Align sequences with Clustal Omega and return the aligned strings."""
    import io
    from subprocess import run, PIPE
    from skbio.io import read, write
    from skbio.sequence import Sequence

    # Write the input sequences to a FASTA file for clustalo to read.
    fasta = 'rational_designs.fa'
    seqs = (Sequence(x) for x in seqs)
    write(seqs, format='fasta', into=fasta)

    # Requires the `clustalo` binary to be on PATH; the MSA is read from stdout.
    clustalo = ('clustalo', '-i', fasta)
    stdout = run(clustalo, stdout=PIPE, encoding='utf8').stdout
    stdout_io = io.StringIO(stdout)
    msa = read(stdout_io, format='fasta')

    return [str(x) for x in msa]
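A usage sketch; it assumes the `clustalo` binary is installed and on PATH, and the peptide strings are invented:

aligned = align_sequences(['MKTAYIAKQR', 'MKTAYIAKQRQISFVK'])
for row in aligned:
    print(row)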
Example #6
def split_fasta(seqs, prefix=None, outdir=None):
    """Split a multi-protein FASTA file into single-sequence FASTAs.

    Parameters
    ----------
    seqs : str or list of skbio.Sequence
        File path to a FASTA file, or list of skbio protein sequences with a
        protein name in the header
    prefix : str
        Name prefix to be added to output single-sequence FASTA files
    outdir : str
        Output directory; defaults to the current working directory

    Raises
    ------
    TypeError
        If seqs is not a filepath or a list of skbio.Sequence objects
    ValueError
        If an empty list is provided
    """

    if isinstance(seqs, str):
        if os.path.exists(seqs):
            seqs = extract_sequences(seqs)
        else:
            raise TypeError('split_fasta sequence input is not a filepath or '
                            'the filepath does not exist.')
    elif isinstance(seqs, list):
        if len(seqs) == 0:
            raise ValueError('Empty list provided to split_fasta')
        elif not isinstance(seqs[0], Sequence):
            raise TypeError('Object you provided to split_fasta is not '
                            'a skbio.Sequence object')
    else:
        raise TypeError('split_fasta input sequences need to be a filepath '
                        'or a skbio.Sequence object.')

    if not outdir:
        outdir = os.getcwd()
    elif not os.path.exists(outdir):
        os.makedirs(outdir)
    for seq in seqs:
        if prefix:
            io.write(seq,
                     format='fasta',
                     into='%s/%s_%s.fasta' %
                     (outdir, prefix, seq.metadata['id']))
        else:
            io.write(seq,
                     format='fasta',
                     into='%s/%s.fasta' % (outdir, seq.metadata['id']))
Example #7
def _serialize_seq(seq, fh, skip_subregion=True):
    '''Serialize a sequence to GFF3.'''
    # Write the annotation lines first, then the sequence under ##FASTA.
    _serialize_interval_metadata(
        seq.interval_metadata, seq.metadata['id'], fh, skip_subregion)
    fh.write('##FASTA\n')
    write(seq, into=fh, format='fasta')
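For context, a sketch of the public round trip this private helper serves; it assumes scikit-bio's GFF3 writer and uses an invented sequence and annotation:

from skbio import DNA
from skbio.io import write

seq = DNA('ATGCATGCAT', metadata={'id': 'chr1'})
seq.interval_metadata.add([(0, 9)], metadata={'type': 'gene'})
write(seq, format='gff3', into='demo.gff3')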
Example #8
from skbio.io import read, write

# Pair a FASTA file with its QUAL file, then write the records out as FASTQ
# using the Illumina 1.8 quality-score encoding.
seqs = read("example.fna", qual="example.qual", format="fasta")
write(seqs, into="example.fastq", variant="illumina1.8", format="fastq")
Example #9
def prepare_data(argv=None):
    '''Aggregate sequence data from GTDB using a file-of-files'''
    from io import BytesIO
    import tempfile
    import h5py

    from datetime import datetime

    from tqdm import tqdm

    from skbio import TreeNode
    from skbio.sequence import DNA, Protein

    from hdmf.common import get_hdf5io
    from hdmf.data_utils import DataChunkIterator

    from ..utils import get_faa_path, get_fna_path, get_genomic_path
    from deep_taxon.sequence.convert import AASeqIterator, DNASeqIterator, DNAVocabIterator, DNAVocabGeneIterator
    from deep_taxon.sequence.dna_table import AATable, DNATable, SequenceTable, TaxaTable, DeepIndexFile, NewickString, CondensedDistanceMatrix, GenomeTable, TreeGraph

    parser = argparse.ArgumentParser()
    parser.add_argument('fadir',
                        type=str,
                        help='directory with NCBI sequence files')
    parser.add_argument('metadata', type=str, help='metadata file from GTDB')
    parser.add_argument('out', type=str, help='output HDF5')
    parser.add_argument(
        '-T',
        '--tree',
        type=str,
        help='a Newick file with a tree of representative taxa',
        default=None)
    parser.add_argument(
        '-A',
        '--accessions',
        type=str,
        default=None,
        help='file of the NCBI accessions of the genomes to convert')
    parser.add_argument(
        '-d',
        '--max_deg',
        type=float,
        default=None,
        help='max number of degenerate characters in protein sequences')
    parser.add_argument('-l',
                        '--min_len',
                        type=float,
                        default=None,
                        help='min length of sequences')
    parser.add_argument('--iter',
                        action='store_true',
                        default=False,
                        help='convert using iterators')
    parser.add_argument(
        '-p',
        '--num_procs',
        type=int,
        default=1,
        help='the number of processes to use for counting total sequence size')
    parser.add_argument('-L',
                        '--total_seq_len',
                        type=int,
                        default=None,
                        help='the total sequence length')
    parser.add_argument('-t',
                        '--tmpdir',
                        type=str,
                        default=None,
                        help='a temporary directory to store sequences')
    parser.add_argument('-N',
                        '--n_seqs',
                        type=int,
                        default=None,
                        help='the total number of sequences')
    rep_grp = parser.add_mutually_exclusive_group()
    rep_grp.add_argument(
        '-n',
        '--nonrep',
        action='store_true',
        default=False,
        help='keep non-representative genomes only. keep both by default')
    rep_grp.add_argument(
        '-r',
        '--rep',
        action='store_true',
        default=False,
        help='keep representative genomes only. keep both by default')
    parser.add_argument(
        '-a',
        '--all',
        action='store_true',
        default=False,
        help='keep all non-representative genomes. By default, only non-reps '
             'with the highest and lowest contig count are kept')
    grp = parser.add_mutually_exclusive_group()
    grp.add_argument('-P',
                     '--protein',
                     action='store_true',
                     default=False,
                     help='get paths for protein files')
    grp.add_argument('-C',
                     '--cds',
                     action='store_true',
                     default=False,
                     help='get paths for CDS files')
    grp.add_argument('-G',
                     '--genomic',
                     action='store_true',
                     default=False,
                     help='get paths for genomic files (default)')
    parser.add_argument('-z',
                        '--gzip',
                        action='store_true',
                        default=False,
                        help='GZip sequence table')
    dep_grp = parser.add_argument_group(
        title="Legacy options you probably do not need")
    dep_grp.add_argument('-e',
                         '--emb',
                         type=str,
                         help='embedding file',
                         default=None)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(args=argv)

    if args.total_seq_len is not None:
        if args.n_seqs is None:
            sys.stderr.write(
                "If using --total_seq_len, you must also use --n_seqs\n")
            sys.exit(1)
    if args.n_seqs is not None:
        if args.total_seq_len is None:
            sys.stderr.write(
                "If using --n_seqs, you must also use --total_seq_len\n")
            sys.exit(1)

    if not any([args.protein, args.cds, args.genomic]):
        args.genomic = True

    logging.basicConfig(stream=sys.stderr,
                        level=logging.INFO,
                        format='%(asctime)s - %(message)s')
    logger = logging.getLogger()

    #############################
    # read and filter taxonomies
    #############################
    logger.info('Reading taxonomies from %s' % args.metadata)
    taxlevels = [
        'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species'
    ]
    extra_cols = ['contig_count', 'checkm_completeness']

    def func(row):
        dat = dict(zip(taxlevels, row['gtdb_taxonomy'].split(';')))
        dat['species'] = dat['species']  # .split(' ')[1]
        dat['gtdb_genome_representative'] = row['gtdb_genome_representative'][
            3:]
        dat['accession'] = row['accession'][3:]
        for k in extra_cols:
            dat[k] = row[k]
        return pd.Series(data=dat)

    taxdf = pd.read_csv(args.metadata, header=0, sep='\t')[['accession', 'gtdb_taxonomy', 'gtdb_genome_representative', 'contig_count', 'checkm_completeness']]\
                        .apply(func, axis=1)

    taxdf = taxdf.set_index('accession')
    dflen = len(taxdf)
    logger.info('Found %d total genomes' % dflen)
    taxdf = taxdf[taxdf['gtdb_genome_representative'].str.contains(
        'GC[AF]_', regex=True)]  # get rid of genomes that are not at NCBI
    taxdf = taxdf[taxdf.index.str.contains(
        'GC[AF]_', regex=True)]  # get rid of genomes that are not at NCBI
    logger.info('Discarded %d non-NCBI genomes' % (dflen - len(taxdf)))

    rep_taxdf = taxdf[taxdf.index == taxdf['gtdb_genome_representative']]

    if args.accessions is not None:
        logger.info('reading accessions %s' % args.accessions)
        with open(args.accessions, 'r') as f:
            accessions = [l[:-1] for l in f.readlines()]
        dflen = len(taxdf)
        taxdf = taxdf[taxdf.index.isin(accessions)]
        logger.info('Discarded %d genomes not found in %s' %
                    (dflen - len(taxdf), args.accessions))

    dflen = len(taxdf)
    if args.nonrep:
        taxdf = taxdf[taxdf.index != taxdf['gtdb_genome_representative']]
        logger.info('Discarded %d representative genomes' %
                    (dflen - len(taxdf)))
        dflen = len(taxdf)
        if not args.all:
            groups = taxdf[['gtdb_genome_representative', 'contig_count'
                            ]].groupby('gtdb_genome_representative')
            min_ctgs = groups.idxmin()['contig_count']
            max_ctgs = groups.idxmax()['contig_count']
            accessions = np.unique(np.concatenate([min_ctgs, max_ctgs]))
            taxdf = taxdf.filter(accessions, axis=0)
            logger.info('Discarded %d extra non-representative genomes' %
                        (dflen - len(taxdf)))
    elif args.rep:
        taxdf = taxdf[taxdf.index == taxdf['gtdb_genome_representative']]
        logger.info('Discarded %d non-representative genomes' %
                    (dflen - len(taxdf)))

    dflen = len(taxdf)
    logger.info('%d remaining genomes' % dflen)

    ###############################
    # Arguments for constructing the DeepIndexFile object
    ###############################
    di_kwargs = dict()

    taxa_ids = taxdf.index.values

    # get paths to Fasta Files
    fa_path_func = partial(get_genomic_path, directory=args.fadir)
    if args.cds:
        fa_path_func = partial(get_fna_path, directory=args.fadir)
    elif args.protein:
        fa_path_func = partial(get_faa_path, directory=args.fadir)

    map_func = map
    if args.num_procs > 1:
        logger.info(f'using {args.num_procs} processes to locate Fasta files')
        import multiprocessing as mp
        map_func = mp.Pool(processes=args.num_procs).imap

    logger.info('Locating Fasta files for each taxon')
    fapaths = list(tqdm(map_func(fa_path_func, taxa_ids), total=len(taxa_ids)))

    logger.info('Found Fasta files for all accessions')

    #############################
    # read and filter embeddings
    #############################
    emb = None
    if args.emb is not None:
        logger.info('reading embeddings from %s' % args.emb)
        with h5py.File(args.emb, 'r') as f:
            emb = f['embedding'][:]
            emb_taxa = f['leaf_names'][:]
        logger.info('selecting embeddings for taxa found in %s' %
                    args.accessions)
        emb = select_embeddings(taxa_ids, emb_taxa, emb)

    logger.info(f'Writing {len(rep_taxdf)} taxa to taxa table')
    tt_args = [
        'taxa_table', 'a table for storing taxa data', rep_taxdf.index.values
    ]
    tt_kwargs = dict()
    for t in taxlevels[:-1]:
        enc = LabelEncoder().fit(rep_taxdf[t].values)
        _data = enc.transform(rep_taxdf[t].values).astype(np.uint32)
        _vocab = enc.classes_.astype('U')
        logger.info(f'{t} - {len(_vocab)} classes')
        tt_args.append(
            EnumData(name=t,
                     description=f'label encoded {t}',
                     data=_data,
                     elements=_vocab))
    # we have too many species to store this as VocabData, nor would it save
    # any space
    tt_args.append(
        VectorData(name='species',
                   description='Microbial species in the form Genus species',
                   data=rep_taxdf['species'].values))

    if emb is not None:
        tt_kwargs['embedding'] = emb
    #tt_kwargs['rep_taxon_id'] = rep_taxdf['gtdb_genome_representative'].values

    taxa_table = TaxaTable(*tt_args, **tt_kwargs)

    h5path = args.out

    logger.info("reading %d Fasta files" % len(fapaths))
    logger.info("Total size: %d", sum(list(map_func(os.path.getsize,
                                                    fapaths))))

    tmp_h5_file = None
    if args.protein:
        vocab_it = AAVocabIterator
        SeqTable = SequenceTable
        skbio_cls = Protein
    else:
        vocab_it = DNAVocabIterator
        SeqTable = DNATable
        skbio_cls = DNA

    vocab = np.array(list(vocab_it.characters()))
    if not args.protein:
        np.testing.assert_array_equal(vocab, list('ACYWSKDVNTGRMHB'))

    if args.total_seq_len is None:
        logger.info('counting total number of sequences')
        n_seqs, total_seq_len = np.array(
            list(zip(
                *tqdm(map_func(seqlen, fapaths), total=len(fapaths))))).sum(
                    axis=1)
        logger.info(f'found {total_seq_len} bases across {n_seqs} sequences')
    else:
        n_seqs, total_seq_len = args.n_seqs, args.total_seq_len
        logger.info(
            f'As specified, there are {total_seq_len} bases across {n_seqs} sequences'
        )

    logger.info(
        f'allocating uint8 array of length {total_seq_len} for sequences')

    if args.tmpdir is not None:
        if not os.path.exists(args.tmpdir):
            os.mkdir(args.tmpdir)
        tmpdir = tempfile.mkdtemp(dir=args.tmpdir)
    else:
        tmpdir = tempfile.mkdtemp()

    comp = 'gzip' if args.gzip else None
    tmp_h5_filename = os.path.join(tmpdir, 'sequences.h5')
    logger.info(f'writing temporary sequence data to {tmp_h5_filename}')
    tmp_h5_file = h5py.File(tmp_h5_filename, 'w')
    sequence = tmp_h5_file.create_dataset('sequences',
                                          shape=(total_seq_len, ),
                                          dtype=np.uint8,
                                          compression=comp)
    seqindex = tmp_h5_file.create_dataset('sequences_index',
                                          shape=(n_seqs, ),
                                          dtype=np.uint64,
                                          compression=comp)
    genomes = tmp_h5_file.create_dataset('genomes',
                                         shape=(n_seqs, ),
                                         dtype=np.uint64,
                                         compression=comp)
    seqlens = tmp_h5_file.create_dataset('seqlens',
                                         shape=(n_seqs, ),
                                         dtype=np.uint64,
                                         compression=comp)
    names = tmp_h5_file.create_dataset('seqnames',
                                       shape=(n_seqs, ),
                                       dtype=h5py.special_dtype(vlen=str),
                                       compression=comp)

    taxa = np.zeros(len(fapaths), dtype=int)

    seq_i = 0
    b = 0
    for genome_i, fa in tqdm(enumerate(fapaths), total=len(fapaths)):
        kwargs = {
            'format': 'fasta',
            'constructor': skbio_cls,
            'validate': False
        }
        taxid = taxa_ids[genome_i]
        rep_taxid = taxdf['gtdb_genome_representative'].iloc[genome_i]
        taxa[genome_i] = np.where(rep_taxdf.index == rep_taxid)[0][0]
        for seq in skbio.io.read(fa, **kwargs):
            enc_seq = vocab_it.encode(seq)
            e = b + len(enc_seq)
            sequence[b:e] = enc_seq
            seqindex[seq_i] = e
            genomes[seq_i] = genome_i
            seqlens[seq_i] = len(enc_seq)
            names[seq_i] = vocab_it.get_seqname(seq)
            b = e
            seq_i += 1
    ids = tmp_h5_file.create_dataset('ids', data=np.arange(n_seqs), dtype=int)
    tmp_h5_file.flush()

    io = get_hdf5io(h5path, 'w')

    genome_table = GenomeTable(
        'genome_table',
        'information about the genome each sequence comes from',
        taxa_ids,
        taxa,
        taxa_table=taxa_table)

    #############################
    # read and trim tree
    #############################
    if args.tree:
        logger.info('Reading tree from %s' % args.tree)
        root = TreeNode.read(args.tree, format='newick')

        logger.info('Found %d tips' % len(list(root.tips())))

        logger.info('Transforming leaf names for shearing')
        for tip in root.tips():
            tip.name = tip.name[3:].replace(' ', '_')

        logger.info('converting tree to Newick string')
        bytes_io = BytesIO()
        root.write(bytes_io, format='newick')
        tree_str = bytes_io.getvalue()
        di_kwargs['tree'] = NewickString('tree', data=tree_str)

        # get distances from tree if they are not provided
        tt_dmat = root.tip_tip_distances().filter(rep_taxdf.index)
        di_kwargs['distances'] = CondensedDistanceMatrix('distances',
                                                         data=tt_dmat.data)

        adj, gt_indices = get_tree_graph(root, rep_taxdf)
        di_kwargs['tree_graph'] = TreeGraph(data=adj,
                                            leaves=gt_indices,
                                            table=genome_table,
                                            name='tree_graph')

    if args.gzip:
        names = io.set_dataio(names, compression='gzip', chunks=True)
        sequence = io.set_dataio(sequence,
                                 compression='gzip',
                                 maxshape=(None, ),
                                 chunks=True)
        seqindex = io.set_dataio(seqindex,
                                 compression='gzip',
                                 maxshape=(None, ),
                                 chunks=True)
        seqlens = io.set_dataio(seqlens,
                                compression='gzip',
                                maxshape=(None, ),
                                chunks=True)
        genomes = io.set_dataio(genomes,
                                compression='gzip',
                                maxshape=(None, ),
                                chunks=True)
        ids = io.set_dataio(ids,
                            compression='gzip',
                            maxshape=(None, ),
                            chunks=True)

    seq_table = SeqTable(
        'seq_table',
        'a table storing sequences for computing sequence embedding',
        names,
        sequence,
        seqindex,
        seqlens,
        genomes,
        genome_table=genome_table,
        id=ids,
        vocab=vocab)

    difile = DeepIndexFile(seq_table, taxa_table, genome_table, **di_kwargs)

    before = datetime.now()
    io.write(difile, exhaust_dci=False, link_data=False)
    io.close()
    after = datetime.now()
    delta = (after - before).total_seconds()

    logger.info(
        f'Sequence totals {sequence.dtype.itemsize * sequence.size} bytes')
    logger.info(f'Took {delta} seconds to write after read')

    if tmp_h5_file is not None:
        tmp_h5_file.close()

    logger.info("reading %s" % (h5path))
    h5size = os.path.getsize(h5path)
    logger.info("HDF5 size: %d", h5size)
Example #10
def transport_to_repo(communities,
                      mock_data_dir,
                      project_dir,
                      sample_type_dirname='mock-community',
                      rep_seqs_fn='rep_seqs.qza',
                      feature_table_fn='feature_table.qza',
                      tree_fn='phylogeny.qza',
                      sample_md_fn='sample-metadata.tsv',
                      biom_table_fn='feature_table.biom',
                      fasta_fn='rep_seqs.fna',
                      newick_fn='phylogeny.tre'):
    '''Copy essential mock community data to tax-credit repo

    communities: list
        list of dir names in mock_data_dir, a.k.a. names of mock communities
    mock_data_dir: path
        source directory containing mock communities dirs of results
    project_dir: path
        path to tax-credit repo directory
    sample_type_dirname: str
        name of destination directory to contain communities dirs. The analog
        of mock_data_dir in the repo, dirs for individual communities will be
        located in project_dir/data/sample_type_dirname/community
    rep_seqs_fn: str
        name of rep seqs FeatureData[Sequence] Artifact in community_dir
    feature_table_fn: str
        name of rep seqs FeatureTable[Frequency] Artifact in community_dir
    tree_fn: str
        name of Phylogeny[Rooted] Artifact in community_dir
    sample_md_fn: str
        name of metadata mapping file in community_dir
    biom_table_fn: str
        destination name of biom table in project_dir
    fasta_fn: str
        destination name of fasta file in project_dir
    newick_fn: str
        destination name of newick format tree in project_dir
    '''

    for community in communities:
        community_dir = join(mock_data_dir, community)

        # Define base dir destination for mock community directories
        repo_destination = join(project_dir, "data", sample_type_dirname,
                                community)
        if not exists(repo_destination):
            makedirs(repo_destination)

        # Files to move
        rep_seqs = join(community_dir, rep_seqs_fn)
        feature_table = join(community_dir, feature_table_fn)
        tree = join(community_dir, tree_fn)
        sample_md = join(community_dir, sample_md_fn)

        biom_table_fp = join(community_dir, biom_table_fn)
        rep_seqs_fp = join(community_dir, fasta_fn)
        tree_fp = join(community_dir, newick_fn)

        # Extract rep_seqs and tree from the QIIME 2 Artifacts
        rep_seqs_fna = qiime2.Artifact.load(rep_seqs).view(DNAIterator)
        io.write(rep_seqs_fna.generator, format='fasta', into=rep_seqs_fp)

        if exists(tree):
            qiime2.Artifact.load(tree).view(TreeNode).write(tree_fp)

        # Move to repo:
        for f in [
                rep_seqs, feature_table, tree, sample_md, biom_table_fp,
                rep_seqs_fp, tree_fp
        ]:
            if exists(f):
                copyfile(f, join(repo_destination, basename(f)))
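A hypothetical call copying two mock communities into a local checkout of the tax-credit repo (all directory names invented):

transport_to_repo(['mock-1', 'mock-2'],
                  mock_data_dir='/tmp/mock_results',
                  project_dir='/tmp/tax-credit')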