def mv_seq(seq, opath, name_dict):
    """Rename sequence IDs in a FASTA file using a mapping and write the result."""
    seq = read(seq, format='fasta')
    with open(opath, 'w') as f1:
        for i in seq:
            pre_name = i.metadata['id']
            i.metadata['id'] = name_dict[pre_name]
            i.metadata['description'] = ''
            write(i, 'fasta', f1)
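# Hedged usage sketch for mv_seq above: the file names and the ID mapping are
# illustrative only; every ID in the input FASTA must appear in the mapping.
name_dict = {'old_id_1': 'new_id_1', 'old_id_2': 'new_id_2'}
mv_seq('input.fasta', 'renamed.fasta', name_dict)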
def test_compute_gene_score(self):
    seqs = get_data_path('pfam.faa')
    for number, exp in [(1, 0.1), (102, 1)]:
        with NamedTemporaryFile() as faa:
            for i, seq in enumerate(read(seqs, format='fasta')):
                if i == number:
                    break
                write(seq, into=faa, format='fasta')
            faa.flush()
            obs = compute_gene_score(faa.name)
            self.assertEqual(obs, exp)
def write_sequences(seqs, outfile):
    """Write sequence(s) into a multi-sequence FASTA file.

    Parameters
    ----------
    seqs : list of skbio.Sequence
        list of skbio Sequence objects
    outfile : str
        file path to output multi-sequence FASTA file
    """
    def outseqs():
        for seq in seqs:
            yield seq

    io.write(outseqs(), format='fasta', into=outfile)
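# Hedged usage sketch for write_sequences above, assuming `io` is skbio.io;
# the sequences and output path are illustrative only.
from skbio import Sequence

seqs = [Sequence('MKTAYIAKQR', metadata={'id': 'seq1'}),
        Sequence('MGLSDGEWQL', metadata={'id': 'seq2'})]
write_sequences(seqs, 'proteins.fasta')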
def parse_inputs(inp_fp=None, inp_from=None, inp_to=None, microprot_inp=None,
                 microprot_out=None):
    """Parse a multi-sequence FASTA file into single-sequence files, remove
    any problematic characters from the names and add information to the
    `processed_sequences.fasta` file.

    Parameters
    ----------
    inp_fp : str
        file path to a multi-sequence FASTA file
    inp_from : int
        number of the first sequence in the input file
    inp_to : int
        number of the last sequence in the input file
    microprot_inp : str
        input directory where individual files from inp_fp will be placed
    microprot_out : str
        output directory path where processed_sequences.fasta file will
        be created

    Returns
    -------
    SEQ_ids : list of str
        list of sequence ids picked from the inp_fp
    """
    for _dir in [microprot_inp, microprot_out]:
        if not os.path.exists(_dir):
            os.makedirs(_dir)

    SEQS = process_fasta.extract_sequences(inp_fp,
                                           identifiers=(inp_from, inp_to))
    SEQ_ids = []
    processed_fh = open('%s/%s' % (microprot_out,
                                   'processed_sequences.fasta'), 'a')
    for i, SEQ in enumerate(SEQS):
        _seq = SEQ.metadata['id']
        _seq = _seq.replace('/', '_')
        _seq = _seq.replace('\\', '_')
        _seq = _seq.replace('|', '_')
        SEQ_ids.append(_seq)
        SEQ.metadata['id'] = _seq
        io.write(SEQ, format='fasta',
                 into='%s/%s.fasta' % (microprot_inp, _seq))
        io.write(SEQ, format='fasta', into=processed_fh)
    processed_fh.close()
    return SEQ_ids
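# Hedged usage sketch for parse_inputs above: the input FASTA, sequence range,
# and directory names are illustrative only.
seq_ids = parse_inputs(inp_fp='proteins.fasta', inp_from=1, inp_to=10,
                       microprot_inp='inp_dir', microprot_out='out_dir')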
def align_sequences(seqs):
    import io
    from subprocess import run, PIPE
    from skbio.io import read, write
    from skbio.sequence import Sequence

    # Write the input sequences to a FASTA file for Clustal Omega.
    fasta = 'rational_designs.fa'
    seqs = (Sequence(x) for x in seqs)
    write(seqs, format='fasta', into=fasta)
    # Run clustalo and parse the aligned FASTA it prints to stdout.
    clustalo = 'clustalo', '-i', fasta
    stdout = run(clustalo, stdout=PIPE, encoding='utf8').stdout
    stdout_io = io.StringIO(stdout)
    msa = read(stdout_io, format='fasta')
    return [str(x) for x in msa]
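# Hedged usage sketch for align_sequences above: requires the `clustalo`
# binary on PATH and writes `rational_designs.fa` in the working directory;
# the peptide strings are illustrative only.
aligned = align_sequences(['MKTAYIAKQR', 'MKTAYIAKQ', 'MKTAYIAQR'])
for row in aligned:
    print(row)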
def split_fasta(seqs, prefix=None, outdir=None):
    """Split a multi-protein FASTA file into single-sequence FASTAs.

    Parameters
    ----------
    seqs : list of skbio.sequence
        List of skbio protein sequences with a protein name in header
    prefix : str
        Name prefix to be added to output single-sequence FASTA files
    outdir : str
        Output directory path for the single-sequence FASTA files; defaults
        to the current working directory

    Raises
    ------
    TypeError
        seqs needs to be a filepath or skbio.sequence object
    """
    if isinstance(seqs, str):
        if os.path.exists(seqs):
            seqs = extract_sequences(seqs)
        else:
            raise TypeError('split_fasta sequence input is not a filepath or '
                            'filepath does not exist.')
    elif isinstance(seqs, list):
        if len(seqs) == 0:
            raise ValueError('Empty list provided to split_fasta')
        else:
            if not isinstance(seqs[0], Sequence):
                raise TypeError('Object you provided to split_fasta is not '
                                'a skbio.sequence object')
    else:
        raise TypeError('split_fasta input sequences need to be a filepath '
                        'or skbio.sequence object.')

    if not outdir:
        outdir = os.getcwd()
    elif not os.path.exists(outdir):
        os.makedirs(outdir)

    for seq in seqs:
        if prefix:
            io.write(seq, format='fasta',
                     into='%s/%s_%s.fasta' % (outdir, prefix,
                                              seq.metadata['id']))
        else:
            io.write(seq, format='fasta',
                     into='%s/%s.fasta' % (outdir, seq.metadata['id']))
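# Hedged usage sketch for split_fasta above: the input file, prefix, and
# output directory are illustrative; each record ends up in its own FASTA.
split_fasta('proteins.fasta', prefix='run1', outdir='split_seqs')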
def _serialize_seq(seq, fh, skip_subregion=True):
    '''Serialize a sequence to GFF3.'''
    _serialize_interval_metadata(
        seq.interval_metadata, seq.metadata['id'], fh, skip_subregion)
    fh.write('##FASTA\n')
    write(seq, into=fh, format='fasta')
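# Hedged usage sketch for the GFF3 serializer above: build a DNA sequence with
# interval metadata and let skbio write it (annotations plus a ##FASTA block).
# The sequence, coordinates, and output path are illustrative only.
from skbio import DNA
from skbio.io import write

seq = DNA('ACGTACGTAC', metadata={'id': 'chr1'})
seq.interval_metadata.add(bounds=[(0, 4)],
                          metadata={'type': 'gene', 'source': 'example'})
write(seq, format='gff3', into='example.gff3')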
from skbio.io import read, write

seqs = read("example.fna", qual="example.qual", format="fasta")
write(seqs, into="example.fastq", variant="illumina1.8", format="fastq")
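# Hedged follow-up sketch: round-trip the FASTQ written above to confirm the
# quality scores from example.qual were attached to each record.
for seq in read("example.fastq", format="fastq", variant="illumina1.8"):
    print(seq.metadata['id'], seq.positional_metadata['quality'].values[:5])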
def prepare_data(argv=None):
    '''Aggregate sequence data from GTDB using a file-of-files'''
    from io import BytesIO
    import tempfile
    import h5py
    from datetime import datetime
    from tqdm import tqdm
    from skbio import TreeNode
    from skbio.sequence import DNA, Protein
    from hdmf.common import get_hdf5io
    from hdmf.data_utils import DataChunkIterator
    from ..utils import get_faa_path, get_fna_path, get_genomic_path
    from deep_taxon.sequence.convert import (AASeqIterator, DNASeqIterator,
                                             DNAVocabIterator,
                                             DNAVocabGeneIterator)
    from deep_taxon.sequence.dna_table import (AATable, DNATable,
                                               SequenceTable, TaxaTable,
                                               DeepIndexFile, NewickString,
                                               CondensedDistanceMatrix,
                                               GenomeTable, TreeGraph)

    parser = argparse.ArgumentParser()
    parser.add_argument('fadir', type=str,
                        help='directory with NCBI sequence files')
    parser.add_argument('metadata', type=str, help='metadata file from GTDB')
    parser.add_argument('out', type=str, help='output HDF5')
    parser.add_argument('-T', '--tree', type=str, default=None,
                        help='a Newick file with a tree of representative taxa')
    parser.add_argument('-A', '--accessions', type=str, default=None,
                        help='file of the NCBI accessions of the genomes to convert')
    parser.add_argument('-d', '--max_deg', type=float, default=None,
                        help='max number of degenerate characters in protein sequences')
    parser.add_argument('-l', '--min_len', type=float, default=None,
                        help='min length of sequences')
    parser.add_argument('--iter', action='store_true', default=False,
                        help='convert using iterators')
    parser.add_argument('-p', '--num_procs', type=int, default=1,
                        help='the number of processes to use for counting total sequence size')
    parser.add_argument('-L', '--total_seq_len', type=int, default=None,
                        help='the total sequence length')
    parser.add_argument('-t', '--tmpdir', type=str, default=None,
                        help='a temporary directory to store sequences')
    parser.add_argument('-N', '--n_seqs', type=int, default=None,
                        help='the total number of sequences')
    rep_grp = parser.add_mutually_exclusive_group()
    rep_grp.add_argument('-n', '--nonrep', action='store_true', default=False,
                         help='keep non-representative genomes only. keep both by default')
    rep_grp.add_argument('-r', '--rep', action='store_true', default=False,
                         help='keep representative genomes only. keep both by default')
    parser.add_argument('-a', '--all', action='store_true', default=False,
                        help='keep all non-representative genomes. By default, '
                             'only non-reps with the highest and lowest contig '
                             'count are kept')
    grp = parser.add_mutually_exclusive_group()
    grp.add_argument('-P', '--protein', action='store_true', default=False,
                     help='get paths for protein files')
    grp.add_argument('-C', '--cds', action='store_true', default=False,
                     help='get paths for CDS files')
    grp.add_argument('-G', '--genomic', action='store_true', default=False,
                     help='get paths for genomic files (default)')
    parser.add_argument('-z', '--gzip', action='store_true', default=False,
                        help='GZip sequence table')
    dep_grp = parser.add_argument_group(
        title="Legacy options you probably do not need")
    dep_grp.add_argument('-e', '--emb', type=str, default=None,
                         help='embedding file')

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(args=argv)

    if args.total_seq_len is not None:
        if args.n_seqs is None:
            sys.stderr.write("If using --total_seq_len, you must also use --n_seqs\n")
    if args.n_seqs is not None:
        if args.total_seq_len is None:
            sys.stderr.write("If using --n_seqs, you must also use --total_seq_len\n")

    if not any([args.protein, args.cds, args.genomic]):
        args.genomic = True

    logging.basicConfig(stream=sys.stderr, level=logging.INFO,
                        format='%(asctime)s - %(message)s')
    logger = logging.getLogger()

    #############################
    # read and filter taxonomies
    #############################
    logger.info('Reading taxonomies from %s' % args.metadata)
    taxlevels = ['domain', 'phylum', 'class', 'order', 'family', 'genus',
                 'species']
    extra_cols = ['contig_count', 'checkm_completeness']

    def func(row):
        dat = dict(zip(taxlevels, row['gtdb_taxonomy'].split(';')))
        dat['species'] = dat['species']  # .split(' ')[1]
        dat['gtdb_genome_representative'] = row['gtdb_genome_representative'][3:]
        dat['accession'] = row['accession'][3:]
        for k in extra_cols:
            dat[k] = row[k]
        return pd.Series(data=dat)

    taxdf = pd.read_csv(args.metadata, header=0, sep='\t')[
        ['accession', 'gtdb_taxonomy', 'gtdb_genome_representative',
         'contig_count', 'checkm_completeness']].apply(func, axis=1)
    taxdf = taxdf.set_index('accession')
    dflen = len(taxdf)
    logger.info('Found %d total genomes' % dflen)
    # get rid of genomes that are not at NCBI
    taxdf = taxdf[taxdf['gtdb_genome_representative'].str.contains('GC[A,F]_',
                                                                   regex=True)]
    taxdf = taxdf[taxdf.index.str.contains('GC[A,F]_', regex=True)]
    logger.info('Discarded %d non-NCBI genomes' % (dflen - len(taxdf)))

    rep_taxdf = taxdf[taxdf.index == taxdf['gtdb_genome_representative']]

    if args.accessions is not None:
        logger.info('reading accessions %s' % args.accessions)
        with open(args.accessions, 'r') as f:
            accessions = [l[:-1] for l in f.readlines()]
        dflen = len(taxdf)
        taxdf = taxdf[taxdf.index.isin(accessions)]
        logger.info('Discarded %d genomes not found in %s'
                    % (dflen - len(taxdf), args.accessions))

    dflen = len(taxdf)
    if args.nonrep:
        taxdf = taxdf[taxdf.index != taxdf['gtdb_genome_representative']]
        logger.info('Discarded %d representative genomes'
                    % (dflen - len(taxdf)))
        dflen = len(taxdf)
        if not args.all:
            groups = taxdf[['gtdb_genome_representative',
                            'contig_count']].groupby('gtdb_genome_representative')
            min_ctgs = groups.idxmin()['contig_count']
            max_ctgs = groups.idxmax()['contig_count']
            accessions = np.unique(np.concatenate([min_ctgs, max_ctgs]))
            taxdf = taxdf.filter(accessions, axis=0)
            logger.info('Discarded %d extra non-representative genomes'
                        % (dflen - len(taxdf)))
    elif args.rep:
        taxdf = taxdf[taxdf.index == taxdf['gtdb_genome_representative']]
        logger.info('Discarded %d non-representative genomes'
                    % (dflen - len(taxdf)))

    dflen = len(taxdf)
    logger.info('%d remaining genomes' % dflen)

    ###############################
    # Arguments for constructing the DeepIndexFile object
    ###############################
    di_kwargs = dict()

    taxa_ids = taxdf.index.values

    # get paths to Fasta Files
    fa_path_func = partial(get_genomic_path, directory=args.fadir)
    if args.cds:
        fa_path_func = partial(get_fna_path, directory=args.fadir)
    elif args.protein:
        fa_path_func = partial(get_faa_path, directory=args.fadir)

    map_func = map
    if args.num_procs > 1:
        logger.info(f'using {args.num_procs} processes to locate Fasta files')
        import multiprocessing as mp
        map_func = mp.Pool(processes=args.num_procs).imap

    logger.info('Locating Fasta files for each taxa')
    fapaths = list(tqdm(map_func(fa_path_func, taxa_ids), total=len(taxa_ids)))

    logger.info('Found Fasta files for all accessions')

    #############################
    # read and filter embeddings
    #############################
    emb = None
    if args.emb is not None:
        logger.info('reading embeddings from %s' % args.emb)
        with h5py.File(args.emb, 'r') as f:
            emb = f['embedding'][:]
            emb_taxa = f['leaf_names'][:]
        logger.info('selecting embeddings for taxa found in %s'
                    % args.accessions)
        emb = select_embeddings(taxa_ids, emb_taxa, emb)

    logger.info(f'Writing {len(rep_taxdf)} taxa to taxa table')
    tt_args = ['taxa_table', 'a table for storing taxa data',
               rep_taxdf.index.values]
    tt_kwargs = dict()
    for t in taxlevels[:-1]:
        enc = LabelEncoder().fit(rep_taxdf[t].values)
        _data = enc.transform(rep_taxdf[t].values).astype(np.uint32)
        _vocab = enc.classes_.astype('U')
        logger.info(f'{t} - {len(_vocab)} classes')
        tt_args.append(EnumData(name=t, description=f'label encoded {t}',
                                data=_data, elements=_vocab))
    # we have too many species to store this as VocabData, nor does it save
    # any space
    tt_args.append(VectorData(name='species',
                              description='Microbial species in the form Genus species',
                              data=rep_taxdf['species'].values))

    if emb is not None:
        tt_kwargs['embedding'] = emb
    # tt_kwargs['rep_taxon_id'] = rep_taxdf['gtdb_genome_representative'].values

    taxa_table = TaxaTable(*tt_args, **tt_kwargs)

    h5path = args.out

    logger.info("reading %d Fasta files" % len(fapaths))
    logger.info("Total size: %d",
                sum(list(map_func(os.path.getsize, fapaths))))

    tmp_h5_file = None
    if args.protein:
        vocab_it = AAVocabIterator
        SeqTable = SequenceTable
        skbio_cls = Protein
    else:
        vocab_it = DNAVocabIterator
        SeqTable = DNATable
        skbio_cls = DNA

    vocab = np.array(list(vocab_it.characters()))
    if not args.protein:
        np.testing.assert_array_equal(vocab, list('ACYWSKDVNTGRMHB'))

    if args.total_seq_len is None:
        logger.info('counting total number of sequences')
        n_seqs, total_seq_len = np.array(
            list(zip(*tqdm(map_func(seqlen, fapaths),
                           total=len(fapaths))))).sum(axis=1)
        logger.info(f'found {total_seq_len} bases across {n_seqs} sequences')
    else:
        n_seqs, total_seq_len = args.n_seqs, args.total_seq_len
        logger.info(f'As specified, there are {total_seq_len} bases across '
                    f'{n_seqs} sequences')

    logger.info(f'allocating uint8 array of length {total_seq_len} for sequences')

    if args.tmpdir is not None:
        if not os.path.exists(args.tmpdir):
            os.mkdir(args.tmpdir)
        tmpdir = tempfile.mkdtemp(dir=args.tmpdir)
    else:
        tmpdir = tempfile.mkdtemp()

    comp = 'gzip' if args.gzip else None
    tmp_h5_filename = os.path.join(tmpdir, 'sequences.h5')
    logger.info(f'writing temporary sequence data to {tmp_h5_filename}')
    tmp_h5_file = h5py.File(tmp_h5_filename, 'w')
    sequence = tmp_h5_file.create_dataset('sequences', shape=(total_seq_len,),
                                          dtype=np.uint8, compression=comp)
    seqindex = tmp_h5_file.create_dataset('sequences_index', shape=(n_seqs,),
                                          dtype=np.uint64, compression=comp)
    genomes = tmp_h5_file.create_dataset('genomes', shape=(n_seqs,),
                                         dtype=np.uint64, compression=comp)
    seqlens = tmp_h5_file.create_dataset('seqlens', shape=(n_seqs,),
                                         dtype=np.uint64, compression=comp)
    names = tmp_h5_file.create_dataset('seqnames', shape=(n_seqs,),
                                       dtype=h5py.special_dtype(vlen=str),
                                       compression=comp)

    taxa = np.zeros(len(fapaths), dtype=int)
    seq_i = 0
    b = 0
    for genome_i, fa in tqdm(enumerate(fapaths), total=len(fapaths)):
        kwargs = {'format': 'fasta', 'constructor': skbio_cls,
                  'validate': False}
        taxid = taxa_ids[genome_i]
        rep_taxid = taxdf['gtdb_genome_representative'][genome_i]
        taxa[genome_i] = np.where(rep_taxdf.index == rep_taxid)[0][0]
        for seq in skbio.io.read(fa, **kwargs):
            enc_seq = vocab_it.encode(seq)
            e = b + len(enc_seq)
            sequence[b:e] = enc_seq
            seqindex[seq_i] = e
            genomes[seq_i] = genome_i
            seqlens[seq_i] = len(enc_seq)
            names[seq_i] = vocab_it.get_seqname(seq)
            b = e
            seq_i += 1

    ids = tmp_h5_file.create_dataset('ids', data=np.arange(n_seqs), dtype=int)
    tmp_h5_file.flush()

    io = get_hdf5io(h5path, 'w')

    print([a['name'] for a in GenomeTable.__init__.__docval__['args']])
    genome_table = GenomeTable(
        'genome_table', 'information about the genome each sequence comes from',
        taxa_ids, taxa, taxa_table=taxa_table)

    #############################
    # read and trim tree
    #############################
    if args.tree:
        logger.info('Reading tree from %s' % args.tree)
        root = TreeNode.read(args.tree, format='newick')

        logger.info('Found %d tips' % len(list(root.tips())))

        logger.info('Transforming leaf names for shearing')
        for tip in root.tips():
            tip.name = tip.name[3:].replace(' ', '_')

        logger.info('converting tree to Newick string')
        bytes_io = BytesIO()
        root.write(bytes_io, format='newick')
        tree_str = bytes_io.getvalue()
        di_kwargs['tree'] = NewickString('tree', data=tree_str)

        # get distances from tree if they are not provided
        tt_dmat = root.tip_tip_distances().filter(rep_taxdf.index)
        di_kwargs['distances'] = CondensedDistanceMatrix('distances',
                                                         data=tt_dmat.data)

        adj, gt_indices = get_tree_graph(root, rep_taxdf)
        di_kwargs['tree_graph'] = TreeGraph(data=adj, leaves=gt_indices,
                                            table=genome_table,
                                            name='tree_graph')

    if args.gzip:
        names = io.set_dataio(names, compression='gzip', chunks=True)
        sequence = io.set_dataio(sequence, compression='gzip',
                                 maxshape=(None,), chunks=True)
        seqindex = io.set_dataio(seqindex, compression='gzip',
                                 maxshape=(None,), chunks=True)
        seqlens = io.set_dataio(seqlens, compression='gzip',
                                maxshape=(None,), chunks=True)
        genomes = io.set_dataio(genomes, compression='gzip',
                                maxshape=(None,), chunks=True)
        ids = io.set_dataio(ids, compression='gzip',
                            maxshape=(None,), chunks=True)

    seq_table = SeqTable(
        'seq_table',
        'a table storing sequences for computing sequence embedding',
        names, sequence, seqindex, seqlens, genomes,
        genome_table=genome_table, id=ids, vocab=vocab)

    difile = DeepIndexFile(seq_table, taxa_table, genome_table, **di_kwargs)

    before = datetime.now()
    io.write(difile, exhaust_dci=False, link_data=False)
    io.close()
    after = datetime.now()
    delta = (after - before).total_seconds()

    logger.info(f'Sequence totals {sequence.dtype.itemsize * sequence.size} bytes')
    logger.info(f'Took {delta} seconds to write after read')

    if tmp_h5_file is not None:
        tmp_h5_file.close()

    logger.info("reading %s" % h5path)
    h5size = os.path.getsize(h5path)
    logger.info("HDF5 size: %d", h5size)
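# Hedged invocation sketch for prepare_data above: the function parses an
# argv-style list, so a programmatic call might look like this (all paths
# and flags are illustrative only, and assume the deep_taxon package layout).
prepare_data(['/data/gtdb/fastas', '/data/gtdb/metadata.tsv', 'gtdb.h5',
              '--rep', '--gzip'])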
def transport_to_repo(communities, mock_data_dir, project_dir,
                      sample_type_dirname='mock-community',
                      rep_seqs_fn='rep_seqs.qza',
                      feature_table_fn='feature_table.qza',
                      tree_fn='phylogeny.qza',
                      sample_md_fn='sample-metadata.tsv',
                      biom_table_fn='feature_table.biom',
                      fasta_fn='rep_seqs.fna',
                      newick_fn='phylogeny.tre'):
    '''Copy essential mock community data to tax-credit repo

    communities: list
        list of dir names in mock_data_dir, a.k.a. names of mock communities
    mock_data_dir: path
        source directory containing mock communities dirs of results
    project_dir: path
        path to tax-credit repo directory
    sample_type_dirname: str
        name of destination directory to contain communities dirs. The analog
        of mock_data_dir in the repo; dirs for individual communities will be
        located in project_dir/data/sample_type_dirname/community
    rep_seqs_fn: str
        name of rep seqs FeatureData[Sequence] Artifact in community_dir
    feature_table_fn: str
        name of rep seqs FeatureTable[Frequency] Artifact in community_dir
    tree_fn: str
        name of Phylogeny[Rooted] Artifact in community_dir
    sample_md_fn: str
        name of metadata mapping file in community_dir
    biom_table_fn: str
        destination name of biom table in project_dir
    fasta_fn: str
        destination name of fasta file in project_dir
    newick_fn: str
        destination name of newick format tree in project_dir
    '''
    for community in communities:
        community_dir = join(mock_data_dir, community)

        # Define base dir destination for mock community directories
        repo_destination = join(project_dir, "data", sample_type_dirname,
                                community)
        if not exists(repo_destination):
            makedirs(repo_destination)

        # Files to move
        rep_seqs = join(community_dir, rep_seqs_fn)
        feature_table = join(community_dir, feature_table_fn)
        tree = join(community_dir, tree_fn)
        sample_md = join(community_dir, sample_md_fn)
        biom_table_fp = join(community_dir, biom_table_fn)
        rep_seqs_fp = join(community_dir, fasta_fn)
        tree_fp = join(community_dir, newick_fn)

        # Extract biom, tree, rep_seqs
        rep_seqs_fna = qiime2.Artifact.load(rep_seqs).view(DNAIterator)
        io.write(rep_seqs_fna.generator, format='fasta', into=rep_seqs_fp)
        if exists(tree):
            qiime2.Artifact.load(tree).view(TreeNode).write(tree_fp)

        # Move to repo:
        for f in [rep_seqs, feature_table, tree, sample_md, biom_table_fp,
                  rep_seqs_fp, tree_fp]:
            if exists(f):
                copyfile(f, join(repo_destination, basename(f)))
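# Hedged usage sketch for transport_to_repo above: copy two mock communities
# into a local tax-credit checkout (all directory names are illustrative).
transport_to_repo(['mock-1', 'mock-2'],
                  mock_data_dir='mock_results',
                  project_dir='tax-credit')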