def test_invalid_files(self):
    """Check that malformed qseq files fail with the expected errors.

    Each entry of ``self.invalid_files`` is read into every legacy sequence
    class. The raised exception must be of the recorded type and its message
    must contain every expected fragment.
    """
    sequence_classes = (BiologicalSequence, NucleotideSequence,
                        DNASequence, RNASequence, ProteinSequence)
    for constructor in sequence_classes:
        for path, kwarg_sets, fragments, error_type in self.invalid_files:
            with self.assertRaises(error_type) as caught:
                # The first read that raises leaves the ``with`` block, so
                # later keyword sets for this file are not attempted.
                for options in kwarg_sets:
                    _drop_kwargs(options, "constructor", "filter")
                    read(path, format="qseq", verify=False,
                         into=constructor, **options)
            for fragment in fragments:
                self.assertIn(fragment, str(caught.exception))
def test_invalid_files(self):
    """Malformed qseq input must raise for every sequence class.

    ``self.invalid_files`` supplies ``(path, kwargs list, expected message
    fragments, exception type)`` tuples; the exception message is checked
    for each fragment.
    """
    for constructor in (Sequence, DNA, RNA, Protein):
        for bad_fp, kwarg_sets, fragments, exc_type in self.invalid_files:
            with self.assertRaises(exc_type) as ctx:
                # Reading stops at the first keyword set that raises.
                for read_kwargs in kwarg_sets:
                    _drop_kwargs(read_kwargs, 'constructor', 'filter')
                    read(bad_fp, format='qseq', verify=False,
                         into=constructor, **read_kwargs)
            for fragment in fragments:
                self.assertIn(fragment, str(ctx.exception))
def test_invalid_files(self):
    """Verify the error type and message produced by invalid qseq files.

    Runs once per legacy sequence class over the fixtures stored in
    ``self.invalid_files``.
    """
    legacy_classes = [
        BiologicalSequence,
        NucleotideSequence,
        DNASequence,
        RNASequence,
        ProteinSequence,
    ]
    for constructor in legacy_classes:
        for invalid_fp, option_sets, expected_msgs, etype in \
                self.invalid_files:
            with self.assertRaises(etype) as raised:
                # Only keyword sets up to the first failure are exercised.
                for opts in option_sets:
                    _drop_kwargs(opts, 'constructor', 'filter')
                    read(invalid_fp, format='qseq', verify=False,
                         into=constructor, **opts)
            for expected_msg in expected_msgs:
                self.assertIn(expected_msg, str(raised.exception))
def test_dna_iterator_to_dna_fasta_format(self):
    """Transform a ``DNAIterator`` to ``DNAFASTAFormat`` and round-trip it.

    The expected records are materialised in a list before the transformer
    runs. Previously the output was compared against the very iterator that
    had been handed to the transformer; if that iterator is single-pass
    (it wrapped the lazy result of ``skbio.read``), ``zip`` stops at once
    and the comparison loop checks nothing.
    """
    transformer = self.get_transformer(DNAIterator, DNAFASTAFormat)
    filepath = self.get_data_path('dna-sequences.fasta')
    exp = list(skbio.read(filepath, format='fasta', constructor=skbio.DNA))
    # Give the transformer its own lazy stream over the same records.
    seqs = DNAIterator(seq for seq in exp)

    obs = transformer(seqs)
    self.assertIsInstance(obs, DNAFASTAFormat)

    obs_seqs = list(skbio.read(str(obs), format='fasta',
                               constructor=skbio.DNA))
    # zip() truncates silently, so pin the number of records explicitly.
    self.assertEqual(len(obs_seqs), len(exp))
    for act, expected in zip(obs_seqs, exp):
        self.assertEqual(act, expected)
def test_pair_dna_sequences_directory_format_to_pair_dna_iterator(self):
    """Check the paired directory format -> ``PairedDNAIterator`` transform.

    The observed pairs are compared, in order, with pairs read directly
    from the left and right FASTA fixtures.
    """
    left_fn, right_fn = filenames = ('left-dna-sequences.fasta',
                                     'right-dna-sequences.fasta')
    _, obs = self.transform_format(PairedDNASequencesDirectoryFormat,
                                   PairedDNAIterator,
                                   filenames=filenames)

    exp_left = skbio.read(self.get_data_path(left_fn), format='fasta',
                          constructor=skbio.DNA)
    exp_right = skbio.read(self.get_data_path(right_fn), format='fasta',
                           constructor=skbio.DNA)
    expected_pairs = zip(exp_left, exp_right)

    for observed_pair, expected_pair in zip(obs, expected_pairs):
        self.assertEqual(observed_pair, expected_pair)
    self.assertIsInstance(obs, PairedDNAIterator)
def annotate(in_fp, in_fmt, out_dir, out_fmt, cpus, kingdom, force, config,
             cache=False):
    '''Annotate the sequences in the input file.

    Each sequence read from ``in_fp`` gets its features identified and its
    CDS annotated, and is then written to a single output file named
    ``<basename of in_fp without extension>.<out_fmt>`` inside ``out_dir``.

    Parameters
    ----------
    in_fp : str
        Input file path; its base name determines the output file name.
    in_fmt : str
        Input file format.
    out_dir : str
        Output file directory.
    out_fmt : str
        Output file format; also used as the output file extension.
    cpus : int
        Number of cpus to use. Not referenced in this function body.
    kingdom : str
        Kingdom index corresponding to database (i.e. virus, bacteria ...)
    force : boolean
        Force to overwrite.
    config : ``micronota.config.Configuration``
        Container for configuration options.
    cache : boolean
        If true, a ``DiamondCache`` is created and passed through every
        ``annotate_all_cds`` call; otherwise ``None`` is passed.
    '''
    _overwrite(out_dir, overwrite=force)
    makedirs(out_dir, exist_ok=force)

    stem, _ = splitext(basename(in_fp))
    out_fp = join(out_dir, '{p}.{f}'.format(p=stem, f=out_fmt))

    # The flag is replaced by the cache object itself (or None).
    cache = DiamondCache() if cache else None

    with open(out_fp, 'w') as out:
        for seq in read(in_fp, format=in_fmt):
            # Per-sequence directory for intermediate files: the sequence ID
            # with every non-alphanumeric character replaced by "_".
            safe_id = ''.join(
                ch if ch.isalnum() else '_' for ch in seq.metadata['id'])
            seq_dir = join(out_dir, safe_id)

            features = identify_all_features(seq, seq_dir, config)
            # The cache goes in and the (possibly updated) cache comes back.
            features, cache = annotate_all_cds(
                features, seq_dir, kingdom, config, cache=cache)

            seq.interval_metadata.concat(IntervalMetadata(features),
                                         inplace=True)
            seq.write(out, format=out_fmt)
def setUp(self):
    """Prepare the blastp fixture triple and a pre-filled ``DiamondCache``.

    ``self.blast`` is ``(aligner, query file path, expected result path)``.
    """
    super().setUp()
    aligner, query_fn = 'blastp', 'WP_009885814.faa'
    query_fp = get_data_path(query_fn)
    expected_fp = _get_named_data_path('%s.diamond' % query_fn)
    self.blast = (aligner, query_fp, expected_fp)

    cached = skbio.read(_get_named_data_path('cache.faa'), format='fasta')
    self.cache = DiamondCache(list(cached))
def fungi_from_fasta(fasta_fh, accession_fh, taxonomy_fh):
    """Filter SILVA sequences to keep only fungi.

    A sequence is kept when the mapping number looked up for its identifier
    is present in the parsed taxonomy map.

    Parameters
    ----------
    fasta_fh : filehandle
        Fasta file of aligned or unaligned SILVA sequences. Each sequence
        identifier must be an accession number.
    accession_fh : filehandle
        A tab-separated file mapping accession numbers to mapping numbers.
        This file should contain exactly two columns: accession number and
        mapping number.
    taxonomy_fh : filehandle
        A tab-separated file that identifies the taxonomy and rank of a
        mapping number in `accession_fh`. This file should contain exactly
        five columns beginning with taxonomy, mapping number and rank. The
        last two columns are ignored.

    Returns
    -------
    generator
        Yields the sequence objects produced by ``skbio.read``.

    Notes
    -----
    The accession lookup is a plain ``[]`` access, so a sequence whose
    identifier is not in the accession map raises rather than being skipped.
    """
    accessions = _parse_accession_map(accession_fh)
    fungal_taxa = _parse_taxonomy_map(taxonomy_fh)

    for record in skbio.read(fasta_fh, format="fasta"):
        if accessions[record.id] not in fungal_taxa:
            continue
        yield record
def _parse_fasta_dictionary(self):
    """Map each sequence ID in ``self.fasta_path`` to a typed sequence.

    Every record read from the FASTA file is passed through
    ``self.sequence_type`` and stored under its ``metadata["id"]``; a later
    record with the same ID replaces an earlier one.
    """
    to_sequence = self.sequence_type
    records = read(self.fasta_path, format="fasta")
    return {record.metadata["id"]: to_sequence(record) for record in records}
def filter_fasta(exp, filename, negate=False, inplace=False):
    '''Filter features from experiment based on fasta file

    Parameters
    ----------
    exp : Experiment
        the experiment to filter; its ``feature_metadata`` index is matched
        against the upper-cased sequences of the fasta file
    filename : str
        the fasta filename containing the sequences to use for filtering
    negate : bool (optional)
        False (default) to keep only sequences matching the fasta file, True
        to remove sequences in the fasta file.
    inplace : bool (optional)
        False (default) to create a copy of the experiment, True to filter
        inplace

    Returns
    -------
    newexp : Experiment
        filtered so contains only sequence present in exp and in the fasta
        file
    '''
    logger.debug('filter_fasta using file %s' % filename)
    okpos = []
    tot_seqs = 0
    for cseq in skbio.read(filename, format='fasta'):
        tot_seqs += 1
        cseq = str(cseq).upper()
        if cseq in exp.feature_metadata.index:
            okpos.append(exp.feature_metadata.index.get_loc(cseq))
    logger.debug('loaded %d sequences. found %d sequences in experiment'
                 % (tot_seqs, len(okpos)))
    if negate:
        # A fasta file may list the same sequence more than once, so okpos
        # is not guaranteed unique. Let setdiff1d de-duplicate instead of
        # promising uniqueness via assume_unique=True.
        okpos = np.setdiff1d(np.arange(len(exp.feature_metadata.index)),
                             okpos)

    newexp = exp.reorder(okpos, axis=1, inplace=inplace)
    return newexp
def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline, reads_per_batch: int = 0, n_jobs: int = 1, pre_dispatch: str = '2*n_jobs', confidence: float = 0.7, read_orientation: str = None) -> pd.DataFrame:
    """Classify reads and return a taxonomy table indexed by feature ID.

    ``reads_per_batch == 0`` requests auto-tuning of the batch size. The
    result has the columns ``Taxon`` and ``Confidence`` and an index named
    ``Feature ID``.
    """
    batch_size = reads_per_batch
    if batch_size == 0:
        batch_size = _autotune_reads_per_batch(reads, n_jobs)

    # Wrap the FASTA records in a DNAIterator before orientation detection.
    fasta_records = skbio.read(str(reads), format='fasta',
                               constructor=skbio.DNA)
    oriented = _autodetect_orientation(
        DNAIterator(fasta_records), classifier,
        read_orientation=read_orientation)

    predictions = predict(oriented, classifier, chunk_size=batch_size,
                          n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                          confidence=confidence)
    # Transpose the per-read triples into three parallel tuples.
    seq_ids, taxa, confidences = zip(*predictions)

    table = {'Taxon': taxa, 'Confidence': confidences}
    result = pd.DataFrame(table, index=seq_ids,
                          columns=['Taxon', 'Confidence'])
    result.index.name = 'Feature ID'
    return result
def _filter_sequence_ids(in_fp, out_fp, ids, negate=False):
    '''Filter the sequences of a FASTA file by their IDs.

    Parameters
    ----------
    in_fp : str
        Input FASTA file, as accepted by ``read``.
    out_fp : str
        Output FASTA file path; opened for writing (truncated).
    ids : container of str
        Sequence IDs to match; anything supporting the ``in`` operator.
    negate : bool
        False (default) filters away the sequences whose ID is in ``ids``.
        True keeps only those sequences. This flag was previously accepted
        but ignored.
    '''
    with open(out_fp, 'w') as out:
        for seq in read(in_fp, format='fasta', constructor=Sequence):
            matched = seq.metadata['id'] in ids
            # default: write non-matching records; negate: write matches
            if matched == bool(negate):
                write(seq, format='fasta', into=out)
def test_fastq_to_sequence(self):
    """Read a single FASTQ record into each sequence class and compare.

    ``self.valid_configurations`` supplies ``(files, kwargs list, expected
    components)``; each component is ``(id, description, sequence,
    quality)``. ``seq_num`` (1-based, default 1) picks the expected record.
    """
    constructors = [
        partial(Sequence),
        partial(DNA, validate=False),
        partial(RNA, validate=False),
        partial(Protein, validate=False),
    ]
    for constructor in constructors:
        for valid_files, kwarg_sets, components in \
                self.valid_configurations:
            for fastq_fp in valid_files:
                # skip empty file case since we cannot read a specific
                # sequence from an empty file
                if len(components) == 0:
                    continue

                for options in kwarg_sets:
                    _drop_kwargs(options, 'constructor')

                    seq_num = options.get('seq_num', 1)
                    seq_id, description, residues, quality = \
                        components[seq_num - 1][:4]
                    expected = constructor(
                        residues,
                        metadata={'id': seq_id, 'description': description},
                        positional_metadata={
                            'quality': np.array(quality, dtype=np.uint8)})

                    # ``into`` needs the class itself, not the partial.
                    observed = read(fastq_fp, into=constructor.func,
                                    format='fastq', verify=False, **options)
                    self.assertEqual(observed, expected)
def test_fastq_to_sequence(self):
    """Read one FASTQ record into each legacy sequence class and compare.

    Each expected component is indexed as ``(id, description, sequence,
    quality)``; ``seq_num`` (1-based, default 1) selects the record.
    """
    legacy_classes = (BiologicalSequence, NucleotideSequence, DNASequence,
                      RNASequence, ProteinSequence)
    for constructor in legacy_classes:
        for fastq_fp, kwarg_sets, components in self.valid_files:
            # skip empty file case since we cannot read a specific sequence
            # from an empty file
            if len(components) == 0:
                continue

            for options in kwarg_sets:
                _drop_kwargs(options, 'constructor')

                record = components[options.get('seq_num', 1) - 1]
                expected = constructor(record[2], id=record[0],
                                       description=record[1],
                                       quality=record[3])

                observed = read(fastq_fp, into=constructor, format='fastq',
                                verify=False, **options)
                self.assertTrue(observed.equals(expected))
def find_tree(npop: int, numerical_label: 'np.ndarray[int]', arr: 'np.ndarray[float]', ) -> TreeNode:
    """Find the tree topology from the centers of mass of the clusters.

    For ``npop == 2`` a fixed two-leaf Newick tree is returned. Otherwise
    the first ``npop + OFFSET`` columns of ``arr`` are averaged per label,
    the Euclidean distances between those centers form a distance matrix,
    and the neighbor-joining tree built from it is rooted at its midpoint.
    The rooted tree is printed (ASCII art and string form) and returned.

    ``numerical_label`` holds the assigned label of each row of ``arr``;
    labels are used directly as row indices and leaf names are
    ``'0'`` .. ``str(npop - 1)``.
    """
    if npop == 2:
        return read(StringIO('(0:0.1, 1:0.1);'), format='newick',
                    into=TreeNode)

    width = npop + OFFSET
    arr = arr[:, :width]

    # Center of mass of every labelled cluster.
    centers = np.zeros((npop, width))
    for label in set(numerical_label):
        members = np.where(numerical_label == label)[0]
        centers[label, :] = np.mean(arr[members, :], axis=0)

    # Pairwise Euclidean distances between the centers.
    distances = np.zeros((npop, npop))
    for row in range(npop):
        for col in range(npop):
            delta = centers[row] - centers[col]
            distances[row, col] = np.sqrt(np.sum(delta**2))

    leaf_ids = [str(k) for k in range(npop)]
    rooted = nj(DistanceMatrix(distances, leaf_ids)).root_at_midpoint()
    print(rooted.ascii_art())
    print(rooted)
    return rooted
def body_site(coords, mapping_file, output, filename, sample):
    """Generates a bodysite figure for a sample in the coordinates file

    All samples are drawn coloured by their ``TITLE_BODY_SITE`` group, then
    ``sample`` is drawn on top as a larger outlined dot. The figure is
    saved to ``os.path.join(output, filename)`` and closed.

    Raises
    ------
    ValueError
        If ``sample`` is not among the site IDs of the ordination results.
    """
    o = read(coords, into=OrdinationResults)

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file, restricted to the samples present in the ordination
    mf = pd.read_csv(mapping_file, sep='\t', dtype=str)
    mf.set_index('#SampleID', inplace=True)
    mf = mf.loc[o.site_ids]

    if sample not in o.site_ids:
        raise ValueError("Sample %s not found" % sample)

    paired = sns.color_palette('Paired', 12)
    color_hmp_fecal = paired[10]  # light brown
    color_agp_fecal = paired[11]  # dark brown
    color_hmp_oral = paired[0]    # light blue
    color_agp_oral = paired[1]    # dark blue
    color_hmp_skin = paired[2]    # light green
    color_agp_skin = paired[3]    # dark green

    grp_colors = {'AGP-FECAL': color_agp_fecal,
                  'AGP-ORAL': color_agp_oral,
                  'AGP-SKIN': color_agp_skin,
                  'HMP-FECAL': color_hmp_fecal,
                  'GG-FECAL': color_hmp_fecal,
                  'PGP-FECAL': color_hmp_fecal,
                  'HMP-ORAL': color_hmp_oral,
                  'PGP-ORAL': color_hmp_oral,
                  'HMP-SKIN': color_hmp_skin,
                  'PGP-SKIN': color_hmp_skin}

    # plot categories as 50 slices with random zorder
    # (dict.items() works on Python 2 and 3; iteritems() is Python 2 only)
    for grp, color in grp_colors.items():
        sub_coords = c_df[mf.TITLE_BODY_SITE == grp].values
        for i in np.array_split(sub_coords, 50):
            if i.size == 0:
                continue
            plt.scatter(i[:, 0], i[:, 1], color=color,
                        edgecolor=np.asarray(color)*0.6, lw=LINE_WIDTH,
                        alpha=ALPHA, zorder=np.random.rand())

    # plot participant's dot: white halo first, then the darker-edged dot
    sample_color = grp_colors[mf.loc[sample]['TITLE_BODY_SITE']]
    plt.scatter(c_df.loc[sample][0], c_df.loc[sample][1],
                color=sample_color,
                s=270, edgecolor='w', zorder=1, lw=LINE_WIDTH_WHITE)
    plt.scatter(c_df.loc[sample][0], c_df.loc[sample][1],
                color=sample_color,
                s=250, edgecolor=np.asarray(sample_color)*0.6,
                zorder=2, lw=LINE_WIDTH_BLACK)

    plt.axis('off')
    my_dpi = 72
    figsize = (1000 / my_dpi, 1000 / my_dpi)

    out_file = os.path.join(output, filename)
    # NOTE(review): ``figsize`` is forwarded to savefig exactly as before;
    # confirm the installed matplotlib accepts and honours it.
    plt.savefig(out_file, figsize=figsize, dpi=my_dpi)
    plt.close()
def test_valid_files(self):
    """Read one qseq record into each sequence class and compare.

    The expected object is built from the fixture component selected by
    ``seq_num`` (1-based, default 1), with all qseq header fields as
    metadata and the quality scores as ``uint8`` positional metadata.
    """
    metadata_keys = ('id', 'machine_name', 'run_number', 'lane_number',
                     'tile_number', 'x', 'y', 'index', 'read_number')

    for constructor in (Sequence, DNA, RNA, Protein):
        for qseq_fp, kwarg_sets, components in self.valid_files:
            for observed_kwargs in kwarg_sets:
                # Currently not validating the alphabet for qseq
                # files that are read in for this test.
                if hasattr(constructor, 'alphabet'):
                    observed_kwargs['validate'] = False
                    expected_kwargs = {'validate': False}
                else:
                    expected_kwargs = {}
                _drop_kwargs(observed_kwargs, 'constructor', 'filter')

                record = components[observed_kwargs.get('seq_num', 1) - 1]
                expected = constructor(
                    record['sequence'],
                    metadata={key: record[key] for key in metadata_keys},
                    positional_metadata={
                        'quality': np.array(record['quality'], np.uint8)},
                    **expected_kwargs)

                observed = read(qseq_fp, into=constructor, format='qseq',
                                verify=False, **observed_kwargs)
                self.assertEqual(observed, expected)
def _annotate_fp(self, fp, aligner='blastp', evalue=0.001, cpus=1, outfmt='tab', params=None) -> pd.DataFrame:
    '''Annotate the sequences in the file.

    The databases in ``self.dat`` are searched in order. After each search,
    the query sequences without a hit so far are written to a temporary
    FASTA file, which becomes the query for the next database. The search
    stops early once that file is empty.

    Parameters
    ----------
    fp : str
        Query FASTA file path.
    aligner : str
        Aligner name passed to ``run_blast``.
    evalue : float
        E-value passed to ``run_blast``.
    cpus : int
        Number of cpus passed to ``run_blast``.
    outfmt : str
        Value of the ``--outfmt`` option passed to ``run_view``.
    params : dict-like
        Parameters for diamond blastp/blastx that pass to ``run_blast``.

    Returns
    -------
    pandas.DataFrame
        The parsed tabular results of all searched databases, concatenated
        in search order.
    '''
    # A set gives O(1) membership tests and does not re-accumulate the
    # indices of earlier rounds.
    found = set()
    res = pd.DataFrame()
    for db in self.dat:
        out_prefix = splitext(basename(db))[0]
        daa_fp = join(self.out_dir, '%s.daa' % out_prefix)
        out_fp = join(self.out_dir, '%s.diamond' % out_prefix)
        self.run_blast(fp, daa_fp, db, aligner=aligner,
                       evalue=evalue, cpus=cpus, params=params)
        self.run_view(daa_fp, out_fp, params={'--outfmt': outfmt})
        # pd.concat is the replacement for the removed DataFrame.append
        res = pd.concat([res, self.parse_tabular(out_fp)])
        found.update(res.index)

        # save to a tmp file the seqs that do not hit current database
        new_fp = join(self.tmp_dir, '%s.fa' % out_prefix)
        with open(new_fp, 'w') as f:
            for seq in read(fp, format='fasta'):
                if seq.metadata['id'] not in found:
                    seq.write(f, format='fasta')

        # no seq left
        if stat(new_fp).st_size == 0:
            break
        fp = new_fp
    return res
def read(file_name, file_format='newick'):
    """Read the contents of a file.

    Only ``'newick'`` is handled: the file is parsed into a ``TreeNode``.
    Any other ``file_format`` returns ``None``.
    """
    if file_format != 'newick':
        return None
    return skbio.read(file_name, file_format, into=TreeNode)
def test_dna_fasta_format_to_dna_iterator(self):
    """Check the ``DNAFASTAFormat`` -> ``DNAIterator`` transformation.

    The transformed iterator is compared, record by record, with the
    sequences read directly from the source format.
    """
    source, obs = self.transform_format(DNAFASTAFormat, DNAIterator,
                                        filename='dna-sequences.fasta')

    expected_seqs = skbio.read(str(source), format='fasta',
                               constructor=skbio.DNA)

    for obs_seq, exp_seq in zip(obs, expected_seqs):
        self.assertEqual(obs_seq, exp_seq)
def set_tree_from_input(asd_file, simulation) -> TreeNode:
    """Build the tree from the given topology.

    ``simulation.topology`` is parsed as a Newick string into a
    ``TreeNode``. The raw topology and the ASCII rendering of the parsed
    tree are printed.

    Only the tree is returned. The earlier annotation and docstring
    advertised a tuple that also held population sizes and distance-matrix
    blocks, which this function never produced. ``asd_file`` is accepted
    but not used here.
    """
    print(simulation.topology)
    tree = read(StringIO(simulation.topology), format='newick',
                into=TreeNode)
    print(tree.ascii_art())
    return tree
def sort_uniref(db_fp, uniref_fp, out_d, resolution, force=False):
    '''Sort UniRef sequences into different partitions.

    Each sequence is written to one FASTA file in ``out_d`` chosen by the
    status and kingdom recorded for its accession in the metadata database.
    One file is opened per combination of ``_status`` and ``_kingdom``
    entries and named ``uniref<resolution>_<status>_<kingdom>.fasta``.
    Sequences whose accession has no database row go to
    ``uniref<resolution>__other.fasta``. ``make_db`` is then run on every
    non-empty output file.

    Parameters
    ----------
    db_fp : str
        The database file created by ``prepare_metadata``.
    uniref_fp : str
        The UniRef fasta file. gzipped or not.
    out_d : str
        The output directory to place the resulting fasta files.
    resolution : int
        The UniRef resolution (e.g. 100); used in the output file names and
        to strip the ``UniRef<resolution>_`` prefix from sequence IDs.
    force : bool
        Passed to ``_overwrite`` for the output directory.
    '''
    _overwrite(out_d, force)
    makedirs(out_d)
    logger = getLogger(__name__)
    logger.info('Sorting UniRef sequences')
    fns = ['%s_%s' % (i, j) for i, j in product(_status, _kingdom)]
    fns.append('_other')
    fps = [join(out_d, 'uniref%d_%s.fasta' % (resolution, f)) for f in fns]

    files = {}
    try:
        # Open inside the try block so already-opened handles are closed
        # even if a later open(), the read or the query fails.
        for fn, fp in zip(fns, fps):
            files[fn] = open(fp, 'w')

        id_prefix = 'UniRef%d_' % resolution
        with connect(db_fp) as conn:
            cursor = conn.cursor()
            for seq in read(uniref_fp, format='fasta', constructor=Sequence):
                ac = seq.metadata['id'].replace(id_prefix, '')
                # default partition for accessions without a metadata row
                group = ['', 'other']
                cursor.execute('''SELECT * FROM metadata WHERE ac = ?''',
                               (ac,))
                for _, s, k in cursor.fetchall():
                    group[0] = _status[s]
                    group[1] = _kingdom[k]
                seq.write(files['_'.join(group)])
    finally:
        for handle in files.values():
            handle.close()

    for fp in fps:
        # if the fasta file is not empty
        if stat(fp).st_size > 0:
            make_db(fp)
def gradient(coords, mapping_file, color, output, filename, sample):
    """Generates a gradient-coloured figure for a sample in the coordinates

    All samples with a numeric value in the ``color`` column are drawn with
    the ``RdBu`` colormap; samples without one are drawn in gray. ``sample``
    is then drawn on top as a larger outlined dot. The figure is saved to
    ``os.path.join(output, filename)`` and closed.

    Raises
    ------
    ValueError
        If ``sample`` is not among the site IDs of the ordination results.
    """
    o = read(coords, into=OrdinationResults)

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file (``sep`` must be passed by keyword on pandas >= 2.0)
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     dtype=str)
    mf.set_index('#SampleID', inplace=True)
    mf = mf.loc[o.site_ids]
    # Non-numeric entries become NaN. pd.to_numeric replaces the removed
    # Series.convert_objects(convert_numeric=True).
    mf[color] = pd.to_numeric(mf[color], errors='coerce')

    if sample not in o.site_ids:
        raise ValueError("Sample %s not found" % sample)

    numeric = mf[~pd.isnull(mf[color])]
    non_numeric = mf[pd.isnull(mf[color])]

    color_array = plt.cm.RdBu(numeric[color]/max(numeric[color]))

    # plot numeric metadata as colored gradient
    ids = numeric.index
    x, y = c_df.loc[ids][0], c_df.loc[ids][1]
    plt.scatter(x, y, c=numeric[color], cmap=plt.get_cmap('RdBu'),
                alpha=ALPHA, lw=LINE_WIDTH, edgecolor=color_array*0.6)

    # plot non-numeric metadata as gray
    ids = non_numeric.index
    x, y = c_df.loc[ids][0], c_df.loc[ids][1]
    plt.scatter(x, y, c='0.5', alpha=ALPHA, lw=LINE_WIDTH, edgecolor='0.3')

    # plot individual's dot; gray when the sample has no numeric value
    try:
        color_index = numeric.index.tolist().index(sample)
    except ValueError:
        color_index = None

    if color_index is None:
        _color = (0.5, 0.5, 0.5)
    else:
        _color = color_array[color_index]

    plt.scatter(c_df.loc[sample][0], c_df.loc[sample][1],
                color=_color, s=270, edgecolor='w', lw=LINE_WIDTH_WHITE)
    plt.scatter(c_df.loc[sample][0], c_df.loc[sample][1],
                color=_color, s=250, edgecolor=np.asarray(_color)*0.6,
                lw=LINE_WIDTH_BLACK)

    plt.axis('off')
    my_dpi = 72
    figsize = (1000 / my_dpi, 1000 / my_dpi)

    out_file = os.path.join(output, filename)
    # NOTE(review): ``figsize`` is forwarded to savefig exactly as before;
    # confirm the installed matplotlib accepts and honours it.
    plt.savefig(out_file, figsize=figsize, dpi=my_dpi)
    plt.close()
def sort_uniref(db_fp, uniref_fp, out_d, resolution, force=False):
    '''Sort UniRef sequences into different partitions.

    Each sequence is written to one FASTA file in ``out_d`` chosen by the
    status and kingdom recorded for its accession in the metadata database.
    One file is opened per combination of ``_status`` and ``_kingdom``
    entries and named ``uniref<resolution>_<status>_<kingdom>.fasta``.
    Sequences whose accession has no database row go to
    ``uniref<resolution>__other.fasta``. ``run_makedb`` is then run on
    every non-empty output file.

    Parameters
    ----------
    db_fp : str
        The database file created by ``prepare_metadata``.
    uniref_fp : str
        The UniRef fasta file. gzipped or not.
    out_d : str
        The output directory to place the resulting fasta files.
    resolution : int
        The UniRef resolution (e.g. 100); used in the output file names and
        to strip the ``UniRef<resolution>_`` prefix from sequence IDs.
    force : bool
        Passed to ``_overwrite`` for the output directory.
    '''
    _overwrite(out_d, force)
    makedirs(out_d)
    logger = getLogger(__name__)
    logger.info('Sorting UniRef sequences')
    fns = ['%s_%s' % (i, j) for i, j in product(_status, _kingdom)]
    fns.append('_other')
    fps = [join(out_d, 'uniref%d_%s.fasta' % (resolution, f)) for f in fns]

    files = {}
    try:
        # Open inside the try block so already-opened handles are closed
        # even if a later open(), the read or the query fails.
        for fn, fp in zip(fns, fps):
            files[fn] = open(fp, 'w')

        id_prefix = 'UniRef%d_' % resolution
        with connect(db_fp) as conn:
            cursor = conn.cursor()
            for seq in read(uniref_fp, format='fasta', constructor=Sequence):
                ac = seq.metadata['id'].replace(id_prefix, '')
                # default partition for accessions without a metadata row
                group = ['', 'other']
                cursor.execute(
                    '''SELECT * FROM metadata WHERE ac = ?''', (ac, ))
                for _, s, k in cursor.fetchall():
                    group[0] = _status[s]
                    group[1] = _kingdom[k]
                seq.write(files['_'.join(group)])
    finally:
        for handle in files.values():
            handle.close()

    for fp in fps:
        # if the fasta file is not empty
        if stat(fp).st_size > 0:
            run_makedb(fp)
def test_save_fasta(self):
    """Save the experiment features as FASTA and read them back.

    The sequences written by ``save_fasta`` must match the feature
    metadata index, regardless of order.
    """
    exp = ca.read(self.test1_biom, self.test1_samp, normalize=None)
    d = mkdtemp()
    try:
        f = join(d, 'test1.fasta')
        exp.save_fasta(f)
        seqs = [str(seq) for seq in skbio.read(f, format='fasta')]
        self.assertCountEqual(seqs, exp.feature_metadata.index.values)
    finally:
        # Remove the temp dir even when saving or the assertion fails.
        shutil.rmtree(d)
def test_pair_dna_iterator_to_pair_dna_sequences_directory_format(self):
    """Check the ``PairedDNAIterator`` -> paired directory format transform.

    The left/right fixture sequences are materialised in lists first. They
    used to be lazy ``skbio.read`` results that the transformer consumed,
    so the final comparison zipped against exhausted iterators and
    asserted nothing.
    """
    transformer = self.get_transformer(PairedDNAIterator,
                                       PairedDNASequencesDirectoryFormat)

    l_seqs = list(skbio.read(self.get_data_path('left-dna-sequences.fasta'),
                             format='fasta', constructor=skbio.DNA))
    r_seqs = list(skbio.read(self.get_data_path('right-dna-sequences.fasta'),
                             format='fasta', constructor=skbio.DNA))
    pairs = PairedDNAIterator(zip(l_seqs, r_seqs))

    obs = transformer(pairs)

    obs_l = skbio.read('%s/left-dna-sequences.fasta' % str(obs),
                       format='fasta', constructor=skbio.DNA)
    obs_r = skbio.read('%s/right-dna-sequences.fasta' % str(obs),
                       format='fasta', constructor=skbio.DNA)

    # The lists can be zipped again; the one-shot zip given to the
    # transformer cannot.
    for act, exp in zip(zip(obs_l, obs_r), zip(l_seqs, r_seqs)):
        self.assertEqual(act, exp)
    self.assertIsInstance(obs, PairedDNASequencesDirectoryFormat)
def read_qiime2(fp, sample_metadata_file=None, rep_seq_file=None, taxonomy_file=None, **kwargs):
    '''Read a qiime2 feature table and additional optional artifact files
    (representative sequences and taxonomy) into a Calour.AmpliconExperiment

    Parameters
    ----------
    fp: str
        name of the qiime2 feature table .qza artifact file
    sample_metadata_file : None or str, optional
        None (default) to just use sample names (no additional metadata).
        if not None, file path to the sample metadata (aka mapping file in QIIME).
    rep_seq_file: None or str, optional
        None (default) to use the feature ids in the feature table
        if not None, file path to the qiime2 representative sequences artifact file
        (defined by the qiime2 --o-representative-sequences parameter)
    taxonomy_file: None or str, optional
        if not None, add taxonomy for each feature using the qiime2 taxonomy artifact file
        (output of the qiime2 feature-classifier command)

    Keyword Arguments
    -----------------
    %(io.read.parameters)s

    Returns
    -------
    The experiment returned by ``read_amplicon``, with its feature metadata
    updated from the optional rep-seqs and taxonomy artifacts.
    '''
    newexp = read_amplicon(fp, sample_metadata_file=sample_metadata_file, data_file_type='qiime2', **kwargs)

    # artifact members are extracted into a temp dir that is removed on exit
    with tempfile.TemporaryDirectory() as tempdir:
        # if rep-seqs file is supplied, translate hashes to sequences
        if rep_seq_file is not None:
            logger.debug('loading rep_seqs file %s' % rep_seq_file)
            rs_name = _file_from_zip(tempdir, rep_seq_file, internal_data='data/dna-sequences.fasta')
            # parallel lists: upper-cased sequence and the id it was stored under
            rseqs = []
            rids = []
            for cseq in skbio.read(rs_name, format='fasta'):
                rseqs.append(str(cseq).upper())
                rids.append(cseq.metadata['id'])
            rep_seqs = pd.Series(data=rseqs, index=rids, name='_feature_id')

            # test if all hashes are identical to the rep_seqs file supplied
            # (a mismatch is only logged; processing continues)
            if not newexp.feature_metadata.index.equals(rep_seqs.index):
                logger.info('Rep seqs hashes and table hashes are not equal. Using table hashes.')

            # switch the columns so now _feature_id (and the index) is the sequence and not the hash.
            # The hash is copied to '_hash'
            newexp.feature_metadata.rename(columns={'_feature_id': '_hash'}, inplace=True)
            # left join: features whose hash is absent from rep_seqs get a missing _feature_id
            newexp.feature_metadata = newexp.feature_metadata.join(other=rep_seqs, on='_hash', how='left')
            newexp.feature_metadata.set_index('_feature_id', inplace=True, drop=False)

        # if taxonomy file is supplied, load it into the feature metadata
        if taxonomy_file is not None:
            logger.debug('loading taxonomy file %s' % taxonomy_file)
            tax_name = _file_from_zip(tempdir, taxonomy_file, internal_data='data/taxonomy.tsv')
            taxonomy_df = pd.read_table(tax_name)
            taxonomy_df.set_index('Feature ID', inplace=True)
            # first attempt: join on the current feature index
            newexp.feature_metadata = newexp.feature_metadata.join(other=taxonomy_df, how='left')
            if len(newexp.feature_metadata.index.intersection(taxonomy_df.index)) == 0:
                logger.info('No matching sequences in taxonomy file.')
                if '_hash' in newexp.feature_metadata.columns:
                    logger.info('Trying to use hashes for taxonomy')
                    # drop the columns added by the failed join, then join again on the hash column
                    newexp.feature_metadata = newexp.feature_metadata.drop(taxonomy_df.columns, axis=1)
                    newexp.feature_metadata = newexp.feature_metadata.join(other=taxonomy_df, on='_hash', how='left')
    return newexp
def test_filter_partial_genes(self):
    """Check ``filter_partial_genes`` on a small two-sequence GFF3 file.

    ``seq1`` has one interval with ``partial`` ``'01'``; ``seq2`` has one
    with ``'10'`` and one with ``'00'``. The records read back from the
    output are compared with ``seq2`` carrying only its ``'00'`` interval.
    """
    def prodigal(partial, phase, strand='.', type_='.', score='.'):
        # Fresh dict per call so no two intervals share a metadata object.
        return {'partial': partial, 'phase': phase,
                'source': 'Prodigal_v2.6.3', 'strand': strand,
                'type': type_, 'score': score}

    in_fp = join(self.tmpd, 'in.gff')
    out_fp = join(self.tmpd, 'out.gff')

    imd1 = IntervalMetadata(None)
    imd1.add([(0, 100)], metadata=prodigal('01', 0))

    imd2 = IntervalMetadata(None)
    imd2.add([(200, 300)], metadata=prodigal('10', 1, '-', 'CDS', '1'))
    imd2.add([(2000, 3000)], metadata=prodigal('00', 1))

    imd3 = IntervalMetadata(None)
    imd3.add([(2000, 3000)], metadata=prodigal('00', 1))

    data = (('seq1', imd1), ('seq2', imd2))
    write(((sid, imd) for sid, imd in data), into=in_fp, format='gff3')

    filter_partial_genes(in_fp, out_fp)

    expected = [('seq2', imd3)]
    observed = read(out_fp, format='gff3')
    for obs_record, exp_record in zip(observed, expected):
        self.assertEqual(obs_record, exp_record)
def _annotate_fp(self, fp, aligner='blastp', evalue=0.001, cpus=1, outfmt='sam', params=None):
    '''Annotate the sequences in the file.

    When a non-empty cache is available it is built and searched first,
    followed by the databases in ``self.dat``. After each search, the query
    sequences without a hit so far are written to a temporary FASTA file,
    which becomes the query for the next database. The search stops early
    once that file is empty. With SAM output and a cache, the subject
    sequences of all hits are used to update the cache, which is then
    closed.

    Parameters
    ----------
    fp : str
        Query FASTA file path.
    aligner : str
        Aligner name passed to ``run_blast``.
    evalue : float
        E-value passed to ``run_blast``.
    cpus : int
        Number of cpus passed to ``run_blast``.
    outfmt : str
        ``'tab'`` or ``'sam'``; selects the parser and the hit filter. Any
        other value leaves the result empty.
    params : dict-like
        Parameters for diamond blastp/blastx that pass to ``run_blast``.

    Returns
    -------
    pandas.DataFrame
        The filtered hits of all searched databases, in search order.
    '''
    if self.has_cache() and not self.cache.is_empty():
        self.cache.build()
        dbs = [self.cache.db] + self.dat
    else:
        dbs = self.dat

    seqs = []
    found = set()
    res = pd.DataFrame()
    logger = getLogger(__name__)
    for db in dbs:
        out_prefix = splitext(basename(db))[0]
        daa_fp = join(self.out_dir, '%s.daa' % out_prefix)
        out_fp = join(self.out_dir, '%s.diamond' % out_prefix)
        self.run_blast(fp, daa_fp, db, aligner=aligner,
                       evalue=evalue, cpus=cpus, params=params)
        self.run_view(daa_fp, out_fp, params={'--outfmt': outfmt})
        # pd.concat is the replacement for the removed DataFrame.append
        if outfmt == 'tab':
            res = pd.concat(
                [res, self._filter_best(self.parse_tabular(out_fp))])
        elif outfmt == 'sam':
            res = pd.concat(
                [res, self._filter_id_cov(self.parse_sam(out_fp))])

        # save to a tmp file the seqs that do not hit current database
        new_fp = join(self.tmp_dir, '%s.fa' % out_prefix)
        found.update(res.index)
        with open(new_fp, 'w') as f:
            for seq in read(fp, format='fasta'):
                if seq.metadata['id'] not in found:
                    seq.write(f, format='fasta')
        logger.info('Number of diamond hits: %d' % len(res.index))

        # no seq left
        if stat(new_fp).st_size == 0:
            break
        fp = new_fp

    # Collect the subject sequences of the hits to feed back into the cache.
    if outfmt == 'sam' and self.has_cache():
        for x in res.index:
            seqs.append(
                Sequence(res.loc[x, 'sseq'],
                         metadata={'id': res.loc[x, 'sseqid']}))

    # Update cache (inplace)
    if self.has_cache():
        self.cache.update(seqs)
        self.cache.close()
    return res
def setUp(self):
    """Prepare the aligner test cases and a pre-filled ``DiamondCache``.

    ``self.tests`` holds one ``Test(aligner, input, exp)`` per aligner:
    the query file path and the path of its expected diamond output.
    """
    super().setUp()
    Test = namedtuple('Test', ['aligner', 'input', 'exp'])
    cases = [('blastp', 'WP_009885814.faa'),
             ('blastx', 'WP_009885814.fna')]

    self.tests = []
    for aligner, query_fn in cases:
        query_fp = get_data_path(query_fn)
        expected_fp = _get_named_data_path('%s.diamond' % query_fn)
        self.tests.append(Test(aligner, query_fp, expected_fp))

    cached = skbio.read(_get_named_data_path('cache.faa'), format='fasta')
    self.cache = DiamondCache(list(cached))
def subsample_dm(distmat, mapping_file, max, category, output):
    """Subsample the distmat to max samples per category value

    The mapping file is read tab-separated and indexed by ``#SampleID``.
    Every ID in the distance matrix must appear in it, because the bin
    lookup is a plain ``[]`` access. The filtered matrix is written to
    ``output``.
    """
    # ``sep`` must be passed by keyword: pandas >= 2.0 rejects it
    # positionally.
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     index_col='#SampleID')
    id_to_cat = dict(mf[category])

    def bin_f(x):
        # bin each sample ID by its value in the category column
        return id_to_cat[x]

    dm = read(distmat, into=DistanceMatrix)
    dm = dm.filter([sample_id for _, sample_id
                    in isubsample(dm.ids, max, bin_f=bin_f)])
    dm.to_file(output)
def test_valid_files(self):
    """Read one qseq record into each legacy sequence class and compare.

    Each expected component is indexed as ``(id, sequence, quality)``;
    ``seq_num`` (1-based, default 1) selects the record.
    """
    legacy_classes = (BiologicalSequence, NucleotideSequence, DNASequence,
                      RNASequence, ProteinSequence)
    for constructor in legacy_classes:
        for qseq_fp, kwarg_sets, components in self.valid_files:
            for options in kwarg_sets:
                _drop_kwargs(options, "constructor", "filter")

                record = components[options.get("seq_num", 1) - 1]
                expected = constructor(record[1], id=record[0],
                                       quality=record[2])

                observed = read(qseq_fp, into=constructor, format="qseq",
                                verify=False, **options)
                self.assertTrue(observed.equals(expected))
def setUp(self):
    """Build the per-aligner fixtures and a ``DiamondCache`` of cached seqs.

    Each ``Test`` record carries the aligner name, the query file path and
    the path of the expected diamond output for that query.
    """
    super().setUp()
    cases = (('blastp', 'WP_009885814.faa'),
             ('blastx', 'WP_009885814.fna'))
    Test = namedtuple('Test', ['aligner', 'input', 'exp'])

    def make_test(aligner, query_fn):
        return Test(aligner,
                    get_data_path(query_fn),
                    _get_named_data_path('%s.diamond' % query_fn))

    self.tests = [make_test(*case) for case in cases]

    cache_fp = _get_named_data_path('cache.faa')
    self.cache = DiamondCache(list(skbio.read(cache_fp, format='fasta')))
def sequence_generator(input_fp):
    """Yield (id, sequence) from an input file

    Parameters
    ----------
    input_fp : filepath
        A filepath, which can be any valid fasta or fastq file within the
        limitations of scikit-bio's IO registry.

    Notes
    -----
    The use of this method is a stopgap to replicate the existing
    `parse_fasta` functionality while at the same time allowing for fastq
    support.

    Warns
    -----
    UserWarning
        If the input is sniffed as neither FASTA nor FASTQ. The message is
        also logged and no sequences are yielded.

    Returns
    -------
    (str, str)
        The ID and sequence.
    """
    logger = logging.getLogger(__name__)
    kw = {}
    if sniff_fasta(input_fp)[0]:
        fmt = 'fasta'
    elif sniff_fastq(input_fp)[0]:
        fmt = 'fastq'
        # WARNING: the variant is currently forced to illumina 1.8 as the
        # quality scores are _not_ used in downstream processing. However, if
        # in the future, quality scores are to be interrogated, it is critical
        # that this variant parameter be exposed to the user at the command
        # line. The list of allowable parameters can be found here:
        # http://scikit-bio.org/docs/latest/generated/skbio.io.format.fastq.html#format-parameters
        kw['variant'] = 'illumina1.8'
    else:
        # usually happens when the fasta file is empty
        # so need to return no sequences (and warn)
        msg = "input file %s does not appear to be FASTA or FASTQ" % input_fp
        # Logger.warn is a deprecated alias of Logger.warning
        logger.warning(msg)
        warnings.warn(msg, UserWarning)
        return

    # some of the test code is using file paths, some is using StringIO.
    # Rewind text handles because the sniffers above have read from them.
    if isinstance(input_fp, io.TextIOBase):
        input_fp.seek(0)

    for record in skbio.read(input_fp, format=fmt, **kw):
        yield (record.metadata['id'], str(record))
def test_valid_files(self):
    """Compare qseq records read into legacy classes with the fixtures.

    The record at position ``seq_num`` (1-based, default 1) of the expected
    components supplies ``id`` (index 0), sequence (index 1) and quality
    (index 2).
    """
    constructors = [
        BiologicalSequence,
        NucleotideSequence,
        DNASequence,
        RNASequence,
        ProteinSequence,
    ]
    for constructor in constructors:
        for valid_fp, option_sets, components in self.valid_files:
            for opts in option_sets:
                _drop_kwargs(opts, 'constructor', 'filter')

                position = opts.get('seq_num', 1) - 1
                seq_id, residues, quality = components[position][:3]
                expected = constructor(residues, id=seq_id, quality=quality)

                observed = read(valid_fp, into=constructor, format='qseq',
                                verify=False, **opts)
                self.assertTrue(observed.equals(expected))
def test_summarize(self):
    """Summarize two annotated sequences and compare with the fixture.

    Interval metadata read from ``summarize.gff`` is attached, in file
    order, to two synthetic DNA sequences. The text written by
    ``summarize`` must equal ``summarize.txt``.
    """
    gff = get_data_path('summarize.gff')
    specs = (('A' * 5000000, 'gi|556503834|ref|NC_000913.3|'),
             ('AG' * 2500000, 'gi|556503834|ref|NC_000913.2|'))
    seqs = [DNA(residues, metadata={'id': seq_id})
            for residues, seq_id in specs]

    records = read(gff, format='gff3')
    for (_, interval_md), seq in zip(records, seqs):
        seq.interval_metadata = interval_md

    with StringIO() as obs, open(get_data_path('summarize.txt')) as exp:
        summarize(seqs, obs)
        written = obs.getvalue()
        self.assertEqual(written, exp.read())
def parse_sam(diamond_res, column=None, collapse=False):
    '''Parse the SAM output of diamond blastp/blastx.

    Parameters
    ----------
    diamond_res : str
        file path
    column : str
        The column used to pick the best hits. If given, only the row with
        the maximum value of this column is kept per query, indexed by
        query ID. If None, all rows are returned.
    collapse : bool
        Not referenced in this function body.

    Returns
    -------
    pandas.DataFrame
        The ``sseqid``, ``evalue``, ``bitscore`` and ``sequence`` columns
        of the parsed (and optionally best-hit filtered) records.
    '''
    columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
               'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue',
               'bitscore', 'sequence']
    # Metadata tag that fills each column, in column order. ``None`` marks
    # a column left as the empty string; 'sequence' is appended separately.
    tags = ('QNAME', 'RNAME', 'ZI', 'ZL', 'CIGAR', None, 'POS', None,
            'ZS', None, 'ZE', 'ZR')
    kept = ['sseqid', 'evalue', 'bitscore', 'sequence']

    records = read(diamond_res, format='sam')
    df = pd.DataFrame(columns=columns)
    for row_no, record in enumerate(records):
        residues = str(record)
        values = ['' if tag is None else record.metadata[tag]
                  for tag in tags]
        values.append(residues)
        df.loc[row_no] = pd.Series(values, index=columns)

    if column is None:
        return df[kept]

    # Best hit per query: row holding the maximum of ``column``.
    idx = df.groupby('qseqid')[column].idxmax()
    best = df.loc[idx]
    best.index = idx.index
    return best[kept]
def main(argv):
    """Remove the sOTUs listed in a fasta file from a biom table.

    Parses ``argv``, loads the fasta sequences and the biom table and,
    unless disabled, trims the sequences to the shortest observation ID
    length of the table. At most ``--number`` sequences are used. The
    sequences are removed from the table, which is saved as HDF5.
    """
    parser = argparse.ArgumentParser(
        description='Filter sequences from biom table using a fasta file. Version ' + __version__)
    parser.add_argument('-i', '--inputtable',
                        help='input biom table file name')
    parser.add_argument('-o', '--output', help='output biom file name')
    parser.add_argument('-f', '--fasta', help='filtering fasta file name')
    parser.add_argument(
        '-n', '--number', default=-1, type=int,
        help='number of sOTUs from the fasta file to use (-1 means all)')
    parser.add_argument(
        '--ignore_table_seq_length', action='store_true',
        help="don't trim the fasta file sequences to the biom table sequence length")
    args = parser.parse_args(argv)

    seqs = skbio.read(args.fasta, format='fasta')
    table = biom.load_table(args.inputtable)
    totorigreads = table.sum(axis='whole')
    print('loaded biom table %s containing %d unique sOTUs'
          % (args.inputtable, table.shape[0]))

    # Shortest observation ID in the table; used as the trim length.
    length = min(len(obs_id) for obs_id in table.ids(axis='observation'))
    if not args.ignore_table_seq_length:
        seqs = trim_seqs(seqs, seqlength=length)

    # if need to remove only a subset of the sOTUs from the fasta file
    seqs = list(seqs)
    if 0 <= args.number < len(seqs):
        seqs = seqs[:args.number]

    print('filtering %d sOTUs (from file %s)' % (len(seqs), args.fasta))
    outtable = remove_seqs(table, seqs)
    totfilteredreads = outtable.sum(axis='whole')
    print('removed %d reads (from %d to %d)'
          % (totorigreads - totfilteredreads, totorigreads,
             totfilteredreads))

    print('saving filtered biom table with %d sOTUs to file %s'
          % (outtable.shape[0], args.output))
    with biom.util.biom_open(args.output, 'w') as f:
        outtable.to_hdf5(f, "filterbiomseqs")
def test_valid_files(self):
    """Reading each valid qseq file yields the expected sequence object.

    Every constructor is exercised against every file/kwargs combination
    in ``self.valid_files``; the record selected by ``seq_num`` (first
    record by default) is compared with ``equals``.
    """
    constructors = [partial(Sequence),
                    partial(DNA, validate=False),
                    partial(RNA, validate=False),
                    partial(Protein, validate=False)]
    for make_expected in constructors:
        for valid, kwargs, components in self.valid_files:
            for kwarg in kwargs:
                _drop_kwargs(kwarg, 'constructor', 'filter')

                # seq_num is 1-based; components is indexed from 0.
                record = components[kwarg.get('seq_num', 1) - 1]
                expected = make_expected(record[1], id=record[0],
                                         quality=record[2])

                # read() is given the underlying class, not the partial.
                observed = read(valid, into=make_expected.func,
                                format='qseq', verify=False, **kwarg)
                self.assertTrue(observed.equals(expected))
def check_seq(in_seq, in_fmt=None, discard=lambda s: len(s) < 500):
    '''Filter and validate input sequences, yielding the accepted ones.

    For every input sequence this generator:

    1. removes gaps (``degap``);
    2. skips the sequence when ``discard`` returns true for it;
    3. checks that it has an ID and that the ID was not seen before.

    Parameters
    ----------
    in_seq : str or Iterable of ``Sequence`` objects
        Input seq file path if it is a str; otherwise the sequences
        themselves.
    in_fmt : str
        The format of the seq file. For ``'genbank'`` the ID is taken
        from the ``LOCUS`` metadata (``locus_name``).
    discard : callable
        Applied to each degapped ``Sequence``; a true result drops the
        sequence. By default sequences shorter than 500 are dropped.

    Yields
    ------
    ``Sequence`` object

    Raises
    ------
    KeyError
        If a sequence has no ``id`` in its metadata.
    ValueError
        If two kept sequences share the same ID.

    TODO
    ----
    add an option to ignore the abnormal seq and continue yielding
    '''
    logger.info('Filter and validate input sequences')

    if isinstance(in_seq, str):
        # allow lowercase in DNA seq
        in_seq = read(in_seq, format=in_fmt, constructor=DNA,
                      lowercase=True)

    seen = set()
    from_genbank = in_fmt == 'genbank'
    for raw in in_seq:
        seq = raw.degap()
        if discard(seq):
            continue

        if from_genbank:
            seq.metadata['id'] = seq.metadata['LOCUS']['locus_name']

        try:
            ident = seq.metadata['id']
        except KeyError:
            raise KeyError('Ill input file format: at least one sequences '
                           'do not have IDs.')

        if ident in seen:
            raise ValueError(
                'Duplicate seq IDs in your input file: {}'.format(ident))
        seen.add(ident)
        yield seq
def subsample_dm(distmat, mapping_file, max, category, output):
    """Subsample the distmat to max samples per category value.

    Parameters
    ----------
    distmat : str or file handle
        Distance matrix, read with ``read(..., into=DistanceMatrix)``.
    mapping_file : str or file handle
        Tab-separated mapping file with a ``#SampleID`` column; all values
        are read as strings.
    max : int
        Maximum number of samples, passed to ``isubsample``.
    category : str
        Mapping-file column whose values define the bins.
    output : str or file handle
        Destination handed to ``DistanceMatrix.to_file``.
    """
    # The separator is passed by keyword: pandas 2.0 made every read_csv
    # argument after the path keyword-only.
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     dtype=str)
    mf.set_index('#SampleID', inplace=True)
    id_to_cat = dict(mf[category])

    def bin_f(x):
        # Samples missing from the mapping file fall into the `None` bin.
        return id_to_cat.get(x)

    dm = read(distmat, into=DistanceMatrix)
    kept_ids = [sample_id
                for _, sample_id in isubsample(dm.ids, max, bin_f=bin_f)]
    dm = dm.filter(kept_ids)
    dm.to_file(output)
def convert(in_f, in_fmt, out_f, out_fmt):
    '''Convert between file formats.

    Every object read from ``in_f`` is written, in order, to ``out_f``.

    Parameters
    ----------
    in_f : str
        input file path
    in_fmt : str
        input file format
    out_f : str
        output file path
    out_fmt : str
        output file format
    '''
    # Open the output once and hand the handle to write(): passing the
    # path for every object would reopen (and truncate) the file each
    # time, leaving only the last record in it.
    with open(out_f, 'w') as out:
        for obj in read(in_f, format=in_fmt):
            write(obj, format=out_fmt, into=out)
def _make_nr_foundation_alignment(foundation_alignment_fh,
                                  extension_genus_accession_list_dic):
    """Yield one foundation sequence per genus of interest.

    The fasta alignment is scanned in order. A sequence is yielded when
    its description contains ``;<genus>;`` or ``g__<genus>;`` for a genus
    that has not been matched yet; that genus is then retired, so each
    genus is represented by its first matching sequence and each sequence
    is yielded at most once.

    Side effect: the module-level ``foundation_accession_genus_dic`` is
    reset and filled with ``{sequence id: genus}`` for the yielded
    sequences.

    Parameters
    ----------
    foundation_alignment_fh : str or file handle
        Fasta alignment, read with ``skbio.read``.
    extension_genus_accession_list_dic : dict
        Mapping whose keys are the genus names to look for; it is not
        modified.

    Yields
    ------
    Sequence objects as produced by ``skbio.read``.
    """
    global foundation_accession_genus_dic
    foundation_accession_genus_dic = {}

    # Private copy: works with Python 3 key views and leaves the caller's
    # dictionary untouched.
    remaining_genera = list(extension_genus_accession_list_dic)

    for seq in skbio.read(foundation_alignment_fh, format="fasta"):
        # Records without a description cannot match; skip them instead
        # of relying on a blanket `except` (which would also swallow
        # GeneratorExit raised at the `yield`).
        description = getattr(seq, "description", None)
        if not description:
            continue
        for genus in remaining_genera:
            # Literal substring tests, so genus names containing regex
            # metacharacters are matched verbatim.
            if (";" + genus + ";" in description or
                    "g__" + genus + ";" in description):
                remaining_genera.remove(genus)
                foundation_accession_genus_dic[seq.id] = genus
                yield seq
                # Stop iterating the list that was just modified.
                break
def sequence_generator(input_fp):
    """Yield (id, sequence) from an input file

    Parameters
    ----------
    input_fp : filepath
        A filepath, which can be any valid fasta or fastq file within the
        limitations of scikit-bio's IO registry.

    Notes
    -----
    The use of this method is a stopgap to replicate the existing
    `parse_fasta` functionality while at the same time allowing for fastq
    support.

    Warns
    -----
    UserWarning
        If the input is sniffed as neither FASTA nor FASTQ. The message is
        also logged, and no sequences are yielded.

    Yields
    ------
    (str, str)
        The ID and sequence.
    """
    logger = logging.getLogger(__name__)
    read_kwargs = {}

    if sniff_fasta(input_fp)[0]:
        seq_format = 'fasta'
    elif sniff_fastq(input_fp)[0]:
        seq_format = 'fastq'
        read_kwargs['variant'] = _get_fastq_variant(input_fp)
    else:
        # usually happens when the fasta file is empty
        # so need to return no sequences (and warn)
        msg = "input file %s does not appear to be FASTA or FASTQ" % input_fp
        # Logger.warn is a deprecated alias that Python 3.13 removed.
        logger.warning(msg)
        warnings.warn(msg, UserWarning)
        return

    # some of the test code is using file paths, some is using StringIO.
    # Rewind text handles before the actual read.
    if isinstance(input_fp, io.TextIOBase):
        input_fp.seek(0)

    for record in skbio.read(input_fp, format=seq_format, **read_kwargs):
        yield (record.metadata['id'], str(record))
def _fasta_from_sqlite(conn, input_fasta_fp, output_fasta_fp):
    """Write one representative sequence per cluster to a fasta file.

    The input sequences are loaded into a ``rep_seqs`` table, joined with
    the existing ``feature_cluster_map`` table, and for every cluster the
    sequence of the row with the highest ``count`` is written out.

    Parameters
    ----------
    conn : sqlite3 connection
        Connection that already holds a ``feature_cluster_map`` table.
    input_fasta_fp : str
        Fasta file with the DNA sequences of all features.
    output_fasta_fp : str
        Fasta file to create, one record per cluster id.
    """
    input_seqs = skbio.read(input_fasta_fp, format='fasta',
                            constructor=skbio.DNA)
    cursor = conn.cursor()

    # Second table, keyed by feature id (shown here with dummy data):
    # feature_id | sequence_string
    # -----------|------------------
    # feature1   | ACGTACGTACGTACGT
    # feature2   | GGGGAAAACCCCTTTT
    # feature3   | TCAGAAAATTTTTCAG
    # feature4   | AAAAAAAAAAAAAAAA
    # feature5   | GGGGGGGGGGGGGGGG
    cursor.execute('CREATE TABLE rep_seqs (feature_id TEXT PRIMARY KEY, '
                   'sequence_string TEXT NOT NULL);')
    rows = [(seq.metadata['id'], str(seq)) for seq in input_seqs]
    cursor.executemany('INSERT INTO rep_seqs VALUES (?, ?);', rows)
    conn.commit()

    # Sort up front so that ties are broken deterministically later on.
    # A real table (not a view) is used because sqlite's rowid is needed.
    cursor.execute('CREATE TABLE sorted_feature_cluster_map AS '
                   'SELECT * FROM feature_cluster_map ORDER BY cluster_id ASC,'
                   'feature_id ASC;')
    cursor.execute('CREATE INDEX idx2 ON '
                   'sorted_feature_cluster_map(cluster_id, count);')
    conn.commit()

    # Expected shape of the query result (dummy data):
    # cluster_id | sequence_string
    # -----------|------------------
    # r1         | ACGTACGTACGTACGT
    # r2         | AAAAAAAAAAAAAAAA
    cursor.execute('''SELECT fcm.cluster_id, rs.sequence_string, MAX(fcm.count)
                        FROM sorted_feature_cluster_map fcm
                  INNER JOIN rep_seqs rs ON rs.feature_id = fcm.feature_id
                    GROUP BY fcm.cluster_id
                    ORDER BY fcm.cluster_id ASC;
                   ''')

    with open(output_fasta_fp, 'w') as output_seqs:
        # Stream the result 100 rows at a time until fetchmany() comes
        # back empty.
        batch = cursor.fetchmany(size=100)
        while batch:
            records = ['>%s\n%s\n' % (cluster_id, sequence)
                       for (cluster_id, sequence, _) in batch]
            output_seqs.writelines(records)
            batch = cursor.fetchmany(size=100)
def __init__(self, config):
    """Load the OTU table and tree named in the ``distance`` config section.

    Parameters
    ----------
    config : config parser-like object
        Must provide ``get(section, option)``; the ``biom_table`` and
        ``rep_tree`` options of the ``distance`` section are read.

    Attributes set
    --------------
    otu_table, sample_names : the loaded biom table and its sample ids.
    tips : names of the tips of the midpoint-rooted tree.
    id_mask : boolean array, True for observation ids that are tree tips.
    masked_ids : the observation ids selected by ``id_mask``.
    tree_index : result of ``to_array`` on the tree sheared to
        ``masked_ids``.
    """
    biom_fp = config.get("distance", "biom_table")
    tree_path = config.get("distance", "rep_tree")
    # NOTE(review): assert is stripped under `python -O`; kept so callers
    # still see AssertionError for missing settings.
    assert(biom_fp and tree_path)

    self.otu_table = biom.load_table(biom_fp)
    self.sample_names = self.otu_table.ids(axis="sample")

    tree = read(tree_path, format="newick",
                into=TreeNode).root_at_midpoint()
    self.tips = [tip.name for tip in tree.tips()]

    # Membership is tested against a set built once, instead of scanning
    # the tip list for every observation id.
    tip_names = set(self.tips)
    ids = self.otu_table.ids(axis="observation")
    self.id_mask = np.array([id_ in tip_names for id_ in ids], dtype=bool)
    self.masked_ids = ids[self.id_mask]

    # Restrict the tree to the observations present in the table.
    tree = tree.shear(self.masked_ids)
    self.tree_index = tree.to_array(nan_length_value=0.0)
def test_valid_files(self):
    """Reading each valid qseq file yields the expected sequence object.

    The expected object carries the id in ``metadata`` and the quality
    scores (as ``uint8``) in ``positional_metadata``.
    """
    constructors = [partial(Sequence),
                    partial(DNA, validate=False),
                    partial(RNA, validate=False),
                    partial(Protein, validate=False)]
    for make_expected in constructors:
        for valid, kwargs, components in self.valid_files:
            for kwarg in kwargs:
                _drop_kwargs(kwarg, 'constructor', 'filter')

                # seq_num is 1-based; components is indexed from 0.
                record = components[kwarg.get('seq_num', 1) - 1]
                quality = np.array(record[2], np.uint8)
                expected = make_expected(
                    record[1],
                    metadata={'id': record[0]},
                    positional_metadata={'quality': quality})

                # read() is given the underlying class, not the partial.
                observed = read(valid, into=make_expected.func,
                                format='qseq', verify=False, **kwarg)
                self.assertEqual(observed, expected)
def _annotate_fp(self, fp, aligner='blastp', evalue=0.001, cpus=1,
                 outfmt='tab', params=None):
    '''Annotate the sequences in the file.

    The sequences are searched against each database in turn (the cache
    database first, when a cache exists). After every search, the
    sequences whose ids are not among the hits so far are written to a
    temporary fasta file, which becomes the query of the next search.
    The loop stops early once no sequence is left.

    Parameters
    ----------
    fp : str
        Fasta file with the query sequences.
    aligner : str
        Aligner name passed to ``run_blast``.
    evalue : float
        E-value threshold passed to ``run_blast``.
    cpus : int
        Number of CPUs passed to ``run_blast``.
    outfmt : str
        Output format passed to ``run_view`` as ``--outfmt``.
    params : dict or None
        Extra parameters passed to ``run_blast``.

    Returns
    -------
    pandas.DataFrame
        The concatenated ``parse_tabular`` results of all searches.
    '''
    if self.has_cache():
        # Build cache
        self.cache.build()
        dbs = [self.cache.db] + self.dat
    else:
        dbs = self.dat

    # Index labels of all hits so far; a set gives O(1) membership tests.
    # presumably these labels are query sequence ids -- verify against
    # parse_tabular.
    found = set()
    tables = []
    seqs = []
    for db in dbs:
        out_prefix = splitext(basename(db))[0]
        daa_fp = join(self.out_dir, '%s.daa' % out_prefix)
        out_fp = join(self.out_dir, '%s.diamond' % out_prefix)
        self.run_blast(fp, daa_fp, db, aligner=aligner, evalue=evalue,
                       cpus=cpus, params=params)
        self.run_view(daa_fp, out_fp, params={'--outfmt': outfmt})

        hits = self.parse_tabular(out_fp)
        tables.append(hits)
        found.update(hits.index)

        # save to a tmp file the seqs that do not hit current database
        new_fp = join(self.tmp_dir, '%s.fa' % out_prefix)
        with open(new_fp, 'w') as f:
            for seq in read(fp, format='fasta'):
                if seq.metadata['id'] not in found:
                    seq.write(f, format='fasta')
                    seqs.append(seq)

        # no seq left (checked after the file is closed, i.e. flushed)
        if stat(new_fp).st_size == 0:
            break
        fp = new_fp

    # DataFrame.append was removed in pandas 2.0; concatenate once.
    res = pd.concat(tables) if tables else pd.DataFrame()

    # Update cache (inplace)
    if self.has_cache():
        self.cache.update(seqs)
        self.cache.close()
    return res
def test_fastq_to_sequence(self):
    """Reading a single record from each valid fastq file matches the
    sequence built directly from the expected components.
    """
    for constructor in (Sequence, DNA, RNA, Protein):
        for valid_files, kwargs, components in self.valid_configurations:
            # skip empty file case since we cannot read a specific
            # sequence from an empty file
            if len(components) == 0:
                continue

            for valid in valid_files:
                for observed_kwargs in kwargs:
                    expected_kwargs = {}

                    # TODO:
                    # some of the test files contain characters which are
                    # invalid for RNA, so don't validate for now. Need to
                    # fix this
                    if constructor is RNA:
                        expected_kwargs['validate'] = False
                        observed_kwargs['validate'] = False

                    _drop_kwargs(observed_kwargs, 'constructor')

                    # Can't use partials for this because the read
                    # function below can't operate on partials
                    if hasattr(constructor, 'lowercase'):
                        for options in (expected_kwargs, observed_kwargs):
                            options['lowercase'] = 'introns'

                    # seq_num is 1-based; components is indexed from 0.
                    record = components[
                        observed_kwargs.get('seq_num', 1) - 1]
                    metadata = {'id': record[0], 'description': record[1]}
                    quality = np.array(record[3], dtype=np.uint8)
                    expected = constructor(
                        record[2],
                        metadata=metadata,
                        positional_metadata={'quality': quality},
                        **expected_kwargs)

                    observed = read(valid, into=constructor,
                                    format='fastq', verify=False,
                                    **observed_kwargs)
                    self.assertEqual(observed, expected)
def test_fastq_to_sequence(self):
    """Reading a single record from each valid fastq file matches the
    sequence built directly from the expected components.
    """
    constructors = (BiologicalSequence, NucleotideSequence, DNASequence,
                    RNASequence, ProteinSequence)
    for constructor in constructors:
        for valid, kwargs, components in self.valid_files:
            # skip empty file case since we cannot read a specific
            # sequence from an empty file
            if len(components) == 0:
                continue

            for kwarg in kwargs:
                _drop_kwargs(kwarg, 'constructor')

                # seq_num is 1-based; components is indexed from 0.
                record = components[kwarg.get('seq_num', 1) - 1]
                expected = constructor(record[2], id=record[0],
                                       description=record[1],
                                       quality=record[3])

                observed = read(valid, into=constructor, format='fastq',
                                verify=False, **kwarg)
                self.assertTrue(observed.equals(expected))
def setUp(self):
    """Prepare the test data directory and temporary working files.

    Copies the sequences of the ``Swiss-Prot_Bacteria.fna`` test file
    into ``test1.fna`` inside a fresh temporary directory, and creates a
    second temporary directory for observed output.
    """
    self.test_dir = abspath(
        join('micronota', 'db', 'tests', 'data', 'uniref', 'uniref100'))

    # <database>_<kingdom>.fna for both databases, in the same order as
    # the explicit listing this replaces.
    databases = ('Swiss-Prot', 'TrEMBL')
    kingdoms = ('Archaea', 'Bacteria', 'Eukaryota', 'Viruses')
    files = [join(self.test_dir, '%s_%s.fna' % (database, kingdom))
             for database in databases
             for kingdom in kingdoms]

    self.tmp = mkdtemp()
    self.test1 = join(self.tmp, 'test1.fna')
    self.test1_exp = 'test1.genbank'

    # Only the second file (Swiss-Prot bacteria) is copied.
    with open(self.test1, 'w') as out:
        for seq in read(files[1], format='fasta'):
            write(seq, format='fasta', into=out)

    self.obs_tmp = mkdtemp()
def parse_sam(diamond_res):
    '''Load the SAM output of diamond blastp/blastx into a table.

    Parameters
    ----------
    diamond_res : str
        Path of the SAM file to parse.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed by record number, with the columns
        ``qseqid``, ``sseqid``, ``pident``, ``qlen``, ``mismatch``,
        ``qstart``, ``sstart``, ``evalue``, ``bitscore`` and ``sseq``.
        The frame is empty when opening the file raises
        ``StopIteration``.
    '''
    # (column name, metadata key) pairs, in output column order.
    fields = [('qseqid', 'QNAME'), ('sseqid', 'RNAME'), ('pident', 'ZI'),
              ('qlen', 'ZL'), ('mismatch', 'CIGAR'), ('qstart', 'POS'),
              ('sstart', 'ZS'), ('evalue', 'ZE'), ('bitscore', 'ZR')]
    header = [name for name, _ in fields] + ['sseq']
    table = pd.DataFrame(columns=header)

    try:
        records = read(diamond_res, format='sam')
    except StopIteration:
        return table

    for number, record in enumerate(records):
        text = str(record)
        values = [record.metadata[key] for _, key in fields]
        values.append(text)
        table.loc[number] = pd.Series(values, index=header)
    return table