def _uchime_denovo(sequences, table, dn, mindiffs, mindiv, minh, xn):
    """Run ``vsearch --uchime_denovo`` and gather its output files.

    This function only exists to simplify testing: besides the outputs it
    also hands back the exact command list that was executed.

    Returns
    -------
    tuple
        ``(cmd, chimeras, nonchimeras, uchime_stats)``.
    """
    chimeras = DNAFASTAFormat()
    nonchimeras = DNAFASTAFormat()
    uchime_stats = UchimeStatsFmt()

    with tempfile.NamedTemporaryFile() as sized_fasta, \
            tempfile.NamedTemporaryFile() as raw_chimeras:
        _fasta_with_sizes(str(sequences), sized_fasta.name, table)

        # the numeric thresholds are forwarded to vsearch as strings
        threshold_args = []
        for flag, value in (('--dn', dn),
                            ('--mindiffs', mindiffs),
                            ('--mindiv', mindiv),
                            ('--minh', minh),
                            ('--xn', xn)):
            threshold_args += [flag, str(value)]

        cmd = ['vsearch',
               '--uchime_denovo', sized_fasta.name,
               '--uchimeout', str(uchime_stats),
               '--nonchimeras', str(nonchimeras),
               '--chimeras', raw_chimeras.name,
               *threshold_args,
               '--qmask', 'none',  # ensures no lowercase DNA chars
               '--xsize',
               '--minseqlength', '1',
               '--fasta_width', '0']
        run_command(cmd)

        # this processing step should be removed, pending fix of:
        # https://github.com/qiime2/q2-vsearch/issues/39
        _fix_chimera_ids(raw_chimeras, chimeras)

    return cmd, chimeras, nonchimeras, uchime_stats
def filter_seqs_length_by_taxon(
        sequences: DNAFASTAFormat,
        taxonomy: pd.Series,
        labels: str,
        min_lens: int = None,
        max_lens: int = None,
        global_min: int = None,
        global_max: int = None) -> (DNAFASTAFormat, DNAFASTAFormat):
    """Partition sequences by taxon-specific (and global) length limits.

    For every sequence the taxon is looked up in ``taxonomy`` by sequence
    id, and each entry of ``labels`` contained in that taxon selects the
    matching entry of ``min_lens`` / ``max_lens``. The pass/fail decision
    itself is delegated to ``_seq_length_within_range``.

    Returns
    -------
    tuple
        ``(result, failures)``: sequences that passed and sequences that
        failed the length check.

    Raises
    ------
    ValueError
        If both ``min_lens`` and ``max_lens`` are ``None``, or if either
        one differs in length from ``labels``.
    """
    # Validate filtering options
    if min_lens is None and max_lens is None:
        raise ValueError(ERROR_FILTER_OPTIONS + 'min_lens, max_lens.')

    # Validate that all seqIDs are present in taxonomy. This is a cheap
    # first pass over the sequences before the real filtering pass.
    observed_ids = set()
    for record in sequences.view(DNAIterator):
        observed_ids.add(record.metadata['id'])
    _index_is_superset(observed_ids, set(taxonomy.index))

    # Map each label onto its threshold(s)
    mins = None
    maxs = None
    if min_lens is not None:
        if len(labels) != len(min_lens):
            raise ValueError(
                'labels and min_lens must contain the same number of elements')
        mins = dict(zip(labels, min_lens))
    if max_lens is not None:
        if len(labels) != len(max_lens):
            raise ValueError(
                'labels and max_lens must contain the same number of elements')
        maxs = dict(zip(labels, max_lens))

    no_global_limits = global_min is None and global_max is None

    # Stream seqs, apply filter(s)
    result = DNAFASTAFormat()
    failures = DNAFASTAFormat()
    with result.open() as out_fasta, failures.open() as out_failed:
        for seq in sequences.view(DNAIterator):
            # taxon is required, we always use taxon-based filtering
            taxon = taxonomy[seq.metadata['id']]
            # NOTE: all matching search terms are collected and handed to
            # _seq_length_within_range, which determines and applies the
            # most stringent matching length thresholds.
            taxahits = [term for term in labels if term in taxon]

            # Without taxahits or global filters the sequence is kept as
            # is; otherwise its length is always checked.
            keep = (not any(taxahits) and no_global_limits) or \
                _seq_length_within_range(seq, taxahits, mins, maxs,
                                         global_min, global_max)
            seq.write(out_fasta if keep else out_failed)

    return result, failures
def setUp(self):
    """Load the query/reference FASTA fixtures and build the count table."""
    super().setUp()

    def _load_fasta(filename):
        # open a packaged test FASTA read-only
        return DNAFASTAFormat(self.get_data_path(filename), mode='r')

    self.input_sequences = _load_fasta('dna-sequences-1.fasta')
    self.ref_sequences_1 = _load_fasta('ref-sequences-1.fasta')
    self.ref_sequences_2 = _load_fasta('ref-sequences-2.fasta')

    # one row per feature, one column per sample
    counts = np.array([[100, 101, 103],
                       [1, 1, 2],
                       [4, 5, 6],
                       [7, 8, 9]])
    feature_ids = ['feature1', 'feature2', 'feature3', 'feature4']
    sample_ids = ['sample1', 'sample2', 'sample3']
    self.input_table = biom.Table(counts, feature_ids, sample_ids)

    self.input_sequences_list = _read_seqs(self.input_sequences)
def test_uchime_ref_no_chimeras(self):
    """With ref-sequences-4 no input is flagged: all four are non-chimeric."""
    reference = DNAFASTAFormat(
        self.get_data_path('ref-sequences-4.fasta'), mode='r')

    with redirected_stdio(stderr=os.devnull):
        chime, nonchime, stats = uchime_ref(
            sequences=self.input_sequences,
            table=self.input_table,
            reference_sequences=reference)

    # nothing should have been reported as chimeric
    self.assertEqual(_read_seqs(chime), [])

    # sequences are reverse-sorted by abundance in output
    expected_nonchimeras = [self.input_sequences_list[i] for i in range(4)]
    self.assertEqual(_read_seqs(nonchime), expected_nonchimeras)

    with stats.open() as stats_fh:
        stats_text = stats_fh.read()

    # every feature gets a mention in the stats file ...
    for feature_id in ('feature1', 'feature2', 'feature3', 'feature4'):
        self.assertTrue(feature_id in stats_text)

    # ... and there are exactly four non-empty lines
    non_empty = [line for line in stats_text.split('\n') if line]
    self.assertEqual(len(non_empty), 4)
def _prepare_sequence_data(self):
    """Return ``(alignment, sequences, exp)`` fixtures.

    ``alignment`` and ``sequences`` are read-only views of the packaged
    FASTA files; ``exp`` is the four-row alignment the tests compare
    against.
    """
    sequences = DNAFASTAFormat(
        self.get_data_path('unaligned-dna-sequences-1.fasta'), mode='r')
    alignment = AlignedDNAFASTAFormat(
        self.get_data_path('aligned-dna-sequences-1.fasta'), mode='r')

    expected_rows = (
        ('aln-seq-1', 'AGGGGG-'),
        ('aln-seq-2', 'AGGGGGG'),
        ('seq1', 'AGGGGGG'),
        ('seq2', '-GGGGGG'),
    )
    exp = skbio.TabularMSA([
        skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
        for seq_id, residues in expected_rows
    ])

    return alignment, sequences, exp
def test_dna_fasta_format_empty_file(self):
    """A file holding only a newline must validate without raising."""
    empty_fp = os.path.join(self.temp_dir.name, 'empty')
    with open(empty_fp, 'w') as handle:
        handle.write('\n')

    fasta = DNAFASTAFormat(empty_fp, mode='r')
    fasta.validate()
def test_duplicate_input_ids(self):
    """mafft must reject unaligned input containing a repeated id."""
    duplicated = DNAFASTAFormat(
        self.get_data_path('unaligned-duplicate-ids.fasta'), mode='r')

    with self.assertRaisesRegex(ValueError, 'the unaligned.*id1'), \
            redirected_stdio(stderr=os.devnull):
        mafft(duplicated)
def test_dna_fasta_format_invalid_characters(self):
    """Validation must report the offending character and its line."""
    fasta = DNAFASTAFormat(
        self.get_data_path('not-dna-sequences.fasta'), mode='r')
    expected_message = "Invalid character '1' .*0 on line 2"

    with self.assertRaisesRegex(ValidationError, expected_message):
        fasta.validate()
def test_dna_fasta_format_consecutive_IDs(self):
    """Two description lines in a row must fail validation."""
    fasta = DNAFASTAFormat(
        self.get_data_path('dna-sequences-consecutive-ids.fasta'),
        mode='r')
    expected_message = 'consecutive descriptions.*1'

    with self.assertRaisesRegex(ValidationError, expected_message):
        fasta.validate()
def test_dna_fasta_format_id_starts_with_space(self):
    """An id that begins with a space must fail validation."""
    fasta = DNAFASTAFormat(
        self.get_data_path('dna-sequences-id-starts-with-space.fasta'),
        mode='r')
    expected_message = '1 starts with a space'

    with self.assertRaisesRegex(ValidationError, expected_message):
        fasta.validate()
def _process_primers(primer_fwd: Union[str, None],
                     primer_rev: Union[str, None]) -> DNAFASTAFormat:
    """
    Convert provided primers into skbio DNA format. Will reverse complement
    the reverse primer, if provided.

    Arguments:
        primer_fwd (str, None): forward primer
        primer_rev (str, None): reverse primer

    Returns:
        primers_fasta (DNAFASTAFormat): primers in FASTA format; the
            forward primer (id 'forward') is written first, followed by
            the reverse-complemented reverse primer (id 'reverse'). A
            primer that is None or empty is left out.
    """
    primers = []
    if primer_fwd:
        primers.append(DNA(primer_fwd, metadata={'id': 'forward'}))
    if primer_rev:
        reverse = DNA(primer_rev, metadata={'id': 'reverse'})
        primers.append(reverse.reverse_complement())

    # save primers in that format to pass them to mafft_add
    primers_fasta = DNAFASTAFormat()
    with primers_fasta.open() as out:
        # plain loop: the writes are side effects, so no throwaway list
        # should be built just to perform them
        for primer in primers:
            primer.write(out)
    return primers_fasta
def setUp(self):
    """Import taxonomy and reads, fit a classifier and build query seqs."""
    super().setUp()

    taxonomy_fp = self.get_data_path('taxonomy.tsv')
    self.taxartifact = Artifact.import_data('FeatureData[Taxonomy]',
                                            taxonomy_fp)
    self.taxonomy = self.taxartifact.view(pd.Series)

    # TODO: use `Artifact.import_data` here once we have a transformer
    # for DNASequencesDirectoryFormat -> DNAFASTAFormat
    reads = DNAFASTAFormat(
        self.get_data_path('se-dna-sequences.fasta'), mode='r')
    self.reads = Artifact.import_data('FeatureData[Sequence]', reads)

    # fit with the first of the registered specific fitters
    fitter_name = 'fit_classifier_' + _specific_fitters[0][0]
    fitter = getattr(feature_classifier.methods, fitter_name)
    self.classifier = fitter(self.reads, self.taxartifact).classifier

    query_seqs = pd.Series({
        'A': 'GCCTAACACATGCAAGTCGAACGGCAGCGGGGGAAAGCTTGCTTTCCTGCCGGCGA',
        'B': 'TAACACATGCAAGTCAACGATGCTTATGTAGCAATATGTAAGTAGAGTGGCGCACG',
        'C': 'ATACATGCAAGTCGTACGGTATTCCGGTTTCGGCCGGGAGAGAGTGGCGGATGGGT',
        'D': ('GACGAACGCTGGCGACGTGCTTAACACATGCAAGTCGTGCGAGGACGGGCGGTGCT'
              'TGCACTGCTCGAGCCGAGCGGCGGACGGGTGAGTAACACGTGAGCAACCTATCTCC'
              'GTGCGGGGGACAACCCGGGGAAACCCGGGCTAATACCG'),
    })
    self.query = Artifact.import_data('FeatureData[Sequence]', query_seqs)
def cluster_features_de_novo(sequences: DNAFASTAFormat, table: biom.Table,
                             perc_identity: float, threads: int = 1
                             ) -> (biom.Table, DNAFASTAFormat):
    """Cluster features with ``vsearch --cluster_size``.

    The centroid sequences are written to a new ``DNAFASTAFormat`` and the
    feature table is collapsed along the observation axis using the
    grouping derived from the vsearch ``--uc`` output.

    Returns
    -------
    tuple
        ``(collapsed_table, clustered_sequences)``.
    """
    clustered_sequences = DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as sized_fasta, \
            tempfile.NamedTemporaryFile() as uc_file:
        _fasta_with_sizes(str(sequences), sized_fasta.name, table)

        cmd = ['vsearch']
        cmd += ['--cluster_size', sized_fasta.name]
        cmd += ['--id', str(perc_identity)]
        cmd += ['--centroids', str(clustered_sequences)]
        cmd += ['--uc', uc_file.name]
        cmd += ['--qmask', 'none']  # ensures no lowercase DNA chars
        cmd += ['--xsize']
        cmd += ['--threads', str(threads)]
        run_command(cmd)

        # rewind our own handle before reading what vsearch produced
        uc_file.seek(0)
        conn = _uc_to_sqlite(uc_file)
        collapse_f = _collapse_f_from_sqlite(conn)

    collapsed = table.collapse(collapse_f,
                               norm=False,
                               min_group_size=1,
                               axis='observation',
                               include_collapsed_metadata=False)
    return collapsed, clustered_sequences
def orient_seqs(
    sequences: DNAFASTAFormat,
    reference_sequences: DNAFASTAFormat,
    perc_identity: float = 0.9,
    query_cov: float = 0.9,
    threads: int = 1,
    left_justify: bool = False,
) -> (DNAFASTAFormat, DNAFASTAFormat):
    """Orient query sequences to match a reference database.

    vsearch searches the queries against the references on both strands
    and reports each hit's strand. Matched sequences reported as '-' are
    reverse complemented; '+' hits are written unchanged.

    Returns
    -------
    tuple
        ``(matched, notmatched)``.
    """
    matched_temp = DNAFASTAFormat()
    notmatched = DNAFASTAFormat()

    # use vsearch to search query seqs against reference database
    # report orientation of query seqs relative to reference seqs.
    with tempfile.NamedTemporaryFile() as out:
        # note: qmask is disabled as DNAFASTAFormat requires all output
        # seqs to be uppercase. Could loop through output seqs to convert
        # to upper but which is faster: disabling masking or looping
        # through with skbio?
        cmd = ['vsearch',
               '--usearch_global', str(sequences),
               '--matched', str(matched_temp),
               '--notmatched', str(notmatched),
               '--db', str(reference_sequences),
               '--id', str(perc_identity),
               '--maxaccepts', '1',
               '--strand', 'both',
               '--qmask', 'none',
               '--query_cov', str(query_cov),
               '--threads', str(threads),
               '--userfields', 'qstrand',
               '--userout', out.name]
        if left_justify:
            cmd.append('--leftjust')
        run_command(cmd)

        with open(out.name, 'r') as strand_fh:
            orientations = list(map(str.strip, strand_fh))

    # nothing was reverse oriented: the vsearch output can be used as is
    if '-' not in orientations:
        return matched_temp, notmatched

    # if any query seqs are in reverse orientation, reverse complement
    matched = DNAFASTAFormat()
    with matched.open() as out_fasta:
        hits = zip(matched_temp.view(DNAIterator), orientations)
        for seq, strand in hits:
            if strand == '+':
                seq.write(out_fasta)
            elif strand == '-':
                seq.reverse_complement().write(out_fasta)
    return matched, notmatched
def setUp(self):
    """Expose the trim-test fixtures as sequence iterators."""
    super().setUp()

    aligned_fmt = AlignedDNAFASTAFormat(
        self.get_data_path('trim-test-alignment.fasta'), mode='r')
    unaligned_fmt = DNAFASTAFormat(
        self.get_data_path('trim-test-sequences.fasta'), mode='r')

    self.alignedseqs = aligned_fmt.view(AlignedDNAIterator)
    self.seqs = unaligned_fmt.view(DNAIterator)
def call_otus(table: pd.DataFrame, sequences: DNAFASTAFormat,
              gen_crit: float = 0.1, abund_crit: float = 10.0,
              pval_crit: float = 0.0005) -> (pd.DataFrame, DNAFASTAFormat):
    '''
    Read in input files, call OTUs, and return output feature table.

    table: pandas Dataframe
        sequence count table
    sequences: DNAFASTAFormat
        sequences fasta
    gen_crit, abund_crit, pval_crit: float
        threshold values for genetic criterion, abundance criterion,
        and distribution criterion (pvalue)

    Returns the OTU table (transposed back to the input orientation) and
    the representative sequences as a DNAFASTAFormat.
    '''
    # ensure valid argument values
    # (kept as asserts so the exception type callers see is unchanged)
    assert gen_crit >= 0
    assert abund_crit >= 0.0
    assert pval_crit >= 0.0 and pval_crit <= 1.0

    # set up the input fasta records
    # Note: calling str(DNAFastaFormat) returns the file path of the fasta
    records = SeqIO.index(str(sequences), 'fasta')

    # generate the caller object
    # Note: the dbotu code needs sequences in rows and samples in columns.
    # qiime feature tables have sequences in columns and samples in rows.
    # need to transpose the table when calling dbotu caller and before
    # writing results
    caller = dbotu.DBCaller(table.T, records, gen_crit, abund_crit,
                            pval_crit, log=None, debug=None)

    # Call OTUs
    caller.run()

    # Get OTU table and sequences
    # Need to transpose to get back into qiime format
    dbotu_table = caller.otu_table().T

    # Write the representative sequences
    # First, initiate new object with type DNAFASTAFormat
    clustered_sequences = DNAFASTAFormat()
    # Pass it in to write_fasta as a file handle; the context manager
    # guarantees the handle is flushed and closed before we return
    with open(str(clustered_sequences), 'w') as fasta_fh:
        caller.write_fasta(fasta_fh)

    # Print the membership (only shows up if --verbose flag is used)
    caller.write_membership(sys.stdout)

    return dbotu_table, clustered_sequences
def test_duplicate_input_ids_in_unaligned(self):
    """mafft_add must reject unaligned input containing a repeated id."""
    duplicated = DNAFASTAFormat(
        self.get_data_path('unaligned-duplicate-ids.fasta'), mode='r')
    alignment = self._prepare_sequence_data()[0]

    with self.assertRaisesRegex(ValueError, 'the unaligned.*id1'), \
            redirected_stdio(stderr=os.devnull):
        mafft_add(alignment, duplicated)
def setUp(self):
    """Load the taxonomy as a Series and open the reads FASTA read-only."""
    super().setUp()

    taxonomy_fp = self.get_data_path('taxonomy.tsv')
    taxonomy_artifact = Artifact.import_data('FeatureData[Taxonomy]',
                                             taxonomy_fp)
    self.taxonomy = taxonomy_artifact.view(pd.Series)

    # TODO: use `Artifact.import_data` here once we have a transformer
    # for DNASequencesDirectoryFormat -> DNAFASTAFormat
    self.reads_fp = self.get_data_path('se-dna-sequences.fasta')
    self.reads = DNAFASTAFormat(self.reads_fp, mode='r')
def _prepare_sequence_data(self):
    """Return ``(input_sequences, exp)``: the input and expected alignment."""
    input_sequences = DNAFASTAFormat(
        self.get_data_path('unaligned-dna-sequences-1.fasta'), mode='r')

    expected_rows = (('seq1', 'AGGGGGG'), ('seq2', '-GGGGGG'))
    exp = skbio.TabularMSA([
        skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
        for seq_id, residues in expected_rows
    ])

    return input_sequences, exp
def test_duplicate_input_ids_across_aligned_and_unaligned(self):
    """mafft_add must reject an id shared by alignment and new sequences."""
    # NOTE(review): the aligned fixture is opened as DNAFASTAFormat rather
    # than an aligned format -- confirm this is intentional.
    alignment = DNAFASTAFormat(
        self.get_data_path('aligned-duplicate-ids-2.fasta'), mode='r')
    sequences = self._prepare_sequence_data()[1]

    with self.assertRaisesRegex(ValueError,
                                'aligned and unaligned.*seq1'), \
            redirected_stdio(stderr=os.devnull):
        mafft_add(alignment, sequences)
def test_mafft_parttree_exception(self):
    """mafft must raise on an input of 1,000,002 sequences."""
    big_fp = os.path.join(self.temp_dir.name, 'million.fasta')
    with open(big_fp, "w") as handle:
        handle.writelines(
            '>%d\nAAGCAAGC\n' % index for index in range(1000002))

    big_input = DNAFASTAFormat(big_fp, mode='r')
    with self.assertRaisesRegex(ValueError, '1 million'), \
            redirected_stdio(stderr=os.devnull):
        mafft(big_input)
def setUp(self):
    """Load both focal/context sequence sets and their context metadata."""
    super().setUp()

    def _fasta(filename):
        # read-only view of a packaged FASTA fixture
        return DNAFASTAFormat(self.get_data_path(filename), 'r')

    def _metadata(filename):
        return qiime2.Metadata.load(self.get_data_path(filename))

    # first data set
    self.focal_seqs1 = _fasta('focal-seqs-1.fasta')
    self.context_seqs1 = _fasta('context-seqs-1.fasta')
    self.context_md1 = _metadata('context-metadata-1.tsv')

    # second data set
    self.focal_seqs2 = _fasta('focal-seqs-2.fasta')
    self.context_seqs2 = _fasta('context-seqs-2.fasta')
    self.context_md2 = _metadata('context-metadata-2.tsv')
def _split_fasta(sequences, train_ids, test_ids):
    '''
    Split FeatureData[Sequence] artifact into two, based on two sets of IDs.

    sequences: FeatureData[Sequence] Artifact
    train_ids: set
    test_ids: set

    Returns the train and test sequences as two new FeatureData[Sequence]
    artifacts. A sequence whose ID is in neither set is dropped; an ID in
    both sets goes to the train split only.
    '''
    train_fasta = DNAFASTAFormat()
    test_fasta = DNAFASTAFormat()

    with train_fasta.open() as _train, test_fasta.open() as _test:
        for record in sequences.view(DNAIterator):
            seq_id = record.metadata['id']
            if seq_id in train_ids:
                sink = _train
            elif seq_id in test_ids:
                sink = _test
            else:
                continue
            sink.write('>%s\n%s\n' % (seq_id, str(record)))

    train_seqs = q2.Artifact.import_data('FeatureData[Sequence]',
                                         train_fasta)
    test_seqs = q2.Artifact.import_data('FeatureData[Sequence]',
                                        test_fasta)
    return train_seqs, test_seqs
def test_failed_run_not_verbose(self):
    """A bogus mafft flag must surface as CalledProcessError."""
    # both format objects stay bound to locals for the whole test so the
    # paths derived from them remain usable
    input_sequences = DNAFASTAFormat(
        self.get_data_path('unaligned-dna-sequences-1.fasta'), mode='r')
    output_alignment = AlignedDNAFASTAFormat()

    cmd = ["mafft", "--not-a-real-parameter", str(input_sequences)]
    aligned_fp = str(output_alignment)

    with self.assertRaises(subprocess.CalledProcessError), \
            redirected_stdio(stderr=os.devnull):
        run_command(cmd, aligned_fp, verbose=False)
def setUp(self):
    """Load the dna-sequences-3 fixture and build its count table."""
    super().setUp()

    self.input_sequences = DNAFASTAFormat(
        self.get_data_path('dna-sequences-3.fasta'), mode='r')
    self.input_sequences_list = _read_seqs(self.input_sequences)

    # one row per feature, one column per sample
    counts = np.array([[100, 101, 103],
                       [99, 98, 99],
                       [4, 5, 6],
                       [2, 2, 2]])
    feature_ids = ['feature1', 'feature2', 'feature3', 'feature4']
    sample_ids = ['sample1', 'sample2', 'sample3']
    self.input_table = biom.Table(counts, feature_ids, sample_ids)
def _4(fmt: GISAIDDNAFASTAFormat) -> DNASequencesDirectoryFormat:
    """Convert a GISAID DNA FASTA file into a DNASequencesDirectoryFormat.

    The records read by ``_read_gisaid_dna_fasta`` are written out as
    FASTA and stored as the directory format's file.
    """
    records = _read_gisaid_dna_fasta(str(fmt))

    directory_fmt = DNASequencesDirectoryFormat()
    fasta_fmt = DNAFASTAFormat()
    with fasta_fmt.open() as fasta_fh:
        skbio.io.write(records, format='fasta', into=fasta_fh)

    directory_fmt.file.write_data(fasta_fmt, DNAFASTAFormat)
    return directory_fmt
def _uchime_ref(sequences, table, reference_sequences, dn, mindiffs,
                mindiv, minh, xn, threads):
    """Run ``vsearch --uchime_ref`` and gather its output files.

    This function only exists to simplify testing: besides the outputs it
    also hands back the exact command list that was executed.

    Returns
    -------
    tuple
        ``(cmd, chimeras, nonchimeras, uchime_stats)``.
    """
    chimeras = DNAFASTAFormat()
    nonchimeras = DNAFASTAFormat()
    uchime_stats = UchimeStatsFmt()

    with tempfile.NamedTemporaryFile() as sized_fasta:
        _fasta_with_sizes(str(sequences), sized_fasta.name, table)

        # the numeric thresholds are forwarded to vsearch as strings
        threshold_args = []
        for flag, value in (('--dn', dn),
                            ('--mindiffs', mindiffs),
                            ('--mindiv', mindiv),
                            ('--minh', minh),
                            ('--xn', xn)):
            threshold_args += [flag, str(value)]

        cmd = ['vsearch',
               '--uchime_ref', sized_fasta.name,
               '--uchimeout', str(uchime_stats),
               '--nonchimeras', str(nonchimeras),
               '--chimeras', str(chimeras),
               *threshold_args,
               '--db', str(reference_sequences),
               '--qmask', 'none',  # ensures no lowercase DNA chars
               '--xsize',
               '--threads', str(threads),
               '--minseqlength', '1',
               '--fasta_width', '0']
        run_command(cmd)

    return cmd, chimeras, nonchimeras, uchime_stats
def _4(fmt: GISAIDDNAFASTAFormat) -> DNASequencesDirectoryFormat:
    """Convert a GISAID DNA FASTA file into a DNASequencesDirectoryFormat.

    ``_read_gisaid_dna_fasta`` is given a scratch text file that stays
    open until the records have been written out as FASTA.
    """
    directory_fmt = DNASequencesDirectoryFormat()
    fasta_fmt = DNAFASTAFormat()

    with fasta_fmt.open() as fasta_fh:
        with tempfile.TemporaryFile(mode='w+') as scratch_fh:
            records = _read_gisaid_dna_fasta(str(fmt), scratch_fh)
            skbio.io.write(records, format='fasta', into=fasta_fh)

    directory_fmt.file.write_data(fasta_fmt, DNAFASTAFormat)
    return directory_fmt
def degap_seqs(aligned_sequences: AlignedDNAIterator,
               min_length: int = 1) -> DNAFASTAFormat:
    """Strip gaps from aligned sequences and drop those left too short.

    A degapped sequence is written only when its length is at least
    ``min_length``.
    """
    result = DNAFASTAFormat()
    with result.open() as out_fasta:
        for aligned in aligned_sequences:
            degapped = aligned.degap()
            # An all-gap sequence degaps to length zero, so with the
            # default min_length of 1 it is skipped here.
            if len(degapped) < min_length:
                continue
            degapped.write(out_fasta)
    return result
def cull_seqs(sequences: DNAIterator, num_degenerates: int = 5,
              homopolymer_length: int = 8) -> DNAFASTAFormat:
    """Write out only the sequences that pass both culling filters.

    A sequence is kept when neither ``_filt_seq_with_degenerates`` nor
    ``_filter_homopolymer`` returns a truthy value for it. The
    homopolymer check is skipped once the degenerate check has already
    rejected the sequence.
    """
    result = DNAFASTAFormat()
    with result.open() as out_fasta:
        for seq in sequences:
            if _filt_seq_with_degenerates(seq, num_degenerates):
                continue
            if _filter_homopolymer(seq, homopolymer_length):
                continue
            # if we make it here, write seq to file
            seq.write(out_fasta)
    return result