def _make_barcode_map(barcodes, rev_comp_mapping_barcodes):
    """Build a barcode -> sample ID lookup from a metadata column.

    Parameters
    ----------
    barcodes
        Object exposing ``to_series()`` (sample ID -> barcode string) and a
        ``name`` attribute used in error messages.
    rev_comp_mapping_barcodes : bool
        When true, every barcode is reverse-complemented before it is stored
        in the map.

    Returns
    -------
    tuple
        ``(barcode_map, barcode_len)``. ``barcode_len`` is the common barcode
        length, or ``None`` if the series is empty.

    Raises
    ------
    ValueError
        If barcodes differ in length, contain characters ``skbio.DNA``
        rejects, or the same (possibly reverse-complemented) barcode appears
        for two samples.
    """
    barcode_map = {}
    barcode_len = None
    # `Series.items()` is used because `Series.iteritems()` was removed in
    # pandas 2.0.
    for sample_id, barcode in barcodes.to_series().items():
        if barcode_len is None:
            barcode_len = len(barcode)
        elif len(barcode) != barcode_len:
            raise ValueError('Barcodes of different lengths were detected: '
                             '%d != %d. Variable length barcodes are not '
                             'supported.' % (len(barcode), barcode_len))
        try:
            skbio.DNA(barcode)
        except ValueError as ve:
            # Only the "invalid characters" failure is translated into a
            # metadata-oriented message; any other ValueError is re-raised
            # untouched.
            if re.match(r'^ValueError\("Invalid characters in sequence[.,'
                        ' \n]*', ve.__repr__()):
                raise ValueError("Invalid characters found in specified "
                                 "barcodes column within metadata file. "
                                 "Please confirm that the column: '%s' "
                                 "contains your per-sample barcodes."
                                 % (barcodes.name)) from ve
            else:
                raise
        if rev_comp_mapping_barcodes:
            barcode = str(skbio.DNA(barcode).reverse_complement())

        if barcode in barcode_map:
            raise ValueError('A duplicate barcode was detected. The barcode '
                             '%s was observed for samples %s and %s.'
                             % (barcode, sample_id, barcode_map[barcode]))
        barcode_map[barcode] = sample_id

    return barcode_map, barcode_len
def _prepare_sequence_data(self):
    """Load the sequence/alignment fixtures and build the expected MSA.

    Returns ``(alignment, sequences, exp)``: the two format objects opened
    in read mode from the test data paths, and the expected four-row
    ``skbio.TabularMSA``.
    """
    sequences = DNAFASTAFormat(
        self.get_data_path('unaligned-dna-sequences-1.fasta'), mode='r')
    alignment = AlignedDNAFASTAFormat(
        self.get_data_path('aligned-dna-sequences-1.fasta'), mode='r')

    # (id, aligned residues) for every row of the expected alignment.
    expected_rows = (
        ('aln-seq-1', 'AGGGGG-'),
        ('aln-seq-2', 'AGGGGGG'),
        ('seq1', 'AGGGGGG'),
        ('seq2', '-GGGGGG'),
    )
    exp = skbio.TabularMSA([
        skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
        for seq_id, residues in expected_rows
    ])
    return alignment, sequences, exp
def _gen_reads(sequence, f_primer, r_primer, trim_right, trunc_len, trim_left,
               identity, min_length, max_length, read_orientation):
    """Extract, trim and length-filter the amplicon between two primers.

    Matching is attempted in this order, stopping at the first truthy hit:
    exact match on ``sequence``, exact match on its reverse complement,
    approximate match on ``sequence``, approximate match on its reverse
    complement. ``read_orientation`` ('forward', 'reverse' or 'both')
    selects which of those attempts are allowed.

    Returns the processed amplicon, or ``None`` when nothing matched, the
    untrimmed amplicon exceeds ``max_length``, the trimmed amplicon is
    shorter than ``min_length``, or nothing is left after trimming. A
    non-positive trim or length argument disables that step.
    """
    fwd = skbio.DNA(f_primer)
    rev = skbio.DNA(r_primer)
    try_forward = read_orientation in ['forward', 'both']
    try_reverse = read_orientation in ['reverse', 'both']

    # Each attempt is deferred so later (costlier) searches only run when
    # every earlier one came back empty.
    attempts = (
        (try_forward,
         lambda: _exact_match(sequence, fwd, rev)),
        (try_reverse,
         lambda: _exact_match(sequence.reverse_complement(), fwd, rev)),
        (try_forward,
         lambda: _approx_match(sequence, fwd, rev, identity)),
        (try_reverse,
         lambda: _approx_match(sequence.reverse_complement(), fwd, rev,
                               identity)),
    )
    amp = None
    for allowed, search in attempts:
        if allowed and not amp:
            amp = search()
    if not amp:
        return None

    # The maximum-length filter applies to the untrimmed amplicon.
    if max_length > 0 and len(amp) > max_length:
        return None

    if trim_right > 0:
        amp = amp[:-trim_right]
    if trunc_len > 0:
        amp = amp[:trunc_len]
    if trim_left > 0:
        amp = amp[trim_left:]

    too_short = min_length > 0 and len(amp) < min_length
    if too_short or not amp:
        return None
    return amp
def test_sample_from_contig_set(self):
    """Sampled reads must start at exactly the offsets the stub supplies."""
    first_contig_starts = [0, 5, 10, 15, 9, 12]
    second_contig_starts = [40, 41, 42, 43]

    def mock(start, stop, n):
        # Stand-in for the random-integer function: the offsets returned
        # depend only on whether the requested range begins at 0.
        chosen = first_contig_starts if start == 0 else second_contig_starts
        return np.tile(chosen, 100)[:n]

    np.random.seed(1234)

    # An integration test
    #             0123456789012345678901234567890123456789
    sequences = [
        skbio.DNA('ATGCAATTGGCCAAATTTGGGCCCAAAATTTTGGGGCCCC'),
        skbio.DNA('CGTACCGGTT'),
    ]
    fullseq = skbio.DNA.concat(sequences)
    depth, length = 100, 3

    obs = sample_from_contig_set(sequences, depth, length, mock)

    indices = []
    for read in obs:
        remapped = self.remap(read)
        self.assertIn(remapped, fullseq)
        indices.append(fullseq.index(remapped))

    # Every start offset handed out by the stub, for both contigs, should
    # be represented among the observed reads.
    self.assertTrue(
        set(indices) == set(first_contig_starts + second_contig_starts))
def _gen_reads(sequence, f_primer, r_primer, trunc_len, trim_left, identity,
               min_length, max_length):
    """Extract, trim and length-filter the amplicon between two primers.

    Tries, in order, an exact match on ``sequence``, an exact match on its
    reverse complement, then approximate matches on each, stopping at the
    first truthy result.

    Returns the processed amplicon, or ``None`` when nothing matched, the
    untrimmed amplicon exceeds ``max_length``, the trimmed amplicon is
    shorter than ``min_length``, or nothing is left after trimming. A
    non-positive trim or length argument disables that step.
    """
    fwd = skbio.DNA(f_primer)
    rev = skbio.DNA(r_primer)

    # `or` short-circuits, so each later search only runs when every
    # earlier one produced a falsy result.
    amp = (
        _exact_match(sequence, fwd, rev)
        or _exact_match(sequence.reverse_complement(), fwd, rev)
        or _approx_match(sequence, fwd, rev, identity)
        or _approx_match(sequence.reverse_complement(), fwd, rev, identity)
    )
    if not amp:
        return None

    # The maximum-length filter applies to the untrimmed amplicon.
    if 0 < max_length < len(amp):
        return None

    if trunc_len > 0:
        amp = amp[:trunc_len]
    if trim_left > 0:
        amp = amp[trim_left:]

    if min_length > 0 and len(amp) < min_length:
        return None
    return amp if amp else None
def _denoise_helper(biom_fp, track_fp, hashed_feature_ids):
    """Load denoising output into a table, sequences and stats metadata.

    ``biom_fp`` is read as a TSV feature table and ``track_fp`` as a
    tab-separated stats file indexed by its first column. Sample IDs in
    both are passed through ``_filepath_to_sample``. When
    ``hashed_feature_ids`` is true, feature IDs (the sequences) are
    replaced by their md5 hex digests.

    Returns ``(table, rep_sequences, metadata)``.
    """
    _check_featureless_table(biom_fp)
    with open(biom_fp) as biom_fh:
        table = biom.Table.from_tsv(biom_fh, None, None, None)

    stats = pd.read_csv(track_fp, sep='\t', index_col=0)
    stats.index.name = 'sample-id'
    metadata = qiime2.Metadata(stats.rename(index=_filepath_to_sample))

    # Sample IDs arrive as file names; keep only the sample-id part.
    sample_renames = {}
    for raw_id in table.ids(axis='sample'):
        sample_renames[raw_id] = _filepath_to_sample(raw_id)
    table.update_ids(sample_renames, axis='sample', inplace=True)

    # Feature IDs arrive as the sequences themselves.
    if not hashed_feature_ids:
        rep_sequences = DNAIterator(
            (skbio.DNA(seq, metadata={'id': seq})
             for seq in table.ids(axis='observation')))
        return table, rep_sequences, metadata

    # Replace each sequence ID with the md5 digest of the sequence.
    fid_map = {}
    for seq in table.ids(axis='observation'):
        fid_map[seq] = hashlib.md5(seq.encode('utf-8')).hexdigest()
    table.update_ids(fid_map, axis='observation', inplace=True)
    rep_sequences = DNAIterator(
        (skbio.DNA(seq, metadata={'id': digest})
         for seq, digest in fid_map.items()))
    return table, rep_sequences, metadata
def test_dereplicate_sequences_prefix(self):
    """Prefix dereplication yields the expected table and sequences."""
    first_id = '4574b947a0159c0da35a1f30f989681a1d9f64ef'
    second_id = '16a1263bde4f2f99422630d1bb87935c4236d1ba'

    input_sequences = QIIME1DemuxDirFmt(self.get_data_path('seqs-1'), 'r')
    exp_table = biom.Table(np.array([[2, 2],
                                     [2, 0]]),
                           [first_id, second_id],
                           ['s2', 'sample1'])

    with redirected_stdio(stderr=os.devnull):
        obs_table, obs_sequences = dereplicate_sequences(
            sequences=input_sequences, derep_prefix=True)

    # biom.Table equality is order-sensitive, so align the observation
    # order with the expected table before comparing.
    expected_order = exp_table.ids(axis='observation')
    obs_table = obs_table.sort_order(expected_order, axis='observation')
    self.assertEqual(obs_table, exp_table)

    # Sequences are expected in descending order of abundance.
    exp_seqs = [
        skbio.DNA('AAACGTTACGGTTAACTATACATGCAGAAGACTAATCGG',
                  metadata={'id': first_id, 'description': 's2_1'}),
        skbio.DNA('ACGTACGTACGTACGTACGTACGTACGTACGTGCATGGTGCGACCG',
                  metadata={'id': second_id, 'description': 's2_42'}),
    ]
    obs_seqs = list(skbio.io.read(str(obs_sequences),
                                  constructor=skbio.DNA,
                                  format='fasta'))
    self.assertEqual(obs_seqs, exp_seqs)
def test_apply_mask_mask_all(self):
    """An all-True mask leaves every sequence empty."""
    all_masked = np.array([True, True, True, True])
    obs = _apply_mask(self.msa1, all_masked)

    emptied = [skbio.DNA('', metadata={'id': seq_id})
               for seq_id in ('s1', 's2', 's3')]
    exp = skbio.TabularMSA(emptied, minter='id')
    self.assertEqual(obs, exp)
def denoise(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
            pos_ref_filepath: str=None,
            neg_ref_filepath: str=None,
            mean_error: float=0.005,
            indel_prob: float=0.01,
            indel_max: int=3,
            trim_length: int=150,
            min_reads: int=0,
            min_size: int=2,
            negate: bool=False,
            jobs_to_start: int=1,
            hashed_feature_ids: bool=True) -> (biom.Table, DNAIterator):
    """Run ``deblur workflow`` and return its table and sequences.

    The command writes into a temporary directory, from which
    ``final.biom`` is loaded. Sample IDs are shortened to the text before
    their first underscore. When ``hashed_feature_ids`` is true, feature
    IDs (the sequences) are replaced by their md5 hex digests.

    ``jobs_to_start`` is accepted but is not passed to the command.

    Raises ``subprocess.CalledProcessError`` if the command exits with a
    non-zero status (``check=True``).
    """
    with tempfile.TemporaryDirectory() as tmp:
        cmd = ['deblur', 'workflow']
        cmd += ['--seqs-fp', str(demultiplexed_seqs)]
        cmd += ['--output-dir', tmp]
        cmd += ['--mean-error', str(mean_error)]
        cmd += ['--indel-prob', str(indel_prob)]
        cmd += ['--indel-max', str(indel_max)]
        cmd += ['--trim-length', str(trim_length)]
        cmd += ['--min-reads', str(min_reads)]
        cmd += ['--min-size', str(min_size)]
        cmd += ['-w']

        # Optional arguments are appended only when supplied.
        if pos_ref_filepath is not None:
            cmd.extend(['--pos-ref-db', pos_ref_filepath])
        if neg_ref_filepath is not None:
            cmd.extend(['--neg-ref-db', neg_ref_filepath])
        if negate:
            cmd.append('--negate')

        subprocess.run(cmd, check=True)

        # code adapted from q2-dada2
        table = biom.load_table(os.path.join(tmp, 'final.biom'))

        sample_renames = {}
        for raw_id in table.ids(axis='sample'):
            sample_renames[raw_id] = raw_id.split('_')[0]
        table.update_ids(sample_renames, axis='sample', inplace=True)

        if hashed_feature_ids:
            # Replace each sequence ID with the md5 digest of the sequence.
            fid_map = {}
            for seq in table.ids(axis='observation'):
                fid_map[seq] = hashlib.md5(seq.encode('utf-8')).hexdigest()
            table.update_ids(fid_map, axis='observation', inplace=True)

            rep_sequences = DNAIterator(
                (skbio.DNA(seq, metadata={'id': digest}, lowercase='ignore')
                 for seq, digest in fid_map.items()))
        else:
            rep_sequences = DNAIterator(
                (skbio.DNA(seq, metadata={'id': seq}, lowercase='ignore')
                 for seq in table.ids(axis='observation')))

    return (table, rep_sequences)
def _denoise_helper(biom_fp, track_fp, hashed_feature_ids):
    """Load denoising output into a table, sequences and stats metadata.

    ``biom_fp`` is read as a TSV feature table and ``track_fp`` as a
    tab-separated stats file indexed by its first column. Percentage
    columns (relative to ``input``) are added for ``filtered``,
    ``non-chimeric`` and, if present, ``merged``; missing values become 0
    and the percentages are rounded to two decimals. Sample IDs in both
    outputs are passed through ``_filepath_to_sample``. When
    ``hashed_feature_ids`` is true, feature IDs (the sequences) are
    replaced by their md5 hex digests.

    Returns ``(table, rep_sequences, metadata)``.
    """
    _check_featureless_table(biom_fp)
    with open(biom_fp) as biom_fh:
        table = biom.Table.from_tsv(biom_fh, None, None, None)

    stats = pd.read_csv(track_fp, sep='\t', index_col=0)
    stats.index.name = 'sample-id'
    stats = stats.rename(index=_filepath_to_sample)

    passed_col = 'percentage of input passed filter'
    nonchim_col = 'percentage of input non-chimeric'
    stats[passed_col] = stats['filtered'] / stats['input'] * 100
    stats[nonchim_col] = stats['non-chimeric'] / stats['input'] * 100
    percent_cols = [passed_col, nonchim_col]

    # The merged columns only exist for paired-end runs; when present they
    # sit between 'denoised' and 'non-chimeric'.
    merged_cols = []
    if 'merged' in stats:
        merged_pct_col = 'percentage of input merged'
        stats[merged_pct_col] = stats['merged'] / stats['input'] * 100
        percent_cols.append(merged_pct_col)
        merged_cols = ['merged', merged_pct_col]

    col_order = (['input', 'filtered', passed_col, 'denoised']
                 + merged_cols
                 + ['non-chimeric', nonchim_col])
    round_cols = dict.fromkeys(percent_cols, 2)

    stats = stats[col_order]
    stats.fillna(0, inplace=True)
    stats = stats.round(round_cols)
    metadata = qiime2.Metadata(stats)

    # Sample IDs arrive as file names; keep only the sample-id part.
    sample_renames = {}
    for raw_id in table.ids(axis='sample'):
        sample_renames[raw_id] = _filepath_to_sample(raw_id)
    table.update_ids(sample_renames, axis='sample', inplace=True)

    # Feature IDs arrive as the sequences themselves.
    if not hashed_feature_ids:
        rep_sequences = DNAIterator(
            (skbio.DNA(seq, metadata={'id': seq})
             for seq in table.ids(axis='observation')))
        return table, rep_sequences, metadata

    # Replace each sequence ID with the md5 digest of the sequence.
    fid_map = {}
    for seq in table.ids(axis='observation'):
        fid_map[seq] = hashlib.md5(seq.encode('utf-8')).hexdigest()
    table.update_ids(fid_map, axis='observation', inplace=True)
    rep_sequences = DNAIterator(
        (skbio.DNA(seq, metadata={'id': digest})
         for seq, digest in fid_map.items()))
    return table, rep_sequences, metadata
def _prepare_sequence_data(self):
    """Load the unaligned fixture and build the expected two-row MSA.

    Returns ``(input_sequences, exp)``.
    """
    input_sequences = DNAFASTAFormat(
        self.get_data_path('unaligned-dna-sequences-1.fasta'), mode='r')

    expected_rows = (('seq1', 'AGGGGGG'), ('seq2', '-GGGGGG'))
    exp = skbio.TabularMSA([
        skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
        for seq_id, residues in expected_rows
    ])
    return input_sequences, exp
def test_empty_return(self):
    """A minimum length no sequence reaches yields an empty result."""
    inp = pd.Series({
        's1': skbio.DNA('ACGTTGACA', metadata={'id': 's1'}),
        's2': skbio.DNA('AAN', metadata={'id': 's2'})
    })
    # An explicit dtype avoids the pandas 1.x DeprecationWarning for
    # dtype-less empty Series and keeps the dtype stable across versions.
    exp = pd.Series(dtype=object)

    obs = filter_seqs(inp, min_length=29000)

    self.assertEqual(list(obs.index), list(exp.index))
    self.assertEqual(list(obs), list(exp))
def test_too_short_and_too_ambiguous(self):
    """Only 's1' survives when 's2' is below the length threshold."""
    def dna(seq_id, residues):
        return skbio.DNA(residues, metadata={'id': seq_id})

    inp = pd.Series({'s1': dna('s1', 'ACGTTGACA'), 's2': dna('s2', 'AAN')})
    exp = pd.Series({'s1': dna('s1', 'ACGTTGACA')})

    obs = filter_seqs(inp, max_proportion_ambiguous=.3, min_length=4)

    self.assertEqual(list(obs.index), list(exp.index))
    self.assertEqual(list(obs), list(exp))
def test_too_long(self):
    """Sequences longer than ``max_length`` are dropped."""
    def dna(seq_id, residues):
        return skbio.DNA(residues, metadata={'id': seq_id})

    inp = pd.Series({'s1': dna('s1', 'ACGTTGACA'), 's2': dna('s2', 'AA')})
    exp = pd.Series({'s2': dna('s2', 'AA')})

    obs = filter_seqs(inp, max_length=3)

    self.assertEqual(list(obs.index), list(exp.index))
    self.assertEqual(list(obs), list(exp))
def test_no_filter(self):
    """With default arguments the input comes back unchanged."""
    records = (('s1', 'ACGTTNGACA'), ('s2', 'A'), ('s3', 'NNNNNN'))
    exp = pd.Series({
        seq_id: skbio.DNA(residues, metadata={'id': seq_id})
        for seq_id, residues in records
    })

    obs = filter_seqs(exp)

    self.assertEqual(list(obs.index), list(exp.index))
    self.assertEqual(list(obs), list(exp))
def test_error_on_empty_alignment_conservation_boundary(self):
    """Masking that retains no positions must raise a ValueError."""
    rows = (('seq1', 'A'), ('seq2', 'C'), ('seq3', 'G'))
    alignment1 = skbio.TabularMSA([
        skbio.DNA(residue, metadata={'id': seq_id, 'description': ''})
        for seq_id, residue in rows
    ])

    with self.assertRaisesRegex(
            ValueError, " 0.00% of positions were retained by the con"):
        mask(alignment1, max_gap_frequency=1.0, min_conservation=0.5)
def test_create_position_map_all_gaps(self):
    """An all-gap reference sequence produces an empty position map."""
    rows = (('s1', 'ACGT'), ('s2', 'AG-T'), ('s3', '----'))
    msa = skbio.TabularMSA(
        [skbio.DNA(residues, metadata={'id': seq_id})
         for seq_id, residues in rows],
        minter='id')

    obs = _create_position_map(msa, 's3')

    npt.assert_array_equal(obs, np.array([]))
def test_get_iterator(self):
    """Each observation ID is yielded as a DNA record carrying that ID."""
    feature_ids = ('ATCC', 'ATGG', 'CACA')
    tab = biom.Table(np.ones((3, 2)), list(feature_ids), ['S1', 'S2'])

    obs = list(_get_featuredata_from_table(tab))

    exp = [skbio.DNA(fid, metadata={'id': fid}) for fid in feature_ids]
    self.assertEqual(obs, exp)
def test_apply_mask_mask_some(self):
    """Partial masks keep only the positions flagged False."""
    seq_ids = ('s1', 's2', 's3')
    # (mask flags, expected residues for s1/s2/s3), checked in this order.
    cases = (
        ([False, True, True, True], ('A', 'A', '-')),
        ([False, True, True, False], ('AT', 'AT', '-T')),
        ([False, True, False, False], ('AGT', 'A-T', '--T')),
    )
    for flags, residues in cases:
        obs = _apply_mask(self.msa1, np.array(flags))
        seqs = [skbio.DNA(row, metadata={'id': seq_id})
                for seq_id, row in zip(seq_ids, residues)]
        exp = skbio.TabularMSA(seqs, minter='id')
        self.assertEqual(obs, exp)
def test_create_terminal_gap_mask_one_chrome(self):
    """Both the fixture MSA and a hand-built one mask only position 0."""
    expected_mask = [True, False, False, False]

    obs = _create_terminal_gap_mask(self.msa1, self.mask5)
    npt.assert_array_equal(obs, expected_mask)

    rows = (('s1', 'ACG-'), ('s2', 'AG-T'), ('s3', '-C-T'))
    msa = skbio.TabularMSA(
        [skbio.DNA(residues, metadata={'id': seq_id})
         for seq_id, residues in rows],
        minter='id')
    obs = _create_terminal_gap_mask(msa, self.mask5)
    npt.assert_array_equal(obs, expected_mask)
def test_invalid_conservation_threshold(self):
    """``min_conservation`` just outside [0, 1] is rejected."""
    alignment = skbio.TabularMSA([
        skbio.DNA('-', metadata={'id': seq_id, 'description': ''})
        for seq_id in ('seq1', 'seq2', 'seq3')
    ])

    eps = np.finfo(float).eps
    # One value just below the lower bound, one just above the upper.
    for out_of_range in (0.0 - eps, 1.0 + eps):
        with self.assertRaises(ValueError):
            mask(alignment, min_conservation=out_of_range)
def test_empty_input(self):
    """Zero-length sequences and a sequence-less MSA both raise."""
    zero_length = skbio.TabularMSA([
        skbio.DNA('', metadata={'id': seq_id, 'description': ''})
        for seq_id in ('seq1', 'seq2', 'seq3')
    ])
    with self.assertRaises(ValueError):
        mask(zero_length)

    no_sequences = skbio.TabularMSA([])
    with self.assertRaises(ValueError):
        mask(no_sequences)
def test_basic(self):
    """``tabulate_seqs`` writes an index.html listing the sequences."""
    seqs = DNAIterator(
        (s for s in (skbio.DNA('ACGT', metadata={'id': 'seq1'}),
                     skbio.DNA('AAAA', metadata={'id': 'seq2'}))))

    with tempfile.TemporaryDirectory() as output_dir:
        tabulate_seqs(output_dir, seqs)

        expected_fp = os.path.join(output_dir, 'index.html')
        self.assertTrue(os.path.exists(expected_fp))

        # Read the page once under a context manager so the file handle is
        # closed rather than left open by repeated `open(...).read()`.
        with open(expected_fp) as fh:
            file_text = fh.read()

        # assertIn reports both operands when it fails.
        self.assertIn('ACGT</a>', file_text)
        self.assertIn('<td>seq2</td>', file_text)
def test_join_contigs(self):
    """Joining contigs concatenates them and reports the boundaries."""
    contigs = [skbio.DNA(residues)
               for residues in ('AATTGG', 'CCTTAA', 'ATAT')]

    obs_seq, obs_breaks = join_contigs(contigs)

    #                    0123456789012345
    exp_seq = skbio.DNA('AATTGGCCTTAAATAT')
    exp_breaks = np.array([0, 6, 12, 16])
    self.assertEqual(obs_seq, exp_seq)
    npt.assert_equal(obs_breaks, exp_breaks)
def setUp(self):
    """Create the shared fixtures: a frame, a DNA series and an artifact."""
    # Sequence ID -> residues, shared by every fixture built below.
    residues_by_id = {'0': 'CATS', '1': 'WANT', '2': 'CANS'}

    # One row per sequence, one column per character.
    self.seq_block = [
        pd.DataFrame(
            data=[list(residues) for residues in residues_by_id.values()],
            index=list(residues_by_id))
    ]

    self.skbio_series = pd.Series(data={
        seq_id: skbio.DNA(residues, metadata={'id': seq_id})
        for seq_id, residues in residues_by_id.items()
    })

    self.seq_artifact = Artifact.import_data(
        'FeatureData[Sequence]', self.skbio_series, pd.Series)
def _validate_seq(self, seq):
    """Check that ``seq`` is a non-empty, valid DNA sequence.

    Raises
    ------
    ValueError
        If ``seq`` is empty (falsy), or if ``skbio.DNA`` rejects it during
        validation. The empty case previously raised a bare, message-less
        ``Exception``; ``ValueError`` is still an ``Exception`` subclass,
        so handlers catching ``Exception`` are unaffected.
    """
    if not seq:
        # Empty sequence.
        raise ValueError('Empty sequence: expected at least one DNA '
                         'character.')
    # Will raise a `ValueError` on invalid DNA characters.
    skbio.DNA(seq, validate=True)
def test_descriptive_stats_integration(self):
    """Descriptive statistics for eight poly-A sequences are rendered."""
    # Lengths of seq01..seq08; each sequence is that many 'A' residues.
    lengths = (1, 2, 3, 4, 4, 3, 2, 10)
    seqs = DNAIterator(
        skbio.DNA('A' * length, metadata={'id': 'seq%02d' % number})
        for number, length in enumerate(lengths, start=1))

    with tempfile.TemporaryDirectory() as output_dir:
        tabulate_seqs(output_dir, seqs)

        expected_fp = os.path.join(output_dir, 'index.html')
        with open(expected_fp) as fh:
            file_text = fh.read()

        # If every one of these cell values appears in index.html, the
        # statistics were most likely computed and rendered as intended.
        expected_cells = ('8', '1', '10', '3.62', '9', '1',
                          '1', '2', '3', '4', '6', '9')
        for cell in expected_cells:
            self.assertTrue(('<td>%s</td>' % cell) in file_text)
def _16(data: pd.Series) -> DNAFASTAFormat:
    """Write a series of sequences to a new ``DNAFASTAFormat``.

    Each index label becomes the record ID and each value is wrapped in
    ``skbio.DNA`` before being written as FASTA.
    """
    ff = DNAFASTAFormat()
    with ff.open() as f:
        # `Series.items()` is used because `Series.iteritems()` was removed
        # in pandas 2.0.
        for id_, seq in data.items():
            sequence = skbio.DNA(seq, metadata={'id': id_})
            skbio.io.write(sequence, format='fasta', into=f)
    return ff
def _get_featuredata_from_table(table):
    """Wrap a table's observation IDs as an iterator of ``skbio.DNA``.

    Each observation ID is used both as the sequence and as its ``id``
    metadata. Raises ``ValueError`` if the table is empty.
    """
    if table.is_empty():
        raise ValueError("No features")

    feature_ids = table.ids(axis='observation')
    return DNAIterator(
        skbio.DNA(feature_id, metadata={'id': feature_id})
        for feature_id in feature_ids)
def test_sample_from_contig_set_one_short(self):
    """Requested depth is met even when one contig is shorter than a read."""
    np.random.seed(1234)

    # An integration test
    #             0123456789012345678901234567890123456789
    sequences = [
        skbio.DNA('ATGCAATTGGCCAAATTTGGGCCCAAAATTTTGGGGCCCC'),
        skbio.DNA('CGTACCGGTT'),
    ]
    depth, length = 100, 15

    obs = sample_from_contig_set(sequences, depth, length,
                                 np.random.randint)

    self.assertEqual(depth, len(obs))