def test_autotune_reads_per_batch_more_jobs_than_reads(self):
    # Per the test name, 1105 jobs is more than the number of reads in the
    # fixture, so the autotuned batch size is expected to be 1.
    batch_size = _autotune_reads_per_batch(self.seq_path, n_jobs=1105)
    self.assertEqual(batch_size, 1)
def extract_reads(sequences: DNASequencesDirectoryFormat, f_primer: str,
                  r_primer: str, trunc_len: int = 0, trim_left: int = 0,
                  identity: float = 0.8, min_length: int = 50,
                  max_length: int = 0, n_jobs: int = 1,
                  batch_size: int = 'auto') -> DNAFASTAFormat:
    """Extract the reads selected by a primer or primer pair.

    Only sequences which match the primers at greater than the specified
    identity are returned.

    Parameters
    ----------
    sequences : DNASequencesDirectoryFormat
        Query sequences; the ``file`` member is viewed as a ``DNAIterator``
        for processing.
    f_primer : str
        Forward primer sequence.
    r_primer : str
        Reverse primer sequence.
    trunc_len : int, optional
        Read is cut to trunc_len if trunc_len is positive. Applied before
        trim_left.
    trim_left : int, optional
        `trim_left` nucleotides are removed from the 5' end if trim_left is
        positive. Applied after trunc_len.
    identity : float, optional
        Minimum combined primer match identity threshold. Default: 0.8
    min_length : int, optional
        Minimum amplicon length. Shorter amplicons are discarded.
        Default: 50
    max_length : int, optional
        Maximum amplicon length. Longer amplicons are discarded.
    n_jobs : int, optional
        Number of separate processes to break the task into. The value is
        normalised through ``effective_n_jobs`` before use.
    batch_size : int or 'auto', optional
        Number of sequences handed to the worker pool in one batch. With
        ``'auto'`` (the default) the size is chosen by
        ``_autotune_reads_per_batch`` from the input and the job count.

    Returns
    -------
    DNAFASTAFormat
        FASTA file containing the extracted reads.

    Raises
    ------
    ValueError
        If ``trunc_len`` is positive and ``min_length`` exceeds
        ``trunc_len - trim_left``.
    RuntimeError
        If nothing was written to the output file.
    """
    # Fail fast: with truncation enabled, a min_length above the truncated
    # read length would discard every sequence.
    if trunc_len > 0 and min_length > trunc_len - trim_left:
        raise ValueError('The minimum length setting is greater than the '
                         'length of the truncated sequences. This will cause '
                         'all sequences to be removed from the dataset. To '
                         'proceed, set a min_length ≤ trunc_len - trim_left.')

    workers = effective_n_jobs(n_jobs)
    if batch_size == 'auto':
        batch_size = _autotune_reads_per_batch(
            sequences.file.view(DNAFASTAFormat), workers)

    reads = sequences.file.view(DNAIterator)
    result = DNAFASTAFormat()
    out_path = str(result)
    make_task = delayed(_gen_reads)

    # One worker pool is reused for all batches; results are appended to the
    # output file batch by batch.
    with open(out_path, 'a') as fh, Parallel(workers) as parallel:
        for chunk in _chunks(reads, batch_size):
            tasks = (
                make_task(sequence, f_primer, r_primer, trunc_len,
                          trim_left, identity, min_length, max_length)
                for sequence in chunk
            )
            for amplicon in parallel(tasks):
                # Only non-None results are written out.
                if amplicon is None:
                    continue
                skbio.write(amplicon, format='fasta', into=fh)

    # An empty output file means no amplicon was written.
    if os.stat(out_path).st_size == 0:
        raise RuntimeError("No matches found")
    return result
def test_autotune_reads_per_batch_zero_jobs(self):
    # n_jobs=0 must be rejected with a ValueError carrying this message.
    expected_message = "Value other than zero must be specified"
    with self.assertRaisesRegex(ValueError, expected_message):
        _autotune_reads_per_batch(self.seq_path, n_jobs=0)
def test_autotune_reads_per_batch_ceil(self):
    # With 5 jobs the autotuned batch size is expected to be 221
    # (presumably a rounded-up quotient, per the test name).
    batch_size = _autotune_reads_per_batch(self.seq_path, n_jobs=5)
    self.assertEqual(batch_size, 221)
def test_autotune_reads_per_batch(self):
    # With 4 jobs the autotuned batch size is expected to be 276.
    batch_size = _autotune_reads_per_batch(self.seq_path, n_jobs=4)
    self.assertEqual(batch_size, 276)
def test_autotune_reads_per_batch_disable_if_single_job(self):
    # With a single job the autotuner is expected to return the fixed
    # value 500000.
    # NOTE(review): this file defines another test with the same name that
    # expects a different value; if both live in one class, only the later
    # definition will run - confirm which expectation is intended.
    batch_size = _autotune_reads_per_batch(self.seq_path, n_jobs=1)
    self.assertEqual(batch_size, 500000)
def test_autotune_reads_per_batch_disable_if_single_job(self):
    # With a single job the autotuner is expected to return the fixed
    # value 20000.
    # NOTE(review): this file defines another test with the same name that
    # expects a different value; if both live in one class, this later
    # definition replaces the earlier one - confirm which is intended.
    batch_size = _autotune_reads_per_batch(self.seq_path, n_jobs=1)
    self.assertEqual(batch_size, 20000)