def test_launch_workflow_incorrect_trim(self):
    """Check that launch_workflow emits a UserWarning when the requested
    trim length is too long.
    """
    # index the 70% rep. set database
    ref_fp = join(self.test_data_dir, '70_otus.fasta')
    ref_db_fp = build_index_sortmerna(ref_fp=(ref_fp, ),
                                      working_dir=self.working_dir)

    # Collect the workflow arguments up front so that only the
    # launch_workflow call itself runs inside the assertWarns context.
    workflow_kwargs = dict(
        seqs_fp=self.seqs_s1_fp,
        working_dir=self.working_dir,
        mean_error=0.005,
        error_dist=get_default_error_profile(),
        indel_prob=0.01,
        indel_max=3,
        # trim length longer than sequences
        trim_length=151,
        left_trim_length=0,
        min_size=2,
        ref_fp=(ref_fp, ),
        ref_db_fp=ref_db_fp,
        threads_per_sample=1,
    )

    with self.assertWarns(UserWarning):
        launch_workflow(**workflow_kwargs)
def test_launch_workflow_skip_trim(self):
    """Run the workflow with trim_length=-1 and compare the fasta file it
    returns against the expected result stored in self.no_trim_res.
    """
    # index the 70% rep. set database
    ref_fp = join(self.test_data_dir, '70_otus.fasta')
    ref_db_fp = build_index_sortmerna(ref_fp=(ref_fp, ),
                                      working_dir=self.working_dir)

    workflow_kwargs = dict(
        seqs_fp=self.seqs_s1_fp,
        working_dir=self.working_dir,
        mean_error=0.005,
        error_dist=get_default_error_profile(),
        indel_prob=0.01,
        indel_max=3,
        # negative trim length; going by the test name this is meant to
        # make the workflow skip trimming -- TODO confirm
        trim_length=-1,
        left_trim_length=0,
        min_size=2,
        ref_fp=(ref_fp, ),
        ref_db_fp=ref_db_fp,
        threads_per_sample=1,
    )
    result_fp = launch_workflow(**workflow_kwargs)

    exp = Sequence.read(self.no_trim_res, format='fasta')
    res = Sequence.read(result_fp, format='fasta')
    self.assertEqual(exp, res)
def run_workflow_try(self, simfilename, origfilename, ref_fp, ref_db_fp,
                     threads=1, trim_length=100):
    """Launch the complete workflow on simulated sequences and check the
    resulting table against the original ground truth.

    Parameters
    ----------
    simfilename : str
        name of the simulated reads fasta file
    origfilename : str
        name of the fasta file with the ground truth sequences
    ref_fp : str
        reference database file (handed to launch_workflow wrapped in a
        one-element tuple)
    ref_db_fp : list of str
        list of the indexed database files or None to create them
    threads : int
        number of threads to use (default=1)
    trim_length : int
        length of sequences to trim to (default=100)
    """
    workdir = self.working_dir
    nochimera_fp = launch_workflow(
        seqs_fp=simfilename,
        working_dir=workdir,
        mean_error=0.005,
        error_dist=get_default_error_profile(),
        indel_prob=0.01,
        indel_max=3,
        trim_length=trim_length,
        min_size=2,
        ref_fp=(ref_fp,),
        ref_db_fp=ref_db_fp,
        negate=False,
        threads_per_sample=threads)

    # ground truth: second field of every record, trimmed and upper-cased
    truth = [record[1] for record in sequence_generator(origfilename)]
    truth = [seq[:trim_length].upper() for seq in truth]

    # build a table from the workflow output and read its observation ids
    table_fp = join(workdir, 'final.biom')
    create_otu_table(table_fp, [(nochimera_fp, simfilename)])
    observed = load_table(table_fp).ids(axis='observation')

    # test we see all ground truth sequences and no other
    self.assertEqual(sorted(observed), sorted(truth))
def run_workflow_try(self, simfilename, origfilename, ref_fp, ref_db_fp,
                     threads=1, trim_length=100):
    """Launch the complete workflow on simulated sequences and compare the
    resulting table to the original ground truth.

    Parameters
    ----------
    simfilename : str
        name of the simulated reads fasta file
    origfilename : str
        name of the fasta file with the ground truth sequences
    ref_fp : str
        reference database file (handed to launch_workflow wrapped in a
        one-element tuple)
    ref_db_fp
        indexed reference database, handed to launch_workflow unchanged
    threads : int, optional
        number of threads handed to launch_workflow (default=1)
    trim_length : int, optional
        length the ground truth sequences are cut to before comparison;
        also handed to launch_workflow (default=100)
    """
    seqs_fp = simfilename
    output_fp = self.working_dir
    read_error = 0.05
    mean_error = 0.005
    error_dist = None
    indel_prob = 0.01
    indel_max = 3
    min_size = 2
    negate = False
    delim = '_'

    # NOTE(review): the arguments are positional, so their order must match
    # the launch_workflow signature exactly -- confirm against its current
    # definition.
    biom_fp = launch_workflow(seqs_fp, output_fp, read_error, mean_error,
                              error_dist, indel_prob, indel_max,
                              trim_length, min_size, (ref_fp, ), ref_db_fp,
                              negate, threads, delim)

    # get the trimmed ground truth sequences
    # The default text mode already applies universal-newline handling; the
    # explicit 'U' flag is rejected by Python >= 3.11.
    with open(origfilename) as f:
        orig_seqs = [item[1] for item in parse_fasta(f)]
    orig_seqs = [item[:trim_length].upper() for item in orig_seqs]

    table_obs = load_table(biom_fp)
    outseqs = table_obs.ids(axis='observation')

    # test we see all ground truth sequences and no other
    # (assertCountEqual is the Python 3 name of assertItemsEqual)
    self.assertCountEqual(outseqs, orig_seqs)