def test_deblur_with_non_default_error_profile(self):
    """Run deblur twice with custom error profiles (list and ndarray).

    Both profiles should collapse the test reads onto the single true
    E. coli sequence.
    """
    # the single expected output sequence is the same for both profiles
    exp = [
        Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")
    ]

    # error profile supplied as a plain python list
    error_dist = [1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005,
                  0.0000025, 0.0000025, 0.0000025, 0.0000025, 0.0000025,
                  0.0000005, 0.0000005, 0.0000005, 0.0000005]
    obs = deblur(sequence_generator(StringIO(TEST_SEQS_2)),
                 error_dist=error_dist)
    self.assertEqual(obs, exp)

    # error profile supplied as a numpy array
    error_dist = np.array([1, 0.06, 0.02, 0.02, 0.01, 0.005, 0.005, 0.005,
                           0.001, 0.001, 0.001, 0.0005])
    obs = deblur(sequence_generator(StringIO(TEST_SEQS_2)),
                 error_dist=error_dist)
    self.assertEqual(obs, exp)
def compare_result(self, simfilename, origfilename, trim_length):
    """Compare the results of deblurring to the original mixture.

    Parameters
    ----------
    simfilename : str
        name of the simulated reads fasta file
    origfilename : str
        name of the fasta file with the ground truth sequences
    trim_length : int
        the length used for trimming the sequences in deblurring
    """
    # trimmed, upper-cased ground-truth sequences
    orig_seqs = sorted(seq[:trim_length].upper()
                       for _, seq in sequence_generator(origfilename))
    # upper-cased deblurred output sequences
    out_seqs = sorted(seq.upper()
                      for _, seq in sequence_generator(simfilename))
    # we must see every ground-truth sequence and nothing else
    self.assertEqual(out_seqs, orig_seqs)
def validate_results(self, table_name, orig_fasta_name):
    """Assert the biom table observations match the trimmed fasta seqs.

    Parameters
    ----------
    table_name : str
        path to the biom table to validate
    orig_fasta_name : str
        path to the fasta file holding the expected sequences
    """
    res_table = load_table(table_name)
    res_seqs = list(res_table.ids(axis='observation'))
    # expected sequences, upper-cased and trimmed like the pipeline does
    exp_seqs = [seq.upper()[:self.trim_length]
                for _, seq in sequence_generator(orig_fasta_name)]
    self.assertListEqual(res_seqs, exp_seqs)
def test_remove_artifacts_from_biom_table(self):
    """Test remove_artifacts_from_biom_table() on a pre-calculated table.

    Removes non-16s sequences from a biom table and matching fasta file,
    then checks the only-16s and only-artifacts output tables.
    The s4 dataset is similar to s2 but with two added non-16s
    sequences (which are not phix/adapter).
    """
    # build the positive reference database index
    pos_ref_fp = join(self.test_data_dir, '70_otus.fasta')
    pos_ref_db_fp = build_index_sortmerna(ref_fp=(pos_ref_fp, ),
                                          working_dir=self.working_dir)

    # strip the artifacts out of the s4 biom table
    input_biom_file = join(self.test_data_dir, 'final.s4.biom')
    input_fasta_file = join(self.test_data_dir, 'final.s4.seqs.fa')
    remove_artifacts_from_biom_table(input_biom_file, input_fasta_file,
                                     [pos_ref_fp], self.working_dir,
                                     pos_ref_db_fp)

    # ground-truth 16s sequences (trimmed + upper-cased)
    origfilename = join(self.test_data_dir, 'simset.s2.fasta')
    trim_length = 150
    orig_seqs = [seq[:trim_length].upper()
                 for _, seq in sequence_generator(origfilename)]

    # the only-16s table should hold exactly the ground-truth sequences
    no_artifacts_table = load_table(
        join(self.working_dir, 'final.only-16s.biom'))
    obs_seqs = no_artifacts_table.ids(axis='observation')
    self.assertEqual(set(obs_seqs), set(orig_seqs))

    # the artifacts table should hold the two added non-16s sequences
    artifacts_table = load_table(
        join(self.working_dir, 'final.only-non16s.biom'))
    obs_seqs = artifacts_table.ids(axis='observation')
    self.assertEqual(len(obs_seqs), 2)
def test_multiple_sequence_alignment(self):
    """Test multiple sequence alignment of five similar DNA sequences."""
    # (id, sequence) pairs used to build the input records
    raw = [('seq_1', 'caccggcggcccggtggtggccattattattgggtctaaag'),
           ('seq_2', 'caccggcggcccgagtggtggccattattattgggtcaagg'),
           ('seq_3', 'caccggcggcccgagtgatggccattattattgggtctaaag'),
           ('seq_4', 'aaccggcggcccaagtggtggccattattattgggtctaaag'),
           ('seq_5', 'caccgggcccgagtggtggccattattattgggtctaaag')]
    seqs = [DNA(seq, metadata={'id': sid}, lowercase=True)
            for sid, seq in raw]

    # write the records to a fasta file for the aligner
    seqs_fp = join(self.working_dir, "seqs.fna")
    with open(seqs_fp, 'w') as out_f:
        for seq in seqs:
            seq.write(out_f, format='fasta')

    alignment_file = multiple_sequence_alignment(seqs_fp)
    aligned_seqs = [DNA(seq, metadata={'id': sid}, lowercase=True)
                    for sid, seq in sequence_generator(alignment_file)]

    # expected gapped alignment
    exp_raw = [('seq_1', 'caccggcggcccg-gtggtggccattattattgggtctaaag'),
               ('seq_2', 'caccggcggcccgagtggtggccattattattgggtcaagg-'),
               ('seq_3', 'caccggcggcccgagtgatggccattattattgggtctaaag'),
               ('seq_4', 'aaccggcggcccaagtggtggccattattattgggtctaaag'),
               ('seq_5', 'caccg--ggcccgagtggtggccattattattgggtctaaag')]
    align_exp = [DNA(seq, lowercase=True, metadata={'id': sid})
                 for sid, seq in exp_raw]
    self.assertEqual(aligned_seqs, align_exp)
def get_seqs_act_split_sequence_on_sample_ids(self, output_dir):
    """Parse output of split_sequence_file_on_sample_ids_to_files()

    Parameters
    ----------
    output_dir: string
        output directory path storing FASTA files

    Returns
    -------
    seqs_act: dict
        dictionary with keys being sample IDs and values list of
        sequences belonging to sample ID
    """
    seqs_act = {}
    for fn in listdir(output_dir):
        input_fp = join(output_dir, fn)
        # the file name (sans extension) encodes the sample ID
        sample_file = splitext(fn)[0]
        for label, seq in sequence_generator(input_fp):
            sample = label.split('_')[0]
            # every record in the file must belong to that sample
            self.assertEqual(sample_file, sample)
            seqs_act.setdefault(sample, []).append((label, seq))
    return seqs_act
def test_deblur_indel_simple(self):
    """Test that deblur also removes a simple indel sequence.

    NOTE(review): this method was originally named ``test_deblur_indel``,
    identical to a later method in this class; the later definition
    shadowed this one, so this test was silently never executed.
    Renamed so both tests run.
    """
    seqs_f = StringIO(TEST_SEQS_2)

    # add the MSA gap column for the indel
    seqs = sequence_generator(seqs_f)
    newseqs = []
    for chead, cseq in seqs:
        tseq = cseq[:10] + '-' + cseq[10:]
        newseqs.append((chead, tseq))

    # now add a sequence with an A insertion
    # (chead/cseq carry the last record from the loop above)
    tseq = cseq[:10] + 'A' + cseq[10:-1] + '-'
    newseqs.append((chead, tseq))

    obs = deblur(newseqs)

    # remove the '-' (same as in launch_workflow)
    for s in obs:
        s.sequence = s.sequence.replace('-', '')

    # the expected output
    exp = [
        Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")
    ]
    # make sure we get 1 sequence as output
    self.assertEqual(len(obs), 1)
    # and that it is the correct sequence
    self.assertEqual(obs[0].sequence, exp[0].sequence)
def test_dereplicate_seqs(self):
    """ Test dereplicate_seqs() method functionality,
        keep singletons
    """
    # input records: three copies of seq1, two of seq6, rest unique
    seqs = [("seq1", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
            ("seq2", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
            ("seq3", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
            ("seq4", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCT"),
            ("seq5", "TACCAGCCCCTTAAGTGGTAGGGACGATTATTTGGCCTAAAGCGTCCG"),
            ("seq6", "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT"),
            ("seq7", "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT")]
    seqs_fp = join(self.working_dir, "seqs.fasta")
    with open(seqs_fp, 'w') as seqs_f:
        seqs_f.writelines(">%s\n%s\n" % rec for rec in seqs)

    output_fp = join(self.working_dir, "seqs_derep.fasta")
    # min_size=1 keeps singleton clusters
    dereplicate_seqs(seqs_fp=seqs_fp,
                     output_fp=output_fp,
                     min_size=1)
    self.assertTrue(isfile(output_fp))

    # expected dereplicated records with size annotations
    exp = [("seq1;size=3",
            "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
           ("seq6;size=2",
            "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT"),
           ("seq4;size=1",
            "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCT"),
           ("seq5;size=1",
            "TACCAGCCCCTTAAGTGGTAGGGACGATTATTTGGCCTAAAGCGTCCG")]
    act = list(sequence_generator(output_fp))
    self.assertEqual(act, exp)
def run_workflow_try(self, simfilename, origfilename, ref_fp, ref_db_fp, threads=1, trim_length=100):
    """Test launching the complete workflow using simulated sequences
    and compare to original ground truth.

    Parameters
    ----------
    simfilename : str
        name of the simulated reads fasta file
    origfilename : str
        name of the fasta file with the ground truth sequences
    ref_fp : list of str
        list of the reference database files
    def_db_fp : list of str
        list of the indexed database files or None to create them
    threads : int
        number of threads to use (default=1)
    trim_length : int
        length of sequences to trim to (default=100)
    """
    # run the full pipeline with the default parameters
    nochimera = launch_workflow(seqs_fp=simfilename,
                                working_dir=self.working_dir,
                                mean_error=0.005,
                                error_dist=get_default_error_profile(),
                                indel_prob=0.01,
                                indel_max=3,
                                trim_length=trim_length,
                                min_size=2,
                                ref_fp=(ref_fp,),
                                ref_db_fp=ref_db_fp,
                                negate=False,
                                threads_per_sample=threads)

    # trimmed, upper-cased ground-truth sequences, sorted for comparison
    orig_seqs = sorted(seq[:trim_length].upper()
                       for _, seq in sequence_generator(origfilename))

    # build the otu table from the workflow output
    output_table_fp = join(self.working_dir, 'final.biom')
    create_otu_table(output_table_fp, [(nochimera, simfilename)])
    table_obs = load_table(output_table_fp)
    outseqs = sorted(table_obs.ids(axis='observation'))

    # test we see all ground truth sequences and no other
    self.assertEqual(outseqs, orig_seqs)
def test_sequence_generator_invalid_format(self):
    """A non-FASTA/FASTQ input yields no records and raises a warning."""
    input_fp = join(self.test_data_dir, 'readme.txt')
    msg = "input file %s does not appear to be FASTA or FASTQ" % input_fp
    with self.assertWarns(UserWarning) as W:
        allres = list(sequence_generator(input_fp))
    # nothing should be produced from the unparseable file
    self.assertEqual(len(allres), 0)
    # and the warning should name the offending file
    self.assertEqual(str(W.warning), msg)
def test_sequence_generator_fastq(self):
    """Read a FASTQ file and check record count and first record."""
    records = list(sequence_generator(self.seqs_fq_fp))
    self.assertEqual(len(records), 2)
    first_label, first_seq = records[0]
    self.assertEqual(first_label, "foo")
    self.assertTrue(first_seq.startswith("GCgc"))
def test_sequence_generator_fasta(self):
    """Read a FASTA file and check record count and first record."""
    records = list(sequence_generator(self.seqs_s1_fp))
    self.assertEqual(len(records), 135)
    first_label, first_seq = records[0]
    self.assertEqual(first_label, "s1_1001203-10")
    self.assertTrue(first_seq.startswith("TACGTAGGTGGCAAGCGTTA"))
def test_fasta_from_biom(self):
    '''Test the fasta file from biom table function fasta_from_biom()
    '''
    table = load_table(join(self.test_data_dir, 'final.s4.biom'))
    output_fasta = join(self.working_dir, 'final.s4.seqs.fa')

    fasta_from_biom(table, output_fasta)
    self.assertTrue(isfile(output_fasta))

    # observation ids double as both label and sequence in the output
    expected = [(seq, seq) for seq in table.ids(axis='observation')]
    written_seqs = list(sequence_generator(output_fasta))
    self.assertListEqual(written_seqs, expected)
def test_deblur_toy_example(self):
    """Deblur a toy read set down to the single true sequence."""
    obs = deblur(sequence_generator(StringIO(TEST_SEQS_1)))
    # only the true E. coli sequence should survive
    exp = [
        Sequence(
            "E.Coli;size=1000;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")
    ]
    self.assertEqual(obs, exp)
def test_remove_artifacts_seqs(self):
    """ Test remove_artifacts_seqs() function for removing
        sequences not matching to a reference database
        using SortMeRNA. This test forces a new index
        construction for the reference sequences.
    """
    # query reads: six 16s-like sequences plus three phix sequences
    seqs = [("seq1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq2", "CCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("seq3", "TCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCC"),
            ("seq4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq5", "CTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATAGGGTC"),
            ("seq6", "TTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAAT"),
            ("phix1", "TCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCC"),
            ("phix2", "CTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGC"),
            ("phix3", "GCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAA")]
    # only the 16s-like sequences should pass the filter
    exp_seqs = ["seq1", "seq2", "seq3", "seq4", "seq5", "seq6"]
    seqs_fp = join(self.working_dir, "seqs.fasta")
    with open(seqs_fp, 'w') as seqs_f:
        seqs_f.writelines(">%s\n%s\n" % rec for rec in seqs)

    # reference database records
    ref = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
                    "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
           ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
    ref_fp = join(self.working_dir, "ref2.fasta")
    with open(ref_fp, 'w') as ref_f:
        ref_f.writelines(">%s\n%s\n" % rec for rec in ref)
    self.files_to_remove.append(ref_fp)

    # force construction of a fresh sortmerna index
    ref_db_fp = build_index_sortmerna(ref_fp=(ref_fp, ),
                                      working_dir=self.working_dir)

    output_fp, num_seqs_left, tmp_files = remove_artifacts_seqs(
        seqs_fp=seqs_fp, ref_fp=(ref_fp, ), working_dir=self.working_dir,
        ref_db_fp=ref_db_fp, negate=False, threads=1)

    obs_seqs = [label for label, _ in sequence_generator(output_fp)]
    self.assertEqual(obs_seqs, exp_seqs)
    # validate it creates one tmp file
    self.assertEqual(len(tmp_files), 1)
def test_remove_artifacts_seqs_negate(self):
    """ Test remove_artifacts_seqs() function for removing
        sequences matching to a reference database
        using SortMeRNA.
    """
    # query reads: six 16s-like sequences plus three phix sequences
    seqs = [("seq1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq2", "CCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("seq3", "TCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCC"),
            ("seq4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq5", "CTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATAGGGTC"),
            ("seq6", "TTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAAT"),
            ("phix1", "TCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCC"),
            ("phix2", "CTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGC"),
            ("phix3", "GCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAA")]
    # seq5 is 80% similar, so should be kept for 0.95 default similarity
    # to artifacts
    exp_seqs = ["seq5", "phix1", "phix2", "phix3"]
    seqs_fp = join(self.working_dir, "seqs.fasta")
    with open(seqs_fp, 'w') as seqs_f:
        seqs_f.writelines(">%s\n%s\n" % rec for rec in seqs)

    # reference database records
    ref = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
                    "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
           ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
    ref_fp = join(self.working_dir, "ref4.fasta")
    with open(ref_fp, 'w') as ref_f:
        ref_f.writelines(">%s\n%s\n" % rec for rec in ref)
    self.files_to_remove.append(ref_fp)

    ref_db_fp = build_index_sortmerna([ref_fp], self.working_dir)

    output_fp = join(self.working_dir, "seqs_filtered.fasta")
    # negate=True keeps reads that do NOT match the reference
    output_fp, num_seqs_left, _ = remove_artifacts_seqs(
        seqs_fp=seqs_fp, ref_fp=(ref_fp, ), working_dir=self.working_dir,
        ref_db_fp=ref_db_fp, negate=True, threads=1)

    obs_seqs = [label for label, _ in sequence_generator(output_fp)]
    self.assertEqual(obs_seqs, exp_seqs)
def test_remove_chimeras_denovo_from_seqs(self):
    """ Test remove_chimeras_denovo_from_seqs() method functionality.
        Remove chimeric sequences from a FASTA file using the UCHIME
        algorithm, implemented in VSEARCH.
    """
    # input records: the two size=2 reads are the chimeras
    seqs = [("s1_104;size=2;",
             "GTGCCAGCCGCCGCGGTAATACCCGCAGCTCAAGTGGTG"
             "GTCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTT"
             "GTAAATCCCTGGGTAAATCGGGAAGCTTAACTTTCCGAC"
             "TTCCGAGGAGACTGTCAAACTTGGGACCGGGAG"),
            ("s1_106;size=2;",
             "GTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTG"
             "TGGATGTTTATTGGGCCTAAAGCGTCCGTAGCCGGCTGC"
             "GCAAGTCTGTCGGGAAATCCGCACGCCTAACGTGCGGGC"
             "GTCCGGCGGAAACTGCGTGGCTTGGGACCGGAA"),
            ("s1_1;size=9;",
             "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAA"
             "ACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"
             "CGCTTAACGATCCGATTCTGGGGAGACTGCAAAGCTTGGGA"
             "CCGGGCGAGGTTAGAGGTACTCTCGGG"),
            ("s1_20;size=9;",
             "TACCTGCAGCCCAAGTGGTGGTCGATTTTATTGAGTCTAA"
             "AACGTTCGTAGCCGGTTTGATAAATCCTTGGGTAAATCGG"
             "GAAGCTTAACTTTCCGATTCCGAGGAGACTGTCAAACTTG"
             "GGACCGGGAGAGGCTAGAGGTACTTCTGGG"),
            ("s1_40;size=8;",
             "TACCAGCTCTCCGAGTGGTGTGGATGTTTATTGGGCCTAA"
             "AGCATCCGTAGCTGGCTAGGTTAGTCCCCTGTTAAATCCA"
             "CCGAATTAATCGTTGGATGCGGGGGATACTGCTTGGCTAG"
             "GGGACGAGAGAGGCAGACGGTATTTCCGGG"),
            ("s1_60;size=8;",
             "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAA"
             "AGCGTCCGTAGCCGGCTGCGCAAGTCTGTCGGGAAATCCG"
             "CACGCCTAACGTGCGGGTCCGGCGGAAACTGCGTGGCTTG"
             "GGACCGGAAGACTCGAGGGGTACGTCAGGG")]
    # labels expected to survive chimera removal
    seqs_non_chimera = [
        "s1_1;size=9;", "s1_20;size=9;",
        "s1_40;size=8;", "s1_60;size=8;"
    ]
    seqs_fp = join(self.working_dir, "seqs.fasta")
    with open(seqs_fp, 'w') as seqs_f:
        seqs_f.writelines(">%s\n%s\n" % rec for rec in seqs)

    output_fp = remove_chimeras_denovo_from_seqs(
        seqs_fp=seqs_fp,
        working_dir=self.working_dir)
    # keep only the id portion of each output label
    seqs_obs = [label.split()[0]
                for label, _ in sequence_generator(output_fp)]
    self.assertEqual(seqs_non_chimera, seqs_obs)
def test_deblur_indel(self):
    """Test if also removes indel sequences
    """
    # build the MSA: insert one gap column into every read
    reads = sequence_generator(StringIO(TEST_SEQS_2))
    newseqs = []
    for header, seq in reads:
        newseqs.append((header, seq[:10] + '-' + seq[10:]))

    # add a read with an A insertion at the expected frequency
    # (30 < 0.02 * (720 / 0.47) where 0.47 is the mod_factor)
    # so it should be removed
    template = newseqs[0][1]
    inserted = template[:10] + 'A' + template[11:-1] + '-'
    newseqs.append(('>indel1-read;size=30;', inserted))

    # add a read with an A insertion at a higher frequency than the
    # indel upper bound allows (31 > 0.02 * (720 / 0.47))
    # so it should NOT be removed
    template = newseqs[0][1]
    inserted = template[:10] + 'A' + template[11:-1] + '-'
    newseqs.append(('>indel2-read;size=31;', inserted))

    obs = deblur(newseqs)
    # remove the '-' (same as in launch_workflow)
    for s in obs:
        s.sequence = s.sequence.replace('-', '')

    # the expected output
    exp = [
        Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")
    ]
    # expect 2 output sequences - the original and indel2 (too many
    # reads for the expected indel probability)
    self.assertEqual(len(obs), 2)
    # and that they are the correct sequences
    self.assertEqual(obs[0].sequence, exp[0].sequence)
    self.assertEqual(obs[1].label, '>indel2-read;size=31;')
def test_create_otu_table(self):
    """Test create_otu_table() merging, fasta output and minreads filter.

    Merges two deblurred fasta files into one biom table, checks
    per-sample counts for shared and sample-unique sequences, checks the
    optional fasta output, and checks the minreads filtering.
    """
    # merge the fasta files
    m1 = join(
        self.test_data_dir,
        'testmerge.fasta.trim.derep.no_artifacts'
        '.msa.deblur.no_chimeras')
    m2 = join(
        self.test_data_dir,
        'testmerge2.fasta.trim.derep.no_artifacts'
        '.msa.deblur.no_chimeras')
    outfile = join(self.working_dir, 'testmerge.biom')
    fasta_outfile = join(self.working_dir, 'testmerge.seq.fa')
    create_otu_table(outfile, [(m1, 'testmerge'), (m2, 'testmerge2')],
                     outputfasta_fp=fasta_outfile)

    # test the result
    table = load_table(outfile)
    tableids = table.ids(axis='observation')
    # test a sequence present in both
    self.assertEqual(
        table.get_value_by_ids(
            'TACGAGGGGGGCGAGCGTTGTTCGGAATTATTGGGCGTAAAAGGTGCGTAGGCGGTTCG'
            'GTAAGTTTCGTGTGAAATCTTCGGGCTCAACTCGAAGCCTGCACGAAATACTGCCGGGC'
            'TTGAGTGTGGGAGAGGTGAGTGGAATTTCCGGT', 'testmerge'), 5)
    self.assertEqual(
        table.get_value_by_ids(
            'TACGAGGGGGGCGAGCGTTGTTCG'
            'GAATTATTGGGCGTAAAAGGTGCGTAGGCGGTTCGGTAAGTTTCGTGTGAAATCTTCGGG'
            'CTCAACTCGAAGCCTGCACGAAATACTGCCGGGCTTGAGTGTGGGAGAGGTGAGTGGAAT'
            'TTCCGGT', 'testmerge2'), 8)
    # and an otu present only in one
    self.assertEqual(
        table.get_value_by_ids(
            'TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGAGCGTAGGCGGTTTCTT'
            'AAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGGAACTTGA'
            'GTGCAGAAGAGGAGAGTGGAATTCCATGT', 'testmerge'), 7)
    self.assertEqual(
        table.get_value_by_ids(
            'TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGAGCGTAGGCGGTTTCTTA'
            'AGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGGAACTTGAGT'
            'GCAGAAGAGGAGAGTGGAATTCCATGT', 'testmerge2'), 0)

    # test the output fasta file: every written sequence must be a table
    # observation id, and every observation must be written exactly once
    allseqs = []
    for label, seq in sequence_generator(fasta_outfile):
        self.assertTrue(seq in tableids)
        allseqs.append(seq)
    self.assertEqual(len(allseqs), len(tableids))

    # test minimal read filtering ( minreads>0 )
    minreads = 7
    outfile2 = join(self.working_dir, 'testmerge2.biom')
    create_otu_table(outfile2, [(m1, 'testmerge'), (m2, 'testmerge2')],
                     minreads=minreads)
    table2 = load_table(outfile2)
    table2ids = table2.ids(axis='observation')
    # observations at or above minreads total are kept, the rest dropped
    tablesum = table.sum(axis='observation')
    for idx, cid in enumerate(table.ids(axis='observation')):
        if tablesum[idx] >= minreads:
            self.assertIn(cid, table2ids)
        else:
            self.assertNotIn(cid, table2ids)
    self.assertEqual(
        table2.get_value_by_ids(
            'TACGAGGGGGGCGAGCGTTGTTCG'
            'GAATTATTGGGCGTAAAAGGTGCGTAGGCGGTTCGGTAAGTTTCGTGTGAAATCTTCGGG'
            'CTCAACTCGAAGCCTGCACGAAATACTGCCGGGCTTGAGTGTGGGAGAGGTGAGTGGAAT'
            'TTCCGGT', 'testmerge2'), 8)
    # and an otu present only in one
    self.assertEqual(
        table2.get_value_by_ids(
            'TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGAGCGTAGGCGGTTTCTT'
            'AAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGGAACTTGA'
            'GTGCAGAAGAGGAGAGTGGAATTCCATGT', 'testmerge'), 7)