Example 1
    def test_deblur_with_non_default_error_profile(self):
        error_dist = [
            1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005, 0.0000025,
            0.0000025, 0.0000025, 0.0000025, 0.0000025, 0.0000005, 0.0000005,
            0.0000005, 0.0000005
        ]
        seqs_f = StringIO(TEST_SEQS_2)

        obs = deblur(sequence_generator(seqs_f), error_dist=error_dist)
        exp = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]

        self.assertEqual(obs, exp)

        error_dist = np.array([
            1, 0.06, 0.02, 0.02, 0.01, 0.005, 0.005, 0.005, 0.001, 0.001,
            0.001, 0.0005
        ])
        seqs_f = StringIO(TEST_SEQS_2)
        obs = deblur(sequence_generator(seqs_f), error_dist=error_dist)
        exp = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]

        self.assertEqual(obs, exp)
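
A note on the profiles above: error_dist may be a plain list (first call) or a numpy array (second call), and the second profile appears to match the values deblur documents as its default error profile. Below is a minimal, hedged sketch that checks a hand-written profile against the library default; the import path and the exact default values are assumptions, not something this test asserts.

# Hedged sketch: compare a custom per-error-count profile with the packaged
# default before handing it to deblur(). Assumption: get_default_error_profile()
# lives in deblur.deblurring and returns a plain list of probabilities.
import numpy as np
from deblur.deblurring import get_default_error_profile

custom_dist = np.array([
    1, 0.06, 0.02, 0.02, 0.01, 0.005, 0.005, 0.005, 0.001, 0.001,
    0.001, 0.0005
])
# True only if the custom profile reproduces the packaged default exactly
print(custom_dist.tolist() == list(get_default_error_profile()))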
Example 2
    def compare_result(self, simfilename, origfilename, trim_length):
        """Compare the results of deblurring to the original mixture

        Parameters
        ----------
        simfilename : str
            name of the simulated reads fasta file
        origfilename : str
            name of the fasta file with the ground truth sequences
        trim_length : int
            the length used for trimming the sequences in deblurring
        """

        # get the trimmed ground truth sequences
        orig_seqs = [item[1] for item in sequence_generator(origfilename)]
        orig_seqs = [item[:trim_length].upper() for item in orig_seqs]

        # get the deblurred fasta file sequences
        out_seqs = [item[1] for item in sequence_generator(simfilename)]
        out_seqs = [item.upper() for item in out_seqs]

        out_seqs.sort()
        orig_seqs.sort()

        # test we see all ground truth sequences and no other
        self.assertEqual(out_seqs, orig_seqs)
Example 3
    def validate_results(self, table_name, orig_fasta_name):
        res_table = load_table(table_name)

        res_seqs = list(res_table.ids(axis='observation'))
        exp_seqs = [item[1] for item in sequence_generator(orig_fasta_name)]
        exp_seqs = list(map(lambda x: x.upper()[:self.trim_length], exp_seqs))
        self.assertListEqual(res_seqs, exp_seqs)
Example 4
    def test_remove_artifacts_from_biom_table(self):
        """ Test remove_artifacts_from_biom_table() function for
        removing non 16s sequences from a biom table and matching
        fasta file. This test uses a pre-calculated biom table and
        fasta file and tests the output only 16s and only artifacts
        tables
        s4 dataset is similar to s2 but with two added non-16s
        sequences (which are not phix/adapter)
        """
        # create the positive reference databases
        pos_ref_fp = join(self.test_data_dir, '70_otus.fasta')
        pos_ref_db_fp = build_index_sortmerna(ref_fp=(pos_ref_fp, ),
                                              working_dir=self.working_dir)

        # remove the artifacts from the s4 biom table
        input_biom_file = join(self.test_data_dir, 'final.s4.biom')
        input_fasta_file = join(self.test_data_dir, 'final.s4.seqs.fa')
        remove_artifacts_from_biom_table(input_biom_file, input_fasta_file,
                                         [pos_ref_fp], self.working_dir,
                                         pos_ref_db_fp)

        origfilename = join(self.test_data_dir, 'simset.s2.fasta')
        trim_length = 150
        orig_seqs = [item[1] for item in sequence_generator(origfilename)]
        orig_seqs = [item[:trim_length].upper() for item in orig_seqs]

        no_artifacts_table_name = join(self.working_dir, 'final.only-16s.biom')
        no_artifacts_table = load_table(no_artifacts_table_name)
        obs_seqs = no_artifacts_table.ids(axis='observation')
        self.assertEqual(set(obs_seqs), set(orig_seqs))

        artifacts_table_name = join(self.working_dir, 'final.only-non16s.biom')
        artifacts_table = load_table(artifacts_table_name)
        obs_seqs = artifacts_table.ids(axis='observation')
        self.assertEqual(len(obs_seqs), 2)
Example 5
    def test_multiple_sequence_alignment(self):
        """Test multiple sequence alignment.
        """
        seqs = [DNA('caccggcggcccggtggtggccattattattgggtctaaag',
                    metadata={'id': 'seq_1'}, lowercase=True),
                DNA('caccggcggcccgagtggtggccattattattgggtcaagg',
                    metadata={'id': 'seq_2'}, lowercase=True),
                DNA('caccggcggcccgagtgatggccattattattgggtctaaag',
                    metadata={'id': 'seq_3'}, lowercase=True),
                DNA('aaccggcggcccaagtggtggccattattattgggtctaaag',
                    metadata={'id': 'seq_4'}, lowercase=True),
                DNA('caccgggcccgagtggtggccattattattgggtctaaag',
                    metadata={'id': 'seq_5'}, lowercase=True)]

        seqs_fp = join(self.working_dir, "seqs.fna")
        with open(seqs_fp, 'w') as o:
            for seq in seqs:
                seq.write(o, format='fasta')
        alignment_file = multiple_sequence_alignment(seqs_fp)
        aligned_seqs = [DNA(item[1], metadata={'id': item[0]}, lowercase=True)
                        for item in sequence_generator(alignment_file)]

        align_exp = [
            DNA('caccggcggcccg-gtggtggccattattattgggtctaaag',
                lowercase=True, metadata={'id': 'seq_1'}),
            DNA('caccggcggcccgagtggtggccattattattgggtcaagg-',
                lowercase=True, metadata={'id': 'seq_2'}),
            DNA('caccggcggcccgagtgatggccattattattgggtctaaag',
                lowercase=True, metadata={'id': 'seq_3'}),
            DNA('aaccggcggcccaagtggtggccattattattgggtctaaag',
                lowercase=True, metadata={'id': 'seq_4'}),
            DNA('caccg--ggcccgagtggtggccattattattgggtctaaag',
                lowercase=True, metadata={'id': 'seq_5'})]
        self.assertEqual(aligned_seqs, align_exp)
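
A quick sanity check one can run on an alignment result such as align_exp above: every aligned row has the same length, and stripping the '-' gap characters recovers the unaligned inputs. This is a plain-string sketch for illustration (it mirrors the gap removal done elsewhere in these tests), not part of the deblur API.

# Hedged sketch: basic invariants of a multiple sequence alignment, using the
# gapped rows for seq_1, seq_2 and seq_5 from the expected output above.
aligned = [
    'caccggcggcccg-gtggtggccattattattgggtctaaag',   # seq_1
    'caccggcggcccgagtggtggccattattattgggtcaagg-',   # seq_2
    'caccg--ggcccgagtggtggccattattattgggtctaaag',   # seq_5
]
originals = [
    'caccggcggcccggtggtggccattattattgggtctaaag',    # seq_1 input
    'caccggcggcccgagtggtggccattattattgggtcaagg',    # seq_2 input
    'caccgggcccgagtggtggccattattattgggtctaaag',     # seq_5 input
]
# every aligned row has the same length
assert len({len(row) for row in aligned}) == 1
# removing the gaps recovers the original, unaligned sequences
assert [row.replace('-', '') for row in aligned] == originals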
Example 6
    def get_seqs_act_split_sequence_on_sample_ids(self, output_dir):
        """Parse output of split_sequence_file_on_sample_ids_to_files()

        Parameters
        ----------
        output_dir: string
            output directory path storing FASTA files

        Returns
        -------
        seqs_act: dict
            dictionary with sample IDs as keys and lists of
            (label, sequence) tuples belonging to that sample as values
        """
        seqs_act = {}
        for fn in listdir(output_dir):
            input_fp = join(output_dir, fn)
            sample_file = splitext(fn)[0]
            for label, seq in sequence_generator(input_fp):
                sample = label.split('_')[0]
                self.assertEqual(sample_file, sample)
                if sample not in seqs_act:
                    seqs_act[sample] = [(label, seq)]
                else:
                    seqs_act[sample].append((label, seq))
        return seqs_act
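
The grouping above relies on the deblur label convention in which the sample ID is everything before the first underscore (for example, the label "s1_1001203-10" from the fasta test data belongs to sample "s1"). A hedged sketch of that split in isolation, with hypothetical labels:

# Hedged sketch: group read labels by sample ID, where the sample ID is the
# part of the label before the first '_' (the same split used in the test).
from collections import defaultdict

labels = ['s1_1001203-10', 's1_1001203-11', 's2_42']   # hypothetical labels
by_sample = defaultdict(list)
for label in labels:
    by_sample[label.split('_')[0]].append(label)

print(dict(by_sample))
# {'s1': ['s1_1001203-10', 's1_1001203-11'], 's2': ['s2_42']}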
Example 7
    def test_deblur_indel(self):
        """Test if also removes indel sequences
        """
        seqs_f = StringIO(TEST_SEQS_2)

        # add the MSA for the indel
        seqs = sequence_generator(seqs_f)
        newseqs = []
        for chead, cseq in seqs:
            tseq = cseq[:10] + '-' + cseq[10:]
            newseqs.append((chead, tseq))
        # now add a sequence with an A insertion
        tseq = cseq[:10] + 'A' + cseq[10:-1] + '-'
        newseqs.append((chead, tseq))

        obs = deblur(newseqs)
        # remove the '-' (same as in launch_workflow)
        for s in obs:
            s.sequence = s.sequence.replace('-', '')

        # the expected output
        exp = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]
        # make sure we get 1 sequence as output
        self.assertEqual(len(obs), 1)
        # and that it is the correct sequence
        self.assertEqual(obs[0].sequence, exp[0].sequence)
Example 8
    def test_dereplicate_seqs(self):
        """ Test dereplicate_seqs() method functionality,
            keep singletons
        """
        seqs = [("seq1", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
                ("seq2", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
                ("seq3", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
                ("seq4", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCT"),
                ("seq5", "TACCAGCCCCTTAAGTGGTAGGGACGATTATTTGGCCTAAAGCGTCCG"),
                ("seq6", "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT"),
                ("seq7", "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT")]
        seqs_fp = join(self.working_dir, "seqs.fasta")
        with open(seqs_fp, 'w') as seqs_f:
            for seq in seqs:
                seqs_f.write(">%s\n%s\n" % seq)

        output_fp = join(self.working_dir, "seqs_derep.fasta")

        dereplicate_seqs(seqs_fp=seqs_fp, output_fp=output_fp, min_size=1)
        self.assertTrue(isfile(output_fp))

        exp = [("seq1;size=3",
                "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
               ("seq6;size=2",
                "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT"),
               ("seq4;size=1",
                "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCT"),
               ("seq5;size=1",
                "TACCAGCCCCTTAAGTGGTAGGGACGATTATTTGGCCTAAAGCGTCCG")]

        act = [item for item in sequence_generator(output_fp)]

        self.assertEqual(act, exp)
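
The ';size=N' annotations in the expected output record how many identical reads collapsed into each dereplicated record. The sketch below reproduces only that counting step in plain Python (it is not the vsearch-backed dereplicate_seqs() itself), applied to the same seven input sequences:

# Hedged sketch: count identical sequences the way the expected ';size=N'
# labels describe; bookkeeping only, not dereplicate_seqs() itself.
from collections import Counter

seqs = [
    "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG",  # seq1
    "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG",  # seq2
    "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG",  # seq3
    "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCT",  # seq4
    "TACCAGCCCCTTAAGTGGTAGGGACGATTATTTGGCCTAAAGCGTCCG",  # seq5
    "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT",  # seq6
    "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT",  # seq7
]
sizes = Counter(seqs).most_common()
# counts come out as 3 (seq1), 2 (seq6), 1 (seq4), 1 (seq5),
# matching the size= annotations in exp above
print(sizes)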
Example 9
    def run_workflow_try(self, simfilename, origfilename,
                         ref_fp, ref_db_fp, threads=1,
                         trim_length=100):
        """Test launching the complete workflow using simulated sequences
        and compare to original ground truth.

        Parameters
        ----------
        simfilename : str
            name of the simulated reads fasta file
        origfilename : str
            name of the fasta file with the ground truth sequences
        ref_fp : list of str
            list of the reference database files
        ref_db_fp : list of str
            list of the indexed database files, or None to create them
        threads : int
            number of threads to use (default=1)
        trim_length : int
            length of sequences to trim to (default=100)
        """
        seqs_fp = simfilename
        output_fp = self.working_dir
        mean_error = 0.005
        error_dist = get_default_error_profile()
        indel_prob = 0.01
        indel_max = 3
        min_size = 2
        negate = False
        nochimera = launch_workflow(seqs_fp=seqs_fp, working_dir=output_fp,
                                    mean_error=mean_error,
                                    error_dist=error_dist,
                                    indel_prob=indel_prob,
                                    indel_max=indel_max,
                                    trim_length=trim_length,
                                    min_size=min_size,
                                    ref_fp=(ref_fp,),
                                    ref_db_fp=ref_db_fp,
                                    negate=negate,
                                    threads_per_sample=threads)

        # get the trimmed ground truth sequences
        orig_seqs = [item[1] for item in sequence_generator(origfilename)]
        orig_seqs = [item[:trim_length].upper() for item in orig_seqs]

        output_filename = 'final.biom'
        output_table_fp = join(output_fp, output_filename)

        create_otu_table(output_table_fp, [(nochimera, seqs_fp)])

        table_obs = load_table(output_table_fp)
        outseqs = table_obs.ids(axis='observation')
        outseqs = list(outseqs)
        outseqs.sort()
        orig_seqs.sort()

        # test we see all ground truth sequences and no other
        self.assertEqual(outseqs, orig_seqs)
Example 10
 def test_sequence_generator_invalid_format(self):
     allres = []
     input_fp = join(self.test_data_dir, 'readme.txt')
     msg = "input file %s does not appear to be FASTA or FASTQ" % input_fp
     with self.assertWarns(UserWarning) as W:
         for res in sequence_generator(input_fp):
             allres.append(res)
     self.assertEqual(len(allres), 0)
     self.assertEqual(str(W.warning), msg)
Example 11
    def test_sequence_generator_fastq(self):
        exp_len = 2
        first_id = "foo"
        first_few_nucs = "GCgc"

        obs = list(sequence_generator(self.seqs_fq_fp))
        self.assertEqual(len(obs), exp_len)
        self.assertEqual(obs[0][0], first_id)
        self.assertTrue(obs[0][1].startswith(first_few_nucs))
Example 12
    def test_sequence_generator_fasta(self):
        exp_len = 135
        first_id = "s1_1001203-10"
        first_few_nucs = "TACGTAGGTGGCAAGCGTTA"

        obs = list(sequence_generator(self.seqs_s1_fp))
        self.assertEqual(len(obs), exp_len)
        self.assertEqual(obs[0][0], first_id)
        self.assertTrue(obs[0][1].startswith(first_few_nucs))
Example 13
 def test_fasta_from_biom(self):
     '''Test writing a fasta file from a biom table using fasta_from_biom().
     '''
     input_biom_file = join(self.test_data_dir, 'final.s4.biom')
     table = load_table(input_biom_file)
     output_fasta = join(self.working_dir, 'final.s4.seqs.fa')
     fasta_from_biom(table, output_fasta)
     self.assertTrue(isfile(output_fasta))
     table_seqs = table.ids(axis='observation')
     expected = [(seq, seq) for seq in table_seqs]
     written_seqs = [item for item in sequence_generator(output_fasta)]
     self.assertListEqual(written_seqs, expected)
Example 14
    def test_deblur_toy_example(self):
        seqs_f = StringIO(TEST_SEQS_1)
        obs = deblur(sequence_generator(seqs_f))
        exp = [
            Sequence(
                "E.Coli;size=1000;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]

        self.assertEqual(obs, exp)
Example 15
 def test_remove_artifacts_seqs(self):
     """ Test remove_artifacts_seqs() function for removing
         sequences that do not match a reference database
         using SortMeRNA. This test forces a new index
         construction for the reference sequences.
     """
     seqs = [("seq1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
             ("seq2", "CCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
             ("seq3", "TCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCC"),
             ("seq4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
             ("seq5", "CTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATAGGGTC"),
             ("seq6", "TTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAAT"),
             ("phix1", "TCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCC"),
             ("phix2", "CTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGC"),
             ("phix3", "GCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAA")]
     exp_seqs = ["seq1", "seq2", "seq3", "seq4", "seq5", "seq6"]
     seqs_fp = join(self.working_dir, "seqs.fasta")
     with open(seqs_fp, 'w') as seqs_f:
         for seq in seqs:
             seqs_f.write(">%s\n%s\n" % seq)
     ref = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
             "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
            ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
     ref_fp = join(self.working_dir, "ref2.fasta")
     with open(ref_fp, 'w') as ref_f:
         for seq in ref:
             ref_f.write(">%s\n%s\n" % seq)
     self.files_to_remove.append(ref_fp)
     ref_db_fp = build_index_sortmerna(ref_fp=(ref_fp, ),
                                       working_dir=self.working_dir)
     output_fp, num_seqs_left, tmp_files = remove_artifacts_seqs(
         seqs_fp=seqs_fp,
         ref_fp=(ref_fp, ),
         working_dir=self.working_dir,
         ref_db_fp=ref_db_fp,
         negate=False,
         threads=1)
     obs_seqs = []
     for label, seq in sequence_generator(output_fp):
         obs_seqs.append(label)
     self.assertEqual(obs_seqs, exp_seqs)
     # validate it creates one tmp file
     self.assertEqual(len(tmp_files), 1)
Example 16
 def test_remove_artifacts_seqs_negate(self):
     """ Test remove_artifacts_seqs() function for removing
         sequences that match a reference database
         using SortMeRNA.
     """
     seqs = [("seq1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
             ("seq2", "CCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
             ("seq3", "TCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCC"),
             ("seq4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
             ("seq5", "CTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATAGGGTC"),
             ("seq6", "TTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAAT"),
             ("phix1", "TCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCC"),
             ("phix2", "CTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGC"),
             ("phix3", "GCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAA")]
     # seq5 is only 80% similar to the reference, so with the default 0.95
     # similarity threshold it does not match and is kept along with
     # the artifacts
     exp_seqs = ["seq5", "phix1", "phix2", "phix3"]
     seqs_fp = join(self.working_dir, "seqs.fasta")
     with open(seqs_fp, 'w') as seqs_f:
         for seq in seqs:
             seqs_f.write(">%s\n%s\n" % seq)
     ref = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
             "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
            ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
     ref_fp = join(self.working_dir, "ref4.fasta")
     with open(ref_fp, 'w') as ref_f:
         for seq in ref:
             ref_f.write(">%s\n%s\n" % seq)
     self.files_to_remove.append(ref_fp)
     ref_db_fp = build_index_sortmerna([ref_fp], self.working_dir)
     output_fp = join(self.working_dir, "seqs_filtered.fasta")
     output_fp, num_seqs_left, _ = remove_artifacts_seqs(
         seqs_fp=seqs_fp,
         ref_fp=(ref_fp, ),
         working_dir=self.working_dir,
         ref_db_fp=ref_db_fp,
         negate=True,
         threads=1)
     obs_seqs = []
     for label, seq in sequence_generator(output_fp):
         obs_seqs.append(label)
     self.assertEqual(obs_seqs, exp_seqs)
Example 17
 def test_remove_chimeras_denovo_from_seqs(self):
     """ Test remove_chimeras_denovo_from_seqs() method functionality.
         Remove chimeric sequences from a FASTA file using the UCHIME
         algorithm, implemented in VSEARCH.
     """
     seqs = [("s1_104;size=2;", "GTGCCAGCCGCCGCGGTAATACCCGCAGCTCAAGTGGTG"
              "GTCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTT"
              "GTAAATCCCTGGGTAAATCGGGAAGCTTAACTTTCCGAC"
              "TTCCGAGGAGACTGTCAAACTTGGGACCGGGAG"),
             ("s1_106;size=2;", "GTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTG"
              "TGGATGTTTATTGGGCCTAAAGCGTCCGTAGCCGGCTGC"
              "GCAAGTCTGTCGGGAAATCCGCACGCCTAACGTGCGGGC"
              "GTCCGGCGGAAACTGCGTGGCTTGGGACCGGAA"),
             ("s1_1;size=9;", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAA"
              "ACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"
              "CGCTTAACGATCCGATTCTGGGGAGACTGCAAAGCTTGGGA"
              "CCGGGCGAGGTTAGAGGTACTCTCGGG"),
             ("s1_20;size=9;", "TACCTGCAGCCCAAGTGGTGGTCGATTTTATTGAGTCTAA"
              "AACGTTCGTAGCCGGTTTGATAAATCCTTGGGTAAATCGG"
              "GAAGCTTAACTTTCCGATTCCGAGGAGACTGTCAAACTTG"
              "GGACCGGGAGAGGCTAGAGGTACTTCTGGG"),
             ("s1_40;size=8;", "TACCAGCTCTCCGAGTGGTGTGGATGTTTATTGGGCCTAA"
              "AGCATCCGTAGCTGGCTAGGTTAGTCCCCTGTTAAATCCA"
              "CCGAATTAATCGTTGGATGCGGGGGATACTGCTTGGCTAG"
              "GGGACGAGAGAGGCAGACGGTATTTCCGGG"),
             ("s1_60;size=8;", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAA"
              "AGCGTCCGTAGCCGGCTGCGCAAGTCTGTCGGGAAATCCG"
              "CACGCCTAACGTGCGGGTCCGGCGGAAACTGCGTGGCTTG"
              "GGACCGGAAGACTCGAGGGGTACGTCAGGG")]
     seqs_non_chimera = [
         "s1_1;size=9;", "s1_20;size=9;", "s1_40;size=8;", "s1_60;size=8;"
     ]
     seqs_fp = join(self.working_dir, "seqs.fasta")
     with open(seqs_fp, 'w') as seqs_f:
         for seq in seqs:
             seqs_f.write(">%s\n%s\n" % seq)
     output_fp = remove_chimeras_denovo_from_seqs(
         seqs_fp=seqs_fp, working_dir=self.working_dir)
     seqs_obs = []
     for label, seq in sequence_generator(output_fp):
         label = label.split()[0]
         seqs_obs.append(label)
     self.assertEqual(seqs_non_chimera, seqs_obs)
Example 18
    def test_deblur_indel(self):
        """Test if also removes indel sequences
        """
        seqs_f = StringIO(TEST_SEQS_2)

        # add the MSA for the indel
        seqs = sequence_generator(seqs_f)
        newseqs = []
        for chead, cseq in seqs:
            tseq = cseq[:10] + '-' + cseq[10:]
            newseqs.append((chead, tseq))

        # now add a sequence with an A insertion at the expected frequency
        # (30 < 0.02 * (720 / 0.47), where 0.47 is the mod_factor),
        # so it should be removed
        cseq = newseqs[0][1]
        tseq = cseq[:10] + 'A' + cseq[11:-1] + '-'
        chead = '>indel1-read;size=30;'
        newseqs.append((chead, tseq))

        # and add a sequence with an A insertion at a higher frequency than
        # the indel upper bound allows (31 > 0.02 * (720 / 0.47)),
        # so it should not be removed
        cseq = newseqs[0][1]
        tseq = cseq[:10] + 'A' + cseq[11:-1] + '-'
        chead = '>indel2-read;size=31;'
        newseqs.append((chead, tseq))

        obs = deblur(newseqs)

        # remove the '-' (same as in launch_workflow)
        for s in obs:
            s.sequence = s.sequence.replace('-', '')

        # the expected output
        exp = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]
        # make sure we get 2 sequences as output - the original and indel2
        # (which has too many reads for the expected indel probability)
        self.assertEqual(len(obs), 2)
        # and that they are the correct sequences
        self.assertEqual(obs[0].sequence, exp[0].sequence)
        self.assertEqual(obs[1].label, '>indel2-read;size=31;')
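
The read counts in the two comments above come from the indel bound: a putative indel read is dropped only if its abundance stays below indel_prob times the parent abundance divided by mod_factor. The sketch below reproduces that arithmetic; the formula mod_factor = (1 - mean_error) ** read_length and the 0.02 indel probability are assumptions inferred from the comments, not taken from this test's code.

# Hedged sketch of the threshold arithmetic quoted in the comments above.
# Assumptions: mod_factor = (1 - mean_error) ** read_length (about 0.47 for a
# mean error of 0.005 and ~150 nt reads) and an indel probability of 0.02.
mean_error = 0.005
read_length = 150
parent_reads = 720            # size of the E.Coli-999 parent sequence
indel_prob = 0.02

mod_factor = (1 - mean_error) ** read_length           # ~0.472
threshold = indel_prob * (parent_reads / mod_factor)   # ~30.5

print(30 < threshold)   # True  -> indel1-read (size=30) is removed
print(31 < threshold)   # False -> indel2-read (size=31) is kept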
Example 19
    def test_create_otu_table(self):
        # merge the fasta files
        m1 = join(
            self.test_data_dir, 'testmerge.fasta.trim.derep.no_artifacts'
            '.msa.deblur.no_chimeras')
        m2 = join(
            self.test_data_dir, 'testmerge2.fasta.trim.derep.no_artifacts'
            '.msa.deblur.no_chimeras')
        outfile = join(self.working_dir, 'testmerge.biom')
        fasta_outfile = join(self.working_dir, 'testmerge.seq.fa')
        create_otu_table(outfile, [(m1, 'testmerge'), (m2, 'testmerge2')],
                         outputfasta_fp=fasta_outfile)

        # test the result
        table = load_table(outfile)
        tableids = table.ids(axis='observation')
        # test a sequence present in both
        self.assertEqual(
            table.get_value_by_ids(
                'TACGAGGGGGGCGAGCGTTGTTCGGAATTATTGGGCGTAAAAGGTGCGTAGGCGGTTCG'
                'GTAAGTTTCGTGTGAAATCTTCGGGCTCAACTCGAAGCCTGCACGAAATACTGCCGGGC'
                'TTGAGTGTGGGAGAGGTGAGTGGAATTTCCGGT', 'testmerge'), 5)
        self.assertEqual(
            table.get_value_by_ids(
                'TACGAGGGGGGCGAGCGTTGTTCG'
                'GAATTATTGGGCGTAAAAGGTGCGTAGGCGGTTCGGTAAGTTTCGTGTGAAATCTTCGGG'
                'CTCAACTCGAAGCCTGCACGAAATACTGCCGGGCTTGAGTGTGGGAGAGGTGAGTGGAAT'
                'TTCCGGT', 'testmerge2'), 8)
        # and an otu present only in one
        self.assertEqual(
            table.get_value_by_ids(
                'TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGAGCGTAGGCGGTTTCTT'
                'AAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGGAACTTGA'
                'GTGCAGAAGAGGAGAGTGGAATTCCATGT', 'testmerge'), 7)
        self.assertEqual(
            table.get_value_by_ids(
                'TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGAGCGTAGGCGGTTTCTTA'
                'AGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGGAACTTGAGT'
                'GCAGAAGAGGAGAGTGGAATTCCATGT', 'testmerge2'), 0)
        # test the output fasta file
        allseqs = []
        for label, seq in sequence_generator(fasta_outfile):
            self.assertTrue(seq in tableids)
            allseqs.append(seq)
        self.assertEqual(len(allseqs), len(tableids))

        # test minimal read filtering (minreads > 0)
        minreads = 7
        outfile2 = join(self.working_dir, 'testmerge2.biom')
        create_otu_table(outfile2, [(m1, 'testmerge'), (m2, 'testmerge2')],
                         minreads=minreads)

        table2 = load_table(outfile2)
        table2ids = table2.ids(axis='observation')
        tablesum = table.sum(axis='observation')
        for idx, cid in enumerate(table.ids(axis='observation')):
            if tablesum[idx] >= minreads:
                self.assertIn(cid, table2ids)
            else:
                self.assertNotIn(cid, table2ids)

        self.assertEqual(
            table2.get_value_by_ids(
                'TACGAGGGGGGCGAGCGTTGTTCG'
                'GAATTATTGGGCGTAAAAGGTGCGTAGGCGGTTCGGTAAGTTTCGTGTGAAATCTTCGGG'
                'CTCAACTCGAAGCCTGCACGAAATACTGCCGGGCTTGAGTGTGGGAGAGGTGAGTGGAAT'
                'TTCCGGT', 'testmerge2'), 8)
        # and an otu present only in one
        self.assertEqual(
            table2.get_value_by_ids(
                'TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGAGCGTAGGCGGTTTCTT'
                'AAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGGAACTTGA'
                'GTGCAGAAGAGGAGAGTGGAATTCCATGT', 'testmerge'), 7)
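
The minreads check in the loop above can be stated compactly: an observation survives the filtered table only if its total count across samples is at least minreads. Below is a hedged restatement using the same biom Table calls the test already relies on (ids() and sum() on the observation axis); the helper name is hypothetical.

# Hedged sketch: which observation IDs should survive a minreads filter,
# expressed with the biom Table calls used in the test above.
def expected_surviving_ids(table, minreads):
    ids = table.ids(axis='observation')
    totals = table.sum(axis='observation')
    return {cid for cid, total in zip(ids, totals) if total >= minreads}

# e.g. expected_surviving_ids(load_table(outfile), minreads) should equal
# set(load_table(outfile2).ids(axis='observation')) for the tables built above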