Exemple #1
0
    def test_launch_workflow_incorrect_trim(self):
        """Check that an over-long trim length makes the workflow warn.

        The workflow is launched with a trim length of 151, which is
        longer than the input sequences, and is expected to emit a
        ``UserWarning``.
        """
        # the 70% rep. set database has to be indexed before the run
        reference = (join(self.test_data_dir, '70_otus.fasta'), )
        indexed_reference = build_index_sortmerna(
            ref_fp=reference, working_dir=self.working_dir)

        # arguments are assembled outside the ``assertWarns`` block so that
        # only the workflow call itself is checked for the warning
        workflow_args = dict(
            seqs_fp=self.seqs_s1_fp,
            working_dir=self.working_dir,
            mean_error=0.005,
            error_dist=get_default_error_profile(),
            indel_prob=0.01,
            indel_max=3,
            # deliberately longer than the sequences
            trim_length=151,
            left_trim_length=0,
            min_size=2,
            ref_fp=reference,
            ref_db_fp=indexed_reference,
            threads_per_sample=1)

        with self.assertWarns(UserWarning):
            launch_workflow(**workflow_args)
Exemple #2
0
    def test_launch_workflow_skip_trim(self):
        """Run the workflow with a trim length of -1 and check its output.

        The fasta file returned by the workflow is read back and compared
        with the expected sequence stored in ``self.no_trim_res``.
        """
        # the 70% rep. set database has to be indexed before the run
        reference = (join(self.test_data_dir, '70_otus.fasta'), )
        indexed_reference = build_index_sortmerna(
            ref_fp=reference, working_dir=self.working_dir)

        workflow_args = dict(
            seqs_fp=self.seqs_s1_fp,
            working_dir=self.working_dir,
            mean_error=0.005,
            error_dist=get_default_error_profile(),
            indel_prob=0.01,
            indel_max=3,
            # -1 is the value under test (the "skip trim" case)
            trim_length=-1,
            left_trim_length=0,
            min_size=2,
            ref_fp=reference,
            ref_db_fp=indexed_reference,
            threads_per_sample=1)

        # the workflow returns the path of the file holding its result
        result_fp = launch_workflow(**workflow_args)

        expected = Sequence.read(self.no_trim_res, format='fasta')
        observed = Sequence.read(result_fp, format='fasta')
        self.assertEqual(expected, observed)
Exemple #3
0
    def run_workflow_try(self, simfilename, origfilename,
                         ref_fp, ref_db_fp, threads=1,
                         trim_length=100):
        """Launch the complete workflow on simulated reads and compare the
        resulting observations with the ground truth sequences.

        Parameters
        ----------
        simfilename : str
            name of the simulated reads fasta file
        origfilename : str
            name of the fasta file with the ground truth sequences
        ref_fp : str
            reference database file (handed to the workflow wrapped in a
            one-element tuple)
        ref_db_fp : list of str
            list of the indexed database files or None to create them
        threads : int
            number of threads to use (default=1)
        trim_length : int
            length of sequences to trim to (default=100)
        """
        workdir = self.working_dir

        nochimera = launch_workflow(
            seqs_fp=simfilename,
            working_dir=workdir,
            mean_error=0.005,
            error_dist=get_default_error_profile(),
            indel_prob=0.01,
            indel_max=3,
            trim_length=trim_length,
            min_size=2,
            ref_fp=(ref_fp,),
            ref_db_fp=ref_db_fp,
            negate=False,
            threads_per_sample=threads)

        # ground truth: every original sequence, trimmed and upper-cased
        expected = [record[1][:trim_length].upper()
                    for record in sequence_generator(origfilename)]

        # turn the workflow result into a biom table and read it back
        table_fp = join(workdir, 'final.biom')
        create_otu_table(table_fp, [(nochimera, simfilename)])
        observed = load_table(table_fp).ids(axis='observation')

        # all ground truth sequences must be present, and nothing else;
        # both sides are sorted so the order does not matter
        self.assertEqual(sorted(observed), sorted(expected))
Exemple #4
0
    def run_workflow_try(self, simfilename, origfilename, ref_fp, ref_db_fp):
        """Test launching the complete workflow using simulated sequences
        and compare to original ground truth.

        Parameters
        ----------
        simfilename : str
            name of the simulated reads fasta file
        origfilename : str
            name of the fasta file with the ground truth sequences
        ref_fp : str
            reference database file; handed to the workflow wrapped in a
            one-element tuple
        ref_db_fp
            indexed reference database argument, forwarded to the workflow
            unchanged
        """
        seqs_fp = simfilename
        output_fp = self.working_dir
        read_error = 0.05
        mean_error = 0.005
        error_dist = None
        indel_prob = 0.01
        indel_max = 3
        trim_length = 100
        min_size = 2
        negate = False
        threads = 1
        delim = '_'
        biom_fp = launch_workflow(seqs_fp, output_fp, read_error, mean_error,
                                  error_dist, indel_prob, indel_max,
                                  trim_length, min_size, (ref_fp, ), ref_db_fp,
                                  negate, threads, delim)

        # get the trimmed ground truth sequences
        # (default text mode: the 'U' flag was removed in Python 3.11)
        with open(origfilename) as f:
            orig_seqs = [item[1][:trim_length].upper()
                         for item in parse_fasta(f)]

        table_obs = load_table(biom_fp)
        outseqs = table_obs.ids(axis='observation')

        # test we see all ground truth sequences and no other; comparing the
        # sorted lists is order-insensitive and avoids assertItemsEqual,
        # which only exists on Python 2
        self.assertEqual(sorted(outseqs), sorted(orig_seqs))