Exemple #1
0
    def test_iqtree_model_choice(self):
        # Tip-to-tip distances should NOT be identical when the tree is built
        # under different substitution models. The default model is MFP
        # (auto select substitution model), so both models compared here
        # (GTR+G and HKY) are requested explicitly.
        # The distances of each tree are collected into a set and the two
        # sets are compared. For this comparison to be meaningful both runs
        # use the same seed value.
        alignment_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r')

        def tip_distances_for(model):
            # build a tree under `model` and gather its tip-to-tip distances
            with redirected_stdio(stderr=os.devnull):
                result = iqtree(alignment,
                                seed=1723,
                                substitution_model=model)
                tree = skbio.TreeNode.read(str(result),
                                           convert_underscores=False)
                return set(tree.tip_tip_distances().to_series())

        gtrg_distances = tip_distances_for('GTR+G')
        hky_distances = tip_distances_for('HKY')

        # the two models must not yield the same set of distances
        self.assertNotEqual(gtrg_distances, hky_distances)
Exemple #2
0
    def test_iqtree_model_choice(self):
        # Different substitution models should NOT give identical tip-to-tip
        # distances. The default is MFP (auto select substitution model), so
        # the GTR+G and HKY models are both requested explicitly here.
        # Each tree's distances are reduced to a set before comparing, and
        # every run is given the same seed value.
        aln_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        aln = AlignedDNAFASTAFormat(aln_fp, mode='r')

        # model name -> set of tip-to-tip distances of the resulting tree
        distances = {}
        for model in ('GTR+G', 'HKY'):
            with redirected_stdio(stderr=os.devnull):
                tree_fmt = iqtree(aln, seed=1723, substitution_model=model)
                tree = skbio.TreeNode.read(str(tree_fmt),
                                           convert_underscores=False)
                distances[model] = set(
                    tree.tip_tip_distances().to_series())

        # test pairs are not equivalent
        self.assertNotEqual(distances['GTR+G'], distances['HKY'])
Exemple #3
0
    def test_join_pairs_some_samples_w_no_joined_seqs(self):
        # minmergelen is set very high here, so that (per the expected counts
        # below) only one of the three samples retains any joined sequences.
        with redirected_stdio(stderr=os.devnull):
            obs = join_pairs(self.input_seqs, minmergelen=279)

        # manifest is as expected
        self._test_manifest(obs)

        # expected number of fastq files are created
        output_fastqs = list(obs.sequences.iter_views(FastqGzFormat))
        self.assertEqual(len(output_fastqs), 3)

        # The following values were determined by running vsearch directly.
        exp_sequence_counts = {
            'BAQ2687.1_0_L001_R1_001.fastq.gz': 0,
            'BAQ3473.2_1_L001_R1_001.fastq.gz': 2,
            'BAQ4697.2_2_L001_R1_001.fastq.gz': 0,
        }

        for fastq_name, fastq_path in output_fastqs:
            with redirected_stdio(stderr=os.devnull):
                # skbio.io.read hands back a lazy generator, so it has to be
                # consumed inside the redirection for the parsing itself
                # (not just the generator creation) to be covered by it.
                seqs = list(skbio.io.read(str(fastq_path),
                                          format='fastq',
                                          compression='gzip',
                                          constructor=skbio.DNA))

            # expected number of sequences are joined
            self.assertEqual(len(seqs),
                             exp_sequence_counts[str(fastq_name)])
    def test_q_score(self):
        # Runs the 'q_score' method twice on the same artifact with different
        # quality settings and checks, for each run, both the reads that were
        # written out and the per-sample filtering stats.
        ar = Artifact.load(self.get_data_path('simple.qza'))
        with redirected_stdio(stdout=os.devnull):
            obs_drop_ambig_ar, stats_ar = self.plugin.methods['q_score'](
                ar, quality_window=2, min_quality=20, min_length_fraction=0.25)
        obs_drop_ambig = obs_drop_ambig_ar.view(
            SingleLanePerSampleSingleEndFastqDirFmt)
        stats = stats_ar.view(pd.DataFrame)

        exp_drop_ambig = ["@foo_1",
                          "ATGCATGC",
                          "+",
                          "DDDDBBDD"]
        columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
                   'reads-truncated',
                   'reads-too-short-after-truncation',
                   'reads-exceeding-maximum-ambiguous-bases']
        exp_drop_ambig_stats = pd.DataFrame([('foo', 2., 1., 0., 0., 1.),
                                             ('bar', 1., 0., 0., 0., 1.)],
                                            columns=columns)
        exp_drop_ambig_stats = exp_drop_ambig_stats.set_index('sample-id')
        obs = []
        iterator = obs_drop_ambig.sequences.iter_views(FastqGzFormat)
        for sample_id, fp in iterator:
            # close each gzip handle deterministically instead of leaking it
            with gzip.open(str(fp), 'rt') as fh:
                obs.extend([x.strip() for x in fh])
        self.assertEqual(obs, exp_drop_ambig)
        # align the expected rows to the observed row order before comparing
        pdt.assert_frame_equal(stats, exp_drop_ambig_stats.loc[stats.index])

        with redirected_stdio(stdout=os.devnull):
            obs_trunc_ar, stats_ar = self.plugin.methods['q_score'](
                ar, quality_window=1, min_quality=33, min_length_fraction=0.25)
        obs_trunc = obs_trunc_ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
        stats = stats_ar.view(pd.DataFrame)

        exp_trunc = ["@foo_1",
                     "ATGCATGC",
                     "+",
                     "DDDDBBDD",
                     "@bar_1",
                     "ATA",
                     "+",
                     "DDD"]
        exp_trunc_stats = pd.DataFrame([('foo', 2., 1., 0., 0., 1.),
                                        ('bar', 1., 1., 1., 0., 0.)],
                                       columns=columns)
        exp_trunc_stats = exp_trunc_stats.set_index('sample-id')

        obs = []
        for sample_id, fp in obs_trunc.sequences.iter_views(FastqGzFormat):
            with gzip.open(str(fp), 'rt') as fh:
                obs.extend([x.strip() for x in fh])
        # lines are compared sorted, so the order in which the per-sample
        # files are visited does not matter here
        self.assertEqual(sorted(obs), sorted(exp_trunc))
        pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index])
Exemple #5
0
    def test_iqtree_ultrafast_bootstrap_singlebranch_methods(self):
        # Branch support values are compared against a tree that was built
        # manually with the following command:
        #  iqtree -s aligned-dna-sequences-3.fasta -alrt 1500 -lbp 1500
        #         -abayes -bb 1500 -m 'HKY' -seed 1723
        # The check is simply that the support labels match those of the
        # manual run, and that each label carries the expected number of
        # values.
        aln_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        aln = AlignedDNAFASTAFormat(aln_fp, mode='r')

        bootstrap_options = dict(seed=1723,
                                 substitution_model='HKY',
                                 alrt=1500,
                                 lbp=1500,
                                 abayes=True,
                                 bootstrap_replicates=1500)
        with redirected_stdio(stderr=os.devnull):
            obs = iqtree_ultrafast_bootstrap(aln, **bootstrap_options)

        observed = skbio.TreeNode.read(str(obs), convert_underscores=False)
        expected = skbio.TreeNode.read(self.get_data_path('test5.tre'))

        # internal node names hold the '/'-separated support values
        obs_supp = [internal.name for internal in observed.non_tips()]
        exp_supp = [internal.name for internal in expected.non_tips()]

        self.assertEqual(set(obs_supp), set(exp_supp))
        # each label should be made of 4 values
        self.assertEqual(len(obs_supp[0].split('/')), 4)
        self.assertEqual(len(exp_supp[0].split('/')), 4)
Exemple #6
0
    def test_skip_denovo(self):
        # feature1 and feature3 cluster into r1 and feature2 and feature4
        # cluster into r2 during closed-ref clustering; no unclustered
        # features remain, so de-novo clustering is skipped.
        exp_counts = np.array([[104, 106, 109],
                               [107, 107, 108]])
        exp_table = biom.Table(exp_counts,
                               ['r1', 'r2'],
                               ['sample1', 'sample2', 'sample3'])
        with redirected_stdio(stderr=os.devnull):
            obs_table, rep_seqs, new_ref_seqs = self.open_reference(
                sequences=self.input_sequences,
                table=self.input_table,
                reference_sequences=self.ref_sequences,
                perc_identity=0.01)

        obs_table = obs_table.view(biom.Table)
        self.assertEqual(set(obs_table.ids(axis='observation')),
                         set(exp_table.ids(axis='observation')))

        # order of identifiers is important for biom.Table equality
        exp_order = exp_table.ids(axis='observation')
        obs_table = obs_table.sort_order(exp_order, axis='observation')
        self.assertEqual(obs_table, exp_table)

        # positions 0 and 4 of the input list are feature1 and feature5
        exp_rep_seqs = [self.input_sequences_list[i] for i in (0, 4)]
        _relabel_seqs(exp_rep_seqs, ['r1', 'r2'])
        self.assertEqual(_read_seqs(rep_seqs), exp_rep_seqs)

        # The returned "new" ref seqs should be the same as the original ref
        # seqs, because we skipped de-novo clustering.
        obs_ref_seqs = _read_seqs(new_ref_seqs)
        self.assertEqual(obs_ref_seqs, _read_seqs(self.ref_sequences))
Exemple #7
0
    def test_min_length(self):
        sample_ids = pd.Index(['sample_a', 'sample_b', 'sample_c'],
                              name='id')
        # The third barcode is meant to completely remove the only GGGG
        # coded sequence
        barcodes = pd.Series(['AAAA', 'CCCC', 'GGGGACGTACGT'],
                             name='Barcode',
                             index=sample_ids)
        metadata = CategoricalMetadataColumn(barcodes)

        sample_a_reads = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id3\nACGTACGT\n+\nzzzzzzzz\n')
        sample_b_reads = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id5\nACGTACGT\n+\nzzzzzzzz\n')
        # sample c is empty because the barcode matched the entire read,
        # which removed everything.
        sample_c_reads = ''
        exp = [sample_a_reads, sample_b_reads, sample_c_reads]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
                self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
Exemple #8
0
    def test_typical(self):
        sample_ids = pd.Index(['sample_a', 'sample_b'], name='id')
        barcodes = pd.Series(['AAAA', 'CCCC'],
                             name='Barcode',
                             index=sample_ids)
        metadata = CategoricalMetadataColumn(barcodes)

        # expected reads per sample, forward then reverse
        sample_a_fwd = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                        '@id3\nACGTACGT\n+\nzzzzzzzz\n')
        sample_a_rev = ('@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
                        '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n')
        sample_b_fwd = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                        '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                        '@id5\nACGTACGT\n+\nzzzzzzzz\n')
        sample_b_rev = ('@id2\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
                        '@id4\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
                        '@id5\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n')
        exp = [sample_a_fwd, sample_a_rev, sample_b_fwd, sample_b_rev]

        # the one read pair (id6) expected to end up untrimmed
        exp_untrimmed = [
            '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
            '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = self.demux_paired_fn(
                self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
Exemple #9
0
    def test_batch_size(self):
        sample_ids = pd.Index(['sample_a', 'sample_b'], name='id')
        barcodes = pd.Series(['AAAA', 'CCCC'],
                             name='Barcode',
                             index=sample_ids)
        metadata = CategoricalMetadataColumn(barcodes)

        sample_a_reads = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id3\nACGTACGT\n+\nzzzzzzzz\n')
        sample_b_reads = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id5\nACGTACGT\n+\nzzzzzzzz\n')
        exp = [sample_a_reads, sample_b_reads]
        exp_untrimmed = '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n'

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
                self.muxed_sequences, metadata, batch_size=1)

        # This test should yield the same results as test_typical; the fact
        # that we are batching shouldn't impact the final results
        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
Exemple #10
0
    def test_batch_size_odd_number_of_samples(self):
        # three samples with batch_size=2: the sample count is not a
        # multiple of the batch size
        sample_ids = pd.Index(['sample_a', 'sample_b', 'sample_c'],
                              name='id')
        barcodes = pd.Series(['AAAA', 'CCCC', 'GGGG'],
                             name='Barcode',
                             index=sample_ids)
        metadata = CategoricalMetadataColumn(barcodes)

        sample_a_reads = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id3\nACGTACGT\n+\nzzzzzzzz\n')
        sample_b_reads = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id5\nACGTACGT\n+\nzzzzzzzz\n')
        sample_c_reads = '@id6\nACGTACGT\n+\nzzzzzzzz\n'
        exp = [sample_a_reads, sample_b_reads, sample_c_reads]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
                self.muxed_sequences, metadata, batch_size=2)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
Exemple #11
0
    def test_extra_barcode_in_metadata(self):
        sample_names = ['sample_a', 'sample_b', 'sample_c', 'sample_d']
        barcode_seqs = ['AAAA', 'CCCC', 'GGGG', 'TTTT']
        metadata = CategoricalMetadataColumn(
            pd.Series(barcode_seqs,
                      name='Barcode',
                      index=pd.Index(sample_names, name='id')))

        sample_a_reads = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id3\nACGTACGT\n+\nzzzzzzzz\n')
        sample_b_reads = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id5\nACGTACGT\n+\nzzzzzzzz\n')
        sample_c_reads = '@id6\nACGTACGT\n+\nzzzzzzzz\n'
        # sample d is empty bc no reads matched the barcode TTTT
        sample_d_reads = ''
        exp = [sample_a_reads, sample_b_reads, sample_c_reads,
               sample_d_reads]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
                self.muxed_sequences, metadata)

        # expected mapping is built independently of the metadata column
        exp_samples_and_barcodes = pd.Series(barcode_seqs,
                                             index=sample_names)
        self.assert_demux_results(exp_samples_and_barcodes, exp,
                                  obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
Exemple #12
0
    def test_variable_length_barcodes(self):
        # barcodes of three different lengths (5, 6 and 4 bases)
        sample_ids = pd.Index(['sample_a', 'sample_b', 'sample_c'],
                              name='id')
        barcodes = pd.Series(['AAAAA', 'CCCCCC', 'GGGG'],
                             name='Barcode',
                             index=sample_ids)
        metadata = CategoricalMetadataColumn(barcodes)

        muxed_sequences = Artifact.import_data(
            'MultiplexedSingleEndBarcodeInSequence',
            self.get_data_path('variable_length.fastq.gz'))

        sample_a_reads = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id3\nACGTACGT\n+\nzzzzzzzz\n')
        sample_b_reads = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                          '@id5\nACGTACGT\n+\nzzzzzzzz\n')
        sample_c_reads = '@id6\nACGTACGT\n+\nzzzzzzzz\n'
        exp = [sample_a_reads, sample_b_reads, sample_c_reads]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
                muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
    def test_run_rapid_bs_not_verbose(self):
        # Runs raxmlHPC through run_command with verbose=False and checks the
        # tip ids of the tree it writes.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
        aligned_fp = str(alignment)

        with tempfile.TemporaryDirectory() as workdir:
            raxml_cmd = [
                'raxmlHPC',
                '-m', 'GTRGAMMA',
                '-p', '1723',
                '-s', aligned_fp,
                '-w', workdir,
                '-n', 'q2',
                '-f', 'a',
                '-x', '9834',
                '-N', '10',
            ]

            with redirected_stdio(stderr=os.devnull):
                run_command(raxml_cmd, verbose=False)

            # the tree has to be read before the temp directory is removed
            tree_fp = os.path.join(workdir, 'RAxML_bipartitions.q2')
            obs_tree = skbio.TreeNode.read(str(tree_fp),
                                           convert_underscores=False)

        # test that the resulting tree has the right tip ids
        obs_tip_names = {tip.name for tip in obs_tree.tips()}
        exp_tip_names = {'GCA001510755', 'GCA001045515',
                         'GCA000454205', 'GCA000473545',
                         'GCA000196255', 'GCA000686145',
                         'GCA001950115', 'GCA001971985',
                         'GCA900007555'}
        self.assertEqual(obs_tip_names, exp_tip_names)
Exemple #14
0
    def test_run_ultrafast_bs_not_verbose(self):
        # Runs iqtree (ultrafast bootstrap) through run_command with
        # verbose=False and checks the tip ids of the resulting tree file.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
        aligned_fp = str(alignment)

        with tempfile.TemporaryDirectory() as workdir:
            run_prefix = os.path.join(workdir, 'q2iqtreeufboot')
            iqtree_cmd = [
                'iqtree',
                '-m', 'HKY',
                '-seed', '1723',
                '-bb', '1000',
                '-s', aligned_fp,
                '-pre', run_prefix,
                '-nt', '2',
            ]

            with redirected_stdio(stderr=os.devnull):
                run_command(iqtree_cmd, verbose=False)

            # the tree has to be read before the temp directory is removed
            tree_fp = run_prefix + '.treefile'
            obs_tree = skbio.TreeNode.read(str(tree_fp),
                                           convert_underscores=False)

        # test that the resulting tree has the right tip ids
        obs_tip_names = {tip.name for tip in obs_tree.tips()}
        exp_tip_names = {'GCA001510755', 'GCA001045515',
                         'GCA000454205', 'GCA000473545',
                         'GCA000196255', 'GCA002142615',
                         'GCA000686145', 'GCA001950115',
                         'GCA001971985', 'GCA900007555'}
        self.assertEqual(obs_tip_names, exp_tip_names)
Exemple #15
0
    def test_iqtree_ultrafast_bootstrap_singlebranch_methods(self):
        # The branch supports of the observed tree are compared to those of
        # a tree constructed manually with the following command:
        #  iqtree -s aligned-dna-sequences-3.fasta -alrt 1500 -lbp 1500
        #         -abayes -bb 1500 -m 'HKY' -seed 1723
        # Only two things are checked: that the support labels are identical
        # to the manual run, and how many values each label holds.
        input_sequences = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')

        def support_labels(tree):
            # names of the internal (non-tip) nodes of `tree`
            return [node.name for node in tree.non_tips()]

        with redirected_stdio(stderr=os.devnull):
            obs = iqtree_ultrafast_bootstrap(
                input_sequences,
                seed=1723,
                substitution_model='HKY',
                alrt=1500,
                lbp=1500,
                abayes=True,
                bootstrap_replicates=1500)
        obs_supp = support_labels(
            skbio.TreeNode.read(str(obs), convert_underscores=False))
        exp_supp = support_labels(
            skbio.TreeNode.read(self.get_data_path('test5.tre')))

        self.assertEqual(set(obs_supp), set(exp_supp))
        # the first label of each tree should split into 4 values
        for labels in (obs_supp, exp_supp):
            self.assertEqual(len(labels[0].split('/')), 4)
    def test_raxml_rapid_bootstrap_with_seed(self):
        # Test tip-to-tip dists are identical to manually run RAxML output.
        # This test compares the set of (rounded) tip-to-tip distances to
        # those of a tree output from a manual run of the default command:
        #     raxmlHPC -f a -m GTRGAMMA -p 1723 -x 3871 -N 10
        #         -s aligned-dna-sequences-3.fasta -n q2
        # NOTE: I cleanly rounded the tip-to-tip dists (i.e. `%.4f`) as RAxML
        # may return slightly different rounding errors on different
        # systems (and at times, between conda environments).
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

        # test that branchlengths are identical
        with redirected_stdio(stderr=os.devnull):
            obs = raxml_rapid_bootstrap(input_sequences, seed=1723,
                                        rapid_bootstrap_seed=3871,
                                        bootstrap_replicates=10)
        obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
        # sometimes we lose the last set of numbers on long floats
        obs_tl = list(obs_tree.tip_tip_distances().to_series())
        obs_series = {'%.4f' % e for e in obs_tl}

        exp_tree = skbio.TreeNode.read(self.get_data_path('test2.tre'),
                                       convert_underscores=True)
        exp_tl = list(exp_tree.tip_tip_distances().to_series())
        exp_series = {'%.4f' % e for e in exp_tl}
        self.assertEqual(obs_series, exp_series)

        # test that bootstrap supports are identical
        # NOTE: list.sort() sorts in place and returns None, so assigning its
        # result would compare None to None and always pass. sorted() returns
        # the ordered list; key=str keeps the ordering well-defined even if
        # an internal node carries no name (None).
        obs_bs = sorted((node.name for node in obs_tree.non_tips()),
                        key=str)
        exp_bs = sorted((node.name for node in exp_tree.non_tips()),
                        key=str)
        self.assertEqual(obs_bs, exp_bs)
Exemple #17
0
    def test_1_percent_clustering(self):
        # feature1 and feature3 cluster together; feature2 and feature4
        # cluster together;
        exp_counts = np.array([[104, 106, 109],
                               [8, 9, 11]])
        exp_table = biom.Table(exp_counts,
                               ['r1', 'r2'],
                               ['sample1', 'sample2', 'sample3'])

        with redirected_stdio(stderr=os.devnull):
            obs_table, matched_seqs, unmatched_seqs = \
                cluster_features_closed_reference(
                    sequences=self.input_sequences,
                    table=self.input_table,
                    reference_sequences=self.ref_sequences_1,
                    perc_identity=0.01)

        # order of identifiers is important for biom.Table equality
        exp_order = exp_table.ids(axis='observation')
        obs_table = obs_table.sort_order(exp_order, axis='observation')
        self.assertEqual(obs_table, exp_table)

        # The rep seqs selected are feature1 and feature4, for r1 and r2,
        # respectively. feature1 and feature3 are in the same cluster, but
        # feature1 is selected as the rep seq because it has a higher count.
        # Similarly, feature4 is selected as the cluster rep seq because it
        # has a higher count.
        exp_matched_seqs = [self.input_sequences_list[i] for i in (0, 3)]
        _relabel_seqs(exp_matched_seqs, ['r1', 'r2'])
        self.assertEqual(_read_seqs(matched_seqs), exp_matched_seqs)

        # all sequences matched, so unmatched seqs is empty
        unmatched_size = os.path.getsize(str(unmatched_seqs))
        self.assertEqual(unmatched_size, 0)
Exemple #18
0
    def test_none_matched(self):
        # a single barcode ('TTTT') is supplied; demultiplexing with it is
        # expected to raise a ValueError mentioning 'demultiplexed'
        barcodes = pd.Series(['TTTT'], index=['sample_d'], name='Barcode')
        metadata = MetadataCategory(barcodes)

        with redirected_stdio(stderr=os.devnull), \
                self.assertRaisesRegex(ValueError, 'demultiplexed'):
            self.demux_single_fn(self.muxed_sequences, metadata)
Exemple #19
0
    def test_duplicate_input_ids(self):
        # running mafft on the duplicate-ids fixture must raise a ValueError
        # whose message matches 'the unaligned.*id1'
        input_sequences = DNAFASTAFormat(
            self.get_data_path('unaligned-duplicate-ids.fasta'), mode='r')

        with self.assertRaisesRegex(ValueError, 'the unaligned.*id1'), \
                redirected_stdio(stderr=os.devnull):
            mafft(input_sequences)
Exemple #20
0
    def test_97_percent_clustering(self):
        # feature1 and feature3 cluster together; feature2 doesn't cluster at
        # all; feature 4 clusters alone.
        exp_counts = np.array([[104, 106, 109],
                               [7, 8, 9]])
        exp_table = biom.Table(exp_counts,
                               ['r1', 'r2'],
                               ['sample1', 'sample2', 'sample3'])

        with redirected_stdio(stderr=os.devnull):
            obs_table, matched_seqs, unmatched_seqs = \
                cluster_features_closed_reference(
                    sequences=self.input_sequences,
                    table=self.input_table,
                    reference_sequences=self.ref_sequences_1,
                    perc_identity=0.97)

        # order of identifiers is important for biom.Table equality
        exp_order = exp_table.ids(axis='observation')
        obs_table = obs_table.sort_order(exp_order, axis='observation')
        self.assertEqual(obs_table, exp_table)

        # The rep seqs selected are feature1 and feature4, for r1 and r2,
        # respectively. feature1 and feature3 are in the same cluster, but
        # feature1 is selected as the rep seq because it has a higher count.
        exp_matched_seqs = [self.input_sequences_list[i] for i in (0, 3)]
        _relabel_seqs(exp_matched_seqs, ['r1', 'r2'])
        self.assertEqual(_read_seqs(matched_seqs), exp_matched_seqs)

        # feature2 is the only sequence left unmatched
        exp_unmatched_seqs = [self.input_sequences_list[1]]
        self.assertEqual(_read_seqs(unmatched_seqs), exp_unmatched_seqs)
Exemple #21
0
    def test_join_pairs(self):
        # join with default parameters, silencing stderr during the call
        with redirected_stdio(stderr=os.devnull):
            obs = join_pairs(self.input_seqs)

        # manifest is as expected
        self._test_manifest(obs)

        # expected number of fastq files are created
        output_fastqs = list(obs.sequences.iter_views(FastqGzFormat))
        self.assertEqual(len(output_fastqs), 3)

        # The following values were determined by running vsearch directly
        # with default parameters. It is possible that different versions of
        # vsearch will result in differences in these numbers, and that
        # the corresponding tests may therefore be too specific. We'll have
        # to adjust the tests if that's the case.
        default_exp_sequence_counts = {
            'BAQ2687.1_0_L001_R1_001.fastq.gz': 806,
            'BAQ3473.2_1_L001_R1_001.fastq.gz': 753,
            'BAQ4697.2_2_L001_R1_001.fastq.gz': 711,
        }
        for fastq_name, fastq_path in output_fastqs:
            records = list(skbio.io.read(str(fastq_path),
                                         format='fastq',
                                         compression='gzip',
                                         constructor=skbio.DNA))
            seq_lengths = np.asarray([len(record) for record in records])
            self._test_seq_lengths(seq_lengths)

            # expected number of sequences are joined
            exp_count = default_exp_sequence_counts[str(fastq_name)]
            self.assertEqual(len(seq_lengths), exp_count)
Exemple #22
0
    def test_100_percent_clustering_strand(self):
        # feature2 and feature3 don't cluster
        exp_counts = np.array([[100, 101, 103],
                               [7, 8, 9]])
        exp_table = biom.Table(exp_counts,
                               ['r1', 'r2'],
                               ['sample1', 'sample2', 'sample3'])

        with redirected_stdio(stderr=os.devnull):
            obs_table, matched_seqs, unmatched_seqs = \
                cluster_features_closed_reference(
                    sequences=self.input_sequences,
                    table=self.input_table,
                    reference_sequences=self.ref_sequences_2,
                    perc_identity=1.0,
                    strand='both')

        # order of identifiers is important for biom.Table equality
        exp_order = exp_table.ids(axis='observation')
        obs_table = obs_table.sort_order(exp_order, axis='observation')
        self.assertEqual(obs_table, exp_table)

        # The rep seqs selected are feature1 and feature4, for r1 and r2,
        # respectively. Since no other features are in the cluster, there is
        # no count-based selection of the rep seq.
        exp_matched_seqs = [self.input_sequences_list[i] for i in (0, 3)]
        _relabel_seqs(exp_matched_seqs, ['r1', 'r2'])
        self.assertEqual(_read_seqs(matched_seqs), exp_matched_seqs)

        # unmatched sequences are expected as feature3 followed by feature2
        exp_unmatched_seqs = [self.input_sequences_list[i] for i in (2, 1)]
        self.assertEqual(_read_seqs(unmatched_seqs), exp_unmatched_seqs)
Exemple #23
0
    def test_uchime_denovo(self):
        # Checks the three outputs of uchime_denovo: the chimeric sequences,
        # the non-chimeric sequences, and the stats file.
        with redirected_stdio(stderr=os.devnull):
            chime, nonchime, stats = uchime_denovo(
                sequences=self.input_sequences, table=self.input_table)

        obs_chime = _read_seqs(chime)
        exp_chime = [self.input_sequences_list[3]]
        self.assertEqual(obs_chime, exp_chime)

        # sequences are reverse-sorted by abundance in output
        obs_nonchime = _read_seqs(nonchime)
        exp_nonchime = [
            self.input_sequences_list[0], self.input_sequences_list[1],
            self.input_sequences_list[2]
        ]
        self.assertEqual(obs_nonchime, exp_nonchime)

        with stats.open() as stats_fh:
            stats_text = stats_fh.read()
        # assertIn reports both the missing id and the searched text when it
        # fails, unlike assertTrue(x in y) which only says "False is not true"
        for feature_id in ('feature1', 'feature2', 'feature3', 'feature4'):
            self.assertIn(feature_id, stats_text)
        # one non-empty stats line is expected per input feature
        stats_lines = [line for line in stats_text.split('\n') if line]
        self.assertEqual(len(stats_lines), 4)
Exemple #24
0
    def test_mixed_orientation_success(self):
        sample_ids = pd.Index(['sample_a', 'sample_b'], name='id')
        forward_barcodes = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='ForwardBarcode',
                      index=sample_ids))

        read_fps = (
            self.get_data_path('mixed-orientation/forward.fastq.gz'),
            self.get_data_path('mixed-orientation/reverse.fastq.gz'),
        )

        # stage both read files in one directory and import from there
        with tempfile.TemporaryDirectory() as staging_dir:
            for read_fp in read_fps:
                shutil.copy(read_fp, staging_dir)
            mixed_orientation_sequences = Artifact.import_data(
                'MultiplexedPairedEndBarcodeInSequence', staging_dir)

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = self.demux_paired_fn(
                mixed_orientation_sequences,
                forward_barcodes=forward_barcodes,
                mixed_orientation=True)

        self.assert_demux_results(forward_barcodes.to_series(),
                                  obs_demuxed_art)
        # Everything should match
        self.assert_untrimmed_results([b'', b''], obs_untrimmed_art)
Exemple #25
0
    def test_uchime_denovo_no_chimeras(self):
        # With this replacement abundance table the chimera output is
        # expected to be empty and all four sequences are reported as
        # non-chimeric.
        counts = np.array([[3, 4, 2], [1, 0, 0], [4, 5, 6], [2, 2, 2]])
        feature_ids = ['feature1', 'feature2', 'feature3', 'feature4']
        sample_ids = ['sample1', 'sample2', 'sample3']
        input_table = biom.Table(counts, feature_ids, sample_ids)

        with redirected_stdio(stderr=os.devnull):
            chimeras, nonchimeras, stats = uchime_denovo(
                sequences=self.input_sequences, table=input_table)

        observed_chimeras = _read_seqs(chimeras)
        self.assertEqual(observed_chimeras, [])

        # sequences are reverse-sorted by abundance in output
        observed_nonchimeras = _read_seqs(nonchimeras)
        expected_nonchimeras = [self.input_sequences_list[i]
                                for i in (2, 0, 3, 1)]
        self.assertEqual(observed_nonchimeras, expected_nonchimeras)

        with stats.open() as stats_fh:
            stats_text = stats_fh.read()
        for feature_id in ('feature1', 'feature2', 'feature3', 'feature4'):
            self.assertTrue(feature_id in stats_text)
        # Blank lines (e.g. a trailing newline) are not counted as rows.
        non_empty_rows = [row for row in stats_text.split('\n') if row]
        self.assertEqual(len(non_empty_rows), 4)
Exemple #26
0
    def test_run_rapid_bs_not_verbose(self):
        # Run a raxmlHPC rapid-bootstrap command through run_command with
        # verbose output disabled and check the tips of the resulting tree.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
        aligned_fp = str(alignment)

        with tempfile.TemporaryDirectory() as work_dir:
            options = [('-m', 'GTRGAMMA'), ('-p', '1723'),
                       ('-s', aligned_fp), ('-w', work_dir),
                       ('-n', 'q2'), ('-f', 'a'),
                       ('-x', '9834'), ('-N', '10')]
            cmd = ['raxmlHPC']
            for flag, value in options:
                cmd += [flag, value]

            with redirected_stdio(stderr=os.devnull):
                run_command(cmd, verbose=False)

            # The tree must be read before the working directory is removed.
            tree_fp = os.path.join(work_dir, 'RAxML_bipartitions.q2')
            obs_tree = skbio.TreeNode.read(tree_fp,
                                           convert_underscores=False)

        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        observed_tip_ids = {tip.name for tip in obs_tree.tips()}
        expected_tip_ids = {
            'GCA001510755', 'GCA001045515', 'GCA000454205', 'GCA000473545',
            'GCA000196255', 'GCA002142615', 'GCA000686145', 'GCA001950115',
            'GCA001971985', 'GCA900007555',
        }
        self.assertEqual(observed_tip_ids, expected_tip_ids)
Exemple #27
0
    def test_uchime_ref_no_chimeras(self):
        # Reference-based chimera checking against 'ref-sequences-4.fasta':
        # the chimera output is expected to be empty and all four input
        # sequences are reported as non-chimeric.
        reference = DNAFASTAFormat(
            self.get_data_path('ref-sequences-4.fasta'), mode='r')
        with redirected_stdio(stderr=os.devnull):
            chimeras, nonchimeras, stats = uchime_ref(
                sequences=self.input_sequences,
                table=self.input_table,
                reference_sequences=reference)

        observed_chimeras = _read_seqs(chimeras)
        self.assertEqual(observed_chimeras, [])

        # sequences are reverse-sorted by abundance in output
        observed_nonchimeras = _read_seqs(nonchimeras)
        expected_nonchimeras = [self.input_sequences_list[i]
                                for i in (0, 1, 2, 3)]
        self.assertEqual(observed_nonchimeras, expected_nonchimeras)

        with stats.open() as stats_fh:
            stats_text = stats_fh.read()
        for feature_id in ('feature1', 'feature2', 'feature3', 'feature4'):
            self.assertTrue(feature_id in stats_text)
        # Blank lines (e.g. a trailing newline) are not counted as rows.
        non_empty_rows = [row for row in stats_text.split('\n') if row]
        self.assertEqual(len(non_empty_rows), 4)
Exemple #28
0
    def test_97_percent_clustering_feature4_most_abundant(self):
        # feature4 is given by far the largest counts. The expected table
        # has a 'feature4' row holding the summed counts of feature1,
        # feature3 and feature4, and an unchanged 'feature2' row.
        sample_ids = ['sample1', 'sample2', 'sample3']
        input_counts = np.array([[4, 5, 6],
                                 [1, 1, 2],
                                 [7, 8, 9],
                                 [100, 101, 103]])
        input_table = biom.Table(
            input_counts,
            ['feature1', 'feature2', 'feature3', 'feature4'],
            sample_ids)

        expected_counts = np.array([[111, 114, 118],
                                    [1, 1, 2]])
        exp_table = biom.Table(expected_counts,
                               ['feature4', 'feature2'],
                               sample_ids)

        with redirected_stdio(stderr=os.devnull):
            obs_table, obs_sequences = cluster_features_de_novo(
                sequences=self.input_sequences, table=input_table,
                perc_identity=0.97)

        # order of identifiers is important for biom.Table equality
        expected_order = exp_table.ids(axis='observation')
        obs_table = obs_table.sort_order(expected_order, axis='observation')
        self.assertEqual(obs_table, exp_table)

        # sequences are reverse-sorted by abundance in output
        observed_seqs = _read_seqs(obs_sequences)
        expected_seqs = [self.input_sequences_list[i] for i in (3, 1)]
        self.assertEqual(observed_seqs, expected_seqs)
Exemple #29
0
    def test_run_ultrafast_bs_not_verbose(self):
        # Run an iqtree ultrafast-bootstrap command through run_command with
        # verbose output disabled and check the tips of the resulting tree.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
        aligned_fp = str(alignment)

        with tempfile.TemporaryDirectory() as work_dir:
            run_prefix = os.path.join(work_dir, 'q2iqtreeufboot')
            options = [('-m', 'HKY'), ('-seed', '1723'), ('-bb', '1000'),
                       ('-s', aligned_fp), ('-pre', run_prefix),
                       ('-nt', '2')]
            cmd = ['iqtree']
            for flag, value in options:
                cmd += [flag, value]

            with redirected_stdio(stderr=os.devnull):
                run_command(cmd, verbose=False)

            # The tree must be read before the working directory is removed.
            tree_fp = run_prefix + '.treefile'
            obs_tree = skbio.TreeNode.read(tree_fp,
                                           convert_underscores=False)

        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        observed_tip_ids = {tip.name for tip in obs_tree.tips()}
        expected_tip_ids = {
            'GCA001510755', 'GCA001045515', 'GCA000454205', 'GCA000473545',
            'GCA000196255', 'GCA002142615', 'GCA000686145', 'GCA001950115',
            'GCA001971985', 'GCA900007555',
        }
        self.assertEqual(observed_tip_ids, expected_tip_ids)
    def test_dereplicate_sequences_prefix(self):
        # Dereplicate the 'seqs-1' fixture with derep_prefix=True and check
        # both the resulting feature table and the representative sequences.
        demux_dir = QIIME1DemuxDirFmt(self.get_data_path('seqs-1'), 'r')

        first_id = '4574b947a0159c0da35a1f30f989681a1d9f64ef'
        second_id = '16a1263bde4f2f99422630d1bb87935c4236d1ba'

        expected_counts = np.array([[2, 2], [2, 0]])
        exp_table = biom.Table(expected_counts,
                               [first_id, second_id],
                               ['s2', 'sample1'])

        with redirected_stdio(stderr=os.devnull):
            obs_table, obs_sequences = dereplicate_sequences(
                sequences=demux_dir, derep_prefix=True)

        # order of identifiers is important for biom.Table equality
        expected_order = exp_table.ids(axis='observation')
        obs_table = obs_table.sort_order(expected_order, axis='observation')
        self.assertEqual(obs_table, exp_table)

        # sequences are reverse-sorted by abundance in output
        observed_seqs = list(skbio.io.read(
            str(obs_sequences), constructor=skbio.DNA, format='fasta'))
        expected_seqs = [
            skbio.DNA(
                'AAACGTTACGGTTAACTATACATGCAGAAGACTAATCGG',
                metadata={'id': first_id, 'description': 's2_1'}),
            skbio.DNA(
                'ACGTACGTACGTACGTACGTACGTACGTACGTGCATGGTGCGACCG',
                metadata={'id': second_id, 'description': 's2_42'}),
        ]
        self.assertEqual(observed_seqs, expected_seqs)
Exemple #31
0
 def test_typical(self):
     # Trim the same adapter from the front of the forward and reverse reads
     # and compare every trimmed record against its untrimmed counterpart:
     # same header, trimmed sequence/quality contained in the original, and
     # no adapter left in the trimmed sequence.
     demuxed_art = Artifact.import_data(
         'SampleData[PairedEndSequencesWithQuality]',
         self.get_data_path('paired-end'))
     adapter = ['TACGGAGGATCC']
     with redirected_stdio(stdout=os.devnull):
         # The forward and reverse reads are identical in these data
         obs_art, = self.plugin.methods['trim_paired'](demuxed_art,
                                                       front_f=adapter,
                                                       front_r=adapter)
     demuxed = demuxed_art.view(SingleLanePerSampleSingleEndFastqDirFmt)
     demuxed_seqs = demuxed.sequences.iter_views(FastqGzFormat)
     obs = obs_art.view(SingleLanePerSampleSingleEndFastqDirFmt)
     obs_seqs = obs.sequences.iter_views(FastqGzFormat)
     # Iterate over each sample, side-by-side
     for (_, exp_fp), (_, obs_fp) in zip(demuxed_seqs, obs_seqs):
         # Context managers close both gzip handles even when an assertion
         # fails part-way through a file (the handles used to leak then).
         with gzip.open(str(exp_fp), 'rt') as exp_fh, \
                 gzip.open(str(obs_fp), 'rt') as obs_fh:
             # Repeating one file handle four times makes zip_longest pull
             # four consecutive lines (one FASTQ record) per iteration.
             paired_records = itertools.zip_longest(*[exp_fh] * 4,
                                                    *[obs_fh] * 4)
             # Iterate over expected and observed reads, side-by-side
             for records in paired_records:
                 (exp_seq_h, exp_seq, _, exp_qual, obs_seq_h, obs_seq, _,
                  obs_qual) = records
                 # Make sure cutadapt hasn't shuffled the read order
                 self.assertEqual(exp_seq_h, obs_seq_h)
                 self.assertIn(obs_seq, exp_seq)
                 # The adapter should not be present in the trimmed seqs
                 self.assertNotIn('TACGGAGGATCC', obs_seq)
                 self.assertIn(obs_qual, exp_qual)
                 # Make sure cutadapt trimmed the quality scores, too
                 self.assertEqual(len(obs_seq), len(obs_qual))
Exemple #32
0
    def test_raxml_rapid_bootstrap_with_seed(self):
        # Test tip-to-tip dists are identical to manually run RAxML output.
        # This test is comparing an ordered series of tip-to-tip distances
        # to a tree output from a manual run of the default command:
        #     raxmlHPC -f a -m GTRGAMMA -p 1723 -x 3871
        #         -s aligned-dna-sequences-3.fasta -n q2
        # NOTE: I cleanly rounded the tip-to-tip dists (i.e. `%.4f`) as RAxML
        # may return slightly different rounding errors on different
        # systems (and at times, between conda environments).
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

        # test that branchlengths are identical
        with redirected_stdio(stderr=os.devnull):
            obs = raxml_rapid_bootstrap(input_sequences, seed=1723,
                                        rapid_bootstrap_seed=3871,
                                        bootstrap_replicates=10)
        obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
        # sometimes we lose the last set of numbers on long floats
        obs_tl = list(obs_tree.tip_tip_distances().to_series())
        obs_series = set(['%.4f' % e for e in obs_tl])

        exp_tree = skbio.TreeNode.read(self.get_data_path('test2.tre'),
                                       convert_underscores=True)
        exp_tl = list(exp_tree.tip_tip_distances().to_series())
        exp_series = set(['%.4f' % e for e in exp_tl])
        self.assertEqual(obs_series, exp_series)

        # test that bootstrap supports are identical
        # NOTE: `list.sort()` sorts in place and returns None, so comparing
        # its return values (as this test used to) always passed. `sorted`
        # returns the sorted list; `key=str` keeps the ordering well-defined
        # should any internal node be unnamed (name is None).
        obs_bs = sorted((node.name for node in obs_tree.non_tips()),
                        key=str)
        exp_bs = sorted((node.name for node in exp_tree.non_tips()),
                        key=str)
        self.assertEqual(obs_bs, exp_bs)
Exemple #33
0
    def test_multithreaded_mafft(self):
        # Align with n_threads='auto'; the alignment must equal the
        # expectation supplied by the shared fixture helper.
        unaligned, expected_msa = self._prepare_sequence_data()

        with redirected_stdio(stderr=os.devnull):
            aligned = mafft(unaligned, n_threads='auto')

        observed_msa = skbio.io.read(
            str(aligned), into=skbio.TabularMSA, constructor=skbio.DNA)
        self.assertEqual(observed_msa, expected_msa)
Exemple #34
0
 def test_nans_in_unused_column(self):
     # The 'number' column contains a NaN but does not appear in the
     # formula. The test makes no assertions: it only requires that adonis
     # runs without raising.
     sample_ids = pd.Index(['sample1', 'sample2', 'sample3'], name='id')
     frame = pd.DataFrame([[1, 'a'], [1, 'b'], [np.nan, 'b']],
                          columns=['number', 'letter'],
                          index=sample_ids)
     md = qiime2.Metadata(frame)
     with redirected_stdio(stderr=os.devnull), \
             tempfile.TemporaryDirectory() as output_dir:
         adonis(output_dir, self.dm, md, 'letter+letter')
Exemple #35
0
 def test_mafft_parttree_exception(self):
     # Write 1,000,002 short FASTA records and check that mafft raises a
     # ValueError whose message mentions '1 million'.
     input_fp = os.path.join(self.temp_dir.name, 'million.fasta')
     with open(input_fp, "w") as fasta_fh:
         fasta_fh.writelines(
             '>%d\nAAGCAAGC\n' % seq_number
             for seq_number in range(1000002))
     input_sequences = DNAFASTAFormat(input_fp, mode='r')
     with self.assertRaisesRegex(ValueError, '1 million'), \
             redirected_stdio(stderr=os.devnull):
         mafft(input_sequences)
 def test_failed_run_not_verbose(self):
     # A mafft command carrying an unknown flag must raise
     # CalledProcessError from run_command when verbose output is disabled.
     unaligned = DNAFASTAFormat(
         self.get_data_path('unaligned-dna-sequences-1.fasta'), mode='r')
     aligned = AlignedDNAFASTAFormat()
     cmd = ["mafft", "--not-a-real-parameter", str(unaligned)]
     output_fp = str(aligned)
     with self.assertRaises(subprocess.CalledProcessError), \
             redirected_stdio(stderr=os.devnull):
         run_command(cmd, output_fp, verbose=False)
Exemple #37
0
 def test_build_iqtree_ufbs_command(self):
     # Build an IQ-TREE ultrafast-bootstrap command and check that each
     # option value appears at its expected position in the returned list.
     alignment = AlignedDNAFASTAFormat(
         self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
     build_options = {
         'seed': 1723, 'n_cores': 0, 'n_runs': 5,
         'bootstrap_replicates': 2000, 'substitution_model': 'MFP',
         'dtype': 'DNA', 'safe': 'True', 'allnni': 'True',
         'alrt': 500, 'abayes': True, 'lbp': 400, 'bnni': True,
         'n_init_pars_trees': 200, 'n_top_init_trees': 30,
         'n_best_retain_trees': 10, 'stop_iter': 300,
         'perturb_nni_strength': 0.55, 'spr_radius': 8,
         'n_max_ufboot_iter': 600, 'n_ufboot_steps': 80,
         'min_cor_ufboot': 0.66, 'ep_break_ufboot': 0.51,
     }
     with tempfile.TemporaryDirectory() as work_dir:
         run_prefix = os.path.join(work_dir, 'q2iqtreeufboot')
         with redirected_stdio(stderr=os.devnull):
             obs = _build_iqtree_ufbs_command(
                 alignment, run_prefix=run_prefix, **build_options)

     def check_positions(expectations):
         # Each entry pairs an expected substring with its index in `obs`.
         for expected, position in expectations:
             self.assertTrue(expected in obs[position])

     check_positions([('2000', 2), ('DNA', 4), ('5', 6)])
     # The alignment path is compared after coercing the argument to str.
     self.assertTrue(str(alignment) in str(obs[8]))
     check_positions([
         ('MFP', 10), (str(run_prefix), 12), ('AUTO', 14), ('1723', 16),
         ('-safe', 17), ('-allnni', 18), ('500', 20), ('-abayes', 21),
         ('400', 23), ('-bnni', 24), ('200', 26), ('30', 28), ('10', 30),
         ('300', 32), ('0.55', 34), ('8', 36), ('600', 38), ('80', 40),
         ('0.66', 42), ('0.51', 44),
     ])
Exemple #38
0
    def test_failed_run_not_verbose(self):
        # A FastTree command carrying an unknown flag must raise
        # CalledProcessError from run_command when verbose output is
        # disabled.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-1.fasta'), mode='r')
        tree = NewickFormat()
        cmd = ['FastTree', '-nt', '-not-a-real-parameter', str(alignment)]
        tree_fp = str(tree)

        with self.assertRaises(subprocess.CalledProcessError), \
                redirected_stdio(stderr=os.devnull):
            run_command(cmd, tree_fp, verbose=False)
 def test_mafft(self):
     # Align the two-sequence fixture and compare the result with the
     # expected gapped alignment.
     unaligned = DNAFASTAFormat(
         self.get_data_path('unaligned-dna-sequences-1.fasta'), mode='r')
     expected_rows = [
         skbio.DNA('AGGGGGG', metadata={'id': 'seq1', 'description': ''}),
         skbio.DNA('-GGGGGG', metadata={'id': 'seq2', 'description': ''}),
     ]
     expected_msa = skbio.TabularMSA(expected_rows)
     with redirected_stdio(stderr=os.devnull):
         aligned = mafft(unaligned)
     observed_msa = skbio.io.read(
         str(aligned), into=skbio.TabularMSA, constructor=skbio.DNA)
     self.assertEqual(observed_msa, expected_msa)
Exemple #40
0
 def test_fasttree_underscore_ids(self):
     # Build a FastTree tree from an alignment whose ids contain
     # underscores and check the sorted tip ids.
     alignment = AlignedDNAFASTAFormat(
         self.get_data_path('aligned-dna-sequences-2.fasta'), mode='r')
     with redirected_stdio(stderr=os.devnull):
         tree_result = fasttree(alignment)
     # load the resulting tree and test that it has the right number of
     # tips and the right tip ids (the branch lengths can vary with
     # different versions of FastTree)
     obs_tree = skbio.TreeNode.read(str(tree_result))
     observed_tip_ids = sorted(tip.name for tip in obs_tree.tips())
     self.assertEqual(observed_tip_ids, ['_s_e_q_1_', '_s_e_q_2_'])
Exemple #41
0
    def test_raxml_num_searches(self):
        # Run RAxML with five searches and a fixed seed, then compare the
        # rounded tip-to-tip distances with those of the 'test3.tre' tree.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
        with redirected_stdio(stderr=os.devnull):
            tree_result = raxml(alignment, seed=1723, n_searches=5)

        obs_tree = skbio.TreeNode.read(str(tree_result),
                                       convert_underscores=False)
        observed = {'%.4f' % dist
                    for dist in obs_tree.tip_tip_distances().to_series()}

        exp_tree = skbio.TreeNode.read(self.get_data_path('test3.tre'))
        expected = {'%.4f' % dist
                    for dist in exp_tree.tip_tip_distances().to_series()}
        self.assertEqual(observed, expected)
Exemple #42
0
 def test_iqtree_safe_allnni(self):
     # Same as `test_iqtree` but testing the `-safe` and `-allnni` flags
     alignment = AlignedDNAFASTAFormat(
         self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
     with redirected_stdio(stderr=os.devnull):
         tree_result = iqtree(alignment, safe='True', allnni='True')
     obs_tree = skbio.TreeNode.read(str(tree_result))
     observed_tip_ids = {tip.name for tip in obs_tree.tips()}
     expected_tip_ids = {
         'GCA001510755', 'GCA001045515', 'GCA000454205', 'GCA000473545',
         'GCA000196255', 'GCA002142615', 'GCA000686145', 'GCA001950115',
         'GCA001971985', 'GCA900007555',
     }
     self.assertEqual(observed_tip_ids, expected_tip_ids)
Exemple #43
0
 def test_fasttree_n_threads(self):
     # Build a FastTree tree with n_threads=-1 and check the sorted tip ids.
     alignment = AlignedDNAFASTAFormat(
         self.get_data_path('aligned-dna-sequences-1.fasta'), mode='r')
     with redirected_stdio(stderr=os.devnull):
         tree_result = fasttree(alignment, n_threads=-1)
     # load the resulting tree and test that it has the right number of
     # tips and the right tip ids (the branch lengths can vary with
     # different versions of FastTree, and threading can produce
     # non-deterministic trees)
     obs_tree = skbio.TreeNode.read(str(tree_result))
     observed_tip_ids = sorted(tip.name for tip in obs_tree.tips())
     self.assertEqual(observed_tip_ids, ['seq1', 'seq2'])
Exemple #44
0
    def test_raxml_model_choice(self):
        # Tip-to-tip distances should NOT be identical under different
        # substitution models. The default call (commented below as
        # GTRGAMMA) is compared with GTRGAMMAI and GTRCAT. For the
        # comparison to be meaningful every run uses the same seed.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')

        def tip_distances(**model_kwargs):
            # Run RAxML with the shared seed and return the set of
            # tip-to-tip distances of the resulting tree.
            with redirected_stdio(stderr=os.devnull):
                tree_result = raxml(alignment, seed=1723, **model_kwargs)
                tree = skbio.TreeNode.read(
                    str(tree_result), convert_underscores=False)
                return set(tree.tip_tip_distances().to_series())

        # default GTRGAMMA
        gtrg_td = tip_distances()
        # set GTRGAMMAI
        gtrgi_td = tip_distances(substitution_model='GTRGAMMAI')
        # set GTRCAT
        gtrcat_td = tip_distances(substitution_model='GTRCAT')

        # test pairs are not equivalent
        for first, second in ((gtrg_td, gtrgi_td),
                              (gtrg_td, gtrcat_td),
                              (gtrgi_td, gtrcat_td)):
            self.assertNotEqual(first, second)
Exemple #45
0
 def test_rapid_bootstrap_command(self):
     # Build a RAxML rapid-bootstrap command and check that each argument
     # appears at its expected position in the returned list.
     alignment = AlignedDNAFASTAFormat(
         self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
     with tempfile.TemporaryDirectory() as work_dir:
         with redirected_stdio(stderr=os.devnull):
             obs = _build_rapid_bootstrap_command(
                 alignment, 1723, 8752, 15, 'GTRGAMMA', work_dir, 'bs')
     # The alignment path is compared after coercing the argument to str.
     self.assertTrue(str(alignment) in str(obs[11]))
     # Each entry pairs an expected substring with its index in `obs`.
     expectations = [('1723', 5), ('8752', 7), ('15', 9), ('GTRGAMMA', 3),
                     (str(work_dir), 13), ('bs', 15)]
     for expected, position in expectations:
         self.assertTrue(expected in obs[position])
Exemple #46
0
 def test_raxml(self):
     # Test that output tree is made.
     # Reads tree output and compares tip labels to expected labels.
     alignment = AlignedDNAFASTAFormat(
         self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
     with redirected_stdio(stderr=os.devnull):
         tree_result = raxml(alignment)
     obs_tree = skbio.TreeNode.read(str(tree_result))
     # load the resulting tree and test that it has the right number of
     # tips and the right tip ids
     observed_tip_ids = {tip.name for tip in obs_tree.tips()}
     expected_tip_ids = {
         'GCA001510755', 'GCA001045515', 'GCA000454205', 'GCA000473545',
         'GCA000196255', 'GCA002142615', 'GCA000686145', 'GCA001950115',
         'GCA001971985', 'GCA900007555',
     }
     self.assertEqual(observed_tip_ids, expected_tip_ids)
Exemple #47
0
 def test_build_iqtree_command(self):
     # Build an IQ-TREE command and check that each option value appears at
     # its expected position in the returned list.
     alignment = AlignedDNAFASTAFormat(
         self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')
     build_options = {
         'seed': 1723, 'n_cores': 0, 'n_runs': 2,
         'substitution_model': 'MFP', 'dtype': 'DNA',
         'safe': 'True', 'fast': 'True',
         'alrt': 1000, 'abayes': True, 'lbp': 1000,
         'n_init_pars_trees': 200, 'n_top_init_trees': 30,
         'n_best_retain_trees': 10, 'n_iter': 80, 'stop_iter': 300,
         'perturb_nni_strength': 0.55, 'spr_radius': 8,
         'allnni': 'True',
     }
     with tempfile.TemporaryDirectory() as work_dir:
         run_prefix = os.path.join(work_dir, 'q2iqtree')
         with redirected_stdio(stderr=os.devnull):
             obs = _build_iqtree_command(
                 alignment, run_prefix=run_prefix, **build_options)

     def check_positions(expectations):
         # Each entry pairs an expected substring with its index in `obs`.
         for expected, position in expectations:
             self.assertTrue(expected in obs[position])

     check_positions([('DNA', 2), ('2', 4)])
     # The alignment path is compared after coercing the argument to str.
     self.assertTrue(str(alignment) in str(obs[6]))
     check_positions([
         ('MFP', 8), (str(run_prefix), 10), ('AUTO', 12), ('1723', 14),
         ('-safe', 15), ('-fast', 16), ('1000', 18), ('-abayes', 19),
         ('1000', 21), ('-allnni', 22), ('200', 24), ('30', 26),
         ('10', 28), ('80', 30), ('300', 32), ('0.55', 34), ('8', 36),
     ])
Exemple #48
0
    def test_raxml_rapid_bootstrap_n_threads(self):
        # Test that an output tree is made when invoking threads.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')

        with redirected_stdio(stderr=os.devnull):
            tree_result = raxml_rapid_bootstrap(alignment, n_threads=2)
        obs_tree = skbio.TreeNode.read(str(tree_result),
                                       convert_underscores=False)

        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        observed_tip_ids = {tip.name for tip in obs_tree.tips()}
        expected_tip_ids = {
            'GCA001510755', 'GCA001045515', 'GCA000454205', 'GCA000473545',
            'GCA000196255', 'GCA002142615', 'GCA000686145', 'GCA001950115',
            'GCA001971985', 'GCA900007555',
        }
        self.assertEqual(observed_tip_ids, expected_tip_ids)
Exemple #49
0
 def test_raxml_underscore_ids(self):
     # Test that output tree is made with underscores in tip IDs.
     # Some programs and python wrappers may strip underscores.
     # Reads tree output and compares tip labels to expected labels.
     alignment = AlignedDNAFASTAFormat(
         self.get_data_path('aligned-dna-sequences-4.fasta'), mode='r')
     with redirected_stdio(stderr=os.devnull):
         tree_result = raxml(alignment)
     obs_tree = skbio.TreeNode.read(str(tree_result),
                                    convert_underscores=False)
     # load the resulting tree and test that it has the right number of
     # tips and the right tip ids
     observed_tip_ids = {tip.name for tip in obs_tree.tips()}
     expected_tip_ids = {
         'GCA_001510755_1', 'GCA_001045515_1', 'GCA_000454205_1',
         'GCA_000473545_1', 'GCA_000196255_1', 'GCA_002142615_1',
         'GCA_000686145_1', 'GCA_001950115_1', 'GCA_001971985_1',
         'GCA_900007555_1',
     }
     self.assertEqual(observed_tip_ids, expected_tip_ids)
Exemple #50
0
    def test_raxml_with_seed(self):
        # Test tip-to-tip dists are identical to manually run RAxML output.
        # The distances are compared with those of a tree produced by a
        # manual run of the default command:
        # raxmlHPC -m GTRGAMMA -p 1723 -s aligned-dna-sequences-3.fasta -n q2
        # NOTE: distances are rounded (`%.4f`) because RAxML may return
        # slightly different rounding errors on different systems.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')

        with redirected_stdio(stderr=os.devnull):
            tree_result = raxml(alignment, seed=1723)
        obs_tree = skbio.TreeNode.read(str(tree_result),
                                       convert_underscores=False)
        observed = {'%.4f' % dist
                    for dist in obs_tree.tip_tip_distances().to_series()}

        exp_tree = skbio.TreeNode.read(self.get_data_path('test.tre'))
        expected = {'%.4f' % dist
                    for dist in exp_tree.tip_tip_distances().to_series()}

        self.assertEqual(observed, expected)
Exemple #51
0
    def test_iqtree_with_seed(self):
        # Test tip-to-tip dists are identical to manually run IQ-TREE output.
        # The distances are compared with those of a tree produced by a
        # manual run of the default command:
        #     iqtree -seed 1723 -m HKY -s aligned-dna-sequences-3.fasta
        #            -nt 1 -pre q2iqtree
        # NOTE: distances are rounded (`%.4f`) because IQ-TREE may return
        # slightly different rounding errors on different systems.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-3.fasta'), mode='r')

        with redirected_stdio(stderr=os.devnull):
            tree_result = iqtree(alignment, seed=1723,
                                 substitution_model='HKY')
        obs_tree = skbio.TreeNode.read(str(tree_result),
                                       convert_underscores=False)
        observed = {'%.4f' % dist
                    for dist in obs_tree.tip_tip_distances().to_series()}

        exp_tree = skbio.TreeNode.read(self.get_data_path('test4.tre'))
        expected = {'%.4f' % dist
                    for dist in exp_tree.tip_tip_distances().to_series()}

        self.assertEqual(observed, expected)
Exemple #52
0
def create_job():
    """Start a plugin action from the JSON request body and track it.

    The request body must contain the keys ``plugin``, ``action``,
    ``inputs``, ``parameters`` and ``outputs``. Metadata-typed parameters
    are loaded from the supplied values (an empty string means "not
    provided" and becomes ``None``); every other parameter is passed
    through unchanged. A record for the job is stored in ``JOBS`` under a
    fresh UUID before the action is launched with stdout/stderr redirected
    to temporary files.

    Returns:
        The ``jsonify`` response whose ``job`` entry is the URL of the
        ``.inspect_job`` endpoint for the new job.

    Raises:
        NotImplementedError: a ``MetadataColumn`` parameter has a column
            type other than categorical, numeric, or their union.
        TypeError: the selected metadata column's type is not one the
            parameter accepts.
    """
    # TODO: handle errors in the request body
    body = request.get_json()
    plugin_name = body['plugin']
    action_name = body['action']
    raw_inputs = body['inputs']
    raw_params = body['parameters']
    output_names = body['outputs']

    action = PLUGIN_MANAGER.plugins[plugin_name].actions[action_name]

    # TODO: make this better
    # `display_params` is the JSON-safe view of the parameters stored on the
    # job record; loaded metadata objects are shown as '<metadata>'.
    display_params = {}
    for name, spec in action.signature.parameters.items():
        qtype = spec.qiime_type

        if qtype == Metadata:
            value = raw_params[name]
            if value == "":
                raw_params[name] = display_params[name] = None
            else:
                raw_params[name] = qiime2.Metadata.load(value)
                display_params[name] = '<metadata>'
            continue

        # TODO is there a better way to check whether `spec.qiime_type` is some
        # kind of `MetadataColumn` subtype using the type system API? The
        # current approach here matches more or less what q2cli is doing.
        if qtype.name != 'MetadataColumn':
            # Plain parameter: forwarded and recorded as-is.
            display_params[name] = raw_params[name]
            continue

        if qtype == MetadataColumn[Categorical]:
            allowed = ('categorical',)
        elif qtype == MetadataColumn[Numeric]:
            allowed = ('numeric',)
        elif qtype == MetadataColumn[Categorical | Numeric]:
            allowed = ('categorical', 'numeric')
        else:
            raise NotImplementedError(
                "Parameter %r is type %r, which is not currently "
                "supported by this interface." % (name, qtype))

        # The value is indexed as [metadata source, column name]; both must
        # be non-empty for the column to be loaded.
        selection = raw_params[name]
        if selection[0] != "" and selection[1] != "":
            column_name = selection[1]
            column = qiime2.Metadata.load(selection[0]).get_column(
                column_name)

            if column.type not in allowed:
                suffix = ('%s.' % allowed[0] if len(allowed) == 1 else
                          'one of the following types: %s' %
                          ', '.join(allowed))
                raise TypeError(
                    "Metadata column %r is %s. Parameter %r expects the "
                    "column to be %s" %
                    (column_name, column.type, name, suffix))

            raw_params[name] = column
            display_params[name] = '<metadata>'
        else:
            raw_params[name] = display_params[name] = None

    decoded_params = action.signature.decode_parameters(**raw_params)
    call_kwargs = load_artifacts(**raw_inputs)

    job_id = str(uuid.uuid4())
    started_ms = int(time.time() * 1000)

    JOBS[job_id] = {
        'uuid': job_id,
        'completed': False,
        'error': False,
        'started': started_ms,
        'finished': None,
        'stdout': None,
        'stderr': None,
        'code': action.source,
        'actionId': action.id,
        'actionName': action.name,
        'inputs': {label: artifact.uuid
                   for label, artifact in call_kwargs.items()},
        'params': display_params,
        'outputs': {label: None for label in output_names}
    }

    # Artifacts and decoded parameters are passed together as keyword
    # arguments; the job record above was built before merging so that its
    # 'inputs' entry only lists artifacts.
    call_kwargs.update(decoded_params)

    # Add prefix just in case the file isn't unlinked, but we don't need a
    # name either way as the context manager works on file-descriptors
    out_file = tempfile.TemporaryFile(prefix='q2studio-stdout')
    err_file = tempfile.TemporaryFile(prefix='q2studio-stderr')
    with redirected_stdio(stdout=out_file, stderr=err_file):
        future = action.asynchronous(**call_kwargs)
        on_done = _callback_factory(job_id, output_names, out_file, err_file)
        future.add_done_callback(on_done)

    return jsonify({
        'job': url_for('.inspect_job', job_id=job_id)
    })
Exemple #53
0
def create_job():
    """Start a plugin action from the JSON request body and track it.

    The request body must contain the keys ``plugin``, ``action``,
    ``inputs``, ``parameters`` and ``outputs``. ``Metadata`` and
    ``MetadataCategory`` parameters are loaded from the supplied values
    (an empty string means "not provided" and becomes ``None``); every
    other parameter is passed through unchanged. A record for the job is
    stored in ``JOBS`` under a fresh UUID before the action is launched
    with stdout/stderr redirected to temporary files.

    Returns:
        The ``jsonify`` response whose ``job`` entry is the URL of the
        ``.inspect_job`` endpoint for the new job.
    """
    # TODO: handle errors in the request body
    request_body = request.get_json()
    plugin = request_body['plugin']
    action = request_body['action']
    inputs = request_body['inputs']
    parameters = request_body['parameters']
    outputs = request_body['outputs']

    plugin = PLUGIN_MANAGER.plugins[plugin]
    action = plugin.actions[action]

    # TODO: make this better
    # `json_params` is the JSON-safe view of the parameters stored on the
    # job record; loaded metadata objects are shown as '<metadata>'.
    json_params = {}
    for key, spec in action.signature.parameters.items():
        if spec.qiime_type == qiime2.plugin.Metadata:
            if parameters[key] == "":
                parameters[key] = None
                json_params[key] = None
            else:
                parameters[key] = qiime2.Metadata.load(parameters[key])
                json_params[key] = '<metadata>'
        elif spec.qiime_type == qiime2.plugin.MetadataCategory:
            # The value is indexed as [metadata source, category name]; both
            # must be non-empty for the category to be loaded.
            if parameters[key][0] == "" or parameters[key][1] == "":
                parameters[key] = None
                json_params[key] = None
            else:
                parameters[key] = qiime2.Metadata.load(
                    parameters[key][0]).get_category(parameters[key][1])
                json_params[key] = '<metadata>'
        else:
            json_params[key] = parameters[key]

    parameters = action.signature.decode_parameters(**parameters)
    inputs = load_artifacts(**inputs)

    job_id = str(uuid.uuid4())
    now = int(time.time() * 1000)

    JOBS[job_id] = {
        'uuid': job_id,
        'completed': False,
        'error': False,
        'started': now,
        'finished': None,
        'stdout': None,
        'stderr': None,
        'code': action.source,
        'actionId': action.id,
        'actionName': action.name,
        'inputs': {k: v.uuid for k, v in inputs.items()},
        'params': json_params,
        'outputs': {k: None for k in outputs}
    }

    # Artifacts and decoded parameters are passed together as keyword
    # arguments to the action.
    inputs.update(parameters)

    # `async` is a reserved keyword from Python 3.7 on, so the attribute can
    # no longer be written as `action.async` without a SyntaxError. Prefer
    # the `asynchronous` spelling when the action provides it and fall back
    # to the legacy `async` attribute by name otherwise.
    launch = getattr(action, 'asynchronous', None)
    if launch is None:
        launch = getattr(action, 'async')

    # Add prefix just in case the file isn't unlinked, but we don't need a
    # name either way as the context manager works on file-descriptors
    stdout = tempfile.TemporaryFile(prefix='q2studio-stdout')
    stderr = tempfile.TemporaryFile(prefix='q2studio-stderr')
    with redirected_stdio(stdout=stdout, stderr=stderr):
        future = launch(**inputs)
        future.add_done_callback(
            _callback_factory(job_id, outputs, stdout, stderr))
    return jsonify({
        'job': url_for('.inspect_job', job_id=job_id)
    })