def test_missing_feature_ids(self):
        """Grouping fails when a table feature has no metadata entry.

        The metadata column describes features a, c, d, e, f and g, whereas
        the table holds a-f. Feature 'b' is therefore in the table without a
        group, and the expected ValueError message must mention it.
        """
        # 'b' is deliberately absent from the metadata; 'g' is an extra ID
        # that the table does not contain.
        metadata_ids = pd.Index(['a', 'c', 'd', 'e', 'f', 'g'],
                                name='featureid')
        groups = pd.Series(['g0', 'g1', 'g2', 'g1', 'g2', 'extra'],
                           index=metadata_ids, name='foo')
        feature_mc = qiime2.CategoricalMetadataColumn(groups)

        counts = np.array([
            [1, 0, 0],
            [1, 10, 10],
            [0, 0, 100],
            [5, 5, 5],
            [0, 1, 100],
            [7, 8, 9],
        ])
        table = biom.Table(counts,
                           observation_ids=list('abcdef'),
                           sample_ids=['s1', 's2', 's3'])

        with self.assertRaisesRegex(ValueError, "not present.*'b'"):
            group(table, axis='feature', metadata=feature_mc, mode='sum')
    def test_extra_metadata(self):
        """Metadata may describe samples that the distance matrix lacks.

        The metadata column lists sample1-sample4 while the distance matrix
        only holds sample1-sample3. The visualization must still be written
        and its index.html must contain a ``<td>2</td>`` cell.
        """
        dm = skbio.DistanceMatrix(
            [[0.00, 0.25, 0.25], [0.25, 0.00, 0.00], [0.25, 0.00, 0.00]],
            ids=['sample1', 'sample2', 'sample3'])
        md = qiime2.CategoricalMetadataColumn(
            pd.Series(['a', 'b', 'b', 'c'],
                      name='a or b',
                      index=pd.Index(
                          ['sample1', 'sample2', 'sample3', 'sample4'],
                          name='id')))

        with tempfile.TemporaryDirectory() as output_dir:
            beta_group_significance(output_dir, dm, md, permutations=42)
            index_fp = os.path.join(output_dir, 'index.html')
            # Close the handle explicitly (via the context manager) before
            # the temporary directory is torn down.
            with open(index_fp) as fh:
                html = fh.read()
            # assertIn reports the searched text on failure, unlike
            # assertTrue(... in ...).
            self.assertIn('<td>2</td>', html)
 def test_filtered_samples_str_metadata(self):
     """A sample with a missing metadata value produces a warning.

     sample4 carries NaN in the metadata column; the rendered index.html is
     expected to contain the text 'Warning'.
     """
     dm = skbio.DistanceMatrix(
         [[0.00, 0.25, 0.25, 0.66], [0.25, 0.00, 0.00, 0.66],
          [0.25, 0.00, 0.00, 0.66], [0.66, 0.66, 0.66, 0.00]],
         ids=['sample1', 'sample2', 'sample3', 'sample4'])
     md = qiime2.CategoricalMetadataColumn(
         pd.Series(['a', 'b', 'b', np.nan],
                   name='a or b',
                   index=pd.Index(
                       ['sample1', 'sample2', 'sample3', 'sample4'],
                       name='id')))
     with tempfile.TemporaryDirectory() as output_dir:
         beta_group_significance(output_dir, dm, md)
         index_fp = os.path.join(output_dir, 'index.html')
         # Close the handle explicitly (via the context manager) before the
         # temporary directory is torn down.
         with open(index_fp) as fh:
             html = fh.read()
         # assertIn reports the searched text on failure, unlike
         # assertTrue(... in ...).
         self.assertIn('Warning', html)
    def test_empty_metadata_values(self):
        """Missing metadata values (None or NaN) make ``group`` raise.

        Two columns are tried against the same table: one with ``None`` for
        sample 'c' and one with NaN for sample 'b'. In both cases the
        ValueError message must name the offending sample.
        """
        # Only the sample axis is exercised here, on the assumption that the
        # implementation does not treat feature metadata any differently.
        with_none = pd.Series(
            ['a_new', 'a_new', None],
            index=pd.Index(['a', 'b', 'c'], name='sampleid'),
            name='foo')
        sample_mc = qiime2.CategoricalMetadataColumn(with_none)
        sample_ids = sample_mc.to_series().index

        counts = np.array([[1, 2, 3],
                           [30, 20, 10]])
        table = biom.Table(counts,
                           observation_ids=['x', 'y'],
                           sample_ids=sample_ids)

        with self.assertRaisesRegex(ValueError, "missing.*value.*'c'"):
            group(table, axis='sample', metadata=sample_mc, mode='sum')

        with_nan = pd.Series(
            ['a_new', float('nan'), 'a_new'],
            index=pd.Index(['a', 'b', 'c'], name='id'),
            name='foo')
        nan_mc = qiime2.CategoricalMetadataColumn(with_nan)

        with self.assertRaisesRegex(ValueError, "missing.*value.*'b'"):
            group(table, axis='sample', metadata=nan_mc, mode='sum')
Exemple #5
0
    def setUp(self):
        """Create a two-group metadata column and a 5x6 frequency table.

        ``self.md`` assigns samples a-c to group 'a' and d-f to group 'b';
        ``self.tab`` is the table imported as a FeatureTable[Frequency]
        artifact with features v-z and samples a-f.
        """
        super().setUp()

        sample_ids = list('abcdef')
        group_labels = pd.Series(['a', 'a', 'a', 'b', 'b', 'b'],
                                 index=pd.Index(sample_ids, name='id'),
                                 name='foo')
        self.md = qiime2.CategoricalMetadataColumn(group_labels)

        # One row per feature (v..z), one column per sample (a..f).
        counts = np.array([
            [13, 26, 37, 3, 6, 1],
            [33, 24, 23, 5, 6, 2],
            [38, 26, 33, 4, 1, 0],
            [3, 2, 1, 22, 25, 31],
            [2, 1, 3, 44, 46, 42],
        ])
        tab = biom.Table(counts,
                         observation_ids=list('vwxyz'),
                         sample_ids=list('abcdef'))
        self.tab = qiime2.Artifact.import_data('FeatureTable[Frequency]', tab)
Exemple #6
0
    def setUp(self):
        """Load the tables, metadata and expected results used by the tests.

        Note that, despite the ``_fp`` suffix, the ``table_*_fp``, ``md_*_fp``
        and ``mdc_*_fp`` attributes hold loaded objects (biom tables,
        Metadata and metadata columns), not file paths.
        """
        super().setUp()

        def _read_md_frame(md_fp):
            # Single place where metadata TSVs are parsed, so every loader
            # below reads them with identical options.
            return pd.read_csv(self.get_data_path(md_fp), sep='\t',
                               header=0, index_col=0)

        def _load_biom(table_fp):
            table_fp = self.get_data_path(table_fp)
            table = qiime2.Artifact.load(table_fp)
            table = table.view(biom.Table)
            return table

        def _load_md(md_fp):
            return qiime2.Metadata(_read_md_frame(md_fp))

        def _load_nmc(md_fp, column):
            return qiime2.NumericMetadataColumn(_read_md_frame(md_fp)[column])

        def _load_cmc(md_fp, column):
            return qiime2.CategoricalMetadataColumn(
                _read_md_frame(md_fp)[column])

        self.table_chard_fp = _load_biom('chardonnay.table.qza')
        self.md_chard_fp = _load_md('chardonnay.map.txt')
        self.mdc_chard_fp = _load_cmc('chardonnay.map.txt', 'Region')
        self.table_ecam_fp = _load_biom('ecam-table-maturity.qza')
        self.md_ecam_fp = _load_md('ecam_map_maturity.txt')
        self.mdc_ecam_fp = _load_nmc('ecam_map_maturity.txt', 'month')
        self.exp_imp = pd.read_csv(
            self.get_data_path('importance.tsv'), sep='\t', header=0,
            index_col=0)
        # read_csv's ``squeeze=True`` was deprecated in pandas 1.4 and
        # removed in 2.0; squeezing the column axis afterwards yields the
        # same Series when the file has a single data column.
        self.exp_pred = pd.read_csv(
            self.get_data_path('predictions.tsv'), sep='\t', header=0,
            index_col=0).squeeze('columns')
        index = pd.Index(['A', 'B', 'C', 'D'], name='id')
        self.table_percnorm = qiime2.Artifact.import_data(
            FeatureTable[PercentileNormalized], pd.DataFrame(
                [[20.0, 20.0, 50.0, 10.0], [10.0, 10.0, 70.0, 10.0],
                 [90.0, 8.0, 1.0, 1.0], [30.0, 15.0, 20.0, 35.0]],
                index=index,
                columns=['feat1', 'feat2', 'feat3', 'feat4'])).view(biom.Table)
        self.mdc_percnorm = qiime2.CategoricalMetadataColumn(
            pd.Series(['X', 'X', 'Y', 'Y'], index=index, name='name'))
    def setUp(self):
        """Load the mock-3 fixtures and build the expected result frames.

        Besides the expected/observed artifacts, this prepares three small
        DataFrames (``false_neg``, ``misclassified``, ``underclassified``),
        each indexed by 'Taxon' with one column per mock sample, and a
        metadata column mapping every sample to a mock community ID.
        """
        super().setUp()
        self.exp_results = pd.read_csv(
            self.get_data_path('mock-3-results.tsv'), sep='\t', index_col=0)

        expected_artifact = qiime2.Artifact.load(
            self.get_data_path('qc-mock-3-expected.qza'))
        self.exp = expected_artifact.view(pd.DataFrame)
        observed_artifact = qiime2.Artifact.load(
            self.get_data_path('qc-mock-3-observed.qza'))
        self.obs = observed_artifact.view(pd.DataFrame)

        even1 = 'HMPMockV1.1.Even1'
        even2 = 'HMPMockV1.1.Even2'
        staggered1 = 'HMPMockV1.2.Staggered1'
        staggered2 = 'HMPMockV1.2.Staggered2'

        # Both even samples share one set of values, as do both staggered
        # samples.
        even_values = [0.047619, 0.047619, 0.047619]
        staggered_values = [0.2143622714, 0.0214362274, 0.0002143626]
        false_neg_taxa = pd.Index(
            ['k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;'
             'f__Staphylococcaceae;g__Staphylococcus;s__aureus',
             'k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;'
             'f__Staphylococcaceae;g__Staphylococcus;s__epidermidis',
             'k__Bacteria;p__Thermi;c__Deinococci;o__Deinococcales;'
             'f__Deinococcaceae;g__Deinococcus;s__'],
            name='Taxon')
        self.false_neg = pd.DataFrame(
            {even1: even_values,
             even2: even_values,
             staggered1: staggered_values,
             staggered2: staggered_values},
            index=false_neg_taxa)

        misclassified_taxa = pd.Index(
            ['k__Bacteria;p__[Thermi];c__Deinococci;o__Deinococcales;'
             'f__Deinococcaceae;g__Deinococcus;s__'],
            name='Taxon')
        self.misclassified = pd.DataFrame(
            {even1: [0.08634],
             even2: [0.0533176566813],
             staggered1: [0.],
             staggered2: [0.]},
            index=misclassified_taxa)

        underclassified_taxa = pd.Index(
            ['k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;'
             'f__Staphylococcaceae;g__Staphylococcus;__'],
            name='Taxon')
        self.underclassified = pd.DataFrame(
            {even1: [0.536876],
             even2: [0.577293],
             staggered1: [0.639295],
             staggered2: [0.666156]},
            index=underclassified_taxa)

        # Each replicate pair is mapped onto the first replicate's ID.
        mock_ids = pd.Series(
            [even1, even1, staggered1, staggered1],
            index=pd.Index([even1, even2, staggered1, staggered2],
                           name='id'),
            name='mock_id')
        self.metadata = qiime2.CategoricalMetadataColumn(mock_ids)
Exemple #8
0
    def setUp(self):
        """Build eleven barcode/forward/reverse FASTQ records and a barcode map.

        Records are 4-tuples of (header, sequence, '+', quality). The first
        record of every list has an all-'Y' quality string and the remaining
        ones all-'P', each as long as its sequence.
        """
        def _records(read_number, seqs):
            # Headers look like '@s<N>/<read> abc/<read>' with N counted
            # from 1.
            records = []
            for position, seq in enumerate(seqs, start=1):
                header = '@s{0}/{1} abc/{1}'.format(position, read_number)
                quality = ('Y' if position == 1 else 'P') * len(seq)
                records.append((header, seq, '+', quality))
            return records

        self.barcodes = _records(
            2, ['AAAA', 'TTAA', 'AACC', 'TTAA', 'AACC', 'AAAA',
                'CGGC', 'GGAA', 'CGGC', 'CGGC', 'GGAA'])

        self.forward = _records(
            1, ['GGG', 'CCC', 'AAA', 'TTT', 'ATA', 'TAT',
                'CGC', 'GCG', 'ACG', 'GCA', 'TGA'])

        # The reverse records carry '/1' headers as well.
        self.reverse = _records(
            1, ['CCC', 'GGG', 'TTT', 'AAA', 'TAT', 'ATA',
                'GCG', 'CGC', 'CGT', 'TGC', 'TCA'])

        self.bpsi = BarcodePairedSequenceFastqIterator(
            self.barcodes, self.forward, self.reverse)

        sample_ids = pd.Index(
            ['sample1', 'sample2', 'sample3', 'sample4', 'sample5'],
            name='id')
        barcode_map = pd.Series(['AAAA', 'AACC', 'TTAA', 'GGAA', 'CGGC'],
                                index=sample_ids, name='bc')
        self.barcode_map = qiime2.CategoricalMetadataColumn(barcode_map)
    def test_empty_only_in_superset(self):
        """A missing value is tolerated for an ID the table does not hold.

        Sample 'd' has no metadata value but is also absent from the table,
        so grouping a-c with 'mean-ceiling' is expected to succeed: 'a' and
        'b' collapse into 'a_new' (2 and 25), 'c' becomes 'b_new' (3, 10).
        """
        # Only the sample axis is exercised here, on the assumption that the
        # implementation does not treat feature metadata any differently.
        labels = pd.Series(
            ['a_new', 'a_new', 'b_new', None],
            index=pd.Index(['a', 'b', 'c', 'd'], name='sampleid'),
            name='foo')
        sample_mc = qiime2.CategoricalMetadataColumn(labels)

        feature_ids = ['x', 'y']
        table = biom.Table(np.array([[1, 2, 3], [30, 20, 10]]),
                           observation_ids=feature_ids,
                           sample_ids=['a', 'b', 'c'])
        expected = biom.Table(np.array([[2, 3], [25, 10]]),
                              observation_ids=feature_ids,
                              sample_ids=['a_new', 'b_new'])

        result = group(table, axis='sample', metadata=sample_mc,
                       mode='mean-ceiling')
        self.assertEqual(expected, result)
    def test_seqs_restrict_metadata(self):
        """Selection is limited to IDs present in the context sequences.

        With 'context-seqs-4.fasta' supplied, every iteration is expected to
        include exactly samples 'B' and 'U' and to return a 'date-md' column
        holding just those two dates.
        """
        context_seqs = DNAFASTAFormat(
            self.get_data_path('context-seqs-4.fasta'), 'r')
        expected_dates = pd.Series(['2019-11-01', '2020-01-17'],
                                   index=pd.Index(['B', 'U'], name='id'),
                                   name='date-md')
        exp_md = qiime2.CategoricalMetadataColumn(expected_dates)

        # Repeated _N_TEST_ITERATIONS times; the assertions must hold on
        # every run.
        for _ in range(self._N_TEST_ITERATIONS):
            sel = sample_longitudinal(self.md2, context_seqs)

            self.assertEqual(sel.inclusion.sum(), 2)
            self.assertTrue(sel.inclusion['B'])
            self.assertTrue(sel.inclusion['U'])
            self.assertEqual(sel.metadata.get_column('date-md'), exp_md)
            self.assertEqual(sel.label, 'sample_longitudinal')
Exemple #11
0
    def test_confusion_matrix_dtype_coercion(self):
        """Integer predictions can be compared against string targets.

        Predictions are ints while the ground truth holds the same labels as
        strings; ``confusion_matrix`` must still write an index.html into
        ``self.tmpd``.
        """
        predictions = pd.Series([1, 1, 1, 2, 2, 2],
                                index=pd.Index(['a', 'b', 'c', 'd', 'e', 'f'],
                                               name='sample_id'),
                                name='features')

        # NOTE: the targets are numbers but represented as str
        truth = qiime2.CategoricalMetadataColumn(
            pd.Series(['1', '2', '1', '2', '1', '2'],
                      index=pd.Index(['a', 'b', 'c', 'd', 'e', 'f'],
                                     name='sample-id'),
                      name='target'))

        confusion_matrix(self.tmpd, predictions, truth)

        # assertIn shows the directory listing when the file is missing,
        # which assertTrue(... in ...) does not.
        self.assertIn('index.html', listdir(self.tmpd))
    def test_ancom_no_volcano_plot(self):
        """ANCOM reports when it cannot draw a volcano plot.

        Every count in the table is identical and the metadata defines three
        groups; the rendered index.html must exist, be non-empty and contain
        the 'Unable to generate volcano plot' message.
        """
        t = pd.DataFrame([[1, 1], [1, 1], [1, 1], [1, 1]],
                         index=['S1', 'S2', 'S3', 'S4'],
                         columns=['O1', 'O2'])
        c = qiime2.CategoricalMetadataColumn(
            pd.Series(['0', '0', '1', '2'],
                      name='n',
                      index=pd.Index(['S1', 'S2', 'S3', 'S4'], name='id')))
        ancom(output_dir=self.temp_dir.name, table=t + 1, metadata=c)

        index_fp = os.path.join(self.temp_dir.name, 'index.html')
        self.assertTrue(os.path.exists(index_fp))
        # Specific assert methods report the offending values on failure.
        self.assertGreater(os.path.getsize(index_fp), 0)
        with open(index_fp) as fh:
            html = fh.read()
        self.assertIn('Unable to generate volcano plot', html)
Exemple #13
0
    def test_missing_sample_ids(self):
        """Grouping fails when table samples have no metadata entry.

        The table holds s1, s2, s4 and s5 but the metadata only covers s1,
        s3, s4 and s6, so s2 and s5 are ungrouped and both must be named in
        the ValueError message.
        """
        labels = pd.Series(
            ['g0', 'g2', 'g0', 'g2'],
            index=pd.Index(['s1', 's3', 's4', 's6'], name='sampleid'),
            name='foo')
        sample_mc = qiime2.CategoricalMetadataColumn(labels)

        counts = np.array([
            [0, 1, 2, 3],
            [10, 11, 12, 13],
            [100, 110, 120, 130],
        ])
        table = biom.Table(counts,
                           observation_ids=['x', 'y', 'z'],
                           sample_ids=['s1', 's2', 's4', 's5'])

        with self.assertRaisesRegex(ValueError, 'not present.*s2.*s5') as e:
            group(table, axis='sample', metadata=sample_mc, mode='sum')

        message = str(e.exception)
        self.assertIn('s2', message)
        self.assertIn('s5', message)
    def test_ancom_no_tables(self):
        """ANCOM reports when no feature is significant.

        The rendered index.html must exist, be non-empty and contain the
        'No significant features found' message.
        """
        t = pd.DataFrame([[2, 1, 2], [2, 2, 2], [2, 2, 2]],
                         index=['S1', 'S2', 'S3'],
                         columns=['O1', 'O2', 'O3'])
        c = qiime2.CategoricalMetadataColumn(
            pd.Series(['0', '0', '1'],
                      name='n',
                      index=pd.Index(['S1', 'S2', 'S3'], name='id')))
        ancom(output_dir=self.temp_dir.name, table=t + 1, metadata=c)

        index_fp = os.path.join(self.temp_dir.name, 'index.html')
        self.assertTrue(os.path.exists(index_fp))
        # Specific assert methods report the offending values on failure.
        self.assertGreater(os.path.getsize(index_fp), 0)
        with open(index_fp) as fh:
            html = fh.read()
        self.assertIn('No significant features found', html)
Exemple #15
0
    def test_numeric_strings(self):
        """Group labels that look like numbers are kept as string IDs.

        All three samples share the label '-4.2', so summing collapses them
        into a single sample with that ID (6 and 60).
        """
        feature_ids = ['x', 'y']
        table = biom.Table(np.array([[1, 2, 3], [30, 20, 10]]),
                           observation_ids=feature_ids,
                           sample_ids=['a', 'b', 'c'])

        labels = pd.Series(['-4.2', '-4.2', '-4.2'],
                           index=pd.Index(['a', 'b', 'c'], name='sampleid'),
                           name='foo')
        sample_mc = qiime2.CategoricalMetadataColumn(labels)

        expected = biom.Table(np.array([[6], [60]]),
                              observation_ids=feature_ids,
                              sample_ids=['-4.2'])

        result = group(table, axis='sample', metadata=sample_mc, mode='sum')
        self.assertEqual(expected, result)
Exemple #16
0
    def test_superset_sample_group(self):
        """Metadata may cover more samples than the table contains.

        The metadata lists s1-s6 while the table only has s1, s2, s4 and s5.
        Summing yields g0 (s1 + s4) and g1 (s2 + s5); g2 has no members in
        the table and is not expected in the result.
        """
        labels = pd.Series(
            ['g0', 'g1', 'g2', 'g0', 'g1', 'g2'],
            index=pd.Index(['s1', 's2', 's3', 's4', 's5', 's6'],
                           name='sampleid'),
            name='foo')
        sample_mc = qiime2.CategoricalMetadataColumn(labels)

        feature_ids = ['x', 'y', 'z']
        counts = np.array([
            [0, 1, 2, 3],
            [10, 11, 12, 13],
            [100, 110, 120, 130],
        ])
        table = biom.Table(counts,
                           observation_ids=feature_ids,
                           sample_ids=['s1', 's2', 's4', 's5'])

        expected = biom.Table(np.array([[2, 4], [22, 24], [220, 240]]),
                              observation_ids=feature_ids,
                              sample_ids=['g0', 'g1'])

        result = group(table, axis='sample', metadata=sample_mc, mode='sum')
        self.assertEqual(expected, result)
    def test_ancom(self):
        """End-to-end ANCOM run: check ancom.tsv and the emitted files.

        Verifies the W statistics and null-hypothesis rejections written to
        ancom.tsv, that index.html, data.tsv and percent-abundances.tsv exist
        and are non-empty, and that index.html contains the expected table
        headers.
        """
        sample_ids = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6']
        feature_ids = ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7']

        # Entered one row per feature, then transposed before use.
        per_feature = pd.DataFrame(
            [[9, 9, 9, 19, 19, 19],
             [10, 11, 10, 20, 20, 20],
             [9, 10, 9, 9, 10, 9],
             [9, 10, 9, 9, 9, 8],
             [9, 10, 9, 9, 9, 9],
             [9, 10, 9, 9, 9, 10],
             [9, 12, 9, 9, 9, 11]],
            index=feature_ids,
            columns=sample_ids)
        t = per_feature.T
        c = qiime2.CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'a', '1', '1', '1'],
                      index=pd.Index(sample_ids, name='id'),
                      name='n'))

        out_dir = self.temp_dir.name
        ancom(output_dir=out_dir, table=t + 1, metadata=c)

        res = pd.read_csv(os.path.join(out_dir, 'ancom.tsv'),
                          sep='\t', index_col=0)
        exp = pd.DataFrame(
            {
                'W': np.array([5, 5, 2, 2, 2, 2, 2]),
                'Reject null hypothesis': np.array(
                    [True, True, False, False, False, False, False],
                    dtype=bool),
            },
            index=feature_ids,
        )
        pdt.assert_frame_equal(res, exp)

        # Every output file must exist and be non-empty.
        for filename in ('index.html', 'data.tsv', 'percent-abundances.tsv'):
            output_fp = os.path.join(out_dir, filename)
            self.assertTrue(os.path.exists(output_fp))
            self.assertTrue(os.path.getsize(output_fp) > 0)

        index_fp = os.path.join(out_dir, 'index.html')
        with open(index_fp, 'r') as fh:
            html = fh.read()
            for header in ('Percentile', 'Group', 'O1'):
                self.assertIn('<th>%s</th>' % header, html)
Exemple #18
0
    def test_rev_comp_mapping_barcodes(self):
        """Demultiplex with reverse-complemented mapping-file barcodes.

        The barcodes supplied here are the reverse complements of AAAA,
        AACC, TTAA, GGAA and CGGC; with ``rev_comp_mapping_barcodes=True``
        the reads in ``self.bsi`` are expected to land in five per-sample
        files, checked together with the manifest.
        """
        sample_ids = pd.Index(
            ['sample1', 'sample2', 'sample3', 'sample4', 'sample5'],
            name='id')
        barcode_series = pd.Series(['TTTT', 'GGTT', 'TTAA', 'TTCC', 'GCCG'],
                                   index=sample_ids, name='bc')
        barcodes = qiime2.CategoricalMetadataColumn(barcode_series)

        actual = emp_single(self.bsi, barcodes, rev_comp_mapping_barcodes=True)
        output_fastq = list(actual.sequences.iter_views(FastqGzFormat))
        # five per-sample files were written
        self.assertEqual(len(output_fastq), 5)

        # Positions in self.sequences expected in the files for sample1
        # through sample5, in that order.
        expected_read_indices = [[0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9]]
        for position, read_indices in enumerate(expected_read_indices):
            self._validate_sample_fastq(output_fastq[position][1].open(),
                                        self.sequences, read_indices)

        # manifest is correct
        act_manifest = list(actual.manifest.view(FastqManifestFormat).open())
        exp_manifest = [
            'sample-id,filename,direction\n',
            'sample1,sample1_1_L001_R1_001.fastq.gz,forward\n',
            'sample3,sample3_2_L001_R1_001.fastq.gz,forward\n',
            'sample2,sample2_3_L001_R1_001.fastq.gz,forward\n',
            'sample5,sample5_4_L001_R1_001.fastq.gz,forward\n',
            'sample4,sample4_5_L001_R1_001.fastq.gz,forward\n',
        ]
        self._compare_manifests(act_manifest, exp_manifest)
Exemple #19
0
    def test_superset_feature_group(self):
        """Metadata may cover more features than the table contains.

        Feature 'g' (group 'extra') exists only in the metadata. Summing the
        table's a-f yields g0 (a + b), g1 (c + e) and g2 (d + f); no 'extra'
        group is expected in the result.
        """
        labels = pd.Series(
            ['g0', 'g0', 'g1', 'g2', 'g1', 'g2', 'extra'],
            index=pd.Index(['a', 'b', 'c', 'd', 'e', 'f', 'g'],
                           name='featureid'),
            name='foo')
        feature_mc = qiime2.CategoricalMetadataColumn(labels)

        sample_ids = ['s1', 's2', 's3']
        counts = np.array([
            [1, 0, 0],
            [1, 10, 10],
            [0, 0, 100],
            [5, 5, 5],
            [0, 1, 100],
            [7, 8, 9],
        ])
        # g is missing on purpose
        table = biom.Table(counts,
                           observation_ids=['a', 'b', 'c', 'd', 'e', 'f'],
                           sample_ids=sample_ids)

        expected = biom.Table(
            np.array([[2, 10, 10], [0, 1, 200], [12, 13, 14]]),
            observation_ids=['g0', 'g1', 'g2'],
            sample_ids=sample_ids)

        result = group(table, axis='feature', metadata=feature_mc, mode='sum')
        self.assertEqual(expected, result)
Exemple #20
0
    def setUp(self):
        """Create a metadata column, a frequency table and a distance matrix.

        All three share sample IDs a-f. The distance matrix is derived from
        six scalar values using their absolute difference as the metric.
        """
        super().setUp()

        group_labels = pd.Series(['a', 'a', 'a', 'b', 'b', 'b'],
                                 index=pd.Index(list('abcdef'), name='id'),
                                 name='foo')
        self.md = qiime2.CategoricalMetadataColumn(group_labels)

        # One row per feature (v..z), one column per sample (a..f).
        counts = np.array([
            [13, 26, 37, 3, 6, 1],
            [33, 24, 23, 5, 6, 2],
            [38, 26, 33, 4, 1, 0],
            [3, 2, 1, 22, 25, 31],
            [2, 1, 3, 44, 46, 42],
        ])
        tab = biom.Table(counts,
                         observation_ids=list('vwxyz'),
                         sample_ids=list('abcdef'))
        self.tab = qiime2.Artifact.import_data('FeatureTable[Frequency]', tab)

        def _abs_difference(x, y):
            return abs(y - x)

        dist = skbio.DistanceMatrix.from_iterable(
            iterable=[1, 16, 2, 1, 16, 17],
            metric=_abs_difference,
            keys=list('abcdef'))
        self.dist = qiime2.Artifact.import_data('DistanceMatrix', dist)
Exemple #21
0
    def test_ancom_zero_division(self):
        """Non-numeric results are reported rather than rendered as Infinity.

        Runs ANCOM with ``transform_function='log'`` and checks that the
        generated index.html never contains 'Infinity' and lists feature O2
        under the 'non-numeric results' notice.
        """
        t = pd.DataFrame([[10, 0], [11, 0], [12, 0], [13, 0],
                          [1000, 10], [1000, 10]],
                         index=['S1', 'S2', 'S3', 'S4', 'S5', 'S6'],
                         columns=['O1', 'O2'])
        c = qiime2.CategoricalMetadataColumn(
            pd.Series(['0', '0', '1', '1', '2', '2'], name='n',
                      index=pd.Index(['S1', 'S2', 'S3', 'S4', 'S5', 'S6'],
                                     name='id'))
        )

        ancom(output_dir=self.temp_dir.name, table=t+1, metadata=c,
              transform_function='log')

        with open(os.path.join(self.temp_dir.name, 'index.html')) as fh:
            html = fh.read()
        # assertNotIn/assertIn report the searched text on failure, unlike
        # assertFalse/assertTrue(... in ...).
        self.assertNotIn('Infinity', html)
        self.assertIn(
            'non-numeric results:\n    <strong>O2</strong>', html)
Exemple #22
0
    def test_subsample_higher_than_seqs_count(self):
        """Requesting more subsampled reads than exist yields a warning.

        A single read is demultiplexed and ``summarize`` is asked for
        ``n=50``; quality-plot.html must then contain a warning.
        """
        barcodes = self.barcodes[:1]

        sequences = self.sequences[:1]
        bsi = BarcodeSequenceFastqIterator(barcodes, sequences)

        barcode_map = pd.Series(['AAAA'],
                                name='bc',
                                index=pd.Index(['sample1'], name='id'))
        barcode_map = qiime2.CategoricalMetadataColumn(barcode_map)

        demux_data = emp_single(bsi, barcode_map)
        with tempfile.TemporaryDirectory() as output_dir:
            result = summarize(output_dir,
                               _PlotQualView(demux_data, paired=False),
                               n=50)
            # assertIsNone shows the unexpected return value on failure.
            self.assertIsNone(result)
            plot_fp = os.path.join(output_dir, 'quality-plot.html')
            with open(plot_fp, 'r') as fh:
                html = fh.read()
            self.assertIn('<strong>Warning:</strong>', html)
Exemple #23
0
    def test_basic(self):
        """Summarize single-end demultiplexed data and check every output.

        Verifies that ``summarize`` returns None, that each expected output
        file exists and is non-empty, that overview.html reports the minimum
        and maximum counts, and that the per-sample CSV lists 'sample_1'.
        """
        bsi = BarcodeSequenceFastqIterator(self.barcodes, self.sequences)

        barcode_map = pd.Series(['AAAA', 'AACC'],
                                name='bc',
                                index=pd.Index(['sample_1', 'sample2'],
                                               name='id'))
        barcode_map = qiime2.CategoricalMetadataColumn(barcode_map)

        demux_data = emp_single(bsi, barcode_map)
        with tempfile.TemporaryDirectory() as output_dir:
            # TODO: Remove _PlotQualView wrapper
            result = summarize(output_dir,
                               _PlotQualView(demux_data, paired=False),
                               n=2)
            # assertIsNone shows the unexpected return value on failure.
            self.assertIsNone(result)

            # Every output file must have been created with size > 0.
            expected_outputs = (
                'overview.html',
                'per-sample-fastq-counts.csv',
                'demultiplex-summary.pdf',
                'demultiplex-summary.png',
                'forward-seven-number-summaries.csv',
            )
            for filename in expected_outputs:
                output_fp = os.path.join(output_dir, filename)
                self.assertTrue(os.path.exists(output_fp),
                                '%s was not created' % filename)
                self.assertGreater(os.path.getsize(output_fp), 0)

            index_fp = os.path.join(output_dir, 'overview.html')
            with open(index_fp, 'r') as fh:
                html = fh.read()
            self.assertIn('<td>Minimum:</td><td>1</td>', html)
            self.assertIn('<td>Maximum:</td><td>3</td>', html)

            csv_fp = os.path.join(output_dir, 'per-sample-fastq-counts.csv')
            with open(csv_fp, 'r') as ch:
                csv = ch.read()
            self.assertIn('sample_1', csv)
Exemple #24
0
    def test_chain_with_metadata(self):
        """Metadata passed to chained actions is recorded in provenance.

        An artifact is run through an action taking Metadata and then one
        taking a metadata column. The final artifact's provenance must hold
        the full metadata under the intermediate artifact's action directory
        and the single column under its own action directory.
        """
        feature_ids = pd.Index(['0', '1', '2'], name='feature ID')
        df = pd.DataFrame({'a': ['1', '2', '3']}, index=feature_ids)

        a = qiime2.Artifact.import_data('IntSequence1', [1, 2, 3])
        m = qiime2.Metadata(df)
        mc = qiime2.CategoricalMetadataColumn(df['a'])

        actions = dummy_plugin.actions
        b = actions.identity_with_metadata(a, m).out
        c = actions.identity_with_metadata_column(b, mc).out

        p_dir = c._archiver.provenance_dir

        # Metadata given to the first action lives under the intermediate
        # artifact's UUID.
        parent_md_fp = (p_dir / 'artifacts' / str(b.uuid) / 'action' /
                        'metadata.tsv')
        new_m = qiime2.Metadata.load(str(parent_md_fp))
        pdt.assert_frame_equal(m.to_dataframe(), new_m.to_dataframe())

        expected_tsv = (
            'feature ID\ta\n#q2:types\tcategorical\n0\t1\n1\t2\n2\t3\n')
        with (p_dir / 'action' / 'metadata.tsv').open() as fh:
            self.assertEqual(fh.read(), expected_tsv)
    def setUp(self):
        """Create a metadata column, a frequency table and a wider Metadata.

        ``self.md`` groups samples a-e; ``self.tab`` is a 5x5
        FeatureTable[Frequency] artifact over the same samples; ``self.md2``
        adds several columns and one extra ID ('peanut').
        """
        super().setUp()

        sample_ids = ['a', 'b', 'c', 'd', 'e']
        md = pd.Series(['a', 'a', 'b', 'b', 'b'],
                       index=pd.Index(sample_ids, name='SampleID'),
                       name='bugs')
        self.md = qiime2.CategoricalMetadataColumn(md)

        # One row per feature (v..z), one column per sample (a..e).
        counts = np.array([
            [3, 6, 7, 3, 6],
            [3, 4, 5, 6, 2],
            [8, 6, 4, 1, 0],
            [8, 6, 4, 1, 0],
            [8, 6, 4, 1, 0],
        ])
        tab = biom.Table(counts,
                         observation_ids=['v', 'w', 'x', 'y', 'z'],
                         sample_ids=sample_ids)
        self.tab = qiime2.Artifact.import_data('FeatureTable[Frequency]', tab)

        md2_ids = pd.Index(['a', 'b', 'c', 'd', 'e', 'peanut'],
                           name='SampleID')
        md2 = pd.DataFrame(
            {
                'trash': ['a', 'a', 'b', 'b', 'b', 'junk'],
                'floats': [0.1, 0.1, 1.3, 1.8, 1000.1, 0.1],
                'ints': [0, 1, 2, 2, 2, 0],
                'nans': [1, 1, 2, 2, np.nan, np.nan],
                'negatives': [-7, -3, -1.2, -4, -9, -1],
            },
            index=md2_ids)
        self.md2 = qiime2.Metadata(md2)
Exemple #26
0
    def test_paired_end(self):
        """Summarize paired-end demultiplexed data.

        Verifies that ``summarize`` returns None, that forward and reverse
        seven-number-summary CSVs exist and are non-empty, and that
        quality-plot.html contains headings for both read directions.
        """
        barcodes = self.barcodes[:3]

        forward = self.sequences[:3]

        reverse = [('@s1/1 abc/1', 'CCC', '+', 'YYY'),
                   ('@s2/1 abc/1', 'GGG', '+', 'PPP'),
                   ('@s3/1 abc/1', 'TTT', '+', 'PPP')]

        bpsi = BarcodePairedSequenceFastqIterator(barcodes, forward, reverse)

        barcode_map = pd.Series(['AAAA', 'AACC', 'TTAA'],
                                name='bc',
                                index=pd.Index(
                                    ['sample1', 'sample2', 'sample3'],
                                    name='id'))
        barcode_map = qiime2.CategoricalMetadataColumn(barcode_map)

        demux_data = emp_paired(bpsi, barcode_map)
        with tempfile.TemporaryDirectory() as output_dir:
            result = summarize(output_dir,
                               _PlotQualView(demux_data, paired=True),
                               n=2)
            # assertIsNone shows the unexpected return value on failure.
            self.assertIsNone(result)

            for direction in ('forward', 'reverse'):
                summary_fp = os.path.join(
                    output_dir, '%s-seven-number-summaries.csv' % direction)
                self.assertTrue(os.path.exists(summary_fp),
                                '%s summary was not created' % direction)
                self.assertGreater(os.path.getsize(summary_fp), 0)

            plot_fp = os.path.join(output_dir, 'quality-plot.html')
            with open(plot_fp, 'r') as fh:
                html = fh.read()
            self.assertIn('<h5 class="text-center">Forward Reads</h5>',
                          html)
            self.assertIn('<h5 class="text-center">Reverse Reads</h5>',
                          html)
    def test_ancom_integer_indices(self):
        """Metadata must be joined to the table by label, not by position.

        Sample IDs are digit strings and the metadata lists them in reverse
        order relative to the table. A positional join would map the groups
        the wrong way round and leave no significant feature in the HTML
        table, so the test checks that O7 is rendered.
        """
        feature_ids = ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7']
        # Entered one row per feature, then transposed before use.
        per_feature = pd.DataFrame(
            [[9, 9, 9, 19, 19, 19],
             [10, 11, 10, 20, 20, 20],
             [9, 10, 9, 9, 10, 9],
             [9, 10, 9, 9, 9, 8],
             [9, 10, 9, 9, 9, 9],
             [9, 10, 9, 9, 9, 10],
             [9, 12, 9, 9, 9, 11]],
            index=feature_ids,
            columns=['1', '2', '3', '4', '5', '6'])
        t = per_feature.T

        # IDs deliberately run 6..1, the reverse of the table's order.
        reversed_ids = pd.Index(['6', '5', '4', '3', '2', '1'], name='id')
        c = qiime2.CategoricalMetadataColumn(
            pd.Series(['1', '0', '0', '0', '1', '0'],
                      index=reversed_ids, name='n'))

        out_dir = self.temp_dir.name
        ancom(output_dir=out_dir, table=t + 1, metadata=c)

        with open(os.path.join(out_dir, 'index.html'), 'r') as fh:
            html = fh.read()
            self.assertIn('<th>O7</th>', html)
Exemple #28
0
    def test_anosim_pairwise(self):
        """Run beta_group_significance with ANOSIM and pairwise=True.

        Checks that index.html is written, that exactly one PDF and one PNG
        boxplot exist per group ('a' and 'b'), and that the rendered page
        reports the ANOSIM results, the permutation count and the pairwise
        section without any warning.
        """
        dm = skbio.DistanceMatrix(
            [[0.00, 0.25, 0.25], [0.25, 0.00, 0.00], [0.25, 0.00, 0.00]],
            ids=['sample1', 'sample2', 'sample3'])
        md = qiime2.CategoricalMetadataColumn(
            pd.Series(['a', 'b', 'b'],
                      name='a or b',
                      index=pd.Index(['sample1', 'sample2', 'sample3'],
                                     name='id')))

        with tempfile.TemporaryDirectory() as output_dir:
            beta_group_significance(output_dir,
                                    dm,
                                    md,
                                    method='anosim',
                                    permutations=42,
                                    pairwise=True)
            index_fp = os.path.join(output_dir, 'index.html')
            self.assertTrue(os.path.exists(index_fp))
            # all expected boxplots are generated
            for group in ('a', 'b'):
                for ext in ('pdf', 'png'):
                    boxplot_fp = os.path.join(
                        output_dir, '%s-boxplots.%s' % (group, ext))
                    self.assertTrue(os.path.exists(boxplot_fp))
            # no extra boxplots are generated
            self.assertEqual(len(glob.glob('%s/*-boxplots.pdf' % output_dir)),
                             2)
            self.assertEqual(len(glob.glob('%s/*-boxplots.png' % output_dir)),
                             2)
            # Read the report once and close the handle before the temporary
            # directory is removed; repeated bare open() calls leak handles.
            with open(index_fp) as fh:
                html = fh.read()
            self.assertIn('ANOSIM results', html)
            self.assertIn('<td>42</td>', html)
            self.assertNotIn('Warning', html)
            self.assertIn('Pairwise anosim', html)
Exemple #29
0
    def test_inconsistent_sequence_length_paired(self):
        """Summarize paired reads of unequal length for several values of n.

        Forward and reverse reads have lengths 1, 3, 5 and 7. For each n in
        1..5 the JSONP payload written by summarize must report four
        sequences, a minimum sequence length (per direction) drawn from the
        allowed set for that n, and an n capped at 4.
        """
        forward = [('@s1/1 abc/1', 'G', '+', 'Y'),
                   ('@s2/1 abc/1', 'CCC', '+', 'PPP'),
                   ('@s3/1 abc/1', 'AAAAA', '+', 'PPPPP'),
                   ('@s4/1 abc/1', 'TTTTTTT', '+', 'PPPPPPP')]
        reverse = [('@s1/1 abc/1', 'AAAAAAA', '+', 'YYYYYYY'),
                   ('@s2/1 abc/1', 'TTTTT', '+', 'PPPPP'),
                   ('@s3/1 abc/1', 'GGG', '+', 'PPP'),
                   ('@s4/1 abc/1', 'C', '+', 'P')]
        read_iter = BarcodePairedSequenceFastqIterator(
            self.barcodes, forward, reverse)

        sample_barcodes = pd.Series(
            ['AAAA', 'AACC'],
            name='bc',
            index=pd.Index(['sample1', 'sample2'], name='id'))
        barcode_column = qiime2.CategoricalMetadataColumn(sample_barcodes)

        demux_data = emp_paired(read_iter, barcode_column)
        all_lengths = [1, 3, 5, 7]
        for n in range(1, 6):
            # Allowed minimum lengths shrink as n grows; from n=4 on only the
            # shortest length (1) is accepted.
            if n < 4:
                allowed_min_lengths = all_lengths[0:5 - n]
            else:
                allowed_min_lengths = [1]

            with tempfile.TemporaryDirectory() as output_dir:
                # TODO: Remove _PlotQualView wrapper
                summarize(output_dir,
                          _PlotQualView(demux_data, paired=True),
                          n=n)
                with open(os.path.join(output_dir, 'data.jsonp'), 'r') as fh:
                    jsonp = fh.read()

                # Turn the `app.init(...);` call into a JSON array so it can
                # be parsed; the first element is the payload.
                as_json = jsonp.replace('app.init(', '[').replace(');', ']')
                payload = json.loads(as_json)[0]

                self.assertEqual(payload["totalSeqCount"], 4)
                min_seq_len = payload["minSeqLen"]
                self.assertIn(min_seq_len["forward"], allowed_min_lengths)
                self.assertIn(min_seq_len["reverse"], allowed_min_lengths)
                self.assertEqual(payload["n"], min(n, 4))
Exemple #30
0
 def test_confusion_matrix_vmax_too_low(self):
     """confusion_matrix raises ValueError for vmax=.5 with vmin=None.

     The error message must match the 'vmax must be greater than ...'
     pattern, mentioning 0.5 and 1.0.
     """
     expected_error = (r'vmax must be greater than.*'
                       r'\s\s0\.5.*less.*1\.0')
     as_column = qiime2.CategoricalMetadataColumn(self.a)
     with self.assertRaisesRegex(ValueError, expected_error):
         confusion_matrix(self.tmpd, self.a, as_column, vmin=None, vmax=.5)