Beispiel #1
0
    def setUp(self):
        """Build two PCoA ordinations over samples A-D plus sample metadata.

        ``self.other`` reuses copies of the eigenvalues and explained
        proportions of ``self.pcoa``; its coordinates are those of
        ``self.pcoa`` shifted by 1.01.
        """
        axis_labels = ['PC1', 'PC2', 'PC3']
        sample_ids = ['A', 'B', 'C', 'D']

        eigvals = pd.Series(np.array([0.50, 0.25, 0.25]), index=axis_labels)
        proportion_explained = pd.Series([15.5, 12.2, 8.8], index=axis_labels)
        samples = np.array([[0.1, 0.2, 0.3],
                            [0.2, 0.3, 0.4],
                            [0.3, 0.4, 0.5],
                            [0.4, 0.5, 0.6]])

        # Base ordination.
        base_coords = pd.DataFrame(samples,
                                   index=sample_ids,
                                   columns=axis_labels)
        self.pcoa = skbio.OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            eigvals,
            base_coords,
            proportion_explained=proportion_explained)

        # Second ordination: same samples/axes, coordinates offset by 1.01.
        shifted_coords = pd.DataFrame(samples + 1.01,
                                      index=sample_ids,
                                      columns=axis_labels)
        self.other = skbio.OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            eigvals.copy(),
            shifted_coords,
            proportion_explained=proportion_explained.copy())

        # Two string-valued metadata columns, indexed by sample ID.
        metadata_df = pd.DataFrame(
            {
                'val1': ['1.0', '2.0', '3.0', '4.0'],
                'val2': ['3.3', '3.5', '3.6', '3.9']
            },
            index=pd.Index(sample_ids, name='id'))
        self.metadata = qiime2.Metadata(metadata_df)
    def setUp(self):
        """Create a reference/other ordination pair and their expected results.

        ``self.reference`` and ``self.other`` span six axes; the two
        ``self.expected_*`` ordinations span only the first five.
        """
        axes = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
        sample_ids = ['A', 'B', 'C', 'D']
        eigvals = pd.Series(np.array([1.5, 0.75, 0.3, 0.15, 0.15, 0.15]),
                            index=axes)
        proportion_explained = pd.Series([0.50, 0.25, 0.10, 0.05, 0.05, 0.05],
                                         index=axes)

        def copied_ordination(coordinates):
            # Build an ordination from copies of the shared series, truncated
            # to as many axes as the coordinate rows have columns.
            n_axes = len(coordinates[0])
            frame = pd.DataFrame(np.array(coordinates),
                                 index=sample_ids,
                                 columns=axes[:n_axes])
            return skbio.OrdinationResults(
                'PCoA',
                'Principal Coordinate Analysis',
                eigvals[:n_axes].copy(),
                frame.copy(),
                proportion_explained=proportion_explained[:n_axes].copy())

        # The reference ordination is handed the shared objects themselves
        # (no copies), with integer coordinates.
        reference_coords = np.array([[0, 3, 4, 4, 0, 0], [1, 2, 1, 4, 3, 3],
                                     [2, 3, 1, 0, 0, 1], [0, 3, 2, 4, 3, 0]])
        self.reference = skbio.OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            eigvals,
            pd.DataFrame(reference_coords, index=sample_ids, columns=axes),
            proportion_explained=proportion_explained)

        self.other = copied_ordination(
            [[0.7, 3.7, 4.7, 4.7, 0.7, 0.7],
             [1.7, 2.7, 1.7, 4.7, 3.7, 3.7],
             [2.7, 3.7, 1.7, 0.7, 0.7, 1.7],
             [30, 3.7, 2.7, 4.7, 3.7, 0.7]])

        self.expected_ref = copied_ordination(
            [[-0.1358036, 0.0452679, 0.3621430, 0.1810715, -0.2716072],
             [0.0452679, -0.1358036, -0.1810715, 0.1810715, 0.2716072],
             [0.2263394, 0.0452679, -0.1810715, -0.5432145, -0.2716072],
             [-0.1358036, 0.0452679, 0.0000000, 0.1810715, 0.2716072]])

        self.expected_other = copied_ordination(
            [[0.0482731, -0.0324317, 0.0494312, -0.0316828, -0.1584374],
             [0.0803620, -0.0718115, -0.0112234, -0.0171011, -0.1101209],
             [0.0527554, -0.0042753, -0.0126739, -0.0969602, -0.0964822],
             [-0.1813905, 0.1085184, -0.0255339, 0.1457440, 0.3650405]])
Beispiel #3
0
def rpca(
        table: biom.Table,
        rank: int = 3,
        min_sample_count: int = 500,
        min_feature_count: int = 10,
        iterations: int = 5
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Run RPCA: filter the table, apply rclr, then fit OptSpace.

    Parameters
    ----------
    table : biom.Table
        Count table to decompose.
    rank : int
        Rank handed to ``OptSpace``; also the number of PC columns named.
    min_sample_count : int
        Samples whose total count is not greater than this are dropped.
    min_feature_count : int
        Features whose total count is not greater than this are dropped.
    iterations : int
        Passed to ``OptSpace`` as ``iteration``.

    Returns
    -------
    (skbio.OrdinationResults, skbio.DistanceMatrix)
        The biplot ordination and the distance matrix built from
        ``opt.distance``, labelled with the retained sample IDs.
    """
    # Keep only samples whose total count exceeds the minimum depth.
    table = table.filter(
        lambda val, id_, md: sum(val) > min_sample_count, axis='sample')

    # Samples become rows; exact duplicate rows are collapsed.
    counts = table.to_dataframe().T.drop_duplicates()
    # Keep only features whose summed count exceeds the minimum.
    keep_feature = counts.sum() > min_feature_count
    counts = counts.T[keep_feature].T

    # rclr preprocessing followed by OptSpace (RPCA)
    transformed = rclr().fit_transform(counts.copy())
    opt = OptSpace(rank=rank, iteration=iterations).fit(transformed)

    # Positional column i is relabelled 'PC<i+1>'.
    pc_names = ['PC' + str(i) for i in range(1, rank + 1)]
    rename_cols = dict(enumerate(pc_names))

    # Sample loadings
    sample_loading = pd.DataFrame(
        opt.sample_weights, index=counts.index).rename(columns=rename_cols)

    # Feature loadings, ordered by ascending PC1
    feature_loading = pd.DataFrame(
        opt.feature_weights, index=counts.columns).rename(columns=rename_cols)
    feature_loading = feature_loading.sort_values('PC1', ascending=True)

    # eigenvalues and proportion of variance explained
    eigvals = pd.Series(opt.eigenvalues, index=pc_names)
    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=pc_names)

    # With rank two, pad everything with an all-zero PC3.
    if rank == 2:
        feature_loading['PC3'] = [0] * len(feature_loading)
        sample_loading['PC3'] = [0] * len(sample_loading)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # Package the ordination and the distance matrix.
    ord_res = skbio.OrdinationResults(
        'rpca_biplot',
        '(Robust Aitchison) RPCA Biplot',
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
Beispiel #4
0
 def test_remove_empty_nothing_to_remove_with_ordination(self, mock_stdout):
     # Ordination built without sample "Sample4" and without feature "e".
     kept_samples = self.samples_df.drop(labels="Sample4", axis="index")
     kept_features = self.features_df.drop(labels="e", axis="index")
     good_pcoa = skbio.OrdinationResults(
         'PCoA',
         'Principal Coordinate Analysis',
         self.eigvals,
         kept_samples,
         features=kept_features,
         proportion_explained=self.proportion_explained)

     result = remove_empty_samples_and_features(self.table_ef, self.sm_ef,
                                                good_pcoa)
     ft, fsm = result

     # Both outputs must equal the inputs, and nothing may have been
     # written to mock_stdout.
     self.assertEqual(ft, self.table_ef)
     assert_frame_equal(fsm, self.sm_ef)
     self.assertEqual(mock_stdout.getvalue(), "")
Beispiel #5
0
 def test_remove_empty_with_empty_feature_in_ordination(self):
     # Sample4 is dropped from the ordination, but the feature loadings are
     # passed through unfiltered, so a ValueError naming feature "e" is
     # expected.
     samples_without_s4 = self.samples_df.drop(labels="Sample4",
                                               axis="index")
     bad_feature_pcoa = skbio.OrdinationResults(
         'PCoA',
         'Principal Coordinate Analysis',
         self.eigvals,
         samples_without_s4,
         features=self.features_df,
         proportion_explained=self.proportion_explained)

     expected_message = (
         r"The ordination contains features that are empty \(i.e. all "
         r"0s\) in the table. Problematic feature IDs: e")
     with self.assertRaisesRegex(ValueError, expected_message):
         remove_empty_samples_and_features(self.table, self.sm,
                                           bad_feature_pcoa)
Beispiel #6
0
 def setUp(self):
     """Build a three-axis PCoA ordination and matching sample metadata.

     Sets ``self.pcoa`` (samples A-D on PC1-PC3) and ``self.metadata``
     (a single string-valued column ``val1`` for the same four samples).
     """
     axes = ['PC1', 'PC2', 'PC3']
     sample_ids = ['A', 'B', 'C', 'D']

     # OrdinationResults documents eigvals as a pd.Series, so label the
     # eigenvalues with their axes instead of passing a bare ndarray.
     eigvals = pd.Series(np.array([0.50, 0.25, 0.25]), index=axes)
     samples = np.array([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5],
                         [0.4, 0.5, 0.6]])
     proportion_explained = pd.Series([15.5, 12.2, 8.8], index=axes)
     samples_df = pd.DataFrame(samples, index=sample_ids, columns=axes)
     self.pcoa = skbio.OrdinationResults(
         'PCoA',
         'Principal Coordinate Analysis',
         eigvals,
         samples_df,
         proportion_explained=proportion_explained)

     # qiime2.Metadata validates the name of the DataFrame index as an ID
     # header, so the index is given the name 'id' rather than left unnamed.
     self.metadata = qiime2.Metadata(
         pd.DataFrame({'val1': ['1.0', '2.0', '3.0', '4.0']},
                      index=pd.Index(sample_ids, name='id')))
Beispiel #7
0
 def test_remove_empty_with_empty_sample_and_feature_in_ordination(self):
     # The ordination here carries both an empty sample and an empty
     # feature. As the code is currently structured, the empty-sample error
     # wins over the empty-feature error; that was chosen partly because
     # empty samples are expected to be the more common case. Either way
     # this is probably a rare edge case.
     extremely_funky_pcoa = skbio.OrdinationResults(
         'PCoA',
         'Principal Coordinate Analysis',
         self.eigvals,
         self.samples_df,
         features=self.features_df,
         proportion_explained=self.proportion_explained)

     expected_message = (
         r"The ordination contains samples that are empty \(i.e. all "
         r"0s\) in the table. Problematic sample IDs: Sample4")
     with self.assertRaisesRegex(ValueError, expected_message):
         remove_empty_samples_and_features(self.table, self.sm,
                                           extremely_funky_pcoa)
Beispiel #8
0
    def setUp(self):
        """Create table, metadata and ordination fixtures.

        Attributes ending in ``_ef`` are the table and sample metadata as
        they look with the empty sample and empty feature removed.
        """
        sample_ids = ['Sample1', 'Sample2', 'Sample3', 'Sample4']
        pc_axes = ["PC1", "PC2", "PC3"]

        # Feature "e" and sample "Sample4" contain only zeros here.
        raw_counts = np.array([[1, 2, 0, 4], [8, 7, 0, 5], [1, 0, 0, 0],
                               [0, 0, 0, 0]]).T
        self.table = biom.Table(raw_counts, list('abed'), sample_ids)
        # After filtering out empty samples/features:
        filtered_counts = np.array([[1, 2, 4], [8, 7, 5], [1, 0, 0]]).T
        self.table_ef = biom.Table(filtered_counts, ['a', 'b', 'd'],
                                   sample_ids[:3])

        sample_md = {
            "Metadata1": [0, 0, 0, 1],
            "Metadata2": [0, 0, 0, 0],
            "Metadata3": [1, 2, 3, 4],
            "Metadata4": ["abc", "def", "ghi", "jkl"]
        }
        self.sm = pd.DataFrame(sample_md, index=list(self.table.ids()))
        # After filtering out empty samples/features.
        # ("Empty" is judged from the table only: a sample whose metadata
        # values are all 0, or a metadata field that is 0 for every sample,
        # does not count as empty.)
        sample_md_ef = {
            "Metadata1": [0, 0, 0],
            "Metadata2": [0, 0, 0],
            "Metadata3": [1, 2, 3],
            "Metadata4": ["abc", "def", "ghi"]
        }
        self.sm_ef = pd.DataFrame(sample_md_ef,
                                  index=self.table_ef.ids().copy())
        self.sid2idx = {"Sample1": 0, "Sample2": 1, "Sample3": 2}

        # Feature metadata for "e" and "a": identical down to genus level.
        self.tm = pd.DataFrame(
            {
                "Level 1": ["k__Bacteria"] * 2,
                "Level 2": ["p__Bacteroidetes"] * 2,
                "Level 3": ["c__Bacteroidia"] * 2,
                "Level 4": ["o__Bacteroidales"] * 2,
                "Level 5": ["f__Bacteroidaceae"] * 2,
                "Level 6": ["g__Bacteroides"] * 2,
                "Level 7": ["s__", "s__uniformis"],
                "Confidence": [0.95, 0]
            },
            index=["e", "a"])
        # Feature metadata for "h" and "m".
        self.im = pd.DataFrame(
            {
                "Level 1": ["k__Bacteria", "k__Archaea"],
                "Level 2": ["p__Proteobacteria", "Unspecified"],
                "Level 3": ["c__Gammaproteobacteria", "Unspecified"],
                "Level 4": ["o__Pasteurellales", "Unspecified"],
                "Level 5": ["f__Pasteurellaceae", "Unspecified"],
                "Level 6": ["g__", "Unspecified"],
                "Level 7": ["s__", "Unspecified"],
                "Confidence": [0.8, 1]
            },
            index=["h", "m"])
        self.exp_fm_cols = (["Level %d" % level for level in range(1, 8)] +
                            ["Confidence"])

        bacteroides_lineage = [
            "k__Bacteria", "p__Bacteroidetes", "c__Bacteroidia",
            "o__Bacteroidales", "f__Bacteroidaceae", "g__Bacteroides"
        ]
        self.exp_ctm = {
            "e": bacteroides_lineage + ["s__", "0.95"],
            # "a" ends in "0.0" rather than "0" because Pandas treats the 0
            # as numeric: Confidence was a numeric column in the DataFrame.
            # This could in principle be prevented, but it is unlikely to
            # matter to anyone, and it also depends on whichever tool read
            # the metadata in the first place (reading everything with
            # dtype=str would avoid it). So, more of a QIIME 2 problem.
            "a": bacteroides_lineage + ["s__uniformis", "0.0"]
        }
        self.exp_cim = {
            "h": [
                "k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria",
                "o__Pasteurellales", "f__Pasteurellaceae", "g__", "s__", "0.8"
            ],
            "m": ["k__Archaea"] + ["Unspecified"] * 6 + ["1.0"]
        }

        # Ordination info (for testing inputs to remove_empty...())
        self.eigvals = pd.Series(np.array([0.50, 0.25, 0.25]), index=pc_axes)
        self.proportion_explained = pd.Series([15.5, 12.2, 8.8],
                                              index=pc_axes)
        sample_coords = np.array([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4],
                                  [0.3, 0.4, 0.5], [0.4, 0.5, 0.6]])
        self.samples_df = pd.DataFrame(sample_coords,
                                       index=sample_ids,
                                       columns=pc_axes)
        feature_coords = np.array([[0.9, 0.8, 0.7], [0.6, 0.5, 0.4],
                                   [0.3, 0.2, 0.1], [0.0, 0.2, 0.4]])
        self.features_df = pd.DataFrame(feature_coords,
                                        index=["a", "b", "e", "d"],
                                        columns=pc_axes)
        # self.pcoa is problematic by default, because it contains Sample4
        self.pcoa = skbio.OrdinationResults(
            "PCoA",
            "Principal Coordinate Analysis",
            self.eigvals,
            self.samples_df,
            proportion_explained=self.proportion_explained)
Beispiel #9
0
    def setUp(self):
        """Create tree, table, metadata and ordination/biplot fixtures."""
        sample_ids = ['Sample1', 'Sample2', 'Sample3', 'Sample4']
        pc_axes = ['PC1', 'PC2', 'PC3']

        self.tree = parse_newick('(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2):1;')
        pruned_newick = StringIO(
            '(((a:1)EmpressNode0:1,b:2)g:1,(d:3)h:2)EmpressNode1:1;')
        self.pruned_tree = TreeNode.read(pruned_newick)

        # Test table/metadata (mostly) adapted from Qurro:
        counts = np.array([[1, 2, 0, 4], [8, 7, 0, 5], [1, 0, 0, 0],
                           [0, 0, 1, 0]]).T
        self.table = biom.Table(counts, list('abed'), sample_ids)

        # Same sample IDs, but feature IDs h, i, j, k.
        unrelated_counts = np.array([[5, 2, 0, 2], [2, 3, 0, 1], [5, 2, 0, 0],
                                     [4, 5, 0, 4]]).T
        self.unrelated_table = biom.Table(unrelated_counts, list("hijk"),
                                          sample_ids)

        sample_md = {
            "Metadata1": [0, 0, 0, 1],
            "Metadata2": [0, 0, 0, 0],
            "Metadata3": [1, 2, 3, 4],
            "Metadata4": ["abc", "def", "ghi", "jkl"]
        }
        self.sample_metadata = pd.DataFrame(sample_md,
                                            index=list(self.table.ids()))
        feature_md = {
            "fmdcol1": ["asdf", "ghjk"],
            "fmdcol2": ["qwer", "tyui"]
        }
        self.feature_metadata = pd.DataFrame(feature_md, index=["a", "h"])

        # Three-sample, three-feature counterparts of the table and metadata.
        filtered_counts = np.array([[1, 2, 4], [8, 7, 5], [1, 0, 0]]).T
        self.filtered_table = biom.Table(filtered_counts, ['a', 'b', 'd'],
                                         sample_ids[:3])
        filtered_md = {
            "Metadata1": [0, 0, 0],
            "Metadata2": [0, 0, 0],
            "Metadata3": [1, 2, 3],
            "Metadata4": ["abc", "def", "ghi"]
        }
        self.filtered_sample_metadata = pd.DataFrame(filtered_md,
                                                     index=sample_ids[:3])

        # Ordination pieces shared by the three OrdinationResults below.
        eigvals = pd.Series(np.array([0.50, 0.25, 0.25]), index=pc_axes)
        proportion_explained = pd.Series([15.5, 12.2, 8.8], index=pc_axes)
        samples = np.array([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5],
                            [0.4, 0.5, 0.6]])
        samples_df = pd.DataFrame(samples, index=sample_ids, columns=pc_axes)
        self.pcoa = skbio.OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            eigvals,
            samples_df,
            proportion_explained=proportion_explained)

        # Biplot whose feature IDs are the first two sample IDs prefixed
        # with "f.".
        prefixed_features = np.abs(samples_df.copy() / 2.0).iloc[:2, :]
        prefixed_features.index = 'f.' + prefixed_features.index
        self.biplot_no_matches = skbio.OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            eigvals,
            samples_df,
            features=prefixed_features,
            proportion_explained=proportion_explained)

        # Biplot whose feature IDs are "a" and "h".
        named_features = np.abs(samples_df / 2.0).iloc[:2, :]
        named_features.index = pd.Index(['a', 'h'])
        self.biplot = skbio.OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            eigvals,
            samples_df,
            features=named_features,
            proportion_explained=proportion_explained)
        self.biplot_tree = parse_newick(
            '(((y:1,z:2):1,b:2)g:1,(:1,d:3)h:2):1;')
        biplot_counts = np.array([[1, 2], [8, 7], [1, 0], [0, 3]]).T
        self.biplot_table = biom.Table(biplot_counts, ['y', 'z'], sample_ids)

        self.files_to_remove = []
        self.maxDiff = None
Beispiel #10
0
def rpca(
    table: biom.Table,
    n_components: int = DEFAULT_RANK,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations: int = DEFAULT_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Run RPCA: rclr preprocessing followed by matrix completion.

    Shared by the standalone and the QIIME 2 versions of DEICODE.

    Parameters
    ----------
    table : biom.Table
        Count table to decompose.
    n_components : int
        Number of components kept from the decomposition.
    min_sample_count : int
        Samples whose total count is not greater than this are dropped.
    min_feature_count : int
        Features whose total count is not greater than this are dropped.
    max_iterations : int
        Forwarded to ``MatrixCompletion``.

    Returns
    -------
    (skbio.OrdinationResults, skbio.DistanceMatrix)
        The biplot ordination and the distance matrix built from
        ``opt.distance``.

    Raises
    ------
    ValueError
        If the filtered table has duplicate row or column labels.
    """
    # Drop low-count features first, then shallow samples.
    table = table.filter(
        lambda val, id_, md: sum(val) > min_feature_count,
        axis='observation')
    table = table.filter(
        lambda val, id_, md: sum(val) > min_sample_count, axis='sample')

    # Transposed so that samples are the rows.
    counts = table.to_dataframe().T
    if len(set(counts.index)) != len(counts.index):
        raise ValueError('Data-table contains duplicate indices')
    if len(set(counts.columns)) != len(counts.columns):
        raise ValueError('Data-table contains duplicate columns')

    # rclr preprocessing and OptSpace (RPCA)
    completion = MatrixCompletion(n_components=n_components,
                                  max_iterations=max_iterations)
    opt = completion.fit(rclr(counts))

    pc_labels = ['PC' + str(k) for k in range(1, n_components + 1)]

    # Rebuild the matrix from the fitted factors, centre its columns and
    # then its rows, and factor it again.
    completed = opt.sample_weights @ opt.s @ opt.feature_weights.T
    completed = completed - completed.mean(axis=0)
    completed = completed - completed.mean(axis=1).reshape(-1, 1)
    # assumes svd returns (u, s, vh) as in numpy/scipy -- TODO confirm
    left, singular_values, right_t = svd(completed)

    # Proportions are computed over all singular values before truncating.
    squared = singular_values**2
    proportions = (squared / np.sum(squared))[:n_components]

    sample_loading = pd.DataFrame(left[:, :n_components],
                                  index=counts.index,
                                  columns=pc_labels)
    feature_loading = pd.DataFrame(right_t.T[:, :n_components],
                                   index=counts.columns,
                                   columns=pc_labels)
    # % var explained
    proportion_explained = pd.Series(proportions, index=pc_labels)
    # stored as eigvals: the leading singular values
    eigvals = pd.Series(singular_values[:n_components], index=pc_labels)

    # With two components, append an all-zero PC3. See the emperor commit
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # and the discussion in DEICODE -- PR#29.
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading)
        sample_loading['PC3'] = [0] * len(sample_loading)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # Package the ordination and the distance matrix.
    ord_res = skbio.OrdinationResults(
        'rpca_biplot',
        '(Robust Aitchison) RPCA Biplot',
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
Beispiel #11
0
def rpca(
    table: biom.Table,
    n_components: Union[int, str] = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    min_feature_frequency: float = DEFAULT_MFF,
    max_iterations: int = DEFAULT_OPTSPACE_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Run RPCA: matrix_rclr preprocessing followed by matrix completion.

    Shared by the standalone and the QIIME 2 versions of gemelli.

    Parameters
    ----------
    table : biom.Table
        Count table to decompose.
    n_components : int or str
        Forwarded to ``MatrixCompletion``; the number of components
        actually used afterwards is read back from ``opt.s``.
    min_sample_count : int
        Samples whose total count is not greater than this are dropped.
    min_feature_count : int
        Features whose total count is not greater than this are dropped.
    min_feature_frequency : float
        Percentage (divided by 100 before use); features present in no more
        than this share of the unfiltered table's samples are dropped.
    max_iterations : int
        Forwarded to ``MatrixCompletion``.

    Returns
    -------
    (skbio.OrdinationResults, skbio.DistanceMatrix)
        The biplot ordination and the distance matrix built from
        ``opt.distance``.

    Raises
    ------
    ValueError
        If the filtered table has duplicate row or column labels.
    """
    # Sample count of the table as passed in (i.e. before any filtering).
    _, n_samples = table.shape
    frequency_cutoff_scale = 100

    def present_often_enough(val, id_, md):
        # share of samples with a non-zero count vs. the percentage cutoff
        share = np.sum(val > 0) / n_samples
        return share > (min_feature_frequency / frequency_cutoff_scale)

    # Filters, in order: feature total count, feature frequency, sample
    # sequencing depth.
    table = table.filter(
        lambda val, id_, md: sum(val) > min_feature_count,
        axis='observation')
    table = table.filter(present_often_enough, axis='observation')
    table = table.filter(
        lambda val, id_, md: sum(val) > min_sample_count, axis='sample')

    # Dense DataFrame, transposed so that samples are the rows.
    counts = pd.DataFrame(table.matrix_data.toarray(),
                          index=table.ids('observation'),
                          columns=table.ids('sample')).T
    # Sanity-check the labels after filtering.
    if len(set(counts.index)) != len(counts.index):
        raise ValueError('Data-table contains duplicate indices')
    if len(set(counts.columns)) != len(counts.columns):
        raise ValueError('Data-table contains duplicate columns')

    # Robust-clr (matrix_rclr) preprocessing and OptSpace (RPCA)
    completion = MatrixCompletion(n_components=n_components,
                                  max_iterations=max_iterations)
    opt = completion.fit(matrix_rclr(counts))

    # The fit may settle on its own component count; read it back.
    n_components = opt.s.shape[0]
    # PC column labels for the skbio OrdinationResults
    pc_labels = ['PC' + str(k) for k in range(1, n_components + 1)]

    # Rebuild the completed matrix, centre its columns and then its rows,
    # and factor it again.
    completed = opt.sample_weights @ opt.s @ opt.feature_weights.T
    completed = completed - completed.mean(axis=0)
    completed = completed - completed.mean(axis=1).reshape(-1, 1)
    # assumes svd returns (u, s, vh) as in numpy/scipy -- TODO confirm
    left, singular_values, right_t = svd(completed)

    # Proportions are computed over all singular values before truncating.
    squared = singular_values**2
    proportions = (squared / np.sum(squared))[:n_components]

    # Loadings restricted to the kept components.
    sample_loading = pd.DataFrame(left[:, :n_components],
                                  index=counts.index,
                                  columns=pc_labels)
    feature_loading = pd.DataFrame(right_t.T[:, :n_components],
                                   index=counts.columns,
                                   columns=pc_labels)
    # % var explained
    proportion_explained = pd.Series(proportions, index=pc_labels)
    # stored as eigvals: the leading singular values
    eigvals = pd.Series(singular_values[:n_components], index=pc_labels)

    # With two components, append an all-zero PC3. See the emperor commit
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # and the discussion in gemelli -- PR#29.
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading)
        sample_loading['PC3'] = [0] * len(sample_loading)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # Package the ordination and the distance matrix.
    ord_res = skbio.OrdinationResults(
        'rpca_biplot',
        '(Robust Aitchison) RPCA Biplot',
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
Beispiel #12
0
    def setUp(self):
        """Create tree, DataFrame table, metadata and ordination fixtures."""
        pc_axes = ['PC1', 'PC2', 'PC3']

        self.tree = parse_newick('(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2):1;')
        pruned_newick = StringIO(
            '(((a:1)EmpressNode0:1,b:2)g:1,(d:3)h:2)EmpressNode1:1;')
        self.pruned_tree = TreeNode.read(pruned_newick)

        # Test table/metadata (mostly) adapted from Qurro.
        # Each table is written with one column per sample and then
        # transposed to match QIIME2's expectation.
        counts_by_sample = {
            "Sample1": [1, 2, 0, 4],
            "Sample2": [8, 7, 0, 5],
            "Sample3": [1, 0, 0, 0],
            "Sample4": [0, 0, 0, 0]
        }
        self.table = pd.DataFrame(counts_by_sample,
                                  index=["a", "b", "e", "d"]).T

        unrelated_by_sample = {
            "Sample1": [5, 2, 0, 2],
            "Sample2": [2, 3, 0, 1],
            "Sample3": [5, 2, 0, 0],
            "Sample4": [4, 5, 0, 4]
        }
        self.unrelated_table = pd.DataFrame(unrelated_by_sample,
                                            index=["h", "i", "j", "k"]).T

        sample_md = {
            "Metadata1": [0, 0, 0, 1],
            "Metadata2": [0, 0, 0, 0],
            "Metadata3": [1, 2, 3, 4],
            "Metadata4": ["abc", "def", "ghi", "jkl"]
        }
        self.sample_metadata = pd.DataFrame(sample_md,
                                            index=list(self.table.index))

        feature_md = {
            "fmdcol1": ["asdf", "ghjk"],
            "fmdcol2": ["qwer", "tyui"]
        }
        self.feature_metadata = pd.DataFrame(feature_md, index=["a", "h"])

        # Same counts as self.table minus feature "e"; Sample4 is kept.
        filtered_by_sample = {
            "Sample1": [1, 2, 4],
            "Sample2": [8, 7, 5],
            "Sample3": [1, 0, 0],
            "Sample4": [0, 0, 0]
        }
        self.filtered_table = pd.DataFrame(filtered_by_sample,
                                           index=["a", "b", "d"]).T

        # Ordination over the four samples.
        eigvals = pd.Series(np.array([0.50, 0.25, 0.25]), index=pc_axes)
        proportion_explained = pd.Series([15.5, 12.2, 8.8], index=pc_axes)
        samples = np.array([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5],
                            [0.4, 0.5, 0.6]])
        samples_df = pd.DataFrame(
            samples,
            index=['Sample1', 'Sample2', 'Sample3', 'Sample4'],
            columns=pc_axes)
        self.pcoa = skbio.OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            eigvals,
            samples_df,
            proportion_explained=proportion_explained)

        self.files_to_remove = []
        self.maxDiff = None
    def setUp(self):
        """Create a reference/other ordination pair and expected outputs.

        ``self.reference`` and ``self.other`` span six axes; the three
        ``self.expected_*`` ordinations span only the first five. Two scalar
        expectations, ``expected_m2`` and ``expected_p``, are stored as well.
        """
        axes = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
        sample_ids = ['A', 'B', 'C', 'D']
        eigvals = pd.Series(np.array([1.5, 0.75, 0.3, 0.15, 0.15, 0.15]),
                            index=axes)
        proportion_explained = pd.Series([0.50, 0.25, 0.10, 0.05, 0.05, 0.05],
                                         index=axes)

        def copied_ordination(coordinates):
            # Build an ordination from copies of the shared series, truncated
            # to as many axes as the coordinate rows have columns.
            n_axes = len(coordinates[0])
            frame = pd.DataFrame(np.array(coordinates),
                                 index=sample_ids,
                                 columns=axes[:n_axes])
            return skbio.OrdinationResults(
                'PCoA',
                'Principal Coordinate Analysis',
                eigvals[:n_axes].copy(),
                frame.copy(),
                proportion_explained=proportion_explained[:n_axes].copy())

        # The reference ordination is handed the shared objects themselves
        # (no copies), with integer coordinates.
        reference_coords = np.array([[0, 3, 4, 4, 0, 0], [1, 2, 1, 4, 3, 3],
                                     [2, 3, 1, 0, 0, 1], [0, 3, 2, 4, 3, 0]])
        self.reference = skbio.OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            eigvals,
            pd.DataFrame(reference_coords, index=sample_ids, columns=axes),
            proportion_explained=proportion_explained)

        self.other = copied_ordination(
            [[0.7, 3.7, 4.7, 4.7, 0.7, 0.7],
             [1.7, 2.7, 1.7, 4.7, 3.7, 3.7],
             [2.7, 3.7, 1.7, 0.7, 0.7, 1.7],
             [30, 3.7, 2.7, 4.7, 3.7, 0.7]])

        self.expected_ref = copied_ordination(
            [[-0.1358036, 0.0452679, 0.3621430, 0.1810715, -0.2716072],
             [0.0452679, -0.1358036, -0.1810715, 0.1810715, 0.2716072],
             [0.2263394, 0.0452679, -0.1810715, -0.5432145, -0.2716072],
             [-0.1358036, 0.0452679, 0.0000000, 0.1810715, 0.2716072]])

        self.expected_other = copied_ordination(
            [[0.0482731, -0.0324317, 0.0494312, -0.0316828, -0.1584374],
             [0.0803620, -0.0718115, -0.0112234, -0.0171011, -0.1101209],
             [0.0527554, -0.0042753, -0.0126739, -0.0969602, -0.0964822],
             [-0.1813905, 0.1085184, -0.0255339, 0.1457440, 0.3650405]])

        self.expected_noise = copied_ordination(
            [[0.04988341, -0.03234447, 0.03177641, -0.03507789, -0.13564394],
             [0.09117347, -0.08318546, -0.02249053, -0.01597601, -0.10901541],
             [0.05077765, -0.003994, -0.00984688, -0.09356729, -0.09648388],
             [-0.19183453, 0.11952393, 0.000561, 0.14462118, 0.34114323]])

        self.expected_m2 = 0.72240956
        self.expected_p = 0.5