Example #1
def _pca(ranks_df: pd.DataFrame,
         n_components: int = None) -> (OrdinationResults, OrdinationResults):
    # perform PCA
    pca_result = PCA(n_components=n_components)
    pca_result.fit(ranks_df)

    # transform ranks
    ranks_transformed = pd.DataFrame(pca_result.transform(ranks_df))
    ranks_transformed.index = ranks_df.index

    components_loadings = pd.DataFrame(-1 * pca_result.components_.T *
                                       np.sqrt(pca_result.explained_variance_))
    components_loadings.index = ranks_df.columns
    eigenvalues = pd.Series(pca_result.explained_variance_)

    ores_scores = OrdinationResults(
        short_method_name="PCA",
        long_method_name="Principal Components Analysis",
        eigvals=eigenvalues,
        samples=ranks_transformed,
        features=None,
        biplot_scores=None,
        proportion_explained=pd.Series(pca_result.explained_variance_ratio_))

    ores_loadings = OrdinationResults(
        short_method_name="PCA",
        long_method_name="Principal Components Analysis",
        eigvals=eigenvalues,
        samples=components_loadings,
        features=None,
        biplot_scores=None,
        proportion_explained=pd.Series(pca_result.explained_variance_ratio_))

    return ores_scores, ores_loadings
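
A minimal usage sketch for the helper above (hypothetical ranks data; assumes
the snippet's module-level imports: numpy as np, pandas as pd,
sklearn.decomposition.PCA and skbio's OrdinationResults):

import numpy as np
import pandas as pd

# 20 hypothetical samples ranked over 5 features
ranks = pd.DataFrame(np.random.randn(20, 5),
                     index=['S%d' % i for i in range(20)],
                     columns=['F%d' % i for i in range(5)])
scores, loadings = _pca(ranks, n_components=2)
print(scores.proportion_explained)  # variance captured per component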
Example #2
def procrustes_analysis(
    reference: OrdinationResults,
    other: OrdinationResults,
    dimensions: int = 5,
    permutations: int = 999
) -> (OrdinationResults, OrdinationResults, pd.DataFrame):

    if reference.samples.shape != other.samples.shape:
        raise ValueError('The matrices cannot be fitted unless they have the '
                         'same dimensions')

    if reference.samples.shape[1] < dimensions:
        raise ValueError('Cannot fit fewer dimensions than available')

    # fail if there are any elements in the symmetric difference
    diff = reference.samples.index.symmetric_difference(other.samples.index)
    if not diff.empty:
        raise ValueError('The ordinations represent two different sets of '
                         'samples')

    # make the matrices be comparable
    other.samples = other.samples.reindex(index=reference.samples.index)
    mtx1, mtx2, m2 = procrustes(reference.samples.values[:, :dimensions],
                                other.samples.values[:, :dimensions])

    axes = reference.samples.columns[:dimensions]
    samples1 = pd.DataFrame(data=mtx1,
                            index=reference.samples.index.copy(),
                            columns=axes.copy())
    samples2 = pd.DataFrame(data=mtx2,
                            index=reference.samples.index.copy(),
                            columns=axes.copy())

    info = _procrustes_monte_carlo(reference.samples.values[:, :dimensions],
                                   other.samples.values[:, :dimensions], m2,
                                   permutations)

    out1 = OrdinationResults(
        short_method_name=reference.short_method_name,
        long_method_name=reference.long_method_name,
        eigvals=reference.eigvals[:dimensions].copy(),
        samples=samples1,
        features=reference.features,
        biplot_scores=reference.biplot_scores,
        sample_constraints=reference.sample_constraints,
        proportion_explained=reference.proportion_explained[
            :dimensions].copy())
    out2 = OrdinationResults(
        short_method_name=other.short_method_name,
        long_method_name=other.long_method_name,
        eigvals=other.eigvals[:dimensions].copy(),
        samples=samples2,
        features=other.features,
        biplot_scores=other.biplot_scores,
        sample_constraints=other.sample_constraints,
        proportion_explained=other.proportion_explained[:dimensions].copy())
    return out1, out2, info
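
A hedged end-to-end sketch (toy distance matrices; assumes skbio's pcoa plus
scipy's procrustes and the private _procrustes_monte_carlo helper this
function calls, as in the snippet's module):

import numpy as np
from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa

rng = np.random.default_rng(42)

def random_dm(ids):
    m = rng.random((len(ids), len(ids)))
    m = (m + m.T) / 2          # symmetrize
    np.fill_diagonal(m, 0)     # hollow diagonal
    return DistanceMatrix(m, ids)

ids = list('abcd')
ref, other = pcoa(random_dm(ids)), pcoa(random_dm(ids))
fitted_ref, fitted_other, info = procrustes_analysis(
    ref, other, dimensions=3, permutations=99)
print(info)  # true M^2 and its permutation p-value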
Example #3
    def setUp(self):
        self.alpha = pd.Series([1, 2, 3], index=list('abc'))

        data = np.asarray([[0, 0, 1], [1, 3, 42]])
        self.biom = biom.Table(data, ['O1', 'O2'], ['a', 'b', 'c'])

        eigvals = [0.51236726, 0.30071909, 0.26791207]
        proportion_explained = [0.2675738328, 0.157044696, 0.1399118638]
        sample_ids = ['a', 'b', 'c']
        axis_labels = ['PC%d' % i for i in range(1, 4)]
        np.random.seed(11)
        data = np.random.randn(3, 3)

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(eigvals, index=axis_labels),
            samples=pd.DataFrame(
                data,
                index=sample_ids, columns=axis_labels),
            proportion_explained=pd.Series(proportion_explained,
                                           index=axis_labels))
        self.ordination = expected_results

        self.metadata = pd.DataFrame(data=[[':0', ':)', ':/'],
                                           [':D', 'xD', '<3'],
                                           [';L', ']:->', ':S']],
                                     index=list('abc'),
                                     columns=['foo', 'bar', 'baz'])
Example #4
    def _create_ordination_results(self):
        eigvals = [0.51236726, 0.30071909, 0.26791207, 0.20898868]
        proportion_explained = [
            0.2675738328, 0.157044696, 0.1399118638, 0.1091402725
        ]
        sample_ids = [
            '1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184',
            '1.SKM9.640192', '1.SKB7.640196'
        ]
        axis_labels = ['PC1', 'PC2', 'PC3', 'PC4']
        samples = [[-2.584, 1.739, 3.828, -1.944],
                   [-2.710, -1.859, -8.648, 1.180],
                   [2.350, 9.625, -3.457, -3.208],
                   [2.614, -1.114, 1.476, 2.908],
                   [2.850, -1.925, 6.232, 1.381]]
        ord_res = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(eigvals, index=axis_labels),
            samples=pd.DataFrame(np.asarray(samples),
                                 index=sample_ids,
                                 columns=axis_labels),
            proportion_explained=pd.Series(proportion_explained,
                                           index=axis_labels))
        fd, fp = mkstemp(suffix='.txt', dir=self.out_dir)
        close(fd)
        ord_res.write(fp)
        return fp
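
The round trip back from disk is a single call (fp is the path returned by
the helper above):

ord_res = OrdinationResults.read(fp)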
Example #5
    def test_str(self):
        exp = ("Ordination results:\n"
               "\tMethod: Correspondance Analysis (CA)\n"
               "\tEigvals: 2\n"
               "\tProportion explained: N/A\n"
               "\tFeatures: 3x2\n"
               "\tSamples: 3x2\n"
               "\tBiplot Scores: N/A\n"
               "\tSample constraints: N/A\n"
               "\tFeature IDs: 'Species1', 'Species2', 'Species3'\n"
               "\tSample IDs: 'Site1', 'Site2', 'Site3'")
        obs = str(self.ordination_results)
        self.assertEqual(obs, exp)

        # all optional attributes missing
        exp = ("Ordination results:\n"
               "\tMethod: Principal Coordinate Analysis (PCoA)\n"
               "\tEigvals: 1\n"
               "\tProportion explained: N/A\n"
               "\tFeatures: N/A\n"
               "\tSamples: 2x1\n"
               "\tBiplot Scores: N/A\n"
               "\tSample constraints: N/A\n"
               "\tFeature IDs: N/A\n"
               "\tSample IDs: 0, 1")
        samples_df = pd.DataFrame(np.array([[1], [2]]))
        obs = str(
            OrdinationResults('PCoA', 'Principal Coordinate Analysis',
                              pd.Series(np.array([4.2])), samples_df))
        self.assertEqual(obs.split('\n'), exp.split('\n'))
Example #6
    def setUp(self):
        self.tree = self.mock_tree_from_nwk()
        self.bp_tree = from_skbio_treenode(self.tree)
        self.table = biom.Table(
            np.array([[1, 2, 0, 4],
                      [8, 7, 0, 5],
                      [1, 0, 0, 0],
                      [0, 0, 0, 0]]).T,
            list('abed'), ['Sample1', 'Sample2', 'Sample3', 'Sample4'])
        self.sample_metadata = pd.DataFrame(
            {
                "Metadata1": [0, 0, 0, 1],
                "Metadata2": [0, 0, 0, 0],
                "Metadata3": [1, 2, 3, 4],
                "Metadata4": ["abc", "def", "ghi", "jkl"]
            },
            index=list(self.table.ids()))

        # (These are some Greengenes taxonomy annotations I took from the
        # moving pictures taxonomy.qza file. I made up the confidences.)
        self.feature_metadata = pd.DataFrame(
            {
                "Taxonomy":
                [("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
                  "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
                  "s__"),
                 ("k__Bacteria; p__Proteobacteria; "
                  "c__Gammaproteobacteria; o__Pasteurellales; "
                  "f__Pasteurellaceae; g__; s__"),
                 ("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
                  "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
                  "s__uniformis")],
                "Confidence": [0.95, 0.8, 0]
            },
            index=["e", "h", "a"])
        self.split_tax_fm, self.taxcols = split_taxonomy(self.feature_metadata)
        self.tip_md = self.split_tax_fm.loc[["a", "e"]]
        self.int_md = self.split_tax_fm.loc[["h"]]
        # This is designed to match the shearing that's done in the core test
        # for --p-shear-to-table
        self.shorn_tree = parse_newick(
            "(((a:1)EmpressNode0:1,b:2)g:1,(d:3)h:2)EmpressNode1:1;")
        self.exp_split_fm_cols = [
            "Level 1", "Level 2", "Level 3", "Level 4", "Level 5", "Level 6",
            "Level 7", "Confidence"
        ]

        eigvals = pd.Series([0.50, 0.25, 0.25], index=['PC1', 'PC2', 'PC3'])
        samples = [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5],
                   [0.4, 0.5, 0.6]]
        proportion_explained = pd.Series([15.5, 12.2, 8.8],
                                         index=['PC1', 'PC2', 'PC3'])
        samples_df = pd.DataFrame(
            samples,
            index=['Sample1', 'Sample2', 'Sample3', 'Sample4'],
            columns=['PC1', 'PC2', 'PC3'])
        self.ordination = OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            eigvals,
            samples_df,
            proportion_explained=proportion_explained)
Example #7
    def test_book_example_dataset(self):
        # Adapted from PyCogent's `test_principal_coordinate_analysis`:
        #   "I took the example in the book (see intro info), and did
        #   the principal coordinates analysis, plotted the data and it
        #   looked right".
        eigvals = [
            0.73599103, 0.26260032, 0.14926222, 0.06990457, 0.02956972,
            0.01931184, 0., 0., 0., 0., 0., 0., 0., 0.
        ]
        proportion_explained = [
            0.58105792, 0.20732046, 0.1178411, 0.05518899, 0.02334502,
            0.01524651, 0., 0., 0., 0., 0., 0., 0., 0.
        ]
        sample_ids = [str(i) for i in range(14)]
        axis_labels = ['PC%d' % i for i in range(1, 15)]

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(eigvals, index=axis_labels),
            samples=pd.DataFrame(np.loadtxt(
                get_data_path('exp_PCoAzeros_site')),
                                 index=sample_ids,
                                 columns=axis_labels),
            proportion_explained=pd.Series(proportion_explained,
                                           index=axis_labels))

        results = npt.assert_warns(RuntimeWarning, pcoa, self.dm)

        # Note the absolute value because column can have signs swapped
        results.samples = np.abs(results.samples)
        assert_ordination_results_equal(results,
                                        expected_results,
                                        ignore_directionality=True)
Example #8
    def test_scaling1(self):
        eigvals = pd.Series(np.array([0.09613302, 0.04094181]), self.pc_ids)
        # p. 458
        features = pd.DataFrame(
            np.array([
                [1.31871, -0.34374],  # V
                [-0.37215, 1.48150],
                [-0.99972, -0.92612]
            ]),
            self.feature_ids,
            self.pc_ids)
        samples = pd.DataFrame(
            np.array([
                [-0.26322, -0.17862],  # F
                [-0.06835, 0.27211],
                [0.51685, -0.09517]
            ]),
            self.sample_ids,
            self.pc_ids)
        exp = OrdinationResults('CA',
                                'Correspondance Analysis',
                                eigvals=eigvals,
                                features=features,
                                samples=samples)
        scores = ca(self.contingency, 1)

        assert_ordination_results_equal(exp,
                                        scores,
                                        decimal=5,
                                        ignore_directionality=True)
Example #9
    def test_simple(self):
        eigvals = [
            0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
            0.16054235, 0.15017696, 0.12245775, 0.0
        ]
        proportion_explained = [
            0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
            0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0
        ]
        sample_ids = [
            'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
            'PC.355', 'PC.607', 'PC.634'
        ]
        axis_labels = ['PC%d' % i for i in range(1, 10)]

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(eigvals, index=axis_labels),
            samples=pd.DataFrame(np.loadtxt(
                get_data_path('exp_PCoAEigenResults_site')),
                                 index=sample_ids,
                                 columns=axis_labels),
            proportion_explained=pd.Series(proportion_explained,
                                           index=axis_labels))

        dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        results = pcoa(dm)

        assert_ordination_results_equal(results,
                                        expected_results,
                                        ignore_directionality=True)
Example #10
    def test_scaling2(self):

        eigvals = pd.Series(np.array([0.09613302, 0.04094181]), self.pc_ids)
        # p. 460 L&L 1998
        features = pd.DataFrame(
            np.array([
                [0.40887, -0.06955],  # F_hat
                [-0.11539, 0.29977],
                [-0.30997, -0.18739]
            ]),
            self.feature_ids,
            self.pc_ids)
        samples = pd.DataFrame(
            np.array([
                [-0.84896, -0.88276],  # V_hat
                [-0.22046, 1.34482],
                [1.66697, -0.47032]
            ]),
            self.sample_ids,
            self.pc_ids)
        exp = OrdinationResults('CA',
                                'Correspondance Analysis',
                                eigvals=eigvals,
                                features=features,
                                samples=samples)

        scores = ca(self.contingency, 2)

        assert_ordination_results_equal(exp,
                                        scores,
                                        decimal=5,
                                        ignore_directionality=True)
Example #11
    def test_scaling2(self):

        scores = rda(self.Y, self.X, scaling=2)

        # Load data as computed with vegan 2.0-8
        vegan_features = pd.DataFrame(
            np.loadtxt(get_data_path(
                'example2_species_scaling2_from_vegan')),
            index=self.feature_ids,
            columns=self.pc_ids)

        vegan_samples = pd.DataFrame(
            np.loadtxt(get_data_path(
                'example2_site_scaling2_from_vegan')),
            index=self.sample_ids,
            columns=self.pc_ids)

        sample_constraints = pd.DataFrame(
            np.loadtxt(get_data_path(
                'example2_sample_constraints_scaling2')),
            index=self.sample_ids,
            columns=self.pc_ids)

        mat = np.loadtxt(get_data_path(
            'example2_biplot_scaling2'))
        cropped_pc_ids = self.pc_ids[:mat.shape[1]]
        biplot_scores = pd.DataFrame(mat,
                                     index=self.env_ids,
                                     columns=cropped_pc_ids)

        proportion_explained = pd.Series([0.44275783, 0.25614586,
                                          0.15280354, 0.10497021,
                                          0.02873375, 0.00987052,
                                          0.00471828],
                                         index=self.pc_ids)

        eigvals = pd.Series([25.897954, 14.982578, 8.937841, 6.139956,
                             1.680705, 0.577350, 0.275984],
                            index=self.pc_ids)

        exp = OrdinationResults(
            'RDA', 'Redundancy Analysis',
            samples=vegan_samples,
            features=vegan_features,
            sample_constraints=sample_constraints,
            biplot_scores=biplot_scores,
            proportion_explained=proportion_explained,
            eigvals=eigvals)

        assert_ordination_results_equal(scores, exp,
                                        ignore_directionality=True,
                                        decimal=6)
Example #12
    def setUp(self):
        # Define in-memory CA results to serialize and deserialize.
        eigvals = pd.Series([0.0961330159181, 0.0409418140138], ['CA1', 'CA2'])
        features = np.array([[0.408869425742, 0.0695518116298],
                             [-0.1153860437, -0.299767683538],
                             [-0.309967102571, 0.187391917117]])
        samples = np.array([[-0.848956053187, 0.882764759014],
                            [-0.220458650578, -1.34482000302],
                            [1.66697179591, 0.470324389808]])
        features_ids = ['Species1', 'Species2', 'Species3']
        sample_ids = ['Site1', 'Site2', 'Site3']

        samples_df = pd.DataFrame(samples,
                                  index=sample_ids,
                                  columns=['CA1', 'CA2'])
        features_df = pd.DataFrame(features,
                                   index=features_ids,
                                   columns=['CA1', 'CA2'])

        self.ordination_results = OrdinationResults('CA',
                                                    'Correspondence Analysis',
                                                    eigvals=eigvals,
                                                    samples=samples_df,
                                                    features=features_df)

        # DataFrame for testing plot method. Has a categorical column with a
        # mix of numbers and strings. Has a numeric column with a mix of ints,
        # floats, and strings that can be converted to floats. Has a numeric
        # column with missing data (np.nan).
        self.df = pd.DataFrame([['foo', '42', 10], [22, 0, 8],
                                [22, -4.2, np.nan], ['foo', '42.19', 11]],
                               index=['A', 'B', 'C', 'D'],
                               columns=['categorical', 'numeric', 'nancolumn'])

        # Minimal ordination results for easier testing of plotting method.
        # Paired with df above.
        eigvals = np.array([0.50, 0.25, 0.25])
        samples = np.array([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5],
                            [0.4, 0.5, 0.6]])
        samples_df = pd.DataFrame(samples, ['A', 'B', 'C', 'D'],
                                  ['PC1', 'PC2', 'PC3'])

        self.min_ord_results = OrdinationResults(
            'PCoA', 'Principal Coordinate Analysis', eigvals, samples_df)
Example #13
def apca(df):
    """Performs Aitchison PCA on a feature table.
    Parameters
    ----------
        df: pd.DataFrame
            A numeric DataFrame whose rows are "features" and whose columns are
            "samples."
    Returns
    -------
        A 3-tuple (U, p, V) where:
            U: pd.DataFrame
                Feature loadings.
            p: pd.DataFrame
                Proportions of variance explained.
            V: pd.DataFrame
                Sample loadings.
    """
    # do A-PCA
    U, s, V = svds(clr(df), k=2)
    V = V.T
    # reverse (see SVDs docs)
    U = np.flip(U, axis=1)
    V = np.flip(V, axis=1)
    s = s[::-1]

    # Rename columns; we use "Axis 1", etc. to be consistent with the Qurro
    # interface
    pcs = min(V.shape)
    cols = ["Axis {}".format(pc + 1) for pc in range(pcs)]

    # Make DataFrames from the feature (U) and sample (V) loadings
    U = pd.DataFrame(U[:, :pcs], df.index, cols)
    V = pd.DataFrame(V[:, :pcs], df.columns, cols)

    # For clarity, rename top-left cell in both loading DataFrames
    U.index.name = "FeatureID"
    V.index.name = "SampleID"

    # get prop. var. explained
    p = s**2 / np.sum(s**2)
    p = pd.Series(p.T, index=cols)

    # format eigenvalues in a way that OrdinationResults expects
    eigvals = pd.Series(s.T, index=cols)

    return OrdinationResults("apca",
                             "Aitchison PCA",
                             eigvals,
                             samples=V,
                             features=U,
                             proportion_explained=p)
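
A small, hedged usage example for apca (toy counts; assumes the imports the
function relies on: numpy as np, pandas as pd, scipy.sparse.linalg.svds and
clr from skbio.stats.composition):

import pandas as pd

# rows are features, columns are samples, as the docstring requires;
# strictly positive values so clr() is defined
counts = pd.DataFrame([[4, 1, 3], [2, 2, 2], [1, 5, 1], [3, 2, 4]],
                      index=['F1', 'F2', 'F3', 'F4'],
                      columns=['S1', 'S2', 'S3'])
res = apca(counts)
print(res.proportion_explained)  # variance explained by Axis 1 and Axis 2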
Example #14
def rpca(in_biom: str, output_dir: str,
         min_sample_depth: int, rank: int) -> None:
    """ Runs RPCA with an rclr preprocessing step"""

    # import table
    table = load_table(in_biom)
    # filter sample to min depth

    def sample_filter(val, id_, md): return sum(val) > min_sample_depth
    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T.drop_duplicates()
    # rclr for saving the transformed OTU table (RSC edited)
    tablefit = rclr().fit_transform(table.copy())
    U, s, V = OptSpace().fit_transform(tablefit)
    tablefit = np.dot(np.dot(U, s), V.T)
    tablefit = pd.DataFrame(tablefit.T, index=table.columns,
                            columns=table.index)
    # write the transformed table directly; no need to pre-open the file
    tablefit.to_csv(os.path.join(output_dir, 'rclr_OTUtable.txt'),
                    sep='\t', index_label='OTU_ID')

    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank).fit(rclr().fit_transform(table.copy()))
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=list(rename_cols.values()))
    eigvals = pd.Series(opt.eigenvalues,
                        index=list(rename_cols.values()))
    # save ordination results
    ord_res = OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        eigvals.copy(),
        sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # write files to output folder
    ord_res.write(os.path.join(output_dir, 'RPCA_Ordination.txt'))
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)
    dist_res.write(os.path.join(output_dir, 'RPCA_distance.txt'))
    return
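
A hedged invocation sketch (file and directory names are hypothetical;
assumes the DEICODE-style rclr/OptSpace helpers the module imports):

rpca('table.biom', 'results', min_sample_depth=500, rank=3)
# expected outputs under results/: rclr_OTUtable.txt, RPCA_Ordination.txt
# and RPCA_distance.txt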
Example #15
def regression_biplot(coefficients: pd.DataFrame) -> skbio.OrdinationResults:
    coefs = clr(centralize(clr_inv(coefficients)))
    u, s, v = np.linalg.svd(coefs)
    pc_ids = ['PC%d' % i for i in range(len(s))]
    samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                           columns=pc_ids, index=coefficients.index)
    features = pd.DataFrame(v.T[:, :len(s)],
                            columns=pc_ids, index=coefficients.columns)
    short_method_name = 'regression_biplot'
    long_method_name = 'Multinomial regression biplot'
    eigvals = pd.Series(s, index=pc_ids)
    proportion_explained = eigvals / eigvals.sum()
    res = OrdinationResults(short_method_name, long_method_name, eigvals,
                            samples=samples, features=features,
                            proportion_explained=proportion_explained)
    return res
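
A minimal, hedged example (random toy coefficients; assumes numpy as np,
pandas as pd, and clr, clr_inv and centralize from skbio.stats.composition,
which the function uses):

import numpy as np
import pandas as pd

coefficients = pd.DataFrame(np.random.randn(5, 4),
                            index=['cov%d' % i for i in range(5)],
                            columns=['taxon%d' % i for i in range(4)])
res = regression_biplot(coefficients)
print(res.samples.shape, res.features.shape)  # (5, 4) and (4, 4)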
Example #16
    def test_scaling2(self):
        scores = cca(self.Y, self.X, scaling=2)

        # Load data as computed with vegan 2.0-8
        vegan_features = pd.DataFrame(np.loadtxt(
            get_data_path('example3_species_scaling2_from_vegan')),
                                      index=self.feature_ids,
                                      columns=self.pc_ids)

        vegan_samples = pd.DataFrame(np.loadtxt(
            get_data_path('example3_site_scaling2_from_vegan')),
                                     index=self.sample_ids,
                                     columns=self.pc_ids)

        sample_constraints = pd.DataFrame(np.loadtxt(
            get_data_path('example3_sample_constraints_scaling2')),
                                          index=self.sample_ids,
                                          columns=self.pc_ids)

        mat = np.loadtxt(get_data_path('example3_biplot_scaling2'))

        cropped_pc_ids = self.pc_ids[:mat.shape[1]]
        biplot_scores = pd.DataFrame(mat,
                                     index=self.env_ids,
                                     columns=cropped_pc_ids)

        proportion_explained = pd.Series([
            0.466911, 0.238327, 0.100548, 0.104937, 0.044805, 0.029747,
            0.012631, 0.001562, 0.000532
        ],
                                         index=self.pc_ids)
        eigvals = pd.Series([
            0.366136, 0.186888, 0.078847, 0.082288, 0.035135, 0.023327,
            0.009905, 0.001225, 0.000417
        ],
                            index=self.pc_ids)

        exp = OrdinationResults('CCA',
                                'Canonical Correspondence Analysis',
                                samples=vegan_samples,
                                features=vegan_features,
                                sample_constraints=sample_constraints,
                                biplot_scores=biplot_scores,
                                proportion_explained=proportion_explained,
                                eigvals=eigvals)

        assert_ordination_results_equal(scores, exp, decimal=6)
Example #17
    def test_extensive(self):
        eigvals = [
            0.3984635, 0.36405689, 0.28804535, 0.27479983, 0.19165361, 0.0
        ]
        proportion_explained = [
            0.2626621381, 0.2399817314, 0.1898758748, 0.1811445992,
            0.1263356565, 0.0
        ]
        sample_ids = [str(i) for i in range(6)]
        axis_labels = ['PC%d' % i for i in range(1, 7)]
        samples = [
            [-0.028597, 0.22903853, 0.07055272, 0.26163576, 0.28398669, 0.0],
            [0.37494056, 0.22334055, -0.20892914, 0.05057395, -0.18710366,
             0.0],
            [-0.33517593, -0.23855979, -0.3099887, 0.11521787, -0.05021553,
             0.0],
            [0.25412394, -0.4123464, 0.23343642, 0.06403168, -0.00482608, 0.0],
            [-0.28256844, 0.18606911, 0.28875631, -0.06455635, -0.21141632,
             0.0],
            [0.01727687, 0.012458, -0.07382761, -0.42690292, 0.1695749, 0.0]
        ]

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(eigvals, index=axis_labels),
            samples=pd.DataFrame(samples,
                                 index=sample_ids,
                                 columns=axis_labels),
            proportion_explained=pd.Series(proportion_explained,
                                           index=axis_labels))

        data = np.loadtxt(get_data_path('PCoA_sample_data_2'))
        # test passing a numpy.ndarray and a DistanceMatrix to pcoa
        # gives same results
        for dm in (data, DistanceMatrix(data)):
            results = pcoa(dm)
            assert_ordination_results_equal(results,
                                            expected_results,
                                            ignore_directionality=True)
Example #18
def ilr_phylogenetic_ordination(
        table: pd.DataFrame,
        tree: skbio.TreeNode,
        pseudocount: float = 0.5,
        top_k_var: int = 10,
        clades: list = None
) -> (OrdinationResults, skbio.TreeNode, pd.DataFrame):
    t = tree.copy()
    t.bifurcate()
    _table, _tree = match_tips(table, t)
    _tree = rename_internal_nodes(_tree)
    if not clades:
        in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
        basis = _balance_basis(_tree)[0]
        _table = add_pseudocount(_table, pseudocount)
        basis = pd.DataFrame(basis.T, index=_table.columns, columns=in_nodes)
        balances = np.log(_table) @ basis
        var = balances.var(axis=0).sort_values(ascending=False)
        clades = var.index[:top_k_var]
        balances = balances[clades]
        basis = basis[clades]
    else:
        clades = clades[0].split(',')
        balances, basis = _fast_ilr(_tree, _table, clades,
                                    pseudocount=pseudocount)
        var = balances.var(axis=0).sort_values(ascending=False)

    balances.index.name = 'sampleid'
    # feature metadata
    eigvals = var
    prop = var[clades] / var.sum()
    balances = OrdinationResults(
        short_method_name='ILR',
        long_method_name='Phylogenetic Isometric Log Ratio Transform',
        samples=balances,
        features=pd.DataFrame(np.eye(len(clades)), index=clades),
        eigvals=eigvals,
        proportion_explained=prop)
    basis.index.name = 'featureid'
    return balances, _tree, basis
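
A hedged call sketch (table and tree are assumed inputs: a pandas feature
table whose column names match the tree's tip names, as gneiss-style
match_tips requires):

ordination, pruned_tree, basis = ilr_phylogenetic_ordination(
    table, tree, pseudocount=0.5, top_k_var=10)
print(ordination.samples.shape)  # samples by the top-variance balances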
Example #19
def paired_omics(
        microbes: biom.Table,
        metabolites: biom.Table,
        metadata: Metadata = None,
        training_column: str = None,
        num_testing_examples: int = 5,
        min_feature_count: int = 10,
        epochs: int = 100,
        batch_size: int = 50,
        latent_dim: int = 3,
        input_prior: float = 1,
        output_prior: float = 1,
        learning_rate: float = 0.001,
        summary_interval: int = 60) -> (pd.DataFrame, OrdinationResults):

    if metadata is not None:
        metadata = metadata.to_dataframe()

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    res = split_tables(microbes,
                       metabolites,
                       metadata=metadata,
                       training_column=training_column,
                       num_test=num_testing_examples,
                       min_samples=min_feature_count)

    (train_microbes_df, test_microbes_df, train_metabolites_df,
     test_metabolites_df) = res

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        model = MMvec(latent_dim=latent_dim,
                      u_scale=input_prior,
                      v_scale=output_prior,
                      learning_rate=learning_rate)
        model(session, train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)

        loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval)

        U, V = model.U, model.V

        U_ = np.hstack((np.ones(
            (model.U.shape[0], 1)), model.Ubias.reshape(-1, 1), U))
        V_ = np.vstack(
            (model.Vbias.reshape(1, -1), np.ones((1, model.V.shape[1])), V))

        ranks = pd.DataFrame(np.hstack((np.zeros(
            (model.U.shape[0], 1)), U_ @ V_)),
                             index=train_microbes_df.columns,
                             columns=train_metabolites_df.columns)

        ranks = ranks - ranks.mean(axis=1).values.reshape(-1, 1)
        ranks = ranks - ranks.mean(axis=0)
        u, s, v = svds(ranks, k=latent_dim)
        s = s[::-1]
        u = u[:, ::-1]
        v = v[::-1, :]
        microbe_embed = u @ np.diag(s)
        metabolite_embed = v.T

        pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
        features = pd.DataFrame(microbe_embed,
                                columns=pc_ids,
                                index=train_microbes_df.columns)
        samples = pd.DataFrame(metabolite_embed,
                               columns=pc_ids,
                               index=train_metabolites_df.columns)
        short_method_name = 'mmvec biplot'
        long_method_name = 'Multiomics mmvec biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
        biplot = OrdinationResults(short_method_name,
                                   long_method_name,
                                   eigvals,
                                   samples=samples,
                                   features=features,
                                   proportion_explained=proportion_explained)

        return ranks, biplot
Example #20
def scatterplot(df, x=None, y=None, z=None, remote=True):
    """Create an Emperor scatter plot from a Pandas DataFrame

    Parameters
    ----------
    df : pd.DataFrame
        Pandas DataFrame with the data to display, this includes both
        *metadata* and *coordinates* to position the samples in a 3D space.
    x, y, z : str, optional
        Column names in `df`, to use as first (``x``), second (``y``) and third
        (``z``) axes in the visualization. If these are not specified, axes
        are chosen in order of decreasing variance.
    remote : bool, optional
        Whether the JavaScript resources should be loaded locally or from
        GitHub. Defaults to ``True``.

    Returns
    -------
    emperor.core.Emperor
        Emperor object with the numerical data as the `ordination` attribute
        and the entire DataFrame as the `mf` attribute.

    Raises
    ------
    ValueError
        If `df` is not a pandas DataFrame.
        If `x`, `y` or `z` are missing from `df` or if they are not numeric
        columns.
        If after removing rows with missing data there are fewer than 3
        samples.

    Notes
    -----
    If a row has missing data, that data point will be removed from the
    visualization.

    See Also
    --------
    emperor.core.Emperor
    """

    if not isinstance(df, pd.DataFrame):
        raise ValueError("The argument is not a Pandas DataFrame")

    for col in [z, y, x]:
        if col is None:
            continue

        if col not in df.columns:
            raise ValueError("'%s' is not a column in the DataFrame" % col)

        if not np.issubdtype(df[col].dtype, np.number):
            raise ValueError("'%s' is not a numeric column" % col)

    # remove NAs
    samples = df.select_dtypes(include=[np.number]).copy()
    samples.dropna(axis=0, how='any', inplace=True)

    if len(samples.columns) < 3:
        raise ValueError("Not enough data to plot")

    # sort columns by variance
    variance = samples.var().sort_values(ascending=False)
    samples = samples[variance.index]

    # re-order x, y and z
    ordered = samples.columns.tolist()
    for col in [z, y, x]:
        if col is not None:
            ordered.remove(col)
            ordered = [col] + ordered
    samples = samples[ordered]

    # match up the metadata and coordinates
    df = df.loc[samples.index]

    ores = OrdinationResults(short_method_name='',
                             long_method_name='',
                             eigvals=np.zeros_like(samples.columns),
                             samples=samples,
                             proportion_explained=variance)

    df.index.name = '#SampleID'

    # HACK: scale the position of the samples to fit better within the screen
    ores.samples = ores.samples / ores.samples.max(axis=0)

    return Emperor(ores,
                   df,
                   dimensions=len(ores.samples.columns),
                   remote=remote)
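
A hedged usage sketch (toy DataFrame mixing coordinates and metadata;
rendering and saving follow the standalone Emperor example further below):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(5, 3),
                  columns=['Axis1', 'Axis2', 'Axis3'],
                  index=['s%d' % i for i in range(5)])
df['group'] = ['a', 'a', 'b', 'b', 'a']  # non-numeric column -> metadata
viz = scatterplot(df, x='Axis1', y='Axis2', z='Axis3')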
Example #21
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = DEFAULTS["training-column"],
                num_random_test_examples: int = (
                    DEFAULTS["num-random-test-examples"]
                ),
                epochs: int = DEFAULTS["epochs"],
                batch_size: int = DEFAULTS["batch-size"],
                differential_prior: float = DEFAULTS["differential-prior"],
                learning_rate: float = DEFAULTS["learning-rate"],
                clipnorm: float = DEFAULTS["clipnorm"],
                min_sample_count: int = DEFAULTS["min-sample-count"],
                min_feature_count: int = DEFAULTS["min-feature-count"],
                summary_interval: int = DEFAULTS["summary-interval"],
                random_seed: int = DEFAULTS["random-seed"],
                ) -> (
                    pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults
                ):

    # load metadata and tables
    metadata = metadata.to_dataframe()
    # match them
    table, metadata, design = match_and_filter(
        table, metadata,
        formula, min_sample_count, min_feature_count
    )

    # convert to dense representation
    # dense pandas DataFrame (SparseDataFrame.to_dense was removed in pandas)
    dense_table = table.to_dataframe(dense=True).T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples,
        seed=random_seed,
    )

    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=differential_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        tf.set_random_seed(random_seed)
        model(session, trainX, trainY, testX, testY)

        loss, cv, its = model.fit(
            epochs=epochs,
            summary_interval=summary_interval,
            checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    beta_ = np.hstack((np.zeros((model.p, 1)), model.B))
    beta_ = beta_ - beta_.mean(axis=1).reshape(-1, 1)

    differentials = pd.DataFrame(
        beta_.T, columns=md_ids, index=obs_ids,
    )
    differentials.index.name = 'featureid'

    convergence_stats = pd.DataFrame(
        {
            'loss': loss,
            'cross-validation': cv,
            'iteration': its
        }
    )

    convergence_stats.index.name = 'id'
    # plain builtins: the np.str/np.float/np.int aliases were removed
    # from NumPy (1.24+)
    convergence_stats.index = convergence_stats.index.astype(str)

    c = convergence_stats['loss'].astype(float)
    convergence_stats['loss'] = c

    c = convergence_stats['cross-validation'].astype(float)
    convergence_stats['cross-validation'] = c

    c = convergence_stats['iteration'].astype(int)
    convergence_stats['iteration'] = c

    # regression biplot
    if differentials.shape[-1] > 1:
        u, s, v = np.linalg.svd(differentials)
        pc_ids = ['PC%d' % i for i in range(len(s))]
        samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                               columns=pc_ids, index=differentials.index)
        features = pd.DataFrame(v.T[:, :len(s)],
                                columns=pc_ids, index=differentials.columns)
        short_method_name = 'regression_biplot'
        long_method_name = 'Multinomial regression biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = eigvals**2 / (eigvals**2).sum()
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)
    else:
        # this is to handle the edge case with only intercepts
        biplot = OrdinationResults('', '', pd.Series(dtype=float),
                                   pd.DataFrame())

    return differentials, qiime2.Metadata(convergence_stats), biplot
Example #22
# Assumed setup for this excerpt (not shown in the original snippet; the
# names N, coords_ids and categories are hypothetical stand-ins):
from random import choice
from string import ascii_letters

import numpy as np
import pandas as pd
from skbio import OrdinationResults

from emperor import Emperor
from emperor.util import get_emperor_support_files_dir

N = 10
coords_ids = ['Sample%d' % i for i in range(N)]
categories = ['cat-a', 'cat-b', 'cat-c']

coords = (np.random.randn(N, 10) * 1000).tolist()
pct_var = pd.Series(1/np.exp(np.arange(10)))
pct_var = pct_var / pct_var.sum()


md_headers = ['SampleID', 'DOB', 'Strings']
metadata = []
for _id in coords_ids:
    # random.sample() no longer accepts sets (Python 3.11+); pick directly
    metadata.append([_id, choice(sorted(categories)),
                     ''.join(choice(ascii_letters) for x in range(10))])

samples = pd.DataFrame(index=coords_ids, data=coords)

mf = pd.DataFrame(data=metadata, columns=md_headers)
mf.set_index('SampleID', inplace=True)

minerals = ['rhodium', 'platinum', 'gold', 'ruthenium']
mf['subject'] = np.random.randint(low=0, high=len(minerals), size=N)

mf['subject'] = mf['subject'].apply(lambda x: minerals[x])

res = OrdinationResults(short_method_name='PC', long_method_name='Principal '
                        'Coordinates Analysis', eigvals=pct_var,
                        samples=samples, proportion_explained=pct_var)


viz = Emperor(res, mf, remote=get_emperor_support_files_dir())

with open('new-emperor.html', 'w') as f:
    f.write(viz.make_emperor(standalone=True))
Example #23
def ctf_helper(
    table: biom.Table,
    sample_metadata: DataFrame,
    individual_id_column: str,
    state_columns: list,
    n_components: int = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations_als: int = DEFAULT_MAXITER,
    max_iterations_rptm: int = DEFAULT_MAXITER,
    n_initializations: int = DEFAULT_MAXITER,
    feature_metadata: DataFrame = DEFFM
) -> (dict, OrdinationResults, dict, tuple):
    """ Runs  Compositional Tensor Factorization CTF.
    """

    # validate the metadata using q2 as a wrapper
    if sample_metadata is not None and not isinstance(sample_metadata,
                                                      DataFrame):
        sample_metadata = sample_metadata.to_dataframe()
    keep_cols = state_columns + [individual_id_column]
    all_sample_metadata = sample_metadata.drop(keep_cols, axis=1)
    sample_metadata = sample_metadata[keep_cols]
    # validate the metadata using q2 as a wrapper
    if feature_metadata is not None and not isinstance(feature_metadata,
                                                       DataFrame):
        feature_metadata = feature_metadata.to_dataframe()
    # match the data (borrowed in part from gneiss.util.match)
    subtablefids = table.ids('observation')
    subtablesids = table.ids('sample')
    if len(subtablesids) != len(set(subtablesids)):
        raise ValueError('Data-table contains duplicate sample IDs')
    if len(subtablefids) != len(set(subtablefids)):
        raise ValueError('Data-table contains duplicate feature IDs')
    submetadataids = set(sample_metadata.index)
    subtablesids = set(subtablesids)
    subtablefids = set(subtablefids)
    if feature_metadata is not None:
        submetadatafeat = set(feature_metadata.index)
        fidx = subtablefids & submetadatafeat
        if len(fidx) == 0:
            raise ValueError(("No more features left.  Check to make "
                              "sure that the sample names between "
                              "`feature-metadata` and `table` are "
                              "consistent"))
        feature_metadata = feature_metadata.reindex(fidx)
    sidx = subtablesids & submetadataids
    if len(sidx) == 0:
        raise ValueError(("No more features left.  Check to make sure that "
                          "the sample names between `sample-metadata` and"
                          " `table` are consistent"))
    if feature_metadata is not None:
        table.filter(list(fidx), axis='observation', inplace=True)
    table.filter(list(sidx), axis='sample', inplace=True)
    sample_metadata = sample_metadata.reindex(sidx)

    # filter and import table
    for axis, min_sum in zip(['sample', 'observation'],
                             [min_sample_count, min_feature_count]):
        table = table.filter(table.ids(axis)[table.sum(axis) >= min_sum],
                             axis=axis,
                             inplace=True)

    # table to dataframe
    table = DataFrame(table.matrix_data.toarray(), table.ids('observation'),
                      table.ids('sample'))

    # tensor building
    tensor = build()
    tensor.construct(table, sample_metadata, individual_id_column,
                     state_columns)

    # factorize
    TF = TensorFactorization(n_components=n_components,
                             max_als_iterations=max_iterations_als,
                             max_rtpm_iterations=max_iterations_rptm,
                             n_initializations=n_initializations).fit(
                                 rclr(tensor.counts))
    # label tensor loadings
    TF.label(tensor, taxonomy=feature_metadata)

    # if n_components is two, add a PC3 of zeros
    # (works around the Emperor issue referenced in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>)
    if n_components == 2:
        TF.subjects.loc[:, 'PC3'] = [0] * len(TF.subjects.index)
        TF.features.loc[:, 'PC3'] = [0] * len(TF.features.index)
        TF.proportion_explained['PC3'] = 0
        TF.eigvals['PC3'] = 0

    # save ordination results
    short_method_name = 'CTF_Biplot'
    long_method_name = 'Compositional Tensor Factorization Biplot'
    # only keep PC -- other tools merge metadata
    keep_PC = [col for col in TF.features.columns if 'PC' in col]
    subj_ordin = OrdinationResults(
        short_method_name,
        long_method_name,
        TF.eigvals,
        samples=TF.subjects[keep_PC].dropna(axis=0),
        features=TF.features[keep_PC].dropna(axis=0),
        proportion_explained=TF.proportion_explained)
    # save distance matrix for each condition
    distances = {}
    state_ordn = {}
    subject_trajectories = {}
    feature_trajectories = {}
    for condition, cond, dist, straj, ftraj in zip(tensor.conditions,
                                                   TF.conditions,
                                                   TF.subject_distances,
                                                   TF.subject_trajectory,
                                                   TF.feature_trajectory):
        # match distances to metadata
        ids = straj.index
        ind_dict = dict((ind, ind_i) for ind_i, ind in enumerate(ids))
        inter = set(ind_dict).intersection(sample_metadata.index)
        indices = sorted([ind_dict[ind] for ind in inter])
        dist = dist[indices, :][:, indices]
        distances[condition] = skbio.stats.distance.DistanceMatrix(
            dist, ids=ids[indices])
        # fix conditions
        if n_components == 2:
            cond['PC3'] = [0] * len(cond.index)
        cond = OrdinationResults(short_method_name,
                                 long_method_name,
                                 TF.eigvals,
                                 samples=cond[keep_PC].dropna(axis=0),
                                 features=TF.features[keep_PC].dropna(axis=0),
                                 proportion_explained=TF.proportion_explained)
        state_ordn[condition] = cond
        # add the sample metadata before returning output
        # additionally only keep metadata with trajectory
        # output available.
        pre_merge_cols = list(straj.columns)
        straj = concat(
            [straj.reindex(all_sample_metadata.index), all_sample_metadata],
            axis=1,
            sort=True)
        straj = straj.dropna(subset=pre_merge_cols)
        # ensure index name for q2
        straj.index.name = "#SampleID"
        # save traj.
        keep_PC_traj = [col for col in straj.columns if 'PC' in col]
        straj[keep_PC_traj] -= straj[keep_PC_traj].mean()
        ftraj[keep_PC_traj] -= ftraj[keep_PC_traj].mean()
        subject_trajectories[condition] = straj
        ftraj.index = ftraj.index.astype(str)
        feature_trajectories[condition] = ftraj
    return (state_ordn, subj_ordin, distances, subject_trajectories,
            feature_trajectories)
Example #24
def paired_omics(microbes: biom.Table,
                 metabolites: biom.Table,
                 metadata: Metadata = None,
                 training_column: str = None,
                 num_testing_examples: int = 5,
                 min_feature_count: int = 10,
                 epochs: int = 100,
                 batch_size: int = 50,
                 latent_dim: int = 3,
                 input_prior: float = 1,
                 output_prior: float = 1,
                 learning_rate: float = 1e-3,
                 equalize_biplot: bool = False,
                 arm_the_gpu: bool = False,
                 summary_interval: int = 60) -> (
                     pd.DataFrame, OrdinationResults, qiime2.Metadata
                 ):

    if metadata is not None:
        metadata = metadata.to_dataframe()

    if arm_the_gpu:
        # pick out the first GPU
        device_name = '/device:GPU:0'
    else:
        device_name = '/cpu:0'

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    res = split_tables(
        microbes, metabolites,
        metadata=metadata, training_column=training_column,
        num_test=num_testing_examples,
        min_samples=min_feature_count)

    (train_microbes_df, test_microbes_df,
     train_metabolites_df, test_metabolites_df) = res

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        model = MMvec(
            latent_dim=latent_dim,
            u_scale=input_prior, v_scale=output_prior,
            batch_size=batch_size,
            device_name=device_name,
            learning_rate=learning_rate)
        model(session,
              train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)

        loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval)
        ranks = pd.DataFrame(model.ranks(), index=train_microbes_df.columns,
                             columns=train_metabolites_df.columns)
        if latent_dim > 0:
            u, s, v = svds(ranks - ranks.mean(axis=0), k=latent_dim)
        else:
            # fake it until you make it
            u, s, v = svds(ranks - ranks.mean(axis=0), k=1)

        ranks = ranks.T
        ranks.index.name = 'featureid'
        s = s[::-1]
        u = u[:, ::-1]
        v = v[::-1, :]
        if equalize_biplot:
            microbe_embed = u @ np.sqrt(np.diag(s))
            metabolite_embed = v.T @ np.sqrt(np.diag(s))
        else:
            microbe_embed = u @ np.diag(s)
            metabolite_embed = v.T

        pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
        features = pd.DataFrame(
            microbe_embed, columns=pc_ids,
            index=train_microbes_df.columns)
        samples = pd.DataFrame(
            metabolite_embed, columns=pc_ids,
            index=train_metabolites_df.columns)
        short_method_name = 'mmvec biplot'
        long_method_name = 'Multiomics mmvec biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)

        its = np.arange(len(loss))
        convergence_stats = pd.DataFrame(
            {
                'loss': loss,
                'cross-validation': cv,
                'iteration': its
            }
        )

        convergence_stats.index.name = 'id'
        # builtins instead of the removed np.str/np.float/np.int aliases
        convergence_stats.index = convergence_stats.index.astype(str)

        c = convergence_stats['loss'].astype(float)
        convergence_stats['loss'] = c

        c = convergence_stats['cross-validation'].astype(float)
        convergence_stats['cross-validation'] = c

        c = convergence_stats['iteration'].astype(int)
        convergence_stats['iteration'] = c

        return ranks, biplot, qiime2.Metadata(convergence_stats)
Example #25
    def setUp(self):
        super(OrdinationResultsReaderWriterTests, self).setUp()

        # define in-memory results, one for each of the valid files in
        # self.valid_fps

        # CA results
        axes_ids = ['CA1', 'CA2']
        species_ids = ['Species1', 'Species2', 'Species3']
        site_ids = ['Site1', 'Site2', 'Site3']
        eigvals = pd.Series([0.0961330159181, 0.0409418140138], axes_ids)
        species = pd.DataFrame([[0.408869425742, 0.0695518116298],
                                [-0.1153860437, -0.299767683538],
                                [-0.309967102571, 0.187391917117]],
                               index=species_ids,
                               columns=axes_ids)
        site = pd.DataFrame([[-0.848956053187, 0.882764759014],
                             [-0.220458650578, -1.34482000302],
                             [1.66697179591, 0.470324389808]],
                            index=site_ids,
                            columns=axes_ids)
        biplot = None
        site_constraints = None
        prop_explained = None
        ca_scores = OrdinationResults('CA',
                                      'Correspondence Analysis',
                                      eigvals=eigvals,
                                      features=species,
                                      samples=site,
                                      biplot_scores=biplot,
                                      sample_constraints=site_constraints,
                                      proportion_explained=prop_explained)

        # CCA results
        axes_ids = ['CCA%d' % i for i in range(1, 10)]
        species_ids = [
            'Species0', 'Species1', 'Species2', 'Species3', 'Species4',
            'Species5', 'Species6', 'Species7', 'Species8'
        ]
        site_ids = [
            'Site0', 'Site1', 'Site2', 'Site3', 'Site4', 'Site5', 'Site6',
            'Site7', 'Site8', 'Site9'
        ]

        eigvals = pd.Series([
            0.366135830393, 0.186887643052, 0.0788466514249, 0.082287840501,
            0.0351348475787, 0.0233265839374, 0.0099048981912,
            0.00122461669234, 0.000417454724117
        ], axes_ids)
        species = pd.DataFrame(np.loadtxt(
            get_data_path('ordination_exp_Ordination_CCA_species')),
                               index=species_ids,
                               columns=axes_ids)
        site = pd.DataFrame(np.loadtxt(
            get_data_path('ordination_exp_Ordination_CCA_site')),
                            index=site_ids,
                            columns=axes_ids)
        biplot = pd.DataFrame(
            [[-0.169746767979, 0.63069090084, 0.760769036049],
             [-0.994016563505, 0.0609533148724, -0.0449369418179],
             [0.184352565909, -0.974867543612, 0.0309865007541]],
            columns=axes_ids[:3])
        site_constraints = pd.DataFrame(np.loadtxt(
            get_data_path('ordination_exp_Ordination_CCA_site_constraints')),
                                        index=site_ids,
                                        columns=axes_ids)
        prop_explained = None
        cca_scores = OrdinationResults('CCA',
                                       'Canonical Correspondence Analysis',
                                       eigvals=eigvals,
                                       features=species,
                                       samples=site,
                                       biplot_scores=biplot,
                                       sample_constraints=site_constraints,
                                       proportion_explained=prop_explained)

        # PCoA results
        axes_ids = ['PC%d' % i for i in range(1, 10)]
        species_ids = None
        site_ids = [
            'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
            'PC.355', 'PC.607', 'PC.634'
        ]
        eigvals = pd.Series([
            0.512367260461, 0.300719094427, 0.267912066004, 0.208988681078,
            0.19169895326, 0.16054234528, 0.15017695712, 0.122457748167, 0.0
        ], axes_ids)
        species = None
        site = pd.DataFrame(np.loadtxt(
            get_data_path('ordination_exp_Ordination_PCoA_site')),
                            index=site_ids,
                            columns=axes_ids)
        biplot = None
        site_constraints = None
        prop_explained = pd.Series([
            0.267573832777, 0.15704469605, 0.139911863774, 0.109140272454,
            0.100111048503, 0.0838401161912, 0.0784269939011, 0.0639511763509,
            0.0
        ], axes_ids)
        pcoa_scores = OrdinationResults('PCoA',
                                        'Principal Coordinate Analysis',
                                        eigvals=eigvals,
                                        features=species,
                                        samples=site,
                                        biplot_scores=biplot,
                                        sample_constraints=site_constraints,
                                        proportion_explained=prop_explained)

        # RDA results
        axes_ids = ['RDA%d' % i for i in range(1, 8)]
        species_ids = [
            'Species0', 'Species1', 'Species2', 'Species3', 'Species4',
            'Species5'
        ]
        site_ids = [
            'Site0', 'Site1', 'Site2', 'Site3', 'Site4', 'Site5', 'Site6',
            'Site7', 'Site8', 'Site9'
        ]
        eigvals = pd.Series([
            25.8979540892, 14.9825779819, 8.93784077262, 6.13995623072,
            1.68070536498, 0.57735026919, 0.275983624351
        ], axes_ids)
        species = pd.DataFrame(np.loadtxt(
            get_data_path('ordination_exp_Ordination_RDA_species')),
                               index=species_ids,
                               columns=axes_ids)
        site = pd.DataFrame(np.loadtxt(
            get_data_path('ordination_exp_Ordination_RDA_site')),
                            index=site_ids,
                            columns=axes_ids)
        biplot = pd.DataFrame(
            [[0.422650019179, -0.559142585857, -0.713250678211],
             [0.988495963777, 0.150787422017, -0.0117848614073],
             [-0.556516618887, 0.817599992718, 0.147714267459],
             [-0.404079676685, -0.9058434809, -0.127150316558]],
            columns=axes_ids[:3])
        site_constraints = pd.DataFrame(np.loadtxt(
            get_data_path('ordination_exp_Ordination_RDA_site_constraints')),
                                        index=site_ids,
                                        columns=axes_ids)
        prop_explained = None
        rda_scores = OrdinationResults('RDA',
                                       'Redundancy Analysis',
                                       eigvals=eigvals,
                                       features=species,
                                       samples=site,
                                       biplot_scores=biplot,
                                       sample_constraints=site_constraints,
                                       proportion_explained=prop_explained)

        self.ordination_results_objs = [
            ca_scores, cca_scores, pcoa_scores, rda_scores
        ]
Example #26
0
def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
    """Perform Principal Coordinate Analysis using a given algorithm to do so.

    Adapted from scikit-bio.

    Principal Coordinate Analysis (PCoA) is a method similar to PCA
    that works from distance matrices, and so it can be used with
    ecologically meaningful distances like UniFrac for bacteria.

    In ecology, the euclidean distance preserved by Principal
    Component Analysis (PCA) is often not a good choice because it
    deals poorly with double zeros: species have unimodal
    distributions along environmental gradients, so if a species is
    absent from two sites, it can't be known whether an environmental
    variable is too high in one of them and too low in the other, or
    too low in both, etc. On the other hand, if a species is present
    in two sites, that means the sites are similar.

    Parameters
    ----------
    distance_matrix : DistanceMatrix
        A distance matrix.
    algorithm : Algorithm
        Algorithm to use to decompose the matrix into eigenvectors and
        eigenvalues.
    num_dimensions_out : int, optional
        Number of dimensions k to return: selects the k eigenvectors
        corresponding to the k largest eigenvalues.

    Returns
    -------
    OrdinationResults
        Object that stores the PCoA results, including eigenvalues, the
        proportion explained by each of them, and transformed sample
        coordinates.

    See Also
    --------
    OrdinationResults

    Notes
    -----
    PCoA is sometimes known as metric multidimensional scaling or
    classical scaling.

    .. note::
       If the distance is not euclidean (for example if it is a
       semimetric and the triangle inequality doesn't hold),
       negative eigenvalues can appear. There are different ways
       to deal with that problem (see Legendre & Legendre 1998,
       section 9.2.3), but none are currently implemented here.
       However, a warning is raised whenever negative eigenvalues
       appear, allowing the user to decide if they can be safely
       ignored.
    """
    if algorithm is None or not isinstance(algorithm, Algorithm):
        raise ValueError('Must specify algorithm and ensure it is a subclass'
                         ' of Algorithm.')

    # If distance_matrix is a raw numpy array representing a matrix,
    # coerce it to a scikit-bio DistanceMatrix object
    if not isinstance(distance_matrix, DistanceMatrix):
        distance_matrix = DistanceMatrix(distance_matrix)

    # Implemented as per algorithm outlined in
    # Numerical Ecology (Legendre & Legendre 1998)
    # See Chapter 9, Equation 9.20
    E_matrix = e_matrix(distance_matrix.data)

    # FYI: if the distance used was euclidean, the pairwise distances
    # wouldn't need to be computed from the data table Y, because
    # F_matrix = Y.dot(Y.T) (provided Y has been centred).
    # But since we're expecting distance_matrix to be non-euclidean,
    # we do the following computation as per
    # Numerical Ecology (Legendre & Legendre 1998)
    # See Chapter 9, Equation 9.21
    # ... which centers the matrix (a requirement for PCoA)
    F_matrix = f_matrix(E_matrix)
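
    # For reference, a sketch of what the e_matrix/f_matrix helpers are
    # assumed to compute (Gower's transformation; Legendre & Legendre 1998,
    # Eqs. 9.20 and 9.21), element-wise on the distance matrix D:
    #   E = -0.5 * D ** 2
    #   F = E - E.mean(axis=1, keepdims=True) \
    #         - E.mean(axis=0, keepdims=True) + E.mean()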

    # Run the given algorithm that decomposes the matrix into eigenvectors
    # and eigenvalues.
    eigenvectors, eigenvalues = algorithm.run(F_matrix, num_dimensions_out)

    # Coerce to numpy array just in case
    eigenvectors = np.array(eigenvectors)
    eigenvalues = np.array(eigenvalues)

    # Ensure each eigenvector (stored as a column) has unit norm; the
    # later scaling by sqrt(eigenvalue) relies on this
    eigenvectors = np.apply_along_axis(lambda vec: vec / np.linalg.norm(vec),
                                       axis=0,
                                       arr=eigenvectors)

    # Generate axis labels for output, one per returned eigenvector (column)
    axis_labels = ['PC%d' % i for i in range(1, eigenvectors.shape[1] + 1)]

    # Some algorithms do not return eigenvalues. Thus, we cannot compute
    # the array of proportion of variance explained and we cannot sort the
    # eigenvectors by their corresponding eigenvalues.
    if np.all(np.isnan(eigenvalues)):
        # Only return an OrdinationResults object wrapping the result's
        # eigenvectors. Leave the eigenvalues as NaNs.

        # TODO: Nystrom and SCMDS do not return
        # num_dimensions_out number of eigenvectors
        # Figure out if we need to throw away eigenvectors here
        # or if that's the intended behavior.

        return OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            samples=pd.DataFrame(eigenvectors,
                                 index=distance_matrix.ids,
                                 columns=axis_labels),
            eigvals=pd.Series(eigenvalues))
    else:
        # cogent makes eigenvalues positive by taking the
        # abs value, but that doesn't seem to be an approach accepted
        # by Legendre & Legendre to deal with negative eigenvalues.
        # We raise a warning in that case.

        # First, we coerce values close to 0 to equal 0.
        indices_close_to_zero = np.isclose(eigenvalues,
                                           np.zeros(eigenvalues.shape))
        eigenvalues[indices_close_to_zero] = 0

        if np.any(eigenvalues < 0):
            warn(
                "The result contains negative eigenvalues."
                " Please compare their magnitude with the magnitude of some"
                " of the largest positive eigenvalues. If the negative ones"
                " are smaller, it's probably safe to ignore them, but if they"
                " are large in magnitude, the results won't be useful. See the"
                " Notes section for more details. The smallest eigenvalue is"
                " {0} and the largest is {1}.".format(eigenvalues.min(),
                                                      eigenvalues.max()),
                RuntimeWarning)

        # eigvals might not be ordered, so we order them (at least one
        # is zero).
        indices_descending = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[indices_descending]
        # Sort eigenvectors in correspondence with the eigenvalues' order
        eigenvectors = eigenvectors[:, indices_descending]

        # Note that at least one eigenvalue is zero because only n-1
        # axes are needed to represent n points in a euclidean space.

        # If we returned only the coordinates that make sense (i.e., those
        # with a corresponding positive eigenvalue), then Jackknifed Beta
        # Diversity would break, as it expects all OrdinationResults to have
        # the same number of coordinates. To work around this, we zero out
        # the coordinates associated with negative eigenvalues:
        num_positive = (eigenvalues >= 0).sum()
        eigenvectors[:, num_positive:] = np.zeros(
            eigenvectors[:, num_positive:].shape)
        eigenvalues[num_positive:] = np.zeros(eigenvalues[num_positive:].shape)

        # Scale each eigenvector to have length sqrt(eigenvalue). This
        # works because the eigenvectors were normalized above.
        eigenvectors = eigenvectors * np.sqrt(eigenvalues)

        # Now remove the dimensions with the least information: keep only
        # the first k (num_dimensions_out) eigenvectors and their
        # corresponding eigenvalues from the sorted arrays

        if len(eigenvalues) > num_dimensions_out:
            eigenvectors = eigenvectors[:, :num_dimensions_out]
            eigenvalues = eigenvalues[:num_dimensions_out]

        axis_labels = axis_labels[:num_dimensions_out]

        # Calculate the array of proportion of variance explained
        proportion_explained = eigenvalues / eigenvalues.sum()

        return OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(eigenvalues, index=axis_labels),
            samples=pd.DataFrame(eigenvectors,
                                 index=distance_matrix.ids,
                                 columns=axis_labels),
            proportion_explained=pd.Series(proportion_explained,
                                           index=axis_labels))
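
# A minimal usage sketch for the pcoa() above. EighAlgorithm is hypothetical:
# it assumes the Algorithm interface exposes a run(matrix, k) method that
# returns (eigenvectors, eigenvalues) with the eigenvectors stored in columns.
import numpy as np
from skbio import DistanceMatrix


class EighAlgorithm(Algorithm):
    def run(self, matrix, num_dimensions_out):
        # np.linalg.eigh returns eigenvalues in ascending order for a
        # symmetric matrix; reverse to descending before truncating to k
        eigenvalues, eigenvectors = np.linalg.eigh(matrix)
        order = np.argsort(eigenvalues)[::-1][:num_dimensions_out]
        return eigenvectors[:, order], eigenvalues[order]


dm = DistanceMatrix([[0.0, 1.0, 2.0],
                     [1.0, 0.0, 1.5],
                     [2.0, 1.5, 0.0]],
                    ids=['A', 'B', 'C'])
ordination = pcoa(dm, EighAlgorithm(), num_dimensions_out=2)
print(ordination.proportion_explained)
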
Example #27
0
    def test_assert_ordination_results_equal(self):
        minimal1 = OrdinationResults('foo', 'bar', pd.Series([1.0, 2.0]),
                                     pd.DataFrame([[1, 2, 3], [4, 5, 6]]))

        # a minimal set of results should be equal to itself
        assert_ordination_results_equal(minimal1, minimal1)

        # type mismatch
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, 'foo')

        # numeric values should be checked that they're almost equal
        almost_minimal1 = OrdinationResults(
            'foo', 'bar',
            pd.Series([1.0000001, 1.9999999]),
            pd.DataFrame([[1, 2, 3], [4, 5, 6]]))
        assert_ordination_results_equal(minimal1, almost_minimal1)

        # test each of the optional numeric attributes
        for attr in ('features', 'samples', 'biplot_scores',
                     'sample_constraints'):
            # missing optional numeric attribute in one, present in the other
            setattr(almost_minimal1, attr, pd.DataFrame([[1, 2], [3, 4]]))
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(minimal1, almost_minimal1)
            setattr(almost_minimal1, attr, None)

            # optional numeric attributes present in both, but not almost equal
            setattr(minimal1, attr, pd.DataFrame([[1, 2], [3, 4]]))
            setattr(almost_minimal1, attr, pd.DataFrame([[1, 2],
                                                         [3.00002, 4]]))
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(minimal1, almost_minimal1)
            setattr(minimal1, attr, None)
            setattr(almost_minimal1, attr, None)

            # optional numeric attributes present in both, and almost equal
            setattr(minimal1, attr, pd.DataFrame([[1.0, 2.0], [3.0, 4.0]]))
            setattr(almost_minimal1, attr,
                    pd.DataFrame([[1.0, 2.0], [3.00000002, 4]]))
            assert_ordination_results_equal(minimal1, almost_minimal1)
            setattr(minimal1, attr, None)
            setattr(almost_minimal1, attr, None)

        # missing optional numeric attribute in one, present in the other
        almost_minimal1.proportion_explained = pd.Series([1, 2, 3])
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, almost_minimal1)
        almost_minimal1.proportion_explained = None

        # optional numeric attributes present in both, but not almost equal
        minimal1.proportion_explained = pd.Series([1, 2, 3])
        almost_minimal1.proportion_explained = pd.Series([1, 2, 3.00002])
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, almost_minimal1)
        minimal1.proportion_explained = None
        almost_minimal1.proportion_explained = None

        # optional numeric attributes present in both, and almost equal
        minimal1.proportion_explained = pd.Series([1, 2, 3])
        almost_minimal1.proportion_explained = pd.Series([1, 2, 3.00000002])
        assert_ordination_results_equal(minimal1, almost_minimal1)
        minimal1.proportion_explained = None
        almost_minimal1.proportion_explained = None

    def test_biplot_score(self):
        rda_ = rda(y=self.Y, x=self.X, scale_Y=False, scaling=1)

        # Load data as computed with vegan 2.4-3:
        # library(vegan)
        # data(varechem)
        # data(varespec)
        # rda_ = rda(X=varespec, Y=varechem, scale=FALSE)
        # write.table(summary(rda_, scaling=1)$biplot,
        #             'vare_rda_biplot_from_vegan.csv', sep=',')
        # write.table(summary(rda_, scaling=1)$sites,
        #                     'vare_rda_sites_from_vegan.csv', sep=',')
        # write.table(summary(rda_, scaling=1)$species,
        #                     'vare_rda_species_from_vegan.csv', sep=',')
        # write.table(summary(rda_, scaling=1)$constraints,
        #                     'vare_rda_constraints_from_vegan.csv', sep=',')
        # write.table(summary(rda_, scaling=1)$cont$importance[2, ],
        #                     'vare_rda_propexpl_from_vegan.csv', sep=',')
        # write.table(summary(rda_, scaling=1)$cont$importance[1, ],
        #                     'vare_rda_eigvals_from_vegan.csv', sep=',')

        vegan_features = pd.read_csv(
            get_data_path('vare_rda_species_from_vegan.csv'))
        vegan_samples = pd.read_csv(
            get_data_path('vare_rda_sites_from_vegan.csv'))
        vegan_biplot = pd.read_csv(
            get_data_path('vare_rda_biplot_from_vegan.csv'))
        vegan_constraints = pd.read_csv(
            get_data_path('vare_rda_constraints_from_vegan.csv'))
        vegan_propexpl = pd.read_csv(
            get_data_path('vare_rda_propexpl_from_vegan.csv'))
        vegan_propexpl = pd.Series(vegan_propexpl.x.values,
                                   index=rda_.eigvals.index)
        vegan_eigvals = pd.read_csv(
            get_data_path('vare_rda_eigvals_from_vegan.csv'))
        vegan_eigvals = pd.Series(vegan_eigvals.x.values,
                                  index=rda_.eigvals.index)

        # scikit-bio returns singular values, whereas vegan returns eigenvalues
        vegan_eigvals = np.sqrt(vegan_eigvals * vegan_eigvals.shape[0])
        vegan_propexpl = vegan_eigvals / vegan_eigvals.sum()

        # transform the output of rda_ to match column selection of vegan
        res_samples = rda_.samples.iloc[:, 0:6]
        res_features = rda_.features.iloc[:, 0:6]

        rda_ = OrdinationResults(
            'RDA',
            'Redundancy Analysis',
            samples=res_samples,
            features=res_features,
            sample_constraints=rda_.sample_constraints.iloc[:, 0:6],
            biplot_scores=rda_.biplot_scores.iloc[:, 0:6],
            proportion_explained=rda_.proportion_explained,
            eigvals=rda_.eigvals)

        exp = OrdinationResults('RDA',
                                'Redundancy Analysis',
                                samples=vegan_samples,
                                features=vegan_features,
                                sample_constraints=vegan_constraints,
                                biplot_scores=vegan_biplot,
                                proportion_explained=vegan_propexpl,
                                eigvals=vegan_eigvals)

        # This scaling constant is required to make skbio comparable to vegan.
        scaling = (rda_.eigvals[0] / rda_.eigvals[:6])
        exp.biplot_scores *= scaling
        assert_ordination_results_equal(rda_,
                                        exp,
                                        ignore_directionality=False,
                                        decimal=6)
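
# A small numeric check of the identity behind the rescaling above: the
# eigenvalues of Y'Y are the squared singular values of Y, so moving between
# vegan's eigenvalue convention and scikit-bio's singular-value convention
# only requires a square root and the normalization constant. Illustrative
# sketch only, not part of the test suite.
import numpy as np

rng = np.random.default_rng(0)
Y = rng.normal(size=(10, 4))
Y -= Y.mean(axis=0)  # column-centre, as RDA assumes

s = np.linalg.svd(Y, compute_uv=False)   # singular values, descending
lam = np.linalg.eigvalsh(Y.T @ Y)[::-1]  # eigenvalues of Y'Y, descending

np.testing.assert_allclose(s ** 2, lam)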