Beispiel #1
0
def biplot(output_dir: str,
           biplot: skbio.OrdinationResults,
           sample_metadata: qiime2.Metadata,
           feature_metadata: qiime2.Metadata = None,
           ignore_missing_samples: bool = False,
           invert: bool = False,
           number_of_features: int = 5) -> None:
    """Plot an ordination, keeping only its longest feature vectors.

    Parameters
    ----------
    output_dir : str
        Directory handed to ``generic_plot``.
    biplot : skbio.OrdinationResults
        Ordination providing ``samples`` and ``features``. The object is
        modified in place: ``features`` is trimmed to the selected rows, and
        ``samples``/``features`` are exchanged first when ``invert`` is set.
    sample_metadata : qiime2.Metadata
        Passed to ``generic_plot`` as ``metadata``.
    feature_metadata : qiime2.Metadata, optional
        Passed to ``generic_plot`` as ``feature_metadata``.
    ignore_missing_samples : bool
        Forwarded unchanged to ``generic_plot``.
    invert : bool
        When true, samples and features (and their metadata) trade places.
    number_of_features : int
        Number of feature vectors to keep, ranked by magnitude.
    """
    if invert:
        # Exchange the roles of the two axes together with their metadata.
        biplot.samples, biplot.features = biplot.features, biplot.samples
        sample_metadata, feature_metadata = feature_metadata, sample_metadata

    # Rank every feature by the Euclidean distance between its vector and the
    # origin, then retain the ``number_of_features`` longest ones.
    loadings = biplot.features.copy()
    zero_point = np.zeros_like(loadings.columns)
    loadings['importance'] = loadings.apply(euclidean, axis=1,
                                            args=(zero_point, ))
    ranked = loadings.sort_values('importance', ascending=False)
    ranked = ranked.drop(['importance'], axis=1)
    biplot.features = ranked[:number_of_features].copy()

    generic_plot(output_dir,
                 master=biplot,
                 other_pcoa=None,
                 ignore_missing_samples=ignore_missing_samples,
                 metadata=sample_metadata,
                 feature_metadata=feature_metadata,
                 plot_name='biplot')
 def _create_ordination_results(self):
     """Write a small PCoA ordination to a temporary file.

     The file is created inside ``self.out_dir`` with a ``.txt`` suffix.

     Returns
     -------
     str
         Path of the file the ordination was written to.
     """
     pc_labels = ['PC1', 'PC2', 'PC3', 'PC4']
     ids = [
         '1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184', '1.SKM9.640192',
         '1.SKB7.640196'
     ]
     coordinates = np.asarray([
         [-2.584, 1.739, 3.828, -1.944],
         [-2.710, -1.859, -8.648, 1.180],
         [2.350, 9.625, -3.457, -3.208],
         [2.614, -1.114, 1.476, 2.908],
         [2.850, -1.925, 6.232, 1.381],
     ])
     ordination = OrdinationResults(
         short_method_name='PCoA',
         long_method_name='Principal Coordinate Analysis',
         eigvals=pd.Series(
             [0.51236726, 0.30071909, 0.26791207, 0.20898868],
             index=pc_labels),
         samples=pd.DataFrame(coordinates, index=ids, columns=pc_labels),
         proportion_explained=pd.Series(
             [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725],
             index=pc_labels))

     # mkstemp hands back an open descriptor; release it right away because
     # the ordination is written through the path instead.
     descriptor, path = mkstemp(suffix='.txt', dir=self.out_dir)
     close(descriptor)
     ordination.write(path)
     return path
    def test_standalone_rpca(self):
        """Run the standalone ``rpca`` command and compare its output files.

           This is an integration-style check: the command is invoked on
           ``test.biom`` and the distance matrix and ordination found in the
           data directory are compared against the stored expected files.
        """
        biom_fp = get_data_path('test.biom')
        # The command writes next to the input file.
        out_dir = os_path_sep.join(biom_fp.split(os_path_sep)[:-1])
        cli_args = ['--in-biom', biom_fp, '--output-dir', out_dir]
        result = CliRunner().invoke(sdc.commands['rpca'], cli_args)

        # Observed results
        observed_dist = pd.read_csv(get_data_path('distance-matrix.tsv'),
                                    sep='\t', index_col=0)
        observed_ord = OrdinationResults.read(get_data_path('ordination.txt'))

        # Expected results
        expected_dist = pd.read_csv(
            get_data_path('expected-distance-matrix.tsv'),
            sep='\t', index_col=0)
        expected_ord = OrdinationResults.read(
            get_data_path('expected-ordination.txt'))

        # The distance matrices must agree
        assert_array_almost_equal(observed_dist.values, expected_dist.values)

        # The ordinations must agree, value by value, for both features and
        # samples
        assert_deicode_ordinationresults_equal(observed_ord, expected_ord)

        # Finally, the command must have exited with code 0 (success)
        self.assertEqual(result.exit_code, 0)
Beispiel #4
0
def _pca(ranks_df: pd.DataFrame,
         n_components: int = None) -> (OrdinationResults, OrdinationResults):
    """Fit a PCA on ``ranks_df`` and wrap scores and loadings as ordinations.

    Parameters
    ----------
    ranks_df : pd.DataFrame
        Data to decompose; its index labels the scores and its columns label
        the loadings.
    n_components : int, optional
        Forwarded to ``PCA``.

    Returns
    -------
    (OrdinationResults, OrdinationResults)
        The first holds the transformed rows of ``ranks_df`` as ``samples``;
        the second holds the sign-flipped component loadings, scaled by the
        square root of the explained variance, as ``samples``.
    """
    model = PCA(n_components=n_components)
    model.fit(ranks_df)

    # Row scores, labelled like the input rows.
    scores = pd.DataFrame(model.transform(ranks_df))
    scores.index = ranks_df.index

    # Column loadings, labelled like the input columns.
    loadings = pd.DataFrame(-1 * model.components_.T *
                            np.sqrt(model.explained_variance_))
    loadings.index = ranks_df.columns

    eigenvalues = pd.Series(model.explained_variance_)

    def _as_ordination(coordinates):
        # Both outputs share the eigenvalue Series but each gets its own
        # proportion-explained Series.
        return OrdinationResults(
            short_method_name="PCA",
            long_method_name="Principal Components Analysis",
            eigvals=eigenvalues,
            samples=coordinates,
            features=None,
            biplot_scores=None,
            proportion_explained=pd.Series(model.explained_variance_ratio_))

    return _as_ordination(scores), _as_ordination(loadings)
Beispiel #5
0
def procrustes_analysis(
    reference: OrdinationResults,
    other: OrdinationResults,
    dimensions: int = 5,
    permutations: int = 999
) -> (OrdinationResults, OrdinationResults, pd.DataFrame):
    """Fit two ordinations to each other with Procrustes analysis.

    Neither input is modified: the rows of ``other`` are aligned to
    ``reference`` on a local copy.

    Parameters
    ----------
    reference : OrdinationResults
        Ordination whose sample order and axis labels are used for both
        outputs.
    other : OrdinationResults
        Ordination to fit against ``reference``; must describe the same set
        of samples.
    dimensions : int
        Number of leading axes to use in the fit.
    permutations : int
        Forwarded to ``_procrustes_monte_carlo``.

    Returns
    -------
    (OrdinationResults, OrdinationResults, pd.DataFrame)
        The transformed reference, the transformed other, and the value
        returned by ``_procrustes_monte_carlo``. Sample coordinates,
        eigenvalues and proportions explained are cut to ``dimensions``
        axes; features, biplot scores and sample constraints are passed
        through as they are.

    Raises
    ------
    ValueError
        If the sample matrices differ in shape, if fewer than ``dimensions``
        axes are available, or if the two sample ID sets differ.
    """
    ref_samples = reference.samples

    if ref_samples.shape != other.samples.shape:
        raise ValueError('The matrices cannot be fitted unless they have the '
                         'same dimensions')

    if ref_samples.shape[1] < dimensions:
        raise ValueError('Cannot fit fewer dimensions than available')

    # fail if there are any elements in the symmetric difference
    diff = ref_samples.index.symmetric_difference(other.samples.index)
    if not diff.empty:
        raise ValueError('The ordinations represent two different sets of '
                         'samples')

    # make the matrices comparable; work on a local frame so that the
    # caller's ``other`` object keeps its original row order
    other_samples = other.samples.reindex(index=ref_samples.index)

    ref_mtx = ref_samples.values[:, :dimensions]
    other_mtx = other_samples.values[:, :dimensions]
    mtx1, mtx2, m2 = procrustes(ref_mtx, other_mtx)

    axes = ref_samples.columns[:dimensions]
    samples1 = pd.DataFrame(data=mtx1,
                            index=ref_samples.index.copy(),
                            columns=axes.copy())
    samples2 = pd.DataFrame(data=mtx2,
                            index=ref_samples.index.copy(),
                            columns=axes.copy())

    info = _procrustes_monte_carlo(ref_mtx, other_mtx, m2, permutations)

    out1 = OrdinationResults(
        short_method_name=reference.short_method_name,
        long_method_name=reference.long_method_name,
        eigvals=reference.eigvals[:dimensions].copy(),
        samples=samples1,
        features=reference.features,
        biplot_scores=reference.biplot_scores,
        sample_constraints=reference.sample_constraints,
        proportion_explained=reference.proportion_explained[:dimensions]
        .copy())
    out2 = OrdinationResults(
        short_method_name=other.short_method_name,
        long_method_name=other.long_method_name,
        eigvals=other.eigvals[:dimensions].copy(),
        samples=samples2,
        features=other.features,
        biplot_scores=other.biplot_scores,
        sample_constraints=other.sample_constraints,
        proportion_explained=other.proportion_explained[:dimensions].copy())
    return out1, out2, info
Beispiel #6
0
def rpca(in_biom: str, output_dir: str,
         min_sample_depth: int, rank: int) -> None:
    """Runs RPCA with an rclr preprocessing step.

    Three files are written into ``output_dir``:

    * ``rclr_OTUtable.txt`` -- the rclr-transformed table after an
      ``OptSpace().fit_transform`` pass, rebuilt as ``U . s . V^T`` and
      stored with features as rows (tab separated, index label ``OTU_ID``).
    * ``RPCA_Ordination.txt`` -- the ordination results.
    * ``RPCA_distance.txt`` -- the sample distance matrix.

    Parameters
    ----------
    in_biom : str
        Path of the table to load with ``load_table``.
    output_dir : str
        Directory receiving the output files.
    min_sample_depth : int
        Samples are kept only when their total count is strictly greater
        than this value.
    rank : int
        Rank given to ``OptSpace``; also the number of ``PC`` axes.
    """
    # import table
    table = load_table(in_biom)

    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_depth

    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T.drop_duplicates()

    # rclr for saving the transformed OTU table (RSC edited)
    tablefit = rclr().fit_transform(table.copy())
    U, s, V = OptSpace().fit_transform(tablefit)
    tablefit = np.dot(np.dot(U, s), V.T)
    tablefit = pd.DataFrame(tablefit.T, index=table.columns,
                            columns=table.index)
    # to_csv opens (and truncates) the target itself, so no separate
    # open() around it is needed.
    tablefit.to_csv(os.path.join(output_dir, 'rclr_OTUtable.txt'),
                    sep='\t', index_label='OTU_ID')

    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank).fit(rclr().fit_transform(table.copy()))
    # map the positional column labels 0..rank-1 to PC1..PC<rank>
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}
    pc_labels = list(rename_cols.values())

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=pc_labels)
    eigvals = pd.Series(opt.eigenvalues, index=pc_labels)

    # save ordination results
    ord_res = OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        eigvals.copy(),
        sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # write files to output folder
    ord_res.write(os.path.join(output_dir, 'RPCA_Ordination.txt'))

    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)
    dist_res.write(os.path.join(output_dir, 'RPCA_distance.txt'))
Beispiel #7
0
    def test_scaling1(self):
        """``ca`` with scaling 1 reproduces the reference scores."""
        expected_eigvals = pd.Series(np.array([0.09613302, 0.04094181]),
                                     index=self.pc_ids)
        # p. 458
        feature_scores = np.array([
            [1.31871, -0.34374],  # V
            [-0.37215, 1.48150],
            [-0.99972, -0.92612]
        ])
        sample_scores = np.array([
            [-0.26322, -0.17862],  # F
            [-0.06835, 0.27211],
            [0.51685, -0.09517]
        ])
        expected = OrdinationResults(
            'CA',
            'Correspondance Analysis',
            eigvals=expected_eigvals,
            features=pd.DataFrame(feature_scores,
                                  index=self.feature_ids,
                                  columns=self.pc_ids),
            samples=pd.DataFrame(sample_scores,
                                 index=self.sample_ids,
                                 columns=self.pc_ids))

        observed = ca(self.contingency, 1)

        # Axis signs are arbitrary, hence ignore_directionality.
        assert_ordination_results_equal(expected,
                                        observed,
                                        decimal=5,
                                        ignore_directionality=True)
    def test_book_example_dataset(self):
        """``pcoa`` on ``self.dm`` warns and matches the stored coordinates."""
        # Adapted from PyCogent's `test_principal_coordinate_analysis`:
        #   "I took the example in the book (see intro info), and did
        #   the principal coordinates analysis, plotted the data and it
        #   looked right".
        n_axes = 14
        labels = ['PC%d' % i for i in range(1, n_axes + 1)]
        ids = list(map(str, range(n_axes)))

        # Six informative axes followed by eight all-zero ones.
        exp_eigvals = pd.Series(
            [0.73599103, 0.26260032, 0.14926222, 0.06990457, 0.02956972,
             0.01931184] + [0.] * 8,
            index=labels)
        exp_proportions = pd.Series(
            [0.58105792, 0.20732046, 0.1178411, 0.05518899, 0.02334502,
             0.01524651] + [0.] * 8,
            index=labels)
        exp_coordinates = pd.DataFrame(
            np.loadtxt(get_data_path('exp_PCoAzeros_site')),
            index=ids,
            columns=labels)

        expected = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=exp_eigvals,
            samples=exp_coordinates,
            proportion_explained=exp_proportions)

        # The call is expected to emit a RuntimeWarning.
        observed = npt.assert_warns(RuntimeWarning, pcoa, self.dm)

        # Note the absolute value because column can have signs swapped
        observed.samples = np.abs(observed.samples)
        assert_ordination_results_equal(observed,
                                        expected,
                                        ignore_directionality=True)
Beispiel #9
0
    def test_scaling2(self):
        """``ca`` with scaling 2 reproduces the reference scores."""
        expected_eigvals = pd.Series(np.array([0.09613302, 0.04094181]),
                                     index=self.pc_ids)
        # p. 460 L&L 1998
        feature_scores = np.array([
            [0.40887, -0.06955],  # F_hat
            [-0.11539, 0.29977],
            [-0.30997, -0.18739]
        ])
        sample_scores = np.array([
            [-0.84896, -0.88276],  # V_hat
            [-0.22046, 1.34482],
            [1.66697, -0.47032]
        ])
        expected = OrdinationResults(
            'CA',
            'Correspondance Analysis',
            eigvals=expected_eigvals,
            features=pd.DataFrame(feature_scores,
                                  index=self.feature_ids,
                                  columns=self.pc_ids),
            samples=pd.DataFrame(sample_scores,
                                 index=self.sample_ids,
                                 columns=self.pc_ids))

        observed = ca(self.contingency, 2)

        # Axis signs are arbitrary, hence ignore_directionality.
        assert_ordination_results_equal(expected,
                                        observed,
                                        decimal=5,
                                        ignore_directionality=True)
    def test_simple(self):
        """``pcoa`` on the ``PCoA_sample_data_3`` matrix matches stored results."""
        labels = ['PC%d' % i for i in range(1, 10)]
        ids = [
            'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
            'PC.355', 'PC.607', 'PC.634'
        ]
        exp_eigvals = pd.Series(
            [0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
             0.16054235, 0.15017696, 0.12245775, 0.0],
            index=labels)
        exp_proportions = pd.Series(
            [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
             0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0],
            index=labels)
        exp_coordinates = pd.DataFrame(
            np.loadtxt(get_data_path('exp_PCoAEigenResults_site')),
            index=ids,
            columns=labels)

        expected = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=exp_eigvals,
            samples=exp_coordinates,
            proportion_explained=exp_proportions)

        distances = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        observed = pcoa(distances)

        assert_ordination_results_equal(observed,
                                        expected,
                                        ignore_directionality=True)
Beispiel #11
0
    def test_str(self):
        """``str()`` of an ordination lists every attribute or ``N/A``."""
        # ordination with features and samples
        expected = ("Ordination results:\n"
                    "\tMethod: Correspondance Analysis (CA)\n"
                    "\tEigvals: 2\n"
                    "\tProportion explained: N/A\n"
                    "\tFeatures: 3x2\n"
                    "\tSamples: 3x2\n"
                    "\tBiplot Scores: N/A\n"
                    "\tSample constraints: N/A\n"
                    "\tFeature IDs: 'Species1', 'Species2', 'Species3'\n"
                    "\tSample IDs: 'Site1', 'Site2', 'Site3'")
        observed = str(self.ordination_results)
        self.assertEqual(observed, expected)

        # all optional attributes missing
        expected = ("Ordination results:\n"
                    "\tMethod: Principal Coordinate Analysis (PCoA)\n"
                    "\tEigvals: 1\n"
                    "\tProportion explained: N/A\n"
                    "\tFeatures: N/A\n"
                    "\tSamples: 2x1\n"
                    "\tBiplot Scores: N/A\n"
                    "\tSample constraints: N/A\n"
                    "\tFeature IDs: N/A\n"
                    "\tSample IDs: 0, 1")
        minimal = OrdinationResults('PCoA',
                                    'Principal Coordinate Analysis',
                                    pd.Series(np.array([4.2])),
                                    pd.DataFrame(np.array([[1], [2]])))
        observed = str(minimal)
        # Compared line by line.
        self.assertEqual(observed.split('\n'), expected.split('\n'))
    def setUp(self):
        """Create the alpha, table, ordination and metadata fixtures.

        All four share the sample IDs ``a``, ``b`` and ``c``.
        """
        self.alpha = pd.Series([1, 2, 3], index=list('abc'))

        counts = np.asarray([[0, 0, 1], [1, 3, 42]])
        self.biom = biom.Table(counts, ['O1', 'O2'], ['a', 'b', 'c'])

        pc_labels = ['PC%d' % i for i in range(1, 4)]
        # Seed the global NumPy generator so the coordinates are repeatable.
        np.random.seed(11)
        coordinates = np.random.randn(3, 3)

        self.ordination = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series([0.51236726, 0.30071909, 0.26791207],
                              index=pc_labels),
            samples=pd.DataFrame(coordinates,
                                 index=['a', 'b', 'c'],
                                 columns=pc_labels),
            proportion_explained=pd.Series(
                [0.2675738328, 0.157044696, 0.1399118638],
                index=pc_labels))

        self.metadata = pd.DataFrame(
            data=[[':0', ':)', ':/'],
                  [':D', 'xD', '<3'],
                  [';L', ']:->', ':S']],
            index=list('abc'),
            columns=['foo', 'bar', 'baz'])
Beispiel #13
0
    def setUp(self):
        """Create the tree, table, metadata and ordination fixtures."""
        self.tree = self.mock_tree_from_nwk()
        self.bp_tree = from_skbio_treenode(self.tree)

        counts = np.array([[1, 2, 0, 4],
                           [8, 7, 0, 5],
                           [1, 0, 0, 0],
                           [0, 0, 0, 0]]).T
        self.table = biom.Table(
            counts,
            list('abed'),
            ['Sample1', 'Sample2', 'Sample3', 'Sample4'])

        # Sample metadata is indexed by the table's IDs.
        self.sample_metadata = pd.DataFrame(
            {
                "Metadata1": [0, 0, 0, 1],
                "Metadata2": [0, 0, 0, 0],
                "Metadata3": [1, 2, 3, 4],
                "Metadata4": ["abc", "def", "ghi", "jkl"]
            },
            index=list(self.table.ids()))

        # (These are some Greengenes taxonomy annotations I took from the
        # moving pictures taxonomy.qza file. I made up the confidences.)
        taxonomy = [
            ("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
             "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
             "s__"),
            ("k__Bacteria; p__Proteobacteria; "
             "c__Gammaproteobacteria; o__Pasteurellales; "
             "f__Pasteurellaceae; g__; s__"),
            ("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
             "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
             "s__uniformis"),
        ]
        self.feature_metadata = pd.DataFrame(
            {"Taxonomy": taxonomy, "Confidence": [0.95, 0.8, 0]},
            index=["e", "h", "a"])
        self.split_tax_fm, self.taxcols = split_taxonomy(self.feature_metadata)
        self.tip_md = self.split_tax_fm.loc[["a", "e"]]
        self.int_md = self.split_tax_fm.loc[["h"]]

        # This is designed to match the shearing that's done in the core test
        # for --p-shear-to-table
        self.shorn_tree = parse_newick(
            "(((a:1)EmpressNode0:1,b:2)g:1,(d:3)h:2)EmpressNode1:1;")
        self.exp_split_fm_cols = [
            "Level 1", "Level 2", "Level 3", "Level 4", "Level 5", "Level 6",
            "Level 7", "Confidence"
        ]

        # Small three-axis ordination over the four samples.
        pc_labels = ['PC1', 'PC2', 'PC3']
        coordinates = pd.DataFrame(
            [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5],
             [0.4, 0.5, 0.6]],
            index=['Sample1', 'Sample2', 'Sample3', 'Sample4'],
            columns=pc_labels)
        self.ordination = OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            pd.Series([0.50, 0.25, 0.25], index=pc_labels),
            coordinates,
            proportion_explained=pd.Series([15.5, 12.2, 8.8],
                                           index=pc_labels))
    def test_scaling2(self):
        """``rda`` with scaling 2 matches the stored reference results.

        Each expected table is loaded exactly once from the test data
        directory.
        """
        scores = rda(self.Y, self.X, scaling=2)

        # Load data as computed with vegan 2.0-8
        vegan_features = pd.DataFrame(
            np.loadtxt(get_data_path(
                'example2_species_scaling2_from_vegan')),
            index=self.feature_ids,
            columns=self.pc_ids)

        vegan_samples = pd.DataFrame(
            np.loadtxt(get_data_path(
                'example2_site_scaling2_from_vegan')),
            index=self.sample_ids,
            columns=self.pc_ids)

        sample_constraints = pd.DataFrame(
            np.loadtxt(get_data_path(
                'example2_sample_constraints_scaling2')),
            index=self.sample_ids,
            columns=self.pc_ids)

        # The biplot table may have fewer columns than there are axes, so
        # only the leading axis labels are applied to it.
        mat = np.loadtxt(get_data_path(
            'example2_biplot_scaling2'))
        cropped_pc_ids = self.pc_ids[:mat.shape[1]]
        biplot_scores = pd.DataFrame(mat,
                                     index=self.env_ids,
                                     columns=cropped_pc_ids)

        proportion_explained = pd.Series([0.44275783, 0.25614586,
                                          0.15280354, 0.10497021,
                                          0.02873375, 0.00987052,
                                          0.00471828],
                                         index=self.pc_ids)

        eigvals = pd.Series([25.897954, 14.982578, 8.937841, 6.139956,
                             1.680705, 0.577350, 0.275984],
                            index=self.pc_ids)

        exp = OrdinationResults(
            'RDA', 'Redundancy Analysis',
            samples=vegan_samples,
            features=vegan_features,
            sample_constraints=sample_constraints,
            biplot_scores=biplot_scores,
            proportion_explained=proportion_explained,
            eigvals=eigvals)

        assert_ordination_results_equal(scores, exp,
                                        ignore_directionality=True,
                                        decimal=6)
Beispiel #15
0
    def setUp(self):
        """Create two ordinations and a metadata frame for the tests."""
        # Define in-memory CA results to serialize and deserialize.
        ca_axes = ['CA1', 'CA2']
        ca_eigvals = pd.Series([0.0961330159181, 0.0409418140138], ca_axes)
        ca_features = pd.DataFrame(
            np.array([[0.408869425742, 0.0695518116298],
                      [-0.1153860437, -0.299767683538],
                      [-0.309967102571, 0.187391917117]]),
            index=['Species1', 'Species2', 'Species3'],
            columns=ca_axes)
        ca_samples = pd.DataFrame(
            np.array([[-0.848956053187, 0.882764759014],
                      [-0.220458650578, -1.34482000302],
                      [1.66697179591, 0.470324389808]]),
            index=['Site1', 'Site2', 'Site3'],
            columns=ca_axes)

        self.ordination_results = OrdinationResults(
            'CA',
            'Correspondance Analysis',
            eigvals=ca_eigvals,
            samples=ca_samples,
            features=ca_features)

        # DataFrame for testing plot method. Has a categorical column with a
        # mix of numbers and strings. Has a numeric column with a mix of ints,
        # floats, and strings that can be converted to floats. Has a numeric
        # column with missing data (np.nan).
        self.df = pd.DataFrame(
            [['foo', '42', 10],
             [22, 0, 8],
             [22, -4.2, np.nan],
             ['foo', '42.19', 11]],
            index=['A', 'B', 'C', 'D'],
            columns=['categorical', 'numeric', 'nancolumn'])

        # Minimal ordination results for easier testing of plotting method.
        # Paired with df above.
        min_eigvals = np.array([0.50, 0.25, 0.25])
        min_samples = pd.DataFrame(
            np.array([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5],
                      [0.4, 0.5, 0.6]]),
            index=['A', 'B', 'C', 'D'],
            columns=['PC1', 'PC2', 'PC3'])

        self.min_ord_results = OrdinationResults(
            'PCoA', 'Principal Coordinate Analysis', min_eigvals, min_samples)
Beispiel #16
0
    def test_from_seralized_results(self):
        """``pcoa_biplot`` agrees for a file-loaded and an in-memory ordination."""
        # the current implementation of ordination results loses some
        # information, test that pcoa_biplot works fine regardless
        loaded = OrdinationResults.read(get_data_path('PCoA_skbio'))

        from_file = pcoa_biplot(loaded, self.descriptors)
        from_memory = pcoa_biplot(self.ordination, self.descriptors)

        # Axis labels, method names and axis direction are excluded from the
        # comparison.
        comparison_options = dict(ignore_directionality=True,
                                  ignore_axis_labels=True,
                                  ignore_method_names=True)
        assert_ordination_results_equal(from_file, from_memory,
                                        **comparison_options)
def apca(df):
    """Performs Aitchison PCA on a feature table.
    Parameters
    ----------
        df: pd.DataFrame
            A numeric DataFrame whose rows are "features" and whose columns are
            "samples."
    Returns
    -------
        OrdinationResults
            Short name "apca", long name "Aitchison PCA", with:
            eigvals: pd.Series
                The singular values, largest-first.
            samples: pd.DataFrame
                Sample loadings, indexed by the columns of df ("SampleID").
            features: pd.DataFrame
                Feature loadings, indexed by the index of df ("FeatureID").
            proportion_explained: pd.Series
                Squared singular values divided by their sum.
    """
    # do A-PCA: rank-2 truncated SVD of the clr-transformed table
    left, singular, right_t = svds(clr(df), k=2)
    right = right_t.T
    # reverse the component order (see SVDs docs)
    left = left[:, ::-1]
    right = right[:, ::-1]
    singular = singular[::-1]

    # Rename columns; we use "Axis 1", etc. to be consistent with the Qurro
    # interface
    n_axes = min(right.shape)
    axis_names = ["Axis {}".format(number) for number in range(1, n_axes + 1)]

    # Make DataFrames from the feature and sample loadings
    feature_loadings = pd.DataFrame(left[:, :n_axes],
                                    index=df.index,
                                    columns=axis_names)
    sample_loadings = pd.DataFrame(right[:, :n_axes],
                                   index=df.columns,
                                   columns=axis_names)

    # For clarity, rename top-left cell in both loading DataFrames
    feature_loadings.index.name = "FeatureID"
    sample_loadings.index.name = "SampleID"

    # get prop. var. explained
    squared = singular**2
    proportions = pd.Series(squared / np.sum(squared), index=axis_names)

    # format eigenvalues in a way that OrdinationResults expects
    eigvals = pd.Series(singular, index=axis_names)

    return OrdinationResults("apca",
                             "Aitchison PCA",
                             eigvals,
                             samples=sample_loadings,
                             features=feature_loadings,
                             proportion_explained=proportions)
Beispiel #18
0
def procrustes_analysis(reference: OrdinationResults, other: OrdinationResults,
                        dimensions: int=5) -> (OrdinationResults,
                                               OrdinationResults):
    """Fit two ordinations to each other with Procrustes analysis.

    Neither input is modified: the rows of ``other`` are aligned to
    ``reference`` on a local copy.

    Parameters
    ----------
    reference : OrdinationResults
        Ordination whose sample order and axis labels are used for both
        outputs.
    other : OrdinationResults
        Ordination to fit against ``reference``; must describe the same set
        of samples.
    dimensions : int
        Number of leading axes to use in the fit.

    Returns
    -------
    (OrdinationResults, OrdinationResults)
        The transformed reference and the transformed other. Sample
        coordinates, eigenvalues and proportions explained are cut to
        ``dimensions`` axes; features, biplot scores and sample constraints
        are passed through as they are.

    Raises
    ------
    ValueError
        If the sample matrices differ in shape, if fewer than ``dimensions``
        axes are available, or if the two sample ID sets differ.
    """
    ref_samples = reference.samples

    if ref_samples.shape != other.samples.shape:
        raise ValueError('The matrices cannot be fitted unless they have the '
                         'same dimensions')

    if ref_samples.shape[1] < dimensions:
        raise ValueError('Cannot fit fewer dimensions than available')

    # fail if there are any elements in the symmetric difference; the
    # explicit method is used because the ``^`` operator on Index objects no
    # longer performs a set operation in current pandas
    diff = ref_samples.index.symmetric_difference(other.samples.index)
    if not diff.empty:
        raise ValueError('The ordinations represent two different sets of '
                         'samples')

    # make the matrices comparable; work on a local frame so that the
    # caller's ``other`` object keeps its original row order
    other_samples = other.samples.reindex(index=ref_samples.index)

    mtx1, mtx2, _ = procrustes(ref_samples.values[:, :dimensions],
                               other_samples.values[:, :dimensions])

    axes = ref_samples.columns[:dimensions]
    samples1 = pd.DataFrame(data=mtx1,
                            index=ref_samples.index.copy(),
                            columns=axes.copy())
    samples2 = pd.DataFrame(data=mtx2,
                            index=ref_samples.index.copy(),
                            columns=axes.copy())

    out1 = OrdinationResults(
            short_method_name=reference.short_method_name,
            long_method_name=reference.long_method_name,
            eigvals=reference.eigvals[:dimensions].copy(),
            samples=samples1,
            features=reference.features,
            biplot_scores=reference.biplot_scores,
            sample_constraints=reference.sample_constraints,
            proportion_explained=reference.proportion_explained[:dimensions]
            .copy())
    out2 = OrdinationResults(
            short_method_name=other.short_method_name,
            long_method_name=other.long_method_name,
            eigvals=other.eigvals[:dimensions].copy(),
            samples=samples2,
            features=other.features,
            biplot_scores=other.biplot_scores,
            sample_constraints=other.sample_constraints,
            proportion_explained=other.proportion_explained[:dimensions]
            .copy())
    return out1, out2
Beispiel #19
0
    def test_standalone_rpca_rank_est(self):
        """Checks the standalone rank estimate
           is used instead of a explicit rank
           setting.

           The ``auto-rpca`` command is invoked on ``test.biom`` and the
           distance matrix and ordination found in the data directory are
           compared against the stored expected files.
        """
        in_ = get_data_path('test.biom')
        # The command writes next to the input file.
        out_ = os_path_sep.join(in_.split(os_path_sep)[:-1])
        runner = CliRunner()
        result = runner.invoke(sdc.commands['auto-rpca'],
                               ['--in-biom', in_,
                                '--output-dir', out_])
        # Read the results
        dist_res = pd.read_csv(get_data_path('distance-matrix.tsv'), sep='\t',
                               index_col=0)
        ord_res = OrdinationResults.read(get_data_path('ordination.txt'))

        # Read the expected results
        file_ = 'expected-est-distance-matrix.tsv'
        dist_exp = pd.read_csv(get_data_path(file_),
                               sep='\t', index_col=0)
        ord_exp = OrdinationResults.read(get_data_path(
                                         'expected-est-ordination.txt'))

        # Check that the distance matrix matches our expectations
        assert_array_almost_equal(dist_res.values, dist_exp.values)

        # Check that the ordination results match our expectations -- checking
        # each value for both features and samples
        assert_deicode_ordinationresults_equal(ord_res, ord_exp)

        # Lastly, check that DEICODE's exit code was 0 (indicating success).
        # On failure, chain the exception captured by the runner so that its
        # type, message and traceback appear in the report; chaining also
        # works when no exception was captured (``from None``).
        if result.exit_code != 0:
            error = Exception('Command failed with non-zero exit code')
            raise error from result.exception
Beispiel #20
0
def _generate_ordination_results_summary(files, metadata, out_dir):
    """Render an Emperor HTML summary for an ordination results file.

    Parameters
    ----------
    files : dict
        Needs a ``'plain_text'`` entry whose first element is the path of
        the ordination results file.
    metadata : dict
        Mapping handed to ``pd.DataFrame.from_dict`` with ``orient='index'``.
    out_dir : str
        Directory that receives ``index.html`` and the
        ``emperor_support_files`` folder.

    Returns
    -------
    tuple of (str, str)
        Path of the HTML summary and path of the support files directory.
    """
    # Only the first plain text file is used: it is the ordination results
    ordination = OrdinationResults.read(files['plain_text'][0])
    sample_md = pd.DataFrame.from_dict(metadata, orient='index')
    emperor = Emperor(ordination, sample_md, remote="emperor_support_files")

    index_fp = join(out_dir, 'index.html')
    support_dp = join(out_dir, 'emperor_support_files')
    makedirs(support_dp)
    with open(index_fp, 'w') as html_out:
        html_out.write(emperor.make_emperor(standalone=True))
        emperor.copy_support_files(support_dp)

    return index_fp, support_dp
    def test_standalone_rpca_rank_est(self):
        """Run ``auto-rpca`` on the ``rpca_data`` fixtures and check outputs.

        The command receives only the input table and the output directory
        (no explicit rank option).  The distance matrix and ordination read
        back from ``rpca_data`` are compared with the ``expected-est-*``
        fixtures, and the exit code is checked last.
        """
        fixtures = 'rpca_data'
        biom_fp = get_data_path('test.biom', subfolder=fixtures)
        # The output directory is the directory that holds the input table.
        output_dir = os_path_sep.join(biom_fp.split(os_path_sep)[:-1])
        cli_args = ['--in-biom', biom_fp, '--output-dir', output_dir]
        result = CliRunner().invoke(sdc.commands['auto-rpca'], cli_args)

        # Observed results
        observed_dist = pd.read_csv(
            get_data_path('distance-matrix.tsv', subfolder=fixtures),
            sep='\t', index_col=0)
        observed_ord = OrdinationResults.read(
            get_data_path('ordination.txt', subfolder=fixtures))

        # Expected results
        expected_dist = pd.read_csv(
            get_data_path('expected-est-distance-matrix.tsv',
                          subfolder=fixtures),
            sep='\t', index_col=0)
        expected_ord = OrdinationResults.read(
            get_data_path('expected-est-ordination.txt', subfolder=fixtures))

        # The distance matrices must agree value by value
        assert_array_almost_equal(observed_dist.values, expected_dist.values)

        # The ordinations must agree for both features and samples
        assert_ordinationresults_equal(observed_ord, expected_ord)

        # Finally, a zero exit code signals success
        CliTestCase().assertExitCode(0, result)
Beispiel #22
0
def plot(output_dir: str, pcoa: skbio.OrdinationResults,
         metadata: qiime2.Metadata, custom_axes: str = None,
         ignore_missing_samples: bool = False,
         ignore_pcoa_features: bool = False) -> None:
    """Render an ordination as a plot without feature arrows.

    Parameters
    ----------
    output_dir : str
        Directory forwarded to ``generic_plot``.
    pcoa : skbio.OrdinationResults
        Ordination to display.  Its ``features`` attribute is set to ``None``
        on the object passed in when ``ignore_pcoa_features`` is true.
    metadata : qiime2.Metadata
        Sample metadata forwarded to ``generic_plot``.
    custom_axes : str, optional
        Forwarded to ``generic_plot`` unchanged.
    ignore_missing_samples : bool, optional
        Forwarded to ``generic_plot`` unchanged.
    ignore_pcoa_features : bool, optional
        Discard the ordination's features instead of raising.

    Raises
    ------
    ValueError
        If the ordination has features and ``ignore_pcoa_features`` is false.
    """
    if ignore_pcoa_features:
        pcoa.features = None
    elif pcoa.features is not None:
        message = ("Arrows cannot be visualized with the 'plot' method, "
                   "use 'biplot' instead, or enable "
                   "`ignore_pcoa_features`.")
        raise ValueError(message)

    generic_plot(output_dir,
                 master=pcoa,
                 metadata=metadata,
                 other_pcoa=None,
                 ignore_missing_samples=ignore_missing_samples,
                 custom_axes=custom_axes,
                 plot_name='plot')
Beispiel #23
0
 def test_standalone_rpca_n_components(self):
     """Run the standalone RPCA command with ``--n_components 2``.

     Checks that the command exits with code 0 and that the ordination read
     back afterwards reports three axes.
     """
     in_ = get_data_path('test.biom')
     # The output directory is the directory that holds the input table.
     out_ = os_path_sep.join(in_.split(os_path_sep)[:-1])
     runner = CliRunner()
     # run the same command but with rank==2
     result = runner.invoke(standalone_rpca, [
         '--in-biom', in_, '--output-dir', out_, '--n_components', 2,
         '--max_iterations', 5
     ])
     self.assertEqual(result.exit_code, 0)
     ord_res = OrdinationResults.read(get_data_path('ordination.txt'))
     # The ordination must contain three axes.  This is a real assertion so
     # that the test fails when the axis count is wrong instead of passing
     # unconditionally.
     self.assertEqual(len(ord_res.proportion_explained), 3)
Beispiel #24
0
def regression_biplot(coefficients: pd.DataFrame) -> skbio.OrdinationResults:
    """Build a biplot ordination from regression coefficients via SVD.

    The coefficients are passed through ``clr_inv``, ``centralize`` and
    ``clr`` and the result is decomposed with ``np.linalg.svd``.  Sample
    coordinates are the left singular vectors multiplied by the singular
    values, and feature coordinates are the right singular vectors.

    Parameters
    ----------
    coefficients : pd.DataFrame
        Its index labels the sample coordinates and its columns label the
        feature coordinates of the result.

    Returns
    -------
    skbio.OrdinationResults
        Axes are named ``PC0``, ``PC1``, ...; the singular values are stored
        as ``eigvals`` and ``proportion_explained`` is each singular value
        divided by their sum.
    """
    centered = clr(centralize(clr_inv(coefficients)))
    left, singular_values, right_t = np.linalg.svd(centered)

    n_axes = len(singular_values)
    axis_names = ['PC%d' % axis for axis in range(n_axes)]

    sample_coords = pd.DataFrame(
        left[:, :n_axes] @ np.diag(singular_values),
        columns=axis_names,
        index=coefficients.index)
    feature_coords = pd.DataFrame(
        right_t.T[:, :n_axes],
        columns=axis_names,
        index=coefficients.columns)

    eigvals = pd.Series(singular_values, index=axis_names)
    return OrdinationResults(
        'regression_biplot',
        'Multinomial regression biplot',
        eigvals,
        samples=sample_coords,
        features=feature_coords,
        proportion_explained=eigvals / eigvals.sum())
Beispiel #25
0
def biplot(output_dir: str, biplot: skbio.OrdinationResults,
           sample_metadata: qiime2.Metadata,
           feature_metadata: qiime2.Metadata = None,
           number_of_features: int = 5) -> None:
    """Render a biplot restricted to the most important features.

    A feature's importance is the Euclidean distance of its coordinate
    vector from the origin.  The ``features`` attribute of the ``biplot``
    object passed in is replaced by the ``number_of_features`` rows with the
    largest importance before plotting.

    Parameters
    ----------
    output_dir : str
        Directory forwarded to ``_generic_plot``.
    biplot : skbio.OrdinationResults
        Ordination with feature coordinates; modified as described above.
    sample_metadata : qiime2.Metadata
        Forwarded to ``_generic_plot`` as ``metadata``.
    feature_metadata : qiime2.Metadata, optional
        Forwarded to ``_generic_plot`` unchanged.
    number_of_features : int, optional
        How many of the top-ranked features to keep.
    """
    # rank the features by the magnitude of their vectors
    ranked = biplot.features.copy()
    zero_vector = np.zeros_like(ranked.columns)
    ranked['importance'] = ranked.apply(euclidean, axis=1,
                                        args=(zero_vector,))
    ranked = ranked.sort_values('importance', ascending=False)
    # the helper column was only needed for sorting
    ranked = ranked.drop(['importance'], axis=1)
    biplot.features = ranked[:number_of_features].copy()

    _generic_plot(output_dir,
                  master=biplot,
                  other_pcoa=None,
                  metadata=sample_metadata,
                  feature_metadata=feature_metadata,
                  plot_name='biplot')
Beispiel #26
0
    def test_scaling2(self):
        """Compare ``cca`` with ``scaling=2`` against stored reference data.

        The expected feature and sample scores are loaded from files
        computed with vegan 2.0-8; the remaining expected attributes come
        from the other ``example3`` fixtures and literal values.
        """
        observed = cca(self.Y, self.X, scaling=2)

        def load_frame(fixture, row_ids):
            # Read a numeric fixture and label it with all axis ids.
            return pd.DataFrame(np.loadtxt(get_data_path(fixture)),
                                index=row_ids,
                                columns=self.pc_ids)

        # Load data as computed with vegan 2.0-8
        expected_features = load_frame(
            'example3_species_scaling2_from_vegan', self.feature_ids)
        expected_samples = load_frame(
            'example3_site_scaling2_from_vegan', self.sample_ids)
        expected_constraints = load_frame(
            'example3_sample_constraints_scaling2', self.sample_ids)

        # The biplot scores fixture may have fewer columns than there are
        # axes, so only the leading axis ids label it.
        biplot_values = np.loadtxt(get_data_path('example3_biplot_scaling2'))
        expected_biplot = pd.DataFrame(
            biplot_values,
            index=self.env_ids,
            columns=self.pc_ids[:biplot_values.shape[1]])

        expected_proportions = pd.Series(
            [0.466911, 0.238327, 0.100548, 0.104937, 0.044805, 0.029747,
             0.012631, 0.001562, 0.000532],
            index=self.pc_ids)
        expected_eigvals = pd.Series(
            [0.366136, 0.186888, 0.078847, 0.082288, 0.035135, 0.023327,
             0.009905, 0.001225, 0.000417],
            index=self.pc_ids)

        expected = OrdinationResults(
            'CCA',
            'Canonical Correspondence Analysis',
            samples=expected_samples,
            features=expected_features,
            sample_constraints=expected_constraints,
            biplot_scores=expected_biplot,
            proportion_explained=expected_proportions,
            eigvals=expected_eigvals)

        assert_ordination_results_equal(observed, expected, decimal=6)
Beispiel #27
0
def _simulation_data(data, ids):
    """Build ordination results and a Euclidean distance matrix.

    The points are written to a temporary ``ordination.txt`` file in the
    current working directory, read back with ``OrdinationResults.read`` and
    the file is removed again.

    Parameters
    ----------
    data : sequence
        One entry per identifier.  ``row[0]``, ``row[1]`` and ``row[2]`` are
        parallel sequences holding the three coordinates of each point ``i``.
        The ``Site`` header uses ``len(data) * len(data[0][0])`` as the point
        count, so every entry is presumably expected to hold as many points
        as the first one.
    ids : sequence of str
        Identifiers, matched to ``data`` by position.  The mapping-file rows
        concatenate them with strings, so they need to be ``str``.

    Returns
    -------
    tuple
        ``(ordination_results, distance_matrix)``: the object returned by
        ``OrdinationResults.read`` and a list of rows of strings whose first
        row and first column hold the point labels.
    """
    ordination_fp = "ordination.txt"
    n_points = len(data[0][0])
    coords = {}

    with open(ordination_fp, "w", encoding='utf8') as ordination:
        ordination.write('Eigvals\t0\n\n')
        ordination.write('Proportion explained\t0\n\n')
        ordination.write('Species\t0\t0\n\n')
        ordination.write('Site\t' + str(len(data) * n_points) + '\t3\n')
        for j, row in enumerate(data):
            identifier = str(ids[j])
            for i in range(len(row[0])):
                point = [row[0][i], row[1][i], row[2][i]]
                ordination.write(
                    identifier + "_t" + str(i) + "\t"
                    + "\t".join(str(value) for value in point) + "\n")
                # NOTE(review): the distance matrix labels use "." while the
                # ordination and mapping rows use "_t" -- confirm whether
                # this difference is intended.
                coords[identifier + "." + str(i)] = point
        ordination.write("\n")
        ordination.write("Biplot\t0\t0\n\n")
        ordination.write("Site constraints\t0\t0\n")

    # Read only after the ``with`` block has closed (and therefore flushed)
    # the file; reading while it is still open for writing can see an empty
    # or truncated file.  Always clean the file up, even if parsing fails.
    try:
        ordination_results = OrdinationResults.read(ordination_fp)
    finally:
        os.remove(ordination_fp)

    # Distance matrix (euclidean): a header row of labels followed by one
    # row per point, each starting with the point's own label.
    labels = list(coords)
    distance_matrix = [[""] + labels]
    for label in labels:
        distances = [str(distance.euclidean(coords[label], coords[other]))
                     for other in labels]
        distance_matrix.append([label] + distances)

    # Mapping file
    metadata = [["#SampleID", "Subject", "Treatment", "Timepoint"],
                ["#q2:types", "categorical", "categorical", "numeric"]]
    for sample_id in ids:
        for i in range(n_points):
            # Treatment is the identifier without its digits and without the
            # last remaining character.
            treatment = ''.join(
                [k for k in sample_id if not k.isdigit()])[:-1]
            metadata.append(
                [sample_id + "_t" + str(i), sample_id, treatment, i])
    # TODO: add functionality to return the mapping file; ``metadata`` is
    # built above but not returned yet.
    return ordination_results, distance_matrix
 def test_standalone_rpca_n_components(self):
     """Run the standalone RPCA command with ``--n_components 2``.

     Checks that the command exits with code 0 and that the ordination read
     back from ``rpca_data`` reports three axes.
     """
     in_ = get_data_path('test.biom', subfolder='rpca_data')
     # The output directory is the directory that holds the input table.
     out_ = os_path_sep.join(in_.split(os_path_sep)[:-1])
     runner = CliRunner()
     # run the same command but with rank==2
     result = runner.invoke(sdc.commands['rpca'], [
         '--in-biom', in_, '--output-dir', out_, '--n_components', 2,
         '--max_iterations', 5
     ])
     CliTestCase().assertExitCode(0, result)
     ord_res = OrdinationResults.read(
         get_data_path('ordination.txt', subfolder='rpca_data'))
     # The ordination must contain three axes.  This is a real assertion so
     # that the test fails when the axis count is wrong instead of passing
     # unconditionally.
     self.assertEqual(len(ord_res.proportion_explained), 3)
    def test_extensive(self):
        """``pcoa`` reproduces the stored results for ``PCoA_sample_data_2``.

        The same expected results must be obtained whether the distances are
        passed as a ``numpy.ndarray`` or as a ``DistanceMatrix``; axis
        direction is ignored in the comparison.
        """
        axis_labels = ['PC%d' % axis for axis in range(1, 7)]
        sample_ids = [str(idx) for idx in range(6)]

        expected_eigvals = pd.Series(
            [0.3984635, 0.36405689, 0.28804535, 0.27479983, 0.19165361, 0.0],
            index=axis_labels)
        expected_proportions = pd.Series(
            [0.2626621381, 0.2399817314, 0.1898758748, 0.1811445992,
             0.1263356565, 0.0],
            index=axis_labels)
        expected_samples = pd.DataFrame(
            [[-0.028597, 0.22903853, 0.07055272, 0.26163576, 0.28398669,
              0.0],
             [0.37494056, 0.22334055, -0.20892914, 0.05057395, -0.18710366,
              0.0],
             [-0.33517593, -0.23855979, -0.3099887, 0.11521787, -0.05021553,
              0.0],
             [0.25412394, -0.4123464, 0.23343642, 0.06403168, -0.00482608,
              0.0],
             [-0.28256844, 0.18606911, 0.28875631, -0.06455635, -0.21141632,
              0.0],
             [0.01727687, 0.012458, -0.07382761, -0.42690292, 0.1695749,
              0.0]],
            index=sample_ids,
            columns=axis_labels)

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=expected_eigvals,
            samples=expected_samples,
            proportion_explained=expected_proportions)

        raw_distances = np.loadtxt(get_data_path('PCoA_sample_data_2'))
        # a numpy.ndarray and a DistanceMatrix must give the same results
        for distances in (raw_distances, DistanceMatrix(raw_distances)):
            observed = pcoa(distances)
            assert_ordination_results_equal(observed,
                                            expected_results,
                                            ignore_directionality=True)
Beispiel #30
0
def plot(output_dir: str,
         tree: NewickFormat,
         feature_table: pd.DataFrame,
         sample_metadata: qiime2.Metadata,
         pcoa: OrdinationResults = None,
         feature_metadata: qiime2.Metadata = None,
         ignore_missing_samples: bool = False,
         filter_missing_features: bool = False,
         number_of_features: int = 5,
         filter_unobserved_features_from_phylogeny: bool = True) -> None:
    """Write an Empress visualization of a tree into ``output_dir``.

    When ``pcoa`` carries feature coordinates, its ``features`` attribute is
    replaced (on the object passed in) by the ``number_of_features`` rows
    whose vectors lie farthest from the origin.  The metadata objects are
    converted to data frames, the first line of the file at ``str(tree)`` is
    parsed as newick, and the resulting ``Empress`` object is written to
    ``empress.html`` together with its support files and the rendered
    ``index.html`` template.
    """
    has_feature_coords = pcoa is not None and pcoa.features is not None
    if has_feature_coords:
        # keep the top N most important features, ranked by the magnitude
        # of their vectors (copied from q2-emperor)
        ranked = pcoa.features.copy()
        zero_vector = np.zeros_like(ranked.columns)
        ranked['importance'] = ranked.apply(euclidean, axis=1,
                                            args=(zero_vector, ))
        ranked = ranked.sort_values('importance', ascending=False)
        ranked = ranked.drop(['importance'], axis=1)
        pcoa.features = ranked[:number_of_features].copy()

    sample_metadata = sample_metadata.to_dataframe()

    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    # ``str(tree)`` is the path to the actual newick file
    with open(str(tree)) as newick_file:
        parsed_tree = parse_newick(newick_file.readline())

    # short alias that keeps the keyword argument below on one line
    drop_unobserved = filter_unobserved_features_from_phylogeny
    viz = Empress(tree=parsed_tree,
                  table=feature_table,
                  sample_metadata=sample_metadata,
                  feature_metadata=feature_metadata,
                  ordination=pcoa,
                  ignore_missing_samples=ignore_missing_samples,
                  filter_missing_features=filter_missing_features,
                  filter_unobserved_features_from_phylogeny=drop_unobserved)

    html_fp = os.path.join(output_dir, 'empress.html')
    with open(html_fp, 'w') as html_file:
        html_file.write(str(viz))

    viz.copy_support_files(output_dir)

    q2templates.render(os.path.join(TEMPLATES, 'index.html'), output_dir)
Beispiel #31
0
def community_plot(output_dir: str,
                   tree: NewickFormat,
                   feature_table: biom.Table,
                   sample_metadata: qiime2.Metadata,
                   pcoa: OrdinationResults = None,
                   feature_metadata: qiime2.Metadata = None,
                   ignore_missing_samples: bool = False,
                   filter_extra_samples: bool = False,
                   filter_missing_features: bool = False,
                   number_of_features: int = 5,
                   shear_tree: bool = True) -> None:
    """Visualizes a tree alongside community-level data.

       The functionality available in this visualization is a superset of the
       functionality in tree_plot() -- including sample metadata coloring /
       barplots, animations, and Emperor integration support.

       When ``pcoa`` carries feature coordinates, its ``features`` attribute
       is replaced (on the object passed in) by the ``number_of_features``
       rows whose vectors lie farthest from the origin, with missing values
       filled with zero first.
    """
    has_feature_coords = pcoa is not None and pcoa.features is not None
    if has_feature_coords:
        # keep the top N most important features, ranked by the magnitude
        # of their vectors (copied from q2-emperor)
        ranked = pcoa.features.copy()
        # all-NA columns can show up when the axes are all zero
        ranked.fillna(0, inplace=True)
        zero_vector = np.zeros_like(ranked.columns)
        ranked['importance'] = ranked.apply(euclidean, axis=1,
                                            args=(zero_vector, ))
        ranked = ranked.sort_values('importance', ascending=False)
        ranked = ranked.drop(['importance'], axis=1)
        pcoa.features = ranked[:number_of_features].copy()

    sample_metadata = sample_metadata.to_dataframe()

    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    bp_tree = get_bp(tree)
    viz = Empress(tree=bp_tree,
                  table=feature_table,
                  sample_metadata=sample_metadata,
                  feature_metadata=feature_metadata,
                  ordination=pcoa,
                  ignore_missing_samples=ignore_missing_samples,
                  filter_extra_samples=filter_extra_samples,
                  filter_missing_features=filter_missing_features,
                  shear_tree=shear_tree)
    save_viz(viz, output_dir)
Beispiel #32
0
def _validate_ordination_results(files, metadata, out_dir):
    """Check that every ordination sample id is present in the metadata.

    Parameters
    ----------
    files : dict
        Needs a ``'plain_text'`` entry whose first element is the path of
        the ordination results file.
    metadata : iterable
        Iterating it yields the known sample ids.
    out_dir : str
        Not used by this function.

    Returns
    -------
    tuple
        ``(True, [ArtifactInfo], "")`` on success, otherwise
        ``(False, None, error_message)``.
    """
    # Only the first plain text file is used: it is the ordination results
    ordination_fp = files['plain_text'][0]
    ordination = OrdinationResults.read(ordination_fp)

    # Sample ids found in the ordination and in the metadata
    ordination_ids = set(ordination.samples.index)
    known_ids = set(metadata)

    if ordination_ids.issubset(known_ids):
        artifact = ArtifactInfo(None, 'ordination_results',
                                [(ordination_fp, 'plain_text')])
        return True, [artifact], ""

    return (False, None, "The ordination results contain samples not "
            "present in the metadata")
Beispiel #33
0
    def setUp(self):
        """Create the ordination fixtures used by the tests.

        ``self.ordination_results`` is a CA result with samples and features,
        ``self.df`` is a metadata frame for the plotting tests and
        ``self.min_ord_results`` is a PCoA result with only eigenvalues and
        samples.
        """
        # In-memory CA results to serialize and deserialize.
        ca_axes = ['CA1', 'CA2']
        ca_eigvals = pd.Series([0.0961330159181, 0.0409418140138], ca_axes)
        ca_features = pd.DataFrame(
            np.array([[0.408869425742, 0.0695518116298],
                      [-0.1153860437, -0.299767683538],
                      [-0.309967102571, 0.187391917117]]),
            index=['Species1', 'Species2', 'Species3'],
            columns=ca_axes)
        ca_samples = pd.DataFrame(
            np.array([[-0.848956053187, 0.882764759014],
                      [-0.220458650578, -1.34482000302],
                      [1.66697179591, 0.470324389808]]),
            index=['Site1', 'Site2', 'Site3'],
            columns=ca_axes)

        self.ordination_results = OrdinationResults(
            'CA', 'Correspondance Analysis', eigvals=ca_eigvals,
            samples=ca_samples, features=ca_features)

        # Metadata for the plot method: a categorical column mixing numbers
        # and strings, a numeric column mixing ints, floats and strings that
        # convert to floats, and a numeric column with missing data (np.nan).
        metadata_rows = [['foo', '42', 10],
                         [22, 0, 8],
                         [22, -4.2, np.nan],
                         ['foo', '42.19', 11]]
        self.df = pd.DataFrame(
            metadata_rows,
            index=['A', 'B', 'C', 'D'],
            columns=['categorical', 'numeric', 'nancolumn'])

        # Minimal ordination results for the plotting tests, paired with the
        # metadata frame above.
        min_eigvals = np.array([0.50, 0.25, 0.25])
        min_samples = pd.DataFrame(
            np.array([[0.1, 0.2, 0.3],
                      [0.2, 0.3, 0.4],
                      [0.3, 0.4, 0.5],
                      [0.4, 0.5, 0.6]]),
            ['A', 'B', 'C', 'D'],
            ['PC1', 'PC2', 'PC3'])

        self.min_ord_results = OrdinationResults(
            'PCoA', 'Principal Coordinate Analysis', min_eigvals, min_samples)
def create_emperor_visual(args, pcfile):
    """Write a standalone Emperor plot for an ordination (.pc) file.

    The metadata is loaded from ``args.map_fp`` and the output goes to
    ``<args.basedir>/views/tmp/<args.prefix>_pcoa3d``: an ``index.html``
    plus Emperor's support files.

    ``pcfile`` is read with ``OrdinationResults.read``.  A sample file has
    tab-separated sections like these (values shortened)::

        Eigvals  4
        0.2705  0.0735  0.0299  0.0

        Proportion explained  4
        0.7231  0.1967  0.0801  0.0

        Species  0  0

        Site  4  4
        ICM_LCY_Bv6--LCY_0001_2003_05_11  -0.0406  -0.0938  0.1368  0.0
        ICM_LCY_Bv6--LCY_0003_2003_05_04  -0.1152  -0.1595  -0.1031  0.0
        ICM_LCY_Bv6--LCY_0005_2003_05_16  0.4268  0.0665  -0.0221  0.0
        ICM_LCY_Bv6--LCY_0007_2003_05_04  -0.2709  0.1868  -0.0115  0.0

        Biplot  0  0

        Site constraints  0  0
    """
    from emperor import Emperor
    from skbio import OrdinationResults

    # load metadata
    mapping = load_mf(args.map_fp)
    # the ordination has to be read from a file (scikit-bio 0.5.1)
    ordination = OrdinationResults.read(pcfile)
    emperor_plot = Emperor(ordination, mapping)

    pcoa_outdir = os.path.join(args.basedir, 'views', 'tmp',
                               args.prefix + '_pcoa3d')
    print('OUT?', pcoa_outdir, args.basedir)
    os.makedirs(pcoa_outdir, exist_ok=True)

    index_fp = os.path.join(pcoa_outdir, 'index.html')
    with open(index_fp, 'w') as html_out:
        html_out.write(emperor_plot.make_emperor(standalone=True))
        emperor_plot.copy_support_files(pcoa_outdir)
 def setUp(self):
     """Load the ``unweighted_unifrac_pc.txt`` ordination for the tests."""
     ordination_fp = get_data_path('unweighted_unifrac_pc.txt')
     self.test_matrix = OrdinationResults.read(ordination_fp)
Beispiel #36
0
def _1(data: skbio.OrdinationResults) -> OrdinationFormat:
    """Write ``data`` in ``ordination`` format and return the new file."""
    ordination_file = OrdinationFormat()
    data.write(str(ordination_file), format='ordination')
    return ordination_file
Beispiel #37
0
    def test_assert_ordination_results_equal(self):
        """Exercise ``assert_ordination_results_equal`` on minimal results.

        Covers identity, a type mismatch, almost-equal numeric values, and,
        for every optional numeric attribute, the three cases "present in
        only one object", "present in both but different" and "present in
        both and almost equal".  Both objects are reset after each case so
        the cases do not leak state into each other.
        """
        minimal1 = OrdinationResults('foo', 'bar', pd.Series([1.0, 2.0]),
                                     pd.DataFrame([[1, 2, 3], [4, 5, 6]]))

        # a minimal set of results should be equal to itself
        assert_ordination_results_equal(minimal1, minimal1)

        # type mismatch
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, 'foo')

        # numeric values should be checked that they're almost equal
        almost_minimal1 = OrdinationResults(
            'foo', 'bar',
            pd.Series([1.0000001, 1.9999999]),
            pd.DataFrame([[1, 2, 3], [4, 5, 6]]))
        assert_ordination_results_equal(minimal1, almost_minimal1)

        # test each of the optional numeric attributes
        for attr in ('features', 'samples', 'biplot_scores',
                     'sample_constraints'):
            # missing optional numeric attribute in one, present in the other
            setattr(almost_minimal1, attr, pd.DataFrame([[1, 2], [3, 4]]))
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(minimal1, almost_minimal1)
            setattr(almost_minimal1, attr, None)

            # optional numeric attributes present in both, but not almost equal
            setattr(minimal1, attr, pd.DataFrame([[1, 2], [3, 4]]))
            setattr(almost_minimal1, attr, pd.DataFrame([[1, 2],
                                                         [3.00002, 4]]))
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(minimal1, almost_minimal1)
            setattr(minimal1, attr, None)
            setattr(almost_minimal1, attr, None)

            # optional numeric attributes present in both, and almost equal
            setattr(minimal1, attr, pd.DataFrame([[1.0, 2.0], [3.0, 4.0]]))
            setattr(almost_minimal1, attr,
                    pd.DataFrame([[1.0, 2.0], [3.00000002, 4]]))
            assert_ordination_results_equal(minimal1, almost_minimal1)
            setattr(minimal1, attr, None)
            setattr(almost_minimal1, attr, None)

        # missing optional numeric attribute in one, present in the other
        almost_minimal1.proportion_explained = pd.Series([1, 2, 3])
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, almost_minimal1)
        almost_minimal1.proportion_explained = None

        # optional numeric attributes present in both, but not almost equal
        minimal1.proportion_explained = pd.Series([1, 2, 3])
        almost_minimal1.proportion_explained = pd.Series([1, 2, 3.00002])
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, almost_minimal1)
        # reset BOTH objects, mirroring the attribute loop above
        minimal1.proportion_explained = None
        almost_minimal1.proportion_explained = None

        # optional numeric attributes present in both, and almost equal
        minimal1.proportion_explained = pd.Series([1, 2, 3])
        almost_minimal1.proportion_explained = pd.Series([1, 2, 3.00000002])
        assert_ordination_results_equal(minimal1, almost_minimal1)
        # reset BOTH objects, mirroring the attribute loop above
        minimal1.proportion_explained = None
        almost_minimal1.proportion_explained = None
Beispiel #38
0
class TestOrdinationResults(unittest.TestCase):
    def setUp(self):
        """Create the ordination fixtures used by the tests.

        ``self.ordination_results`` is a CA result with samples and features,
        ``self.df`` is a metadata frame for the plotting tests and
        ``self.min_ord_results`` is a PCoA result with only eigenvalues and
        samples.
        """
        # In-memory CA results to serialize and deserialize.
        ca_axes = ['CA1', 'CA2']
        ca_eigvals = pd.Series([0.0961330159181, 0.0409418140138], ca_axes)
        ca_features = pd.DataFrame(
            np.array([[0.408869425742, 0.0695518116298],
                      [-0.1153860437, -0.299767683538],
                      [-0.309967102571, 0.187391917117]]),
            index=['Species1', 'Species2', 'Species3'],
            columns=ca_axes)
        ca_samples = pd.DataFrame(
            np.array([[-0.848956053187, 0.882764759014],
                      [-0.220458650578, -1.34482000302],
                      [1.66697179591, 0.470324389808]]),
            index=['Site1', 'Site2', 'Site3'],
            columns=ca_axes)

        self.ordination_results = OrdinationResults(
            'CA', 'Correspondance Analysis', eigvals=ca_eigvals,
            samples=ca_samples, features=ca_features)

        # Metadata for the plot method: a categorical column mixing numbers
        # and strings, a numeric column mixing ints, floats and strings that
        # convert to floats, and a numeric column with missing data (np.nan).
        metadata_rows = [['foo', '42', 10],
                         [22, 0, 8],
                         [22, -4.2, np.nan],
                         ['foo', '42.19', 11]]
        self.df = pd.DataFrame(
            metadata_rows,
            index=['A', 'B', 'C', 'D'],
            columns=['categorical', 'numeric', 'nancolumn'])

        # Minimal ordination results for the plotting tests, paired with the
        # metadata frame above.
        min_eigvals = np.array([0.50, 0.25, 0.25])
        min_samples = pd.DataFrame(
            np.array([[0.1, 0.2, 0.3],
                      [0.2, 0.3, 0.4],
                      [0.3, 0.4, 0.5],
                      [0.4, 0.5, 0.6]]),
            ['A', 'B', 'C', 'D'],
            ['PC1', 'PC2', 'PC3'])

        self.min_ord_results = OrdinationResults(
            'PCoA', 'Principal Coordinate Analysis', min_eigvals, min_samples)

    def test_str(self):
        """``str()`` of ordination results gives the expected summary text.

        Checked for the CA fixture and for a result where every optional
        attribute is missing.
        """
        expected_ca = "\n".join([
            "Ordination results:",
            "\tMethod: Correspondance Analysis (CA)",
            "\tEigvals: 2",
            "\tProportion explained: N/A",
            "\tFeatures: 3x2",
            "\tSamples: 3x2",
            "\tBiplot Scores: N/A",
            "\tSample constraints: N/A",
            "\tFeature IDs: 'Species1', 'Species2', 'Species3'",
            "\tSample IDs: 'Site1', 'Site2', 'Site3'",
        ])
        self.assertEqual(str(self.ordination_results), expected_ca)

        # all optional attributes missing
        expected_minimal = "\n".join([
            "Ordination results:",
            "\tMethod: Principal Coordinate Analysis (PCoA)",
            "\tEigvals: 1",
            "\tProportion explained: N/A",
            "\tFeatures: N/A",
            "\tSamples: 2x1",
            "\tBiplot Scores: N/A",
            "\tSample constraints: N/A",
            "\tFeature IDs: N/A",
            "\tSample IDs: 0, 1",
        ])
        minimal_samples = pd.DataFrame(np.array([[1], [2]]))
        minimal = OrdinationResults('PCoA', 'Principal Coordinate Analysis',
                                    pd.Series(np.array([4.2])),
                                    minimal_samples)
        # compare line by line
        self.assertEqual(str(minimal).split('\n'),
                         expected_minimal.split('\n'))

    def check_basic_figure_sanity(self, fig, exp_num_subplots, exp_title,
                                  exp_legend_exists, exp_xlabel, exp_ylabel,
                                  exp_zlabel):
        """Assert the basic properties of a figure returned by ``plot``.

        Checks the figure type, the number of subplots and, on the first
        subplot, the title, that all tick labels are empty, whether a legend
        is present, and the x/y/z axis labels.
        """
        # check type
        assert_is_instance(fig, mpl.figure.Figure)

        # check number of subplots
        subplots = fig.get_axes()
        npt.assert_equal(len(subplots), exp_num_subplots)

        # check title
        first = subplots[0]
        npt.assert_equal(first.get_title(), exp_title)

        # shouldn't have tick labels
        tick_labels = (first.get_xticklabels() + first.get_yticklabels() +
                       first.get_zticklabels())
        for label in tick_labels:
            npt.assert_equal(label.get_text(), '')

        # check if legend is present
        legend = first.get_legend()
        assert_true(legend is not None if exp_legend_exists
                    else legend is None)

        # check axis labels
        npt.assert_equal(first.get_xlabel(), exp_xlabel)
        npt.assert_equal(first.get_ylabel(), exp_ylabel)
        npt.assert_equal(first.get_zlabel(), exp_zlabel)

    def test_plot_no_metadata(self):
        """Plotting without metadata: one subplot, no title, no legend."""
        figure = self.min_ord_results.plot()
        self.check_basic_figure_sanity(
            figure, 1, '', False, '0', '1', '2')

    def test_plot_with_numeric_metadata_and_plot_options(self):
        """Numeric metadata with custom axes, labels, title and colormap.

        Two subplots and no legend are expected.
        """
        figure = self.min_ord_results.plot(
            self.df,
            'numeric',
            axes=(1, 0, 2),
            axis_labels=['PC 2', 'PC 1', 'PC 3'],
            title='a title',
            cmap='Reds')
        self.check_basic_figure_sanity(
            figure, 2, 'a title', False, 'PC 2', 'PC 1', 'PC 3')

    def test_plot_with_categorical_metadata_and_plot_options(self):
        """Categorical metadata with custom axes, title and colormap.

        One subplot with a legend is expected; the axis labels are the axis
        indices.
        """
        figure = self.min_ord_results.plot(
            self.df,
            'categorical',
            axes=[2, 0, 1],
            title='a title',
            cmap='Accent')
        self.check_basic_figure_sanity(
            figure, 1, 'a title', True, '2', '0', '1')

    def test_plot_with_invalid_axis_labels(self):
        """Passing four axis labels raises a ``ValueError`` mentioning it."""
        too_many_labels = ('a', 'b', 'c', 'd')
        with six.assertRaisesRegex(self, ValueError, 'axis_labels.*4'):
            self.min_ord_results.plot(axes=[2, 0, 1],
                                      axis_labels=too_many_labels)

    def test_validate_plot_axes_valid_input(self):
        """Valid axes pass ``_validate_plot_axes`` without an error.

        Nothing is returned, so completing without an exception is the
        whole check.
        """
        coord_matrix = self.min_ord_results.samples.values.T
        self.min_ord_results._validate_plot_axes(coord_matrix, (1, 2, 0))

    def test_validate_plot_axes_invalid_input(self):
        """``_validate_plot_axes`` raises ``ValueError`` on invalid input.

        Covers a coordinate matrix with too few dimensions, a wrong number
        of axes, duplicate axes and out-of-range axes.  The regular
        expressions are raw strings so that ``\\(`` and ``\\[`` reach the
        regex engine without relying on invalid string escape sequences.
        """
        # not enough dimensions
        with six.assertRaisesRegex(self, ValueError, r'2 dimension\(s\)'):
            self.min_ord_results._validate_plot_axes(
                np.asarray([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]), (0, 1, 2))

        coord_matrix = self.min_ord_results.samples.values.T

        # wrong number of axes
        with six.assertRaisesRegex(self, ValueError, 'exactly three.*found 0'):
            self.min_ord_results._validate_plot_axes(coord_matrix, [])
        with six.assertRaisesRegex(self, ValueError, 'exactly three.*found 4'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, 1, 2, 3))

        # duplicate axes
        with six.assertRaisesRegex(self, ValueError, 'must be unique'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, 1, 0))

        # out of range axes
        with six.assertRaisesRegex(self, ValueError, r'axes\[1\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, -1, 2))
        with six.assertRaisesRegex(self, ValueError, r'axes\[2\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, 2, 3))

    def test_get_plot_point_colors_invalid_input(self):
        """``_get_plot_point_colors`` raises ``ValueError`` on bad input."""
        get_colors = self.min_ord_results._get_plot_point_colors

        # a column name without a data frame
        with npt.assert_raises(ValueError):
            get_colors(None, 'numeric', ['B', 'C'], 'jet')

        # a data frame without a column name
        with npt.assert_raises(ValueError):
            get_colors(self.df, None, ['B', 'C'], 'jet')

        # the column is not in the data frame
        with six.assertRaisesRegex(self, ValueError, 'missingcol'):
            get_colors(self.df, 'missingcol', ['B', 'C'], 'jet')

        # one of the ids is not in the data frame
        with six.assertRaisesRegex(self, ValueError, 'numeric'):
            get_colors(self.df, 'numeric', ['B', 'C', 'missingid', 'A'],
                       'jet')

        # the column has missing data
        with six.assertRaisesRegex(self, ValueError, 'nancolumn'):
            get_colors(self.df, 'nancolumn', ['B', 'C', 'A'], 'jet')

    def test_get_plot_point_colors_no_df_or_column(self):
        # With neither a df nor a column supplied, a pair of Nones is expected
        # back.
        get_colors = self.min_ord_results._get_plot_point_colors
        result = get_colors(None, None, ['B', 'C'], 'jet')
        npt.assert_equal(result, (None, None))

    def test_get_plot_point_colors_numeric_column(self):
        # For the 'numeric' column, the first element of the result is
        # compared (approximately) against the expected numbers and the second
        # element must be None.
        cases = [
            # subset of the ids in df
            (['B', 'C', 'A'], [0.0, -4.2, 42.0]),
            # all ids in df
            (['B', 'A', 'D', 'C'], [0.0, 42.0, 42.19, -4.2]),
        ]
        for ids, expected in cases:
            result = self.min_ord_results._get_plot_point_colors(
                self.df, 'numeric', ids, 'jet')
            npt.assert_almost_equal(result[0], expected)
            assert_true(result[1] is None)

    def test_get_plot_point_colors_categorical_column(self):
        # For the 'categorical' column, the first element of the result is
        # one RGBA color per requested id and the second is the mapping from
        # category to color. Only two distinct colors are expected here.
        dark_blue = [0., 0., 0.5, 1.]
        dark_red = [0.5, 0., 0., 1.]
        expected_mapping = {'foo': dark_red, 22: dark_blue}

        cases = [
            # subset of the ids in df
            (['B', 'C', 'A'],
             [dark_blue, dark_blue, dark_red]),
            # all ids in df
            (['B', 'A', 'D', 'C'],
             [dark_blue, dark_red, dark_red, dark_blue]),
        ]
        for ids, expected_colors in cases:
            result = self.min_ord_results._get_plot_point_colors(
                self.df, 'categorical', ids, 'jet')
            npt.assert_almost_equal(result[0], expected_colors)
            # the color dict should be the same regardless of the ids passed
            npt.assert_equal(result[1], expected_mapping)

    def test_plot_categorical_legend(self):
        # `_plot_categorical_legend` should attach a legend to the given axes
        # whose labels and line colors come from the category -> color
        # mapping passed in.
        fig = plt.figure()
        try:
            ax = fig.add_subplot(111, projection='3d')

            # we shouldn't have a legend yet
            assert_true(ax.get_legend() is None)

            self.min_ord_results._plot_categorical_legend(
                ax, {'foo': 'red', 'bar': 'green'})

            # make sure we have a legend now
            legend = ax.get_legend()
            assert_true(legend is not None)

            # do some light sanity checking to make sure our input labels and
            # colors are present. we're not using
            # nose.tools.assert_items_equal because it isn't available in
            # Python 3.
            labels = [text.get_text() for text in legend.get_texts()]
            npt.assert_equal(sorted(labels), ['bar', 'foo'])

            colors = [line.get_color() for line in legend.get_lines()]
            npt.assert_equal(sorted(colors), ['green', 'red'])
        finally:
            # pyplot holds on to every figure it creates until the figure is
            # explicitly closed, so release it even when an assertion fails
            plt.close(fig)

    def test_repr_png(self):
        # `_repr_png_` must hand back a non-empty binary string.
        png_data = self.min_ord_results._repr_png_()
        assert_is_instance(png_data, binary_type)
        assert_true(len(png_data) > 0)

    def test_repr_svg(self):
        # `_repr_svg_` must hand back non-empty text or binary data.
        svg_data = self.min_ord_results._repr_svg_()
        # print_figure(format='svg') can return text or bytes depending on the
        # version of IPython, so either string type is accepted
        assert_true(isinstance(svg_data, (text_type, binary_type)))
        assert_true(len(svg_data) > 0)

    def test_png(self):
        # the `png` attribute should evaluate to an `Image` instance
        png = self.min_ord_results.png
        assert_is_instance(png, Image)

    def test_svg(self):
        # the `svg` attribute should evaluate to an `SVG` instance
        svg = self.min_ord_results.svg
        assert_is_instance(svg, SVG)