def biplot(output_dir: str, biplot: skbio.OrdinationResults,
           sample_metadata: qiime2.Metadata,
           feature_metadata: qiime2.Metadata = None,
           ignore_missing_samples: bool = False,
           invert: bool = False,
           number_of_features: int = 5) -> None:
    """Render a biplot keeping only the most important features.

    Parameters
    ----------
    output_dir : str
        Forwarded unchanged to ``generic_plot``.
    biplot : skbio.OrdinationResults
        Ordination to plot. NOTE: this object is modified in place; its
        ``features`` attribute is replaced by the trimmed table and, when
        `invert` is set, ``samples`` and ``features`` are exchanged.
    sample_metadata : qiime2.Metadata
        Forwarded to ``generic_plot`` as ``metadata``.
    feature_metadata : qiime2.Metadata, optional
        Forwarded to ``generic_plot`` as ``feature_metadata``.
    ignore_missing_samples : bool, optional
        Forwarded unchanged to ``generic_plot``.
    invert : bool, optional
        When true, samples and features trade places, and so do
        `sample_metadata` and `feature_metadata` (so a ``None``
        `feature_metadata` ends up being passed as ``metadata``).
    number_of_features : int, optional
        How many of the top-ranked feature rows are kept.
    """
    if invert:
        former_samples = biplot.samples
        former_features = biplot.features
        biplot.samples = former_features
        biplot.features = former_samples
        sample_metadata, feature_metadata = feature_metadata, sample_metadata

    # rank every feature row by its `euclidean` distance to an all-zeros
    # point, i.e. by the magnitude of the feature's vector
    loadings = biplot.features.copy()
    zero_point = np.zeros_like(loadings.columns)
    loadings['importance'] = loadings.apply(euclidean, axis=1,
                                            args=(zero_point, ))

    # the helper column is only needed for sorting, drop it right after
    ranked = loadings.sort_values('importance', ascending=False)
    ranked = ranked.drop(['importance'], axis=1)

    biplot.features = ranked[:number_of_features].copy()

    generic_plot(output_dir,
                 master=biplot,
                 other_pcoa=None,
                 ignore_missing_samples=ignore_missing_samples,
                 metadata=sample_metadata,
                 feature_metadata=feature_metadata,
                 plot_name='biplot')
def procrustes_analysis(
        reference: OrdinationResults, other: OrdinationResults,
        dimensions: int = 5, permutations: int = 999
        ) -> (OrdinationResults, OrdinationResults, pd.DataFrame):
    """Fit `other` onto `reference` and report a permutation summary.

    Parameters
    ----------
    reference : OrdinationResults
        Ordination whose sample order and axis labels are used for both
        outputs.
    other : OrdinationResults
        Ordination to fit. NOTE: ``other.samples`` is reassigned to a copy
        re-indexed to the sample order of `reference`.
    dimensions : int, optional
        Number of leading axes passed to ``procrustes`` and kept in the
        outputs.
    permutations : int, optional
        Forwarded to ``_procrustes_monte_carlo``.

    Returns
    -------
    OrdinationResults
        Built from `reference`, with the first matrix returned by
        ``procrustes`` as its samples and ``eigvals`` and
        ``proportion_explained`` cut to `dimensions`.
    OrdinationResults
        Same, built from `other` and the second matrix.
    pd.DataFrame
        Whatever ``_procrustes_monte_carlo`` returns.

    Raises
    ------
    ValueError
        If the two sample tables differ in shape, if `dimensions` exceeds
        the number of columns available, or if the two ordinations do not
        contain exactly the same sample identifiers.
    """
    ref_shape = reference.samples.shape
    if ref_shape != other.samples.shape:
        raise ValueError('The matrices cannot be fitted unless they have the '
                         'same dimensions')

    if ref_shape[1] < dimensions:
        raise ValueError('Cannot fit fewer dimensions than available')

    # any identifier present in only one of the two ordinations is an error
    unshared = reference.samples.index.symmetric_difference(
        other.samples.index)
    if not unshared.empty:
        raise ValueError('The ordinations represent two different sets of '
                         'samples')

    # align the rows of both matrices before fitting
    other.samples = other.samples.reindex(index=reference.samples.index)

    ref_coords = reference.samples.values[:, :dimensions]
    other_coords = other.samples.values[:, :dimensions]
    fitted_ref, fitted_other, m2_stat = procrustes(ref_coords, other_coords)

    sample_ids = reference.samples.index
    axis_labels = reference.samples.columns[:dimensions]

    def _fitted_frame(matrix):
        # each output gets its own copies of the row and column labels
        return pd.DataFrame(data=matrix, index=sample_ids.copy(),
                            columns=axis_labels.copy())

    def _trimmed_results(source, fitted_samples):
        # carry over everything from `source`, truncated to `dimensions`
        return OrdinationResults(
            short_method_name=source.short_method_name,
            long_method_name=source.long_method_name,
            eigvals=source.eigvals[:dimensions].copy(),
            samples=fitted_samples,
            features=source.features,
            biplot_scores=source.biplot_scores,
            sample_constraints=source.sample_constraints,
            proportion_explained=source.proportion_explained[:dimensions]
            .copy())

    ref_frame = _fitted_frame(fitted_ref)
    other_frame = _fitted_frame(fitted_other)

    info = _procrustes_monte_carlo(ref_coords, other_coords, m2_stat,
                                   permutations)

    out1 = _trimmed_results(reference, ref_frame)
    out2 = _trimmed_results(other, other_frame)
    return out1, out2, info
def procrustes_analysis(reference: OrdinationResults,
                        other: OrdinationResults,
                        dimensions: int=5) -> (OrdinationResults,
                                               OrdinationResults):
    """Fit `other` onto `reference` using ``procrustes``.

    Parameters
    ----------
    reference : OrdinationResults
        Ordination whose sample order and axis labels are used for both
        outputs.
    other : OrdinationResults
        Ordination to fit. NOTE: ``other.samples`` is reassigned to a copy
        re-indexed to the sample order of `reference`.
    dimensions : int, optional
        Number of leading axes passed to ``procrustes`` and kept in the
        outputs.

    Returns
    -------
    OrdinationResults
        Built from `reference`, with the first matrix returned by
        ``procrustes`` as its samples and ``eigvals`` and
        ``proportion_explained`` cut to `dimensions`.
    OrdinationResults
        Same, built from `other` and the second matrix.

    Raises
    ------
    ValueError
        If the two sample tables differ in shape, if `dimensions` exceeds
        the number of columns available, or if the two ordinations do not
        contain exactly the same sample identifiers.
    """
    if reference.samples.shape != other.samples.shape:
        raise ValueError('The matrices cannot be fitted unless they have the '
                         'same dimensions')

    if reference.samples.shape[1] < dimensions:
        raise ValueError('Cannot fit fewer dimensions than available')

    # fail if there are any elements in the symmetric difference; the
    # explicit method is used because ``index ^ index`` no longer acts as a
    # set operation in current pandas releases
    diff = reference.samples.index.symmetric_difference(other.samples.index)
    if not diff.empty:
        raise ValueError('The ordinations represent two different sets of '
                         'samples')

    # make the matrices be comparable
    other.samples = other.samples.reindex(index=reference.samples.index)

    # the third value returned by ``procrustes`` is not needed here
    mtx1, mtx2, _ = procrustes(reference.samples.values[:, :dimensions],
                               other.samples.values[:, :dimensions])

    axes = reference.samples.columns[:dimensions]
    samples1 = pd.DataFrame(data=mtx1,
                            index=reference.samples.index.copy(),
                            columns=axes.copy())
    samples2 = pd.DataFrame(data=mtx2,
                            index=reference.samples.index.copy(),
                            columns=axes.copy())

    out1 = OrdinationResults(
        short_method_name=reference.short_method_name,
        long_method_name=reference.long_method_name,
        eigvals=reference.eigvals[:dimensions].copy(),
        samples=samples1,
        features=reference.features,
        biplot_scores=reference.biplot_scores,
        sample_constraints=reference.sample_constraints,
        proportion_explained=reference.proportion_explained[:dimensions]
        .copy())
    out2 = OrdinationResults(
        short_method_name=other.short_method_name,
        long_method_name=other.long_method_name,
        eigvals=other.eigvals[:dimensions].copy(),
        samples=samples2,
        features=other.features,
        biplot_scores=other.biplot_scores,
        sample_constraints=other.sample_constraints,
        proportion_explained=other.proportion_explained[:dimensions]
        .copy())
    return out1, out2
def scatterplot(df, x=None, y=None, z=None, remote=True):
    """Build an Emperor scatter plot out of a Pandas DataFrame

    Parameters
    ----------
    df : pd.DataFrame
        Table holding both the *metadata* and the numeric *coordinates* used
        to position each row in the plot.
    x, y, z : str, optional
        Names of numeric columns in `df` to place first (``x``), second
        (``y``) and third (``z``). The remaining numeric columns follow in
        decreasing order of variance; when none of these are given all the
        numeric columns are ordered by variance.
    remote : bool, optional
        Whether the JavaScript resources should be loaded locally or from
        GitHub. Defaults to ``True``. Forwarded to ``Emperor``.

    Returns
    -------
    emperor.core.Emperor
        Emperor object created from an ``OrdinationResults`` with the
        (rescaled) numeric columns as samples, and the rows of `df` that
        survived the removal of missing data as metadata.

    Raises
    ------
    ValueError
        If `df` is not a Pandas DataFrame.
        If `x`, `y` or `z` are not columns in `df`, or are not numeric
        columns.
        If `df` has fewer than 3 numeric columns.

    Notes
    -----
    Rows with a missing value in any numeric column are left out of the
    visualization.

    See Also
    --------
    emperor.core.Emperor
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The argument is not a Pandas DataFrame")

    # the requested axes are validated (and later moved to the front) in
    # z, y, x order, that way x ends up first
    requested = [axis for axis in (z, y, x) if axis is not None]
    for axis in requested:
        if axis not in df.columns:
            raise ValueError("'%s' is not a column in the DataFrame" % axis)
        if not np.issubdtype(df[axis].dtype, np.number):
            raise ValueError("'%s' is not a numeric column" % axis)

    # keep the numeric columns only and discard rows with any NA
    numeric = df.select_dtypes(include=[np.number]).copy()
    numeric = numeric.dropna(axis=0, how='any')

    if numeric.shape[1] < 3:
        raise ValueError("Not enough data to plot")

    # columns with the largest variance go first
    variance = numeric.var().sort_values(ascending=False)
    numeric = numeric[variance.index]

    # bring the requested axes to the front
    column_order = numeric.columns.tolist()
    for axis in requested:
        column_order.remove(axis)
        column_order.insert(0, axis)
    numeric = numeric[column_order]

    # only the rows that still have coordinates are kept as metadata
    df = df.loc[numeric.index]

    ores = OrdinationResults(short_method_name='', long_method_name='',
                             eigvals=np.zeros_like(numeric.columns),
                             samples=numeric,
                             proportion_explained=variance)

    df.index.name = '#SampleID'

    # HACK: divide every column by its maximum so the samples fit better
    # within the screen
    column_maxima = ores.samples.max(axis=0)
    ores.samples = ores.samples / column_maxima

    n_axes = len(ores.samples.columns)
    return Emperor(ores, df, dimensions=n_axes, remote=remote)