Beispiel #1
0
def biplot(output_dir: str,
           biplot: skbio.OrdinationResults,
           sample_metadata: qiime2.Metadata,
           feature_metadata: qiime2.Metadata = None,
           ignore_missing_samples: bool = False,
           invert: bool = False,
           number_of_features: int = 5) -> None:
    """Plot an ordination biplot restricted to its strongest features.

    Features are ranked by the Euclidean length of their coordinate vector
    (distance to the origin) and only the ``number_of_features`` longest
    ones are kept before the ordination is handed to ``generic_plot``.

    Parameters
    ----------
    output_dir : str
        Forwarded unchanged to ``generic_plot``.
    biplot : skbio.OrdinationResults
        Ordination with both ``samples`` and ``features`` coordinates. The
        object passed in is not modified; a shallow copy is adjusted and
        plotted instead.
    sample_metadata : qiime2.Metadata
        Forwarded to ``generic_plot`` as ``metadata``.
    feature_metadata : qiime2.Metadata, optional
        Forwarded to ``generic_plot`` as ``feature_metadata``.
    ignore_missing_samples : bool, optional
        Forwarded unchanged to ``generic_plot``.
    invert : bool, optional
        When ``True`` the sample and feature coordinates trade places, and
        so do the two metadata arguments.
    number_of_features : int, optional
        How many of the top-ranked features are kept.

    Notes
    -----
    NOTE(review): with ``invert=True`` and no ``feature_metadata``,
    ``generic_plot`` receives ``metadata=None`` -- confirm that this is
    handled (or rejected) downstream.
    """
    # function-scope import: keeps this block independent of the module's
    # import section
    import copy

    # Work on a shallow copy so the attribute swap and the feature
    # truncation below never leak back into the caller's ordination.
    ordination = copy.copy(biplot)

    if invert:
        ordination.samples, ordination.features = (ordination.features,
                                                   ordination.samples)
        sample_metadata, feature_metadata = feature_metadata, sample_metadata

    # select the top N most important features based on the vector's magnitude
    feats = ordination.features.copy()
    # One float zero per axis. Deriving the origin from the column labels
    # (zeros_like) would inherit their dtype, e.g. object for string names.
    origin = np.zeros(feats.shape[1])
    feats['importance'] = feats.apply(euclidean, axis=1, args=(origin, ))
    ranked = feats.sort_values('importance', ascending=False)
    ranked = ranked.drop(['importance'], axis=1)
    ordination.features = ranked[:number_of_features].copy()

    generic_plot(output_dir,
                 master=ordination,
                 other_pcoa=None,
                 ignore_missing_samples=ignore_missing_samples,
                 metadata=sample_metadata,
                 feature_metadata=feature_metadata,
                 plot_name='biplot')
Beispiel #2
0
def procrustes_analysis(
    reference: OrdinationResults,
    other: OrdinationResults,
    dimensions: int = 5,
    permutations: int = 999
) -> (OrdinationResults, OrdinationResults, pd.DataFrame):
    """Fit ``other`` onto ``reference`` with a Procrustes transformation.

    Only the first ``dimensions`` axes of both ordinations take part in the
    fit. Neither input object is modified.

    Parameters
    ----------
    reference : OrdinationResults
        Ordination whose sample order and axis labels are used for both
        outputs.
    other : OrdinationResults
        Ordination to fit; its samples are re-ordered to match
        ``reference`` before fitting.
    dimensions : int, optional
        Number of leading axes to fit.
    permutations : int, optional
        Forwarded to ``_procrustes_monte_carlo``.

    Returns
    -------
    OrdinationResults
        Copy of ``reference`` with the transformed sample coordinates and
        with ``eigvals`` / ``proportion_explained`` cut to ``dimensions``.
    OrdinationResults
        Same for ``other``.
    pd.DataFrame
        Whatever ``_procrustes_monte_carlo`` returns for the two input
        matrices, the fit's disparity and ``permutations``.

    Raises
    ------
    ValueError
        If the two sample tables differ in shape, if ``dimensions`` exceeds
        the number of axes in ``reference``, or if the two ordinations do
        not contain exactly the same sample identifiers.

    Notes
    -----
    ``features``, ``biplot_scores`` and ``sample_constraints`` are passed
    to the outputs as-is (same objects, not truncated).
    """
    ref_samples = reference.samples

    if ref_samples.shape != other.samples.shape:
        raise ValueError('The matrices cannot be fitted unless they have the '
                         'same dimensions')

    if ref_samples.shape[1] < dimensions:
        raise ValueError('Cannot fit fewer dimensions than available')

    # fail if there are any elements in the symmetric difference
    diff = ref_samples.index.symmetric_difference(other.samples.index)
    if not diff.empty:
        raise ValueError('The ordinations represent two different sets of '
                         'samples')

    # Make the matrices comparable. The aligned table is kept in a local
    # name so the caller's ``other`` keeps its original row order.
    other_samples = other.samples.reindex(index=ref_samples.index)

    ref_coords = ref_samples.values[:, :dimensions]
    other_coords = other_samples.values[:, :dimensions]
    mtx1, mtx2, m2 = procrustes(ref_coords, other_coords)

    axes = ref_samples.columns[:dimensions]
    samples1 = pd.DataFrame(data=mtx1,
                            index=ref_samples.index.copy(),
                            columns=axes.copy())
    # rows of mtx2 follow the reference order because of the reindex above
    samples2 = pd.DataFrame(data=mtx2,
                            index=ref_samples.index.copy(),
                            columns=axes.copy())

    info = _procrustes_monte_carlo(ref_coords, other_coords, m2,
                                   permutations)

    out1 = OrdinationResults(
        short_method_name=reference.short_method_name,
        long_method_name=reference.long_method_name,
        eigvals=reference.eigvals[:dimensions].copy(),
        samples=samples1,
        features=reference.features,
        biplot_scores=reference.biplot_scores,
        sample_constraints=reference.sample_constraints,
        proportion_explained=reference.proportion_explained[:dimensions]
        .copy())
    out2 = OrdinationResults(
        short_method_name=other.short_method_name,
        long_method_name=other.long_method_name,
        eigvals=other.eigvals[:dimensions].copy(),
        samples=samples2,
        features=other.features,
        biplot_scores=other.biplot_scores,
        sample_constraints=other.sample_constraints,
        proportion_explained=other.proportion_explained[:dimensions].copy())
    return out1, out2, info
Beispiel #3
0
def procrustes_analysis(reference: OrdinationResults, other: OrdinationResults,
                        dimensions: int = 5) -> (OrdinationResults,
                                                 OrdinationResults):
    """Fit ``other`` onto ``reference`` with a Procrustes transformation.

    Only the first ``dimensions`` axes of both ordinations take part in the
    fit. Neither input object is modified.

    Parameters
    ----------
    reference : OrdinationResults
        Ordination whose sample order and axis labels are used for both
        outputs.
    other : OrdinationResults
        Ordination to fit; its samples are re-ordered to match
        ``reference`` before fitting.
    dimensions : int, optional
        Number of leading axes to fit.

    Returns
    -------
    OrdinationResults
        Copy of ``reference`` with the transformed sample coordinates and
        with ``eigvals`` / ``proportion_explained`` cut to ``dimensions``.
    OrdinationResults
        Same for ``other``.

    Raises
    ------
    ValueError
        If the two sample tables differ in shape, if ``dimensions`` exceeds
        the number of axes in ``reference``, or if the two ordinations do
        not contain exactly the same sample identifiers.

    Notes
    -----
    ``features``, ``biplot_scores`` and ``sample_constraints`` are passed
    to the outputs as-is (same objects, not truncated).
    """
    ref_samples = reference.samples

    if ref_samples.shape != other.samples.shape:
        raise ValueError('The matrices cannot be fitted unless they have the '
                         'same dimensions')

    if ref_samples.shape[1] < dimensions:
        raise ValueError('Cannot fit fewer dimensions than available')

    # Fail if there are any elements in the symmetric difference. The named
    # method is used because ``^`` between two Index objects is no longer a
    # set operation in current pandas (it is an element-wise XOR).
    mismatched = ref_samples.index.symmetric_difference(other.samples.index)
    if not mismatched.empty:
        raise ValueError('The ordinations represent two different sets of '
                         'samples')

    # Make the matrices comparable. The aligned table is kept in a local
    # name so the caller's ``other`` keeps its original row order.
    other_samples = other.samples.reindex(index=ref_samples.index)

    mtx1, mtx2, _ = procrustes(ref_samples.values[:, :dimensions],
                               other_samples.values[:, :dimensions])

    axes = ref_samples.columns[:dimensions]
    samples1 = pd.DataFrame(data=mtx1,
                            index=ref_samples.index.copy(),
                            columns=axes.copy())
    # rows of mtx2 follow the reference order because of the reindex above
    samples2 = pd.DataFrame(data=mtx2,
                            index=ref_samples.index.copy(),
                            columns=axes.copy())

    out1 = OrdinationResults(
            short_method_name=reference.short_method_name,
            long_method_name=reference.long_method_name,
            eigvals=reference.eigvals[:dimensions].copy(),
            samples=samples1,
            features=reference.features,
            biplot_scores=reference.biplot_scores,
            sample_constraints=reference.sample_constraints,
            proportion_explained=reference.proportion_explained[:dimensions]
            .copy())
    out2 = OrdinationResults(
            short_method_name=other.short_method_name,
            long_method_name=other.long_method_name,
            eigvals=other.eigvals[:dimensions].copy(),
            samples=samples2,
            features=other.features,
            biplot_scores=other.biplot_scores,
            sample_constraints=other.sample_constraints,
            proportion_explained=other.proportion_explained[:dimensions]
            .copy())
    return out1, out2
Beispiel #4
0
def scatterplot(df, x=None, y=None, z=None, remote=True):
    """Create an Emperor scatter plot from a Pandas DataFrame

    Parameters
    ----------
    df : pd.DataFrame
        Pandas DataFrame with the data to display, this includes both
        *metadata* and *coordinates* to position the samples in a 3D space.
        Only the numeric columns are used as coordinates.
    x, y, z : str, optional
        Column names in `df`, to use as first (``x``), second (``y``) and third
        (``z``) axes in the visualization. Axes that are not specified are
        filled in from the remaining numeric columns, ordered by decreasing
        variance.
    remote : bool, optional
        Whether the JavaScript resources should be loaded locally or from
        GitHub. Defaults to ``True``.

    Returns
    -------
    emperor.core.Emperor
        Emperor object with the numerical data as the `ordination` attribute
        and the entire DataFrame as the `mf` attribute.

    Raises
    ------
    ValueError
        If `df` is not a Pandas DataFrame.
        If `x`, `y` or `z` are missing from `df` or if they are not numeric
        columns.
        If `df` has fewer than 3 numeric columns.

    Notes
    -----
    If a row has missing data in any numeric column, that data point will be
    removed from the visualization.

    See Also
    --------
    emperor.core.Emperor
    """

    if not isinstance(df, pd.DataFrame):
        raise ValueError("The argument is not a Pandas DataFrame")

    # The numeric selection is the single source of truth for what counts as
    # a coordinate column: the axis validation below and the data that gets
    # plotted both use it. Testing membership here (instead of probing the
    # dtype with numpy) also copes with pandas extension dtypes such as
    # categoricals, which numpy cannot interpret.
    numeric = df.select_dtypes(include=[np.number])

    for col in (z, y, x):
        if col is None:
            continue

        if col not in df.columns:
            raise ValueError("'%s' is not a column in the DataFrame" % col)

        if col not in numeric.columns:
            raise ValueError("'%s' is not a numeric column" % col)

    # remove NAs (on a copy, the caller's frame is left alone)
    samples = numeric.copy()
    samples.dropna(axis=0, how='any', inplace=True)

    if len(samples.columns) < 3:
        raise ValueError("Not enough data to plot")

    # sort columns by variance
    variance = samples.var().sort_values(ascending=False)
    samples = samples[variance.index]

    # re-order x, y and z: walking z, y, x and moving each one to the front
    # leaves x first, then y, then z
    ordered = samples.columns.tolist()
    for col in (z, y, x):
        if col is not None:
            ordered.remove(col)
            ordered.insert(0, col)
    samples = samples[ordered]

    # match up the metadata and coordinates
    df = df.loc[samples.index]

    ores = OrdinationResults(short_method_name='',
                             long_method_name='',
                             eigvals=np.zeros_like(samples.columns),
                             samples=samples,
                             proportion_explained=variance)

    df.index.name = '#SampleID'

    # HACK: scale the position of the samples to fit better within the screen
    ores.samples = ores.samples / ores.samples.max(axis=0)

    return Emperor(ores,
                   df,
                   dimensions=len(ores.samples.columns),
                   remote=remote)