Beispiel #1
0
def test_computed_y_column():
    """Overlay a shifted ECDF whose ``y`` is derived from the computed ``y``.

    The first layer draws the plain ECDF of ``x``; the second maps ``y`` to
    ``after_stat('y-0.2')`` and is drawn in blue.

    NOTE(review): comparing a ggplot object with a string presumably relies
    on an equality hook installed by the test suite (e.g. a baseline-image
    comparison) -- confirm in the test configuration.
    """
    base_ecdf = ggplot(df, aes('x')) + stat_ecdf(size=2)
    # Should be able to use the computed y column & create a
    # new mapped column also called y
    shifted_ecdf = stat_ecdf(aes(y=after_stat('y-0.2')), size=2, color='blue')
    p = base_ecdf + shifted_ecdf
    assert p == 'computed_y_column'
Beispiel #2
0
def test_ecdf():
    """Draw the ECDF of ``x`` and compare it with the 'ecdf' reference.

    NOTE(review): comparing a ggplot object with a string presumably relies
    on an equality hook installed by the test suite (e.g. a baseline-image
    comparison) -- confirm in the test configuration.
    """
    base = ggplot(df, aes('x'))
    p = base + stat_ecdf(size=2)

    assert p == 'ecdf'
Beispiel #3
0
import pandas as pd

from plotnine import ggplot, aes, after_stat, stat_ecdf


# Shared test data: a single column 'x' holding the integers 0 through 9.
df = pd.DataFrame({'x': range(10)})
# NOTE(review): this module-level plot is not referenced by the tests visible
# in this file (each test builds its own local `p`) -- confirm whether it is
# still needed.
p = ggplot(df, aes('x')) + stat_ecdf(size=2)


def test_ecdf():
    """Draw the ECDF of ``x`` and compare it with the 'ecdf' reference.

    NOTE(review): comparing a ggplot object with a string presumably relies
    on an equality hook installed by the test suite (e.g. a baseline-image
    comparison) -- confirm in the test configuration.
    """
    base = ggplot(df, aes('x'))
    p = base + stat_ecdf(size=2)

    assert p == 'ecdf'


def test_computed_y_column():
    """Overlay a shifted ECDF whose ``y`` is derived from the computed ``y``.

    The first layer draws the plain ECDF of ``x``; the second maps ``y`` to
    ``after_stat('y-0.2')`` and is drawn in blue.

    NOTE(review): comparing a ggplot object with a string presumably relies
    on an equality hook installed by the test suite (e.g. a baseline-image
    comparison) -- confirm in the test configuration.
    """
    base_ecdf = ggplot(df, aes('x')) + stat_ecdf(size=2)
    # Should be able to use the computed y column & create a
    # new mapped column also called y
    shifted_ecdf = stat_ecdf(aes(y=after_stat('y-0.2')), size=2, color='blue')
    p = base_ecdf + shifted_ecdf
    assert p == 'computed_y_column'
Beispiel #4
0
def plot_ecdf(df_plot,
              variable_column,
              color_column='none',
              output_file='plot_distribution',
              facet_column='none',
              x_log10=False):
    """Save an empirical cumulative distribution plot as a png.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        DataFrame with <variable_column> as a column.
    variable_column : string
        Column mapped to the x axis.
    color_column : string
        Column mapped to line color. The string 'none' disables coloring.
    output_file : string
        Basename of the output file; '.png' is appended.
    facet_column : string
        Column to facet the plot by. The string 'none' disables faceting.
    x_log10 : bool
        If true, the x scale is created with trans='log10'.

    Returns
    -------
    int
        Always 0.
    """
    if color_column == 'none':
        n_colors = 0
        fig = plt9.ggplot(df_plot, plt9.aes(x=variable_column))
    else:
        fig = plt9.ggplot(
            df_plot,
            plt9.aes(x=variable_column, color=color_column))
        n_colors = df_plot[color_column].nunique()

    fig = fig + plt9.theme_bw() + plt9.stat_ecdf(alpha=0.8)

    # Both axes use minor_breaks=0; only the x axis is optionally log10.
    x_scale_args = {'minor_breaks': 0}
    if x_log10:
        x_scale_args['trans'] = 'log10'
    fig = fig + plt9.scale_x_continuous(**x_scale_args)
    fig = fig + plt9.scale_y_continuous(minor_breaks=0)
    fig = fig + plt9.labs(y='Cumulative density', title='')

    if n_colors > 20:
        # More than 20 colors: drop the legend entirely.
        fig = fig + plt9.theme(legend_position='none')
    elif 0 < n_colors < 9:
        # Fewer than 9 colors: use the qualitative Dark2 palette.
        fig = fig + plt9.scale_colour_brewer(palette='Dark2', type='qual')

    png_path = '{}.png'.format(output_file)
    if facet_column == 'none':
        fig.save(png_path, dpi=300, width=4, height=4)
        return 0

    fig = fig + plt9.facet_wrap('~ {}'.format(facet_column), ncol=5)
    # Figure size grows with the number of facets (a quarter of it per unit).
    size_factor = df_plot[facet_column].nunique() / 4
    fig.save(png_path,
             dpi=300,
             width=6 * size_factor,
             height=4 * size_factor,
             limitsize=False)
    return 0
Beispiel #5
0
def _strip_tsv_suffix(path):
    """Return the basename of ``path`` without a trailing .tsv / .tsv.gz."""
    base = os.path.basename(path)
    # Remove whole suffixes only. str.rstrip('tsv.gz') treats its argument as
    # a *set of characters*, so it would also eat trailing letters of the
    # stem (e.g. 'cells.tsv.gz' -> 'cell').
    for suffix in ('.gz', '.tsv'):
        if base.endswith(suffix):
            base = base[:-len(suffix)]
    return base


def main():
    """Run CLI.

    Reads a metadata table and one or more reduced-dimension tables (all
    tab-delimited, indexed by a ``cell_barcode`` column), computes LISI for
    every requested categorical metadata column on every reduced-dimension
    table, and writes:

    * ``<base>.tsv.gz`` -- long table of LISI values with the columns
      ``cell_barcode``, one column per metadata variable, ``file``, ``label``.
    * ``<base>-<column>-violin.png`` -- violin + box plot per metadata column.
    * ``<base>-<column>-ecdf.pdf`` -- ECDF plot per metadata column.

    ``<base>`` is ``--output_file`` or, when that is empty, the metadata file
    basename without its .tsv/.tsv.gz suffix followed by ``-lisi``.

    Exits through ``parser.error`` (status 2) when the number of
    reduced-dimension files and labels differ.
    """
    parser = argparse.ArgumentParser(description="""
            Calculate and compare LISI across a series of reduced dims and
            categorical variables.
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '-rf',
        '--reduced_dims_tsv',
        action='store',
        dest='reduced_dims',
        required=True,
        help='List of tab-delimited files of reduced dimensions (e.g., PCs)\
            for each cell. First column is cell_barcode. List should be\
            split by "::" (e.g. file1.tsv.gz::file2.tsv.gz).')

    parser.add_argument(
        '-lbl',
        '--reduced_dims_tsv_labels',
        action='store',
        dest='reduced_dims_labels',
        required=True,
        help='String of labels for each reduced_dims_tsv file. List should be\
            split by "::".')

    parser.add_argument(
        '-mf',
        '--metadata_tsv',
        action='store',
        dest='metadata_tsv',
        required=True,
        help='Tab-delimited file of metadata for each cell. First column\
            is cell_barcode.')

    parser.add_argument(
        '-mv',
        '--metadata_columns',
        action='store',
        dest='metadata_columns',
        default='experiment_id',
        help='Comma separated string of categorical variables to calculate\
            LISI with.\
            (default: %(default)s)')

    parser.add_argument(
        '-p',
        '--perplexity',
        action='store',
        dest='perplexity',
        default=30.0,
        type=float,
        help='Perplexity.\
            (default: %(default)s)')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: <metadata_tsv>-lisi)')

    options = parser.parse_args()

    # Validate the paired inputs before any file is read. An explicit check
    # is used because `assert` statements are stripped under `python -O`.
    files = options.reduced_dims.split('::')
    labels = options.reduced_dims_labels.split('::')
    if len(files) != len(labels):
        parser.error(
            'check files and labels input: got {} file(s) but {} '
            'label(s)'.format(len(files), len(labels)))

    # Get the out file base.
    out_file_base = options.of
    if out_file_base == '':
        out_file_base = '{}-lisi'.format(
            _strip_tsv_suffix(options.metadata_tsv))

    # Get the columns to use; each is read as a pandas categorical so that
    # its categories can be counted below.
    lisi_columns = options.metadata_columns.split(',')
    lisi_columns_dtype = {col: 'category' for col in lisi_columns}

    # Load the metadata file
    df_meta = pd.read_csv(options.metadata_tsv,
                          sep='\t',
                          index_col='cell_barcode',
                          dtype=lisi_columns_dtype)

    # Theoretical maximum LISI value for each column: its category count.
    lisi_limit = {
        col: len(df_meta[col].cat.categories) for col in lisi_columns
    }

    list_lisi = []
    for file_path, label in zip(files, labels):
        df_reduced_dims = pd.read_csv(file_path,
                                      sep='\t',
                                      index_col='cell_barcode')

        # Run lisi and save results to dataframe. Rows are re-ordered to the
        # metadata index so both inputs describe the same cells in the same
        # order. The user-supplied perplexity is forwarded; previously the
        # option was parsed but never used.
        # NOTE(review): assumes `hm` is harmonypy, whose compute_lisi accepts
        # a `perplexity` keyword -- confirm against the file's imports.
        _df_lisi = pd.DataFrame(
            hm.compute_lisi(df_reduced_dims.loc[df_meta.index, :],
                            df_meta[lisi_columns],
                            lisi_columns,
                            perplexity=options.perplexity),
            columns=lisi_columns)
        _df_lisi['file'] = file_path
        _df_lisi['label'] = label
        _df_lisi['cell_barcode'] = df_meta.index
        list_lisi.append(_df_lisi)

    # Make one long dataframe.
    df_lisi = pd.concat(list_lisi)
    # Make cell_barcode (added last above) the first column.
    cols = list(df_lisi.columns)
    cols = [cols[-1]] + cols[:-1]

    # Save the results
    df_lisi[cols].to_csv('{}.tsv.gz'.format(out_file_base),
                         sep='\t',
                         index=False,
                         quoting=csv.QUOTE_NONNUMERIC,
                         na_rep='',
                         compression='gzip')

    # Compare the lisi distributions
    n_labels = len(labels)
    for lisi_column in lisi_columns:
        # Make density plot.
        gplt = plt9.ggplot(df_lisi,
                           plt9.aes(
                               fill='label',
                               x='label',
                               y=lisi_column,
                           ))
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.geom_violin(alpha=0.9)
        gplt = gplt + plt9.geom_boxplot(
            group='label',
            position=plt9.position_dodge(width=.9),
            width=.1,
            fill='white',
            outlier_alpha=0  # Do not know how to totally remove outliers.
        )
        # Add a line at the theoretical maximum
        gplt = gplt + plt9.geom_hline(
            plt9.aes(yintercept=lisi_limit[lisi_column]))
        gplt = gplt + plt9.labs(x='Reduced dimensions', y='LISI', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt = gplt + plt9.theme(legend_position='none')
        # str.split always yields at least one label, so only the upper
        # bound needs checking before using the Dark2 palette.
        if n_labels < 9:
            gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
        gplt.save(
            '{}-{}-violin.png'.format(out_file_base, lisi_column),
            dpi=300,
            width=4 * (n_labels / 4),
            height=10,
            limitsize=False)

        # Make ecdf.
        gplt = plt9.ggplot(df_lisi, plt9.aes(
            x=lisi_column,
            color='label',
        ))
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.stat_ecdf(alpha=0.8)
        gplt = gplt + plt9.labs(
            x='LISI',
            y='Cumulative density',
            title='')
        if n_labels < 9:
            gplt = gplt + plt9.scale_color_brewer(palette='Dark2', type='qual')
        gplt.save('{}-{}-ecdf.pdf'.format(out_file_base, lisi_column),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)
Beispiel #6
0
def test_ecdf():
    """Draw the ECDF of ``x`` and compare it with the 'ecdf' reference.

    NOTE(review): comparing a ggplot object with a string presumably relies
    on an equality hook installed by the test suite (e.g. a baseline-image
    comparison) -- confirm in the test configuration.
    """
    base = ggplot(df, aes('x'))
    p = base + stat_ecdf(size=2)

    assert p == 'ecdf'
Beispiel #7
0
# ## Univariate, Continuous Distribution

# ### Histogram

# Rows with a missing "age" are dropped before plotting; bins are 5 wide.
(
    p9.ggplot(df[~df["age"].isna()], p9.aes(x="age"))
    + p9.geom_histogram(binwidth=5)
    + p9.ggtitle("Histogram")
)


# ### ECDF

# Same filtered data, drawn as an empirical cumulative distribution.
(
    p9.ggplot(df[~df["age"].isna()], p9.aes(x="age"))
    + p9.stat_ecdf()
    + p9.ggtitle("ECDF")
)


# ## Continuous Distribution, grouped by Categorical

# ### Box Plots

# One box of "age" per "sex"; coord_flip swaps the axes.
(
    p9.ggplot(df[~df["age"].isna()], p9.aes(x="sex", y="age"))
    + p9.geom_boxplot()
    + p9.coord_flip()
    + p9.ggtitle("Box plot")
)