Example #1
def test_quantiles_width_dodge():
    p = (ggplot(df, aes('x')) +
         geom_violin(aes(y='y'), draw_quantiles=[.25, .75], size=2) +
         geom_violin(aes(y='y+25'), color='green', width=0.5, size=2) +
         geom_violin(aes(y='y+50', fill='factor(y%2)'), size=2) +
         theme(subplots_adjust={'right': 0.85}))
    assert p == 'quantiles_width_dodge'
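These test snippets reference a module-level df fixture that the excerpt never shows. A plausible stand-in, assuming four categorical classes of m points each (Example #3 below slices with df[:2 * m]); the real fixture in the source repository may differ:

import numpy as np
import pandas as pd
from plotnine import *  # noqa: F401,F403

m = 10  # points per class; hypothetical, only its existence is implied by Example #3
df = pd.DataFrame({
    'x': np.repeat(['A', 'B', 'C', 'D'], m),                           # 4 classes
    'y': np.tile(np.arange(1, m + 1), 4) + np.repeat([0, 2, 4, 6], m),
})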
Example #2
def test_quantiles_width_dodge():
    p = (ggplot(df, aes('x')) +
         geom_violin(aes(y='y'),
                     draw_quantiles=[.25, .75], size=2) +
         geom_violin(aes(y='y+25'), color='green',
                     width=0.5, size=2) +
         geom_violin(aes(y='y+50', fill='factor(y%2)'),
                     size=2) +
         theme(subplots_adjust={'right': 0.85}))
    assert p == 'quantiles_width_dodge'
Example #3
class TestAesthetics:
    p = (ggplot(df, aes('x')) + geom_violin(aes(y='y'), size=2) +
         geom_violin(df[:2 * m], aes(y='y+25', fill='x'), size=2) +
         geom_violin(df[2 * m:], aes(y='y+25', color='x'), size=2) +
         geom_violin(df[2 * m:], aes(y='y+50', linetype='x'), size=2))

    def test_aesthetics(self):
        assert self.p == 'aesthetics'

    def test_aesthetics_coordflip(self):
        assert self.p + coord_flip() == 'aesthetics+coord_flip'
Example #4
def test_scale():
    p = (ggplot(df, aes('x')) +
         # Red should envelop blue
         geom_violin(aes(y='y'), scale='width',
                     color='red', fill='red', size=2) +
         geom_violin(aes(y='y'), scale='area',
                     color='blue', fill='blue', size=2) +
         geom_violin(df[:36], aes(y='y+25'), scale='count',
                     color='green', size=2) +
         # Yellow should envelop green
         geom_violin(aes(y='y+25'), scale='count',
                     color='yellow', fill='yellow', size=2) +
         geom_violin(df[:36], aes(y='y+25'), scale='count',
                     color='green', fill='green', size=2))
    assert p == 'scale'
Example #5
def plot_auc(read_file_1, read_file_2, plot_dir, save_file, generate_auc):
    # read in data
    temp_sub = pd.read_csv(os.path.join(dir_output, read_file_1))
    temp_agg = pd.read_csv(os.path.join(dir_output, read_file_2))

    # subset agg model to match sub models
    temp_agg = subset_agg(temp_sub=temp_sub, temp_agg=temp_agg)

    # recode outcome
    temp_agg = recode_outcome(temp_dat=temp_agg)
    temp_sub = recode_outcome(temp_dat=temp_sub)

    if generate_auc:
        # get auc
        temp_sub = get_auc(temp_sub)
        temp_agg = get_auc(temp_agg)

    # remove NA
    temp_sub = temp_sub.dropna().reset_index(drop=True)
    temp_agg = temp_agg.dropna().reset_index(drop=True)

    # create new variable to indicate if agg or sub data
    temp_sub.insert(0, 'model', 'CPT specific')
    temp_agg.insert(0, 'model', 'Aggregate')

    # get output file
    plot_output = os.path.join(dir_figures, plot_dir)
    # combine data
    dat = pd.concat([temp_agg, temp_sub], axis=0).reset_index(drop=True)
    img = (ggplot(dat, aes(x='outcome', y='auc', fill='model')) +
           # draw_quantiles is a geom parameter, not an aesthetic;
           # the quartile positions used here are an assumption
           geom_violin(draw_quantiles=[0.25, 0.5, 0.75]) +
           labs(x='Outcome', y='AUROC') + theme_bw())
    img.save(os.path.join(plot_output, save_file))
Example #6
def plot_violin_plots(
    par_id: str,
    dims: List[str],
    draws: Dict,
    log_scale_variables: List[str],
    units: Dict[str, str],
    confidence_intervals,
    measurements,
):
    """Plot and save violin plots of parsed distributions.

    :param par_id: Name of the parameter plotted
    :param dims: Dimensions of the parameter
    :param draws: pd.DataFrame of the parameter distribution, indexed by the
    dimensions and containing the population samples
    :param log_scale_variables: Parameters that are log-distributed
    :param units: Dictionary of units for each parameter
    :param confidence_intervals: Dict mapping parameter ids to dataframes
    with lower_ci/upper_ci columns (inferred from usage below)
    :param measurements: Dict mapping parameter ids to dataframes with a
    measurement column (inferred from usage below)
    """
    par_units = units[par_id]
    x = fill = dims[0] if len(dims) <= 1 else "experiments"
    plot = (p9.ggplot(data=draws) + p9.geom_violin(
        p9.aes(y=f"{par_id}", x=x, fill=fill),
        position="identity",
        color="None",
        size=0.5,
        alpha=0.7,
        weight=0.7,
        linetype="None",
    ) + p9.labels.ylab(f"{par_id} {par_units}"))
    if par_id in confidence_intervals.keys():
        plot += p9.geoms.geom_errorbar(
            p9.aes(x=x, ymin="lower_ci", ymax="upper_ci"),
            data=confidence_intervals[par_id],
            width=0.1,
        )
    if par_id in measurements.keys():
        if len(measurements[par_id]) > 0:
            plot += p9.geoms.geom_point(
                p9.aes(y="measurement", x=x),
                data=measurements[par_id],
            )
    if len(dims) == 1:
        plot += p9.themes.theme(axis_text_x=p9.element_text(angle=70))
    if len(dims) > 1:
        plot += p9.facet_wrap(f"~{dims[1]}") + p9.themes.theme(
            panel_spacing_y=0.05,
            panel_spacing_x=0.35,
            axis_title=p9.element_text(size=10),
            axis_text=p9.element_text(size=11),
            axis_text_y=p9.element_text(size=8, angle=45),
            axis_title_x=p9.element_blank(),
            axis_text_x=p9.element_blank(),
        )
    if par_id in log_scale_variables:
        plot += p9.scale_y_log10()

    return plot
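A minimal call sketch for plot_violin_plots; the parameter name kcat, the toy draws frame, and the units dict are invented for illustration and are not from the source project:

import numpy as np
import pandas as pd

draws = pd.DataFrame({
    'kcat': np.random.lognormal(0.0, 0.3, 400),       # hypothetical samples
    'enzyme': np.tile(['E1', 'E2', 'E3', 'E4'], 100),
})
plot = plot_violin_plots(
    par_id='kcat', dims=['enzyme'], draws=draws,
    log_scale_variables=['kcat'], units={'kcat': '(1/s)'},
    confidence_intervals={}, measurements={})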
Example #7
def make_violin_plot(data, x, y, color):
    """
        Make a violin plot of data[y] against data[x], colored by data[color]
    """
    # use the parameters instead of the hardcoded globals, and return the plot
    return (p9.ggplot(data=data,
                      mapping=p9.aes(x='factor({})'.format(x), y=y, color=color))
            + p9.geom_violin()
            + p9.theme(axis_text_x=p9.element_text(angle=90))
            + p9.labs(title='{} Vs {} Vs {}'.format(x, y, color)))
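A usage sketch for the repaired function; train_data and its column names come from the original hardcoded call, but the toy values are assumptions:

import numpy as np
import pandas as pd

train_data = pd.DataFrame({
    'checkout_price': np.repeat([100, 150, 200], 40),
    'num_orders': np.random.poisson(50, 120),
    'emailer_for_promotion': np.tile(['0', '1'], 60),
})
p = make_violin_plot(train_data, x='checkout_price',
                     y='num_orders', color='emailer_for_promotion')
p.draw()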
Example #8
def test_scale():
    p = (ggplot(df, aes('x')) +
         # Red should envelop blue
         geom_violin(aes(y='y'), scale='width',
                     color='red', fill='red', size=2) +
         geom_violin(aes(y='y'), scale='area',
                     color='blue', fill='blue', size=2) +
         geom_violin(df[:36], aes(y='y+25'), scale='count',
                     color='green', size=2) +
         # Yellow should envelop green
         geom_violin(aes(y='y+25'), scale='count',
                     color='yellow', fill='yellow', size=2) +
         geom_violin(df[:36], aes(y='y+25'), scale='count',
                     color='green', fill='green', size=2))
    assert p == 'scale'
Example #9
def test_style_input_checks():
    with pytest.raises(ValueError):
        geom_violin(aes('x', 'y'), style=True)
    with pytest.raises(ValueError):
        geom_violin(aes('x', 'y'), style=1)
    with pytest.raises(ValueError):
        geom_violin(aes('x', 'y'), style='up')
Example #10
def create_violin_plot(box_df):
    """This function should create a violin plot from the dataframe created in melt_data

    Input
    -----
    box_df: pandas.DataFrame
        The dataframe returned by melt_data

    Returns
    -------
    plot: plotnine.ggplot
        A violin plot visualizing the data in box_df
    """
    plot = ggplot(
        box_df,
        aes(x='treated/control', y='blood_pressure',
            fill='treated/control')) + geom_violin()

    return plot
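A quick sanity check for create_violin_plot; the toy frame stands in for melt_data's output, with column names taken from the aes call and values assumed:

import numpy as np
import pandas as pd

box_df = pd.DataFrame({
    'treated/control': ['treated'] * 50 + ['control'] * 50,
    'blood_pressure': np.concatenate([np.random.normal(120, 10, 50),
                                      np.random.normal(130, 12, 50)]),
})
print(create_violin_plot(box_df))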
Example #11
def steps_violin_plotter(df_ar, testbed, run=0):
    df_estimate = testbed.estimate_distribution(1000)
    df_estimate = df_estimate.astype({"action": "int32"})
    df_ar = df_ar.loc[df_ar["run"] == run]
    df_ar = df_ar.astype({"action": "int32"})
    p = (
        p9.ggplot(
            p9.aes(
                x="reorder(factor(action), action)",
                y="reward",
            )
        )
        + p9.ggtitle(f"Action - Rewards across {df_ar.shape[0]} steps")
        + p9.xlab("k-arm")
        + p9.ylab("Reward")
        + p9.geom_violin(df_estimate, fill="#d0d3d4")
        + p9.geom_jitter(df_ar, p9.aes(color="step"))
        + p9.theme(figure_size=(20, 9))
    )
    fig = p.draw()

    return fig
Example #12
    def plot(self):
        """Plot the figures using R"""
        df = pandas.DataFrame(
            self.data,
            columns=self.datacols,
        )
        with capture_c_msg("datar", prefix=f"[r]{self.title}[/r]: "):
            df.columns = make_unique(df.columns.tolist())

        if self.savedata:
            datafile = self.outprefix + ".csv"
            logger.info(
                "[r]%s[/r]: Saving data to: %r",
                self.title,
                datafile,
                extra={"markup": True},
            )
            df.to_csv(datafile, index=False)

        if df.shape[0] == 0:
            logger.warning("No data points to plot")
            return

        aes_for_geom_fill = None
        aes_for_geom_color = None
        theme_elems = p9.theme(axis_text_x=p9.element_text(angle=60, hjust=2))
        if df.shape[1] > 2:
            aes_for_geom_fill = p9.aes(fill=df.columns[2])
            aes_for_geom_color = p9.aes(color=df.columns[2])
        plt = p9.ggplot(df, p9.aes(y=df.columns[0], x=df.columns[1]))
        if self.figtype == "scatter":
            plt = plt + p9.geom_point(aes_for_geom_color)
            theme_elems = None
        elif self.figtype == "line":
            pass
        elif self.figtype == "bar":
            plt = plt + p9.geom_bar(p9.aes(fill=df.columns[0]))
        elif self.figtype == "col":
            plt = plt + p9.geom_col(aes_for_geom_fill)
        elif self.figtype == "pie":
            logger.warning("Pie chart is not support by plotnine yet, "
                           "plotting bar chart instead.")
            col0 = df.iloc[:, 0]
            if df.shape[1] > 2:
                plt = plt + p9.geom_bar(
                    p9.aes(x=df.columns[2], y=col0.name, fill=df.columns[2]),
                    stat="identity"
                    # aes_for_geom_fill,
                    # x=df.Group,
                    # y=col0,
                    # label=paste0(round_(100 * col0 / sum_(col0), 1), "%"),
                    # show_legend=False,
                    # position=p9.position_adjust_text(),
                )
            else:
                col0 = factor(col0, levels=rev(unique(as_character(col0))))
                fills = rev(levels(col0))
                # materialize: a lazy map() breaks "cumsum(sums) - sums / 2"
                # below (assumes numpy is available as np)
                sums = np.array([sum(col0 == x) for x in fills])
                print(col0)
                print(fills)
                plt = (p9.ggplot(df, p9.aes(x=df.columns[1])) +
                       p9.geom_bar(p9.aes(fill=df.columns[0])) + p9.geom_label(
                           x=1,
                           y=cumsum(sums) - sums / 2,
                           label=paste0(round(sums / sum(sums) * 100, 1), "%"),
                           show_legend=False,
                       ))
                theme_elems = p9.theme(
                    axis_title_x=p9.element_blank(),
                    axis_title_y=p9.element_blank(),
                    axis_text_y=p9.element_blank(),
                )
        elif self.figtype == "violin":
            plt = plt + p9.geom_violin(aes_for_geom_fill)
        elif self.figtype == "boxplot":
            plt = plt + p9.geom_boxplot(aes_for_geom_fill)
        elif self.figtype in ("histogram", "density"):
            plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
            geom = getattr(p9, f"geom_{self.figtype}")
            if df.columns[1] != "ONE":
                plt = plt + geom(p9.aes(fill=df.columns[1]), alpha=0.6)
                theme_elems = None
            else:
                plt = plt + geom(alpha=0.6)
                theme_elems = p9.theme(legend_position="none")
        elif self.figtype == "freqpoly":
            plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
            if df.columns[1] != "ONE":
                plt = plt + p9.geom_freqpoly(p9.aes(fill=df.columns[1]))
            else:
                plt = plt + p9.geom_freqpoly()
            theme_elems = None
        else:
            raise ValueError(f"Unknown figure type: {self.figtype}")

        plt = plt + p9.ggtitle(self.title)
        self.save_plot(plt, theme_elems)
Example #13
)
final_df.head()

# # Distribution plot

g = (
    p9.ggplot(
        final_df.replace(
            {
                "pre_vs_published": "preprint-published",
                "pre_vs_random": "preprint-random",
            }
        )
    )
    + p9.aes(x="label", y="distance")
    + p9.geom_violin(fill="#a6cee3")
    + p9.labs(x="Document Pair Groups", y="Euclidean Distance")
    + p9.theme_seaborn(context="paper", style="ticks", font="Arial", font_scale=1.35)
)
g.save("output/figures/biorxiv_article_distance_abstract_only.svg", dpi=250)
g.save("output/figures/biorxiv_article_distance_abstract_only.png", dpi=250)
print(g)

# # Plot Abstract Only vs Full Text Only

abstract_only = final_df
full_text = pd.read_csv("output/annotated_links/article_distances.tsv", sep="\t")

plot_df = (
    full_text.query("label=='pre_vs_published'")
    .rename(index=str, columns={"distance": "full_text_distance"})[
Example #14
        arti_start = time.time()
        df, separated_peaks = er.proof_artificial(
            model,
            ad_partial,
            region_length=parameters['pad_to'],
            nb_datasets=parameters['artificial_nb_datasets'],
            nb_tfs=parameters['artificial_nb_tfs'],
            n_iter=500,
            squish_factor=parameters['squish_factor'])
        arti_end = time.time()
        print('Artificial data generalisation completed in ' +
              str(arti_end - arti_start) + ' s')

        # The plots
        a = ggplot(df, aes(x="type", y="rebuilt_value", fill="tf_group"))
        a1 = a + geom_violin(position=position_dodge(1), width=1)
        a2 = a + geom_boxplot(position=position_dodge(1), width=0.5)
        b = ggplot(df, aes(
            x="brothers", y="rebuilt_value",
            group="brothers")) + scale_fill_grey() + geom_boxplot(width=0.4)

        a2.save(filename=plot_output_path +
                'artifical_data_systematisation_value_per_type.png',
                height=10,
                width=14,
                units='in',
                dpi=400,
                verbose=False)
        b.save(filename=plot_output_path +
               'artifical_data_systematisation_value_per_brothers.png',
               height=10,
Example #15
        for calculation in calculations:
            scores_df = pd.concat(
                list(
                    map(
                        lambda x: x.drop(['estimator', 'configuration'],
                                         axis=1).groupby([
                                             'dataset', 'feature_selection'
                                         ]).aggregate({
                                             score_type: calculation
                                         }).reset_index(),
                        working_projects.values())))
            scores = scores_df.loc[scores_df['feature_selection'] ==
                                   features_method]

            g = (ggplot(scores, aes(x='dataset', y=score_type)) +
                 geom_violin() + geom_boxplot(width=0.2) +
                 labs(title="{0} Score with features from {1}".format(
                     score_type.capitalize(), features_method.capitalize()),
                      x="Score Measure: {}".format(score_type.capitalize()),
                      y="Feature Selection Method: {}".format(
                          features_method.capitalize())) +
                 theme(plot_title=element_text(size=30,
                                               lineheight=.8,
                                               vjust=1,
                                               family="Fira Code",
                                               face="bold",
                                               margin={'b': 25}),
                       axis_text_x=element_text(size=15, family="Fira Code"),
                       axis_text_y=element_text(size=15, family="Fira Code"),
                       axis_title_x=element_text(size=20, family="Fira Code"),
                       axis_title_y=element_text(size=20, family="Fira Code")))
Example #16
def test_style_alternating():
    p = (ggplot(df, aes('x')) +
         geom_violin(aes(y='y'), style='right-left', fill='green') +
         geom_violin(aes(y='y+25'), style='left-right', fill='yellow'))
    assert p == 'style_alternating'
Example #17
def test_no_trim():
    p = (ggplot(df, aes('x')) + geom_violin(aes(y='y'), trim=False, size=2))
    assert p == 'no_trim'
Example #18
def test_no_trim():
    p = (ggplot(df, aes('x')) +
         geom_violin(aes(y='y'), trim=False, size=2))
    assert p == 'no_trim'
Example #19
def test_scale_area_coordflip():
    p = (ggplot(df, aes('dist', 'value')) + geom_violin(scale='area') +
         geom_sina(scale='area', random_state=123) + coord_flip())

    assert p == 'scale_area+coord_flip'
Example #20
SDRsub.dropna(inplace=True)

# Add level column
SDRsuper.insert(0, 'Level', 'super')
SDRsub.insert(0, 'Level', 'sub')

SDRall = pd.concat([SDRsub, SDRsuper])

#%% SDRsuper and SDRsub violin plot + boxplot + lines

# =============================================================================
# Simple violin plot:
# =============================================================================

(ggplot(SDRall) + aes(y='value', x='Level', fill='Level') +
 geom_violin(scale="width"))

# =============================================================================
# Next level violin plots
# =============================================================================

shift = 0.1


def alt_sign(x):
    "Alternate +1/-1 if x is even/odd"
    return (-1)**x


m1 = aes(x=stage('Level', after_scale='x+shift*alt_sign(x)'))  # shift outward
m2 = aes(x=stage('Level', after_scale='x-shift*alt_sign(x)'),
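The snippet above is cut off mid-definition of m2. For reference, a hedged completion that follows the plotnine gallery pattern for shifted half-violins flanking boxplots; everything past the stage() call is an assumption, not the original code:

# hypothetical completion, assuming the same star-import style as the script
m2 = aes(x=stage('Level', after_scale='x-shift*alt_sign(x)'),
         group='Level')  # shift inward

(ggplot(SDRall, aes(y='value', x='Level', fill='Level'))
 + geom_violin(m1, style='left-right', alpha=0.7, size=0.65, show_legend=False)
 + geom_boxplot(width=shift, alpha=0.7, size=0.65, show_legend=False)
 + geom_point(m2, color='none', alpha=0.6, size=1.5, show_legend=False))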
Example #21
def main():

    args = UserInput()

    if args.y_lim:
        y_lim = np.array(args.y_lim, dtype=np.float32)
    else:
        y_lim = None
    if args.size:
        size = np.array(args.size, dtype=np.float32)
    else:
        size = args.size

###################################

    df_list = [
        pd.read_csv(f, sep=args.sep, skipinitialspace=True)
        for f in args.infile
    ]

    ## only take input with 1 or 2 columns; for 2 columns, 1st is always removed
    lg_list = []
    for idx, df in enumerate(df_list):
        xdf = pd.DataFrame(df.iloc[:, int(args.col) - 1])

        if args.col_names:
            xdf.columns = [args.col_names[idx]]

        lg_list.append(pd.melt(xdf))

    lg_df = pd.concat(lg_list)
    lg_df.columns = [args.x_name, args.y_name]
    print(lg_df)

    ## plotnine method
    if args.use_p9:
        import plotnine as p9
        Quant = [.25, .5, .75]

        if y_lim is not None:
            set_ylim = p9.ylim(y_lim)
        else:
            set_ylim = p9.ylim(
                [lg_df[args.y_name].min(), lg_df[args.y_name].max()])

        df_plot = (p9.ggplot(
            lg_df, p9.aes(x=args.x_name, y=args.y_name, fill=args.x_name)) +
                   p9.geom_violin(
                       width=.75, draw_quantiles=Quant, show_legend=False) +
                   p9.ggtitle(args.title) + p9.theme_classic() + set_ylim +
                   p9.scale_x_discrete(limits=args.col_names) +
                   p9.theme(text=p9.element_text(size=12, color='black'),
                            axis_text_x=p9.element_text(angle=33),
                            panel_grid_major_y=p9.element_line(color='gray',
                                                               alpha=.5)))

        p9.ggsave(filename='{0}.violin.{1}'.format(args.outpref, args.img),
                  plot=df_plot,
                  dpi=int(args.dpi),
                  format=args.img,
                  width=size[0],
                  height=size[1],
                  units='in',
                  verbose=False)

    else:
        ## Seaborn method
        import seaborn as sns
        sns.set(style='whitegrid')

        ax = sns.violinplot(x=args.x_name,
                            y=args.y_name,
                            data=lg_df,
                            linewidth=1,
                            inner='box')
        if args.title:
            ax.set_title(args.title)
        if y_lim is not None:
            ax.set(ylim=y_lim)

        plt.savefig('{0}.violin.{1}'.format(args.outpref, args.img),
                    figsize=tuple(size),
                    format=args.img,
                    dpi=int(args.dpi))
        plt.clf()
Example #22
final_cosine_df.head()

final_cosine_df = biorxiv_journal_df[["document",
                                      "preprint_doi"]].merge(final_cosine_df)
final_cosine_df.to_csv("output/annotated_links/article_distances_cosine.tsv",
                       sep="\t",
                       index=False)
final_cosine_df.head()

# # Distribution plot

g = (p9.ggplot(
    final_original_df.replace({
        "pre_vs_published": "preprint-published",
        "pre_vs_random": "preprint-random",
    })) + p9.aes(x="label", y="distance") + p9.geom_violin(fill="#a6cee3") +
     p9.labs(x="Document Pair Groups", y="Euclidean Distance") +
     p9.theme_seaborn(
         context="paper", style="ticks", font="Arial", font_scale=2) +
     p9.theme(figure_size=(11, 8.5)))
print(g)

g = (p9.ggplot(
    final_cosine_proxy_df.replace({
        "pre_vs_published": "preprint-published",
        "pre_vs_random": "preprint-random",
    })) + p9.aes(x="label", y="distance") + p9.geom_violin(fill="#a6cee3") +
     p9.labs(x="Document Pair Groups", y="Euclidean (L2 Norm) Distance") +
     p9.theme_seaborn(
         context="paper", style="ticks", font="Arial", font_scale=2) +
     p9.theme(figure_size=(11, 8.5)))
Example #23
def plot_violinbox_plots_per_category(
        dataframe: pandas.DataFrame,
        plot_type: str,
        target_feature: str,
        label_column: str,
        colors: List[str],
        coloring_style: str,
        value_skip_list: List = [],
        jitter_alpha: float = 0.7,
        plot_alpha: float = 0.5,
        log_10_scale: bool = False,
        theme: str = 'gray',
        save_to_file: str = None,
        dpi: int = 150,
        show: bool = True
) -> p9.ggplot:
    """
        The :func:`plot_violinbox_plots_per_category` helps with providing the user with nicely plotted violin and
        box plots of the distribution of data points.

        Parameters
        ----------
        dataframe: `pandas.DataFrame`, required
            This is the main parameter that this method is supposed to work with, which is a dataframe that has
            a label column in which we have integer values starting from 0, and a float feature column the distribution
            of which we tend to monitor.
        plot_type: `str`, required
            This value, either `box` or `violin`, determines the type of plot.
        target_feature: `str`, required
            This parameter is the column name of the features that we want to monitor.
        label_column: `str`, required
            The input dataframe must have a label_column (preferably integer starting from 0), the name of that
            column should be input here.
        colors: `List[str]`, required
            Depending on whether `coloring_style` is manual or gradient, this is either a list of per-category
            colors or a list of two colors indicating a range of color values.
        coloring_style: `str`, optional (default='manual')
            Either `manual` or `gradient`, which determines how colors are assigned to clusters.
        value_skip_list: `List`, optional (default=[])
            If some values in the feature column are to be skipped, they should be put in here so that they
            are ignored in the plots. For example, if for some reason some values are -10000000, they can be taken care
            of in here.
        jitter_alpha: `float`, optional (default=0.7)
            The jitter value transparency is set in this parameter.
        plot_alpha: `float`, optional (default=0.5)
            The transparency intensity can be determined by setting this parameter.
        log_10_scale: `bool`, optional (default=False)
            If set to True, the value axis is log10-scaled.
        theme: `str`, optional (default='gray')
            This is the `theme` type; the accepted values are ``['gray', 'dark', 'seaborn', 'light']``, which
            are consistent with the `plotnine` package's themes.
        save_to_file: `str`, optional (default=None)
            If the user intends to save the plot in a file, this parameter should have a value. The value must be a filepath.
        dpi: `int`, optional (default=150)
            The dpi for saving the plots indicating the image quality.
        show: `bool`, optional (default=True)
            Whether or not the plot is to be shown is set in this parameter.
        Returns
        ----------
        The output of this method is of `p9.ggplot` type.
        """
    # define df up front so both coloring branches below can use it
    df = dataframe
    if len(value_skip_list) > 0:
        df = df[~df[target_feature].isin(value_skip_list)]

    if coloring_style == 'gradient':
        assert len(colors) == 2, "you have chosen gradient style coloring; provide a list whose \
            first element is the color for low and whose second is the color for high."
        pplot = p9.ggplot(data=df, mapping=p9.aes(x='factor(' + label_column + ')', y=target_feature, color=label_column))
        pplot += p9.scale_color_gradient(low=colors[0], high=colors[1])
    elif coloring_style == 'manual':
        assert len(colors) == len(df[label_column].unique()), "You have chosen per-category manual coloring, therefore you have to provide one color per category"
        pplot = p9.ggplot(data=df, mapping=p9.aes(x='factor(' + label_column + ')', y=target_feature, color='factor(' + label_column + ')'))
        pplot += p9.scale_color_manual(values=colors)

    pplot += p9.geom_jitter(alpha=jitter_alpha)

    if plot_type == 'box':
        pplot += p9.geom_boxplot(alpha=plot_alpha)
    elif plot_type == 'violin':
        pplot += p9.geom_violin(alpha=plot_alpha)
    else:
        raise Exception('unknown plot type, it must be violin or box.')

    if theme == 'gray':
        pplot += p9.theme_gray()
    elif theme == 'dark':
        pplot += p9.theme_dark()
    elif theme == 'seaborn':
        pplot += p9.theme_seaborn()
    elif theme == 'light':
        pplot += p9.theme_light()
    else:
        raise Exception('Theme type not supported, please add.')

    if log_10_scale:
        # the monitored feature is on the y axis
        pplot += p9.scale_y_log10()

    if save_to_file is not None:
        save_directory, filename = separate_path_and_file(filepath=save_to_file)
        pplot.save(filename=filename, path=save_directory, dpi=dpi)

    if show:
        pplot.draw()

    return pplot
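A minimal invocation sketch for the function above; the toy dataframe and the color list are assumptions:

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'cluster': np.repeat([0, 1, 2], 100),
    'score': np.concatenate([np.random.normal(mu, 1.0, 100)
                             for mu in (0, 2, 4)]),
})
plot_violinbox_plots_per_category(
    dataframe=toy, plot_type='violin', target_feature='score',
    label_column='cluster', colors=['#1b9e77', '#d95f02', '#7570b3'],
    coloring_style='manual', show=False)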
Example #24
        # Different thresholds for the two cell lines

        if CELL_LINE == 'jurkat':
            if val > 0: toadd = 'a__0-0.5'
            if val > 0.5: toadd = 'b__0.5-1'
            if val > 1: toadd = 'c__1-2'
            if val > 2: toadd = 'd__2-3'
            if val > 3: toadd = 'e__3+'

        if CELL_LINE == "mcf7":
            if val > 0: toadd = 'a__0-3'
            if val > 3: toadd = 'b__3-5'
            if val > 5: toadd = 'c__5-10'
            if val > 10: toadd = 'd__10+'

        update_ratio_binarized += [toadd]

    sub['update_ratio_bin'] = update_ratio_binarized
    sub['sqrt_peak_score'] = np.sqrt(sub['peak_score'])

    # Now do violin plot
    p4 = (ggplot(data=sub[0:10000],
                 mapping=aes(x='update_ratio_bin', y='peak_score')) +
          geom_violin(position=position_dodge(1), width=1) + scale_y_log10() +
          geom_boxplot(position=position_dodge(1), width=0.25))

    p4.save(
        FIGURE_DIRECTORY +
        "peak_confirmation_nb_update_ratio_well_characterized_crm_violin_plot.pdf",
        verbose=False)
Example #25
def test_method_counts():
    p = (ggplot(df, aes('dist', 'value')) + geom_violin() +
         geom_sina(method='counts', random_state=123))

    assert p == 'method_counts'
Example #26
def control_list(in_file=None,
                 out_dir=None,
                 reference_gene_file=None,
                 log2=False,
                 page_width=None,
                 page_height=None,
                 user_img_file=None,
                 page_format=None,
                 pseudo_count=1,
                 set_colors=None,
                 dpi=300,
                 rug=False,
                 jitter=False,
                 skip_first=False):
    # -------------------------------------------------------------------------
    #
    # Check in_file content
    #
    # -------------------------------------------------------------------------

    for p, line in enumerate(in_file):

        line = chomp(line)
        line = line.split("\t")

        if len(line) > 2:
            message("Need a two columns file.",
                    type="ERROR")
        if skip_first:
            if p == 0:
                continue
        try:
            fl = float(line[1])
        except ValueError:
            msg = "It seems that column 2 of input file"
            msg += " contains non numeric values. "
            msg += "Check that no header is present and that "
            msg += "columns are ordered properly. "
            msg += "Or use '--skip-first'. "
            message(msg, type="ERROR")

        if log2:
            fl = fl + pseudo_count
            if fl <= 0:
                message("Can not log transform negative/zero values. Add a pseudo-count.",
                        type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Check colors
    #
    # -------------------------------------------------------------------------

    set_colors = set_colors.split(",")

    if len(set_colors) != 2:
        message("Need two colors. Please fix.", type="ERROR")

    mcolors_name = mcolors.cnames

    for i in set_colors:
        if i not in mcolors_name:
            if not is_hex_color(i):
                message(i + " is not a valid color. Please fix.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Preparing output files
    #
    # -------------------------------------------------------------------------

    # Preparing pdf file name
    file_out_list = make_outdir_and_file(out_dir, ["control_list.txt",
                                                   "reference_list.txt",
                                                   "diagnostic_diagrams." + page_format],
                                         force=True)

    control_file, reference_file_out, img_file = file_out_list

    if user_img_file is not None:

        os.unlink(img_file.name)
        img_file = user_img_file

        if not img_file.name.endswith(page_format):
            msg = "Image format should be: {f}. Please fix.".format(f=page_format)
            message(msg, type="ERROR")

        test_path = os.path.abspath(img_file.name)
        test_path = os.path.dirname(test_path)

        if not os.path.exists(test_path):
            os.makedirs(test_path)

    # -------------------------------------------------------------------------
    #
    # Read the reference list
    #
    # -------------------------------------------------------------------------

    try:
        reference_genes = pd.read_csv(reference_gene_file.name, sep="\t", header=None)
    except pd.errors.EmptyDataError:
        message("No genes in --reference-gene-file.", type="ERROR")

    reference_genes.rename(columns={reference_genes.columns.values[0]: 'gene'}, inplace=True)

    # -------------------------------------------------------------------------
    #
    # Delete duplicates
    #
    # -------------------------------------------------------------------------

    before = len(reference_genes)
    reference_genes = reference_genes.drop_duplicates(['gene'])
    after = len(reference_genes)

    msg = "%d duplicate lines have been deleted in reference file."
    message(msg % (before - after))

    # -------------------------------------------------------------------------
    #
    # Read expression data and add the pseudo_count
    #
    # -------------------------------------------------------------------------

    if skip_first:
        exp_data = pd.read_csv(in_file.name, sep="\t",
                               header=None, index_col=None,
                               skiprows=[0], names=['exprs'])
    else:

        exp_data = pd.read_csv(in_file.name, sep="\t", names=['exprs'], index_col=0)

    exp_data.exprs = exp_data.exprs.values + pseudo_count

    # -------------------------------------------------------------------------
    #
    # log transformation
    #
    # -------------------------------------------------------------------------

    ylabel = 'Expression'

    if log2:
        if len(exp_data.exprs.values[exp_data.exprs.values == 0]):
            message("Can't use log transformation on zero or negative values. Use -p.",
                    type="ERROR")
        else:
            exp_data.exprs = np.log2(exp_data.exprs.values)
            ylabel = 'log2(Expression)'

    # -------------------------------------------------------------------------
    #
    # Are reference gene found in control list
    #
    # -------------------------------------------------------------------------

    # Sort in increasing order
    exp_data = exp_data.sort_values('exprs')

    # Genes from the reference list that are present in the expression data index

    reference_genes_found = [x for x in reference_genes['gene'] if x in exp_data.index]

    msg = "Found %d genes of the reference in the provided signal file" % len(reference_genes_found)
    message(msg)

    not_found = [x for x in reference_genes['gene'] if x not in exp_data.index]

    if len(not_found):
        if len(not_found) == len(reference_genes):
            message("Genes from reference file where not found in signal file (n=%d)." % len(not_found), type="ERROR")
        else:
            message("List of reference genes not found :%s" % not_found)
    else:
        message("All reference genes were found.")

    # -------------------------------------------------------------------------
    #
    # Search for genes with matched signal
    #
    # -------------------------------------------------------------------------

    exp_data_save = exp_data.copy()

    control_list = list()

    nb_candidate_left = exp_data.shape[0] - len(reference_genes_found)

    message("Searching for genes with matched signal.")

    if nb_candidate_left < len(reference_genes_found):
        message("Not enough element to perform selection. Exiting", type="ERROR")

    for i in reference_genes_found:
        not_candidates = reference_genes_found + control_list
        not_candidates = list(set(not_candidates))

        diff = abs(exp_data.loc[i] - exp_data)
        control_list.extend(diff.loc[np.setdiff1d(diff.index, not_candidates)].idxmin(axis=0, skipna=True).tolist())

    # -------------------------------------------------------------------------
    #
    # Prepare a dataframe for plotting
    #
    # -------------------------------------------------------------------------

    message("Preparing a dataframe for plotting.")

    reference = exp_data_save.loc[reference_genes_found].sort_values('exprs')
    reference = reference.assign(genesets=['Reference'] * reference.shape[0])

    control = exp_data_save.loc[control_list].sort_values('exprs')
    control = control.assign(genesets=['Control'] * control.shape[0])

    data = pd.concat([reference, control])
    data['sets'] = pd.Series(['sets' for x in data.index.tolist()], index=data.index)
    data['genesets'] = Categorical(data['genesets'])

    # -------------------------------------------------------------------------
    #
    # Diagnostic plots
    #
    # -------------------------------------------------------------------------

    p = ggplot(data, aes(x='sets', y='exprs', fill='genesets'))

    p += scale_fill_manual(values=dict(zip(['Reference', 'Control'], set_colors)))

    p += geom_violin(color=None)

    p += xlab('Gene sets') + ylab(ylabel)

    p += facet_wrap('~genesets')

    if rug:
        p += geom_rug()

    if jitter:
        p += geom_jitter()

    p += theme_bw()
    p += theme(axis_text_x=element_blank())

    # -------------------------------------------------------------------------
    # Turn warnings off. Both pandas and plotnine use warnings for deprecated
    # functions. I need to turn them off although I'm not really satisfied
    # with this solution...
    # -------------------------------------------------------------------------

    def fxn():
        warnings.warn("deprecated", DeprecationWarning)

    # -------------------------------------------------------------------------
    #
    # Saving
    #
    # -------------------------------------------------------------------------

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fxn()
        message("Saving diagram to file : " + img_file.name)
        message("Be patient. This may be long for large datasets.")

        try:
            p.save(filename=img_file.name, width=page_width, height=page_height, dpi=dpi, limitsize=False)
        except PlotnineError as err:
            message("Plotnine message: " + err.message)
            message("Plotnine encountered an error.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # write results
    #
    # -------------------------------------------------------------------------

    exp_data_save.loc[reference_genes_found].sort_values('exprs').to_csv(reference_file_out.name, sep="\t")
    exp_data_save.loc[control_list].sort_values('exprs').to_csv(control_file.name, sep="\t")
Example #27
def MDplot(Data,
           Names=None,
           Ordering='Default',
           Scaling=None,
           Fill='darkblue',
           RobustGaussian=True,
           GaussianColor='magenta',
           Gaussian_lwd=1.5,
           BoxPlot=False,
           BoxColor='darkred',
           MDscaling='width',
           LineColor='black',
           LineSize=0.01,
           QuantityThreshold=40,
           UniqueValuesThreshold=12,
           SampleSize=500000,
           SizeOfJitteredPoints=1,
           OnlyPlotOutput=True,
           ValueColumn=None,
           ClassColumn=None):
    """
    Plots a mirrored density plot for each numeric column
    
    Args:
        Data (dataframe): dataframe containing data. Each column is one 
                          variable (wide table format, for long table format 
                          see ValueColumn and ClassColumn)
        Names (list): list of column names (will be used if data is not a 
                      dataframe)
        Ordering (str): 'Default', 'Columnwise', 'Alphabetical' or 'Statistics'
        Scaling (str): scaling method, one of: Percentalize, CompleteRobust, 
                                               Robust, Log
        Fill (str): color of MD-Plot
        RobustGaussian (bool): draw a gaussian distribution if column is 
                               gaussian
        GaussianColor (str): color for gaussian distribution
        Gaussian_lwd (float): line width of gaussian distribution
        BoxPlot (bool): draw box-plot
        BoxColor (str): color for box-plots
        MDscaling (str): scale of ggplot violin
        LineColor (str): color of the violin outline
        LineSize (float): line width of ggplot violin
        QuantityThreshold (int): minimal number of rows
        UniqueValuesThreshold (int): minimal number of unique values per 
                                         column
        SampleSize (int): number of samples used if number of rows is larger
                          than SampleSize
        SizeOfJitteredPoints (float): point size used for the jitter fallback
        OnlyPlotOutput (bool): if True, return only the ggplot object;
                               if False, return a dictionary containing the
                               ggplot object and additional infos
        ValueColumn (str): name of the column of values to be plotted
                           (data in long table format)
        ClassColumn (str): name of the column with class identifiers for the 
                           value column (data in long table format)
        
    Returns:
        ggplot object or dictionary containing ggplot object and additional 
        infos
    """

    if not isinstance(Data, pd.DataFrame):
        try:
            if Names is not None:
                Data = pd.DataFrame(Data, columns=Names)
            else:
                Data = pd.DataFrame(Data)
                lstCols = list(Data.columns)
                dctCols = {}
                for strCol in lstCols:
                    dctCols[strCol] = "C_" + str(strCol)
                Data = Data.rename(columns=dctCols)
        except:
            raise Exception("Data cannot be converted into pandas dataframe")
    else:
        Data = Data.reset_index(drop=True)

    if ValueColumn is not None and ClassColumn is not None:
        lstCols = list(Data.columns)
        if ValueColumn not in lstCols:
            raise Exception("ValueColumn not contained in dataframe")
        if ClassColumn not in lstCols:
            raise Exception("ClassColumn not contained in dataframe")

        lstClasses = list(Data[ClassColumn].unique())
        DataWide = pd.DataFrame()
        for strClass in lstClasses:
            if len(DataWide) == 0:
                DataWide = Data[Data[ClassColumn] == strClass].copy()\
                .reset_index(drop=True)
                DataWide = DataWide.rename(columns={ValueColumn: strClass})
                DataWide = DataWide[[strClass]]
            else:
                dfTemp = Data[Data[ClassColumn] == strClass].copy()\
                .reset_index(drop=True)
                dfTemp = dfTemp.rename(columns={ValueColumn: strClass})
                dfTemp = dfTemp[[strClass]]
                DataWide = DataWide.join(dfTemp, how='outer')
        Data = DataWide.copy()

    lstCols = list(Data.columns)
    for strCol in lstCols:
        if not is_numeric_dtype(Data[strCol]):
            print("Deleting non numeric column: " + strCol)
            Data = Data.drop([strCol], axis=1)
        else:
            if abs(Data[strCol].sum()) == np.inf:
                print("Deleting infinite column: " + strCol)
                Data = Data.drop([strCol], axis=1)

    Data = Data.rename_axis("index", axis="index")\
    .rename_axis("variable", axis="columns")
    dvariables = Data.shape[1]
    nCases = Data.shape[0]

    if nCases > SampleSize:
        print('Data has more cases than "SampleSize". Drawing a sample for '
              'faster computation. You can omit this by setting '
              '"SampleSize=len(data)".')
        sampledIndex = np.sort(
            np.random.choice(list(Data.index), size=SampleSize, replace=False))
        Data = Data.loc[sampledIndex]

    nPerVar = Data.apply(lambda x: len(x.dropna()))
    nUniquePerVar = Data.apply(lambda x: len(list(x.dropna().unique())))

    # renaming numeric column names to non-numeric ones
    lstCols = list(Data.columns)
    dctCols = {}
    for strCol in lstCols:
        try:
            float(strCol)
            dctCols[strCol] = "C_" + str(strCol)
        except ValueError:
            dctCols[strCol] = str(strCol)
    Data = Data.rename(columns=dctCols)

    if Scaling == "Percentalize":
        Data = Data.apply(lambda x: 100 * (x - x.min()) / (x.max() - x.min()))
    if Scaling == "CompleteRobust":
        Data = robust_normalization(Data, centered=True, capped=True)
    if Scaling == "Robust":
        Data = robust_normalization(Data, centered=False, capped=False)
    if Scaling == "Log":
        Data = signed_log(Data, base="Ten")
        if RobustGaussian == True:
            RobustGaussian = False
            print("log with robust gaussian does not work, because mean and "
                  "variance is not valid description for log normal data")

#_______________________________________________Robust Gaussian and Statistics
    if RobustGaussian == True or Ordering == "Statistics":
        Data = Data.applymap(lambda x: np.nan if abs(x) == np.inf else x)

        if nCases < 50:
            warnings.warn("Sample is maybe too small for statistical testing")

        factor = pd.Series([0.25, 0.75]).apply(lambda x: abs(norm.ppf(x)))\
        .sum()
        std = Data.std()

        dfQuartile = Data.apply(
            lambda x: mquantiles(x, [0.25, 0.75], alphap=0.5, betap=0.5))
        dfQuartile = dfQuartile.append(dfQuartile.loc[1] - dfQuartile.loc[0],
                                       ignore_index=True)
        dfQuartile.index = ["low", "hi", "iqr"]
        dfMinMax = Data.apply(
            lambda x: mquantiles(x, [0.001, 0.999], alphap=0.5, betap=0.5))
        dfMinMax.index = ["min", "max"]

        shat = pd.Series()
        mhat = pd.Series()
        nonunimodal = pd.Series()
        skewed = pd.Series()
        bimodalprob = pd.Series()
        isuniformdist = pd.Series()
        nSample = max([10000, nCases])
        normaldist = np.empty((nSample, dvariables))
        normaldist[:] = np.nan
        normaldist = pd.DataFrame(normaldist, columns=lstCols)

        for strCol in lstCols:
            shat[strCol] = min(
                [std[strCol], dfQuartile[strCol].loc["iqr"] / factor])
            mhat[strCol] = trim_mean(Data[strCol].dropna(), 0.1)

            if nCases > 45000 and nPerVar[strCol] > 8:
                # statistical testing does not work with too many cases
                sampledIndex = np.sort(
                    np.random.choice(list(Data.index),
                                     size=45000,
                                     replace=False))
                vec = Data[strCol].loc[sampledIndex]
                if nUniquePerVar[strCol] > UniqueValuesThreshold:
                    nonunimodal[strCol] = dip.diptst(vec.dropna(), numt=100)[1]
                    skewed[strCol] = skewtest(vec)[1]
                    args = (dfMinMax[strCol].loc["min"],
                            dfMinMax[strCol].loc["max"] \
                            - dfMinMax[strCol].loc["min"])
                    isuniformdist[strCol] = kstest(vec, "uniform", args)[1]
                    bimodalprob[strCol] = bimodal(vec)["Bimodal"]
                else:
                    print("Not enough unique values for statistical testing, "
                          "thus output of testing is ignored.")
                    nonunimodal[strCol] = 1
                    skewed[strCol] = 1
                    isuniformdist[strCol] = 0
                    bimodalprob[strCol] = 0
            elif nPerVar[strCol] < 8:
                warnings.warn("Sample of finite values to small to calculate "
                              "agostino.test or dip.test for " + strCol)
                nonunimodal[strCol] = 1
                skewed[strCol] = 1
                isuniformdist[strCol] = 0
                bimodalprob[strCol] = 0
            else:
                if nUniquePerVar[strCol] > UniqueValuesThreshold:
                    nonunimodal[strCol] = dip.diptst(Data[strCol].dropna(),
                                                     numt=100)[1]
                    skewed[strCol] = skewtest(Data[strCol])[1]
                    args = (dfMinMax[strCol].loc["min"],
                            dfMinMax[strCol].loc["max"] \
                            - dfMinMax[strCol].loc["min"])
                    isuniformdist[strCol] = kstest(Data[strCol], "uniform",
                                                   args)[1]
                    bimodalprob[strCol] = bimodal(Data[strCol])["Bimodal"]
                else:
                    print("Not enough unique values for statistical testing, "
                          "thus output of testing is ignored.")
                    nonunimodal[strCol] = 1
                    skewed[strCol] = 1
                    isuniformdist[strCol] = 0
                    bimodalprob[strCol] = 0

            if isuniformdist[strCol] < 0.05 and nonunimodal[strCol] > 0.05 \
            and skewed[strCol] > 0.05 and bimodalprob[strCol] < 0.05 \
            and nPerVar[strCol] > QuantityThreshold \
            and nUniquePerVar[strCol] > UniqueValuesThreshold:
                normaldist[strCol] = np.random.normal(mhat[strCol],
                                                      shat[strCol], nSample)
                normaldist[strCol] = normaldist[strCol]\
                .apply(lambda x: np.nan if x < Data[strCol].min() \
                                 or x > Data[strCol].max() else x)
        nonunimodal[nonunimodal == 0] = 0.0000000001
        skewed[skewed == 0] = 0.0000000001
        effectStrength = (-10 * np.log(skewed) - 10 * np.log(nonunimodal)) / 2

#______________________________________________________________________Ordering
    if Ordering == "Default":
        bimodalprob = pd.Series()
        for strCol in lstCols:
            if nCases > 45000 and nPerVar[strCol] > 8:
                sampledIndex = np.sort(
                    np.random.choice(list(Data.index),
                                     size=45000,
                                     replace=False))
                vec = Data[strCol].loc[sampledIndex]
                bimodalprob[strCol] = bimodal(vec)["Bimodal"]
            elif nPerVar[strCol] < 8:
                bimodalprob[strCol] = 0
            else:
                bimodalprob[strCol] = bimodal(Data[strCol])["Bimodal"]
        if len(list(bimodalprob.unique())) < 2 and dvariables > 1 \
        and RobustGaussian == True:
            rangfolge = list(effectStrength.sort_values(ascending=False).index)
            print("Using statistics for ordering instead of default")
        else:
            rangfolge = list(bimodalprob.sort_values(ascending=False).index)

    if Ordering == "Columnwise":
        rangfolge = lstCols

    if Ordering == "Alphabetical":
        rangfolge = lstCols.copy()
        rangfolge.sort()

    if Ordering == "Statistics":
        rangfolge = list(effectStrength.sort_values(ascending=False).index)

#________________________________________________________________Data Reshaping
    if nPerVar.min() < QuantityThreshold \
    or nUniquePerVar.min() < UniqueValuesThreshold:
        warnings.warn("Some columns have less than " + str(QuantityThreshold) +
                      " data points or less than " +
                      str(UniqueValuesThreshold) +
                      " unique values. Changing from MD-plot to Jitter-Plot "
                      "for these columns.")
        dataDensity = Data.copy()
        mm = Data.median()
        for strCol in lstCols:
            if nPerVar[strCol] < QuantityThreshold \
            or nUniquePerVar[strCol] < UniqueValuesThreshold:
                if mm[strCol] != 0:
                    dataDensity[strCol] = mm[strCol] \
                    * np.random.uniform(-0.001, 0.001, nCases) + mm[strCol]
                else:
                    dataDensity[strCol] = np.random.uniform(
                        -0.001, 0.001, nCases)
        # Generate a scatter plot for the cases where the pdf cannot be estimated
        dataJitter = dataDensity.copy()
        # Delete all scatters for features where distributions can be estimated
        for strCol in lstCols:
            if nPerVar[strCol] >= QuantityThreshold \
            and nUniquePerVar[strCol] >= UniqueValuesThreshold:
                dataJitter[strCol] = np.nan
        # apply ordering
        dataframe = dataDensity[rangfolge].reset_index()\
        .melt(id_vars=["index"])
    else:
        dataframe = Data[rangfolge].reset_index().melt(id_vars=["index"])

    dctCols = {"index": "ID", "variable": "Variables", "value": "Values"}
    dataframe = dataframe.rename(columns=dctCols)

    #______________________________________________________________________Plotting
    plot = p9.ggplot(dataframe, p9.aes(x="Variables", group="Variables",
                                        y="Values")) \
                     + p9.scale_x_discrete(limits=rangfolge)

    plot = plot + p9.geom_violin(stat = stat_pde_density(scale=MDscaling),
                                 fill=Fill, colour=LineColor,
                                 size=LineSize, trim=True) \
                           + p9.theme(axis_text_x=p9.element_text(rotation=90))

    if nPerVar.min() < QuantityThreshold \
    or nUniquePerVar.min() < UniqueValuesThreshold:
        dataframejitter = dataJitter[rangfolge].reset_index()\
        .melt(id_vars=["index"])
        dataframejitter = dataframejitter.rename(columns=dctCols)
        plot = plot + p9.geom_jitter(
            size=SizeOfJitteredPoints,
            data=dataframejitter,
            colour=LineColor,
            mapping=p9.aes(x="Variables", group="Variables", y="Values"),
            position=p9.position_jitter(0.15))

    if RobustGaussian == True:
        dfTemp = normaldist[rangfolge].reset_index().melt(id_vars=["index"])
        dfTemp = dfTemp.rename(columns=dctCols)
        if dfTemp["Values"].isnull().all() == False:
            plot = plot + p9.geom_violin(
                data=dfTemp,
                mapping=p9.aes(x="Variables", group="Variables", y="Values"),
                colour=GaussianColor,
                alpha=0,
                scale=MDscaling,
                size=Gaussian_lwd,
                na_rm=True,
                trim=True,
                fill=None,
                position="identity",
                width=1)

    if BoxPlot == True:
        plot = plot + p9.stat_boxplot(geom = "errorbar", width = 0.5,
                                      color=BoxColor) \
                    + p9.geom_boxplot(width=1, outlier_colour = None, alpha=0,
                                      fill='#ffffff', color=BoxColor,
                                      position="identity")

    if OnlyPlotOutput == True:
        return plot
    else:
        print(plot)
        return {
            "Ordering": rangfolge,
            "DataOrdered": Data[rangfolge],
            "ggplotObj": plot
        }
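A smoke-test sketch for MDplot, assuming its dependencies (pandas, numpy, plotnine and the module's own helpers such as stat_pde_density) are importable; the toy columns are assumptions:

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'gaussian': np.random.normal(0, 1, 1000),
    'bimodal': np.concatenate([np.random.normal(-2, 0.5, 500),
                               np.random.normal(2, 0.5, 500)]),
})
plot = MDplot(toy, BoxPlot=True)
print(plot)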
Example #28
def test_quantiles_input_checks():
    with pytest.raises(ValueError):
        geom_violin(aes('x', 'y'), draw_quantiles=True)
    with pytest.raises(ValueError):
        geom_violin(aes('x', 'y'), draw_quantiles=["A", 0.25])
    with pytest.raises(ValueError):
        geom_violin(aes('x', 'y'), draw_quantiles=[0.25, 1.25])
    with pytest.raises(ValueError):
        geom_violin(aes('x', 'y'), draw_quantiles=[0.])
    with pytest.raises(ValueError):
        geom_violin(aes('x', 'y'), draw_quantiles=[1.])
    g = geom_violin(aes('x', 'y'), draw_quantiles=np.array([0.25, 0.25]))
    assert isinstance(g.params['draw_quantiles'], pd.Series)
    g = geom_violin(aes('x', 'y'), draw_quantiles=0.5)
    assert isinstance(g.params['draw_quantiles'], pd.Series)
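The checks above imply that draw_quantiles values must be numbers strictly between 0 and 1 (scalars and arrays are coerced to a pandas Series). A valid call for reference; the quantile values are arbitrary:

g = geom_violin(aes('x', 'y'), draw_quantiles=[0.25, 0.5, 0.75])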
Example #29
def show_fraction(adata, mode='labelling', group=None):
    """Plot the fraction of each category of data used in the velocity estimation.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        an AnnData object
    mode: `string` (default: `labelling`)
        Which mode of data to show; one of `labelling`, `splicing` or `full`.
    group: `string` (default: None)
        The `adata.obs` column used to facet the data into subplots. If None, no faceting is used.

    Returns
    -------
        A ggplot-like plot that shows the fraction of each category, produced with plotnine (an equivalent of R's ggplot2 in Python).
    """

    import plotnine as p9

    if mode not in ['labelling', 'splicing', 'full']:
        raise Exception(
            "mode can only be one of 'labelling', 'splicing' or 'full'")

    if mode == 'labelling' and all(
            [i in adata.layers.keys() for i in ['new', 'old']]):
        new_mat, old_mat = adata.layers['new'], adata.layers['old']
        # handle sparse and dense layers separately; the original one-liner
        # mixed a conditional expression with tuple unpacking and was broken
        if issparse(new_mat):
            new_cell_sum, old_cell_sum = new_mat.sum(1).A1, old_mat.sum(1).A1
        else:
            new_cell_sum, old_cell_sum = np.sum(new_mat, 1), np.sum(old_mat, 1)

        tot_cell_sum = new_cell_sum + old_cell_sum
        new_frac_cell, old_frac_cell = new_cell_sum / tot_cell_sum, old_cell_sum / tot_cell_sum
        df = pd.DataFrame({
            'new_frac_cell': new_frac_cell,
            'old_frac_cell': old_frac_cell
        })

        if group is not None and group in adata.obs.keys():
            df['group'] = adata.obs[group]

        df = df.melt(id_vars='group' if 'group' in df.columns else None,
                     value_vars=['new_frac_cell', 'old_frac_cell'])

    elif mode == 'splicing' and all([
            i in adata.layers.keys()
            for i in ['spliced', 'ambiguous', 'unspliced']
    ]):
        unspliced_mat, spliced_mat, ambiguous_mat = adata.layers[
            'unspliced'], adata.layers['spliced'], adata.layers['ambiguous']
        if issparse(unspliced_mat):
            un_cell_sum, sp_cell_sum, am_cell_sum = unspliced_mat.sum(1).A1, \
                spliced_mat.sum(1).A1, ambiguous_mat.sum(1).A1
        else:
            un_cell_sum, sp_cell_sum, am_cell_sum = np.sum(unspliced_mat, 1), \
                np.sum(spliced_mat, 1), np.sum(ambiguous_mat, 1)

        tot_cell_sum = un_cell_sum + sp_cell_sum + am_cell_sum
        un_frac_cell, sp_frac_cell, am_frac_cell = un_cell_sum / tot_cell_sum, sp_cell_sum / tot_cell_sum, am_cell_sum / tot_cell_sum
        df = pd.DataFrame({
            'unspliced': un_frac_cell,
            'spliced': sp_frac_cell,
            'ambiguous': am_frac_cell
        })

        if group is not None and group in adata.obs.keys():
            df['group'] = adata.obs[group]

        df = df.melt(id_vars='group' if 'group' in df.columns else None,
                     value_vars=['unspliced', 'spliced', 'ambiguous'])

    elif mode == 'full' and all(
            [i in adata.layers.keys() for i in ['uu', 'ul', 'su', 'sl']]):
        uu, ul, su, sl = adata.layers['uu'], adata.layers['ul'], adata.layers[
            'su'], adata.layers['sl']
        if issparse(uu):
            uu_sum, ul_sum, su_sum, sl_sum = uu.sum(1).A1, ul.sum(1).A1, \
                su.sum(1).A1, sl.sum(1).A1
        else:
            uu_sum, ul_sum, su_sum, sl_sum = np.sum(uu, 1), np.sum(ul, 1), \
                np.sum(su, 1), np.sum(sl, 1)

        tot_cell_sum = uu_sum + ul_sum + su_sum + sl_sum
        uu_frac, ul_frac, su_frac, sl_frac = uu_sum / tot_cell_sum, \
            ul_sum / tot_cell_sum, su_sum / tot_cell_sum, sl_sum / tot_cell_sum
        df = pd.DataFrame({
            'uu_frac': uu_frac,
            'ul_frac': ul_frac,
            'su_frac': su_frac,
            'sl_frac': sl_frac
        })

        if group is not None and group in adata.obs.keys():
            df['group'] = adata.obs[group]

        df = df.melt(id_vars='group' if 'group' in df.columns else None,
                     value_vars=['uu_frac', 'ul_frac', 'su_frac', 'sl_frac'])

    else:
        raise Exception(
            'Your adata is corrupted. Make sure your layers have keys new, old for the labelling mode; '
            'spliced, ambiguous, unspliced for the splicing mode; and uu, ul, su, sl for the full mode'
        )

    if group is None:
        plot = (p9.ggplot(df, p9.aes(x='variable', y='value')) +
                p9.geom_violin() + p9.xlab('Category') + p9.ylab('Fraction'))
    else:
        plot = (p9.ggplot(df, p9.aes(x='variable', y='value')) +
                p9.geom_violin() + p9.facet_wrap('~group') +
                p9.xlab('Category') + p9.ylab('Fraction'))
    return plot
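
# A minimal usage sketch (hypothetical; assumes `adata` carries `new`/`old`
# layers and a categorical `adata.obs['clusters']` column):
#
#     plot = show_fraction(adata, mode='labelling', group='clusters')
#     print(plot)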
Ejemplo n.º 30
0
def test_style():
    p = (ggplot(df, aes('x')) + geom_violin(aes(y='y'), style='right') +
         geom_violin(aes(y='y+25'), style='left'))
    assert p == 'style'
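
# Note: in plotnine, style='right' / style='left' draws only the corresponding
# half of each violin, which is useful for side-by-side split comparisons.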
Ejemplo n.º 31
0
            ),
            ignore_index=True,
        )
        final_acc = final_acc.append(
            {"verb": verb, "trial": trial, "accuracy": sum(accuracies[-5:]) / 5},
            ignore_index=True,
        )

print(final_acc[final_acc['verb']=='WondowLess'].describe())
print(final_acc[final_acc['verb']=='AllOpen'].describe())

print(stats.ttest_ind(
    final_acc[final_acc['verb']=='WondowLess']['accuracy'].values, 
    final_acc[final_acc['verb']=='AllOpen']['accuracy'].values
))
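
# stats.ttest_ind returns (statistic, pvalue) for a two-sided independent
# two-sample t-test comparing the two accuracy distributions.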

print(long_data)

plot = pn.ggplot(long_data) + pn.geom_line(
    pn.aes(x="step", y="accuracy", color="verb", group="verb*trial"), alpha=0.5
)

print(plot)

plot = (
    pn.ggplot(final_acc, pn.aes(x="verb", y="accuracy"))
    + pn.geom_violin(pn.aes(fill="verb"))
    # + pn.geom_dotplot(dotsize=0.05, stackdir="centerwhole", binaxis="y", binwidth=0.0005)
    + pn.geom_point(size=0.5, alpha=0.5)
)
print(plot)
Ejemplo n.º 32
0
def test_scale_count():
    p = (ggplot(df, aes('dist', 'value')) + geom_violin(scale='count') +
         geom_sina(scale='count', random_state=123))

    assert p == 'scale_count'
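
# For reference, geom_violin's scale parameter follows ggplot2 semantics:
#   'area'  - all violins have the same area (the default)
#   'count' - violin areas are proportional to the number of observations
#   'width' - all violins have the same maximum width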
Ejemplo n.º 33
0
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
            Calculate and compare LISI across a series of reduced dims and
            categorical variables.
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    # parser.add_argument(
    #     '-h5', '--h5_anndata',
    #     action='store',
    #     dest='h5',
    #     required=True,
    #     help='H5 AnnData file.'
    # )

    parser.add_argument(
        '-rf',
        '--reduced_dims_tsv',
        action='store',
        dest='reduced_dims',
        required=True,
        help='List of tab-delimited files of reduced dimensions (e.g., PCs)\
            for each cell. First column is cell_barcode. List should be\
            split by "::" (e.g. file1.tsv.gz::file2.tsv.gz).')

    parser.add_argument(
        '-lbl',
        '--reduced_dims_tsv_labels',
        action='store',
        dest='reduced_dims_labels',
        required=True,
        help='String of labels for each reduced_dims_tsv file. List should be\
            split by "::".')

    parser.add_argument(
        '-mf',
        '--metadata_tsv',
        action='store',
        dest='metadata_tsv',
        required=True,
        help='Tab-delimited file of metadata for each cell. First column\
            is cell_barcode.')

    parser.add_argument(
        '-mv',
        '--metadata_columns',
        action='store',
        dest='metadata_columns',
        default='experiment_id',
        help='Comma separated string of categorical variables to calculate\
            LISI with.\
            (default: %(default)s)')

    parser.add_argument('-p',
                        '--perplexity',
                        action='store',
                        dest='perplexity',
                        default=30.0,
                        type=float,
                        help='Perplexity.\
            (default: %(default)s)')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: <metadata_tsv>-lisi)')

    options = parser.parse_args()

    # Fixed settings.
    # verbose = True

    # Get the out file base.
    out_file_base = options.of
    if out_file_base == '':
        # str.rstrip('tsv.gz') strips any of those characters, not the suffix,
        # so trim the extensions explicitly.
        base = os.path.basename(options.metadata_tsv)
        for ext in ('.gz', '.tsv'):
            if base.endswith(ext):
                base = base[:-len(ext)]
        out_file_base = '{}-lisi'.format(base)

    # Get the columns to use
    lisi_columns = options.metadata_columns.split(',')
    # lisi_columns = ['experiment_id', 'batch']
    lisi_columns_dtype = dict(
        zip(lisi_columns, ['category'] * len(lisi_columns)))

    # Load the metadata file
    file_meta = options.metadata_tsv
    df_meta = pd.read_csv(file_meta,
                          sep='\t',
                          index_col='cell_barcode',
                          dtype=lisi_columns_dtype)

    # Load the reduced dims.
    files = options.reduced_dims.split('::')
    labels = options.reduced_dims_labels.split('::')
    assert len(files) == len(labels), 'ERROR: check files and labels input'

    # Make a dict of theoretical maximum LISI value for each label.
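    # LISI for a categorical variable ranges from 1 (each neighborhood contains
    # a single category) up to the number of categories (perfectly mixed), so
    # n_cat is the ceiling used for the reference line below.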
    lisi_limit = {}
    for col in lisi_columns:
        n_cat = len(df_meta[col].cat.categories)
        lisi_limit[col] = n_cat

    list_lisi = []
    for i in range(len(files)):
        df_reduced_dims = pd.read_csv(files[i],
                                      sep='\t',
                                      index_col='cell_barcode')

        # Run LISI (passing the CLI perplexity) and save results to a dataframe.
        _df_lisi = pd.DataFrame(
            hm.compute_lisi(
                df_reduced_dims.loc[df_meta.index, :],
                df_meta[lisi_columns],
                lisi_columns,
                perplexity=options.perplexity),
            columns=lisi_columns)
        _df_lisi['file'] = files[i]
        _df_lisi['label'] = labels[i]
        _df_lisi['cell_barcode'] = df_meta.index
        list_lisi.append(_df_lisi)

    # Make one long dataframe.
    df_lisi = pd.concat(list_lisi)
    # Make cell_barcode the first column.
    cols = list(df_lisi.columns)
    cols = [cols[-1]] + cols[:-1]

    # Save the results
    df_lisi[cols].to_csv('{}.tsv.gz'.format(out_file_base),
                         sep='\t',
                         index=False,
                         quoting=csv.QUOTE_NONNUMERIC,
                         na_rep='',
                         compression='gzip')

    # Compare the lisi distributions
    n_labels = len(labels)
    for lisi_column in lisi_columns:
        # Make density plot.
        gplt = plt9.ggplot(df_lisi,
                           plt9.aes(
                               fill='label',
                               x='label',
                               y=lisi_column,
                           ))
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.geom_violin(alpha=0.9)
        gplt = gplt + plt9.geom_boxplot(
            group='label',
            position=plt9.position_dodge(width=.9),
            width=.1,
            fill='white',
            outlier_alpha=0  # Do not know how to totally remove outliers.
        )
        # Add a line at the theoretical maximum
        gplt = gplt + plt9.geom_hline(
            plt9.aes(yintercept=lisi_limit[lisi_column]))
        # gplt = gplt + plt9.facet_grid('{} ~ .'.format(label))
        gplt = gplt + plt9.labs(x='Reduced dimensions', y='LISI', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt = gplt + plt9.theme(legend_position='none')
        if n_labels != 0 and n_labels < 9:
            gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
        gplt.save(
            '{}-{}-violin.png'.format(out_file_base, lisi_column),
            dpi=300,
            width=4 * (n_labels / 4),
            height=10,
            # height=4*(n_samples/4),
            limitsize=False)

        # Make ecdf.
        gplt = plt9.ggplot(df_lisi, plt9.aes(
            x=lisi_column,
            color='label',
        ))
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.stat_ecdf(alpha=0.8)
        gplt = gplt + plt9.labs(
            x='LISI',
            y='Cumulative density',
            # color='Reduction',
            title='')
        if n_labels != 0 and n_labels < 9:
            gplt = gplt + plt9.scale_color_brewer(palette='Dark2', type='qual')
        gplt.save('{}-{}-ecdf.pdf'.format(out_file_base, lisi_column),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)
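
# A hypothetical invocation (the script name and file names are placeholders;
# the flags match the argparse definitions above):
#
#     python compare_lisi.py \
#         --reduced_dims_tsv pca.tsv.gz::harmony.tsv.gz \
#         --reduced_dims_tsv_labels PCA::Harmony \
#         --metadata_tsv metadata.tsv.gz \
#         --metadata_columns experiment_id,batch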