Ejemplo n.º 1
0
def hist_pseudotime(adata, fill='#595959', alpha=1, bins=30):
    """Build a histogram of the ``pseudotime`` column of ``adata.obs``.

    Parameters
    --------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.pseudotime`.
    fill: str
        Either the name of a column of ``adata.obs`` (the bars are then
        filled according to that column) or a constant color, given as a
        supported color name or hex-code, applied to every bar.
    alpha: float
        A float between 0 and 1. Controls the transparency of the bars.
    bins: int
        Number of bins handed to the histogram layer.

    Returns
    -----------
    A plotnine histogram of pseudotime.
    """
    obs = adata.obs
    fill_is_column = fill in obs.columns

    if fill_is_column:
        # `fill` names a column: map it as an aesthetic.
        canvas = ggplot(obs, aes('pseudotime', fill=fill))
        bars = geom_histogram(alpha=alpha, bins=bins)
    else:
        # `fill` is a literal color: set it on the layer itself.
        canvas = ggplot(obs, aes('pseudotime'))
        bars = geom_histogram(fill=fill, alpha=alpha, bins=bins)

    return canvas + bars + labs(x='Pseudotime', y='Count') + theme_std
Ejemplo n.º 2
0
def plot():
    """Render the Protobowl per-user summary figures and save them as PDFs.

    The records are averaged per ``uid``; a scatter plot of accuracy against
    average buzzing position and three density histograms are written to
    ``output/protobowl/``. Counts and progress messages go to stdout.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df, _questions = load_protobowl()
    # Anything that is not literally True counts as an incorrect answer.
    df.result = df.result.apply(lambda x: x is True)
    df['log_n_records'] = df.user_n_records.apply(np.log)

    user_stat = df.groupby('uid').agg(np.mean)
    print('{} users'.format(len(user_stat)))
    print('{} records'.format(len(df)))
    print('{} questions'.format(len(set(df.qid))))

    # Transparency is the log record count scaled by its maximum.
    max_color = user_stat.log_n_records.max()
    user_stat['alpha'] = user_stat.log_n_records.apply(
        lambda value: value / max_color)

    def save_density_histogram(column, outline, fill, xlabel, filename):
        # Density-scaled histogram of `column` with a density curve on top.
        figure = (
            ggplot(user_stat, aes(x=column, y='..density..'))
            + geom_histogram(color=outline, fill=fill)
            + geom_density()
            + labs(x=xlabel, y='Density')
            + theme(aspect_ratio=0.3)
        )
        figure.save(os.path.join(outdir, filename))

    # 2D user plot
    points = geom_point(
        aes(x='relative_position', y='result',
            size='user_n_records', color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False})
    scatter = (
        ggplot(user_stat)
        + points
        + scale_color_gradient(high='#e31a1c', low='#ffffcc')
        + labs(x='Average buzzing position', y='Accuracy')
        + theme(aspect_ratio=1)
    )
    scatter.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    # histogram of number of records
    save_density_histogram('log_n_records', '#e6550d', '#fee6ce',
                           'Log number of records', 'protobowl_hist.pdf')
    print('p1 done')

    # histogram of accuracy
    save_density_histogram('result', '#31a354', '#e5f5e0',
                           'Accuracy', 'protobowl_acc.pdf')
    print('p2 done')

    # histogram of buzzing position
    save_density_histogram('relative_position', '#3182bd', '#deebf7',
                           'Average buzzing position', 'protobowl_pos.pdf')
    print('p3 done')
Ejemplo n.º 3
0
def plot():
    """Save the Protobowl per-user figures under ``output/protobowl/``.

    Records are averaged per ``uid``. One scatter plot and three
    density histograms are saved as PDFs, with a progress line printed
    after each figure.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    records = load_protobowl()
    # Only a literal True is treated as a correct answer.
    records.result = records.result.apply(lambda x: x is True)
    records['log_n_records'] = records.user_n_records.apply(np.log)

    user_stat = records.groupby('uid').agg(np.mean)
    print('{} users'.format(len(user_stat)))
    print('{} records'.format(len(records)))

    # Point transparency: log record count relative to the largest one.
    max_color = user_stat.log_n_records.max()
    user_stat['alpha'] = user_stat.log_n_records.apply(
        lambda value: value / max_color)

    # 2D user plot
    scatter = ggplot(user_stat)
    scatter = scatter + geom_point(
        aes(x='relative_position', y='result',
            size='user_n_records', color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False})
    scatter = scatter + scale_color_gradient(high='#e31a1c', low='#ffffcc')
    scatter = scatter + labs(x='Average buzzing position', y='Accuracy')
    scatter = scatter + theme(aspect_ratio=1)
    scatter.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    # (tag, column, outline color, fill color, x label, file name)
    histogram_specs = [
        # histogram of number of records
        ('p1', 'log_n_records', '#e6550d', '#fee6ce',
         'Log number of records', 'protobowl_hist.pdf'),
        # histogram of accuracy
        ('p2', 'result', '#31a354', '#e5f5e0',
         'Accuracy', 'protobowl_acc.pdf'),
        # histogram of buzzing position
        ('p3', 'relative_position', '#3182bd', '#deebf7',
         'Average buzzing position', 'protobowl_pos.pdf'),
    ]
    for tag, column, outline, fill, xlabel, filename in histogram_specs:
        figure = ggplot(user_stat, aes(x=column, y='..density..'))
        figure = figure + geom_histogram(color=outline, fill=fill)
        figure = figure + geom_density()
        figure = figure + labs(x=xlabel, y='Density')
        figure = figure + theme(aspect_ratio=0.3)
        figure.save(os.path.join(outdir, filename))
        print('{} done'.format(tag))
Ejemplo n.º 4
0
 def plot_char_percent_vs_accuracy_histogram(self, category=False):
     """Histogram of ``char_percent`` filled by ``Outcome``.

     When ``category`` is truthy the plot is faceted by ``category_jmlr``.
     Data comes from ``self.char_plot_df``; bins are 0.05 wide.
     """
     chart = ggplot(self.char_plot_df)
     if category:
         chart = chart + facet_wrap('category_jmlr')
     chart = chart + aes(x='char_percent', fill='Outcome')
     return chart + geom_histogram(binwidth=.05)
Ejemplo n.º 5
0
 def plot_char_percent_vs_accuracy_histogram(self, category=False):
     """Return a 0.05-wide-bin histogram of ``char_percent`` by ``Outcome``.

     The data is ``self.char_plot_df``. Passing a truthy ``category`` adds
     one facet per ``category_jmlr`` value.
     """
     pieces = [ggplot(self.char_plot_df)]
     if category:
         pieces.append(facet_wrap('category_jmlr'))
     pieces.append(aes(x='char_percent', fill='Outcome'))
     pieces.append(geom_histogram(binwidth=.05))

     # Combine left to right, exactly as a chained `+` expression would.
     chart = pieces[0]
     for piece in pieces[1:]:
         chart = chart + piece
     return chart
Ejemplo n.º 6
0
 def plot_char_percent_vs_accuracy_histogram(self, category=False):
     """Plot ``char_percent`` from ``self.char_plot_df`` as a histogram.

     Bars are filled by ``Outcome`` and are 0.05 wide. With a truthy
     ``category`` the plot is split into facets by ``category_jmlr``.
     """
     base = ggplot(self.char_plot_df)
     mapping = aes(x="char_percent", fill="Outcome")
     bars = geom_histogram(binwidth=0.05)

     if not category:
         return base + mapping + bars
     return base + facet_wrap("category_jmlr") + mapping + bars
Ejemplo n.º 7
0
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Density histogram of ``x`` per task with per-method mean markers.

    ``len_df`` is grouped by ``Task`` and ``Method`` to obtain the means,
    which are drawn as a vertical segment plus a text label in every facet.
    ``legend_position`` and ``legend_box`` are forwarded to ``theme``.
    Returns the assembled plot.
    """
    means = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # Constant column mapped to linetype below; presumably present so the
    # segment gets a legend entry reading 'Mean Length'.
    means[' '] = 'Mean Length'

    fig = ggplot(len_df) + aes(x='x', fill='Method', y='..density..')
    fig = fig + geom_histogram(binwidth=2, position='identity', alpha=.6)
    fig = fig + geom_text(
        aes(x='x', y=.22, label='x', color='Method'),
        means,
        inherit_aes=False,
        format_string='{:.1f}',
        show_legend=False)
    fig = fig + geom_segment(
        aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
        means,
        inherit_aes=False,
        color='black')
    fig = fig + scale_linetype_manual(['dashed'])
    fig = fig + facet_wrap('Task')
    fig = fig + xlim(0, 20)
    fig = fig + ylim(0, .23)
    fig = fig + xlab('Example Length')
    fig = fig + ylab('Frequency')
    fig = fig + scale_color_manual(values=COLORS)
    fig = fig + scale_fill_manual(values=COLORS)
    fig = fig + theme_fs()
    fig = fig + theme(
        aspect_ratio=1,
        legend_title=element_blank(),
        legend_position=legend_position,
        legend_box=legend_box,
    )
    return fig
Ejemplo n.º 8
0
def test_midpoint():
    """Histogram, frequency polygon and binned points share the same bins.

    The themed plot is asserted equal to ``'midpoint'``.
    """
    bars = geom_histogram(aes(fill='factor(z)'), bins=n, alpha=0.25)
    outline = geom_freqpoly(bins=n, size=4)
    centers = geom_point(stat='bin', bins=n, size=4, stroke=0, color='red')

    p = ggplot(df, aes('x'))
    p = p + bars
    p = p + outline
    p = p + centers

    assert p + _theme == 'midpoint'
Ejemplo n.º 9
0
def test_scale_transformed_breaks():
    """Explicit histogram breaks follow the transformation of the x scale."""
    left_edges = [1, 2.5]
    # 1 once, 2 twice, 3 three times, 4 four times.
    frame = pd.DataFrame({'x': np.repeat(range(1, 5), range(1, 5))})
    base = ggplot(frame, aes('x')) + geom_histogram(breaks=[1, 2.5, 4])

    untransformed = layer_data(base)
    sqrt_scaled = layer_data(base + scale_x_sqrt())

    np.testing.assert_allclose(untransformed.xmin, left_edges)
    np.testing.assert_allclose(sqrt_scaled.xmin, np.sqrt(left_edges))
Ejemplo n.º 10
0
 def plot_n_train_vs_accuracy(self):
     """Histogram of ``n_train`` (bin width 1) filled by ``Outcome``.

     Built from ``self.combined_df`` and faceted by ``seen``.
     """
     chart = ggplot(self.combined_df)
     chart = chart + facet_wrap("seen")
     chart = chart + aes(x="n_train", fill="Outcome")
     chart = chart + geom_histogram(binwidth=1)
     return chart
Ejemplo n.º 11
0
def plot_frequency(n=200):
    """
    Save a histogram of tweet dates, filled by author.

    Parameters
    ----------
    n: int
        Passed to ``gather_tweets``; how many tweets should be analysed.

    Returns
    -------
    Nothing. The histogram is written to ``static/test.png``.

    """
    from plotnine import (
        ggplot, aes, geom_histogram, scale_x_datetime, labs, theme_minimal,
        ggsave,
    )
    from Mod_1_API import gather_tweets
    from mizani.breaks import date_breaks
    from mizani.formatters import date_format
    import pandas

    tweets = pandas.DataFrame(gather_tweets(n))

    histogram = ggplot(tweets, aes(x='Date', fill='Author'))
    histogram = histogram + geom_histogram()
    # One axis break per week.
    histogram = histogram + scale_x_datetime(breaks=date_breaks('1 week'))
    histogram = histogram + labs(x="Time in weeks",
                                 y="Number of tweets by source")
    histogram = histogram + theme_minimal()

    ggsave(plot=histogram, filename="test.png", path="static/")
Ejemplo n.º 12
0
    def plot_ccs_stats(self, variable, *,
                       trim_frac=0.005, bins=25, histogram_stat='count',
                       maxcol=None, panelsize=1.75):
        """Plot histograms of CCS stats for all runs.

        Parameters
        ----------
        variable : {'length', 'passes', 'accuracy'}
            Variable for which we plot stats. You will get an error
            if :meth:`Summaries.has_stat` is not true for `variable`.
        trim_frac : float
            Trim this amount of the bottom and top fraction from the
            data before plotting. Useful if outliers greatly extend scale.
            The cut-offs are quantiles of the whole table returned by
            ``self.ccs_stats(variable)``, not of each run separately.
        bins : int
            Number of histogram bins.
        histogram_stat : {'count', 'density'}
            Plot the count of CCSs or their density normalized for each run.
        maxcol : None or int
            Max number of columns in faceted plot.
        panelsize : float
            Size of each plot panel.

        Returns
        -------
        plotnine.ggplot.ggplot
            A panel of histograms, one facet per value of ``name``.

        """
        df = (self.ccs_stats(variable)
              .assign(lower=lambda x: x[variable].quantile(trim_frac),
                      upper=lambda x: x[variable].quantile(1 - trim_frac),
                      trim=lambda x: ((x[variable] > x['upper']) |
                                      (x[variable] < x['lower']))
                      )
              .query('not trim')
              )

        npanels = len(df['name'].unique())
        if maxcol is None:
            ncol = npanels
        else:
            ncol = min(maxcol, npanels)
        nrow = math.ceil(npanels / ncol)

        # The y axis shows whatever statistic was requested, so only call
        # it a number of CCSs when counts are actually plotted.
        if histogram_stat == 'count':
            y_label = 'number of CCSs'
        else:
            y_label = histogram_stat

        p = (p9.ggplot(df, p9.aes(variable, y=f"..{histogram_stat}..")) +
             p9.geom_histogram(bins=bins) +
             p9.facet_wrap('~ name', ncol=ncol) +
             p9.theme(figure_size=(panelsize * ncol, panelsize * nrow),
                      axis_text_x=p9.element_text(angle=90,
                                                  vjust=1,
                                                  hjust=0.5)
                      ) +
             p9.ylab(y_label)
             )

        return p
Ejemplo n.º 13
0
def plot_histogram(df_plot,
                   variable_column,
                   output_file='plot_distribution',
                   facet_column='none',
                   x_log10=False):
    """Plot a histogram of one column and save it as a png.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        DataFrame with <variable_column> as a column. It is not modified.
    variable_column : string
        Name of the column to plot.
    output_file : string
        Basename of output file; ``.png`` is appended.
    facet_column : string
        Column to facet the plot by. The literal ``'none'`` disables
        faceting.
    x_log10 : bool
        Plot log10 of the values. If the column contains zeros, 1e-10 is
        added to every value before taking the logarithm.

    Returns
    -------
    int
        0 when the plot was written; 1 when ``x_log10`` was requested but
        the column contains negative values, in which case nothing is
        plotted.
    """
    # Work on a copy so the helper column 'x' never leaks into the
    # caller's DataFrame.
    df_plot = df_plot.assign(x=df_plot[variable_column])
    if x_log10:
        if np.any(df_plot['x'].values < 0):
            return 1
        elif np.any(df_plot['x'].values == 0):
            # Small offset keeps log10 finite for the zero entries.
            df_plot['x'] = np.log10(df_plot['x'].values + 1e-10)
            variable_column = variable_column + ' (log10)'
        else:
            df_plot['x'] = np.log10(df_plot['x'].values)
            variable_column = variable_column + ' (log10)'
    gplt = plt9.ggplot(df_plot, plt9.aes(x='x'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_histogram(alpha=0.8)
    gplt = gplt + plt9.scale_x_continuous(
        # trans='log10',
        # labels=comma_labels,
        minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(
        # trans='log10',
        # labels=comma_labels,
        minor_breaks=0)
    gplt = gplt + plt9.labs(title='', x=variable_column)
    gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=-45, hjust=0))
    if facet_column != 'none':
        gplt = gplt + plt9.facet_wrap('~ {}'.format(facet_column), ncol=5)
        # Figure size grows with the number of facets.
        n_facets = df_plot[facet_column].nunique()
        gplt.save('{}.png'.format(output_file),
                  dpi=300,
                  width=6 * (n_facets / 4),
                  height=4 * (n_facets / 4),
                  limitsize=False)
    else:
        gplt.save('{}.png'.format(output_file), dpi=300, width=4, height=4)
    return 0
Ejemplo n.º 14
0
def test_deepcopy():
    """Check what ``deepcopy`` of a plot duplicates and what it shares."""
    original = ggplot(aes('x'), data=df) + geom_histogram()
    clone = deepcopy(original)

    assert original is not clone
    # The data is expected to be shared; unclear whether anything more
    # needs to be done for it.
    assert original.data is clone.data
    assert len(original.layers) == len(clone.layers)
    assert original.layers[0].geom is not clone.layers[0].geom
    assert len(original.mapping) == len(clone.mapping)
    assert original.mapping is not clone.mapping
    assert original.environment is clone.environment
Ejemplo n.º 15
0
def plot_pixel_values(im):
    """Return a histogram of all values of ``im`` after flattening it."""
    flat = im.flatten()
    pixel_frame = pd.DataFrame({'values': flat}, index=range(len(flat)))

    chart = p9.ggplot()
    chart = chart + p9.geom_histogram(data=pixel_frame,
                                      mapping=p9.aes('values'))
    chart = chart + p9.theme_xkcd()
    chart = chart + p9.labels.xlab('Pixel values')
    return chart
Ejemplo n.º 16
0
def plot_hists(df, out=None, **kwargs):
    r"""Construct histograms

    Create one histogram facet per requested column. Often used to visualize
    the results of random sampling for multiple outputs.

    Usually called as a dispatch from plot_auto().

    Args:
        df (DataFrame): Data to plot
        out (list of strings): Variables to plot

    Returns:
        plotnine plot of faceted histograms

    Raises:
        ValueError: if `out` is not provided

    Examples:

        >>> import grama as gr
        >>> import matplotlib.pyplot as plt
        >>> from grama.models import make_cantilever_beam
        >>> md = make_cantilever_beam()
        >>> ## Dispatch from autoplotter
        >>> (
        >>>     md
        >>>     >> gr.ev_sample(n=100, df_det="nom")
        >>>     >> gr.pt_auto()
        >>> )
        >>> ## Re-create without metadata
        >>> (
        >>>     md
        >>>     >> gr.ev_sample(n=100, df_det="nom")
        >>>     >> gr.pt_hists(out=md.out)
        >>> )

    """
    if out is None:
        raise ValueError("Must provide input columns list as keyword out")

    # Long format: the column name goes to "var", its value to "value".
    df_long = df >> tf_pivot_longer(
        columns=out,
        names_to="var",
        values_to="value",
    )

    canvas = ggplot(aes("value"))
    canvas = canvas + geom_histogram(bins=30)
    canvas = canvas + facet_wrap("var", scales="free")
    canvas = canvas + theme_minimal()
    canvas = canvas + labs(x="Output Value", y="Count")

    return df_long >> canvas
Ejemplo n.º 17
0
def plot_estimate_distribution(dist):
    """Histogram of ``dist['estimates']`` with two vertical reference lines.

    A solid line marks the sum of ``pile['denomination']`` (``pile`` is
    read from the enclosing scope) and a dotted line marks 3363400.
    """
    accent = "#FF5500"

    chart = pn.ggplot(dist, pn.aes(x='estimates'))
    chart = chart + pn.geom_histogram(bins=25)
    chart = chart + pn.geom_vline(
        xintercept=sum(pile['denomination']),
        color=accent,
        size=2,
    )
    chart = chart + pn.geom_vline(
        xintercept=3363400,
        color=accent,
        size=2,
        linetype='dotted',
    )
    return chart
Ejemplo n.º 18
0
    def density_plot(  # type: ignore
        self,
        df: pd.DataFrame,
        xmin=None,
        xmax=None,
        fill: str = "#fbb4ae",
        bins: int = 50,
        **kwargs,
    ):
        """Histogram of the first column of ``df``.

        :param df: data to plot; only ``df.columns[0]`` is used
        :param xmin: lower x limit, forwarded to ``self._scale_x``
        :param xmax: upper x limit, forwarded to ``self._scale_x``
        :param fill: bar color
        :param bins: number of histogram bins
        :param kwargs: accepted but not used here
        :return: ggplot graphics object
        """
        value_column = df.columns[0]

        chart = ggplot(df, aes(value_column))
        chart = chart + geom_histogram(fill=fill, bins=bins)
        chart = chart + self._scale_x(xmin, xmax)
        chart = chart + ergo_theme
        # Tilt the x tick labels.
        chart = chart + theme(axis_text_x=element_text(rotation=45, hjust=1))
        return chart
Ejemplo n.º 19
0
    def comparison_plot(  # type: ignore
            self,
            df: pd.DataFrame,
            xmin=None,
            xmax=None,
            bins: int = 50,
            **kwargs):
        """Stacked facets of histograms, one per value of the first column.

        :param df: data to plot; ``df.columns[0]`` selects facet and fill,
            ``df.columns[1]`` holds the plotted values
        :param xmin: lower x limit, forwarded to ``self._scale_x``
        :param xmax: upper x limit, forwarded to ``self._scale_x``
        :param bins: number of histogram bins
        :param kwargs: accepted but not used here
        :return: ggplot graphics object
        """
        group_column = df.columns[0]
        value_column = df.columns[1]

        chart = ggplot(df, aes(value_column, fill=group_column))
        chart = chart + scale_fill_brewer(type="qual", palette="Pastel1")
        chart = chart + geom_histogram(position="identity", alpha=0.9,
                                       bins=bins)
        chart = chart + self._scale_x(xmin, xmax)
        chart = chart + facet_wrap(group_column, ncol=1)
        # The facets already identify the groups, so drop the fill legend.
        chart = chart + guides(fill=False)
        chart = chart + ergo_theme
        chart = chart + theme(axis_text_x=element_text(rotation=45, hjust=1))
        return chart
Ejemplo n.º 20
0
 def create(self, file_path: str) -> None:
     """Save the "Class Sizes" histogram of ``loc`` to ``file_path``.

     One facet row per ``category`` with independent y scales; the x axis
     uses the asinh transformation. Data comes from ``self._data``.
     """
     chart = ggplot(self._data, aes("loc"))
     chart = chart + geom_histogram(bins=100, fill="#1e4f79")
     chart = chart + facet_grid(facets="category ~ .", scales='free_y')
     chart = chart + scale_x_continuous(trans=asinh_trans(),
                                        labels=asinh_labels)
     chart = chart + scale_y_continuous(labels=comma_format())
     # Alternative y labels as a percentage of all rows (not enabled):
     # scale_y_continuous(labels=lambda l: ["%.2f%%" % (v * 100 / len(self._data)) for v in l])
     chart = chart + ggtitle("Class Sizes")
     chart = chart + xlab("Lines of Code")
     chart = chart + ylab("Number of Classes")
     chart = chart + theme_classic(base_size=32, base_family="Helvetica")
     chart = chart + theme(text=element_text(size=32),
                           subplots_adjust={"hspace": 0.1})
     chart.save(file_path, width=8, height=18)
 def create(self, file_path: str) -> None:
     """Save the QMOOD quality-attribute histograms to ``file_path``.

     ``self._data`` is plotted as one histogram of ``value`` per
     ``variable``, three facets per row, with an asinh-transformed x axis.
     """
     layers = [
         geom_histogram(bins=100, fill="#1e4f79"),
         facet_wrap(facets="variable", scales="free", ncol=3),
         scale_x_continuous(trans=asinh_trans(), labels=asinh_labels),
         scale_y_continuous(labels=comma_format()),
         ggtitle("Distributions of QMOOD Quality Attributes"),
         xlab("Quality Attribute Value"),
         ylab("Number of Projects"),
         theme_classic(base_size=32, base_family="Helvetica"),
         theme(text=element_text(size=32),
               subplots_adjust={"wspace": 0.35, "hspace": 0.35}),
     ]

     chart = ggplot(self._data, aes("value"))
     for layer in layers:
         chart = chart + layer
     chart.save(file_path, width=24, height=12)
 def create(self, file_path: str) -> None:
     """Save the "Intensity of Design Pattern Use" histograms.

     ``self._data`` is plotted as one histogram of ``value`` per
     ``variable`` (three facets per row) with the x axis limited to
     [0, 1]; the figure is written to ``file_path``.
     """
     spacing = {"wspace": 0.3, "hspace": 0.5}
     styling = theme(text=element_text(size=32),
                     axis_title_y=element_text(margin={"r": 40}),
                     subplots_adjust=spacing)

     chart = ggplot(self._data, aes("value"))
     chart = chart + geom_histogram(bins=100, fill="#1e4f79")
     chart = chart + facet_wrap(facets="variable", scales="free", ncol=3)
     chart = chart + xlim(0, 1)
     chart = chart + scale_y_continuous(labels=comma_format())
     chart = chart + ggtitle("Intensity of Design Pattern Use")
     chart = chart + xlab(
         "Percentage of Classes Participating in Design Pattern")
     chart = chart + ylab("Number of Projects")
     chart = chart + theme_classic(base_size=32, base_family="Helvetica")
     chart = chart + styling
     chart.save(file_path, width=24, height=24)
Ejemplo n.º 23
0
def plot_histogram_100_bins(histogram_df):
    """Build a 100-bin histogram of the ``effect_size`` column.

    Inputs
    ------
    histogram_df: pandas.DataFrame
        The dataframe containing the data to be plotted; its
        ``effect_size`` column goes on the x axis

    Returns
    -------
    plot: plotnine.ggplot
        The histogram figure
    """
    canvas = ggplot(histogram_df, aes(x='effect_size'))
    return canvas + geom_histogram(bins=100)
Ejemplo n.º 24
0
def plot_dist_with_ci(dist):
    """Histogram of ``dist.estimates`` with its central 95% interval.

    Dotted vertical lines mark the 2.5% and 97.5% quantiles of the
    ``estimates`` column, and the title shows the mean followed by the same
    two quantiles.

    Parameters
    ----------
    dist : pandas.DataFrame
        Must contain an ``estimates`` column.

    Returns
    -------
    The plotnine plot.
    """
    estimates = dist.estimates
    # Take the quantiles from the plotted column only. Calling
    # `dist.quantile` on the whole frame would also pick up any other
    # column and disagree with the numbers in the title.
    lower = estimates.quantile(0.025)
    upper = estimates.quantile(0.975)

    return (pn.ggplot(dist, pn.aes(x='estimates')) +
            pn.geom_histogram(bins=25) + pn.geom_vline(
                xintercept=lower,
                color="#FF5500",
                size=2,
                linetype='dotted',
            ) + pn.geom_vline(
                xintercept=upper,
                color="#FF5500",
                size=2,
                linetype='dotted',
            ) + pn.ggtitle("${0:,.0f} ({1:,.0f}, {2:,.0f})".format(
                np.mean(estimates),
                lower,
                upper,
            )))
Ejemplo n.º 25
0
    def plotMutsHistogram(self, value, *,
                          mutant_order=1, bins=30, wt_vline=True):
        """Histogram of a phenotype over all mutants of a given order.

        Parameters
        ----------
        value : {'latentPhenotype', 'observedPhenotype', 'observedEnrichment'}
            Name of the method of this object that supplies the plotted
            values.
        mutant_order : int
            Order of the mutants to plot. Only 1 (single mutants) is
            implemented.
        bins : int
            Number of bins in histogram.
        wt_vline : bool
            Draw a dashed vertical line at the wildtype value.

        Returns
        -------
        plotnine.ggplot.ggplot
            Histogram of phenotype for all mutants.

        Raises
        ------
        ValueError
            If `mutant_order` is not 1 or `value` is not a recognized name.

        """
        if mutant_order != 1:
            raise ValueError('only implemented for `mutant_order` of 1')

        allowed_values = {'latentPhenotype', 'observedPhenotype',
                          'observedEnrichment'}
        if value not in allowed_values:
            raise ValueError(f"invalid `value` of {value}")
        phenotype_of = getattr(self, value)

        per_mutant = [phenotype_of(mut) for mut in self.muteffects.keys()]
        frame = pd.DataFrame({value: per_mutant})

        hist = p9.ggplot(frame, p9.aes(value))
        hist = hist + p9.geom_histogram(bins=bins)
        hist = hist + p9.theme(figure_size=(3.5, 2.5))
        hist = hist + p9.ylab(f"number of {mutant_order}-mutants")

        if not wt_vline:
            return hist

        # The wildtype value is obtained by passing the empty string.
        return hist + p9.geom_vline(
            xintercept=phenotype_of(''),
            color=CBPALETTE[1],
            linetype='dashed')
Ejemplo n.º 26
0
def plot_pred_hist(label_list, pred_list, names=None, n_bins=10):
    """
    Draw histograms of true labels and predicted probabilities.

    :param label_list: sequence of true-label lists, given as
        [(y1, y2, ...), (y1, y2, ...)], aligned with pred_list
    :param pred_list: sequence of predicted-probability lists; must have the
        same length as label_list
    :param names: model names, either None or one per entry of label_list.
        If None, ('train', 'test') is used for two entries,
        ('train', 'valid', 'test') for three, and running integers otherwise.
    :param n_bins: number of histogram bins
    :return: plotnine object
    TODO: how geom_vline should be displayed
    """
    if names is None:
        if len(label_list) == 2:
            names = ('train', 'test')
        elif len(label_list) == 3:
            names = ('train', 'valid', 'test')
        else:
            names = list(range(len(label_list)))
    # Facets are ordered by the position of each name; the labeller maps
    # the stringified position back to the name.
    name_order = {k: v for v, k in enumerate(names)}
    name_order_rev = {str(k): v for v, k in name_order.items()}
    d = pd.DataFrame(
        {
            'y': list(chain.from_iterable(label_list)),
            'prediction': list(chain.from_iterable(pred_list)),
        }
    ).assign(
        model=list(chain.from_iterable(
            [[name] * len(l) for name, l in zip(names, label_list)]))
    ).melt(
        id_vars='model'
    ).assign(
        order=lambda x: x.model.replace(name_order)
    ).sort_values(['order', 'variable'])
    # Per-group means, used to draw the mean as a guide line.
    d_mean = d.drop(columns='order').groupby(['variable', 'model']).mean(
            ).reset_index().rename(columns={'value': 'mean'})
    d = d.merge(d_mean, on=['variable', 'model'])
    return (
        ggplot(
            d,
            aes(x='value', y='..density..', group='variable', fill='variable')
        )
        # Honor the caller's bin count instead of a fixed 10.
        + geom_histogram(position='identity', alpha=.5, bins=n_bins)
        + geom_vline(
            aes(xintercept='mean', group='variable', color='variable',
                linetype='variable')
        )
        + labs(x='prediction', fill='frequency', linetype='mean',
               color='mean')
        + facet_wrap(
            '~order', scales='free_y', labeller=lambda x: name_order_rev[x]
        )
        + theme_classic()
        + theme(figure_size=(6, 4))
    )
    def hist_residuals(self, figure_size=(8, 4), sample_frac=1.0):
        """Plot the distribution of the ``residual`` column of ``self.df``.

        A dashed red line marks zero.

        Parameters
        ----------
        figure_size : tuple(int, int), optional default=(8, 4)
            Plot size (width, height)

        sample_frac : float, optional default=1.0
            Fraction of data points to plot; rows are drawn with
            ``DataFrame.sample``

        Returns
        -------
        plot : ggplot object
        """
        sampled = self.df.sample(frac=sample_frac)

        chart = ggplot(sampled, aes(x="residual"))
        chart = chart + geom_histogram(fill="lightblue", colour="grey")
        chart = chart + geom_vline(xintercept=0, color="red",
                                   linetype="dashed")
        chart = chart + labs(title="Residuals", x="Residuals")
        chart = chart + theme(figure_size=figure_size)
        return chart
Ejemplo n.º 28
0
    def show_community_prediction(
        self,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        num_samples: int = 1000,
        bins: int = 50,
    ):
        """
        Plot a histogram of samples drawn from the community prediction

        :param percent_kept: percentage of sample distribution to keep
        :param side_cut_from: which side to cut tails from, either 'both','lower', or 'upper'
        :param num_samples: number of samples to draw from the community prediction
        :param bins: number of histogram bins; more bins give a finer-grained graph, fewer bins aggregate more
        :return: ggplot graphics object
        """
        draws = [self.sample_normalized_community()
                 for _ in range(num_samples)]
        community_samples = pd.Series(draws)

        # Axis limits: central quantiles of the normalized samples, mapped
        # back to the question's true scale.
        normed_limits = self.get_central_quantiles(
            community_samples,
            percent_kept=percent_kept,
            side_cut_from=side_cut_from)
        (_xmin, _xmax) = normed_limits
        _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])

        df = pd.DataFrame(
            data={"samples": self.denormalize_samples(community_samples)})

        # Use the question name when there is one, else the wrapped title.
        if self.name:
            title_name = f"Q: {self.name}"
        else:
            wrapped = textwrap.wrap(self.data["title"], 60)  # type: ignore
            title_name = "\n".join(wrapped)

        chart = ggplot(df, aes("samples"))
        chart = chart + geom_histogram(fill="#b3cde3", bins=bins)
        chart = chart + scale_x_datetime(limits=(_xmin, _xmax))
        chart = chart + labs(x="Prediction", y="Counts", title=title_name)
        chart = chart + ergo_theme
        chart = chart + theme(axis_text_x=element_text(rotation=45, hjust=1))
        return chart
Ejemplo n.º 29
0
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Plot the distribution of ``x`` per ``Task``, split by ``Method``.

    The mean of every (Task, Method) group is added as a vertical segment
    with a text label. ``legend_position`` and ``legend_box`` are passed to
    ``theme``. The assembled plot is returned.
    """
    group_means = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # Constant column mapped to linetype; presumably there so the mean
    # segment appears in the legend as 'Mean Length'.
    group_means[' '] = 'Mean Length'

    canvas = ggplot(len_df) + aes(x='x', fill='Method', y='..density..')

    bars = geom_histogram(binwidth=2, position='identity', alpha=.6)
    mean_labels = geom_text(
        aes(x='x', y=.22, label='x', color='Method'),
        group_means,
        inherit_aes=False,
        format_string='{:.1f}',
        show_legend=False,
    )
    mean_lines = geom_segment(
        aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
        group_means,
        inherit_aes=False,
        color='black',
    )
    look = theme(
        aspect_ratio=1,
        legend_title=element_blank(),
        legend_position=legend_position,
        legend_box=legend_box,
    )

    return (
        canvas + bars + mean_labels + mean_lines
        + scale_linetype_manual(['dashed'])
        + facet_wrap('Task')
        + xlim(0, 20)
        + ylim(0, .23)
        + xlab('Example Length')
        + ylab('Frequency')
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
        + theme_fs()
        + look
    )
Ejemplo n.º 30
0
    def plotMutsHistogram(self,
                          value, *,
                          k=None,
                          mutant_order=1,
                          bins=30,
                          wt_vline=True,
                          ):
        """Histogram of a phenotype over all mutants of a given order.

        Parameters
        ----------
        value : {'latentPhenotype', 'observedPhenotype', 'observedEnrichment'}
            Name of the method of this object that supplies the plotted
            values.
        k : int or None
            If value is `latentPhenotype`, which phenotype (1 <= `k` <=
            :attr:`MultiLatentSigmoidPhenotypeSimulator.n_latent_phenotypes`)
            to plot. Ignored for the other values.
        mutant_order : int
            Order of the mutants to plot. Only 1 (single mutants) is
            implemented.
        bins : int
            Number of bins in histogram.
        wt_vline : bool
            Draw a dashed vertical line at the wildtype value.

        Returns
        -------
        plotnine.ggplot.ggplot
            Histogram of phenotype for all mutants.

        Raises
        ------
        ValueError
            If `mutant_order` is not 1, `value` is not a recognized name,
            or `k` is invalid for `latentPhenotype`.

        """
        if mutant_order != 1:
            raise ValueError('only implemented for `mutant_order` of 1')

        allowed_values = {'latentPhenotype', 'observedPhenotype',
                          'observedEnrichment'}
        if value not in allowed_values:
            raise ValueError(f"invalid `value` of {value}")

        # Only the latent phenotype takes the extra `k` argument.
        kwargs = {}
        xlabel = value
        if value == 'latentPhenotype':
            if not (isinstance(k, int)
                    and 1 <= k <= self.n_latent_phenotypes):
                raise ValueError(f"invalid `k` of {k}")
            kwargs = {'k': k}
            xlabel = f"latentPhenotype {k}"

        phenotype_of = getattr(self, value)
        per_mutant = [phenotype_of(sub, **kwargs) for sub in self._all_subs]
        frame = pd.DataFrame({value: per_mutant})

        hist = p9.ggplot(frame, p9.aes(value))
        hist = hist + p9.geom_histogram(bins=bins)
        hist = hist + p9.theme(figure_size=(3.5, 2.5))
        hist = hist + p9.ylab(f"number of {mutant_order}-mutants")
        hist = hist + p9.xlab(xlabel)

        if not wt_vline:
            return hist

        # The wildtype value is obtained by passing the empty string.
        return hist + p9.geom_vline(
            xintercept=phenotype_of('', **kwargs),
            color=CBPALETTE[1],
            linetype='dashed')
Ejemplo n.º 31
0
def test_histogram_count():
    """Histogram filled by ``factor(z)``; asserted equal to 'histogram-count'."""
    bars = geom_histogram(aes(fill='factor(z)'), bins=n)
    p = ggplot(df, aes('x'))
    p = p + bars

    assert p + _theme == 'histogram-count'
Ejemplo n.º 32
0
 def plot_n_train_vs_accuracy(self):
     """Return a histogram of ``n_train`` from ``self.combined_df``.

     Bars are one unit wide, filled by ``Outcome``, and the plot is faceted
     by ``seen``.
     """
     base = ggplot(self.combined_df)
     faceted = base + facet_wrap('seen')
     mapped = faceted + aes(x='n_train', fill='Outcome')
     return mapped + geom_histogram(binwidth=1)
Ejemplo n.º 33
0




# Do the numeric and integer columns need to be normalized?
# SeniorCitizen is already in binary form, so it does not.

from plotnine import ggplot, aes, geom_histogram, geom_boxplot
# Save a histogram of each charge column to inspect its distribution.
(ggplot(dat, aes(x='MonthlyCharges'))
+ geom_histogram()).save(filename="MonthlyCharges_Hist.png", dpi=300)

(ggplot(dat, aes(x='TotalCharges'))
+ geom_histogram()).save(filename="TotalCharges_Hist.png", dpi=300)

# Neither column follows a normal distribution. A log transformation could
# help, although the shapes are odd. The +1 keeps log defined for zeros.
dat["LogTotalCharges"] = np.log(dat["TotalCharges"]+1)
dat["LogMonthlyCharges"] = np.log(dat["MonthlyCharges"]+1)


# These two plots are built but not saved; presumably they were viewed
# interactively (e.g. in a notebook) - TODO confirm.
(ggplot(dat, aes(x='LogMonthlyCharges'))
+ geom_histogram())

(ggplot(dat, aes(x='LogTotalCharges'))
+ geom_histogram())

# The log transformation does not really help, so drop the temporary
# columns and leave the data as it was.

dat = dat.drop(columns = ["LogTotalCharges", "LogMonthlyCharges"])

Ejemplo n.º 34
0
    def show_prediction(
        self,
        samples,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        show_community: bool = False,
        num_samples: int = 1000,
        bins: int = 50,
    ):
        """Plot a histogram of a prediction on the true question scale.

        The prediction is given either as raw samples or as a submission
        object. Optionally it is compared against samples drawn from the
        community prediction; the two histograms are then shown in separate
        facets, one above the other.

        :param samples: samples from a distribution answering the prediction
            question (true scale) as a list, numpy array or pandas series,
            or a SubmissionMixtureParams object to draw samples from
        :param percent_kept: percentage of the sample distribution to keep
            when choosing the x-axis limits
        :param side_cut_from: which side to cut tails from, either 'both',
            'lower', or 'upper'
        :param show_community: whether to compare against community
            predictions
        :param num_samples: number of samples to draw from a submission
            object and from the community. When raw samples are passed this
            is replaced by the number of samples given.
        :param bins: number of histogram bins; more bins give a more
            fine-grained graph, fewer bins aggregate more
        :return: ggplot graphics object
        :raises ValueError: if ``samples`` is neither a submission object
            nor a list, numpy array or pandas series
        """
        if isinstance(samples, SubmissionMixtureParams):
            prediction_normed_samples = pd.Series(
                [logistic.sample_mixture(samples) for _ in range(num_samples)]
            )
        else:
            if isinstance(samples, list):
                samples = pd.Series(samples)
            # isinstance (rather than an exact type match) also accepts
            # subclasses of Series/ndarray.
            if not isinstance(samples, (pd.Series, np.ndarray)):
                raise ValueError(
                    "Samples should be a list, numpy arrray or pandas series")
            # Draw exactly as many community samples as prediction samples
            # so both columns fit into one data frame.
            num_samples = samples.shape[0]
            prediction_normed_samples = self.normalize_samples(samples)

        title_name = (
            f"Q: {self.name}" if self.name else "\n".join(
                textwrap.wrap(self.data["title"], 60))  # type: ignore
        )

        if show_community:
            df = pd.DataFrame(
                data={
                    "community": [  # type: ignore
                        self.sample_normalized_community()
                        for _ in range(num_samples)
                    ],
                    "prediction":
                    prediction_normed_samples,  # type: ignore
                })
            # Get the domain for the graph given the percentage of the
            # distribution kept; quantiles are taken on the normalized scale
            # and converted back afterwards.
            (_xmin,
             _xmax) = self.get_central_quantiles(df,
                                                 percent_kept=percent_kept,
                                                 side_cut_from=side_cut_from)
            _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
            df["prediction"] = self.denormalize_samples(df["prediction"])
            df["community"] = self.denormalize_samples(df["community"])

            # Long format: one row per sample, labelled by its source.
            df = pd.melt(df, var_name="sources",
                         value_name="samples")  # type: ignore
            return (ggplot(df, aes("samples", fill="sources")) +
                    scale_fill_brewer(type="qual", palette="Pastel1") +
                    # Honour the caller's `bins` here as well, so both
                    # branches bin the data the same way.
                    geom_histogram(position="identity", alpha=0.9,
                                   bins=bins) +
                    scale_x_datetime(limits=(_xmin, _xmax)) +
                    facet_wrap("sources", ncol=1) + labs(
                        x="Prediction",
                        y="Counts",
                        title=title_name,
                    ) + guides(fill=False) + ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))

        (_xmin, _xmax) = self.get_central_quantiles(
            prediction_normed_samples,
            percent_kept=percent_kept,
            side_cut_from=side_cut_from,
        )
        _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
        df = pd.DataFrame(data={
            "prediction":
            self.denormalize_samples(prediction_normed_samples)
        })
        return (ggplot(df, aes("prediction")) +
                geom_histogram(fill="#b3cde3", bins=bins) +
                scale_x_datetime(limits=(_xmin, _xmax)) +
                labs(x="Prediction", y="Counts", title=title_name) +
                ergo_theme +
                theme(axis_text_x=element_text(rotation=45, hjust=1)))
Ejemplo n.º 35
0
    user_stat.log_n_records.apply(lambda x: x / max_color), index=user_stat.index)


# Scatter plot of users: ratio against accuracy. Point size follows
# n_records, colour follows log_n_records and transparency follows the
# alpha column; all three legends are hidden.
p0 = (
    ggplot(user_stat)
    + geom_point(
        aes(x='ratio', y='accuracy', size='n_records',
            color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False},
    )
    + scale_color_gradient(high='#e31a1c', low='#ffffcc')
    + theme(aspect_ratio=1)
)
p0.save('protobowl_users.pdf')
print('p0 done')


# Density-scaled histogram of log_n_records with a density curve on top.
p1 = (
    ggplot(user_stat, aes(x='log_n_records', y='..density..'))
    + geom_histogram(color='#e6550d', fill='#fee6ce')
    + geom_density()
    + theme(aspect_ratio=0.3)
)
p1.save('protobowl_hist.pdf')
print('p1 done')


# Same layout for accuracy; the density layer names its x aesthetic
# explicitly.
p2 = (
    ggplot(user_stat, aes(x='accuracy', y='..density..'))
    + geom_histogram(color='#31a354', fill='#e5f5e0')
    + geom_density(aes(x='accuracy'))
    + theme(aspect_ratio=0.3)
)
p2.save('protobowl_acc.pdf')
print('p2 done')