Exemple #1
0
def estimate_cutoffs_plot(output_file,
                          df_plt,
                          df_cell_estimate_cutoff,
                          df_fit=None,
                          scale_x_log10=False,
                          save_plot=True):
    """Plot UMI counts by sorted cell barcodes."""
    if min(df_plt['umi_counts']) <= 0:
        fix_log_scale = min(df_plt['umi_counts']) + 1
        df_plt['umi_counts'] = df_plt['umi_counts'] + fix_log_scale
    gplt = plt9.ggplot()
    gplt = gplt + plt9.theme_bw()
    if len(df_plt) <= 50000:
        gplt = gplt + plt9.geom_point(mapping=plt9.aes(x='barcode',
                                                       y='umi_counts'),
                                      data=df_plt,
                                      alpha=0.05,
                                      size=0.1)
    else:
        gplt = gplt + plt9.geom_line(mapping=plt9.aes(x='barcode',
                                                      y='umi_counts'),
                                     data=df_plt,
                                     alpha=0.25,
                                     size=0.75,
                                     color='black')
    gplt = gplt + plt9.geom_vline(mapping=plt9.aes(xintercept='n_cells',
                                                   color='method'),
                                  data=df_cell_estimate_cutoff,
                                  alpha=0.75,
                                  linetype='dashdot')
    gplt = gplt + plt9.scale_color_brewer(palette='Dark2', type='qual')
    if scale_x_log10:
        gplt = gplt + plt9.scale_x_continuous(
            trans='log10', labels=comma_labels, minor_breaks=0)
    else:
        gplt = gplt + plt9.scale_x_continuous(labels=comma_labels,
                                              minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(
        trans='log10', labels=comma_labels, minor_breaks=0)
    gplt = gplt + plt9.labs(title='',
                            y='UMI counts',
                            x='Barcode index, sorted by UMI count',
                            color='Cutoff')
    # Add the fit of the droplet utils model
    if df_fit:
        gplt = gplt + plt9.geom_line(mapping=plt9.aes(x='x', y='y'),
                                     data=df_fit,
                                     alpha=1,
                                     color='yellow')
    if save_plot:
        gplt.save('{}.png'.format(output_file), dpi=300, width=5, height=4)
    return gplt
Exemple #2
0
def test_wrong_bases():
    # x axis not transformed
    p = (ggplot(df, aes('x', 'x')) +
         annotation_logticks(sides='b', size=.75, base=10) + geom_point())

    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # x axis not transform, but ticks requested for a different base
    p = (ggplot(df, aes('x', 'x')) +
         annotation_logticks(sides='b', size=.75, base=10) +
         scale_x_continuous(trans=log_trans(8)) + geom_point())

    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # x axis is discrete
    df2 = df.assign(discrete=pd.Categorical([str(a) for a in df['x']]))
    p = (ggplot(df2, aes('discrete', 'x')) +
         annotation_logticks(sides='b', size=.75, base=None) + geom_point())

    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # y axis is discrete
    df2 = df.assign(discrete=pd.Categorical([str(a) for a in df['x']]))
    p = (ggplot(df2, aes('x', 'discrete')) +
         annotation_logticks(sides='l', size=.75, base=None) + geom_point())

    with pytest.warns(PlotnineWarning):
        p.draw_test()
def test_annotation_stripes_continuous_transformed():
    pdf = mtcars.assign(am=pd.Categorical(mtcars.am))
    p = (ggplot(pdf) +
         annotation_stripes(fills=["red", "green", "blue"], alpha=0.1) +
         geom_jitter(aes("hp", "wt", color="am"), random_state=5) +
         scale_x_continuous(trans='log2'))
    assert p == "annotation_stripes_continuous_transformed"
Exemple #4
0
def round_2_plot():
    if not os.path.exists(round_2_df_path):
        eprint(f'Downloading {round_2_df_url} to {round_2_df_path}')
        urlretrieve(round_2_df_url, round_2_df_path)
    verify_checksum(round_2_df_checksum, round_2_df_path)
    df = pd.read_json(round_2_df_path)
    p = (
        ggplot(df) + aes(x='char_percent', y='correct', color='Dataset') +
        facet_wrap('Guessing_Model', nrow=1) + stat_summary_bin(
            fun_data=mean_no_se, bins=20, shape='.', linetype='None',
            size=0.5) + scale_y_continuous(breaks=np.linspace(0, 1, 6)) +
        scale_x_continuous(breaks=[0, .5, 1]) +
        coord_cartesian(ylim=[0, 0.7]) +
        ggtitle('Round 2 Attacks and Models') +
        xlab('Percent of Question Revealed') + ylab('Accuracy') + theme(
            #legend_position='top', legend_box_margin=0, legend_title=element_blank(),
            strip_text_x=element_text(margin={
                't': 6,
                'b': 6,
                'l': 1,
                'r': 5
            })) +
        scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'],
                           name='Questions'))
    p.save('2019_tacl_trick/auto_fig/round_2_json.pdf', width=7.0, height=1.7)
Exemple #5
0
    def plot_time_curve_with_threshold(self):
        toplot = self.aggregated.melt(
            id_vars='hour',
            value_vars=['number_bacteria', 'number_actin'],
            value_name='counts',
            var_name='Object')

        colors = self.create_color_list()

        myfig = (
            ggplot(toplot, aes("hour", "counts", color="Object")) +
            geom_point() + geom_line() + labels.xlab("Time [hours]") +
            labels.ylab("Average number of objects/nuclei") +
            pn.scale_colour_manual(values=colors,
                                   labels=list(self.sel_channel_time.value),
                                   name="") + pn.labs(colour="") +
            pn.scale_x_continuous(
                breaks=np.sort(self.result.hour.unique()),
                labels=list(np.sort(self.result.hour.unique()).astype(str))))

        self.time_curve_fig = myfig

        self.out_plot2.clear_output()
        with self.out_plot2:
            display(myfig)
Exemple #6
0
    def plot_time_curve_by_channel(self, b=None):
        """Callback to polot time curve of number of bacteria/nuclei for
        each selected channel. Called by plot_time_curve_button."""

        if self.aggregated is None:
            self.data_aggregation()

        if len(self.sel_channel_time.value) == 0:
            print("Select at least one channel")
        else:
            subset = self.aggregated[self.aggregated.channel.isin(
                self.sel_channel_time.value)].copy(deep=True)
            subset.loc[:, "channel"] = subset.channel.astype(
                pd.CategoricalDtype(self.sel_channel_time.value, ordered=True))

            colors = self.create_color_list()

            myfig = (
                ggplot(subset, aes("hour", "normalized", color="channel")) +
                geom_point() + geom_line() + labels.xlab("Time [hours]") +
                labels.ylab("Average number of bacteria/nuclei") +
                pn.scale_colour_manual(
                    values=colors,
                    labels=list(self.sel_channel_time.value),
                    name="") + pn.labs(colour="") + pn.scale_x_continuous(
                        breaks=np.sort(self.result.hour.unique()),
                        labels=list(
                            np.sort(self.result.hour.unique()).astype(str))))

            self.time_curve_fig = myfig

            self.out_plot2.clear_output()
            with self.out_plot2:
                display(myfig)
Exemple #7
0
def round_1_plot():
    df = pd.read_csv('2019_tacl_trick/data/round_1.csv')
    model_dtype = CategoricalDtype(['DAN', 'RNN', 'IR'], ordered=True)
    df['Model'] = df['Model'].astype(model_dtype)

    # This following is a hack so that the legend widths are the same across plots
    def rename(x):
        if x == 'Round 1 - IR Adversarial':
            return 'Round 1 - IR Adversarial    '
        else:
            return x

    df['Dataset'] = df['Dataset'].map(rename)
    p = (ggplot(df) + aes(x='x', y='y', color='Dataset') +
         facet_wrap('Model', nrow=1) + geom_point(size=1.0, shape='o') +
         scale_y_continuous(breaks=np.linspace(0, 1, 6), limits=[0, 0.6]) +
         scale_x_continuous(breaks=[0, .5, 1]) +
         xlab('Percent of Question Revealed') + ylab('Accuracy') +
         ggtitle('Round 1 Attacks and Models') +
         theme(strip_text_x=element_text(margin={
             't': 6,
             'b': 6,
             'l': 1,
             'r': 5
         })) + scale_color_manual(
             values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'],
             name='Questions'))
    p.save('2019_tacl_trick/auto_fig/round_1_csv.pdf', width=7.0, height=1.7)
Exemple #8
0
def plot_train_test(ags):
    frontiers = data.train_test(ags)
    frontiers, model = data.train_test_model(frontiers)

    labs = frontiers.sort_values('train_flops').groupby(
        'elo').first().reset_index()
    desc = f'log₁₀(test) = {model.params[1]:.1f} · log₁₀(train) + {model.params[2]:.1g} · elo + {model.params[0]:.0f}'

    return (
        pn.ggplot(
            frontiers,
            pn.aes(x='train_flops', y='test_flops', color='elo',
                   group='elo')) + pn.geom_line(size=.5, show_legend=False) +
        pn.geom_line(pn.aes(y='test_flops_hat'),
                     size=.25,
                     show_legend=False,
                     linetype='dashed')
        # + pn.geom_point(size=.5, show_legend=False)
        + pn.geom_text(pn.aes(label='elo.astype(int)'),
                       labs,
                       show_legend=False,
                       size=6,
                       nudge_y=+.2) + pn.scale_color_cmap(limits=(-1500, 0)) +
        pn.scale_x_continuous(trans='log10') +
        pn.scale_y_continuous(trans='log10') + pn.annotate(
            'text', 1.5e13, 5e9, label=desc, ha='left', size=6, family='serif')
        + pn.labs(x='Train-time compute (FLOPS-seconds)',
                  y='Test-time compute (FLOPS-seconds)') + plot.IEEE())
def scatter_plot(df,
                 xcol,
                 ycol,
                 domain,
                 xname=None,
                 yname=None,
                 log=False,
                 width=6,
                 height=6,
                 clamp=True,
                 tickCount=5):
    assert len(domain) == 2

    POINT_SIZE = 0.5
    DASH_PATTERN = (0, (3, 1))

    if xname == None:
        xname = xcol
    if yname == None:
        yname = ycol

    # formater for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        df = df.copy(deep=True)
        df.loc[df[xcol] > domain[1], xcol] = domain[1]
        df.loc[df[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True)
    scatter += p9.labs(x=xname, y=yname)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    #scatter += p9.theme_xkcd()
    scatter += p9.theme_bw()
    scatter += p9.theme(
        panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(figure_size=(width, height))

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1,
                              linetype=DASH_PATTERN)  # diagonal
    scatter += p9.geom_vline(xintercept=domain[1],
                             linetype=DASH_PATTERN)  # vertical rule
    scatter += p9.geom_hline(yintercept=domain[1],
                             linetype=DASH_PATTERN)  # horizontal rule

    res = scatter

    return res
def limits(x, y=None, xbreaks=None, ybreaks=None):
    if y is None:
        y = x

    x0, x1 = x
    y0, y1 = y

    if xbreaks is None:
        xbreaks = np.linspace(x0, x1, x1 - x0 + 1)
    if ybreaks is None:
        ybreaks = np.linspace(y0, y1, y1 - y0 + 1)

    # We want these plots to continue to the top and left.
    return [ gg.coord_cartesian(xlim=x, ylim=y),
             gg.scale_x_continuous(limits=(x0, None), breaks=xbreaks),
             gg.scale_y_continuous(limits=(y0, None), breaks=ybreaks) ]
    return [ gg.scale_x_continuous(limits=x, breaks=xbreaks),
             gg.scale_y_continuous(limits=y, breaks=ybreaks) ]
Exemple #11
0
    def __plot(
        self,
        plot_data,
        x,
        y,
        colour,
        lbl_x,
        lbl_y,
        facet,
        facet_scales,
        facet_by,
        smoothed,
        points,
        error_bars,
        save,
    ):
        cbbPalette = [
            "#000000",
            "#E69F00",
            "#56B4E9",
            "#009E73",
            "#0072B2",
            "#D55E00",
            "#CC79A7",
        ]
        plt = ggplot(data=plot_data, mapping=aes(x=x, y=y, colour=colour))
        plt += xlab(lbl_x)
        plt += ylab(lbl_y)
        # + facet_grid("site~", scales="free")
        # + geom_line()
        if facet:
            # TODO: use facet as save
            nrow, ncol = self.get_facet_rows(plot_data, facet_by)
            plt += facet_wrap(facet_by, nrow=nrow, ncol=ncol, scales=facet_scales)
        if points:
            plt += geom_point()
        if error_bars:
            # TODO use generic way to compute them
            pass
            # self.plt += geom_errorbar(aes(ymin="ACI_mean - ACI_std", ymax="ACI_mean + ACI_std"))
        # TODO: use smooth as save
        if smoothed:
            plt += geom_smooth(
                method="mavg",
                se=False,
                method_args={"window": 4, "center": True, "min_periods": 1},
            )
        else:
            plt += geom_line()
        plt += scale_colour_manual(values=cbbPalette, guide=False)
        plt += scale_x_continuous(labels=label_x)

        plt += theme(figure_size=(15, 18), dpi=150)

        if save:
            plt.save(**save)
        return plt
Exemple #12
0
def plot_histogram(df_plot,
                   variable_column,
                   output_file='plot_distribution',
                   facet_column='none',
                   x_log10=False):
    """Plot plot_distribution to png.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        DataFrame with <variable_column> as a column.
    variable_column : string
        String of variable_column column to plot.
    output_file : string
        Basename of output file.
    facet_column : string
        Column to facet the plot by.

    Returns
    -------
    NULL
    """
    df_plot['x'] = df_plot[variable_column]
    if x_log10:
        if np.any(df_plot['x'].values < 0):
            return 1
        elif np.any(df_plot['x'].values == 0):
            df_plot['x'] = np.log10(df_plot['x'].values + 1e-10)
            variable_column = variable_column + ' (log10)'
        else:
            df_plot['x'] = np.log10(df_plot['x'].values)
            variable_column = variable_column + ' (log10)'
    gplt = plt9.ggplot(df_plot, plt9.aes(x='x'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_histogram(alpha=0.8)
    gplt = gplt + plt9.scale_x_continuous(
        # trans='log10',
        # labels=comma_labels,
        minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(
        # trans='log10',
        # labels=comma_labels,
        minor_breaks=0)
    gplt = gplt + plt9.labs(title='', x=variable_column)
    gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=-45, hjust=0))
    if facet_column != 'none':
        gplt = gplt + plt9.facet_wrap('~ {}'.format(facet_column), ncol=5)
        n_facets = df_plot[facet_column].nunique()
        gplt.save('{}.png'.format(output_file),
                  dpi=300,
                  width=6 * (n_facets / 4),
                  height=4 * (n_facets / 4),
                  limitsize=False)
    else:
        gplt.save('{}.png'.format(output_file), dpi=300, width=4, height=4)
    return 0
def scatter_plot2(df1, df2, xcol, ycol, domain, color1='black', color2='red', xname=None, yname=None, log=False, width=6, height=6, clamp=True, tickCount=5):
    assert len(domain) == 2

    POINT_SIZE = 1.5
    DASH_PATTERN = (0, (6, 2))

    if xname is None:
        xname = xcol
    if yname is None:
        yname = ycol

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        df1 = df1.copy(deep=True)
        df1.loc[df1[xcol] > domain[1], xcol] = domain[1]
        df1.loc[df1[ycol] > domain[1], ycol] = domain[1]

        df2 = df2.copy(deep=True)
        df2.loc[df2[xcol] > domain[1], xcol] = domain[1]
        df2.loc[df2[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df1)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True, color=color1, alpha=0.5)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True, data=df2, color=color2, alpha=0.5)
    scatter += p9.labs(x=xname, y=yname)

    # rug plots
    scatter += p9.geom_rug(na_rm=True, sides="tr", color=color1, alpha=0.05)
    scatter += p9.geom_rug(na_rm=True, sides="tr", data=df2, color=color2, alpha=0.05)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    # scatter += p9.theme_xkcd()
    scatter += p9.theme_bw()
    scatter += p9.theme(panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(panel_grid_minor=p9.element_blank())
    scatter += p9.theme(figure_size=(width, height))
    scatter += p9.theme(text=p9.element_text(size=24, color="black"))

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN)  # diagonal
    scatter += p9.geom_vline(xintercept=domain[1], linetype=DASH_PATTERN)  # vertical rule
    scatter += p9.geom_hline(yintercept=domain[1], linetype=DASH_PATTERN)  # horizontal rule

    res = scatter

    return res
Exemple #14
0
def test_annotation_logticks_base_8():
    base = 8
    df = pd.DataFrame({'x': base**np.arange(4)})
    # The grid should align with the logticks
    p = (ggplot(df, aes('x', 'x')) + annotation_logticks(sides='b', size=.75) +
         geom_point() + scale_x_continuous(trans=log_trans(base=base)) +
         theme(panel_grid_minor=element_line(color='green'),
               panel_grid_major=element_line(color='red')))

    assert p == 'annotation_logticks_base_8'
Exemple #15
0
def plot_optimal_model_size(ags):
    from statsmodels.formula import api as smf

    results = {}
    for b, g in ags.groupby('boardsize'):
        ordered = g.sort_values('elo').copy()
        ordered['params'] = g.width**2 * g.depth

        left = np.log10(g.train_flops.min())
        right = np.log10(g.train_flops.max())
        for f in np.linspace(left, right, 11)[1:]:
            subset = ordered[ordered.train_flops <= 10**f]
            results[b, 10**f] = subset.params.iloc[-1]
    df = pd.Series(results).reset_index()
    df.columns = ['boardsize', 'approx_flops', 'params']

    model = smf.ols('np.log10(params) ~ np.log10(approx_flops) + 1', df).fit()

    left, right = np.log10(df.approx_flops.min()), np.log10(
        df.approx_flops.max())
    preds = pd.DataFrame({'approx_flops': 10**np.linspace(left, right, 21)})
    preds['params'] = 10**model.predict(preds)

    labs = df.sort_values('approx_flops').groupby(
        'boardsize').last().reset_index()
    labs['params'] = labs.apply(
        lambda r: df[df.approx_flops <= r.approx_flops].params.max(), axis=1)

    points = df.sort_values('approx_flops').groupby(
        'boardsize').last().reset_index()

    desc = f'log₁₀(params) = {model.params[1]:.2f} · log₁₀(compute) − {-model.params[0]:.1f}'

    return (
        pn.ggplot(df, pn.aes(x='approx_flops', y='params')) +
        pn.geom_line(pn.aes(color='factor(boardsize)', group='boardsize'),
                     show_legend=False) +
        pn.geom_line(data=preds, linetype='dashed', size=.25) +
        pn.geom_point(pn.aes(color='factor(boardsize)', group='boardsize'),
                      data=points,
                      size=.5,
                      show_legend=False) +
        pn.geom_text(pn.aes(
            color='factor(boardsize)', group='boardsize', label='boardsize'),
                     data=labs,
                     nudge_y=+.5,
                     show_legend=False,
                     size=6) +
        pn.annotate(
            'text',
            1e9, 2e7, label=desc, ha='left', size=6, family='serif') +
        pn.scale_x_continuous(trans='log10') +
        pn.scale_y_continuous(trans='log10') + pn.scale_color_hue(l=.4) +
        pn.labs(x='Train-time compute (FLOPS-seconds)',
                y='Optimal model size (params)') + plot.IEEE())
Exemple #16
0
def plot_convergence(pile):
    stops = range(100, int(len(pile) / 10), utils.bills_per_pound)
    dist_stats = pd.DataFrame([get_sample_dist(pile, size) for size in stops])

    return (
        pn.ggplot(dist_stats) + pn.geom_line(pn.aes(x='size', y='mean')) +
        pn.geom_line(
            pn.aes(x='size', y='lower'), color='#FF5500', linetype='dotted') +
        pn.geom_line(
            pn.aes(x='size', y='upper'), color='#FF5500', linetype='dotted') +
        pn.scale_x_continuous(breaks=stops) +
        pn.theme(axis_text_x=pn.element_text(angle=270, hjust=1)))
Exemple #17
0
def plot_fees(fees, title, y_axis, years, filename):
    p = pn.ggplot(fees, pn.aes('year', y_axis, color = 'conference', shape = 'conference')) + \
        pn.geom_point() + \
        pn.geom_line() + \
        pn.labs(title = title, x = 'Year', y = 'Fee (€)') + \
        pn.ylim(0, 1000) + \
        pn.theme_light() + \
        pn.scale_x_continuous(breaks = years) + \
        pn.scale_colour_discrete(name = 'Conference') + \
        pn.scale_shape_discrete(name = 'Conference')

    p.save(filename, width=6, height=3, dpi=300)
Exemple #18
0
    def plot_series(self, y: str,
                    series_state_df: pd.DataFrame) -> plotnine.ggplot:
        """
        """
        aes_kwargs = dict(x="round_number",
                          y=y,
                          color="factor(game_id)",
                          group="factor(game_id)")

        return (plotnine.ggplot(series_state_df) + plotnine.aes(**aes_kwargs) +
                plotnine.geom_point() + plotnine.geom_line() +
                plotnine.theme_light() +
                plotnine.scale_x_continuous(breaks=range(1, 20, 1)))
 def create(self, file_path: str) -> None:
     (ggplot(self._data, aes("loc")) +
      geom_histogram(bins=100, fill="#1e4f79") +
      facet_grid(facets="category ~ .", scales='free_y') +
      scale_x_continuous(trans=asinh_trans(), labels=asinh_labels) +
      scale_y_continuous(labels=comma_format())
      #+ scale_y_continuous(labels=lambda l: ["%.2f%%" % (v * 100 / len(self._data)) for v in l])
      + ggtitle("Class Sizes") + xlab("Lines of Code") +
      ylab("Number of Classes") +
      theme_classic(base_size=32, base_family="Helvetica") +
      theme(text=element_text(size=32), subplots_adjust={"hspace": 0.1
                                                         })).save(file_path,
                                                                  width=8,
                                                                  height=18)
 def create(self, file_path: str) -> None:
     (ggplot(self._data, aes("value")) +
      geom_histogram(bins=100, fill="#1e4f79") +
      facet_wrap(facets="variable", scales="free", ncol=3) +
      scale_x_continuous(trans=asinh_trans(), labels=asinh_labels) +
      scale_y_continuous(labels=comma_format()) +
      ggtitle("Distributions of QMOOD Quality Attributes") +
      xlab("Quality Attribute Value") + ylab("Number of Projects") +
      theme_classic(base_size=32, base_family="Helvetica") +
      theme(text=element_text(size=32),
            subplots_adjust={
                "wspace": 0.35,
                "hspace": 0.35
            })).save(file_path, width=24, height=12)
Exemple #21
0
def plot_training_curves(ags):
    df = ags[ags.test_nodes == 64].copy()
    df['g'] = df.run + df.test_nodes.astype(str)

    return (pn.ggplot(
        df,
        pn.aes(x='train_flops',
               y='400/np.log(10)*elo',
               group='g',
               color='factor(boardsize)')) + pn.geom_line() +
            pn.geom_point(size=.5) + pn.scale_x_continuous(trans='log10') +
            pn.scale_color_discrete(name='Boardsize') +
            pn.labs(x='Training FLOPS',
                    y='Elo v. perfect play',
                    title='All agents\' training curves') + plot.mpl_theme() +
            plot.poster_sizes())
def test_area_aesthetics():
    p = (ggplot(df, aes('x', 'ymax+2', group='factor(z)')) + geom_area() +
         geom_area(aes('x+width', alpha='z')) +
         geom_area(aes('x+2*width', linetype='factor(z)'),
                   color='black',
                   fill=None,
                   size=2) +
         geom_area(aes('x+3*width', color='z'), fill=None, size=2) +
         geom_area(aes('x+4*width', fill='factor(z)')) +
         geom_area(aes('x+5*width', size='z'), color='black', fill=None) +
         scale_x_continuous(
             breaks=[i * 2 * np.pi
                     for i in range(7)],
             labels=['0'] + [r'${}\pi$'.format(2 * i) for i in range(1, 7)]))

    assert p + _theme == 'area_aesthetics'
Exemple #23
0
def plot_sample_efficiency(ags):
    df = ags.query('boardsize == 9 & test_nodes == 64').copy()
    df['params'] = df.train_flops / df.samples
    return (
        pn.ggplot(
            df,
            pn.aes(x='samples',
                   y='400/np.log(10)*elo',
                   color='params',
                   group='run')) + pn.geom_line() + pn.geom_point() +
        pn.scale_x_continuous(trans='log10') +
        pn.scale_color_continuous(trans='log10', name='Params') + pn.labs(
            title=
            'Bigger networks might not be comute efficient, but they are sample efficient',
            y='Elo v. perfect play',
            x='Train FLOPS') + plot.mpl_theme() + plot.poster_sizes() +
        plot.no_colorbar_ticks())
Exemple #24
0
def plot_params(ags):
    df = ags.query('boardsize == 9 & test_nodes == 64').copy()
    df['params'] = df.train_flops / df.samples
    return (
        pn.ggplot(
            df,
            pn.aes(x='train_flops',
                   y='400/np.log(10)*elo',
                   color='params',
                   group='run')) + pn.geom_line() + pn.geom_point() +
        pn.scale_x_continuous(trans='log10') +
        pn.scale_color_continuous(trans='log10', name='Params') + pn.labs(
            title=
            'Smaller networks are more compute efficient for lower performances, but plateau earlier',
            y='Elo v. perfect play',
            x='Train FLOPS') + plot.mpl_theme() + plot.poster_sizes() +
        plot.no_colorbar_ticks())
def test_area_aesthetics():
    p = (ggplot(df, aes('x', 'ymax+2', group='factor(z)')) +
         geom_area() +
         geom_area(aes('x+width', alpha='z')) +
         geom_area(aes('x+2*width', linetype='factor(z)'),
                   color='black', fill=None, size=2) +
         geom_area(aes('x+3*width', color='z'),
                   fill=None, size=2) +
         geom_area(aes('x+4*width', fill='factor(z)')) +
         geom_area(aes('x+5*width', size='z'),
                   color='black', fill=None) +
         scale_x_continuous(
             breaks=[i*2*np.pi for i in range(7)],
             labels=['0'] + [r'${}\pi$'.format(2*i) for i in range(1, 7)])
         )

    assert p + _theme == 'area_aesthetics'
Exemple #26
0
def plot_resid_var_trends(ags):
    resid_var = data.residual_vars(ags)
    return (
        pn.ggplot(
            resid_var,
            pn.aes(x='ratio',
                   y='rv',
                   color='factor(predicted)',
                   group='predicted')) + pn.geom_line(size=2) +
        pn.geom_text(pn.aes(label='seen'), nudge_y=-.1, size=14) +
        pn.geom_point(size=4) + pn.scale_x_continuous(trans='log10') +
        pn.scale_y_continuous(trans='log10') +
        pn.scale_color_discrete(name='Predicted frontier') + pn.labs(
            x='(cost of observed frontier)/(cost of predicted frontier)',
            y='residual variance in performance',
            title=
            'Frontiers of small problems are good, cheap proxies for frontiers of expensive problems'
        ) + plot.mpl_theme() + plot.poster_sizes())
def test_inplace_add():
    p = _p = ggplot(df)

    p += aes('x', 'y')
    assert p is _p

    p += geom_point()
    assert p is _p

    p += stat_identity()
    assert p is _p

    p += scale_x_continuous()
    assert p is _p

    with pytest.warns(PlotnineWarning):
        # Warning for; replacing existing scale added above
        p += xlim(0, 10)
        assert p is _p

    p += lims(y=(0, 10))
    assert p is _p

    p += labs(x='x')
    assert p is _p

    p += coord_trans()
    assert p is _p

    p += facet_null()
    assert p is _p

    p += annotate('point', 5, 5, color='red', size=5)
    assert p is _p

    p += guides()
    assert p is _p

    p += theme_gray()
    assert p is _p

    th = _th = theme_gray()
    th += theme(aspect_ratio=1)
    assert th is _th
Exemple #28
0
def test_wrong_bases():
    # x axis not transformed
    p = (ggplot(df, aes('x', 'x'))
         + annotation_logticks(sides='b', size=.75, base=10)
         + geom_point()
         )

    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # x axis not transform, but ticks requested for a different base
    p = (ggplot(df, aes('x', 'x'))
         + annotation_logticks(sides='b', size=.75, base=10)
         + scale_x_continuous(trans=log_trans(8))
         + geom_point()
         )

    with pytest.warns(PlotnineWarning):
        p.draw_test()
Exemple #29
0
def plot_flops_frontier(ags):
    df = data.modelled_elos(ags)

    return (
        pn.ggplot(
            df,
            pn.aes(
                x='train_flops', color='factor(boardsize)', group='boardsize'))
        + pn.geom_line(pn.aes(y='400/np.log(10)*elo'), size=2) + pn.geom_line(
            pn.aes(y='400/np.log(10)*elohat'), size=1, linetype='dashed') +
        pn.labs(
            x='Training FLOPS',
            y='Elo v. perfect play',
            title=
            'Performance is a sigmoid of compute, linearly scaled by board size'
        ) + pn.scale_x_continuous(trans='log10') +
        pn.scale_color_discrete(name='Boardsize') +
        pn.coord_cartesian(None,
                           (None, 0)) + plot.mpl_theme() + plot.poster_sizes())
Exemple #30
0
def yoy_growth():
    """
    This creates figures showing the number of questions versus year in dataset
    """
    with open('data/external/datasets/qanta.mapped.2018.04.18.json') as f:
        year_pages = defaultdict(set)
        year_questions = Counter()
        for q in json.load(f)['questions']:
            if q['page'] is not None:
                year_pages[q['year']].add(q['page'])
                year_questions[q['year']] += 1
    start_year = min(year_pages)
    # 2017 is the earlier year we have a full year's worth of data, including partial 2018 isn't accurate
    end_year = min(2017, max(year_pages))
    upto_year_pages = defaultdict(set)
    upto_year_questions = Counter()
    for upto_y in range(start_year, end_year + 1):
        for curr_y in range(start_year, upto_y + 1):
            upto_year_questions[upto_y] += year_questions[curr_y]
            for page in year_pages[curr_y]:
                upto_year_pages[upto_y].add(page)
    year_page_counts = {}
    for y, pages in upto_year_pages.items():
        year_page_counts[y] = len(pages)
    year_page_counts
    year_rows = []
    for y, page_count in year_page_counts.items():
        year_rows.append({'year': y, 'value': page_count, 'Quantity': 'Distinct Answers'})
        year_rows.append({'year': y, 'Quantity': 'Total Questions', 'value': upto_year_questions[y]})
    year_df = pd.DataFrame(year_rows)
    count_cat = CategoricalDtype(categories=['Total Questions', 'Distinct Answers'], ordered=True)
    year_df['Quantity'] = year_df['Quantity'].astype(count_cat)
    eprint(year_df[year_df.Quantity == 'Total Questions'])
    p = (
        ggplot(year_df)
        + aes(x='year', y='value', color='Quantity')
        + geom_line() + geom_point()
        + xlab('Year')
        + ylab('Count up to Year (inclusive)')
        + theme_fs()
        + scale_x_continuous(breaks=list(range(start_year, end_year + 1, 2)))
    )
    p.save(path.join(output_path, 'question_answer_counts.pdf'))
Exemple #31
0
def plot_test(ags):
    df = ags.query('boardsize == 9').groupby('run').apply(
        lambda df: df[df.idx == df.idx.max()]).copy()
    df['test_flops'] = df.test_nodes * (df.train_flops / df.samples)

    subset = df.query('test_nodes == 64').sort_values('test_flops')
    selection = [
        subset.loc[ELO * subset.elo > e].iloc[0].run
        for e in np.linspace(-2000, -500, 4)
    ]

    df = df[df.run.isin(selection)].copy()

    df['params'] = df.width**2 * df.depth
    df['arch'] = df.apply(lambda r: '{depth}×{width}'.format(**r), axis=1)
    labels = df.sort_values('test_flops').reset_index(
        drop=True).groupby('run').first().reset_index()
    return (pn.ggplot(
        df, pn.aes(x='test_flops', y='ELO*elo', color='params', group='run')) +
            pn.geom_point(size=.25, show_legend=False) +
            pn.geom_line(size=.5, show_legend=False) +
            pn.geom_text(pn.aes(label='test_nodes'),
                         nudge_y=-50,
                         show_legend=False,
                         size=4,
                         va='top') + pn.geom_text(pn.aes(label='test_nodes'),
                                                  nudge_y=-50,
                                                  show_legend=False,
                                                  size=4,
                                                  va='top') +
            pn.geom_text(pn.aes(label='arch'),
                         data=labels,
                         show_legend=False,
                         size=6,
                         nudge_x=-.1,
                         ha='right') + pn.scale_x_continuous(trans='log10') +
            pn.scale_color_cmap('plasma',
                                trans='log10',
                                limits=(df.params.min(), 10 * df.params.max()))
            + pn.coord_cartesian(
                (3.5, None)) + pn.labs(x='Test-time compute (FLOPS-seconds)',
                                       y='Elo v. perfect play') + plot.IEEE())
Exemple #32
0
    def create_state_plot(
            self,
            x: str,
            y: str,
            data: pd.DataFrame,
            aes_kwargs: Tuple[str, str] = None) -> plotnine.ggplot:
        """
        """
        aes_kwargs = ({} if aes_kwargs is None else aes_kwargs)

        x_breaks = data[x]

        max_y = max(data[y])
        y_interval = int(max_y / 20) + 1
        y_breaks = range(0, max_y + 1, y_interval)

        return (plotnine.ggplot(data) + plotnine.aes(x=x, y=y, **aes_kwargs) +
                plotnine.geom_line() + plotnine.geom_point() +
                plotnine.theme_light() +
                plotnine.scale_x_continuous(breaks=x_breaks) +
                plotnine.scale_y_continuous(breaks=y_breaks))
def quick_color_check(target_matrix, source_matrix, num_chips):
    """ Quickly plot target matrix values against source matrix values to determine
    over saturated color chips or other issues.

    Inputs:
    source_matrix      = a 22x4 matrix containing the average red value, average green value, and
                             average blue value for each color chip of the source image
    target_matrix      = a 22x4 matrix containing the average red value, average green value, and
                             average blue value for each color chip of the target image
    num_chips          = number of color card chips included in the matrices (integer)

    :param source_matrix: numpy.ndarray
    :param target_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, facet_grid, geom_label, scale_x_continuous, \
        scale_y_continuous, scale_color_manual, aes
    import pandas as pd

    # Extract and organize matrix info
    tr = target_matrix[:num_chips, 1:2]
    tg = target_matrix[:num_chips, 2:3]
    tb = target_matrix[:num_chips, 3:4]
    sr = source_matrix[:num_chips, 1:2]
    sg = source_matrix[:num_chips, 2:3]
    sb = source_matrix[:num_chips, 3:4]

    # Create columns of color labels
    red = []
    blue = []
    green = []
    for i in range(num_chips):
        red.append('red')
        blue.append('blue')
        green.append('green')

    # Make a column of chip numbers
    chip = np.arange(0, num_chips).reshape((num_chips, 1))
    chips = np.row_stack((chip, chip, chip))

    # Combine info
    color_data_r = np.column_stack((sr, tr, red))
    color_data_g = np.column_stack((sg, tg, green))
    color_data_b = np.column_stack((sb, tb, blue))
    all_color_data = np.row_stack((color_data_b, color_data_g, color_data_r))

    # Create a dataframe with headers
    dataset = pd.DataFrame({'source': all_color_data[:, 0], 'target': all_color_data[:, 1],
                            'color': all_color_data[:, 2]})

    # Add chip numbers to the dataframe
    dataset['chip'] = chips
    dataset = dataset.astype({'color': str, 'chip': str, 'target': float, 'source': float})

    # Make the plot
    p1 = ggplot(dataset, aes(x='target', y='source', color='color', label='chip')) + \
        geom_point(show_legend=False, size=2) + \
        geom_smooth(method='lm', size=.5, show_legend=False) + \
        theme_seaborn() + facet_grid('.~color') + \
        geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False) + \
        scale_x_continuous(limits=(-5, 270)) + scale_y_continuous(limits=(-5, 275)) + \
        scale_color_manual(values=['blue', 'green', 'red'])

    # Reset debug
    if params.debug is not None:
        if params.debug == 'print':
            p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'))
        elif params.debug == 'plot':
            print(p1)
def analyze_nir_intensity(gray_img, mask, bins=256, histplot=False):
    """This function calculates the intensity of each pixel associated with the plant and writes the values out to
       a file. It can also print out a histogram plot of pixel intensity and a pseudocolor image of the plant.

    Inputs:
    gray_img     = 8- or 16-bit grayscale image data
    mask         = Binary mask made from selected contours
    bins         = number of classes to divide spectrum into
    histplot     = if True plots histogram of intensity values

    Returns:
    analysis_images = NIR histogram image

    :param gray_img: numpy array
    :param mask: numpy array
    :param bins: int
    :param histplot: bool
    :return analysis_images: list
    """

    params.device += 1

    # apply plant shaped mask to image
    mask1 = binary_threshold(mask, 0, 255, 'light')
    mask1 = (mask1 / 255)
    masked = np.multiply(gray_img, mask1)

    # calculate histogram
    if gray_img.dtype == 'uint16':
        maxval = 65536
    else:
        maxval = 256

    # Make a pseudo-RGB image
    rgbimg = cv2.cvtColor(gray_img, cv2.COLOR_GRAY2BGR)

    hist_nir, hist_bins = np.histogram(masked, bins, (1, maxval))

    hist_bins1 = hist_bins[:-1]
    hist_bins2 = [float(round(l, 2)) for l in hist_bins1]

    hist_nir1 = [float(l) for l in hist_nir]

    # make hist percentage for plotting
    pixels = cv2.countNonZero(mask1)
    hist_percent = (hist_nir / float(pixels)) * 100

    # No longer returning a pseudocolored image
    # make mask to select the background
    # mask_inv = cv2.bitwise_not(mask)
    # img_back = cv2.bitwise_and(rgbimg, rgbimg, mask=mask_inv)
    # img_back1 = cv2.applyColorMap(img_back, colormap=1)

    # mask the background and color the plant with color scheme 'jet'
    # cplant = cv2.applyColorMap(rgbimg, colormap=2)
    # masked1 = apply_mask(cplant, mask, 'black')
    masked1 = cv2.bitwise_and(rgbimg, rgbimg, mask=mask)
    # cplant_back = cv2.add(masked1, img_back1)
    if params.debug is not None:
        if params.debug == "print":
            print_image(masked1, os.path.join(params.debug_outdir, str(params.device) + "_masked_nir_plant.jpg"))
        if params.debug == "plot":
            plot_image(masked1)

    analysis_images = []

    if histplot is True:
        hist_x = hist_percent
        bin_labels = np.arange(0, bins)
        dataset = pd.DataFrame({'Grayscale pixel intensity': bin_labels,
                                'Proportion of pixels (%)': hist_x})
        fig_hist = (ggplot(data=dataset,
                           mapping=aes(x='Grayscale pixel intensity',
                                       y='Proportion of pixels (%)'))
                    + geom_line(color='red')
                    + scale_x_continuous(breaks=list(range(0, bins, 25))))

        analysis_images.append(fig_hist)
        if params.debug == "print":
            fig_hist.save(os.path.join(params.debug_outdir, str(params.device) + '_nir_hist.png'))
        elif params.debug == "plot":
            print(fig_hist)

    outputs.add_observation(variable='nir_frequencies', trait='near-infrared frequencies',
                            method='plantcv.plantcv.analyze_nir_intensity', scale='frequency', datatype=list,
                            value=hist_nir1, label=hist_bins2)

    # Store images
    outputs.images.append(analysis_images)

    return analysis_images
def analyze_color(rgb_img, mask, hist_plot_type=None):
    """Analyze the color properties of an image object
    Inputs:
    rgb_img          = RGB image data
    mask             = Binary mask made from selected contours
    hist_plot_type   = 'None', 'all', 'rgb','lab' or 'hsv'
    
    Returns:
    analysis_image   = histogram output
    
    :param rgb_img: numpy.ndarray
    :param mask: numpy.ndarray
    :param hist_plot_type: str
    :return analysis_images: list
    """

    params.device += 1

    if len(np.shape(rgb_img)) < 3:
        fatal_error("rgb_img must be an RGB image")

    # Mask the input image
    masked = cv2.bitwise_and(rgb_img, rgb_img, mask=mask)
    # Extract the blue, green, and red channels
    b, g, r = cv2.split(masked)
    # Convert the BGR image to LAB
    lab = cv2.cvtColor(masked, cv2.COLOR_BGR2LAB)
    # Extract the lightness, green-magenta, and blue-yellow channels
    l, m, y = cv2.split(lab)
    # Convert the BGR image to HSV
    hsv = cv2.cvtColor(masked, cv2.COLOR_BGR2HSV)
    # Extract the hue, saturation, and value channels
    h, s, v = cv2.split(hsv)

    # Color channel dictionary
    channels = {"b": b, "g": g, "r": r, "l": l, "m": m, "y": y, "h": h, "s": s, "v": v}

    # Histogram plot types
    hist_types = {"ALL": ("b", "g", "r", "l", "m", "y", "h", "s", "v"),
                  "RGB": ("b", "g", "r"),
                  "LAB": ("l", "m", "y"),
                  "HSV": ("h", "s", "v")}

    if hist_plot_type is not None and hist_plot_type.upper() not in hist_types:
        fatal_error("The histogram plot type was " + str(hist_plot_type) +
                    ', but can only be one of the following: None, "all", "rgb", "lab", or "hsv"!')
    # Store histograms, plotting colors, and plotting labels
    histograms = {
        "b": {"label": "blue", "graph_color": "blue",
              "hist": [float(l[0]) for l in cv2.calcHist([channels["b"]], [0], mask, [256], [0, 255])]},
        "g": {"label": "green", "graph_color": "forestgreen",
              "hist": [float(l[0]) for l in cv2.calcHist([channels["g"]], [0], mask, [256], [0, 255])]},
        "r": {"label": "red", "graph_color": "red",
              "hist": [float(l[0]) for l in cv2.calcHist([channels["r"]], [0], mask, [256], [0, 255])]},
        "l": {"label": "lightness", "graph_color": "dimgray",
              "hist": [float(l[0]) for l in cv2.calcHist([channels["l"]], [0], mask, [256], [0, 255])]},
        "m": {"label": "green-magenta", "graph_color": "magenta",
              "hist": [float(l[0]) for l in cv2.calcHist([channels["m"]], [0], mask, [256], [0, 255])]},
        "y": {"label": "blue-yellow", "graph_color": "yellow",
              "hist": [float(l[0]) for l in cv2.calcHist([channels["y"]], [0], mask, [256], [0, 255])]},
        "h": {"label": "hue", "graph_color": "blueviolet",
              "hist": [float(l[0]) for l in cv2.calcHist([channels["h"]], [0], mask, [256], [0, 255])]},
        "s": {"label": "saturation", "graph_color": "cyan",
              "hist": [float(l[0]) for l in cv2.calcHist([channels["s"]], [0], mask, [256], [0, 255])]},
        "v": {"label": "value", "graph_color": "orange",
              "hist": [float(l[0]) for l in cv2.calcHist([channels["v"]], [0], mask, [256], [0, 255])]}
    }

    # Create list of bin labels for 8-bit data
    binval = np.arange(0, 256)
    bin_values = [l for l in binval]

    analysis_images = []
    # Create a dataframe of bin labels and histogram data
    dataset = pd.DataFrame({'bins': binval, 'blue': histograms["b"]["hist"],
                            'green': histograms["g"]["hist"], 'red': histograms["r"]["hist"],
                            'lightness': histograms["l"]["hist"], 'green-magenta': histograms["m"]["hist"],
                            'blue-yellow': histograms["y"]["hist"], 'hue': histograms["h"]["hist"],
                            'saturation': histograms["s"]["hist"], 'value': histograms["v"]["hist"]})

    # Make the histogram figure using plotnine
    if hist_plot_type is not None:
        if hist_plot_type.upper() == 'RGB':
            df_rgb = pd.melt(dataset, id_vars=['bins'], value_vars=['blue', 'green', 'red'],
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_rgb, aes(x='bins', y='Pixels', color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(['blue', 'green', 'red'])
                        )
            analysis_images.append(hist_fig)

        elif hist_plot_type.upper() == 'LAB':
            df_lab = pd.melt(dataset, id_vars=['bins'],
                             value_vars=['lightness', 'green-magenta', 'blue-yellow'],
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_lab, aes(x='bins', y='Pixels', color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(['yellow', 'magenta', 'dimgray'])
                        )
            analysis_images.append(hist_fig)

        elif hist_plot_type.upper() == 'HSV':
            df_hsv = pd.melt(dataset, id_vars=['bins'],
                             value_vars=['hue', 'saturation', 'value'],
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_hsv, aes(x='bins', y='Pixels', color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(['blueviolet', 'cyan', 'orange'])
                        )
            analysis_images.append(hist_fig)

        elif hist_plot_type.upper() == 'ALL':
            s = pd.Series(['blue', 'green', 'red', 'lightness', 'green-magenta',
                           'blue-yellow', 'hue', 'saturation', 'value'], dtype="category")
            color_channels = ['blue', 'yellow', 'green', 'magenta', 'blueviolet',
                              'dimgray', 'red', 'cyan', 'orange']
            df_all = pd.melt(dataset, id_vars=['bins'], value_vars=s, var_name='Color Channel',
                             value_name='Pixels')
            hist_fig = (ggplot(df_all, aes(x='bins', y='Pixels', color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(color_channels)
                        )
            analysis_images.append(hist_fig)

    # Hue values of zero are red but are also the value for pixels where hue is undefined
    # The hue value of a pixel will be undefined when the color values are saturated
    # Therefore, hue values of zero are excluded from the calculations below

    # Calculate the median hue value
    # The median is rescaled from the encoded 0-179 range to the 0-359 degree range
    hue_median = np.median(h[np.where(h > 0)]) * 2

    # Calculate the circular mean and standard deviation of the encoded hue values
    # The mean and standard-deviation are rescaled from the encoded 0-179 range to the 0-359 degree range
    hue_circular_mean = stats.circmean(h[np.where(h > 0)], high=179, low=0) * 2
    hue_circular_std = stats.circstd(h[np.where(h > 0)], high=179, low=0) * 2

    # Store into lists instead for pipeline and print_results
    # stats_dict = {'mean': circular_mean, 'std' : circular_std, 'median': median}

    # Plot or print the histogram
    if hist_plot_type is not None:
        if params.debug == 'print':
            hist_fig.save(os.path.join(params.debug_outdir, str(params.device) + '_analyze_color_hist.png'))
        elif params.debug == 'plot':
            print(hist_fig)

    # Store into global measurements
    # RGB signal values are in an unsigned 8-bit scale of 0-255
    rgb_values = [i for i in range(0, 256)]
    # Hue values are in a 0-359 degree scale, every 2 degrees at the midpoint of the interval
    hue_values = [i * 2 + 1 for i in range(0, 180)]
    # Percentage values on a 0-100 scale (lightness, saturation, and value)
    percent_values = [round((i / 255) * 100, 2) for i in range(0, 256)]
    # Diverging values on a -128 to 127 scale (green-magenta and blue-yellow)
    diverging_values = [i for i in range(-128, 128)]
    # outputs.measurements['color_data'] = {
    #     'histograms': {
    #         'blue': {'signal_values': rgb_values, 'frequency': histograms["b"]["hist"]},
    #         'green': {'signal_values': rgb_values, 'frequency': histograms["g"]["hist"]},
    #         'red': {'signal_values': rgb_values, 'frequency': histograms["r"]["hist"]},
    #         'lightness': {'signal_values': percent_values, 'frequency': histograms["l"]["hist"]},
    #         'green-magenta': {'signal_values': diverging_values, 'frequency': histograms["m"]["hist"]},
    #         'blue-yellow': {'signal_values': diverging_values, 'frequency': histograms["y"]["hist"]},
    #         'hue': {'signal_values': hue_values, 'frequency': histograms["h"]["hist"]},
    #         'saturation': {'signal_values': percent_values, 'frequency': histograms["s"]["hist"]},
    #         'value': {'signal_values': percent_values, 'frequency': histograms["v"]["hist"]}
    #     },
    #     'color_features': {
    #         'hue_circular_mean': hue_circular_mean,
    #         'hue_circular_std': hue_circular_std,
    #         'hue_median': hue_median
    #     }
    # }
    outputs.add_observation(variable='blue_frequencies', trait='blue frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["b"]["hist"], label=rgb_values)
    outputs.add_observation(variable='green_frequencies', trait='green frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["g"]["hist"], label=rgb_values)
    outputs.add_observation(variable='red_frequencies', trait='red frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["r"]["hist"], label=rgb_values)
    outputs.add_observation(variable='lightness_frequencies', trait='lightness frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["l"]["hist"], label=percent_values)
    outputs.add_observation(variable='green-magenta_frequencies', trait='green-magenta frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["m"]["hist"], label=diverging_values)
    outputs.add_observation(variable='blue-yellow_frequencies', trait='blue-yellow frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["y"]["hist"], label=diverging_values)
    outputs.add_observation(variable='hue_frequencies', trait='hue frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["h"]["hist"], label=hue_values)
    outputs.add_observation(variable='saturation_frequencies', trait='saturation frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["s"]["hist"], label=percent_values)
    outputs.add_observation(variable='value_frequencies', trait='value frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["v"]["hist"], label=percent_values)
    outputs.add_observation(variable='hue_circular_mean', trait='hue circular mean',
                            method='plantcv.plantcv.analyze_color', scale='degrees', datatype=float,
                            value=hue_circular_mean, label='degrees')
    outputs.add_observation(variable='hue_circular_std', trait='hue circular standard deviation',
                            method='plantcv.plantcv.analyze_color', scale='degrees', datatype=float,
                            value=hue_median, label='degrees')
    outputs.add_observation(variable='hue_median', trait='hue median',
                            method='plantcv.plantcv.analyze_color', scale='degrees', datatype=float,
                            value=hue_median, label='degrees')

    # Store images
    outputs.images.append(analysis_images)

    return analysis_images
Exemple #36
0
    def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False, columns=False):
        if self.y_max is not None:
            limits = [0, float(self.y_max)]
            eprint(f'Setting limits to: {limits}')
        else:
            limits = [0, 1]
        if expo:
            if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
                with open('data/external/all_human_gameplay.json') as f:
                    all_gameplay = json.load(f)
                    frames = []
                    for event, name in [('parents', 'Intermediate'), ('maryland', 'Expert'), ('live', 'National')]:
                        if self.merge_humans:
                            name = 'Human'
                        gameplay = all_gameplay[event]
                        if event != 'live':
                            control_correct_positions = gameplay['control_correct_positions']
                            control_wrong_positions = gameplay['control_wrong_positions']
                            control_positions = control_correct_positions + control_wrong_positions
                            control_positions = np.array(control_positions)
                            control_result = np.array(len(control_correct_positions) * [1] + len(control_wrong_positions) * [0])
                            argsort_control = np.argsort(control_positions)
                            control_x = control_positions[argsort_control]
                            control_sorted_result = control_result[argsort_control]
                            control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0]
                            control_df = pd.DataFrame({'correct': control_y, 'char_percent': control_x})
                            control_df['Dataset'] = 'Regular Test'
                            control_df['Guessing_Model'] = f' {name}'
                            frames.append(control_df)

                        adv_correct_positions = gameplay['adv_correct_positions']
                        adv_wrong_positions = gameplay['adv_wrong_positions']
                        adv_positions = adv_correct_positions + adv_wrong_positions
                        adv_positions = np.array(adv_positions)
                        adv_result = np.array(len(adv_correct_positions) * [1] + len(adv_wrong_positions) * [0])
                        argsort_adv = np.argsort(adv_positions)
                        adv_x = adv_positions[argsort_adv]
                        adv_sorted_result = adv_result[argsort_adv]
                        adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                        adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x})
                        adv_df['Dataset'] = 'IR Adversarial'
                        adv_df['Guessing_Model'] = f' {name}'
                        frames.append(adv_df)

                        if len(gameplay['advneural_correct_positions']) > 0:
                            adv_correct_positions = gameplay['advneural_correct_positions']
                            adv_wrong_positions = gameplay['advneural_wrong_positions']
                            adv_positions = adv_correct_positions + adv_wrong_positions
                            adv_positions = np.array(adv_positions)
                            adv_result = np.array(len(adv_correct_positions) * [1] + len(adv_wrong_positions) * [0])
                            argsort_adv = np.argsort(adv_positions)
                            adv_x = adv_positions[argsort_adv]
                            adv_sorted_result = adv_result[argsort_adv]
                            adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                            adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x})
                            adv_df['Dataset'] = 'RNN Adversarial'
                            adv_df['Guessing_Model'] = f' {name}'
                            frames.append(adv_df)

                    human_df = pd.concat(frames)
                    human_vals = sort_humans(list(human_df['Guessing_Model'].unique()))
                    human_dtype = CategoricalDtype(human_vals, ordered=True)
                    human_df['Guessing_Model'] = human_df['Guessing_Model'].astype(human_dtype)
                    dataset_dtype = CategoricalDtype(['Regular Test', 'IR Adversarial', 'RNN Adversarial'], ordered=True)
                    human_df['Dataset'] = human_df['Dataset'].astype(dataset_dtype)

            if no_models:
                p = ggplot(human_df) + geom_point(shape='.')
            else:
                df = self.char_plot_df
                if 1 not in self.rounds:
                    df = df[df['Dataset'] != 'Round 1 - IR Adversarial']
                if 2 not in self.rounds:
                    df = df[df['Dataset'] != 'Round 2 - IR Adversarial']
                    df = df[df['Dataset'] != 'Round 2 - RNN Adversarial']
                p = ggplot(df)
                if self.save_df is not None:
                    eprint(f'Saving df to: {self.save_df}')
                    df.to_json(self.save_df)

                if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
                    eprint('Loading human data')
                    p = p + geom_line(data=human_df)

            if columns:
                facet_conf = facet_wrap('Guessing_Model', ncol=1)
            else:
                facet_conf = facet_wrap('Guessing_Model', nrow=1)

            if not no_models:
                if self.mvg_avg_char:
                    chart = stat_smooth(method='mavg', se=False, method_args={'window': 400})
                else:
                    chart = stat_summary_bin(fun_data=mean_no_se, bins=20, shape='.', linetype='None', size=0.5)
            else:
                chart = None

            p = (
                p + facet_conf
                + aes(x='char_percent', y='correct', color='Dataset')
            )
            if chart is not None:
                p += chart
            p = (
                p
                + scale_y_continuous(breaks=np.linspace(0, 1, 6))
                + scale_x_continuous(breaks=[0, .5, 1])
                + coord_cartesian(ylim=limits)
                + xlab('Percent of Question Revealed')
                + ylab('Accuracy')
                + theme(
                    #legend_position='top', legend_box_margin=0, legend_title=element_blank(),
                    strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5})
                )
                + scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'], name='Questions')
            )
            if self.title != '':
                p += ggtitle(self.title)

            return p
        else:
            if self.save_df is not None:
                eprint(f'Saving df to: {self.save_df}')
                df.to_json(self.save_df)
            return (
                ggplot(self.char_plot_df)
                + aes(x='char_percent', y='correct', color='Guessing_Model')
                + stat_smooth(method='mavg', se=False, method_args={'window': 500})
                + scale_y_continuous(breaks=np.linspace(0, 1, 6))
                + coord_cartesian(ylim=limits)
            )
Exemple #37
0
def syntactic_diversity_plots():
    with open('data/external/syntactic_diversity_table.json') as f:
        rows = json.load(f)
    parse_df = pd.DataFrame(rows)
    parse_df['parse_ratio'] = parse_df['unique_parses'] / parse_df['parses']
    melt_df = pd.melt(
        parse_df,
        id_vars=['dataset', 'depth', 'overlap', 'parses'],
        value_vars=['parse_ratio', 'unique_parses'],
        var_name='metric',
        value_name='y'
    )

    def label_facet(name):
        if name == 'parse_ratio':
            return 'Average Unique Parses per Instance'
        elif name == 'unique_parses':
            return 'Count of Unique Parses'

    def label_y(ys):
        formatted_ys = []
        for y in ys:
            y = str(y)
            if y.endswith('000.0'):
                formatted_ys.append(y[:-5] + 'K')
            else:
                formatted_ys.append(y)
        return formatted_ys
    p = (
    ggplot(melt_df)
        + aes(x='depth', y='y', color='dataset')
        + facet_wrap('metric', scales='free_y', nrow=2, labeller=label_facet)
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth') + ylab('')
        + scale_color_discrete(name='Dataset')
        + scale_y_continuous(labels=label_y)
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + theme_fs()
    )
    p.save(path.join(output_path, 'syn_div_plot.pdf'))
    p = (
    ggplot(parse_df)
        + aes(x='depth', y='unique_parses', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Count of Unique Parses')
        + scale_color_discrete(name='Dataset')
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + theme_fs()
    )
    p.save(path.join(output_path, 'n_unique_parses.pdf'))
    p = (
        ggplot(parse_df)
        + aes(x='depth', y='parse_ratio', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Average Unique Parses per Instance')
        + scale_color_discrete(name='Dataset')
        + scale_x_continuous(breaks=list(range(1, 11)), minor_breaks=list(range(1, 11)), limits=[1, 10])
        + scale_y_continuous(limits=[0, 1])
        + theme_fs()
    )
    p.save(path.join(output_path, 'parse_ratio.pdf'))