Example #1
0
def test_jitterdodge():
    """Compare jitter-dodged points against fixed black reference points."""
    letters = list(string.ascii_lowercase[:n])
    df = pd.DataFrame({
        'x': np.ones(n*2),
        'y': np.repeat(np.arange(n), 2),
        'letters': np.repeat(letters, 2)})
    position = position_jitterdodge(random_state=random_state)

    # First layer stays in place; the second one is moved by the position
    p = ggplot(df, aes('x', 'y', fill='letters'))
    p = p + geom_point(size=10, fill='black')
    p = p + geom_point(size=10, position=position)
    assert p + _theme == 'jitterdodge'
Example #2
0
def test_position_from_geom():
    """Every accepted way of naming a position resolves to position_jitter."""
    specifications = (
        'jitter',              # short name
        'position_jitter',     # full name
        position_jitter(),     # instance
        position_jitter,       # class
    )
    for spec in specifications:
        geom = geom_point(position=spec)
        assert isinstance(position.from_geom(geom), position_jitter)
Example #3
0
def test_no_fill():
    """Draw points with fill set to None, 'none', '' and 'gray'."""
    df = pd.DataFrame({'x': range(5), 'y': range(5)})
    outline = {'size': 5, 'stroke': 1.5}

    p = ggplot(df, aes('x', 'y'))
    p = p + geom_point(color='red', fill=None, **outline)
    p = p + geom_point(aes(y='y+1'), color='blue', fill='none', **outline)
    p = p + geom_point(aes(y='y+2'), color='green', fill='', **outline)
    p = p + geom_point(aes(y='y+3'), color='yellow', fill='gray', **outline)

    assert p == 'no_fill'
Example #4
0
def test_aesthetics():
    """One geom_point layer per aesthetic, each placed at its own constant x."""
    # Column 'a' varies; columns 'b'..'i' hold the constants 2..9
    columns = {'a': range(5)}
    columns.update(zip('bcdefghi', range(2, 10)))
    df = pd.DataFrame(columns)

    big_no_legend = {'size': 10, 'show_legend': False}

    p = ggplot(df, aes(y='a'))
    additions = [
        geom_point(aes(x='b')),
        geom_point(aes(x='c', size='a')),
        geom_point(aes(x='d', alpha='a'), **big_no_legend),
        geom_point(aes(x='e', shape='factor(a)'), **big_no_legend),
        geom_point(aes(x='f', color='factor(a)'), **big_no_legend),
        geom_point(aes(x='g', fill='a'), stroke=0, **big_no_legend),
        geom_point(aes(x='h', stroke='a'), fill='white',
                   color='green', size=10),
        geom_point(aes(x='i', shape='factor(a)'),
                   fill='brown', stroke=2, **big_no_legend),
        theme(subplots_adjust={'right': 0.85}),
    ]
    for addition in additions:
        p = p + addition

    assert p == 'aesthetics'
Example #5
0
def test_bool_mapping():
    """Plot points whose y aesthetic is mapped to a boolean column."""
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [True, False, False]})
    base = ggplot(df, aes('x', 'y'))
    p = base + geom_point()
    assert p == 'bool_mapping'
Example #6
0
def test_continuous_x():
    """Loess smooth fitted to the data with three rows trimmed off each end."""
    n = len(df_continuous_x)
    trimmed = df_continuous_x[3:n-3]

    p = ggplot(df_continuous_x, aes('x', 'y'))
    p = p + geom_point()
    p = p + geom_smooth(trimmed, method='loess',
                        color='blue', fullrange=False)
    assert p == 'continuous_x'
Example #7
0
def test_legend_fill_ratio():
    """Points plus a linear smooth, coloured by the expression 'x<0.5'."""
    p = ggplot(df_linear, aes('x', color='x<0.5'))
    p = p + geom_point(aes(y='y_noisy'))
    p = p + geom_smooth(aes(y='y_noisy'), method='lm', size=0.5, span=.3)

    assert p == 'legend_fill_ratio'
Example #8
0
def test_expand_limits():
    """Apply expand_limits(y=(0, None)) to data that spans 5..10."""
    values = range(5, 11)
    df = pd.DataFrame({'x': values, 'y': values})

    p = ggplot(aes('x', 'y'), data=df)
    p = p + geom_point()
    p = p + expand_limits(y=(0, None))
    assert p == 'expand_limits'
Example #9
0
def test_hull():
    """Points with a stat_hull layer, coloured by factor(cyl)."""
    p = ggplot(mtcars)
    p = p + aes('wt', 'mpg', color='factor(cyl)')
    p = p + geom_point()
    p = p + stat_hull(size=1)

    assert p + _theme == 'hull'
Example #10
0
def test_aes_inheritance():
    """geom_abline takes slope and intercept from the plot-level mapping."""
    # A default line (intercept = 0, slope = 1)
    plot_mapping = aes('x', 'y', color='factor(z)',
                       slope='slope', intercept='intercept')
    p = ggplot(df, plot_mapping)
    p = p + geom_point(size=10, show_legend=False)
    p = p + geom_abline(size=2)

    assert p == 'aes_inheritance'
Example #11
0
def test_non_linear_smooth_no_ci():
    """Loess smooth over noisy points, drawn with se=False."""
    noisy_y = aes(y='y_noisy')
    p = ggplot(df_linear, aes('x'))
    p = p + geom_point(noisy_y)
    p = p + geom_smooth(aes(y='y_noisy'), method='loess', span=.3,
                        color='blue', se=False)

    assert p == 'non_linear_smooth_no_ci'
Example #12
0
def test_linear_smooth():
    """Linear-model ('lm') smooth over noisy points."""
    p = ggplot(df_linear, aes('x'))
    p = p + geom_point(aes(y='y_noisy'))
    p = p + geom_smooth(aes(y='y_noisy'), method='lm', span=.3,
                        color='blue')

    assert p == 'linear_smooth'
Example #13
0
def plot():
    """Save per-user protobowl figures as PDFs under ``output/protobowl/``.

    Writes one scatter plot of accuracy against average buzzing position
    and three density histograms (log record count, accuracy, buzzing
    position). Progress is reported with ``print``.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df = load_protobowl()
    # Only a literal True counts as a correct result
    df.result = df.result.apply(lambda x: x is True)
    df['log_n_records'] = df.user_n_records.apply(np.log)

    user_stat = df.groupby('uid').agg(np.mean)
    print('{} users'.format(len(user_stat)))
    print('{} records'.format(len(df)))

    # Alpha is the log record count scaled by its maximum
    max_color = user_stat.log_n_records.max()
    user_stat['alpha'] = pd.Series(
        user_stat.log_n_records.apply(lambda x: x / max_color),
        index=user_stat.index)

    # 2D user plot
    point_mapping = aes(x='relative_position', y='result',
                        size='user_n_records', color='log_n_records',
                        alpha='alpha')
    hidden_legends = {'color': False, 'alpha': False, 'size': False}
    p0 = (ggplot(user_stat)
          + geom_point(point_mapping, show_legend=hidden_legends)
          + scale_color_gradient(high='#e31a1c', low='#ffffcc')
          + labs(x='Average buzzing position', y='Accuracy')
          + theme(aspect_ratio=1))
    p0.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    # Density histograms: (column, outline, fill, x label, file, message)
    histograms = (
        ('log_n_records', '#e6550d', '#fee6ce',
         'Log number of records', 'protobowl_hist.pdf', 'p1 done'),
        ('result', '#31a354', '#e5f5e0',
         'Accuracy', 'protobowl_acc.pdf', 'p2 done'),
        ('relative_position', '#3182bd', '#deebf7',
         'Average buzzing position', 'protobowl_pos.pdf', 'p3 done'),
    )
    for column, outline, fill, x_label, filename, message in histograms:
        hist = (ggplot(user_stat, aes(x=column, y='..density..'))
                + geom_histogram(color=outline, fill=fill)
                + geom_density()
                + labs(x=x_label, y='Density')
                + theme(aspect_ratio=0.3))
        hist.save(os.path.join(outdir, filename))
        print(message)
Example #14
0
def test_scale_without_a_mapping():
    """A colour scale with no colour mapping should emit a UserWarning."""
    df = pd.DataFrame({'x': [1, 2, 3]})
    p = ggplot(df, aes('x', 'x'))
    p = p + geom_point()
    p = p + scale_color.scale_color_continuous()
    with pytest.warns(UserWarning):
        p.draw_test()
Example #15
0
def test_ellipse():
    """Overlay 't', 'norm' and 'euclid' ellipses on the points."""
    p = ggplot(df, aes('x', 'y'))
    p = p + geom_point()
    p = p + stat_ellipse(type='t')
    p = p + stat_ellipse(type='norm', color='red')
    p = p + stat_ellipse(type='euclid', color='blue')

    assert p == 'ellipse'
Example #16
0
def test_points():
    """Points from the density_2d stat, filled and sized by calc(density)."""
    density_mapping = aes(fill='calc(density)', size='calc(density)')
    density_points = geom_point(
        density_mapping,
        stat='density_2d',
        stroke=0, n=16, contour=False)

    p = p0 + density_points
    p = p + scale_size_radius(range=(0, 6))

    assert p == 'points'
Example #17
0
def test_lines():
    """Quantile regression lines drawn over semi-transparent points."""
    p = ggplot(df, aes(x='x', y='y'))
    p = p + geom_point(alpha=.5)
    p = p + geom_quantile(quantiles=[.001, .5, .999], formula='y~x',
                          size=2)

    # The .001 and .999 lines should bound the points from below and
    # above; the .5 line should run approximately through the middle.
    assert p == 'lines'
Example #18
0
    def test_addition(self):
        """Layers added in several ways all yield the expected colours."""
        base = ggplot(df, aes('x', 'y'))

        # One geom at a time
        one_by_one = base + self.lyrs[0] + self.lyrs[1] + self.lyrs[2]
        assert _get_colors(one_by_one) == colors

        # The whole collection at once
        all_at_once = base + self.lyrs
        assert _get_colors(all_at_once) == colors

        # Real layers
        real_layers = Layers(layer.from_geom(obj) for obj in self.lyrs)
        from_layers = base + real_layers
        assert _get_colors(from_layers) == colors

        # In-place addition
        base += self.lyrs
        assert _get_colors(base) == colors

        # A geom on the left-hand side cannot take layers
        with pytest.raises(PlotnineError):
            geom_point() + layer.from_geom(geom_point())

        with pytest.raises(PlotnineError):
            geom_point() + self.lyrs
Example #19
0
def test_watermark():
    """Add the same watermark image twice; the second call passes 0.5 as well."""
    here = os.path.dirname(os.path.realpath(__file__))
    filename = os.path.join(here, 'images/plotnine-watermark.png')
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 2, 3]})

    p = ggplot(df)
    p = p + geom_point(aes('x', 'y'))
    p = p + watermark(filename, 150, 160)
    p = p + watermark(filename, 150, 210, 0.5)

    assert p == 'watermark'
Example #20
0
def test_jitter():
    """Jittered layers over fixed points; position and width together must raise."""
    df1 = pd.DataFrame({'x': [1, 2, 1, 2],
                        'y': [1, 1, 2, 2]})

    p = ggplot(df1, aes('x', 'y'))
    p = p + geom_point(size=10)
    p = p + geom_jitter(size=10, color='red', random_state=random_state)
    p = p + geom_jitter(size=10, color='blue', width=0.1,
                        height=0.1, random_state=random_state)
    assert p + _theme == 'jitter'

    # An explicit position combined with a jitter parameter is rejected
    with pytest.raises(PlotnineError):
        geom_jitter(position=position_jitter(), width=0.1)
Example #21
0
def test_multiple_annotation_geoms():
    """Point, text, rect and segment annotations on one plot."""
    p = ggplot(df, aes('x', 'y'))
    p = p + geom_point()
    p = p + annotate('point', 0, 1, color='red', size=5)
    p = p + annotate('text', 1, 2, label='Text', color='red',
                     size=15, angle=45)
    p = p + annotate('rect', xmin=1.8, xmax=2.2, ymin=2.8,
                     ymax=3.2, size=1, color='red', alpha=0.3)
    p = p + annotate('segment', x=2.8, y=3.8, xend=3.2,
                     yend=4.2, color='red', size=1)

    assert p == 'multiple_annotation_geoms'
Example #22
0
def test_aesthetics():
    """Horizontal lines, each offset by 0.1 and mapped to a different aesthetic."""
    p = ggplot(df)
    p = p + geom_point(aes('x', 'y'))
    p = p + geom_hline(aes(yintercept='yintercept'), size=2)
    p = p + geom_hline(aes(yintercept='yintercept+.1', alpha='z'), size=2)
    p = p + geom_hline(
        aes(yintercept='yintercept+.2', linetype='factor(z)'), size=2)
    p = p + geom_hline(
        aes(yintercept='yintercept+.3', color='factor(z)'), size=2)
    # The last line maps size, so no fixed size is given
    p = p + geom_hline(aes(yintercept='yintercept+.4', size='z'))

    assert p + _theme == 'aesthetics'
Example #23
0
File: jmlr.py Project: Pinafore/qb
def yoy_growth():
    """
    This creates figures showing the number of questions versus year in dataset

    For every year from the earliest one present up to ``end_year`` it
    plots two cumulative series: the number of questions seen so far and
    the number of distinct answer pages seen so far. Questions whose
    ``page`` is None are ignored. The figure is saved as
    ``question_answer_counts.pdf`` under ``output_path``.
    """
    with open('data/external/datasets/qanta.mapped.2018.04.18.json') as f:
        questions = json.load(f)['questions']

    year_pages = defaultdict(set)
    year_questions = Counter()
    for q in questions:
        if q['page'] is not None:
            year_pages[q['year']].add(q['page'])
            year_questions[q['year']] += 1

    start_year = min(year_pages)
    # 2017 is the latest year we have a full year's worth of data for;
    # including the partial 2018 data would not be accurate
    end_year = min(2017, max(year_pages))

    # Single pass with running totals instead of re-summing every earlier
    # year for each year.
    year_rows = []
    pages_so_far = set()
    questions_so_far = 0
    for year in range(start_year, end_year + 1):
        pages_so_far |= year_pages.get(year, set())
        questions_so_far += year_questions[year]
        year_rows.append({'year': year, 'value': len(pages_so_far), 'Quantity': 'Distinct Answers'})
        year_rows.append({'year': year, 'Quantity': 'Total Questions', 'value': questions_so_far})

    year_df = pd.DataFrame(year_rows)
    # Ordered categories fix the order of the two series
    count_cat = CategoricalDtype(categories=['Total Questions', 'Distinct Answers'], ordered=True)
    year_df['Quantity'] = year_df['Quantity'].astype(count_cat)
    eprint(year_df[year_df.Quantity == 'Total Questions'])
    p = (
        ggplot(year_df)
        + aes(x='year', y='value', color='Quantity')
        + geom_line() + geom_point()
        + xlab('Year')
        + ylab('Count up to Year (inclusive)')
        + theme_fs()
        + scale_x_continuous(breaks=list(range(start_year, end_year + 1, 2)))
    )
    p.save(path.join(output_path, 'question_answer_counts.pdf'))
Example #24
0
    def fit_curve(self):
        """Fit a cubic polynomial to the position/weight curve and plot it.

        The curve maps a relative buzzing position, bucketed to 1/1000, to
        ``1 - c / n``. Here ``c`` is the running count of correct results
        recorded before the last record in that bucket, and ``n`` is the
        total number of records. The fit is plotted, saved to
        ``output/reporting/curve_score.pdf`` and drawn.

        Returns
        -------
        pipeline : the fitted polynomial-features + linear-regression
            Pipeline
        """
        df, _ = load_protobowl()
        # convert prompt to false: only a literal True counts as correct
        df.result = df.result.apply(lambda x: x is True)

        xy = sorted(zip(df.relative_position.tolist(), df.result.tolist()),
                    key=lambda pair: pair[0])

        # For each position bucket keep the number of correct results seen
        # before the bucket's last record (later records overwrite earlier).
        ratios = dict()
        cnt = 0
        for x, y in xy:
            ratios[int(x*1000)] = cnt
            cnt += y
        ratios = [(x / 1000, y)
                  for x, y in sorted(ratios.items(), key=lambda item: item[0])]

        # The curve is normalised by the number of records. The original
        # also computed a count of correct results into the same variable,
        # but overwrote it on the next line, so it never had any effect.
        n_records = len(xy)
        curve = [(x, 1 - y / n_records) for x, y in ratios]
        X, y = list(map(list, zip(*curve)))

        X = np.asarray(X)
        y = np.asarray(y)
        degree = 3
        polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
        linear_regression = LinearRegression()
        pipeline = Pipeline([("polynomial_features", polynomial_features),
                             ("linear_regression", linear_regression)])
        # The pipeline is fitted on a single-feature column
        pipeline.fit(X[:, np.newaxis], y)
        print(pipeline.steps[1][1].coef_)

        def get_weight(x):
            return pipeline.predict(np.asarray([[x]]))[0]

        ddf = pd.DataFrame({'x': X, 'y': y})
        p0 = ggplot(ddf, aes(x='x', y='y')) \
            + geom_point(size=0.3, color='blue', alpha=0.5, shape='+') \
            + stat_function(fun=get_weight, color='red', size=2, alpha=0.5) \
            + labs(x='Position', y='Weight')
        p0.save('output/reporting/curve_score.pdf')
        p0.draw()

        return pipeline
Example #25
0
def accPlot(accsByNFeats):
    """Print a line-and-point plot of accuracy against p, one colour per set.

    ``accsByNFeats`` is indexed first by set name and then by ``p``. Each
    (set, p) pair becomes one row of the plotted frame.
    """
    def frame_for(s):
        # One single-row frame per p, stacked for this set
        single_rows = [DataFrame({"p": p,
                                  "acc": accsByNFeats[s][p],
                                  "set": s},
                                 index=[str(p)])
                       for p in accsByNFeats[s]]
        return pd.concat(single_rows, axis=0)

    ggd = pd.concat([frame_for(s) for s in accsByNFeats])
    ggd['acc'] = ggd['acc'].astype(float)

    ggo = (gg.ggplot(ggd, gg.aes(x='p', y='acc', color='set'))
           + gg.geom_line(alpha=0.5)
           + gg.geom_point()
           + gg.theme_bw()
           + gg.scale_x_log10(breaks=[10, 100, 1000, 10000])
           + gg.scale_color_manual(values=['darkgray', 'black',
                                           'red', 'dodgerblue'])
           + gg.ylab('Accuracy (5-fold CV)'))
    print(ggo)
    def fitted_actual(self, figure_size=(4, 4), sample_frac=1.0):
        """Scatter fitted values against actual values

        A dashed red identity line (slope 1, intercept 0) is drawn for
        reference, and the title reports ``self.r2_score``.

        Parameters
        ----------
        figure_size : tuple(int, int), optional default=(4, 4)
            Plot size (width, height)

        sample_frac : float, optional default=1.0
            Fraction of data points to plot

        Returns
        -------
        plot : ggplot object
        """
        sampled = self.df.sample(frac=sample_frac)

        plot = ggplot(sampled, aes(x="y", y="yhat"))
        plot = plot + geom_point(alpha=0.25)
        plot = plot + geom_abline(slope=1, intercept=0,
                                  color="red", linetype="dashed")
        plot = plot + labs(
            title="Fitted vs Actual (R2 = {:.3f})".format(self.r2_score),
            x="Actual",
            y="Fitted")
        return plot + theme(figure_size=figure_size)
 def plot(self,
          pc1=1,
          pc2=2,
          selection=None,
          color='cell type',
          shape=None,
          filter_on='cell type',
          alpha=.8,
          bl_rm=False,
          data=None):
     """Scatter two principal components against each other.

     Uses ``self.plot_data`` when ``data`` is None. Rows are kept only if
     their ``filter_on`` value is in ``selection`` (when given), and rows
     flagged in ``self.blacklisted`` are dropped when ``bl_rm`` is True.
     Columns are looked up as 'PC <pc1>' and 'PC <pc2>'. ``shape`` is
     added as an extra aesthetic when it is not None.
     """
     if data is None:
         data = self.plot_data
     if selection is not None:
         keep = [value in selection for value in data[filter_on]]
         data = data.loc[keep, ]
     if bl_rm is True:
         data = data.loc[~self.blacklisted, ]

     x_column = 'PC ' + str(pc1)
     y_column = 'PC ' + str(pc2)
     pl = pn.ggplot(pn.aes(x_column, y_column, color=color), data)
     pl = pl + pn.geom_point(alpha=alpha)
     if shape is None:
         return pl
     return pl + pn.aes(shape=shape)
Example #28
0
def plot_scale(df: pd.DataFrame,
               sweep_vars: Sequence[Text] = None) -> gg.ggplot:
    """Plots the best episode observed by height_threshold.

    The frame is preprocessed with ``cp_swingup_preprocess``. The maximum
    ``best_episode`` is then taken per ``height_threshold`` (and per sweep
    variable, if any) and passed to ``plotting.facet_sweep_plot``.
    """
    df = cp_swingup_preprocess(df_in=df)

    group_vars = ['height_threshold']
    if sweep_vars:
        group_vars.extend(sweep_vars)
    plt_df = df.groupby(group_vars)['best_episode'].max().reset_index()

    mapping = gg.aes(x='factor(height_threshold)',
                     y='best_episode',
                     colour='best_episode > {}'.format(GOOD_EPISODE))

    p = gg.ggplot(plt_df) + mapping
    p = p + gg.geom_point(size=5, alpha=0.8)
    p = p + gg.scale_colour_manual(values=['#d73027', '#313695'])
    # axis hack: an invisible line at y=0
    p = p + gg.geom_hline(gg.aes(yintercept=0.0), alpha=0)
    p = p + gg.scale_x_discrete(breaks=[0, 0.25, 0.5, 0.75, 1.0])
    p = p + gg.ylab('best return in first {} episodes'.format(NUM_EPISODES))
    p = p + gg.xlab('height threshold')
    return plotting.facet_sweep_plot(p, sweep_vars)
Example #29
0
class TestOther(object):
    """Smoke tests that draw geom_smooth with each remaining ``method``.

    Each test adds a smoother to the shared base plot ``p`` and calls
    ``draw_test()``. The 'rlm' and 'lowess' tests also expect a
    UserWarning.
    """
    # Base plot shared by every test; each test adds its own smoother
    p = ggplot(df_linear, aes('x')) + geom_point(aes(y='y_noisy'))

    def test_wls(self):
        p = self.p + geom_smooth(aes(y='y_noisy'), method='wls')
        p.draw_test()

    def test_rlm(self):
        p = self.p + geom_smooth(aes(y='y_noisy'), method='rlm')
        with pytest.warns(UserWarning):
            p.draw_test()

    def test_glm(self):
        p = self.p + geom_smooth(aes(y='y_noisy'), method='glm')
        p.draw_test()

    def test_gls(self):
        p = self.p + geom_smooth(aes(y='y_noisy'), method='gls')
        p.draw_test()

    def test_lowess(self):
        p = self.p + geom_smooth(aes(y='y_noisy'), method='lowess')
        with pytest.warns(UserWarning):
            p.draw_test()

    def test_mavg(self):
        p = self.p + geom_smooth(
            aes(y='y_noisy'), method='mavg', method_args={'window': 10})
        p.draw_test()

    def test_gpr(self):
        # Report the test as skipped when scikit-learn is unavailable,
        # rather than returning early and having it count as a pass.
        pytest.importorskip('sklearn.gaussian_process')

        p = self.p + geom_smooth(aes(y='y_noisy'), method='gpr')
        p.draw_test()
    def qq_plot(self, figure_size=(6, 4), sample_frac=1.0):
        """Plot sorted residuals against normal quantiles

        The dashed red reference line passes through the first and third
        quartiles of the residuals and of the standard normal.

        Parameters
        ----------
        figure_size : tuple(int, int), optional default=(6, 4)
            Plot size (width, height)

        sample_frac : float, optional default=1.0
            Fraction of data points to plot

        Returns
        -------
        plot : ggplot object
        """
        # Normal quantiles at rank / (n + 1) for rank = 1..n
        n_obs = len(self.y)
        q = stats.norm.ppf([rank / (n_obs + 1)
                            for rank in range(1, n_obs + 1)])

        # Slope and intercept of the line through the quartile pairs
        quartiles = [0.25, 0.75]
        resid_lo, resid_hi = np.quantile(self.df.residual, quartiles)
        norm_lo, norm_hi = stats.norm.ppf(quartiles)
        qq_grad = (resid_hi - resid_lo) / (norm_hi - norm_lo)
        qq_int = resid_lo - qq_grad * norm_lo

        # Pair the i-th smallest residual with the i-th normal quantile
        ordered_residuals = self.df.residual.sort_values(ascending=True)
        qq = pd.DataFrame(zip(ordered_residuals, q),
                          columns=["x", "norm_q"])

        plot = ggplot(qq.sample(frac=sample_frac), aes(x="norm_q", y="x"))
        plot = plot + geom_point(alpha=0.25)
        plot = plot + geom_abline(intercept=qq_int,
                                  slope=qq_grad,
                                  color="red",
                                  linetype="dashed")
        plot = plot + labs(title="QQ Plot",
                           x="Normal Quantiles",
                           y="Sample Quantiles")
        return plot + theme(figure_size=figure_size)
Example #31
0
def generate_scatter_plots(
        data,
        x="pca1",
        y="pca2",
        nsample=200,
        random_state=100,
        selected_categories=['bioinformatics', 'neuroscience'],
        color_palette=['#a6cee3', '#1f78b4'],
        save_file_path="output/pca_plots/scatterplot_files/pca01_v_pca02.png"):
    """Save and print a scatter plot of the selected categories.

    Rows are restricted to ``selected_categories``. Any category with
    more than ``nsample`` rows is sampled down to ``nsample`` using
    ``random_state``. Categories and ``color_palette`` are paired by
    position.
    """
    in_selection = data.query(f"category in {selected_categories}")

    def cap_group(group):
        # Groups at or below the cap are kept whole
        if len(group) <= nsample:
            return group
        return group.sample(nsample, random_state=random_state)

    plot_data = (in_selection.groupby("category")
                 .apply(cap_group)
                 .reset_index(drop=True))
    category_colors = dict(zip(selected_categories, color_palette))

    g = p9.ggplot(plot_data)
    g = g + p9.aes(x=x, y=y, color="factor(category)")
    g = g + p9.geom_point()
    g = g + p9.scale_color_manual(category_colors)
    g = g + p9.labs(title="PCA of BioRxiv (Word Dim: 300)",
                    color="Article Category")
    g = g + p9.theme(figure_size=(4.59, 3.44), dpi=300)

    g.save(save_file_path)
    print(g)
Example #32
0
def ikuya_sys_plot():
    """Plot accuracy against x per model and save it as a PDF.

    Combines the JSON records with the frame from ``load_ikuya_nips``,
    relabels the models and saves the figure to
    ``2019_tacl_trick/auto_fig/ikuya_cdf.pdf``.
    """
    nips_df = load_ikuya_nips()
    with open('2019_tacl_trick/data/ikuya_cdf.json') as f:
        cdf_records = json.load(f)

    df = pd.concat([pd.DataFrame(cdf_records), nips_df])
    df['model'] = df['model'].map(relabel)
    # Ordered categories fix the order in which the models are listed
    model_dtype = CategoricalDtype(
        ['Regular Test', 'IR Adversarial', 'RNN Adversarial'],
        ordered=True)
    df['model'] = df['model'].astype(model_dtype)

    legend_theme = theme(
        legend_position=(.335, .7),
        legend_background=element_blank(),
        legend_box_margin=0,
        legend_title=element_blank())

    p = ggplot(df) + aes(x='x', y='y', color='model', xmin='x', xmax='x')
    p = p + geom_point(size=1.0, shape='.')
    p = p + xlab('Percent of Question Revealed') + ylab('Accuracy')
    p = p + scale_y_continuous(breaks=np.linspace(0, 1, 6), limits=[0, 1])
    p = p + legend_theme
    p.save('2019_tacl_trick/auto_fig/ikuya_cdf.pdf', width=3.5, height=2.5)
Example #33
0
def test_coord_trans():
    """A rectangle with infinite y bounds drawn under coord_trans."""
    df = pd.DataFrame({'x': range(10), 'y': range(10)})
    # One rectangle: x from 3 to 7, unbounded in y
    rdf = pd.DataFrame({
        'xmin': [3],
        'xmax': 7,
        'ymin': -np.inf,
        'ymax': np.inf,
    })
    rect_mapping = aes(xmin='xmin', xmax='xmax', ymin='ymin', ymax='ymax')

    p = ggplot(df, aes('x', 'y'))
    p = p + geom_point()
    p = p + geom_rect(data=rdf, mapping=rect_mapping,
                      alpha=0.2, inherit_aes=False)
    p = p + coord_trans()

    assert p == 'coord-trans'
def go_to_time_plot1(go_to_time_probs_new: list, go_to_time_probs_old: list,
                     average_minutes_per_game_values: list):
    """ Plot go-to-time probability, new vs. old rules, no blowouts, 85 matches/round

    The minutes values are stacked twice, once per rule set, and the
    figure is saved to ``figures/go_to_time_prob_plot.png``.
    """
    minutes = average_minutes_per_game_values
    # 'New' for the first copy of the minutes values, 'Old' for the second
    rule_labels = np.repeat(['New', 'Old'], len(minutes))

    time_prob_data = pd.DataFrame({
        'Average minutes per game': np.concatenate([minutes, minutes]),
        'P(Go to time)':
        np.concatenate([go_to_time_probs_new, go_to_time_probs_old]),
        'Rules': rule_labels,
    })

    mapping = plt.aes(x='Average minutes per game', y='P(Go to time)',
                      color='Rules')
    figure = plt.ggplot(time_prob_data, mapping)
    figure = figure + plt.geom_line() + plt.geom_point()
    figure = figure + plt.ylim([0, 1])
    figure = figure + plt.theme_classic()
    figure.save(filename='figures/go_to_time_prob_plot.png')
Example #35
0
def ggfuntile(f,
              d,
              xrng=(0, 1),
              yrng=(0, 1),
              limits=(0, 1),
              density=51,
              xlab="x",
              ylab="y",
              zlab="f",
              breaks=None,
              **kwargs):
    """Tile plot of ``f`` over a regular grid with the points of ``d`` on top.

    Parameters
    ----------
    f : callable taking (x, y) and returning the value to colour by
    d : data for the overlaid points; its ``class`` column sets the shape
    xrng, yrng : (low, high) extent of the grid along each axis
    limits : limits of the fill scale
    density : number of grid points per axis, both endpoints included
    xlab, ylab : names of the grid columns, also used as the x and y
        aesthetics
    zlab : name shown for the fill scale
    breaks, **kwargs : accepted but not used

    Returns
    -------
    The assembled ggplot object.
    """
    od = OrderedDict()
    # linspace yields exactly `density` points with both endpoints. The
    # previous arange with a float step left out the upper bound, and its
    # length could vary with rounding.
    od[xlab] = np.linspace(xrng[0], xrng[1], int(density))
    od[ylab] = np.linspace(yrng[0], yrng[1], int(density))
    ggdata = expandGrid(od)
    # Evaluate f on every grid row (first column is x, second is y)
    ggdata["z"] = [
        f(x, y)
        for x, y in zip(ggdata.iloc[:, 0].values, ggdata.iloc[:, 1].values)
    ]
    gg = ggplot(ggdata, aes(x=xlab, y=ylab))
    gg += geom_tile(aes(fill="z"))
    gg += scale_fill_gradientn(colors=[
        "black", "#202020", "#404040", "#808080", "white", "dodgerblue",
        "blue", "darkblue", "midnightblue"
    ],
                               name=zlab,
                               limits=limits)
    gg += theme_classic()
    gg += geom_point(data=d,
                     mapping=aes(shape="class"),
                     color="red",
                     size=2,
                     alpha=0.8)
    gg += scale_shape_manual(values=["x", "^"])
    return gg
Example #36
0
def create_highlighted_scatter_plot(gwas_df):
    """Create a GWAS scatter plot with a fixed set of SNPs highlighted.

    A boolean ``snp_of_interest`` column is added to ``gwas_df`` in place
    and used as the colour aesthetic. A red horizontal line is drawn at
    ``neg_log_p`` = 6.

    Inputs
    ------
    gwas_df: pandas.DataFrame
        A dataframe containing information from a genome-wide association study

    Return
    ------
    plot: plotnine.ggplot
        The scatter plot representing the GWAS data
    """
    snps_of_interest = [
        'rs12752601',
        'rs117018967',
        'rs188695075',
        'rs6604965',
        'rs542289952',
    ]
    gwas_df['snp_of_interest'] = gwas_df['rsid'].isin(snps_of_interest)

    mapping = aes(x='Position_hg19', y='neg_log_p', color='snp_of_interest')
    plot = ggplot(gwas_df, mapping)
    plot = plot + geom_point()
    return plot + geom_hline(yintercept=6, color='red')
def lineplot_celldiv_moment(adata):
    """ Plots the reference variable along the principal circle edges to
    visualize the moment of cell division.

    Parameters
    ----------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.celldiv_moment`.

    Returns
    ------------
    A plotnine plot with points, a path and a dashed linear fit of
    `mean_var` against edge position, plus bars of `diff_var`. The edge
    stored as the cell division edge is marked with a red point and a red
    bar. A boolean `cell_div` column is written into
    `adata.uns['princirc_gr']['edges']` as a side effect.
    """
    division_info = adata.uns['scycle']['cell_div_moment']
    edge_to_0 = division_info['cell_div_edge'][0]
    ref_var = division_info['ref_var']

    edges = adata.uns['princirc_gr']['edges']
    is_division_edge = edges['e1'] == edge_to_0
    edges['cell_div'] = is_division_edge
    cell_div_count = edges[is_division_edge]['mean_var']

    cell_div_plot = ggplot(edges, aes('e1', 'mean_var'))
    cell_div_plot = cell_div_plot + geom_point(aes(y='mean_var'), size=2)
    cell_div_plot = cell_div_plot + geom_path(aes(y='mean_var'))
    cell_div_plot = cell_div_plot + geom_smooth(
        aes(y='mean_var'), method='lm', linetype='dashed')
    cell_div_plot = cell_div_plot + annotate(
        "point", x=edge_to_0, y=cell_div_count, color='red', size=2)
    cell_div_plot = cell_div_plot + labs(x='Edge position', y=ref_var)
    cell_div_plot = cell_div_plot + geom_col(
        aes(y='diff_var', fill='cell_div'))
    cell_div_plot = cell_div_plot + scale_fill_manual(
        values=['darkgrey', 'red'], guide=False)
    cell_div_plot = cell_div_plot + theme_std

    return cell_div_plot
# UMAP coordinates per sample, labelled 'training' by default
input_data_UMAPencoded_df = pd.DataFrame(
    data=input_data_UMAPencoded,
    index=normalized_compendium.index,
    columns=['1', '2'],
).assign(dataset='training')
# Validation samples override the default label
input_data_UMAPencoded_df.loc[val_samples, 'dataset'] = 'validation'

input_data_UMAPencoded_df


# In[12]:


# Plot: one point per sample, coloured by dataset label
fig = (
    ggplot(input_data_UMAPencoded_df, aes(x='1', y='2'))
    + geom_point(aes(color='dataset'), alpha=0.2)
    + labs(x='UMAP 1',
           y='UMAP 2',
           title='UMAP of normalized compendium')
    + theme_bw()
    + theme(
        legend_title_align="center",
        plot_background=element_rect(fill='white'),
        legend_key=element_rect(fill='white', colour='white'),
        legend_title=element_text(family='sans-serif', size=15),
        legend_text=element_text(family='sans-serif', size=12),
        plot_title=element_text(family='sans-serif', size=15),
        axis_text=element_text(family='sans-serif', size=12),
        axis_title=element_text(family='sans-serif', size=15),
    )
    # Legend keys are drawn fully opaque although the points are not
    + guides(colour=guide_legend(override_aes={'alpha': 1}))
)
Example #39
0
        .reset_index(drop=True)
    ),
}

# # UMAP of the Documents

# This section is to highlight the differences between embedding models using UMAP.
# The three models being compared are:
# 1. initialized Word2Vec Model
# 2. Doc2vec model
# 3. Pretrained Word2Vec Model - first trained on Google news dataset 300 dim, then trained on bioRxiv

# One figure per embedding model: (model key, plot title, output path)
for _model_key, _plot_title, _figure_path in (
    ("original",
     "UMAP of BioRxiv (Word dim: 300)",
     "output/embedding_output/umap/figures/biorxiv_umap_300.png"),
    ("doc2vec",
     "UMAP of BioRxiv (Doc2vec Word dim: 300)",
     "output/embedding_output/umap/figures/biorxiv_umap_300_doc2vec.png"),
):
    g = (
        p9.ggplot(biorxiv_umap_models_latest[_model_key])
        + p9.aes(x="umap1", y="umap2", color="factor(category)")
        + p9.geom_point()
        + p9.labs(title=_plot_title, color="Article Category")
    )
    g.save(_figure_path, dpi=500)
    print(g)

g = (
Example #40
0
                                   x['k'],
                                   x['resubAccuracy'],
                                   x['testAccuracy'])
                                  for x in repeatedKnnResults],
                                 columns = ['p',
                                            'k',
                                            'resubAccuracy',
                                            'testAccuracy'])

# Long format: one frame per accuracy type, stacked row-wise
ggdata = pd.concat(
    [
        DataFrame({'p': knnResultsSimplified.p,
                   'k': knnResultsSimplified.k.apply(int),
                   'type': accuracy_type,
                   'Accuracy': knnResultsSimplified[accuracy_type + 'Accuracy']})
        for accuracy_type in ('resub', 'test')
    ],
    axis=0,
)

plt.close()
# Accuracy against p, one facet per k, one colour and linetype per type
ggo = (
    gg.ggplot(ggdata, gg.aes(x='p', y='Accuracy',
                             color='type', group='type', linetype='type'))
    + gg.facet_wrap('~ k')
    + gg.scale_x_log10()
    + gg.geom_point(alpha=0.6)
    + gg.stat_smooth()
    + gg.theme_bw()
)
print(ggo)
Example #41
0
def ologram_merge_stats(inputfiles=None,
                        pdf_width=None,
                        pdf_height=None,
                        output=None,
                        labels=None):
    """Merge several tab-separated statistics tables into one heatmap PDF.

    Each input table is read with pandas, tagged with a dataset label and
    row-bound with the others. The result is drawn as a tile plot (feature
    by dataset) filled by ``summed_bp_overlaps_log2_fold_change``, with a
    diamond per tile coloured by ``-log10`` of ``summed_bp_overlaps_pvalue``.

    :param inputfiles: Iterable of open file objects (their ``.name`` is
        used for logging and, when no labels are given, for labelling).
    :param pdf_width: Page width; when None it is 0.6 per dataset, capped
        at 100.
    :param pdf_height: Page height; when None it is 0.6 per feature, capped
        at 500.
    :param output: File object whose ``.name`` is the PDF to write.
    :param labels: Optional comma-separated dataset labels, one per input
        file, made of alphanumeric characters and '_' only, without
        duplicates. When None, the name of the directory enclosing each
        input file is used.

    NOTE(review): ``message(..., type="ERROR")`` presumably aborts the
    program; that helper is defined outside this function - confirm.
    """
    # -------------------------------------------------------------------------
    # Check user provided labels
    # -------------------------------------------------------------------------

    if labels is not None:

        labels = labels.split(",")

        for elmt in labels:
            if not re.search("^[A-Za-z0-9_]+$", elmt):
                message(
                    "Only alphanumeric characters and '_' allowed for --labels",
                    type="ERROR")
        if len(labels) != len(inputfiles):
            message("--labels: the number of labels should be"
                    " the same as the number of input files ", type="ERROR")

        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    # Loop over input files
    # -------------------------------------------------------------------------

    df_list = list()
    df_label = list()

    for pos, infile in enumerate(inputfiles):
        message("Reading file : " + infile.name)
        # Read the dataset into a temporary dataframe
        df_tmp = pd.read_csv(infile, sep='\t', header=0, index_col=None)
        # Change name of 'feature_type' column.
        df_tmp = df_tmp.rename(index=str, columns={"feature_type": "Feature"})

        # Label of the dataset: user-provided, or the enclosing directory name
        if labels is None:
            file_short_name = os.path.basename(os.path.normpath(os.path.dirname(infile.name)))
        else:
            file_short_name = labels[pos]
        df_label.append(file_short_name)

        # Assign the name of the dataset to a new column
        df_tmp = df_tmp.assign(**{"dataset": [file_short_name] * df_tmp.shape[0]})
        # Pval set to 0 or -1 are changed to 1e-320 and NaN respectively
        df_tmp.loc[df_tmp['summed_bp_overlaps_pvalue'] == 0, 'summed_bp_overlaps_pvalue'] = 1e-320
        df_tmp.loc[df_tmp['summed_bp_overlaps_pvalue'] == -1, 'summed_bp_overlaps_pvalue'] = np.nan
        # Compute -log10(pval)
        df_tmp = df_tmp.assign(**{"-log_10(pval)": -np.log10(df_tmp.summed_bp_overlaps_pvalue)})

        # Which p-values are significant ?
        # TODO: For now, draws all p-values. Add Benjamini-Hochberg correction, and distinguish between NaN and 0.
        # NaN p-values compare as False here, so they get no point on the plot.
        df_tmp = df_tmp.assign(**{"pval_signif": df_tmp.summed_bp_overlaps_pvalue > 0})

        # Add the df to the list to be subsequently merged
        df_list.append(df_tmp)

    if len(set(df_label)) < len(df_label):
        message('Enclosing directories are ambiguous and cannot be used as labels. You may use "--labels".',
                type="ERROR")

    # -------------------------------------------------------------------------
    # Concatenate dataframes (row bind)
    # -------------------------------------------------------------------------

    message("Merging dataframes.")
    df_merged = pd.concat(df_list, axis=0)

    # -------------------------------------------------------------------------
    # Plotting
    # -------------------------------------------------------------------------

    message("Plotting")
    my_plot = ggplot(data=df_merged,
                     mapping=aes(y='Feature', x='dataset'))
    my_plot += geom_tile(aes(fill='summed_bp_overlaps_log2_fold_change'))
    my_plot += scale_fill_gradient2()
    my_plot += labs(fill="log2(fold change) for summed bp overlaps")

    # Points for p-val. Must be after geom_tile()
    my_plot += geom_point(data=df_merged.loc[df_merged['pval_signif']],
                          mapping=aes(x='dataset', y='Feature', color='-log_10(pval)'),
                          size=4, shape='D', inherit_aes=False)
    my_plot += scale_color_gradientn(colors=["#160E00", "#FFB025", "#FFE7BD"])
    my_plot += labs(color="-log10(p-value)")

    # Theming
    my_plot += theme_bw()
    my_plot += theme(panel_grid_major=element_blank(),
                     axis_text_x=element_text(rotation=90),
                     panel_border=element_blank(),
                     axis_ticks=element_blank())

    # -------------------------------------------------------------------------
    # Page size
    # -------------------------------------------------------------------------

    message("Saving")
    nb_ft = len(df_merged['Feature'].unique())
    nb_datasets = len(df_merged['dataset'].unique())

    if pdf_width is None:
        panel_width = 0.6
        pdf_width = panel_width * nb_datasets

        if pdf_width > 100:
            pdf_width = 100
            message("Setting --pdf-width to 100 (limit)")

    if pdf_height is None:
        panel_height = 0.6
        pdf_height = panel_height * nb_ft

        if pdf_height > 500:
            pdf_height = 500
            message("Setting --pdf-height to 500 (limit)")

    message("Page width set to " + str(pdf_width))
    message("Page height set to " + str(pdf_height))
    figsize = (pdf_width, pdf_height)

    # -------------------------------------------------------------------------
    # Saving
    #
    # Warnings are silenced while saving: both pandas and plotnine use
    # warnings for deprecated functions. Not a fully satisfying solution...
    # -------------------------------------------------------------------------

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        message("Saving diagram to file : " + output.name)
        message("Be patient. This may be long for large datasets.")

        # NOTE : We must manually specify figure size with save_as_pdf_pages
        save_as_pdf_pages(filename=output.name,
                          plots=[my_plot + theme(figure_size=figsize)],
                          width=pdf_width,
                          height=pdf_height)
Example #42
0
import pandas as pd
import pytest

from plotnine import ggplot, aes, geom_point, facet_grid, facet_wrap
from plotnine import geom_abline, theme

n = 10
# Shared fixture: x and y both run 0..n-1, `var1` pairs consecutive rows and
# `var2` alternates between 'a' and 'b'.
df = pd.DataFrame(dict(
    x=range(n),
    y=range(n),
    var1=np.repeat(range(n//2), 2),
    var2=np.tile(['a', 'b'], n//2),
))
df['class'] = df['var1']  # python keyword as column

# Base scatter plot that every facet test below extends.
g = ggplot(df, aes('x', 'y')) + geom_point(
    aes(color='factor(var1)'), size=5, show_legend=False)


# facet_wrap

def test_facet_wrap_one_var():
    """Wrapping on `var1` and on its keyword-named copy must render the same image."""
    by_var1 = g + facet_wrap('~var1')
    by_class = g + facet_wrap('~class')  # python keyword in formula
    for faceted in (by_var1, by_class):
        assert faceted == 'facet_wrap_one_var'


# https://github.com/pandas-dev/pandas/issues/16276
@pytest.mark.xfail
def test_facet_wrap_expression():
    # Facets on a computed expression (binning `var1` with pd.cut) instead of a
    # plain column name. Expected to fail, presumably because of the pandas
    # issue linked above.
    # NOTE(review): no assertion follows the plot construction in this view;
    # the body may be truncated - confirm against the upstream test.
    p = g + facet_wrap('pd.cut(var1, (0, 2, 4), include_lowest=True)')
Example #43
0
     geom_line(size=0.5) + ylim(0, 202) + labs(x="time", y="$[S^{**}]$") +
     scale_color_distiller(
         palette='RdYlBu', type="diverging", name="$B_{tot}$") +
     facet_wrap('~dir') + theme_bw())
g.save(filename="./num_cont_graphs/sim_fwd_rev2.png",
       format="png",
       width=8,
       height=4,
       units='in',
       verbose=False)

# Keep only the rows at the last time point of `out`.
eq = out[out.time == max(out.time)]

# Path of the response against the signal, one colour per direction, with a
# red star and a text label at the first annotated location. The black point
# layer is added once: a second identical opaque layer draws nothing new.
g = (ggplot(eq) + aes(x='signal', y=response, color='dir') +
     labs(x="$B_{tot}$", y="$[S^{**}]$", color="") +
     geom_path(size=2, alpha=0.5) +
     geom_point(color="black") +
     theme_bw() +
     annotate("point",
              x=plot_specifications[2][0][0],
              y=plot_specifications[2][0][1],
              colour="red",
              shape="*",
              size=3.5) +
     annotate("text",
              x=plot_specifications[2][0][0],
              y=plot_specifications[2][0][1],
              label=plot_specifications[2][0][2]))
# + annotate("point", x=plot_specifications[2][1][0], y=plot_specifications[2][1][1], colour="red", shape="*",
#            size=3.5)
# + annotate("text", x=plot_specifications[2][1][0], y=plot_specifications[2][1][1],
#            label=plot_specifications[2][1][2]))
g.save(filename="./num_cont_graphs/sim_bif_diag2.png",
Example #44
0
    def plot(self):
        """Build the figure for this instance with plotnine and pass it to `save_plot`.

        A data frame is built from `self.data` with `self.datacols` as column
        names. By default column 0 is mapped to y and column 1 to x; a third
        column, when present, drives fill/color. `self.figtype` selects the
        geom. When `self.savedata` is truthy the frame is also written to
        `self.outprefix + ".csv"`. Returns early, after logging a warning,
        when the frame has no rows.

        Raises:
            ValueError: if `self.figtype` is not one of the handled types.
        """
        df = pandas.DataFrame(
            self.data,
            columns=self.datacols,
        )
        # Column names are passed through `make_unique`; presumably this
        # de-duplicates repeated names - confirm against that helper.
        with capture_c_msg("datar", prefix=f"[r]{self.title}[/r]: "):
            df.columns = make_unique(df.columns.tolist())

        if self.savedata:
            datafile = self.outprefix + ".csv"
            logger.info(
                "[r]%s[/r]: Saving data to: %r",
                self.title,
                datafile,
                extra={"markup": True},
            )
            df.to_csv(datafile, index=False)

        # Nothing to draw: bail out before building any plot.
        if df.shape[0] == 0:
            logger.warning("No data points to plot")
            return

        # Optional third column is used as the fill/color grouping.
        aes_for_geom_fill = None
        aes_for_geom_color = None
        # Default theme tweak: rotated x tick labels. Branches below replace
        # it or set it to None.
        theme_elems = p9.theme(axis_text_x=p9.element_text(angle=60, hjust=2))
        if df.shape[1] > 2:
            aes_for_geom_fill = p9.aes(fill=df.columns[2])
            aes_for_geom_color = p9.aes(color=df.columns[2])
        plt = p9.ggplot(df, p9.aes(y=df.columns[0], x=df.columns[1]))
        if self.figtype == "scatter":
            plt = plt + p9.geom_point(aes_for_geom_color)
            theme_elems = None
        elif self.figtype == "line":
            # NOTE(review): no geom is added for "line", so the plot has no
            # layer of its own - confirm whether geom_line was intended.
            pass
        elif self.figtype == "bar":
            plt = plt + p9.geom_bar(p9.aes(fill=df.columns[0]))
        elif self.figtype == "col":
            plt = plt + p9.geom_col(aes_for_geom_fill)
        elif self.figtype == "pie":
            logger.warning("Pie chart is not support by plotnine yet, "
                           "plotting bar chart instead.")
            col0 = df.iloc[:, 0]
            if df.shape[1] > 2:
                # With a grouping column: one bar per group, height taken
                # from column 0 as-is (stat="identity").
                plt = plt + p9.geom_bar(
                    p9.aes(x=df.columns[2], y=col0.name, fill=df.columns[2]),
                    stat="identity"
                    # aes_for_geom_fill,
                    # x=df.Group,
                    # y=col0,
                    # label=paste0(round_(100 * col0 / sum_(col0), 1), "%"),
                    # show_legend=False,
                    # position=p9.position_adjust_text(),
                )
            else:
                # NOTE(review): `factor`, `rev`, `unique`, `as_character`,
                # `levels`, `cumsum` and `paste0` are defined outside this
                # method (presumably R-style helpers); if `map`, `sum` and
                # `round` are the Python builtins here, `sums` is a one-shot
                # iterator and the arithmetic on it below would fail - confirm.
                col0 = factor(col0, levels=rev(unique(as_character(col0))))
                fills = rev(levels(col0))
                sums = map(lambda x: sum(col0 == x), fills)
                # NOTE(review): these two prints look like leftover debugging.
                print(col0)
                print(fills)
                plt = (p9.ggplot(df, p9.aes(x=df.columns[1])) +
                       p9.geom_bar(p9.aes(fill=df.columns[0])) + p9.geom_label(
                           x=1,
                           y=cumsum(sums) - sums / 2,
                           label=paste0(round(sums / sum(sums) * 100, 1), "%"),
                           show_legend=False,
                       ))
                theme_elems = p9.theme(
                    axis_title_x=p9.element_blank(),
                    axis_title_y=p9.element_blank(),
                    axis_text_y=p9.element_blank(),
                )
        elif self.figtype == "violin":
            plt = plt + p9.geom_violin(aes_for_geom_fill)
        elif self.figtype == "boxplot":
            plt = plt + p9.geom_boxplot(aes_for_geom_fill)
        elif self.figtype in ("histogram", "density"):
            # Single-variable plots: column 0 goes on x; column 1 is used as
            # the fill grouping unless it is named "ONE".
            plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
            geom = getattr(p9, f"geom_{self.figtype}")
            if df.columns[1] != "ONE":
                plt = plt + geom(p9.aes(fill=df.columns[1]), alpha=0.6)
                theme_elems = None
            else:
                plt = plt + geom(alpha=0.6)
                theme_elems = p9.theme(legend_position="none")
        elif self.figtype == "freqpoly":
            plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
            if df.columns[1] != "ONE":
                plt = plt + p9.geom_freqpoly(p9.aes(fill=df.columns[1]))
            else:
                plt = plt + p9.geom_freqpoly()
            theme_elems = None
        else:
            raise ValueError(f"Unknown figure type: {self.figtype}")

        plt = plt + p9.ggtitle(self.title)
        self.save_plot(plt, theme_elems)
Example #45
0
from plotnine.data import economics
from plotnine import ggplot, aes, facet_grid, labs, geom_point, geom_smooth, xlab, ylab

# Scatter of `uempmed` against `date` from the `economics` dataset, with a
# red smoothed trend line on top.
g = (ggplot(economics)
     + aes(x="date", y="uempmed")
     + geom_point()
     + geom_smooth(color="red", span=0.5)
     + xlab("date (year)")
     + ylab("unemployment"))  # axis label spelling corrected

g.save("19.png")
Example #46
0
def scatter_cell_cycle(
    adata,
    scores=["signatures", "components"][0],
    size=1.5,
    alpha=1,
    curvature_shrink=1,
    lab_ypos=2,
):
    """Plots cell cycle signatures vs pseudotime

    Parameters
    ----------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.cell_cycle_phase`.
    scores: str
        A string indicating what to plot as cell cycle scores against pseudotime.
        If 'signatures', standard S-phase, G2-M and Histones signatures are used;
        if 'components', the 4 cell cycle related components are used.
    size: float
        Controls the point size of the plot.
    alpha: float
        A value between 0 and 1. Controls point transparency.
    curvature_shrink: float
        Divisor applied to the z-scored curvature before it is drawn; larger
        values flatten the curvature track.
    lab_ypos: float
        Controls the y-axis position of the cell cycle phase annotation.

    Returns
    --------------
    A plotnine scatter plot of pseudotime vs the selected cell cycle scores.
    When `adata.uns["scycle"]` holds a "cell_cycle_division" entry, the
    curvature track, its peaks and the phase boundaries are drawn on top.

    Raises
    --------------
    ValueError
        If `scores` is neither 'signatures' nor 'components'.

    """
    if scores == "signatures":
        y = ["S-phase", "G2-M", "Histones"]
        colors = ["#66c2a5", "#fc8d62", "#8da0cb", "black"]
    elif scores == "components":
        _add_compScores(adata)
        y = ["G1/S comp", "G2/M+ comp", "G2/M- comp", "Histones comp"]
        colors = ["#66c2a5", "#fc8d62", "#8da0cb", "#e5c494", "black"]
    else:
        raise ValueError(
            "scores must be 'signatures' or 'components', got %r" % (scores,))

    time_scatter = scatter_pseudotime(
        adata, y=y, size=size, alpha=alpha) + labs(
            x="Pseudotime", y="Signature scores", color="Signature")

    # Without cell cycle annotations there is nothing more to draw.
    if "cell_cycle_division" not in adata.uns["scycle"]:
        return time_scatter

    cc_divs = adata.uns["scycle"]["cell_cycle_division"]

    # -- Curvature data
    # Work on a copy so the table stored in `adata.uns` is not overwritten
    # with plot-only values.
    curv_data = cc_divs["curvature"].copy()
    cvz = zscore(curv_data["curvature"].values) / curvature_shrink
    # Shift so that the highest curvature value sits at 0.
    curv_data["curvature"] = cvz - np.max(cvz)
    curv_data["signature"] = "Curvature"

    # -- Peak data (for segments)
    gr_min = np.min(curv_data["curvature"])
    # Explicit copy: a new column is assigned on the filtered rows.
    pk_data = curv_data[curv_data["ispeak"] == "peak"].copy()
    pk_data["ymin"] = gr_min

    # -- Cell cycle annotation
    cc_phase = pd.DataFrame(
        dict(
            starts=[
                None,
                cc_divs["s_start"],
                cc_divs["g2_start"],
                cc_divs["m_start"],
            ],
            labels=["G1", "S", "G2", "M"],
            # Each label is centred between the start of its phase and the
            # start of the next one (0 and 1 at the two ends).
            labpos=[
                np.mean([0, cc_divs["s_start"]]),
                np.mean([cc_divs["s_start"], cc_divs["g2_start"]]),
                np.mean([cc_divs["g2_start"], cc_divs["m_start"]]),
                np.mean([cc_divs["m_start"], 1]),
            ],
            y=lab_ypos,
        ))

    cell_cycle_plt = (
        time_scatter +
        geom_point(aes("pseudotime", "curvature", color="signature"),
                   data=curv_data) +
        geom_line(aes("pseudotime", "curvature"), data=curv_data) +
        scale_color_manual(values=colors) + geom_segment(
            aes(x="pseudotime",
                xend="pseudotime",
                y="ymin",
                yend="curvature"),
            linetype="dotted",
            data=pk_data,
        ) + geom_vline(
            aes(xintercept="starts"), linetype="dashed", data=cc_phase) +
        geom_text(aes(x="labpos", y="y", label="labels"), data=cc_phase))

    return cell_cycle_plt
Example #47
0
class TestThemes(object):
    """Image-comparison tests: the same faceted scatter drawn under each theme."""

    g = (ggplot(mtcars, aes(x='wt', y='mpg', color='factor(gear)')) +
         geom_point() + facet_grid('vs ~ am'))

    def _themed(self, title, theme_obj):
        """Return the base plot with *title*, *theme_obj* and the shared `_theme`."""
        return self.g + labs(title=title) + theme_obj + _theme

    def test_theme_538(self):
        assert self._themed('Theme 538', theme_538()) == 'theme_538'

    def test_theme_bw(self):
        assert self._themed('Theme BW', theme_bw()) == 'theme_bw'

    def test_theme_classic(self):
        assert self._themed('Theme Classic', theme_classic()) == 'theme_classic'

    def test_theme_dark(self):
        assert self._themed('Theme Dark', theme_dark()) == 'theme_dark'

    def test_theme_gray(self):
        assert self._themed('Theme Gray', theme_gray()) == 'theme_gray'

    def test_theme_light(self):
        assert self._themed('Theme Light', theme_light()) == 'theme_light'

    def test_theme_linedraw(self):
        plot = self._themed('Theme Linedraw', theme_linedraw())

        # Small displacement in title on Python 2, hence the tolerance
        expected = ('theme_linedraw', {'tol': 8}) if six.PY2 else 'theme_linedraw'
        assert plot == expected

    def test_theme_matplotlib(self):
        assert self._themed('Theme Matplotlib', theme_matplotlib()) == 'theme_matplotlib'

    def test_theme_minimal(self):
        assert self._themed('Theme Minimal', theme_minimal()) == 'theme_minimal'

    def test_theme_seaborn(self):
        assert self._themed('Theme Seaborn', theme_seaborn()) == 'theme_seaborn'

    def test_theme_void(self):
        assert self._themed('Theme Void', theme_void()) == 'theme_void'

    def test_theme_xkcd(self):
        plot = self._themed('Theme Xkcd', theme_xkcd())

        if not os.environ.get('TRAVIS'):
            assert plot == 'theme_xkcd'
        else:
            # Travis does not have the fonts, we still check
            # to catch any other errors
            assert plot != 'theme_gray'
Example #48
0
    )
    # .assign(
    #    abstract_only_distance_log10=lambda x: -np.log10(x.abstract_only_distance),
    #    full_text_distance_log10=lambda x: -np.log10(x.full_text_distance),
    # )
)
plot_df.head()

# Pearson's R for correlation
# Shows a weak but positive correlation
scipy.stats.pearsonr(plot_df.full_text_distance, plot_df.abstract_only_distance)

# Full-text distance against abstract-only distance, y axis on a log10 scale.
g = (
    p9.ggplot(
        plot_df,
        p9.aes(x="full_text_distance", y="abstract_only_distance"),
    )
    + p9.geom_point(fill="#a6cee3")
    + p9.scale_y_continuous(trans="log10")
    + p9.labs(x="Full Text Distance", y="Abstract Only Distance")
    + p9.theme_seaborn(context="paper", style="ticks", font="Arial", font_scale=1.35)
)
# Same figure written once as vector and once as raster output.
for figure_path in (
    "output/figures/biorxiv_full_text_v_abstract_only.svg",
    "output/figures/biorxiv_full_text_v_abstract_only.png",
):
    g.save(figure_path, dpi=250)
print(g)

# Remove outliers for shape of distribution
g = (
    p9.ggplot(plot_df.query("abstract_only_distance>1e-3"))
    + p9.aes(x="full_text_distance", y="abstract_only_distance")
    + p9.geom_point(fill="#a6cee3")
    + p9.scale_y_continuous(trans="log10")
    + p9.labs(x="Full Text Distance", y="Abstract Only Distance")
def quick_color_check(target_matrix, source_matrix, num_chips):
    """ Quickly plot target matrix values against source matrix values to determine
    over saturated color chips or other issues.

    Inputs:
    target_matrix      = a 22x4 matrix containing the average red value, average green value, and
                             average blue value for each color chip of the target image
    source_matrix      = a 22x4 matrix containing the average red value, average green value, and
                             average blue value for each color chip of the source image
    num_chips          = number of color card chips included in the matrices (integer)

    Columns 1, 2 and 3 of each matrix are read as red, green and blue; column 0
    is not used here. Nothing is returned: depending on `params.debug` the
    plot is saved to `params.debug_outdir` ('print') or printed ('plot').

    :param target_matrix: numpy.ndarray
    :param source_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, facet_grid, geom_label, scale_x_continuous, \
        scale_y_continuous, scale_color_manual, aes
    import pandas as pd

    # One (source, target, color label) block per channel, stacked in
    # blue, green, red order. Mixing floats and labels yields a string array,
    # hence the astype() conversion further down.
    channel_blocks = []
    for color_name, column in (('blue', 3), ('green', 2), ('red', 1)):
        source_values = source_matrix[:num_chips, column:column + 1]
        target_values = target_matrix[:num_chips, column:column + 1]
        color_labels = [color_name] * num_chips
        channel_blocks.append(np.column_stack((source_values, target_values, color_labels)))
    # np.vstack replaces np.row_stack, a deprecated alias of the same function
    all_color_data = np.vstack(channel_blocks)

    # Create a dataframe with headers
    dataset = pd.DataFrame({'source': all_color_data[:, 0], 'target': all_color_data[:, 1],
                            'color': all_color_data[:, 2]})

    # Add chip numbers to the dataframe (0..num_chips-1, repeated per channel),
    # as a 1-D array so it is a plain column
    dataset['chip'] = np.tile(np.arange(0, num_chips), 3)
    dataset = dataset.astype({'color': str, 'chip': str, 'target': float, 'source': float})

    # Make the plot
    p1 = ggplot(dataset, aes(x='target', y='source', color='color', label='chip')) + \
        geom_point(show_legend=False, size=2) + \
        geom_smooth(method='lm', size=.5, show_legend=False) + \
        theme_seaborn() + facet_grid('.~color') + \
        geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False) + \
        scale_x_continuous(limits=(-5, 270)) + scale_y_continuous(limits=(-5, 275)) + \
        scale_color_manual(values=['blue', 'green', 'red'])

    # Emit the plot according to the debug mode; any other value emits nothing
    if params.debug == 'print':
        p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'))
    elif params.debug == 'plot':
        print(p1)
Example #50
0
File: jmlr.py Project: Pinafore/qb
def syntactic_diversity_plots():
    """Draw the three syntactic-diversity figures and save them as PDFs.

    Reads 'data/external/syntactic_diversity_table.json', derives the ratio of
    unique parses to parses, and writes 'syn_div_plot.pdf' (both metrics,
    faceted), 'n_unique_parses.pdf' and 'parse_ratio.pdf' under `output_path`.
    """
    with open('data/external/syntactic_diversity_table.json') as f:
        parse_df = pd.DataFrame(json.load(f))
    parse_df['parse_ratio'] = parse_df['unique_parses'] / parse_df['parses']
    melt_df = pd.melt(
        parse_df,
        id_vars=['dataset', 'depth', 'overlap', 'parses'],
        value_vars=['parse_ratio', 'unique_parses'],
        var_name='metric',
        value_name='y'
    )

    facet_titles = {
        'parse_ratio': 'Average Unique Parses per Instance',
        'unique_parses': 'Count of Unique Parses',
    }

    def label_facet(name):
        # Unknown metric names map to None
        return facet_titles.get(name)

    def label_y(ys):
        # Render whole thousands as e.g. '12K'; everything else unchanged
        return [
            text[:-5] + 'K' if text.endswith('000.0') else text
            for text in map(str, ys)
        ]

    def depth_axis():
        # A fresh x scale per plot: integer depths 1..10
        return scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])

    # Both metrics stacked in one figure, each facet with its own y scale
    faceted_plot = (
        ggplot(melt_df)
        + aes(x='depth', y='y', color='dataset')
        + facet_wrap('metric', scales='free_y', nrow=2, labeller=label_facet)
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth') + ylab('')
        + scale_color_discrete(name='Dataset')
        + scale_y_continuous(labels=label_y)
        + depth_axis()
        + theme_fs()
    )
    faceted_plot.save(path.join(output_path, 'syn_div_plot.pdf'))

    # Raw count of unique parses
    count_plot = (
        ggplot(parse_df)
        + aes(x='depth', y='unique_parses', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Count of Unique Parses')
        + scale_color_discrete(name='Dataset')
        + depth_axis()
        + theme_fs()
    )
    count_plot.save(path.join(output_path, 'n_unique_parses.pdf'))

    # Ratio of unique parses, y axis pinned to [0, 1]
    ratio_plot = (
        ggplot(parse_df)
        + aes(x='depth', y='parse_ratio', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Average Unique Parses per Instance')
        + scale_color_discrete(name='Dataset')
        + depth_axis()
        + scale_y_continuous(limits=[0, 1])
        + theme_fs()
    )
    ratio_plot.save(path.join(output_path, 'parse_ratio.pdf'))
import pandas as pd
# Red wine quality table, one row per wine.
df = pd.read_csv("/home/shaury/Downloads/nptel/winequality-red.csv",
                 delimiter=",")

# Quick look: sulphates against fixed acidity, coloured by quality.
from plotnine import ggplot, geom_point, aes
ggplot(df) + geom_point(
    mapping=aes(x=df["fixed acidity"], y=df["sulphates"], color=df["quality"]))

# Features are every column used below; the target is the quality score.
from sklearn.model_selection import train_test_split
x, y = df[[
    "fixed acidity", "volatile acidity", "citric acid", "residual sugar",
    "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
    "pH", "sulphates", "alcohol"
]], df["quality"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15)

from sklearn.linear_model import LogisticRegression as LR
ld = LR(solver='liblinear', random_state=0)
ld.fit(x_train, y_train)
y_pred = ld.predict(x_test)

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but the
# confusion matrix is not: with this order rows are true classes and columns
# are predicted classes.
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))
Example #52
0
# Julian days recorded for each nesting phase.
incubation_days = iglo_nest[iglo_nest.type == "incubation"].julian
hatch_days = iglo_nest[iglo_nest.type == "hatch"].julian

# Incubation window and the midpoint used to place its label.
inc_start = incubation_days.min()
inc_end = incubation_days.max()
inc_lbl_pos = inc_start + (inc_end - inc_start) / 2

# Hatch window, cut off two days after the last day present in `iglo`.
hatch_start = hatch_days.min()
hatch_end = min(hatch_days.max(), iglo.julian.max() + 2)
hatch_lbl_pos = hatch_start + (hatch_end - hatch_start) / 2

# Horizontal extent covering both the nesting phases and the `iglo` data.
xmin = min(inc_start, iglo.julian.min())
xmax = min(hatch_days.max(), iglo.julian.max() + 2)

(ggplot(data=iglo, mapping=aes(x='julian', y='ACI_mean', colour='site'))
 #+ facet_grid("panel~", scales="free")
 + xlab("Day") + ylab("Mean daily ACI (standardized)") + geom_point() +
 theme(legend_position="none")
 # + geom_errorbar(aes(ymin="ACI_mean - ACI_std", ymax="ACI_mean + ACI_std"))
 + geom_smooth(method="mavg",
               se=False,
               method_args={
                   "window": 4,
                   "center": True,
                   "min_periods": 1
               }) + annotate("rect",
                             xmin=[inc_start, hatch_start],
                             xmax=[inc_end, hatch_end],
                             ymin=-math.inf,
                             ymax=math.inf,
                             alpha=0.1,
                             fill=["red", "blue"]) +
Example #53
0
def _hourly_attack_counts(records):
    """Count 'sy' rows of *records* per hour, ignoring rows without year or month."""
    symptoms = records[records.Group == 'sy']
    dated = symptoms[symptoms.year.notna() & symptoms.month.notna()]
    counts = pd.DataFrame(dated.groupby("hour", as_index=False).count())
    # Keep the hour and the first counted column only.
    # NOTE(review): the rename assumes that column is "Unnamed: 0" - confirm.
    counts = counts.iloc[:, 0:2]
    return counts.rename(columns={"Unnamed: 0": "n"})


def _hourly_attack_plot(counts, with_line):
    """Build the attacks-per-hour plot; *with_line* also connects the points."""
    plot = (p9.ggplot(data=counts, mapping=p9.aes(x='hour', y='n'))
            + p9.geom_point(color='red', size=10))
    if with_line:
        plot = plot + p9.geom_line(color='red', size=1)
    return (plot
            + p9.theme_classic()
            + p9.theme(axis_text=p9.element_text(size=40),
                       axis_title=p9.element_text(size=40, face='bold'))
            + p9.coord_cartesian(xlim=(1, 25))
            + p9.labs(x='Hours', y='No. of attacks')
            + p9.scale_x_discrete(limits=(range(1, 25))))


def day_night_attacks(Data, Data_m):
    """Plot the number of symptom records per hour, monthly and all-time.

    Both inputs are filtered to rows whose `Group` is 'sy' and that have a
    year and a month, then counted per `hour`. The monthly plot (points
    only) is saved as 'Graph_3.jpeg' and the all-time plot (points and line)
    as 'Graph_ALL_3.jpeg', both under "pdf/iteration/". A plot is skipped,
    with a printed notice, when its hourly table is empty.

    :param Data: DataFrame of all records (`Group`, `year`, `month`, `hour`).
    :param Data_m: DataFrame of the monthly records, same columns.
    :return: None
    """
    print('======= Creating day_night_attacks =======')
    Test_3 = _hourly_attack_counts(Data)
    Test_3_m = _hourly_attack_counts(Data_m)

    # Creating and saving MONTHLY Graph_3
    if len(Test_3_m) > 0:
        # The plot saves itself, so no `plot=` keyword is passed: extra
        # keywords would be forwarded to the figure writer.
        _hourly_attack_plot(Test_3_m, with_line=False).save(
            filename='Graph_3.jpeg',
            path="pdf/iteration/",
            width=25, height=5,
            dpi=320)
    else:
        print('Plot not created; no data found.')

    # Creating and saving EVER Graph_3
    # Guarded on the plotted table itself, like the monthly plot.
    if len(Test_3) > 0:
        _hourly_attack_plot(Test_3, with_line=True).save(
            filename='Graph_ALL_3.jpeg',
            path="pdf/iteration/",
            width=25, height=5,
            dpi=320)
    else:
        print('Plot not created; no data found.')

    print('=================================day_night_attacks DONE =============================')
    return None
def plot_scatter(dat, figsize=(16, 12)):
    """Return a faceted scatter plot of ``response`` against ``val``.

    The plot maps the ``val`` column of *dat* to x and ``response`` to y,
    overlays a linear-model (``lm``) smoother, and draws one panel per value
    of the ``var`` column with independent x scales.

    :param dat: data passed to ``pn.ggplot``; must provide the ``val``,
        ``response`` and ``var`` columns referenced by the plot.
    :param figsize: value forwarded as the theme's ``figure_size``.
    :return: the assembled plot object (not drawn or saved here).
    """
    plot = pn.ggplot(dat, pn.aes(x='val', y='response'))
    # Components are added in the same order as a single chained expression.
    components = (
        pn.geom_point(),
        pn.geom_smooth(method='lm'),
        pn.facet_wrap("var", scales='free_x'),
        pn.theme_bw(),
        # Extra vertical spacing between the facet rows.
        pn.theme(figure_size=figsize, subplots_adjust={'hspace': 0.25}),
    )
    for component in components:
        plot = plot + component
    return plot
Example #55
0
    def plot_char_percent_vs_accuracy_smooth(
        self, expo=False, no_models=False, columns=False
    ):
        """Plot accuracy against the fraction of the question revealed.

        Reads ``self.y_max``, ``self.save_df`` and ``self.char_plot_df`` in
        both modes; the ``expo`` mode additionally reads ``self.no_humans``,
        ``self.merge_humans``, ``self.rounds``, ``self.mvg_avg_char`` and
        ``self.title``.

        :param expo: when false, return a simple plot of
            ``self.char_plot_df`` colored by ``Guessing_Model`` and smoothed
            with a moving average. When true, build the faceted plot (one
            panel per ``Guessing_Model``, colored by ``Dataset``) and, if
            ``data/external/all_human_gameplay.json`` exists and
            ``self.no_humans`` is false, include the human gameplay curves.
        :param no_models: (``expo`` only) plot just the human data as points
            instead of the model data.
        :param columns: (``expo`` only) stack the facets in a single column
            instead of a single row.
        :return: the assembled plot object (not drawn or saved here).
        :raises ValueError: if ``no_models`` is true but no human gameplay
            data could be loaded (previously this surfaced as an
            ``UnboundLocalError`` on ``human_df``).
        """
        human_path = "data/external/all_human_gameplay.json"

        def accuracy_frame(correct_positions, wrong_positions, dataset, player):
            """Return cumulative-accuracy rows for one player group/dataset."""
            positions = np.array(correct_positions + wrong_positions)
            # 1 for every correct answer, 0 for every wrong one, aligned with
            # ``positions``.
            outcomes = np.array(
                len(correct_positions) * [1] + len(wrong_positions) * [0]
            )
            order = np.argsort(positions)
            sorted_outcomes = outcomes[order]
            frame = pd.DataFrame(
                {
                    # Running count of correct answers, normalised by the
                    # total number of answers.
                    "correct": sorted_outcomes.cumsum() / sorted_outcomes.shape[0],
                    "char_percent": positions[order],
                }
            )
            frame["Dataset"] = dataset
            # The leading space is part of the label the original code emits.
            frame["Guessing_Model"] = f" {player}"
            return frame

        if self.y_max is not None:
            limits = [0, float(self.y_max)]
            eprint(f"Setting limits to: {limits}")
        else:
            limits = [0, 1]

        if not expo:
            if self.save_df is not None:
                eprint(f"Saving df to: {self.save_df}")
                # The plotted frame is self.char_plot_df; no local ``df``
                # exists on this path, so it must be referenced directly.
                self.char_plot_df.to_json(self.save_df)
            return (
                ggplot(self.char_plot_df)
                + aes(x="char_percent", y="correct", color="Guessing_Model")
                + stat_smooth(method="mavg", se=False, method_args={"window": 500})
                + scale_y_continuous(breaks=np.linspace(0, 1, 6))
                + coord_cartesian(ylim=limits)
            )

        human_df = None
        if os.path.exists(human_path) and not self.no_humans:
            with open(human_path) as f:
                all_gameplay = json.load(f)

            frames = []
            for event, name in [
                ("parents", "Intermediate"),
                ("maryland", "Expert"),
                ("live", "National"),
            ]:
                if self.merge_humans:
                    name = "Human"
                gameplay = all_gameplay[event]

                # The "live" event has no control (regular test) entries.
                if event != "live":
                    frames.append(
                        accuracy_frame(
                            gameplay["control_correct_positions"],
                            gameplay["control_wrong_positions"],
                            "Regular Test",
                            name,
                        )
                    )

                frames.append(
                    accuracy_frame(
                        gameplay["adv_correct_positions"],
                        gameplay["adv_wrong_positions"],
                        "IR Adversarial",
                        name,
                    )
                )

                if len(gameplay["advneural_correct_positions"]) > 0:
                    frames.append(
                        accuracy_frame(
                            gameplay["advneural_correct_positions"],
                            gameplay["advneural_wrong_positions"],
                            "RNN Adversarial",
                            name,
                        )
                    )

            human_df = pd.concat(frames)
            # Ordered categoricals fix the facet and legend ordering.
            human_vals = sort_humans(list(human_df["Guessing_Model"].unique()))
            human_dtype = CategoricalDtype(human_vals, ordered=True)
            human_df["Guessing_Model"] = human_df["Guessing_Model"].astype(
                human_dtype
            )
            dataset_dtype = CategoricalDtype(
                ["Regular Test", "IR Adversarial", "RNN Adversarial"],
                ordered=True,
            )
            human_df["Dataset"] = human_df["Dataset"].astype(dataset_dtype)

        if no_models:
            if human_df is None:
                raise ValueError(
                    "no_models=True requires human gameplay data, but "
                    f"{human_path} is missing or humans are disabled"
                )
            p = ggplot(human_df) + geom_point(shape=".")
            chart = None
        else:
            df = self.char_plot_df
            if 1 not in self.rounds:
                df = df[df["Dataset"] != "Round 1 - IR Adversarial"]
            if 2 not in self.rounds:
                df = df[df["Dataset"] != "Round 2 - IR Adversarial"]
                df = df[df["Dataset"] != "Round 2 - RNN Adversarial"]
            p = ggplot(df)
            if self.save_df is not None:
                eprint(f"Saving df to: {self.save_df}")
                df.to_json(self.save_df)

            if human_df is not None:
                eprint("Loading human data")
                p = p + geom_line(data=human_df)

            if self.mvg_avg_char:
                chart = stat_smooth(
                    method="mavg", se=False, method_args={"window": 400}
                )
            else:
                chart = stat_summary_bin(
                    fun_data=mean_no_se,
                    bins=20,
                    shape=".",
                    linetype="None",
                    size=0.5,
                )

        if columns:
            facet_conf = facet_wrap("Guessing_Model", ncol=1)
        else:
            facet_conf = facet_wrap("Guessing_Model", nrow=1)

        p = p + facet_conf + aes(x="char_percent", y="correct", color="Dataset")
        if chart is not None:
            p += chart
        p = (
            p
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + scale_x_continuous(breaks=[0, 0.5, 1])
            + coord_cartesian(ylim=limits)
            + xlab("Percent of Question Revealed")
            + ylab("Accuracy")
            + theme(
                # legend_position='top', legend_box_margin=0, legend_title=element_blank(),
                strip_text_x=element_text(margin={"t": 6, "b": 6, "l": 1, "r": 5})
            )
            + scale_color_manual(
                values=["#FF3333", "#66CC00", "#3333FF", "#FFFF33"],
                name="Questions",
            )
        )
        if self.title != "":
            p += ggtitle(self.title)

        return p
def plot_predictions_actual(pred_df, figsize):
    """Return a plot of the ``pred`` column against the ``y`` column.

    Points are drawn for every row, a semi-transparent ribbon spans the
    ``lb``..``ub`` columns, and a slope-1 / intercept-0 reference line is
    added.

    :param pred_df: data passed to ``pn.ggplot``; must provide the ``y``,
        ``pred``, ``lb`` and ``ub`` columns referenced by the plot.
    :param figsize: value forwarded as the theme's ``figure_size``.
    :return: the assembled plot object (not drawn or saved here).
    """
    plot = pn.ggplot(pred_df, pn.aes(x='y', y='pred'))
    # Components are added in the same order as a single chained expression.
    components = (
        pn.geom_point(),
        pn.geom_ribbon(pn.aes(ymin='lb', ymax='ub'), alpha=0.3),
        pn.geom_abline(slope=1, intercept=0),
        pn.theme_bw(),
        pn.theme(figure_size=figsize),
    )
    for component in components:
        plot = plot + component
    return plot
Example #57
0
def quick_color_check(target_matrix, source_matrix, num_chips):
    """ Quickly plot target matrix values against source matrix values to determine
    over saturated color chips or other issues.

    Inputs:
    target_matrix      = a matrix whose columns 1, 2 and 3 hold the average red, green and blue
                             value for each color chip of the target image (one row per chip)
    source_matrix      = a matrix whose columns 1, 2 and 3 hold the average red, green and blue
                             value for each color chip of the source image (one row per chip)
    num_chips          = number of color card chips included in the matrices (integer)

    The plot is saved to params.debug_outdir when params.debug is 'print' and printed when it is
    'plot'; nothing is returned.

    :param target_matrix: numpy.ndarray
    :param source_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, facet_grid, geom_label, scale_x_continuous, \
        scale_y_continuous, scale_color_manual, aes
    import pandas as pd

    # Extract and organize matrix info (column 0 is not used here; slices keep a 2-D column shape)
    tr = target_matrix[:num_chips, 1:2]
    tg = target_matrix[:num_chips, 2:3]
    tb = target_matrix[:num_chips, 3:4]
    sr = source_matrix[:num_chips, 1:2]
    sg = source_matrix[:num_chips, 2:3]
    sb = source_matrix[:num_chips, 3:4]

    # Create columns of color labels
    red = ['red'] * num_chips
    blue = ['blue'] * num_chips
    green = ['green'] * num_chips

    # Make a column of chip numbers, repeated once per color channel
    chip = np.arange(0, num_chips).reshape((num_chips, 1))
    chips = np.vstack((chip, chip, chip))

    # Combine info (np.vstack replaces the deprecated np.row_stack alias)
    color_data_r = np.column_stack((sr, tr, red))
    color_data_g = np.column_stack((sg, tg, green))
    color_data_b = np.column_stack((sb, tb, blue))
    all_color_data = np.vstack((color_data_b, color_data_g, color_data_r))

    # Create a dataframe with headers
    dataset = pd.DataFrame({
        'source': all_color_data[:, 0],
        'target': all_color_data[:, 1],
        'color': all_color_data[:, 2]
    })

    # Add chip numbers to the dataframe; the stacked array holds strings, so
    # restore numeric types for the plotted values
    dataset['chip'] = chips
    dataset = dataset.astype({
        'color': str,
        'chip': str,
        'target': float,
        'source': float
    })

    # Make the plot
    p1 = (
        ggplot(dataset, aes(x='target', y='source', color='color', label='chip'))
        + geom_point(show_legend=False, size=2)
        + geom_smooth(method='lm', size=.5, show_legend=False)
        + theme_seaborn()
        + facet_grid('.~color')
        + geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False)
        + scale_x_continuous(limits=(-5, 270))
        + scale_y_continuous(limits=(-5, 275))
        + scale_color_manual(values=['blue', 'green', 'red'])
    )

    # Autoincrement the device counter
    params.device += 1

    # Save or display the plot depending on the debug mode
    if params.debug is not None:
        if params.debug == 'print':
            p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'))
        elif params.debug == 'plot':
            print(p1)
def plot_predictions_residuals(pred_df, figsize):
    """Return a plot of the ``resid`` column against the ``y`` column.

    Points are drawn for every row and a horizontal reference line is added
    at zero.

    :param pred_df: data passed to ``pn.ggplot``; must provide the ``y`` and
        ``resid`` columns referenced by the plot.
    :param figsize: value forwarded as the theme's ``figure_size``.
    :return: the assembled plot object (not drawn or saved here).
    """
    plot = pn.ggplot(pred_df, pn.aes(x='y', y='resid'))
    # Components are added in the same order as a single chained expression.
    components = (
        pn.geom_point(),
        pn.geom_hline(yintercept=0),
        pn.theme_bw(),
        pn.theme(figure_size=figsize),
    )
    for component in components:
        plot = plot + component
    return plot
#
#  (C) Copyright 2021  Pavel Tisnovsky
#
#  All rights reserved. This program and the accompanying materials
#  are made available under the terms of the Eclipse Public License v1.0
#  which accompanies this distribution, and is available at
#  http://www.eclipse.org/legal/epl-v10.html
#
#  Contributors:
#      Pavel Tisnovsky
#

from plotnine import ggplot, geom_point, aes, stat_smooth
from plotnine.data import mtcars

# Plot "mpg" against "wt" from the mtcars data set, colored by gear treated
# as a categorical factor.
g = ggplot(mtcars, aes("wt", "mpg", color="factor(gear)"))
# One point per row.
g = g + geom_point()
# Linear-model smoother on top of the points.
g = g + stat_smooth(method="lm")

# Write the finished plot to disk.
g.save("10.png")
Example #60
0
# Keep only users with more than 20 records. Take an explicit copy so the
# column assignments below write to an independent frame rather than to a
# slice of the original (avoids pandas' SettingWithCopyWarning).
user_stat = user_stat.loc[user_stat.n_records > 20].copy()
print(len(user_stat))
print(len(df.loc[df.user_n_records > 20]))
print(len(df))
print(len(set(df.qid)))

# Log-scaled record count, and the same value normalised by its maximum for
# use as the point transparency.
user_stat['log_n_records'] = np.log(user_stat.n_records)
max_color = user_stat.log_n_records.max()
user_stat['alpha'] = user_stat.log_n_records / max_color


# Scatter of accuracy against ratio; size, color and alpha all encode the
# record count, with every legend suppressed.
p0 = (
    ggplot(user_stat)
    + geom_point(aes(x='ratio', y='accuracy',
                     size='n_records', color='log_n_records', alpha='alpha'),
                 show_legend={'color': False, 'alpha': False, 'size': False})
    + scale_color_gradient(high='#e31a1c', low='#ffffcc')
    + theme(aspect_ratio=1)
)
p0.save('protobowl_users.pdf')
# p0.draw()
print('p0 done')


# Density-scaled histogram of the log record counts with a density curve
# drawn over it.
p1 = (
    ggplot(user_stat, aes(x='log_n_records', y='..density..'))
    + geom_histogram(color='#e6550d', fill='#fee6ce')
    + geom_density()
    + theme(aspect_ratio=0.3)
)
p1.save('protobowl_hist.pdf')
# p1.draw()
print('p1 done')