Exemple #1
0
def plot():
    """Draw the Protobowl per-user summary figures and save them as PDFs.

    Records are loaded with ``load_protobowl``, averaged per ``uid`` and
    plotted four ways: a 2D scatter of accuracy against buzzing position, and
    histogram/density plots of the log record count, accuracy and buzzing
    position. All files are written under ``output/protobowl/``.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    records, questions = load_protobowl()
    # Anything that is not literally True counts as an incorrect answer.
    records.result = records.result.apply(lambda x: x is True)
    records['log_n_records'] = records.user_n_records.apply(np.log)

    user_stat = records.groupby('uid').agg(np.mean)
    print(f'{len(user_stat)} users')
    print(f'{len(records)} records')
    print(f'{len(set(records.qid))} questions')
    # Scale the log record count by its maximum; used as point opacity.
    max_color = user_stat.log_n_records.max()
    user_stat['alpha'] = user_stat.log_n_records / max_color

    # 2D user plot
    user_points = geom_point(
        aes(x='relative_position', y='result', size='user_n_records',
            color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False})
    p0 = (ggplot(user_stat)
          + user_points
          + scale_color_gradient(high='#e31a1c', low='#ffffcc')
          + labs(x='Average buzzing position', y='Accuracy')
          + theme(aspect_ratio=1))
    p0.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    # histogram of number of records
    p1 = (ggplot(user_stat, aes(x='log_n_records', y='..density..'))
          + geom_histogram(color='#e6550d', fill='#fee6ce')
          + geom_density()
          + labs(x='Log number of records', y='Density')
          + theme(aspect_ratio=0.3))
    p1.save(os.path.join(outdir, 'protobowl_hist.pdf'))
    print('p1 done')

    # histogram of accuracy
    p2 = (ggplot(user_stat, aes(x='result', y='..density..'))
          + geom_histogram(color='#31a354', fill='#e5f5e0')
          + geom_density()
          + labs(x='Accuracy', y='Density')
          + theme(aspect_ratio=0.3))
    p2.save(os.path.join(outdir, 'protobowl_acc.pdf'))
    print('p2 done')

    # histogram of buzzing position
    p3 = (ggplot(user_stat, aes(x='relative_position', y='..density..'))
          + geom_histogram(color='#3182bd', fill='#deebf7')
          + geom_density()
          + labs(x='Average buzzing position', y='Density')
          + theme(aspect_ratio=0.3))
    p3.save(os.path.join(outdir, 'protobowl_pos.pdf'))
    print('p3 done')
Exemple #2
0
def plot():
    """Save the per-user Protobowl scatter plot and three density histograms.

    The records returned by ``load_protobowl`` are averaged per ``uid``; the
    resulting table drives every figure written to ``output/protobowl/``.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    def save_density_hist(stats, column, xlabel, color, fill, filename):
        # Histogram of `column` with a density curve layered on top.
        figure = (ggplot(stats, aes(x=column, y='..density..'))
                  + geom_histogram(color=color, fill=fill)
                  + geom_density()
                  + labs(x=xlabel, y='Density')
                  + theme(aspect_ratio=0.3))
        figure.save(os.path.join(outdir, filename))

    df = load_protobowl()
    # Only a literal True is kept as a correct answer.
    df.result = df.result.apply(lambda x: x is True)
    df['log_n_records'] = df.user_n_records.apply(np.log)

    user_stat = df.groupby('uid').agg(np.mean)
    print('{} users'.format(len(user_stat)))
    print('{} records'.format(len(df)))
    max_color = user_stat.log_n_records.max()
    user_stat['alpha'] = user_stat.log_n_records / max_color

    # 2D user plot
    point_layer = geom_point(
        aes(x='relative_position', y='result', size='user_n_records',
            color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False})
    p0 = (ggplot(user_stat)
          + point_layer
          + scale_color_gradient(high='#e31a1c', low='#ffffcc')
          + labs(x='Average buzzing position', y='Accuracy')
          + theme(aspect_ratio=1))
    p0.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    # histogram of number of records
    save_density_hist(user_stat, 'log_n_records', 'Log number of records',
                      '#e6550d', '#fee6ce', 'protobowl_hist.pdf')
    print('p1 done')

    # histogram of accuracy
    save_density_hist(user_stat, 'result', 'Accuracy',
                      '#31a354', '#e5f5e0', 'protobowl_acc.pdf')
    print('p2 done')

    # histogram of buzzing position
    save_density_hist(user_stat, 'relative_position',
                      'Average buzzing position',
                      '#3182bd', '#deebf7', 'protobowl_pos.pdf')
    print('p3 done')
Exemple #3
0
def test_few_datapoints():
    """Check the warnings raised without a bandwidth, then the ``bw=.1`` plot.

    The first plot leaves the bandwidth unset and must warn with both expected
    messages; the second sets ``bw=.1`` and is compared to 'few_datapoints'.
    """
    data = pd.DataFrame({'x': [1, 2, 2, 3, 3, 3], 'z': list('abbccc')})

    # Bandwidth not set
    unset_bw = (ggplot(data, aes('x', color='z'))
                + geom_density()
                + lims(x=(-3, 9)))
    with pytest.warns(PlotnineWarning) as record:
        unset_bw.draw_test()

    # Collect the message texts once so they can be scanned repeatedly.
    messages = [str(r.message) for r in record]
    assert any('e.g `bw=0.1`' in text for text in messages)
    assert any('Groups with fewer than 2' in text for text in messages)

    explicit_bw = (ggplot(data, aes('x', color='z'))
                   + geom_density(bw=.1)
                   + lims(x=(0, 4)))
    assert explicit_bw == 'few_datapoints'
def plot_replicate_density(
    df,
    batch,
    plate,
    output_file_base=None,
    output_file_extensions=None,
    dpi=300,
    height=1.5,
    width=2,
):
    """Build (and optionally save) a density plot of pairwise correlations.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``pairwise_correlation`` column (x axis) and the
        ``replicate_info`` column (fill).
    batch, plate
        Formatted into the plot title as ``"<batch>: <plate>"``.
    output_file_base : optional
        When truthy, the plot is handed to ``save_figure`` together with the
        extensions, dpi, height and width.
    output_file_extensions : list of str, optional
        Defaults to ``[".png", ".pdf", ".svg"]``.
    dpi, height, width
        Forwarded unchanged to ``save_figure``.

    Returns
    -------
    The ggplot object.
    """
    # A fresh list per call: a list literal as the default value would be one
    # shared object that any callee could mutate for all later calls.
    if output_file_extensions is None:
        output_file_extensions = [".png", ".pdf", ".svg"]

    density_gg = (
        gg.ggplot(df, gg.aes(x="pairwise_correlation", fill="replicate_info"))
        + gg.geom_density(alpha=0.3) + gg.scale_fill_manual(
            name="Replicate",
            labels={
                "True": "True",
                "False": "False"
            },
            values=["#B99638", "#2DB898"],
        ) + gg.xlab("Pearson Correlation") + gg.ylab("Density") +
        gg.ggtitle("{}: {}".format(batch, plate)) + gg.theme_bw() + gg.theme(
            title=gg.element_text(size=9),
            axis_text=gg.element_text(size=5),
            axis_title=gg.element_text(size=8),
            legend_text=gg.element_text(size=6),
            legend_title=gg.element_text(size=7),
            strip_text=gg.element_text(size=4, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        ))

    if output_file_base:
        save_figure(density_gg, output_file_base, output_file_extensions, dpi,
                    height, width)

    return density_gg
Exemple #5
0
    def show_community_prediction(
        self,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        num_samples: int = 1000,
    ):
        """
        Plot samples from the community prediction on this question

        :param percent_kept: percentage of sample distribution to keep
        :param side_cut_from: which side to cut tails from, either 'both','lower', or 'upper'
        :param num_samples: number of samples from the community
        :return: ggplot graphics object
        """
        community_samples = pd.DataFrame(data={
            "samples":
            [self.sample_community() for _ in range(0, num_samples)]
        }  # type: ignore
                                         )

        (_xmin,
         _xmax) = self.get_central_quantiles(community_samples,
                                             percent_kept=percent_kept,
                                             side_cut_from=side_cut_from)
        question_title = (
            f"Q: {self.name}" if self.name else
            "\n".join(textwrap.wrap(self.data["title"], 60))  # type: ignore
        )
        # Append the separator outside the conditional. Written inline it bound
        # only to the else-branch, so a named question produced the title
        # "Q: <name>Community Predictions" with nothing in between.
        title_name = question_title + "\n\n"
        return (ggplot(community_samples, aes("samples")) +
                geom_density(fill="#b3cde3", alpha=0.8) + xlim(_xmin, _xmax) +
                self._scale_x() +
                labs(x="Prediction",
                     y="Density",
                     title=title_name + "Community Predictions") + ergo_theme)
Exemple #6
0
 def comparison_plot(self,
                     df: pd.DataFrame,
                     xmin=None,
                     xmax=None,
                     bw="normal_reference",
                     **kwargs):
     """Overlay one filled density per group.

     The first column of ``df`` selects the fill group and the second column
     holds the plotted values. ``bw`` is passed to ``geom_density``; ``xmin``
     and ``xmax`` go to ``self._scale_x``. Extra keyword arguments are accepted
     but not used here.
     """
     value_column = df.columns[1]
     group_column = df.columns[0]
     palette = scale_fill_brewer(type="qual", palette="Pastel1")
     density = geom_density(bw=bw, alpha=0.8)

     figure = ggplot(df, aes(value_column, fill=group_column))
     figure = figure + palette + density + ggtitle(self.plot_title)
     return figure + self._scale_x(xmin, xmax) + ergo_theme
def density_plot1(num_matches_per_round: int,
                  match_lengths_from_one_round: list):
    """ Density plot for match lengths, new rules, no blowouts, 85 matches/round

    Saves the figure to 'figures/match_length_density_plot.png' with a vertical
    line drawn at x=50. ``num_matches_per_round`` is accepted but not used.
    """
    lengths_df = pd.DataFrame(
        {'Match length': match_lengths_from_one_round})

    layers = (
        plt.geom_density(),
        plt.geom_vline(xintercept=50, color='black', size=2),
        plt.theme_classic(),
        plt.xlim([0, 55]),
    )
    figure = plt.ggplot(lengths_df, plt.aes(x='Match length'))
    for layer in layers:
        figure = figure + layer
    figure.save(filename='figures/match_length_density_plot.png')
Exemple #8
0
 def density_plot(
     self,
     df: pd.DataFrame,
     xmin=None,
     xmax=None,
     fill: str = "#fbb4ae",
     bw="normal_reference",
     **kwargs,
 ):
     """Plot a filled density of the first column of ``df``.

     :param df: data frame whose first column holds the values to plot
     :param xmin: lower x bound, passed to ``self._scale_x``
     :param xmax: upper x bound, passed to ``self._scale_x``
     :param fill: fill colour of the density area
     :param bw: bandwidth passed to ``geom_density``
     :param kwargs: accepted but not used here
     :return: ggplot graphics object
     """
     # Forward bw, as comparison_plot does; it was previously accepted but
     # dropped, so callers could not change the bandwidth.
     return (ggplot(df, aes(df.columns[0])) +
             geom_density(bw=bw, fill=fill, alpha=0.8) +
             ggtitle(self.plot_title) + self._scale_x(xmin, xmax) +
             ergo_theme)
Exemple #9
0
def plot_continuous_distribution(data_table,
                                 continuous_metric_name,
                                 segment_name,
                                 title,
                                 xlim=None):
    """Plot one density curve of a continuous metric per segment.

    Parameters
    ----------
    data_table : pandas.DataFrame
        Source data; must contain ``continuous_metric_name`` and
        ``segment_name`` as columns.
    continuous_metric_name : str
        Column mapped to the x axis.
    segment_name : str
        Column mapped to the line colour.
    title : str
        Plot title.
    xlim : optional
        Passed to ``plot.xlim`` when not None.

    Returns
    -------
    The ggplot object.
    """
    # Drop rows missing either the metric or the segment label. The original
    # tested the metric column twice, so null segments were never removed.
    keep = pd.notnull(data_table[continuous_metric_name])
    if segment_name is not None:
        keep = keep & pd.notnull(data_table[segment_name])
    filtered_data = data_table[keep]

    result = plot.ggplot(data=filtered_data) + plot.aes(x=continuous_metric_name, color=segment_name) + \
             plot.geom_density() + plot.labs(x=continuous_metric_name, title=title, fill=segment_name)

    # `pd.notnull(xlim)` yields an array for a (low, high) pair, and using an
    # array as an `if` condition raises ValueError; test for None instead.
    if xlim is not None:
        result = result + plot.xlim(xlim)
    return result
Exemple #10
0
def create_confidence_plot(conf_df):
    """Return density curves of column ``x`` per ``Method``, faceted by ``Task``.

    The y axis text, ticks and title and the legend title are blanked; the
    legend sits on top in a horizontal box.
    """
    overrides = theme(
        axis_text_y=element_blank(),
        axis_ticks_major_y=element_blank(),
        axis_title_y=element_blank(),
        legend_title=element_blank(),
        legend_position='top',
        legend_box='horizontal',
    )
    mapping = aes(x='x', color='Method', fill='Method')
    return (
        ggplot(conf_df)
        + mapping
        + geom_density(alpha=.45)
        + facet_wrap('Task', nrow=4)
        + xlab('Confidence')
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
        + theme_fs()
        + overrides
    )
Exemple #11
0
def plot_replicate_density(
    df,
    batch,
    plate,
    cutoff,
    percent_strong,
    output_file_base=None,
    output_file_extensions=None,
    dpi=300,
    height=1.5,
    width=2,
    return_plot=False,
):
    """Build a replicate-similarity density plot with a cutoff marker.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``similarity_metric`` column (x axis) and the
        ``group_replicate`` column (fill).
    batch, plate
        Shown in the plot title.
    cutoff
        x position of the dashed red vertical line.
    percent_strong
        Fraction shown in the title as a percentage rounded to 2 decimals.
    output_file_base : optional
        When truthy, the plot is handed to ``save_figure`` together with the
        extensions, dpi, height and width.
    output_file_extensions : list of str, optional
        Defaults to ``[".png", ".pdf", ".svg"]``.
    dpi, height, width
        Forwarded unchanged to ``save_figure``.
    return_plot : bool
        When true the ggplot object is returned; otherwise the function
        returns None.
    """
    # A fresh list per call: a list literal as the default value would be one
    # shared object that any callee could mutate for all later calls.
    if output_file_extensions is None:
        output_file_extensions = [".png", ".pdf", ".svg"]

    density_gg = (
        gg.ggplot(df, gg.aes(x="similarity_metric", fill="group_replicate"))
        + gg.geom_density(alpha=0.3)
        + gg.scale_fill_manual(
            name="Replicate",
            labels={"True": "True", "False": "False"},
            values=["#B99638", "#2DB898"],
        )
        + gg.xlab("Pearson Correlation")
        + gg.ylab("Density")
        + gg.geom_vline(xintercept=cutoff, color="red", linetype="dashed")
        + gg.ggtitle(
            f"{batch}; Plate: {plate}\n\nPercent Replicating: {np.round(percent_strong * 100, 2)}%"
        )
        + gg.theme_bw()
        + gg.theme(
            title=gg.element_text(size=3.5),
            axis_text=gg.element_text(size=4),
            axis_title=gg.element_text(size=4),
            legend_text=gg.element_text(size=4),
            legend_title=gg.element_text(size=4),
            strip_text=gg.element_text(size=4, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        )
    )

    if output_file_base:
        save_figure(
            density_gg, output_file_base, output_file_extensions, dpi, height, width
        )

    if return_plot:
        return density_gg
def plot_trace(data_in, figure_size=(15, 5)):
    """
    Print a trace plot and a density plot of the MCMC samples in data_in.

    Parameters
    ----------
    data_in : pd.DataFrame
        Samples from the sampler. Its columns must be exactly 'chain',
        'sample_i', 'parameter' and 'value'.
    figure_size : tuple, default = (15,5)
        Assigned to the global ``pn.options.figure_size`` before plotting.

    Returns
    -------
    None:
        Both plots are passed to ``print``; nothing is returned.

    Raises
    ------
    MyValidationError
        If the column names of data_in differ from the four expected ones.
    """

    # Column validation
    expected_columns = {'chain', 'sample_i', 'parameter', 'value'}
    if set(data_in.columns) != expected_columns:
        raise MyValidationError(
            "Incorrect column names in data_in please check")

    # Set figure size
    pn.options.figure_size = figure_size

    # Trace plot: sampled value against sample number, one line per chain
    trace_mapping = pn.aes(x='sample_i', y='value', color='chain')
    plot_out_trace = (
        pn.ggplot(trace_mapping, data=data_in)
        + pn.geom_line()
        + pn.facet_grid('parameter ~ .')
        + pn.labs(x='Sample', y='Parameter Value')
    )

    # Distribution plot: density of the sampled values per chain
    density_mapping = pn.aes(x='value', color='chain')
    plot_out_distribution = (
        pn.ggplot(density_mapping, data=data_in)
        + pn.geom_density()
        + pn.facet_grid('parameter ~ .')
        + pn.labs(x='Parameter Value', y='Density')
    )

    print(plot_out_trace)
    print(plot_out_distribution)
    return None
Exemple #13
0
def create_confidence_plot(conf_df):
    """Build the per-task confidence density figure from ``conf_df``.

    Column ``x`` is plotted as a density coloured and filled by ``Method``,
    with one facet per ``Task`` laid out in four rows.
    """
    components = [
        aes(x='x', color='Method', fill='Method'),
        geom_density(alpha=.45),
        facet_wrap('Task', nrow=4),
        xlab('Confidence'),
        scale_color_manual(values=COLORS),
        scale_fill_manual(values=COLORS),
        theme_fs(),
        # Hide the y axis decorations and the legend title.
        theme(
            axis_text_y=element_blank(),
            axis_ticks_major_y=element_blank(),
            axis_title_y=element_blank(),
            legend_title=element_blank(),
            legend_position='top',
            legend_box='horizontal',
        ),
    ]
    figure = ggplot(conf_df)
    for component in components:
        figure = figure + component
    return figure
def density_plot2(num_matches_per_round: int,
                  match_lengths_from_one_round: list,
                  match_lengths_from_one_round_with_blowouts: list):
    """ Density plot for match lengths, new rules, blowouts vs. no blowouts, 85 matches/round

    The two length lists are stacked and labelled 'No' / 'Yes' in a 'Blowouts'
    column, ``num_matches_per_round`` labels each. The figure is saved to
    'figures/match_length_with_blowout_density_plot.png'.
    """
    stacked_lengths = np.concatenate([
        match_lengths_from_one_round,
        match_lengths_from_one_round_with_blowouts
    ])
    blowout_labels = np.concatenate([
        np.repeat('No', num_matches_per_round),
        np.repeat('Yes', num_matches_per_round)
    ])
    match_lengths_blowout = pd.DataFrame({
        'Match length': stacked_lengths,
        'Blowouts': blowout_labels,
    })

    figure = (
        plt.ggplot(match_lengths_blowout,
                   plt.aes(x='Match length', color='Blowouts'))
        + plt.geom_density()
        + plt.geom_vline(xintercept=50, color='black', size=2)
        + plt.xlim([0, 55])
        + plt.theme_classic()
    )
    figure.save(filename='figures/match_length_with_blowout_density_plot.png')
Exemple #15
0
    def show_prediction(
        self,
        samples,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        show_community: bool = False,
        num_samples: int = 1000,
    ):
        """Plot prediction on the true question scale from samples or a submission object. Optionally compare prediction against a sample from the distribution of community predictions

        :param samples: samples from a distribution answering the prediction question (true scale) or a prediction object
        :param percent_kept: percentage of sample distribution to keep
        :param side_cut_from: which side to cut tails from, either 'both','lower', or 'upper'
        :param show_community: boolean indicating whether comparison to community predictions should be made
        :param num_samples: number of samples from the community; replaced by the number of samples given when samples is not a prediction object
        :return: ggplot graphics object
        :raises ValueError: if samples is not a prediction object, list, numpy array or pandas series
        """

        if isinstance(samples, SubmissionMixtureParams):
            prediction = samples
            prediction_normed_samples = pd.Series([
                logistic.sample_mixture(prediction)
                for _ in range(0, num_samples)
            ])
            prediction_true_scale_samples = self.denormalize_samples(
                prediction_normed_samples)
        else:
            if isinstance(samples, list):
                samples = pd.Series(samples)
            # isinstance rather than an exact type() match, so subclasses of
            # Series / ndarray are accepted as well.
            if not isinstance(samples, (pd.Series, np.ndarray)):
                raise ValueError(
                    "Samples should be a list, numpy arrray or pandas series")
            # Draw as many community samples as there are prediction samples.
            num_samples = samples.shape[0]
            prediction_true_scale_samples = samples

        title_name = (
            f"Q: {self.name}" if self.name else "\n".join(
                textwrap.wrap(self.data["title"], 60))  # type: ignore
        )

        if show_community:
            df = pd.DataFrame(
                data={
                    "community": [  # type: ignore
                        self.sample_community() for _ in range(0, num_samples)
                    ],
                    "prediction":
                    prediction_true_scale_samples,
                })
            # get domain for graph given the percentage of distribution kept
            (_xmin,
             _xmax) = self.get_central_quantiles(df,
                                                 percent_kept=percent_kept,
                                                 side_cut_from=side_cut_from)
            # Long format: one row per sample, tagged with its source column.
            df = pd.melt(df, var_name="sources",
                         value_name="samples")  # type: ignore
            return (ggplot(df, aes("samples", fill="sources")) +
                    scale_fill_brewer(type="qual", palette="Pastel1") +
                    geom_density(alpha=0.8) + xlim(_xmin, _xmax) +
                    self._scale_x() +
                    labs(x="Prediction", y="Density", title=title_name) +
                    ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
        else:
            df = pd.DataFrame(
                data={"prediction": prediction_true_scale_samples})
            # get domain for graph given the percentage of distribution kept
            (_xmin,
             _xmax) = self.get_central_quantiles(df,
                                                 percent_kept=percent_kept,
                                                 side_cut_from=side_cut_from)

            # NOTE(review): the fill scale and the second geom_density below
            # look like leftovers from the community branch (no fill aesthetic
            # is mapped here) -- confirm before removing.
            return (ggplot(df, aes("prediction")) +
                    geom_density(fill="#b3cde3", alpha=0.8) +
                    scale_fill_brewer(type="qual", palette="Pastel1") +
                    geom_density(alpha=0.8) + xlim(_xmin, _xmax) +
                    self._scale_x() +
                    labs(x="Prediction", y="Density", title=title_name) +
                    ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
Exemple #16
0
def test_triangular():
    """Compare the triangular-kernel density plot with 'triangular'."""
    triangular_layer = geom_density(kernel='triangular', alpha=.3)  # other
    assert p + triangular_layer + _theme == 'triangular'
)

# In[18]:

gg.options.figure_size = (6.4, 4.8)

# Make sure to drop duplicates of redundant gene, perturbation, and cell line columns
# Not removing replicates will put more weight on genes with more measurements

cor_density_gg = (
    gg.ggplot(
        summary_corr_df.drop_duplicates(
            ["Metadata_cell_line", "Metadata_gene_name", "replicate_type"]
        ),
        gg.aes(x="correlation_guide")) + \
        gg.geom_density(gg.aes(fill="Metadata_cell_line"),
                        alpha=0.4) + \
        gg.geom_rug(gg.aes(color="Metadata_cell_line"),
                    show_legend={'color': False}) + \
        gg.theme_bw() + \
    gg.theme(
            subplots_adjust={"wspace": 0.2},
            axis_text=gg.element_text(size=7),
            axis_title=gg.element_text(size=9),
            strip_text=gg.element_text(size=6, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        ) + \
        gg.xlim([-0.5, 1]) + \
        gg.xlab("Median Correlation of All Guides Across Genes") + \
        gg.ylab("Density") + \
        gg.facet_wrap("~replicate_type", nrow=2, scales="free") + \
        gg.scale_fill_manual(name="Cell Line",
Exemple #18
0
def density_plot(df,
                 x,
                 group=None,
                 facet_x=None,
                 facet_y=None,
                 position='overlay',
                 sort_groups=True,
                 base_size=10,
                 figure_size=(6, 3),
                 **stat_kwargs):
    '''
    Plot a 1-d density plot

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    position : str
      if groups are present, choose between `stack` or `overlay`
    sort_groups : bool
      if True, the fill legend is drawn in reversed order
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size
    stat_kwargs : kwargs
      kwargs for the density stat

    Returns
    -------
    g : EZPlot
      EZplot object

    Raises
    ------
    NotImplementedError
      if `position` is neither `overlay` nor `stack`

    '''

    if position not in ['overlay', 'stack']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    # no aggregated variables for a density plot: this stays empty
    variables = {}

    # presumably unname splits an optional display name from each expression
    # -- TODO confirm against its definition
    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)

    # fix special cases
    # '.index' plots the dataframe index, labelled with the index name (or '')
    if x == '.index':
        groups['x'] = '.index'
        names[
            'x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=False)
    gdata = gdata[[
        c for c in ['x', 'group', 'facet_x', 'facet_y'] if c in gdata.columns
    ]]

    # start plotting
    g = EZPlot(gdata)

    # colors for the manual fill/color scales, sized by the number of groups
    colors = ez_colors(g.n_groups('group'))

    # set groups
    if group is None:
        # single density: line and fill both use the first ez color
        g += p9.geom_density(p9.aes(x="x"),
                             stat=p9.stats.stat_density(**stat_kwargs),
                             colour=ez_colors(1)[0],
                             fill=ez_colors(1)[0],
                             **POSITION_KWARGS[position])
    else:
        # one density per group, colored and filled by the group factor
        g += p9.geom_density(p9.aes(x="x",
                                    group="factor(group)",
                                    colour="factor(group)",
                                    fill="factor(group)"),
                             stat=p9.stats.stat_density(**stat_kwargs),
                             **POSITION_KWARGS[position])
        g += p9.scale_fill_manual(values=colors, reverse=False)
        g += p9.scale_color_manual(values=colors, reverse=False)

    # set facets
    # NOTE(review): facet_y without facet_x adds no facet -- confirm intended
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab('Density')

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    # reverse the order of the fill legend entries
    if sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True))

    return g
Exemple #19
0

# Scatter of per-user accuracy against ratio; legends are hidden.
p0 = (
    ggplot(user_stat)
    + geom_point(
        aes(x='ratio', y='accuracy', size='n_records',
            color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False})
    + scale_color_gradient(high='#e31a1c', low='#ffffcc')
    + theme(aspect_ratio=1)
)
p0.save('protobowl_users.pdf')
print('p0 done')


# Histogram and density curve of the log record count.
p1 = (
    ggplot(user_stat, aes(x='log_n_records', y='..density..'))
    + geom_histogram(color='#e6550d', fill='#fee6ce')
    + geom_density()
    + theme(aspect_ratio=0.3)
)
p1.save('protobowl_hist.pdf')
print('p1 done')


# Histogram and density curve of accuracy.
p2 = (
    ggplot(user_stat, aes(x='accuracy', y='..density..'))
    + geom_histogram(color='#31a354', fill='#e5f5e0')
    + geom_density(aes(x='accuracy'))
    + theme(aspect_ratio=0.3)
)
p2.save('protobowl_acc.pdf')
print('p2 done')

# Build one prediction frame per model and stack them once at the end.
# DataFrame.from_items and DataFrame.append no longer exist in current pandas;
# the dict constructor keeps the column order and pd.concat does the stacking.
prediction_frames = []
for model, pipeline in final_pipelines.items():
    df = pd.DataFrame({
        'feature_set': model,
        'sample_id': X.index,
        # 1 for samples that belong to the test partition, 0 otherwise
        'test_set': X.index.isin(X_test.index).astype(int),
        'status': y,
        'decision_function': pipeline.decision_function(X),
        'probability': pipeline.predict_proba(X)[:, 1],
    })
    prediction_frames.append(df)
# pd.concat rejects an empty list, so fall back to an empty frame.
predict_df = (pd.concat(prediction_frames)
              if prediction_frames else pd.DataFrame())

predict_df['probability_str'] = predict_df['probability'].apply(
    '{:.1%}'.format)

# In[27]:

# Top predictions amongst negatives (potential hidden responders to a targeted cancer therapy)
(predict_df.sort_values(
    'decision_function',
    ascending=False).query("status == 0 and feature_set == 'full'").head(10))

# In[28]:

# Readable label for the fill legend: status 0 is 'negative', else 'positive'
predict_df['status_'] = predict_df['status'].map(lambda x: 'negative'
                                                 if x == 0 else 'positive')

(gg.ggplot(predict_df, gg.aes(x='probability', fill='status_')) +
 gg.geom_density(alpha=0.6) + gg.facet_wrap('~feature_set', ncol=1) +
 gg.labs(x='probability', y='density') +
 gg.guides(fill=gg.guide_legend(title="")) + theme_cognoma())
def test_gaussian_trimmed():
    """Compare the trimmed gaussian density plot with 'gaussian-trimmed'."""
    trimmed_layer = geom_density(kernel='gaussian', alpha=.3, trim=True)
    plot_under_test = p + trimmed_layer
    assert plot_under_test + _theme == 'gaussian-trimmed'
Exemple #22
0
def test_gaussian_weighted():
    """Compare the density weighted by column 'x' with 'gaussian_weighted'."""
    weighted_layer = geom_density(
        aes(weight='x'), kernel='gaussian', alpha=.3)
    assert p + weighted_layer + _theme == 'gaussian_weighted'
Exemple #23
0
def plot():
    """Write the Protobowl per-user figures to ``output/protobowl/``.

    Per-user means are computed from the records returned by
    ``load_protobowl``. One scatter plot and three histogram/density plots are
    saved as PDFs, and a progress line is printed after each.
    """
    outdir = "output/protobowl/"
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df = load_protobowl()
    # Only a literal True is kept as a correct answer.
    df.result = df.result.apply(lambda x: x is True)
    df["log_n_records"] = df.user_n_records.apply(np.log)

    user_stat = df.groupby("uid").agg(np.mean)
    print("{} users".format(len(user_stat)))
    print("{} records".format(len(df)))
    max_color = user_stat.log_n_records.max()
    user_stat["alpha"] = user_stat.log_n_records / max_color

    # 2D user plot
    point_mapping = aes(
        x="relative_position",
        y="result",
        size="user_n_records",
        color="log_n_records",
        alpha="alpha",
    )
    hidden_legends = {"color": False, "alpha": False, "size": False}
    p0 = ggplot(user_stat) + geom_point(point_mapping,
                                        show_legend=hidden_legends)
    p0 = p0 + scale_color_gradient(high="#e31a1c", low="#ffffcc")
    p0 = p0 + labs(x="Average buzzing position", y="Accuracy")
    p0 = p0 + theme(aspect_ratio=1)
    p0.save(os.path.join(outdir, "protobowl_users.pdf"))
    print("p0 done")

    # (tag, column, x label, outline colour, fill colour, file name)
    histograms = [
        # histogram of number of records
        ("p1", "log_n_records", "Log number of records", "#e6550d",
         "#fee6ce", "protobowl_hist.pdf"),
        # histogram of accuracy
        ("p2", "result", "Accuracy", "#31a354", "#e5f5e0",
         "protobowl_acc.pdf"),
        # histogram of buzzing position
        ("p3", "relative_position", "Average buzzing position", "#3182bd",
         "#deebf7", "protobowl_pos.pdf"),
    ]
    for tag, column, xlabel, outline, fill, filename in histograms:
        figure = (ggplot(user_stat, aes(x=column, y="..density..")) +
                  geom_histogram(color=outline, fill=fill) + geom_density() +
                  labs(x=xlabel, y="Density") + theme(aspect_ratio=0.3))
        figure.save(os.path.join(outdir, filename))
        print("{} done".format(tag))
Exemple #24
0
def density(df, key, figsize=(8, 6), vertical=False):
    """Return a classic-themed density plot of column ``key`` on a count scale.

    ``figsize`` is written to the global ``p9.options.figure_size``.
    ``vertical`` is accepted but not used.
    """
    p9.options.figure_size = figsize
    mapping = p9.aes(x=key, y='..count..', label='..count..')
    return (p9.ggplot(mapping, data=df)
            + p9.geom_density(alpha=0.5)
            + p9.theme_classic())
Exemple #25
0
def audit_site(df, audit_cols, batch, plate, resolution="full"):
    """Plot pairwise-correlation densities for replicate and other pairs.

    ``audit`` is run on ``df`` and every resulting pair is flagged by whether
    its two members share a well, a site, a treatment and a clone. Pairs that
    share treatment and clone count as replicates. The density of
    ``pairwise_correlation`` is filled by the replicate flag and faceted by
    the same-well and same-site labels.

    Returns the ggplot object, titled "<batch>: <plate>".
    """
    site_df = audit(df, audit_groups=audit_cols, audit_resolution=resolution)

    def pair_matches(field):
        # True where both members of a pair agree on `field`.
        left = getattr(site_df, field + "_pair_a")
        right = getattr(site_df, field + "_pair_b")
        return left == right

    same_well = pair_matches("Metadata_Well")
    same_site = pair_matches("Metadata_Site")
    # Computed as in the original even though nothing below reads it.
    same_plate = pair_matches("Metadata_Plate")

    if "Metadata_clone_number" in audit_cols:
        same_clone = pair_matches("Metadata_clone_number")
        if "Metadata_treatment" in audit_cols:
            same_treatment = pair_matches("Metadata_treatment")
        else:
            # Without a treatment column the clone match stands in for it.
            same_treatment = same_clone
    else:
        same_treatment = pair_matches("Metadata_Dosage")
        same_clone = pair_matches("Metadata_CellLine")

    replicate = same_treatment & same_clone

    plot_ready_df = site_df.assign(
        replicate=replicate,
        same_site=same_site,
        same_well_diff_site=same_well & ~same_site,
        same_treatment_diff_well=replicate & ~same_well,
        diff_treatment_diff_well=~replicate & ~same_well,
        diff_treatment_diff_site=~replicate & ~same_site,
    )

    plot_ready_df.pairwise_correlation = (
        plot_ready_df.pairwise_correlation.astype(float)
    )

    # Swap the boolean flags used as facets for readable labels.
    facet_labels = {
        "same_well_diff_site": {True: "Same Well", False: "Different Well"},
        "same_site": {True: "Same Site", False: "Different Site"},
    }
    for column, labels in facet_labels.items():
        plot_ready_df[column] = plot_ready_df[column].replace(labels)

    audit_title = "{}: {}".format(batch, plate)
    site_audit_gg = (
        gg.ggplot(plot_ready_df, gg.aes(x="pairwise_correlation"))
        + gg.geom_density(gg.aes(fill="replicate"), alpha=0.5)
        + gg.theme_bw()
        + gg.facet_grid("same_well_diff_site~same_site")
        + gg.ggtitle(audit_title)
        + gg.xlab("Pairwise Pearson Correlation")
        + gg.ylab("Density")
        + gg.theme(
            strip_background=gg.element_rect(colour="black", fill="#fdfff4")
        )
    )

    return site_audit_gg
print(ess_bps_per_sec)
# 0.004061
print(ess_hmc_per_sec)
# 0.005769

# Keep only the rows after each sampler's burn-in, then stack the two runs.
n_bps_rows = bpsSamples.shape[0]
n_hmc_rows = hmcSamples.shape[0]
frames = [
    bpsSamples[burninBPS:n_bps_rows],
    hmcSamples[burninHMC:n_hmc_rows],
]
allDF = pd.concat(frames)
## add a new column to DF
# One label per kept row, in the same order as the stacked frames.
list1 = ["lbps"] * (n_bps_rows - burninBPS) + ["hmc"] * (n_hmc_rows - burninHMC)
allDF['method'] = list1
allDF.head(3)

(
    ggplot(allDF, aes('exchangeCoef1', fill='method'))
    + geom_density(alpha=0.3, position='identity')
    + scale_x_continuous(breaks=np.arange(3.5, 5.5, 0.3))
)

#import matplotlib.pyplot as plt

#kdeplot(bpsSamples['exchangeCoef1'][1000:10000], shade=True)
#kdeplot(hmcSamples['exchangeCoef1'][1000:10000], shade=True,color='r')

#ggplot(df, aes( x=values, fill=method)) +
#  geom_density(alpha=.3, position="identity")+facet_wrap(~variable, ncol=3, scales="free")+
#  geom_vline(aes(xintercept=vl), data=vline.dat, color="red", linetype="dashed")
#sc
Exemple #27
0
def test_triangular():
    """Density layer with a triangular kernel must equal 'triangular'.

    NOTE(review): the ``==`` against a string presumably goes through the
    test suite's baseline-comparison hook -- confirm where it is defined.
    """
    layer = geom_density(
        kernel='triangular',
        bw='normal_reference',
        alpha=.3,
    )
    plot = p + layer
    assert (plot + _theme) == 'triangular'
Exemple #28
0
def test_gaussian_trimmed():
    """Gaussian-kernel density with ``trim=True`` must equal 'gaussian-trimmed'.

    NOTE(review): the ``==`` against a string presumably goes through the
    test suite's baseline-comparison hook -- confirm where it is defined.
    """
    layer = geom_density(kernel='gaussian', alpha=.3, trim=True)
    plot = p + layer
    assert (plot + _theme) == 'gaussian-trimmed'
Exemple #29
0
def test_gaussian():
    """Density layer with a gaussian kernel must equal 'gaussian'.

    NOTE(review): the ``==`` against a string presumably goes through the
    test suite's baseline-comparison hook -- confirm where it is defined.
    """
    layer = geom_density(kernel='gaussian', alpha=.3)
    plot = p + layer
    assert (plot + _theme) == 'gaussian'
Exemple #30
0
# For every pair of consecutive commits, ask git for the short diff summary
# and record how many lines were inserted and deleted.
sizes = []
for older, newer in zip(commits, commits[1:]):
    completed = subprocess.run(
        ['git', 'diff', '--shortstat', older, newer],
        stdout=subprocess.PIPE,
    )
    tokens = completed.stdout.decode().split()
    counts = {'insertions': 0, 'deletions': 0}
    for pos, token in enumerate(tokens):
        # The count is the token immediately before its label.
        if 'insertion' in token:
            counts['insertions'] = int(tokens[pos - 1])
        if 'deletion' in token:
            counts['deletions'] = int(tokens[pos - 1])
    sizes.append(counts)

df = pandas.DataFrame(sizes)
df['newlines'] = df.insertions - df.deletions
df.describe()

# show some basic stat: share of diffs below / above each threshold
for sign, thresholds in (
        ('<', (-500, -100)),
        ('>', (0, 100, 500, 1000, 2000)),
):
    for n in thresholds:
        mask = df.newlines < n if sign == '<' else df.newlines > n
        rat = df[mask].size / df.size
        print(sign, n, round(rat * 100, 2), '%')

# draw charts: density of net new lines, negative range then positive range
(
    gg.ggplot(df, gg.aes(x='newlines'))
    + gg.geom_density()
    + gg.xlim(-2000, 0)
)

(
    gg.ggplot(df, gg.aes(x='newlines'))
    + gg.geom_density()
    + gg.xlim(0, 2000)
)
Exemple #31
0
                         "comparison_category"] = "across_batch_replicate"
# Rows selected by `different_treatment_within_batch` are labelled
# "same_batch_nonreplicate" in the `comparison_category` column.
similarity_melted_df.loc[different_treatment_within_batch,
                         "comparison_category"] = "same_batch_nonreplicate"

# In[8]:

# Number of rows in each comparison category.
similarity_melted_df.comparison_category.value_counts()

# In[9]:

# Number of rows per value of `Metadata_clone_number_pair_a`.
similarity_melted_df.Metadata_clone_number_pair_a.value_counts()

# In[10]:

# Density of `similarity_metric`, one semi-transparent fill per category.
(gg.ggplot(similarity_melted_df, gg.aes(x="similarity_metric")) +
 gg.geom_density(gg.aes(fill="comparison_category"), alpha=0.5) +
 gg.theme_bw())

# In[11]:

# Same density plot, split into one panel per `Metadata_batch_pair_a` value.
(gg.ggplot(similarity_melted_df, gg.aes(x="similarity_metric")) +
 gg.geom_density(gg.aes(fill="comparison_category"), alpha=0.5) +
 gg.theme_bw() + gg.facet_wrap("~Metadata_batch_pair_a"))

# In[12]:

(gg.ggplot(
    similarity_melted_df.query(
        "Metadata_clone_number_pair_a in ['WT_parental', 'CloneA', 'CloneE']"),
    gg.aes(x="similarity_metric")) +
 gg.geom_density(gg.aes(fill="comparison_category"), alpha=0.5) +
def test_gaussian():
    """Density layer with a gaussian kernel must equal 'gaussian'.

    NOTE(review): the ``==`` against a string presumably goes through the
    test suite's baseline-comparison hook -- confirm where it is defined.
    """
    layer = geom_density(kernel='gaussian', alpha=.3)
    plot = p + layer
    assert (plot + _theme) == 'gaussian'
        ('probability', pipeline.predict_proba(X)[:, 1])
    ])    
    predict_df = predict_df.append(df)

# Percent-formatted (one decimal place) text version of each probability.
predict_df['probability_str'] = predict_df['probability'].apply(
    lambda prob: '{:.1%}'.format(prob)
)


# In[27]:

# Ten highest decision-function scores among status-0 rows of the 'full'
# feature set (potential hidden responders to a targeted cancer therapy).
(
    predict_df
    .sort_values('decision_function', ascending=False)
    .query("status == 0 and feature_set == 'full'")
    .head(10)
)


# In[28]:

# Text label for the fill aesthetic: 0 -> 'negative', anything else ->
# 'positive'.
predict_df['status_'] = predict_df['status'].map(
    lambda status: 'positive' if status != 0 else 'negative'
)

# Predicted-probability densities by status, one stacked panel per feature
# set, with an untitled legend.
(
    gg.ggplot(predict_df, gg.aes(x='probability', fill='status_'))
    + gg.geom_density(alpha=0.6)
    + gg.facet_wrap('~feature_set', ncol=1)
    + gg.labs(x='probability', y='density')
    + gg.guides(fill=gg.guide_legend(title=""))
    + theme_cognoma()
)

def test_triangular():
    """Density layer with a triangular kernel must equal 'triangular'.

    NOTE(review): the ``==`` against a string presumably goes through the
    test suite's baseline-comparison hook -- confirm where it is defined.
    """
    layer = geom_density(kernel='triangular', alpha=.3)
    plot = p + layer
    assert (plot + _theme) == 'triangular'
# access data
# NOTE(review): the frame is named `ames` but is read from "bank.csv" and
# plotted on a 'Deposit' column -- confirm the name is intentional.
ames = pd.read_csv("bank.csv")

# initial dimension
ames.shape


# first few observations
ames.head()

# Simple random 70/30 split; the fixed random_state makes it repeatable.
train, test = train_test_split(ames, test_size=0.3, random_state=123)

# Summary string of the three frame shapes (a bare expression: it is only
# shown when run interactively).
f"raw data dimensions: {ames.shape}; training dimensions: {train.shape}; testing dimensions:  {test.shape}"

# Density of 'Deposit' in the training set, with the test set overlaid in red.
(ggplot(train, aes('Deposit'))
 + geom_density()
 + geom_density(data = test, color = "red")
 + ggtitle("Random sampling with SciKit-Learn"))


 
# Stratified 70/30 split: `y` is the column whose value distribution should
# be preserved in both partitions.
y = attrition["age"]
# Split the whole `attrition` frame. The partitions must be DataFrames that
# still carry the "Attrition" column read below; the previously passed name
# `age` is not defined anywhere in this script.
train_strat, test_strat = train_test_split(
    attrition, test_size=0.3, random_state=123, stratify=y)



# response distribution for raw data
attrition["age"].value_counts(normalize=True)

# response distribution for training data
# NOTE(review): this inspects "Attrition" while the split above is stratified
# on "age" -- confirm which column is the intended response.
train_strat["Attrition"].value_counts(normalize=True)
Exemple #36
0

# Per-user scatter: ratio against accuracy, with point size, colour and
# transparency driven by the record-count columns. All legends are hidden.
p0 = (
    ggplot(user_stat)
    + geom_point(
        aes(x='ratio', y='accuracy', size='n_records',
            color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False},
    )
    + scale_color_gradient(high='#e31a1c', low='#ffffcc')
    + theme(aspect_ratio=1)
)
p0.save('protobowl_users.pdf')
print('p0 done')


# Histogram plus density curve of the log record count.
p1 = (
    ggplot(user_stat, aes(x='log_n_records', y='..density..'))
    + geom_histogram(color='#e6550d', fill='#fee6ce')
    + geom_density()
    + theme(aspect_ratio=0.3)
)
p1.save('protobowl_hist.pdf')
print('p1 done')


# Histogram plus density curve of accuracy.
p2 = (
    ggplot(user_stat, aes(x='accuracy', y='..density..'))
    + geom_histogram(color='#31a354', fill='#e5f5e0')
    + geom_density(aes(x='accuracy'))
    + theme(aspect_ratio=0.3)
)
p2.save('protobowl_acc.pdf')
print('p2 done')

Exemple #37
0
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
import plotnine as p

# read data
import ssl

# Replace the factory the standard library uses for default HTTPS contexts
# with the unverified one.
# NOTE(review): this turns off TLS certificate verification for stdlib HTTPS
# clients in the whole process (see PEP 476). Acceptable for a throwaway
# analysis script at most -- confirm it is really needed before reusing.
ssl._create_default_https_context = ssl._create_unverified_context


def read_data(file):
    """Read a Stata file from the mixtape GitHub repository.

    Parameters
    ----------
    file : str
        File name, appended verbatim to the repository's raw-content URL.

    Returns
    -------
    The object returned by ``pd.read_stata`` for the resulting URL.
    """
    url = "https://raw.github.com/scunning1975/mixtape/master/" + file
    return pd.read_stata(url)


# Forty observations: the first twenty rows have d == 0, the last twenty
# have d == 1.
tb = pd.DataFrame({
    'd': np.repeat([0, 1], 20),
    'y': (
        0.22, -0.87, -2.39, -1.79, 0.37, -1.54, 1.28, -0.31, -0.74, 1.72,
        0.38, -0.17, -0.62, -1.10, 0.30, 0.15, 2.30, 0.19, -0.50, -0.9,
        -5.13, -2.19, 2.43, -3.83, 0.5, -3.25, 4.32, 1.63, 5.18, -0.43,
        7.11, 4.87, -3.10, -5.81, 3.76, 6.31, 2.58, 0.07, 5.76, 3.50,
    ),
})

# Density of `y`, one coloured curve per level of `d`, on a fixed x range.
# NOTE(review): the legend labels presumably map d == 0 to "Control" and
# d == 1 to "Treatment" -- confirm the level order.
(
    p.ggplot()
    + p.geom_density(tb, p.aes(x='y', color='factor(d)'))
    + p.xlim(-7, 8)
    + p.labs(title="Kolmogorov-Smirnov Test")
    + p.scale_color_discrete(labels=("Control", "Treatment"))
)