Esempio n. 1
0
def accPlot(accsByNFeats):
    """Plot accuracy against ``p`` with one coloured line per set.

    ``accsByNFeats`` is read as ``accsByNFeats[set][p] -> accuracy``.  Every
    (set, p) pair becomes one row of a long-format frame with columns
    ``p``, ``acc`` and ``set``, indexed by ``str(p)``.  The plot is printed
    and the long-format frame is returned.
    """

    def _frame_for(set_name):
        # One single-row frame per p, stacked vertically for this set.
        accs = accsByNFeats[set_name]
        rows = [
            pd.DataFrame({"p": p, "acc": accs[p], "set": set_name},
                         index=[str(p)])
            for p in accs
        ]
        return pd.concat(rows, axis=0)

    ggd = pd.concat([_frame_for(set_name) for set_name in accsByNFeats])
    ggd['acc'] = ggd['acc'].astype(float)

    ggo = gg.ggplot(ggd, gg.aes(x='p', y='acc', color='set'))
    layers = (
        gg.geom_line(alpha=0.5),
        gg.geom_point(),
        gg.theme_bw(),
        gg.scale_x_log10(breaks=[10, 100, 1000, 10000]),
        gg.scale_color_manual(
            values=['darkgray', 'black', 'red', 'dodgerblue']),
        gg.ylab('Accuracy (5-fold CV)'),
    )
    for layer in layers:
        ggo += layer
    print(ggo)
    return ggd
Esempio n. 2
0
def test_annotation_logticks_coord_flip():
    """Log-log scatter with bottom logticks, drawn with flipped coordinates.

    Minor grid lines are green and major grid lines are red.  The final
    comparison of the plot with a string is presumably resolved by the test
    harness (e.g. against a stored figure of that name) -- confirm there.
    """
    grid_colors = theme(panel_grid_minor=element_line(color='green'),
                        panel_grid_major=element_line(color='red'))
    p = ggplot(df, aes('x', 'x'))
    p = p + annotation_logticks(sides='b', size=.75)
    p = p + geom_point()
    p = p + scale_x_log10()
    p = p + scale_y_log10()
    p = p + coord_flip()
    p = p + grid_colors

    assert p == 'annotation_logticks_coord_flip'
Esempio n. 3
0
def plot_ci_eval(df):
    """Line plot of the three interval methods against sample size.

    ``df`` must have a ``sample_size`` column and the columns ``bootstrap``,
    ``ztest`` and ``ttest``.  The frame is melted to long format, then drawn
    with a log10 x axis and the y axis limited to [0, 1].  Returns the plot.
    """
    long_form = pd.melt(
        df,
        id_vars=['sample_size'],
        value_vars=['bootstrap', 'ztest', 'ttest'],
    )
    mapping = aes(x='sample_size', y='value', color='variable')

    chart = ggplot(long_form, mapping)
    chart = chart + geom_line()
    chart = chart + scale_x_log10()
    chart = chart + ylim(0, 1)
    return chart
Esempio n. 4
0
def scatter_plot(df,
                 xcol,
                 ycol,
                 domain,
                 xname=None,
                 yname=None,
                 log=False,
                 width=6,
                 height=6,
                 clamp=True,
                 tickCount=5):
    """Build a scatter plot of ``df[ycol]`` against ``df[xcol]``.

    Parameters
    ----------
    df
        Data frame holding the points.  It is never modified: clamping works
        on a deep copy.
    xcol, ycol
        Names of the columns mapped to the x and y axes.
    domain
        Two-element sequence used as the limits of both axes; ``domain[1]``
        is also the clamping bound and the position of the two rules.
    xname, yname
        Axis labels; each defaults to the corresponding column name.
    log
        Use log10 scales on both axes instead of linear ones.
    width, height
        Passed to ``theme(figure_size=(width, height))``.
    clamp
        When true, values greater than ``domain[1]`` are replaced by
        ``domain[1]`` in both columns.
    tickCount
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The assembled plot object.
    """
    assert len(domain) == 2

    POINT_SIZE = 0.5
    DASH_PATTERN = (0, (3, 1))

    # Identity test: `== None` would dispatch to a custom __eq__ and is not
    # a reliable "argument omitted" check.
    if xname is None:
        xname = xcol
    if yname is None:
        yname = ycol

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        df = df.copy(deep=True)
        df.loc[df[xcol] > domain[1], xcol] = domain[1]
        df.loc[df[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True)
    scatter += p9.labs(x=xname, y=yname)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    scatter += p9.theme_bw()
    scatter += p9.theme(
        panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(figure_size=(width, height))

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1,
                              linetype=DASH_PATTERN)  # diagonal
    scatter += p9.geom_vline(xintercept=domain[1],
                             linetype=DASH_PATTERN)  # vertical rule
    scatter += p9.geom_hline(yintercept=domain[1],
                             linetype=DASH_PATTERN)  # horizontal rule

    return scatter
Esempio n. 5
0
def test_annotation_logticks():
    """Log-log scatter with bottom logticks; the grid should line up with them.

    Minor grid lines are coloured green and major ones red.  The final
    comparison of the plot with a string is presumably resolved by the test
    harness -- confirm there.
    """
    grid_colors = theme(panel_grid_minor=element_line(color='green'),
                        panel_grid_major=element_line(color='red'))
    pieces = [
        annotation_logticks(sides='b', size=.75),
        geom_point(),
        scale_x_log10(),
        scale_y_log10(),
        grid_colors,
    ]
    p = ggplot(df, aes('x', 'x'))
    for piece in pieces:
        p = p + piece

    assert p == 'annotation_logticks'
Esempio n. 6
0
def test_scale_transformed_breaks():
    """Bin upper edges follow the x scale: raw on linear, log10 on a log scale."""
    x_breaks = [5, 50, 500]
    y_breaks = [0.5, 1.5, 2.5]
    expected_xmax = [50, 500]

    df = pd.DataFrame({'x': [1, 10, 100, 1000], 'y': range(4)})
    p = ggplot(df, aes('x', 'y')) + geom_bin2d(breaks=(x_breaks, y_breaks))

    linear = layer_data(p)
    logged = layer_data(p + scale_x_log10())

    np.testing.assert_allclose(linear.xmax, expected_xmax)
    np.testing.assert_allclose(logged.xmax, np.log10(expected_xmax))
Esempio n. 7
0
def test_annotation_logticks_coord_flip_discrete_bottom():
    """Bottom logticks with a log x axis, a categorical y axis and flipped coords.

    The categorical column is built by prefixing each ``x`` value with 'A'.
    The final comparison of the plot with a string is presumably resolved by
    the test harness -- confirm there.
    """
    labels = ['A' + str(a) for a in df['x']]
    df2 = df.assign(discrete=pd.Categorical(labels))

    grid_colors = theme(panel_grid_minor=element_line(color='green'),
                        panel_grid_major=element_line(color='red'))
    p = ggplot(df2, aes('x', 'discrete'))
    p = p + annotation_logticks(sides='b', size=.75)
    p = p + geom_point()
    p = p + scale_x_log10()
    p = p + coord_flip()
    p = p + grid_colors

    assert p == 'annotation_logticks_coord_flip_discrete_bottom'
def scatter_plot2(df1, df2, xcol, ycol, domain, color1='black', color2='red', xname=None, yname=None, log=False, width=6, height=6, clamp=True, tickCount=5):
    """Overlay two data frames in one scatter plot with rugs on the top/right.

    ``df1`` is drawn in ``color1`` and ``df2`` in ``color2``; both use the
    columns ``xcol`` / ``ycol``.  ``domain`` (two elements) gives the limits
    of both axes, and ``domain[1]`` is the clamping bound and the position of
    the dashed vertical and horizontal rules.  With ``clamp`` set, values
    above ``domain[1]`` are replaced by it on deep copies of the inputs.
    ``tickCount`` is accepted but not used.  Returns the plot object.
    """
    assert len(domain) == 2

    POINT_SIZE = 1.5
    DASH_PATTERN = (0, (6, 2))
    upper = domain[1]

    xname = xcol if xname is None else xname
    yname = ycol if yname is None else yname

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    def _clamped(frame):
        # Work on a deep copy so the caller's frame is left untouched.
        frame = frame.copy(deep=True)
        for column in (xcol, ycol):
            frame.loc[frame[column] > upper, column] = upper
        return frame

    if clamp:  # clamp overflowing values if required
        df1 = _clamped(df1)
        df2 = _clamped(df2)

    if log:  # log scale
        x_scale = p9.scale_x_log10(limits=domain, labels=ax_formatter)
        y_scale = p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        x_scale = p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        y_scale = p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    components = [
        p9.aes(x=xcol, y=ycol),
        # points of both frames
        p9.geom_point(size=POINT_SIZE, na_rm=True, color=color1, alpha=0.5),
        p9.geom_point(size=POINT_SIZE, na_rm=True, data=df2, color=color2, alpha=0.5),
        p9.labs(x=xname, y=yname),
        # rug plots
        p9.geom_rug(na_rm=True, sides="tr", color=color1, alpha=0.05),
        p9.geom_rug(na_rm=True, sides="tr", data=df2, color=color2, alpha=0.05),
        x_scale,
        y_scale,
        # theming
        p9.theme_bw(),
        p9.theme(panel_grid_major=p9.element_line(color='#666666', alpha=0.5)),
        p9.theme(panel_grid_minor=p9.element_blank()),
        p9.theme(figure_size=(width, height)),
        p9.theme(text=p9.element_text(size=24, color="black")),
        # reference lines: diagonal, vertical rule, horizontal rule
        p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN),
        p9.geom_vline(xintercept=upper, linetype=DASH_PATTERN),
        p9.geom_hline(yintercept=upper, linetype=DASH_PATTERN),
    ]

    scatter = p9.ggplot(df1)
    for component in components:
        scatter += component

    return scatter
Esempio n. 9
0
def plot_scaling_log(plt_df: pd.DataFrame,
                     sweep_vars: Optional[Sequence[str]] = None,
                     with_baseline=True) -> gg.ggplot:
    """Log-log version of the scaling plot produced by ``_base_scaling``.

    Adds log10 x and y scales with fixed breaks plus axis labels, then hands
    the plot to ``plotting.facet_sweep_plot`` together with ``sweep_vars``.
    """
    x_breaks = [5, 10, 20, 50]
    y_breaks = [100, 300, 1000, 3000, 10000, 30000]
    additions = (
        gg.scale_x_log10(breaks=x_breaks),
        gg.scale_y_log10(breaks=y_breaks),
        gg.xlab('deep sea problem size (log scale)'),
        gg.ylab('#episodes until < 90% bad episodes (log scale)'),
    )

    p = _base_scaling(plt_df, sweep_vars, with_baseline)
    for addition in additions:
        p += addition
    return plotting.facet_sweep_plot(p, sweep_vars)
Esempio n. 10
0
def plot_compare(stats,
                 variant,
                 variant_baseline,
                 metric,
                 mode="identity",
                 jitter=0.01):
    """Jittered log-log scatter of a variant's metric against its baseline.

    The comparison table comes from ``compare_stats``; a summary built by
    ``bsw_table2`` is shown with ``display`` before plotting.  The x axis is
    always ``<metric>_baseline``; the y axis depends on ``mode``:

    * ``"identity"``   -- the metric itself, with a slope-1 reference line;
    * ``"ratio"``      -- metric / baseline, with slope-0 and slope-(-1) lines;
    * ``"difference"`` -- metric - baseline, with a slope-0 reference line.
    """
    assert mode in ["identity", "ratio", "difference"]
    plotdata = compare_stats(stats, variant, variant_baseline)
    bsw = bsw_table2(plotdata, metric=metric, reltol=1.0)
    display(bsw)

    baseline_name = f"{metric}_baseline"
    ratio = plotdata[metric] / plotdata[baseline_name]
    difference = plotdata[metric] - plotdata[baseline_name]
    plotdata = plotdata[[metric, baseline_name, "dataset"]].assign(
        ratio=ratio,
        difference=difference,
    )

    # (y column, [(slope, intercept), ...]) for the selected mode.
    if mode == "identity":
        y_column, reference_lines = metric, [(1, 0)]
    elif mode == "ratio":
        # ablines are drawn wrt the already log-transformed axes,
        # hence 0 = log(1) in scale; the slope -1 line marks the max.
        y_column, reference_lines = "ratio", [(0, 0.0), (-1, 0.0)]
    elif mode == "difference":
        y_column, reference_lines = "difference", [(0, 0)]
    else:
        assert False, "unknown mode"
        return None  # only reachable when assertions are disabled

    points = geom_jitter(
        aes(x=baseline_name, y=y_column, fill="dataset"),
        width=jitter,
        height=jitter,
    )
    plot = ggplot(data=plotdata) + points + scale_x_log10() + scale_y_log10()
    for slope, intercept in reference_lines:
        plot = plot + geom_abline(aes(slope=slope, intercept=intercept))
    return plot
Esempio n. 11
0
def test_scale_transformed_breaks():
    """With a log10 x scale, bin2d's upper x edges are the log10 of the breaks."""
    frame = pd.DataFrame({'x': [1, 10, 100, 1000], 'y': range(4)})
    binned = geom_bin2d(breaks=([5, 50, 500], [0.5, 1.5, 2.5]))
    p = ggplot(frame, aes('x', 'y')) + binned

    upper_edges = [50, 500]
    out1 = layer_data(p)
    np.testing.assert_allclose(out1.xmax, upper_edges)

    out2 = layer_data(p + scale_x_log10())
    np.testing.assert_allclose(out2.xmax, np.log10(upper_edges))
Esempio n. 12
0
def test_annotation_logticks_faceting():
    """Bottom logticks on a log-log scatter that is facet-wrapped by group.

    The ``x`` column of ``df`` is stacked twice; the first copy is labelled
    'a' and the second 'b'.  The final comparison of the plot with a string
    is presumably resolved by the test harness -- confirm there.
    """
    n = len(df)
    doubled_x = np.hstack([df['x'], df['x']])
    groups = ['a'] * n + ['b'] * n
    df2 = pd.DataFrame({'x': doubled_x, 'g': groups})

    grid_colors = theme(panel_grid_minor=element_line(color='green'),
                        panel_grid_major=element_line(color='red'))
    p = ggplot(df2)
    p = p + annotation_logticks(sides='b', size=.75)
    p = p + geom_point(aes('x', 'x'))
    p = p + scale_x_log10()
    p = p + scale_y_log10()
    p = p + facet_wrap('g')
    p = p + grid_colors

    assert p == 'annotation_logticks_faceting'
Esempio n. 13
0
    def plot_replicates_log_axes(self):
        """
        Plot ``Current`` against ``Time`` from ``self.data`` as one line per
        ``Channel``, with both axes on a log10 scale.

        The plot is printed and also returned.
        """

        from plotnine import ggplot, ylab, xlab, geom_line, aes, scale_y_log10, scale_x_log10

        mapping = aes('Time', 'Current', color='Channel')
        plot = ggplot(self.data, mapping)
        plot = plot + ylab(u'Current (μA)')
        plot = plot + xlab('Time (seconds)')
        plot = plot + geom_line()
        plot = plot + scale_y_log10()
        plot = plot + scale_x_log10()

        print(plot)
        return plot
Esempio n. 14
0
def make_plot(name):
    """Render ``small_n/results/<name>.csv`` to ``slides/static/plots/<name>.png``.

    The CSV must contain ``sample_size`` plus the columns ``bootstrap``,
    ``ztest`` and ``ttest``.  Each method becomes one line on a log10 x axis
    with y limited to [0, 1]; a dotted horizontal line is drawn at 0.95.
    Nothing is returned.
    """
    source_path = f'small_n/results/{name}.csv'
    target_path = f'slides/static/plots/{name}.png'

    df = pd.read_csv(source_path)
    molten = pd.melt(
        df,
        id_vars=['sample_size'],
        value_vars=['bootstrap', 'ztest', 'ttest'],
        var_name='method',
        value_name='success',
    )

    reference_line = geom_hline(
        yintercept=0.95, linetype='dotted', color='#FF5500', size=3)
    chart = ggplot(molten, aes(x='sample_size', y='success', color='method'))
    chart = chart + geom_line(size=1)
    chart = chart + scale_x_log10()
    chart = chart + ylim(0, 1)
    chart = chart + reference_line

    chart.save(target_path, height=7.0, width=10, units='in')
Esempio n. 15
0
def plot_regret_ave_scaling(df_in: pd.DataFrame,
                            group_col: str,
                            episode: int,
                            regret_thresh: float,
                            sweep_vars: Sequence[str] = None,
                            regret_col: str = 'total_regret') -> gg.ggplot:
  """Point plot of average regret per group, coloured by a threshold test.

  The data come from ``_preprocess_ave_regret``.  The x axis (log10) uses
  ``group_col`` with underscores replaced by spaces; points are coloured by
  the expression ``average_regret < regret_thresh``.  The result is passed
  through ``facet_sweep_plot`` with ``sweep_vars``.
  """
  df = _preprocess_ave_regret(df_in, group_col, episode, sweep_vars, regret_col)
  group_name = group_col.replace('_', ' ')
  below_threshold = 'average_regret < {}'.format(regret_thresh)
  y_label = 'average regret at {} episodes'.format(episode)

  p = gg.ggplot(df)
  p = p + gg.aes(x=group_name, y='average_regret', colour=below_threshold)
  p = p + gg.geom_point(size=5, alpha=0.8)
  p = p + gg.scale_x_log10(breaks=[1, 3, 10, 30, 100])
  p = p + gg.scale_colour_manual(values=['#d73027', '#313695'])
  p = p + gg.ylab(y_label)
  # axis hack: a fully transparent horizontal line at y = 0
  p = p + gg.geom_hline(gg.aes(yintercept=0.0), alpha=0)
  return facet_sweep_plot(p, sweep_vars)
Esempio n. 16
0
def accPlot(accsByNFeats):
    """Print a plot of accuracy against ``p``, one coloured line per set.

    ``accsByNFeats`` is read as ``accsByNFeats[set][p] -> accuracy``.  The
    values are reshaped into a long-format frame (columns ``p``, ``acc``,
    ``set``; index ``str(p)``) before plotting.  Nothing is returned.
    """
    frames = []
    for set_name in accsByNFeats:
        accs = accsByNFeats[set_name]
        rows = []
        for p in accs:
            row = DataFrame({"p": p, "acc": accs[p], "set": set_name},
                            index=[str(p)])
            rows.append(row)
        frames.append(pd.concat(rows, axis=0))

    ggd = pd.concat(frames)
    ggd['acc'] = ggd['acc'].astype(float)

    palette = ['darkgray', 'black', 'red', 'dodgerblue']
    ggo = gg.ggplot(ggd, gg.aes(x='p', y='acc', color='set'))
    for layer in (gg.geom_line(alpha=0.5),
                  gg.geom_point(),
                  gg.theme_bw(),
                  gg.scale_x_log10(breaks=[10, 100, 1000, 10000]),
                  gg.scale_color_manual(values=palette),
                  gg.ylab('Accuracy (5-fold CV)')):
        ggo += layer
    print(ggo)
Esempio n. 17
0
def kernel_stats(inFile, log_scale=True):
    """Scatter each kernel's weight against the determinant of its covariance.

    Parameters are loaded with ``get_params(inFile)`` and read from its
    ``"means"`` entry.  The number of kernels is the count of entries whose
    name contains ``"mus_f"``.  For both tissue types ("t" -> "tumor",
    "f" -> "non-tumor") the weights come from the backward stick-breaking
    transform, and each kernel's "volume" is ``det(L @ L.T)`` of its expanded
    lower-triangular factor.

    Returns ``(plot, frame)``; with ``log_scale`` both axes use log10.
    """
    par = get_params(inFile)
    means = par["means"]

    n_kernel = sum("mus_f" in var for var in sorted(means))
    kernel_labels = [f"kernel {i}" for i in range(n_kernel)]

    tf = pm.distributions.transforms.StickBreaking()

    def _volume(tissue_type, kernel, n_dim):
        # Determinant of the covariance rebuilt from its packed factor.
        packed_cov = means[
            f"packed_L_{tissue_type}_{kernel}_cholesky-cov-packed__"]
        lower = pm.expand_packed_triangular(n_dim, packed_cov,
                                            lower=True).eval()
        return np.linalg.det(np.dot(lower, lower.T))

    dfs = []
    for tissue_type, tissue_label in (("t", "tumor"), ("f", "non-tumor")):
        weights = tf.backward(
            means[f"w_{tissue_type}_stickbreaking__"]).eval()
        n_dim = means[f"x_{tissue_type}"].shape[1]
        volumes = [
            _volume(tissue_type, kernel, n_dim) for kernel in range(n_kernel)
        ]
        dfs.append(
            pd.DataFrame(
                {
                    "tissue": tissue_label,
                    "weight": weights,
                    "volume": volumes,
                },
                index=kernel_labels,
            ))

    df = pd.concat(dfs)
    pl = pn.ggplot(pn.aes("volume", "weight", color="tissue"), df)
    pl = pl + pn.geom_point()
    if log_scale:
        pl += pn.scale_y_log10()
        pl += pn.scale_x_log10()
    pl += pn.theme_minimal()
    return pl, df
Esempio n. 18
0
 def plot(self,
          plotDat,
          tag=None,
          log=True,
          by='cell_type',
          data_set=None,
          title=None,
          alpha=.4):
     """Scatter ``prediction`` against ``measured`` with per-group linear fits.

     Works on a copy of ``plotDat``.  Pearson correlations are computed on
     the full data, both overall and per value of the ``by`` column; the
     per-group value (two decimals) is appended to the legend label.  Only
     afterwards are rows restricted to ``data_set`` (matched against
     ``dataset_name``) when one is given.  Point shape follows ``sample``
     when it has fewer than 10 distinct values, otherwise ``dataset_name``.
     With ``log is True`` both axes use log10.  The title is ``title`` if
     given, else the overall correlation, prefixed by ``tag`` when given.
     """
     pDat = plotDat.copy()
     gcorr = pearsonr(pDat.measured, pDat.prediction)[0]

     def _group_corr(group):
         return pearsonr(group.measured, group.prediction)[0]

     corrs = pDat.groupby(pDat[by]).apply(_group_corr)
     pDat['corr'] = corrs[pDat[by]].values

     def _legend_label(row):
         return '{} {:.2f}'.format(row[by], corrs[row[by]])

     by_str = '{}_pearson'.format(by)
     pDat[by_str] = pDat.apply(_legend_label, axis=1)

     if data_set:
         pDat = pDat.loc[pDat['dataset_name'] == data_set]

     fit_lines = pn.stat_smooth(
         mapping=pn.aes('measured', 'prediction', color=by_str),
         method='lm',
         geom='line',
         alpha=0.5,
         se=False,
         inherit_aes=False)
     pl = pn.ggplot(pn.aes('measured', 'prediction', color=by_str), pDat)
     pl = pl + pn.geom_point(alpha=alpha) + fit_lines

     few_samples = len(pDat['sample'].unique()) < 10
     shape_column = 'sample' if few_samples else 'dataset_name'
     pl = pl + pn.aes(shape=shape_column)

     if log is True:
         pl = pl + pn.scale_x_log10() + pn.scale_y_log10()

     if title is not None:
         heading = title
     elif tag is not None:
         heading = '{} pearson={}'.format(tag, gcorr)
     else:
         heading = 'pearson={}'.format(gcorr)
     return pl + pn.ggtitle(heading)
Esempio n. 19
0
 def _scale_x(self):
     """Return a newly created base-10 logarithmic scale for the x axis."""
     x_scale = scale_x_log10()
     return x_scale
Esempio n. 20
0
                                   x['k'],
                                   x['resubAccuracy'],
                                   x['testAccuracy'])
                                  for x in repeatedKnnResults],
                                 columns = ['p',
                                            'k',
                                            'resubAccuracy',
                                            'testAccuracy'])

# Stack the resubstitution and test accuracies into one long-format frame;
# the 'type' column records which of the two each row came from.
ggdata = pd.concat(
    [
        DataFrame({'p' : knnResultsSimplified.p,
                   'k' : knnResultsSimplified.k.apply(int),
                   'type' : kind,
                   'Accuracy' : getattr(knnResultsSimplified, column)})
        for kind, column in (('resub', 'resubAccuracy'),
                             ('test', 'testAccuracy'))
    ],
    axis = 0
)

plt.close()
# Accuracy against p on a log10 axis: one panel per k, with the two accuracy
# types told apart by colour and line type.
ggo = (
    gg.ggplot(ggdata, gg.aes(x='p', y='Accuracy',
                             color='type', group='type', linetype='type'))
    + gg.facet_wrap('~ k')
    + gg.scale_x_log10()
    + gg.geom_point(alpha=0.6)
    + gg.stat_smooth()
    + gg.theme_bw()
)
print(ggo)
Esempio n. 21
0
def plot_violinbox_plots_per_category(
        dataframe: pandas.DataFrame,
        plot_type: str,
        target_feature: str,
        label_column: str,
        colors: List[str],
        coloring_style: str,
        value_skip_list: List = [],
        jitter_alpha: float = 0.7,
        plot_alpha: float = 0.5,
        log_10_scale: bool = False,
        theme: str = 'gray',
        save_to_file: str = None,
        dpi: int = 150,
        show: bool = True
) -> p9.ggplot:
    """
    Draw a box or violin plot (with jittered points) of one feature per label.

    Parameters
    ----------
    dataframe: `pandas.DataFrame`, required
        Frame containing the label column and the feature column whose
        distribution is to be shown. It is not modified.
    plot_type: `str`, required
        Either `box` or `violin`.
    target_feature: `str`, required
        Name of the feature column (mapped to the y axis).
    label_column: `str`, required
        Name of the label column; it is mapped to the x axis as a factor.
    colors: `List[str]`, required
        For `manual` coloring, one color per distinct label; for `gradient`
        coloring, exactly two colors `[low, high]`.
    coloring_style: `str`, required
        Either `manual` or `gradient`.
    value_skip_list: `List`, optional (default=[])
        Feature values to leave out of the plot (for example sentinel values
        such as -10000000). The list is only read, never modified.
    jitter_alpha: `float`, optional (default=0.7)
        Transparency of the jittered points.
    plot_alpha: `float`, optional (default=0.5)
        Transparency of the box / violin layer.
    log_10_scale: `bool`, optional (default=False)
        If True, the feature (y) axis uses a base-10 logarithmic scale.
    theme: `str`, optional (default='gray')
        One of ``['gray', 'dark', 'seaborn', 'light']``.
    save_to_file: `str`, optional (default=None)
        If given, the plot is saved to this file path.
    dpi: `int`, optional (default=150)
        Resolution used when saving.
    show: `bool`, optional (default=True)
        If True, the plot is drawn before returning.

    Returns
    ----------
    The assembled `p9.ggplot` object.

    Raises
    ----------
    ValueError
        If `coloring_style` is neither `manual` nor `gradient`.
    Exception
        If `plot_type` or `theme` is not one of the supported values.
    """
    if coloring_style not in ('manual', 'gradient'):
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("invalid coloring style, it must be manual or gradient.")

    # Always bind `df`: the original left it undefined when nothing was
    # skipped, and then plotted the unfiltered frame anyway.
    if len(value_skip_list) > 0:
        df = dataframe[~dataframe[target_feature].isin(value_skip_list)]
    else:
        df = dataframe

    label_factor = 'factor(' + label_column + ')'

    if coloring_style == 'gradient':
        assert len(colors) == 2, (
            "you have chosen gradient style coloring, for colors you have to provide a list with the "
            "first element being the color for low and the second the color for high.")
        pplot = p9.ggplot(data=df, mapping=p9.aes(x=label_factor, y=target_feature, color=label_column))
        pplot += p9.scale_color_gradient(low=colors[0], high=colors[1])
    else:
        assert len(colors) == len(df[label_column].unique()), "You have chosen per category manual coloring, therefore you have to provide the same number of colors"
        pplot = p9.ggplot(data=df, mapping=p9.aes(x=label_factor, y=target_feature, color=label_factor))
        # The colors belong on the color scale; they were previously handed
        # to the alpha scale, which left the color mapping at its defaults.
        pplot += p9.scale_color_manual(values=colors)

    pplot += p9.geom_jitter(alpha=jitter_alpha)

    if plot_type == 'box':
        pplot += p9.geom_boxplot(alpha=plot_alpha)
    elif plot_type == 'violin':
        pplot += p9.geom_violin(alpha=plot_alpha)
    else:
        raise Exception('unknown plot type, it must be violin or box.')

    if theme == 'gray':
        pplot += p9.theme_gray()
    elif theme == 'dark':
        pplot += p9.theme_dark()
    elif theme == 'seaborn':
        pplot += p9.theme_seaborn()
    elif theme == 'light':
        pplot += p9.theme_light()
    else:
        raise Exception('Theme type not supported, please add.')

    if log_10_scale:
        # x is a discrete factor, so the logarithm applies to the feature axis.
        pplot += p9.scale_y_log10()

    if save_to_file is not None:
        save_directory, filename = separate_path_and_file(filepath=save_to_file)
        pplot.save(filename=filename, path=save_directory, dpi=dpi)

    if show:
        pplot.draw()

    return pplot
    df['feature_set'] = model
    cv_results_df = cv_results_df.append(df)
    
# Best mean CV test score for every (alpha, feature set) combination.
best_scores = cv_results_df.groupby(
    ['classify__alpha', 'feature_set'])['mean_test_score'].max()
cv_results_summary = best_scores.reset_index()


# In[17]:

# Score against regularisation strength on a log10 x axis, one colour per
# feature set.  ymin is 0.5, or the lowest observed score when that is lower.
(
    gg.ggplot(
        cv_results_summary,
        gg.aes(x='classify__alpha', y='mean_test_score', color='feature_set'),
    )
    + gg.geom_jitter(size=4, alpha=0.8, height=0, width=0.05)
    + gg.scale_x_log10()
    + gg.labs(
        x='Regularization strength multiplier (log alpha)',
        y='CV AUROC',
    )
    + gg.guides(fill=gg.guide_legend(title="Feature Set"))
    + gg.aes(
        ymin=min([0.5, cv_results_summary['mean_test_score'].min()]),
        ymax=1,
    )
    + theme_cognoma()
)


# ## Use optimal hyperparameters to output ROC curve

# In[18]:

y_pred_dict = {
    model: {
        'train': pipeline.decision_function(X_train),
Esempio n. 23
0
File: log.py Progetto: wjurayj/ergo
 def _scale_x(self, xmin: float = None, xmax: float = None):
     """Return a base-10 log x scale whose limits are ``(xmin, xmax)``."""
     limits = (xmin, xmax)
     return scale_x_log10(limits=limits)
        pd.DataFrame(pipeline.cv_results_),
        pd.DataFrame.from_records(pipeline.cv_results_['params'])
    ],
                   axis='columns')
    df['feature_set'] = model
    cv_results_df = cv_results_df.append(df)

# Keep the best mean CV test score per (alpha, feature set) combination.
cv_results_summary = (
    cv_results_df
    .groupby(['classify__alpha', 'feature_set'])['mean_test_score']
    .max()
    .reset_index()
)

# In[17]:

# Jittered scores against alpha (log10 x axis), coloured by feature set.
# ymin is 0.5, or the lowest observed score when that is lower; ymax is 1.
(
    gg.ggplot(cv_results_summary,
              gg.aes(x='classify__alpha',
                     y='mean_test_score',
                     color='feature_set'))
    + gg.geom_jitter(size=4, alpha=0.8, height=0, width=0.05)
    + gg.scale_x_log10()
    + gg.labs(x='Regularization strength multiplier (log alpha)',
              y='CV AUROC')
    + gg.guides(fill=gg.guide_legend(title="Feature Set"))
    + gg.aes(ymin=min([0.5, cv_results_summary['mean_test_score'].min()]),
             ymax=1)
    + theme_cognoma()
)

# ## Use optimal hyperparameters to output ROC curve

# In[18]:

y_pred_dict = {
    model: {
        'train': pipeline.decision_function(X_train),
        'test': pipeline.decision_function(X_test)
    }
    for model, pipeline in cv_pipelines.items()
Esempio n. 25
0
# Copy one gene's values (symbol "T") from `adata` into per-cell columns of
# adata.obs, then save histograms of those columns as PDFs.
g = 'T'
symbol_rows = adata.var[adata.var.SYMBOL == "T"]
ens = symbol_rows.ENSEMBL[0]

# First column of the gene's slice of X and of two named layers.
adata.obs['T_counts'] = adata[:, ens].X.toarray()[:, 0]

logcounts_layer = adata[:, ens].layers.get("logcounts")
adata.obs['T_logcounts'] = logcounts_layer.toarray()[:, 0]

smoothed_layer = adata[:, ens].layers.get("scvi_normalised")
adata.obs['T_smoothed'] = smoothed_layer.toarray()[:, 0]
adata.obs['T_smoothed_lc'] = np.log2(adata.obs['T_smoothed'] + 1)

# Bar chart of how often each count value occurs (0 < counts < 25).
count_hist = (
    adata.obs[f'{g}_counts']
    .value_counts()
    .reset_index()
    .rename(columns={'index': 'counts'})
)
p.options.figure_size = (6, 2)
plot_ = (
    p.ggplot(p.aes(x='counts', y=f'{g}_counts'),
             count_hist.query('0 < counts < 25'))
    + p.geom_bar(stat='identity')
    + p.scale_x_log10()
    + p.theme_minimal()
    + p.labs(x=f'{g} UMI counts', y='Number cells')
)
plot_.save('mgast_T_counts.pdf', verbose=False)

# Histogram of the distinct logcount values (0 < logcounts < 25).
count_hist = (
    adata.obs[f'{g}_logcounts']
    .value_counts()
    .reset_index()
    .rename(columns={'index': 'logcounts'})
)
p.options.figure_size = (6, 2)
plot_ = (
    p.ggplot(p.aes(x='logcounts'), count_hist.query('0 < logcounts < 25'))
    + p.geom_histogram(bins=128, color="k", fill="w")
    + p.scale_x_log10()
    + p.theme_minimal()
    + p.labs(x=f'{g} UMI logcounts', y='Number cells')
)
plot_.save('mgast_T_logcounts.pdf', verbose=False)

# Value counts of the smoothed log2 column, prepared for the next plot.
count_hist = (
    adata.obs[f'{g}_smoothed_lc']
    .value_counts()
    .reset_index()
    .rename(columns={'index': 'smoothed_lc'})
)
p.options.figure_size = (6, 2)
Esempio n. 26
0
def plot_2d_distribution_per_category(
        dataframe: pandas.DataFrame,
        label_column: str,
        coordinates: Tuple[str, str],
        colors: List[str],
        coloring_style: str = 'manual',
        log_10_scale: bool = False,
        theme: str = 'gray',
        alpha: float = 0.5,
        save_to_file: str = None,
        dpi: int = 150
) -> p9.ggplot:
    """
    Draw a 2-dimensional scatter plot of the data, colored by label.

    Parameters
    ----------
    dataframe: `pandas.DataFrame`, required
        Frame containing the label column and the two coordinate columns.
    label_column: `str`, required
        Name of the label column used for coloring.
    coordinates: `Tuple[str, str]`, required
        Column names for the plot: the first holds the `x` values, the
        second the `y` values. They are also used as the axis labels.
    colors: `List[str]`, required
        For `manual` coloring, one color per distinct label; for `gradient`
        coloring, exactly two colors `[low, high]`.
    coloring_style: `str`, optional (default='manual')
        Either `manual` or `gradient`.
    log_10_scale: `bool`, optional (default=False)
        If True, the x axis uses a base-10 logarithmic scale.
    theme: `str`, optional (default='gray')
        One of ``['gray', 'dark', 'seaborn', 'light']``.
    alpha: `float`, optional (default=0.5)
        Transparency of the points.
    save_to_file: `str`, optional (default=None)
        If given, the plot is saved to this file path; otherwise it is drawn.
    dpi: `int`, optional (default=150)
        Resolution used when saving.

    Returns
    ----------
    The assembled `p9.ggplot` object.

    Raises
    ----------
    AssertionError
        If `coloring_style` is invalid or `colors` has the wrong length.
    Exception
        If `theme` is not one of the supported values.
    """
    assert coloring_style in ['manual', 'gradient'], "invalid coloring style"

    x_column, y_column = coordinates[0], coordinates[1]

    if coloring_style == 'gradient':
        assert len(colors) == 2, (
            "you have chosen gradient style coloring, for colors you have to provide a list with the "
            "first element being the color for low and the second the color for high.")
        pplot = p9.ggplot(data=dataframe, mapping=p9.aes(x=x_column, y=y_column, color=label_column))
        pplot += p9.scale_color_gradient(low=colors[0], high=colors[1])
    elif coloring_style == 'manual':
        assert len(colors) == len(dataframe[label_column].unique()), "You have chosen per category manual coloring, therefore you have to provide the same number of colors"
        pplot = p9.ggplot(data=dataframe, mapping=p9.aes(x=x_column, y=y_column, color='factor(' + label_column + ')'))
        # The colors belong on the color scale; they were previously handed
        # to the alpha scale, which left the color mapping at its defaults.
        pplot += p9.scale_color_manual(values=colors)

    pplot += p9.geom_point(alpha=alpha)
    pplot += p9.xlab(x_column) + p9.ylab(y_column)

    if log_10_scale:
        pplot += p9.scale_x_log10()

    if theme == 'gray':
        pplot += p9.theme_gray()
    elif theme == 'dark':
        pplot += p9.theme_dark()
    elif theme == 'seaborn':
        pplot += p9.theme_seaborn()
    elif theme == 'light':
        pplot += p9.theme_light()
    else:
        raise Exception('Theme type not supported, please add.')

    pplot += p9.theme(text=p9.element_text(size=8))

    if save_to_file is not None:
        save_directory, filename = separate_path_and_file(filepath=save_to_file)
        pplot.save(filename=filename, path=save_directory, dpi=dpi)
    else:
        pplot.draw()

    return pplot
Esempio n. 27
0
def rel_plot(sbs, variant, jitter=0.01):
    """Scatter ``ratio`` against ``base`` for one variant on log-log axes.

    Rows of ``sbs`` whose ``variant`` column equals ``variant`` are drawn as
    jittered points, filled and colored by ``dataset``.  On top of the points
    the plot carries the curve ``y = 1 / x`` sampled at five geometrically
    spaced x values between 0.02 and 1, dashed horizontal guides at
    y = 1.1 and y = 0.9, and dashed vertical guides at x = 0.1 and x = 0.3.

    Parameters
    ----------
    sbs : DataFrame-like
        Must provide the columns ``variant``, ``base``, ``ratio``,
        ``dataset``, ``session_index`` and ``base_session_index``.
    variant
        Value compared against ``sbs.variant`` to select the plotted rows.
    jitter : float, optional (default=0.01)
        Passed as both ``width`` and ``height`` to ``geom_jitter``.

    Returns
    -------
    The assembled plot object; nothing is drawn or saved here.
    """
    xcol, ycol = "base", "ratio"

    # Restrict to the requested variant and expose the plotted columns under
    # the generic names "x" and "y"; the original row index is kept as a
    # regular column as well.
    selected = sbs[sbs.variant == variant]
    selected = selected.assign(
        x=selected[xcol],
        y=selected[ycol],
        sbs_index=selected.index.values,
    )

    def describe_pair(pair):
        return f"{pair[0]} vs. {pair[1]}"

    # One "<session> vs. <base session>" label per row.
    session_pairs = selected[["session_index", "base_session_index"]].apply(
        tuple, axis=1)
    selected = selected.assign(session_text=session_pairs.map(describe_pair))

    # Reference curve y = 1 / x.
    curve_x = np.geomspace(0.02, 1, num=5)
    reference_curve = pd.DataFrame({"x": curve_x, "y": 1 / curve_x})

    # Styling shared by all four guide lines.
    dashed = {"linetype": "dashed", "alpha": 0.7}
    tick_text_margin = {"t": 0.2, "l": -0.3}
    axis_title_margin = {"r": -0.2, "b": 0.0, "l": 0, "t": 0.0}

    # Components are added strictly in this order, so layers stack the same
    # way as in a single chained `+` expression.
    components = [
        geom_jitter(
            aes(x="x", y="y", fill="dataset", color="dataset"),
            width=jitter,
            height=jitter,
            alpha=0.6,
            size=1.0,
        ),
        geom_line(aes(x="x", y="y"), data=reference_curve),
        ylab(ycol),
        geom_hline(aes(yintercept=1.1), **dashed),
        geom_hline(aes(yintercept=0.9), **dashed),
        geom_vline(aes(xintercept=0.1), **dashed),
        geom_vline(aes(xintercept=0.3), **dashed),
        xlab(xcol),
        theme(
            figure_size=(8, 5),
            legend_position="top",
            subplots_adjust={"hspace": 0.5},
            legend_title=element_blank(),
            legend_box_margin=-1,
            legend_margin=0.0,
            axis_text=element_text(size=12, margin=tick_text_margin),
            legend_text=element_text(size=11),
            axis_title=element_text(size=12, margin=axis_title_margin),
        ),
        scale_x_log10(labels=make_labeler(brief_format),
                      breaks=[0.01, 0.1, 0.3, 1.0]),
        scale_y_log10(labels=make_labeler(brief_format),
                      breaks=[0.5, 0.9, 1.1, 2.0, 3.0, 6, 12]),
    ]

    scatterplot = ggplot(selected)
    for component in components:
        scatterplot = scatterplot + component

    return scatterplot
Esempio n. 28
0
    # Read the package-level features through PackageReader.
    package_reader = PackageReader()
    package_features = package_reader.read_features()

    print("packages loaded!")

    # Read the class-level features through ClassReader.
    class_reader = ClassReader()
    class_features = class_reader.read_features()

    print("classes loaded!")

    # Histogram of `loc_sum` over `project_features` (100 bins, log10 x axis),
    # with one facet row per value of `is_pattern_project`, written straight
    # to projects_histogram.png.
    # NOTE(review): `project_features` is created before this excerpt.
    (
        ggplot(project_features, aes('loc_sum'))
        + geom_histogram(bins=100)
        + facet_grid('is_pattern_project ~ .')
        + scale_x_log10(breaks=[1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])
        # Relabel each y tick value v as v * 100 / len(project_features),
        # formatted with two decimals and a percent sign.
        + scale_y_continuous(labels=lambda l: ["%.2f%%" % (v * 100 / len(project_features)) for v in l])
        + xlab("Lines of Code")
        + ylab("Percent of Projects")
    ).save('projects_histogram.png')

    print("projects drawn!")

    (
        ggplot(package_features, aes('loc_sum'))
        + geom_histogram(bins=100)
        + facet_grid('is_pattern_pakkage ~ .')
        + scale_x_log10(breaks=[1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])
        + scale_y_continuous(labels=lambda l: ["%.2f%%" % (v * 100 / len(package_features)) for v in l])
        + xlab("Lines of Code")
        + ylab("Percent of Packages")
Esempio n. 29
0
    # Save both result tables as tab-separated files. The output path is built
    # by plain string concatenation with FIGURE_DIRECTORY.
    # NOTE(review): this presumably requires FIGURE_DIRECTORY to end with a
    # path separator -- confirm where it is defined.
    res.to_csv(FIGURE_DIRECTORY + "crm_res.tsv", sep='\t')
    peakstats.to_csv(FIGURE_DIRECTORY + "peakstats.tsv", sep='\t')
    """
    # To reload :
    res = pd.read_csv(FIGURE_DIRECTORY+"crm_res.tsv", sep = '\t')
    peakstats = pd.read_csv(FIGURE_DIRECTORY+"peakstats.tsv", sep = '\t')
    """

    ## --------- For the figures

    # Figure 1: nb_peaks_2020 (x) against nb_peaks_2018 (y) on log10 axes for
    # the first 10000 rows of `res`; points are colored by
    # average_atypeak_score on a red-to-blue gradient.
    p = (ggplot(data=res[0:10000],
                mapping=aes(x='nb_peaks_2020', y='nb_peaks_2018')) +
         geom_point(mapping=aes(color='average_atypeak_score')) +
         scale_x_log10() + scale_y_log10() +
         labs(x="Nb. peaks Remap 2020",
              y="Nb. peaks Remap 2018",
              color="Mean atyPeak score per CRM") +
         scale_color_gradient(low="red", high="blue"))
    p.save(FIGURE_DIRECTORY + "crm_nb_peaks_update.pdf", verbose=False)

    # Figure 2: nb_peaks_2018 (x) against update_ratio (y) on log10 axes, with
    # the same color mapping, drawn from rows 10000 up to (not including)
    # 13000 of `res`.
    # NOTE(review): the two figures use different, non-overlapping row slices
    # of `res` -- confirm this is intentional.
    p = (ggplot(data=res[10000:13000],
                mapping=aes(x='nb_peaks_2018', y='update_ratio')) +
         geom_point(mapping=aes(color='average_atypeak_score')) +
         scale_x_log10() + scale_y_log10() +
         labs(x="Nb. peaks Remap 2018",
              y="Nb peaks ReMap 2020/2018",
              color="Mean atyPeak score per CRM") +
         scale_color_gradient(low="red", high="blue"))
    p.save(FIGURE_DIRECTORY + "crm_update_ratio.pdf", verbose=False)