Beispiel #1
0
def plot_pointplot(plot_df, y_axis_label="", use_log10=False, limits=(0, 3.2)):
    """
    Plots the pointplot
    Arguments:
        plot_df - the dataframe that contains the odds ratio and lemmas; the
            columns lemma, odds_ratio, lower_odds and upper_odds are read
        y_axis_label - the label for the y axis
        use_log10 - use log10 for the y axis?
        limits - limits handed to the continuous y scale; only used when
            use_log10 is False
    Returns:
        the assembled plotnine plot object
    """
    # Lemmas are laid out along the discrete axis by ascending odds ratio.
    lemma_order = plot_df.sort_values(
        "odds_ratio", ascending=True).lemma.tolist()

    # The limits only apply to the linear scale; the log scale is left free.
    if use_log10:
        y_scale = p9.scale_y_log10()
    else:
        y_scale = p9.scale_y_continuous(limits=limits)

    graph = (
        p9.ggplot(plot_df, p9.aes(x="lemma", y="odds_ratio")) +
        p9.geom_pointrange(p9.aes(ymin="lower_odds", ymax="upper_odds"),
                           position=p9.position_dodge(width=1),
                           size=0.3,
                           color="#253494") +
        p9.scale_x_discrete(limits=lemma_order) +
        y_scale +
        # Dashed reference line at an odds ratio of 1.
        p9.geom_hline(p9.aes(yintercept=1), linetype='--', color='grey') +
        p9.coord_flip() + p9.theme_seaborn(
            context='paper', style="ticks", font_scale=1, font='Arial') +
        p9.theme(
            # 640 x 480
            figure_size=(6.66, 5),
            panel_grid_minor=p9.element_blank(),
            axis_title=p9.element_text(size=12),
            axis_text_x=p9.element_text(size=10)) +
        p9.labs(x=None, y=y_axis_label))
    return graph
Beispiel #2
0
def test_annotation_logticks_coord_flip():
    # Log ticks on side 'b' of a log-log scatter whose coordinates are flipped.
    # The grid lines are coloured so they can be told apart from the ticks.
    grid_theme = theme(panel_grid_minor=element_line(color='green'),
                       panel_grid_major=element_line(color='red'))
    ticks = annotation_logticks(sides='b', size=.75)

    p = ggplot(df, aes('x', 'x'))
    p = p + ticks + geom_point()
    p = p + scale_x_log10() + scale_y_log10() + coord_flip()
    p = p + grid_theme

    # The comparison against a name is presumably resolved by the test
    # harness (e.g. to a stored reference) -- confirm.
    assert p == 'annotation_logticks_coord_flip'
Beispiel #3
0
def plot_company_versus_sector(
        df: pd.DataFrame,
        stock: str,
        sector: str  # pylint: disable=unused-argument
) -> p9.ggplot:
    """Draw one line per ``group`` of ``value`` against ``date``.

    Prints a notice and returns ``None`` when ``df`` is ``None`` or has no
    rows. Otherwise the ``date`` column of ``df`` is converted with
    ``pd.to_datetime`` (written back into the passed dataframe) and the plot
    is returned after being passed through ``user_theme``.
    """
    has_no_rows = df is None or len(df) < 1
    if has_no_rows:
        print("No data for stock vs. sector plot... ignored")
        return None

    # NOTE: this assignment modifies the dataframe handed in by the caller.
    df["date"] = pd.to_datetime(df["date"])

    mapping = p9.aes("date",
                     "value",
                     group="group",
                     color="group",
                     fill="group")
    plot = p9.ggplot(df, mapping) + p9.geom_line(size=1.5)

    # When the values span more than 100 units, switch the y axis to a pseudo
    # log transform; a plain log scale is not usable because negative values
    # (stocks losing money) may be involved.
    value_span = max(df['value']) - min(df['value'])
    if value_span > 100.0:
        plot = plot + p9.scale_y_log10(trans=transforms.pseudo_log_trans)

    return user_theme(
        plot,
        y_axis_label="Change since start (%)",
        subplots_adjust={"right": 0.8},
        legend_position="right",
    )
def scatter_plot(df,
                 xcol,
                 ycol,
                 domain,
                 xname=None,
                 yname=None,
                 log=False,
                 width=6,
                 height=6,
                 clamp=True,
                 tickCount=5):
    """Build a scatter plot of column ``ycol`` against column ``xcol``.

    Arguments:
        df - dataframe holding the two columns
        xcol, ycol - names of the columns mapped to the x and y axes
        domain - two-element sequence used as limits of both axes; its second
            element is also where the vertical and horizontal rules are drawn
        xname, yname - axis labels; default to the column names
        log - use log10 scales on both axes instead of linear ones
        width, height - figure size
        clamp - if true, values above ``domain[1]`` are set to ``domain[1]``
            on a copy of ``df`` (values below ``domain[0]`` are not touched)
        tickCount - currently unused

    Returns:
        the assembled plotnine plot object
    """
    assert len(domain) == 2

    POINT_SIZE = 0.5
    DASH_PATTERN = (0, (3, 1))

    # Identity checks: ``== None`` would be evaluated element-wise if a
    # label were ever an array-like object.
    if xname is None:
        xname = xcol
    if yname is None:
        yname = ycol

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        # Work on a deep copy so the caller's dataframe stays untouched.
        df = df.copy(deep=True)
        df.loc[df[xcol] > domain[1], xcol] = domain[1]
        df.loc[df[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True)
    scatter += p9.labs(x=xname, y=yname)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    scatter += p9.theme_bw()
    scatter += p9.theme(
        panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(figure_size=(width, height))

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1,
                              linetype=DASH_PATTERN)  # diagonal
    scatter += p9.geom_vline(xintercept=domain[1],
                             linetype=DASH_PATTERN)  # vertical rule
    scatter += p9.geom_hline(yintercept=domain[1],
                             linetype=DASH_PATTERN)  # horizontal rule

    return scatter
Beispiel #5
0
def test_annotation_logticks():
    # The grid should align with the logticks; the grid lines are coloured
    # (green minor, red major) so the two can be compared.
    coloured_grid = theme(panel_grid_minor=element_line(color='green'),
                          panel_grid_major=element_line(color='red'))
    log_scales = scale_x_log10() + scale_y_log10() if False else None

    p = ggplot(df, aes('x', 'x'))
    p = p + annotation_logticks(sides='b', size=.75)
    p = p + geom_point()
    p = p + scale_x_log10()
    p = p + scale_y_log10()
    p = p + coloured_grid

    assert p == 'annotation_logticks'
Beispiel #6
0
def multiplot(files, smooth=100, alpha=0.6, loss_padd=None):
    """Plot the loss histories of several models on a log10 y axis.

    Arguments:
        files - dict mapping a model name to a file readable by
            ``np.loadtxt``; a single non-dict value is treated as one file
            whose name is its string representation
        smooth - sigma of the Gaussian filter used for the smoothed curve, or
            ``False`` to skip smoothing
        alpha - transparency of the drawn lines
        loss_padd - if given and a history contains negative values, the
            history is shifted so that its minimum equals ``loss_padd``

    Returns:
        ``(plot, df)`` - the plotnine plot and the dataframe with the columns
        ``loss``, ``iteration``, ``model`` and (when smoothing) ``sloss``.
        ``df`` is empty when no history could be loaded.
    """
    if not isinstance(files, dict):
        # A bare file has no model name; previously it was wrapped in a list,
        # which then failed on ``.items()`` below.
        files = {str(files): files}

    def load_hist(entry):
        """Load one history; return ``(name, dataframe)`` or ``(name, None)``."""
        name, file = entry
        try:
            hist = np.loadtxt(file)
        except OSError:
            warn = "{} could not be loaded with np.loadtxt({})."
            warnings.warn(warn.format(name, file), UserWarning)
            return name, None
        is_fine = np.isfinite(hist)
        if not is_fine.any():
            return name, None
        # Keep only finite entries and remember their original positions.
        iters = np.where(is_fine)[0]
        hist = hist[is_fine]
        lb = min(hist)
        if loss_padd is not None and lb < 0:
            hist += loss_padd - lb
            lb = loss_padd
        ldf = pd.DataFrame({
            "loss": hist,
            "iteration": iters,
            "model": [name] * len(hist)
        })
        if smooth is not False:
            if lb > 0:
                # Strictly positive losses are smoothed in log space.
                ldf["sloss"] = np.exp(
                    gaussian_filter1d(np.log(hist), sigma=smooth))
            else:
                ldf["sloss"] = gaussian_filter1d(hist, sigma=smooth)
        return name, ldf

    tasks = list(files.items())
    frames = []
    # NOTE(review): load_hist is a local closure, which the standard-library
    # multiprocessing pool cannot pickle -- presumably ``mp`` is a dill-based
    # drop-in such as ``multiprocess``; confirm.
    with mp.Pool() as pool:
        for _name, ldf in tqdm(pool.imap(load_hist, tasks),
                               total=len(tasks),
                               desc="models"):
            if ldf is not None:
                frames.append(ldf)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    df = pd.concat(frames) if frames else pd.DataFrame()

    pl = (pn.ggplot(pn.aes("iteration", "loss", color="model"), df) +
          pn.geom_line(alpha=alpha) + pn.scale_y_log10() + pn.theme_minimal())
    if smooth is not False:
        pl += pn.geom_line(pn.aes(y="sloss"), size=1, alpha=alpha)
    return pl, df
Beispiel #7
0
def test_annotation_logticks_coord_flip_discrete():
    # Log ticks on side 'l' with a categorical variable on the other axis
    # and flipped coordinates.
    categories = pd.Categorical(['A' + str(a) for a in df['x']])
    df2 = df.assign(discrete=categories)

    coloured_grid = theme(panel_grid_minor=element_line(color='green'),
                          panel_grid_major=element_line(color='red'))

    p = ggplot(df2, aes('discrete', 'x'))
    p = p + annotation_logticks(sides='l', size=.75)
    p = p + geom_point()
    p = p + scale_y_log10()
    p = p + coord_flip()
    p = p + coloured_grid

    assert p == 'annotation_logticks_coord_flip_discrete'
Beispiel #8
0
def plot_violin_plots(
    par_id: str,
    dims: List[str],
    draws: Dict,
    log_scale_variables: List[str],
    units: Dict[str, str],
    confidence_intervals,
    measurements,
):
    """Build a violin plot of the parsed distribution of one parameter.

    :param par_id: Name of the parameter plotted
    :param dims: Dimensions of the parameter
    :param draws: pd.Dataframe of parameter distribution
    indexed by dimensions and contains the population samples
    :param log_scale_variables: Parameters that are log-distributed
    :param units: Dictionary of units for each parameter
    :param confidence_intervals: Mapping from parameter name to data with
    the columns ``lower_ci`` and ``upper_ci``, drawn as error bars
    :param measurements: Mapping from parameter name to data with a
    ``measurement`` column, drawn as points when non-empty
    :return: the assembled plotnine plot (nothing is written to disk here)
    """
    par_units = units[par_id]
    # With more than one dimension the x axis shows "experiments" and the
    # second dimension is used for faceting further down.
    x = fill = dims[0] if len(dims) <= 1 else "experiments"
    plot = (p9.ggplot(data=draws) + p9.geom_violin(
        p9.aes(y=f"{par_id}", x=x, fill=fill),
        position="identity",
        color="None",
        size=0.5,
        alpha=0.7,
        weight=0.7,
        linetype="None",
    ) + p9.labels.ylab(f"{par_id} {par_units}"))
    if par_id in confidence_intervals:
        plot += p9.geoms.geom_errorbar(
            p9.aes(x=x, ymin="lower_ci", ymax="upper_ci"),
            data=confidence_intervals[par_id],
            width=0.1,
        )
    if par_id in measurements and len(measurements[par_id]) > 0:
        plot += p9.geoms.geom_point(
            p9.aes(y="measurement", x=x),
            data=measurements[par_id],
        )
    if len(dims) == 1:
        plot += p9.themes.theme(axis_text_x=p9.element_text(angle=70), )
    if len(dims) > 1:
        # The facet and the theme are added in two steps on purpose:
        # ``plot += facet + theme`` evaluates ``facet + theme`` first, so the
        # theme would be attached to the facet object instead of the plot.
        plot += p9.facet_wrap(f"~{dims[1]}")
        plot += p9.themes.theme(
            panel_spacing_y=0.05,
            panel_spacing_x=0.35,
            axis_title=p9.element_text(size=10),
            axis_text=p9.element_text(size=11),
            axis_text_y=p9.element_text(size=8, angle=45),
            axis_title_x=p9.element_blank(),
            axis_text_x=p9.element_blank(),
        )
    if par_id in log_scale_variables:
        plot += p9.scale_y_log10()

    return plot
def scatter_plot2(df1,
                  df2,
                  xcol,
                  ycol,
                  domain,
                  color1='black',
                  color2='red',
                  xname=None,
                  yname=None,
                  log=False,
                  width=6,
                  height=6,
                  clamp=True,
                  tickCount=5):
    """Overlay the scatter plots of two dataframes on shared axes.

    Both dataframes are read through the columns ``xcol`` and ``ycol``;
    ``df1`` is drawn in ``color1`` and ``df2`` in ``color2``, each with
    rug marks on the top and right sides. ``domain`` (two elements) gives the
    limits of both axes. With ``clamp`` set, values above ``domain[1]`` are
    capped on deep copies of the inputs. ``tickCount`` is not used.
    """
    assert len(domain) == 2

    POINT_SIZE = 1.5
    DASH_PATTERN = (0, (6, 2))
    upper = domain[1]

    xname = xcol if xname is None else xname
    yname = ycol if yname is None else yname

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    def capped(frame):
        # Cap overflowing values on a deep copy of the frame.
        frame = frame.copy(deep=True)
        for column in (xcol, ycol):
            frame.loc[frame[column] > upper, column] = upper
        return frame

    if clamp:
        df1 = capped(df1)
        df2 = capped(df2)

    if log:
        make_x_scale, make_y_scale = p9.scale_x_log10, p9.scale_y_log10
    else:
        make_x_scale = p9.scale_x_continuous
        make_y_scale = p9.scale_y_continuous

    scatter = p9.ggplot(df1)
    components = (
        p9.aes(x=xcol, y=ycol),
        # points of both dataframes
        p9.geom_point(size=POINT_SIZE, na_rm=True, color=color1, alpha=0.5),
        p9.geom_point(size=POINT_SIZE,
                      na_rm=True,
                      data=df2,
                      color=color2,
                      alpha=0.5),
        p9.labs(x=xname, y=yname),
        # rug plots
        p9.geom_rug(na_rm=True, sides="tr", color=color1, alpha=0.05),
        p9.geom_rug(na_rm=True,
                    sides="tr",
                    data=df2,
                    color=color2,
                    alpha=0.05),
        # axis scales
        make_x_scale(limits=domain, labels=ax_formatter),
        make_y_scale(limits=domain, labels=ax_formatter),
        # theming
        p9.theme_bw(),
        p9.theme(
            panel_grid_major=p9.element_line(color='#666666', alpha=0.5)),
        p9.theme(panel_grid_minor=p9.element_blank()),
        p9.theme(figure_size=(width, height)),
        p9.theme(text=p9.element_text(size=24, color="black")),
        # diagonal, vertical rule and horizontal rule
        p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN),
        p9.geom_vline(xintercept=upper, linetype=DASH_PATTERN),
        p9.geom_hline(yintercept=upper, linetype=DASH_PATTERN),
    )
    for component in components:
        scatter += component

    return scatter
Beispiel #10
0
def plot_scaling_log(plt_df: pd.DataFrame,
                     sweep_vars: Optional[Sequence[str]] = None,
                     with_baseline=True) -> gg.ggplot:
    """Plot scaling of learning time on log-log axes.

    Starts from the plot returned by ``_base_scaling``, switches both axes to
    log10 with fixed breaks, labels them, and returns the result of
    ``plotting.facet_sweep_plot``.
    """
    extras = (
        gg.scale_x_log10(breaks=[5, 10, 20, 50]),
        gg.scale_y_log10(breaks=[100, 300, 1000, 3000, 10000, 30000]),
        gg.xlab('deep sea problem size (log scale)'),
        gg.ylab('#episodes until < 90% bad episodes (log scale)'),
    )
    p = _base_scaling(plt_df, sweep_vars, with_baseline)
    for extra in extras:
        p += extra
    return plotting.facet_sweep_plot(p, sweep_vars)
Beispiel #11
0
def plot_compare(stats,
                 variant,
                 variant_baseline,
                 metric,
                 mode="identity",
                 jitter=0.01):
    """Jitter-plot ``metric`` of a variant against its baseline.

    The comparison table comes from ``compare_stats``; a summary produced by
    ``bsw_table2`` is shown with ``display`` before plotting. ``mode`` picks
    the y axis: the metric itself ("identity"), metric / baseline ("ratio")
    or metric - baseline ("difference"). Both axes use log10 scales.
    """
    assert mode in ["identity", "ratio", "difference"]
    comparison = compare_stats(stats, variant, variant_baseline)
    bsw = bsw_table2(comparison, metric=metric, reltol=1.0)
    display(bsw)

    baseline_name = f"{metric}_baseline"
    ratio = comparison[metric] / comparison[baseline_name]
    difference = comparison[metric] - comparison[baseline_name]
    plotdata = comparison[[metric, baseline_name, "dataset"]].assign(
        ratio=ratio,
        difference=difference,
    )

    def jittered(y_column):
        # Shared part of all three modes: jittered points on log-log axes.
        points = geom_jitter(
            aes(x=baseline_name, y=y_column, fill="dataset"),
            width=jitter,
            height=jitter,
        )
        return (ggplot(data=plotdata) + points + scale_x_log10() +
                scale_y_log10())

    if mode == "identity":
        return jittered(metric) + geom_abline(aes(slope=1, intercept=0))
    if mode == "ratio":
        # ablines are drawn with respect to the already log-transformed axes,
        # hence an intercept of 0 corresponds to log(1) on the scale
        flat = geom_abline(aes(slope=0, intercept=0.0))
        falling = geom_abline(aes(slope=-1, intercept=0.0))  # max
        return jittered("ratio") + flat + falling
    if mode == "difference":
        return jittered("difference") + geom_abline(
            aes(slope=0, intercept=0))
    assert False, "unknown mode"
def test_annotation_logticks_faceting():
    # Log ticks on side 'b' when the plot is split into two facets; the data
    # is the 'x' column repeated once for group 'a' and once for group 'b'.
    n = len(df)
    doubled_x = np.hstack([df['x'], df['x']])
    groups = ['a'] * n + ['b'] * n
    df2 = pd.DataFrame({'x': doubled_x, 'g': groups})

    coloured_grid = theme(panel_grid_minor=element_line(color='green'),
                          panel_grid_major=element_line(color='red'))

    p = ggplot(df2)
    p = p + annotation_logticks(sides='b', size=.75)
    p = p + geom_point(aes('x', 'x'))
    p = p + scale_x_log10() + scale_y_log10()
    p = p + facet_wrap('g')
    p = p + coloured_grid

    assert p == 'annotation_logticks_faceting'
Beispiel #13
0
    def plot_replicates_log_axes(self):
        """
        Draw 'Current' against 'Time' from ``self.data`` as one line per
        'Channel', with log10 scales on both axes. The plot is printed and
        then returned.

        The log axes are meant to help determine the baseline metabolic
        charge production or other stabilization.
        """

        from plotnine import (aes, geom_line, ggplot, scale_x_log10,
                              scale_y_log10, xlab, ylab)

        mapping = aes('Time', 'Current', color='Channel')
        plot = ggplot(self.data, mapping)
        plot = plot + ylab(u'Current (μA)') + xlab('Time (seconds)')
        plot = plot + geom_line()
        plot = plot + scale_y_log10() + scale_x_log10()

        print(plot)
        return plot
Beispiel #14
0
def kernel_stats(inFile, log_scale=True):
    """Scatter-plot kernel weight against kernel volume for both tissues.

    Parameters are read with ``get_params(inFile)``. The number of kernels is
    the number of entries in ``par["means"]`` whose name contains "mus_f".
    For each tissue type the weights are obtained by inverting the
    stick-breaking transform, and each kernel's volume is the determinant of
    the covariance rebuilt from its packed Cholesky factor.

    Returns ``(plot, df)`` where ``df`` has the columns ``tissue``,
    ``weight`` and ``volume`` and is indexed by "kernel <i>".
    """
    par = get_params(inFile)
    means = par["means"]

    n_kernel = sum("mus_f" in var for var in means)
    kernel_index = [f"kernel {i}" for i in range(n_kernel)]

    tf = pm.distributions.transforms.StickBreaking()
    tissue_names = {"t": "tumor", "f": "non-tumor"}

    dfs = []
    for tissue_type, tissue_name in tissue_names.items():
        weights = tf.backward(means[f"w_{tissue_type}_stickbreaking__"]).eval()
        n_dim = means[f"x_{tissue_type}"].shape[1]

        volumes = []
        for kernel in range(n_kernel):
            # rebuild the covariance from its packed lower-triangular factor
            key = f"packed_L_{tissue_type}_{kernel}_cholesky-cov-packed__"
            lower = pm.expand_packed_triangular(n_dim, means[key],
                                                lower=True).eval()
            cov = np.dot(lower, lower.T)
            volumes.append(np.linalg.det(cov))

        dfs.append(
            pd.DataFrame(
                {
                    "tissue": tissue_name,
                    "weight": weights,
                    "volume": volumes,
                },
                index=kernel_index,
            ))

    df = pd.concat(dfs)
    pl = pn.ggplot(pn.aes("volume", "weight", color="tissue"), df)
    pl = pl + pn.geom_point()
    if log_scale:
        pl += pn.scale_y_log10()
        pl += pn.scale_x_log10()
    pl += pn.theme_minimal()
    return pl, df
Beispiel #15
0
 def plot(self,
          plotDat,
          tag=None,
          log=True,
          by='cell_type',
          data_set=None,
          title=None,
          alpha=.4):
     """Scatter 'prediction' against 'measured' with per-group fit lines.

     Works on a copy of ``plotDat``. The Pearson correlation is computed
     over all rows (shown in the default title) and per value of the ``by``
     column (appended to the colour legend entries). ``data_set`` optionally
     restricts the plotted rows to one 'dataset_name'; the correlations are
     computed before that filter is applied.
     """
     pDat = plotDat.copy()

     # overall and per-group correlation coefficients
     gcorr = pearsonr(pDat.measured, pDat.prediction)[0]
     grouped = pDat.groupby(pDat[by])
     corrs = grouped.apply(
         lambda grp: pearsonr(grp.measured, grp.prediction)[0])
     pDat['corr'] = corrs[pDat[by]].values

     # legend label: group value followed by its correlation
     by_str = '{}_pearson'.format(by)
     pDat[by_str] = pDat.apply(
         lambda x: '{} {:.2f}'.format(x[by], corrs[x[by]]), axis=1)

     if data_set:
         pDat = pDat.loc[pDat['dataset_name'] == data_set]

     points = pn.geom_point(alpha=alpha)
     fit_lines = pn.stat_smooth(mapping=pn.aes('measured',
                                               'prediction',
                                               color=by_str),
                                method='lm',
                                geom='line',
                                alpha=0.5,
                                se=False,
                                inherit_aes=False)
     pl = pn.ggplot(pn.aes('measured', 'prediction', color=by_str), pDat)
     pl = pl + points + fit_lines

     # the point shape encodes the sample when there are fewer than ten
     # distinct samples, the dataset otherwise
     few_samples = len(pDat['sample'].unique()) < 10
     shape_column = 'sample' if few_samples else 'dataset_name'
     pl = pl + pn.aes(shape=shape_column)

     if log is True:
         pl = pl + pn.scale_x_log10() + pn.scale_y_log10()

     if title is not None:
         heading = title
     elif tag is not None:
         heading = '{} pearson={}'.format(tag, gcorr)
     else:
         heading = 'pearson={}'.format(gcorr)
     return pl + pn.ggtitle(heading)
Beispiel #16
0
def plot_box_plots(var, draws, measurements, variable_id_map):
    """Return a plotnine box plot of the given variable, faceted by experiment.

    Arguments:
        var - name of the variable; also the column plotted on the y axis
        draws - mapping from variable name to the data to plot
        measurements - mapping from variable name to a dataframe with a
            ``measurement`` column; drawn as points when not empty
        variable_id_map - mapping from variable name to the column used for
            the x axis and the fill colour

    The y axis is log10-scaled for every variable except "flux", which gets
    a linear axis with fixed breaks and limits.
    """
    id_column = variable_id_map[var]
    plot = p9.ggplot(data=draws[var]) + p9.geom_boxplot(
        p9.aes(x=id_column, y=var, fill=id_column),
        outlier_shape="",
    )
    if not measurements[var].empty:
        plot += p9.geoms.geom_point(p9.aes(y="measurement", x=id_column),
                                    data=measurements[var])
    if var != "flux":
        plot += p9.scale_y_log10()
    # The facet and the theme are added in two steps on purpose:
    # ``plot += facet + theme`` evaluates ``facet + theme`` first, so the
    # theme would be attached to the facet object instead of the plot.
    plot += p9.facet_wrap("~experiments")
    plot += p9.themes.theme(
        panel_spacing_y=0.05,
        panel_spacing_x=0.35,
        axis_title=p9.element_text(size=10),
        axis_text=p9.themes.element_text(size=11),
    )
    if var == "flux":
        plot += p9.scale_y_continuous(breaks=np.arange(-0.001, 0.002, 0.00025),
                                      limits=[-0.001, 0.002])
    plot += p9.theme(axis_text_x=p9.themes.element_text(rotation=90, size=6))
    return plot
Beispiel #17
0
# In[6]:

# Hex colours for the two edge categories, given as 0-255 RGBA values.
# Plain Python arithmetic is used because the ``pd.np`` alias is no longer
# available in current pandas releases.
color_map = {
    "Existing":
    mcolors.to_hex([channel / 255 for channel in (178, 223, 138, 255)]),
    "Novel":
    mcolors.to_hex([channel / 255 for channel in (31, 120, 180, 255)])
}

# In[7]:

# Edge counts per precision bin as points and lines, one panel per relation.
g = (p9.ggplot(binned_df, p9.aes(x="precision", y="edges",
                                 color="in_hetionet")) + p9.geom_point() +
     p9.geom_line() + p9.scale_color_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"]
     }) + p9.facet_wrap("relation") + p9.scale_y_log10() + p9.theme_bw())
print(g)

# In[8]:

# Same data as dodged bars with flipped coordinates.
# theme_bw() is a complete theme and replaces whatever theme was set before
# it, so it has to come first for figure_size/aspect_ratio to take effect.
g = (p9.ggplot(binned_df, p9.aes(x="precision", y="edges", fill="in_hetionet"))
     + p9.geom_bar(stat='identity', position='dodge') +
     p9.scale_fill_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"]
     }) + p9.coord_flip() + p9.facet_wrap("relation") + p9.scale_y_log10() +
     p9.theme_bw() + p9.theme(figure_size=(12, 8), aspect_ratio=9))
print(g)

# In[9]:
    int(grouped_candidates_pred_df.hetionet.value_counts()[1]),
    "relation":
    "DaG"
})
# Row for the "Novel" DaG edges: entry [0] of the hetionet value counts among
# candidates with pred_max above 0.5.
novel_dag_row = {
    "edges": (grouped_candidates_pred_df.query(
        "pred_max > 0.5").hetionet.value_counts()[0]),
    "in_hetionet":
    "Novel",
    "relation":
    "DaG"
}
datarows.append(novel_dag_row)
edges_df = pd.DataFrame.from_records(datarows)
edges_df

# In[20]:


def _edge_label(x):
    # Edge count, followed by the recall as a percentage when it is a number.
    if not math.isnan(x['recall']):
        return f"{x['edges']} ({x['recall']*100:.0f}%)"
    return f"{x['edges']}"


edge_labels = edges_df.apply(_edge_label, axis=1)

# Dodged bars per relation with the label placed above each bar; the y axis
# text, major ticks and all rect elements are blanked out.
bar_labels = p9.geom_text(p9.aes(label=(edge_labels)),
                          position=p9.position_dodge(width=1),
                          size=9,
                          va="bottom")
bare_theme = p9.theme(axis_text_y=p9.element_blank(),
                      axis_ticks_major=p9.element_blank(),
                      rect=p9.element_blank())
g = (p9.ggplot(edges_df, p9.aes(x="relation", y="edges", fill="in_hetionet")) +
     p9.geom_col(position="dodge") + bar_labels + p9.scale_y_log10() +
     bare_theme)
print(g)
Beispiel #19
0
import math
# Text shown above each bar: the edge count, plus the recall as a percentage
# whenever the recall is a number.
edge_labels = edges_df.apply(
    lambda x:
    f"{x['edges']} ({x['recall']*100:.0f}%)"
    if not math.isnan(x['recall']) else
    f"{x['edges']}",
    axis=1,
)

bars = p9.geom_col(position="dodge")
bar_text = p9.geom_text(
    p9.aes(label=(edge_labels)),
    position=p9.position_dodge(width=1),
    size=9,
    va="bottom",
)
# Blank out the y axis text, the major ticks and all rect elements.
stripped_theme = p9.theme(
    axis_text_y=p9.element_blank(),
    axis_ticks_major=p9.element_blank(),
    rect=p9.element_blank(),
)

g = p9.ggplot(edges_df, p9.aes(x="relation", y="edges", fill="in_hetionet"))
g = g + bars + bar_text
g = g + p9.scale_y_log10()
g = g + stripped_theme
print(g)

Beispiel #20
0
            {
                'set_nb': size,
                'algo': 'DL_noiseless_data',
                'time': stop_time - start_time
            },
            ignore_index=True)

df_bench['set_nb'] = df_bench['set_nb'].astype(int)

# Running time against the number of sets, linear y axis.
p = (ggplot(df_bench) + aes('set_nb', 'time', color='algo', group='algo') +
     geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() +
     xlab("Number of sets") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig4")

# Same plot with a log10 y axis.
p = (ggplot(df_bench) + aes('set_nb', 'time', color='algo', group='algo') +
     geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() +
     scale_y_log10() + xlab("Number of sets") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig4_log10")

# Normalized time to minimum number of sets
min_nb_sets = min(SETS_NB)
minimum_time = df_bench[df_bench['set_nb'] == min_nb_sets][['algo', 'time']]
# Smallest time per algorithm at the minimum number of sets, computed once
# instead of re-filtering the frame for every row.
baseline_by_algo = pd.to_numeric(minimum_time['time']).groupby(
    minimum_time['algo']).min()
# Algorithms without a baseline row map to NaN, as the row-wise loop gave.
df_bench['time_relative'] = (df_bench['time'] /
                             df_bench['algo'].map(baseline_by_algo))

p = (ggplot(df_bench) +
     aes('set_nb', 'time_relative', color='algo', group='algo') +
     geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() +
# Debug output: shapes of the individual time-stamp arrays and of `ts`.
print([_.shape for _ in tss])
print(ts.shape)

# convert data into dataframe
df = arr2df(arrays, ts, titles)  # , n=50)

# Per (tag, hr) group take the 0.9, 0.99 and 1.0 quantiles of 'v' and spread
# them into one column per quantile.
df2 = df.loc[:, ['tag', 'hr', 'v']].groupby(
    ['tag', 'hr']).quantile(q=[.9, .99, 1]).unstack()
df2.columns = ['q090', 'q099', 'q100']

# Back to long format: one row per (tag, hr, quantile name).
df2 = df2.reset_index().melt(id_vars=['tag', 'hr'],
                             var_name='q',
                             value_name='v')

# Grouping key so that every tag/quantile combination gets its own line.
df2['g'] = df2['tag'] + df2['q'].astype(str)

hrmax = df['hr'].max()
# One line per tag/quantile; colour encodes the tag, alpha and size encode
# the quantile. Major x breaks every 24 and minor breaks every 6 units of
# 'hr' (presumably hours -- confirm).
p = (ggplot(df2) +
     geom_line(aes('hr', 'v', color='tag', alpha='q', size='q', group='g')) +
     scale_x_continuous(
         breaks=np.arange(0, hrmax, 24),
         minor_breaks=np.arange(0, hrmax, 6),
     ) + scale_color_brewer(type='qual', palette='Set1') +
     scale_alpha_manual(np.linspace(1, 0.2, num=3)) +
     scale_size_manual([2, 1, .5]) + labs(title=title, y='conc (ppb)'))
p.save(oname)

print(df2['v'].max())
# Second version with a log10 y axis running from 1.0 to the largest value.
pp = p + scale_y_log10(limits=[1.0, df2['v'].max()])
# NOTE(review): the slice drops the last four characters of `oname`, which
# assumes a three-letter extension such as ".png" -- confirm.
pp.save(oname[:-4] + '_log.png')
Beispiel #22
0
    # Save both result tables as tab-separated files
    res.to_csv(FIGURE_DIRECTORY + "crm_res.tsv", sep='\t')
    peakstats.to_csv(FIGURE_DIRECTORY + "peakstats.tsv", sep='\t')
    """
    # To reload :
    res = pd.read_csv(FIGURE_DIRECTORY+"crm_res.tsv", sep = '\t')
    peakstats = pd.read_csv(FIGURE_DIRECTORY+"peakstats.tsv", sep = '\t')
    """

    ## --------- For the figures

    # Peak counts of 2018 against 2020 on log-log axes, coloured by the
    # average atyPeak score. Only the first 10000 rows of `res` are plotted.
    p = (ggplot(data=res[0:10000],
                mapping=aes(x='nb_peaks_2020', y='nb_peaks_2018')) +
         geom_point(mapping=aes(color='average_atypeak_score')) +
         scale_x_log10() + scale_y_log10() +
         labs(x="Nb. peaks Remap 2020",
              y="Nb. peaks Remap 2018",
              color="Mean atyPeak score per CRM") +
         scale_color_gradient(low="red", high="blue"))
    p.save(FIGURE_DIRECTORY + "crm_nb_peaks_update.pdf", verbose=False)

    # Update ratio against the 2018 peak count, same colouring and scales.
    # NOTE(review): this figure uses rows 10000-12999, a different slice of
    # `res` than the figure above -- confirm this is intended.
    p = (ggplot(data=res[10000:13000],
                mapping=aes(x='nb_peaks_2018', y='update_ratio')) +
         geom_point(mapping=aes(color='average_atypeak_score')) +
         scale_x_log10() + scale_y_log10() +
         labs(x="Nb. peaks Remap 2018",
              y="Nb peaks ReMap 2020/2018",
              color="Mean atyPeak score per CRM") +
         scale_color_gradient(low="red", high="blue"))
    p.save(FIGURE_DIRECTORY + "crm_update_ratio.pdf", verbose=False)
# %%
# Runs with small uploads/downloads look better with log scale.
# The log10 y axis is only used when the largest "MiB" value is at most 8.
use_y_log10 = max(data["MiB"]) <= 8.0

# %%
# A common facet for all plots
facet = p9.facet_grid("Op ~ Crc32cEnabled + MD5Enabled",
                      labeller="label_both",
                      scales="free_y")

# %%
# Elapsed seconds against transfer size.
elapsed_mapping = p9.aes(x="MiB", y="ElapsedSeconds", color="ApiName")
plot = p9.ggplot(data=data, mapping=elapsed_mapping) + p9.geom_point() + facet
rendered = plot + p9.scale_y_log10() if use_y_log10 else plot
rendered.save(args.output_prefix + ".elapsed-vs-size.png")

# %%
# CPU nanoseconds per byte against transfer size.
cpu_mapping = p9.aes(x="MiB", y="CpuNanosPerByte", color="ApiName")
plot = p9.ggplot(data=data, mapping=cpu_mapping) + p9.geom_point() + facet
rendered = plot + p9.scale_y_log10() if use_y_log10 else plot
rendered.save(args.output_prefix + ".cpu-vs-size.png")

# %%
# Throughput ("MiBs") against transfer size; always on a linear y axis.
throughput_mapping = p9.aes(x="MiB", y="MiBs", color="ApiName")
throughput = (p9.ggplot(data=data, mapping=throughput_mapping) +
              p9.geom_point() + facet)
throughput.save(args.output_prefix + ".tp-vs-size.png")

# %%
(p9.ggplot(data=data, mapping=p9.aes(x="ApiName", y="MiBs", color="ApiName")) +
Beispiel #24
0
def rel_plot(sbs, variant, jitter=0.01):
    """Jitter-plot the "ratio" column against "base" for one variant.

    Rows of ``sbs`` whose ``variant`` equals the given one are plotted on
    log-log axes, coloured by ``dataset``. A reference curve y = 1/x is drawn
    for x between 0.02 and 1, together with dashed horizontal guides at 1.1
    and 0.9 and dashed vertical guides at 0.1 and 0.3.
    """
    xcol = "base"
    ycol = "ratio"

    selected = sbs[sbs.variant == variant]
    session_text = (selected[["session_index", "base_session_index"]].apply(
        tuple, axis=1).map(lambda tup: f"{tup[0]} vs. {tup[1]}"))
    plotdata = selected.assign(
        x=selected[xcol],
        y=selected[ycol],
        sbs_index=selected.index.values,
        session_text=session_text,
    )

    # reference curve y = 1 / x
    x = np.geomspace(0.02, 1, num=5)
    diag_df = pd.DataFrame({"x": x, "y": 1 / x})

    scatterplot = ggplot(plotdata) + geom_jitter(
        aes(x="x", y="y", fill="dataset", color="dataset"),
        width=jitter,
        height=jitter,
        alpha=0.6,
        size=1.0,
    )
    scatterplot = scatterplot + geom_line(aes(x="x", y="y"), data=diag_df)
    scatterplot = scatterplot + ylab(ycol)

    # dashed guide lines
    for level in (1.1, 0.9):
        scatterplot = scatterplot + geom_hline(
            aes(yintercept=level), linetype="dashed", alpha=0.7)
    for position in (0.1, 0.3):
        scatterplot = scatterplot + geom_vline(
            aes(xintercept=position, ),
            linetype="dashed",
            alpha=0.7,
        )

    scatterplot = scatterplot + xlab(xcol)

    axis_text = element_text(size=12, margin={"t": 0.2, "l": -0.3})
    axis_title = element_text(size=12,
                              margin={
                                  "r": -0.2,
                                  "b": 0.0,
                                  "l": 0,
                                  "t": 0.0
                              })
    scatterplot = scatterplot + theme(
        figure_size=(8, 5),
        legend_position="top",
        subplots_adjust={"hspace": 0.5},
        legend_title=element_blank(),
        legend_box_margin=-1,
        legend_margin=0.0,
        axis_text=axis_text,
        legend_text=element_text(size=11),
        axis_title=axis_title,
    )

    scatterplot = scatterplot + scale_x_log10(
        labels=make_labeler(brief_format), breaks=[0.01, 0.1, 0.3, 1.0])
    scatterplot = scatterplot + scale_y_log10(
        labels=make_labeler(brief_format),
        breaks=[0.5, 0.9, 1.1, 2.0, 3.0, 6, 12])

    return scatterplot
            {
                'lines': lines,
                'algo': 'DL',
                'time': stop_time - start_time
            },
            ignore_index=True)

df_bench['lines'] = df_bench['lines'].astype(int)

# Running time against the number of lines, linear y axis.
p = (ggplot(df_bench) + aes('lines', 'time', color='algo', group='algo') +
     geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() +
     xlab("Number of lines") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig5")

# Same plot with a log10 y axis.
p = (ggplot(df_bench) + aes('lines', 'time', color='algo', group='algo') +
     geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() +
     scale_y_log10() + xlab("Number of lines") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig5_log10")

# Normalized time to minimum line number
min_nb_lines = min(LINES_NB)
minimum_time = df_bench[df_bench['lines'] == min_nb_lines][['algo', 'time']]
# Smallest time per algorithm at the minimum line number, computed once
# instead of re-filtering the frame for every row.
baseline_by_algo = pd.to_numeric(minimum_time['time']).groupby(
    minimum_time['algo']).min()
# Algorithms without a baseline row map to NaN, as the row-wise loop gave.
df_bench['time_relative'] = (df_bench['time'] /
                             df_bench['algo'].map(baseline_by_algo))

p = (ggplot(df_bench) +
     aes('lines', 'time_relative', color='algo', group='algo') + geom_point() +
     geom_smooth(method='gpr', span=.3) + scale_x_continuous() +
# Runs with small uploads/downloads look better with log scale.
# The log10 y axis is only used when the largest "MiB" value is at most 8.
use_y_log10 = max(data["MiB"]) <= 8.0

# %%
# A common facet for all plots
facet = p9.facet_grid(
    "Op ~ Crc32cEnabled + MD5Enabled", labeller="label_both", scales="free_y"
)

# %%
# Elapsed seconds against transfer size.
elapsed_mapping = p9.aes(x="MiB", y="ElapsedSeconds", color="ApiName")
plot = p9.ggplot(data=data, mapping=elapsed_mapping) + p9.geom_point() + facet
rendered = plot + p9.scale_y_log10() if use_y_log10 else plot
rendered.save(args.output_prefix + ".elapsed-vs-size.png")

# %%
# CPU nanoseconds per byte against transfer size.
cpu_mapping = p9.aes(x="MiB", y="CpuNanosPerByte", color="ApiName")
plot = p9.ggplot(data=data, mapping=cpu_mapping) + p9.geom_point() + facet
rendered = plot + p9.scale_y_log10() if use_y_log10 else plot
rendered.save(args.output_prefix + ".cpu-vs-size.png")

# %%
(
        # Dict learning
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X, n_atoms = k, alpha = ALPHA, n_jobs = 1)
        stop_time = time.time()

        df_bench = df_bench.append({'scaling_factor':k, 'algo':'DL', 'time': stop_time - start_time}, ignore_index = True)   


df_bench['scaling_factor'] = df_bench['scaling_factor'].astype(int)

# Running time against the scaling factor, linear y axis.
p = (ggplot(df_bench) + aes('scaling_factor', 'time', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
 + xlab("Scaling factor (k)") + ylab("Time (seconds)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig3")

# Same plot with a log10 y axis.
p = (ggplot(df_bench) + aes('scaling_factor', 'time', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
 + xlab("Scaling factor (k)") + ylab("Time (seconds)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig3_log10")


# Normalized time to scaling factor of 1
minimum_time = df_bench[df_bench['scaling_factor']==1][['algo','time']]
# Smallest time per algorithm at a scaling factor of 1, computed once
# instead of re-filtering the frame for every row.
baseline_by_algo = pd.to_numeric(minimum_time['time']).groupby(
    minimum_time['algo']).min()
# Algorithms without a baseline row map to NaN, as the row-wise loop gave.
df_bench['time_relative'] = (df_bench['time'] /
                             df_bench['algo'].map(baseline_by_algo))

# Relative running time against the scaling factor.
p = (ggplot(df_bench) + aes('scaling_factor', 'time_relative', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
 + xlab("Scaling factor (k)") + ylab("Time (relative)"))