Example #1
def plot_mem(df):
    x = df.copy()
    # initialise some extra columns useful for plotting
    x['new_cols'] = [str(i) for i in x['col_name']]
    x['new_cols'] = pd.Categorical(x['new_cols'],
                                   categories=x['new_cols'],
                                   ordered=True)
    x['cnt_print_loc_pos'] = (x.pcnt.values) + (np.max(x.pcnt.values)) / 70
    x['cnt_print_loc_neg'] = (x.pcnt.values) - (np.max(x.pcnt.values)) / 70
    # build basic plot
    ggplt = p9.ggplot(x, p9.aes(x = 'new_cols', y = 'pcnt', fill = 'new_cols')) \
      + p9.geom_bar(stat = 'identity') \
      + p9.guides(fill = False) \
      + p9.ylab('% of total size') \
      + p9.xlab('') \
      + p9.theme(axis_text_x=p9.element_text(rotation = 45, hjust=1))

    # add text labels to the highest bars
    y1 = x.copy()[x.pcnt > 0.3 * np.max(x.pcnt)]
    ggplt = ggplt + \
      p9.geom_text(p9.aes(x = 'new_cols', y = 'cnt_print_loc_neg', label = 'size'), \
        inherit_aes = False, data = y1, color = 'white', \
        angle = 90, vjust = 'top')
    # add text labels to the lower bars
    y2 = x.copy()[x.pcnt <= 0.3 * np.max(x.pcnt)]
    ggplt = ggplt + \
      p9.geom_text(p9.aes(x = 'new_cols', y = 'cnt_print_loc_pos', label = 'size'), \
        inherit_aes = False, data = y2, color = 'gray', \
        angle = 90, vjust = 'bottom')
    return ggplt
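A minimal invocation sketch for plot_mem. The expected columns (col_name, pcnt, size) are inferred from the function body; the toy frame below is hypothetical:

import numpy as np
import pandas as pd
import plotnine as p9

# hypothetical per-column memory summary: name, % of total size, printed label
mem_df = pd.DataFrame({
    'col_name': ['ints', 'floats', 'strings'],
    'pcnt': [70.0, 25.0, 5.0],
    'size': ['7.0 MB', '2.5 MB', '0.5 MB'],
})
print(plot_mem(mem_df))  # printing a plotnine object renders it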
Example #2
def cell_cycle_phase_barplot(adata, palette='Set2'):
    """Plots the proportion of cells in each phase of the cell cycle

    See also: cell_cycle_phase_pieplot for the matplotlib pie chart


    Parameters
    ----------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.annotate_cell_cycle`.

    Returns
    -------
    A plotnine barplot with the total counts of cells in each phase of the
    cell cycle.

    """
    plt_data = adata.obs.copy()
    plt_data['cell_cycle_phase'] = pd.Categorical(
        plt_data['cell_cycle_phase'],
        categories=['G1 post-mitotic', 'G1 pre-replication', 'S/G2/M'])

    cycle_plot = (
        ggplot(plt_data, aes('cell_cycle_phase', fill='cell_cycle_phase')) +
        geom_bar() + coord_flip() + guides(fill=False) +
        labs(y='', x='Cell cycle phase') + theme_light() +
        theme(panel_grid_major_y=element_blank(),
              panel_grid_minor_y=element_blank(),
              panel_grid_major_x=element_line(size=1.5),
              panel_grid_minor_x=element_line(size=1.5)) +
        scale_fill_brewer(type='qual', palette=palette))

    return cycle_plot
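A hypothetical smoke test for cell_cycle_phase_barplot, building a tiny AnnData by hand instead of running `tl.annotate_cell_cycle` (assumes the anndata package and the star import of plotnine names that the function body implies):

import numpy as np
import pandas as pd
import anndata as ad
from plotnine import *

obs = pd.DataFrame({
    'cell_cycle_phase': ['G1 post-mitotic', 'S/G2/M',
                         'G1 pre-replication', 'S/G2/M']
})
adata = ad.AnnData(X=np.zeros((4, 1)), obs=obs)
print(cell_cycle_phase_barplot(adata))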
Example #3
def derplot(adata=None,
            filename='derplot',
            embedding='tsne',
            feature='sample_type_tech',
            size=(12, 12),
            save=False,
            draw=False,
            psize=1):
    start = datetime.datetime.now()
    p.options.figure_size = size
    savename = filename + '.' + embedding + '.' + feature + '.derplot.png'
    print(
        start.strftime("%H:%M:%S"),
        'Starting ... \t',
        savename,
    )
    p.theme_set(p.theme_classic())
    pt = \
    p.ggplot(p.aes(embedding +'0', embedding + '1', color=feature), adata.obs) \
        + p.geom_point(size=psize, alpha = 1, stroke = 0 ) \
        + p.guides(color = p.guide_legend(override_aes={'size': 15}))

    if draw: pt.draw()
    if save: pt.save(savename, format='png', dpi=200)
    end = datetime.datetime.now()
    delta = end - start
    print(start.strftime("%H:%M:%S"), str(int(delta.total_seconds())),
          's to make: \t', savename)
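derplot only touches adata.obs, so any object with an obs data frame is enough to exercise it. A sketch, assuming the module-level imports the body implies (datetime, and plotnine as p); the fake data below is hypothetical:

import datetime
from types import SimpleNamespace

import numpy as np
import pandas as pd
import plotnine as p

rng = np.random.default_rng(0)
obs = pd.DataFrame({
    'tsne0': rng.normal(size=200),
    'tsne1': rng.normal(size=200),
    'sample_type_tech': rng.choice(['10x', 'smartseq2'], size=200),
})
derplot(adata=SimpleNamespace(obs=obs), save=True)  # writes a .png next to the script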
Example #4
def plot_two_way_sdc(sdc_df: pd.DataFrame, alpha: float = .05, **kwargs):
    """
    Plots the results of a SDC analysis for a fixed window size in a 2D figure.

    In a similar fashion to a recurrence plot, the x and y axes represent the start indices of the x and y
    sequences. Only results with a p-value < alpha are shown; the opacity is scaled by the magnitude of the
    score and the color indicates the sign of the established relationship.

    Parameters
    ----------
    sdc_df
        Data frame as outputted by `compute_sdc` which will be used to plot the results.
    alpha
        Significance threshold. Only results with a p-value < alpha will be plotted.
    kwargs
        Keyword arguments to pass to `plotnine.theme` to customize the plot.
    Returns
    -------
    p9.ggplot.ggplot
        Plot
    """
    fragment_size = int(sdc_df.iloc[0]['stop_1'] - sdc_df.iloc[0]['start_1'])
    significant = (sdc_df
                   .loc[lambda dd: dd.p_value < alpha]
                   .assign(r_str=lambda dd: dd['r'].apply(
                       lambda r: '$r > 0$' if r > 0 else '$r < 0$')))
    f = (p9.ggplot(significant)
         + p9.aes('start_1', 'start_2', fill='r_str', alpha='abs(r)')
         + p9.geom_tile()
         + p9.scale_fill_manual(['#da2421', 'black'])
         + p9.scale_y_reverse()
         + p9.theme(**kwargs)
         + p9.guides(alpha=False)
         + p9.labs(x='$X_i$',
                   y='$Y_j$',
                   fill='$r$',
                   title=f'Two-Way SDC plot for $S = {fragment_size}$' +
                         r' and $\alpha =$' + f'{alpha}'))

    return f
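The layout of sdc_df (start/stop indices per window pair plus p_value and r columns) is inferred from the function body; a synthetic grid like the following exercises the plot, with extra keyword arguments forwarded to p9.theme:

import numpy as np
import pandas as pd
import plotnine as p9

rng = np.random.default_rng(1)
starts = np.arange(0, 50, 5)
sdc_df = pd.DataFrame([
    {'start_1': i, 'stop_1': i + 5, 'start_2': j, 'stop_2': j + 5,
     'p_value': rng.uniform(0, 0.2), 'r': rng.uniform(-1, 1)}
    for i in starts for j in starts
])
print(plot_two_way_sdc(sdc_df, alpha=0.05, figure_size=(6, 5)))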
Example #5
def plot_save_rank(df_ranks, df_teams, year, week, show=False):
    """Plot the ranking iterations for each team

  :param df_ranks: data frame with team_id, and rankings for each iteration
  :param df_teams: data frame with team_id and owner info
  :param year: year for data
  :param week: current week
  :param show: flag to display the plot
  :return: final summarised rankings data frame with columns for team_id and ranks
  """
    # Plot each iteration
    df_ranks_lsq = pd.merge(df_teams[['team_id', 'firstName']],
                            df_ranks,
                            on='team_id')
    # Space out labels on x-axis according to final rankings
    df_ranks_lsq['label_x_pos'] = df_ranks_lsq.get(
        99).rank() * 100 / df_ranks_lsq.get(99).size
    # Convert to long format for plotting ease
    df_ranks_lsq_long = (df_ranks_lsq.rename({
        'ranks': '0'
    }, axis='columns').melt(id_vars=['team_id', 'firstName', 'label_x_pos']))
    # Convert iteration variable to int
    df_ranks_lsq_long.variable = df_ranks_lsq_long.variable.astype(int)
    # Make the plot
    p = (ggplot(aes(
        x='variable', y='value', color='factor(team_id)', group='team_id'),
                data=df_ranks_lsq_long) + geom_line() +
         geom_label(aes(label='firstName',
                        x='label_x_pos',
                        y='value',
                        color='factor(team_id)'),
                    data=df_ranks_lsq_long[df_ranks_lsq_long.variable == 99],
                    size=10) + labs(x='Iteration', y='LSQ rank') + theme_bw() +
         guides(color=False))
    # Save plot
    if show:
        p.draw()
    # make dir if it doesn't exist already
    out_dir = Path(f'output/{year}/week{week}')
    out_dir.mkdir(parents=True, exist_ok=True)
    out_name = out_dir / 'lsq_iter_rankings.png'
    # plotnine is throwing too many warnings
    warnings.filterwarnings('ignore')
    p.save(out_name, width=9, height=6, dpi=300)
    warnings.filterwarnings('default')
    logger.info(f'Saved LSQ rankings plot to local file: {out_name.resolve()}')
    # Average last 70 elements to get final rank
    df_final_ranks = (df_ranks_lsq_long.query('variable>70').groupby([
        'team_id'
    ])[['value'
        ]].agg(lambda x: np.tanh(np.mean(x) / 75.)).reset_index().rename(
            {'value': 'lsq'}, axis=1))
    # Normalize by max score
    df_final_ranks['lsq'] = df_final_ranks.get('lsq') / df_final_ranks.get(
        'lsq').max()
    return df_final_ranks
Example #6
    def comparison_plot(  # type: ignore
            self,
            df: pd.DataFrame,
            xmin=None,
            xmax=None,
            bins: int = 50,
            **kwargs):

        return (ggplot(df, aes(df.columns[1], fill=df.columns[0])) +
                scale_fill_brewer(type="qual", palette="Pastel1") +
                geom_histogram(position="identity", alpha=0.9, bins=bins) +
                self._scale_x(xmin, xmax) + facet_wrap(df.columns[0], ncol=1) +
                guides(fill=False) + ergo_theme +
                theme(axis_text_x=element_text(rotation=45, hjust=1)))
Example #7
def plot_qq(df, color_var, facet_var=None, title=''):
    """
    Inspired by https://www.cureffi.org/2012/08/15/qq-plots-with-matplotlib/
    """
    # retrieve pmin, the most significant (i.e. min) p-value (for defining
    # the axes)
    axis_max = max(df['pval_neglog10'])

    if facet_var is None:
        pvals = df.groupby(
            by=color_var).apply(calculate_expected_pval).reset_index(
                level=color_var, drop=True)
    else:
        pvals = df.groupby(by=[color_var, facet_var]).apply(
            calculate_expected_pval).reset_index(level=[color_var, facet_var],
                                                 drop=True)

    # now plot these two arrays against each other
    n_colors = pvals[color_var].nunique()
    qqplot = plt9.ggplot(
        pvals,
        plt9.aes(x='expected_pval_neglog10',
                 y='pval_neglog10',
                 color=color_var))
    qqplot = qqplot + plt9.geom_point(size=0.1, alpha=0.25)
    qqplot = qqplot + plt9.geom_abline(
        slope=1, intercept=0, color='black', linetype='dashed')
    qqplot = qqplot + plt9.theme_bw()
    if n_colors < 9:
        qqplot = qqplot + plt9.scale_colour_brewer(palette='Dark2',
                                                   type='qual')
    qqplot = qqplot + plt9.labs(x='Expected (-log10 p-value)',
                                y='Observed (-log10 p-value)',
                                title=title,
                                color='')
    qqplot = qqplot + plt9.lims(x=(0, axis_max), y=(0, axis_max))
    if facet_var is not None:
        qqplot = qqplot + plt9.facet_wrap('~ {}'.format(facet_var), ncol=5)
    qqplot = qqplot + plt9.theme(strip_text=plt9.element_text(size=5),
                                 axis_text_x=plt9.element_text(angle=-45,
                                                               hjust=0))
    # set guide legend alpha to 1
    qqplot = qqplot + plt9.guides(color=plt9.guide_legend(override_aes={
        'size': 2.0,
        'alpha': 1.0
    }))
    return qqplot
Example #8
def plot_num(df):
  x = df.copy()
  # add a group column to each per-column histogram
  z = x['hist'].to_list()
  for i in range(len(z)):
    z[i]['groups'] = x['col_name'][i]
  z = pd.concat(z)
  # generate the plot
  ggplt = p9.ggplot(z, p9.aes(x = 'value', y = 'prop', group = 'groups'))\
    + p9.geom_col()\
    + p9.guides(fill = False) \
    + p9.ylab('Proportion') \
    + p9.xlab('') \
    + p9.theme(axis_text_x=p9.element_text(rotation = 45, hjust=1))\
    + p9.facet_wrap(facets = ['groups'], ncol = 3, scales = 'free')
  # return the plot object
  return ggplt
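plot_num expects each row of df['hist'] to hold a per-column histogram frame with value and prop columns (inferred from the body); a hypothetical input:

import pandas as pd
import plotnine as p9

hist_a = pd.DataFrame({'value': [0, 1, 2], 'prop': [0.2, 0.5, 0.3]})
hist_b = pd.DataFrame({'value': [0, 1], 'prop': [0.6, 0.4]})
num_df = pd.DataFrame({'col_name': ['a', 'b'], 'hist': [hist_a, hist_b]})
print(plot_num(num_df))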
Example #9
def test_inplace_add():
    p = _p = ggplot(df)

    p += aes('x', 'y')
    assert p is _p

    p += geom_point()
    assert p is _p

    p += stat_identity()
    assert p is _p

    p += scale_x_continuous()
    assert p is _p

    with pytest.warns(PlotnineWarning):
        # Warns because it replaces the existing scale added above
        p += xlim(0, 10)
        assert p is _p

    p += lims(y=(0, 10))
    assert p is _p

    p += labs(x='x')
    assert p is _p

    p += coord_trans()
    assert p is _p

    p += facet_null()
    assert p is _p

    p += annotate('point', 5, 5, color='red', size=5)
    assert p is _p

    p += guides()
    assert p is _p

    p += theme_gray()
    assert p is _p

    th = _th = theme_gray()
    th += theme(aspect_ratio=1)
    assert th is _th
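The test relies on plotnine implementing in-place addition: p += component mutates and returns the same plot object, whereas p + component returns a copy. A minimal standalone check of that contract:

import pandas as pd
from plotnine import ggplot, aes, geom_point

df = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 1, 2]})
p = _p = ggplot(df, aes('x', 'y'))
p += geom_point()        # in-place: no copy is made
assert p is _p

p2 = p + geom_point()    # ordinary addition returns a new object
assert p2 is not p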
Example #10
def lollipop(data):
    data = data.sort_values(by=['probability']).reset_index(drop=True)
    custom_order = pd.Categorical(data['label'], categories=data.label)
    data = data.assign(label_custom=custom_order)


    p = (ggplot(data, aes('label_custom', 'probability'))
         + geom_point(color="#88aa88", size=4)
         + geom_segment(aes(x='label_custom', y=0, xend='label_custom',
                            yend='probability'), color="#88aa88")
         + coord_flip(expand=True)
         + theme_minimal()
         + labs(x="", y="probability", title="Most Likely Object")
         + theme(plot_title=element_text(size=20, face="bold", ha="right")))

    fig = p.draw()
    figfile = BytesIO()
    plt.savefig(figfile, format='png', bbox_inches='tight')
    figfile.seek(0)  # rewind to beginning of file
    figdata_png = base64.b64encode(figfile.getvalue()).decode()
    return p, figdata_png
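lollipop returns both the plot and a base64-encoded PNG, so the surrounding module presumably imports matplotlib.pyplot as plt, BytesIO, and base64 as used in the body; a hypothetical call:

import base64
from io import BytesIO

import matplotlib.pyplot as plt
import pandas as pd
from plotnine import *

scores = pd.DataFrame({
    'label': ['cat', 'dog', 'bicycle'],
    'probability': [0.61, 0.27, 0.12],
})
plot, png_b64 = lollipop(scores)
print(png_b64[:40], '...')  # base64 payload, ready to embed in HTML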
Example #11
def wraplot(adata=None,
            filename='wraplot',
            embedding='tsne',
            feature='sample_type_tech',
            size=(12, 12),
            color=None,
            save=False,
            draw=False,
            psize=1):
    start = datetime.datetime.now()
    p.options.figure_size = size
    savename = filename + '.' + embedding + '.' + feature + '.' + str(
        color) + '.png'
    if color is None:
        color = feature
        savename = filename + '.' + embedding + '.' + feature + '.wraplot.png'
    print(
        start.strftime("%H:%M:%S"),
        'Starting ... \t',
        savename,
    )

    pt = (p.ggplot(p.aes(x=embedding + '0', y=embedding + '1', color=color),
                   adata.obs) +
          p.geom_point(color='lightgrey',
                       shape='.',
                       data=adata.obs.drop(feature, axis=1)) +
          p.geom_point(shape='.', size=psize, alpha=1, stroke=0) +
          p.theme_minimal() + p.facet_wrap('~' + feature) +
          p.guides(color=p.guide_legend(override_aes={'size': 10})))

    if draw: pt.draw()
    if save: pt.save(savename, format='png', dpi=200)

    end = datetime.datetime.now()
    delta = end - start
    print(start.strftime("%H:%M:%S"), str(int(delta.total_seconds())),
          's to make: \t', savename)
Example #12
    def show_prediction(
        self,
        samples,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        show_community: bool = False,
        num_samples: int = 1000,
        bins: int = 50,
    ):
        """Plot prediction on the true question scale from samples or a submission object. Optionally compare prediction against a sample from the distribution of community predictions

        :param samples: samples from a distribution answering the prediction question (true scale) or a prediction object
        :param percent_kept: percentage of the sample distribution to keep
        :param side_cut_from: which side to cut tails from, either 'both', 'lower', or 'upper'
        :param show_community: boolean indicating whether comparison to community predictions should be made
        :param num_samples: number of samples from the community
        :param bins: The number of bins in the histogram, the more bins, the more 'fine grained' the graph. Fewer bins results in more aggregation
        :return: ggplot graphics object
        """

        if isinstance(samples, SubmissionMixtureParams):
            prediction = samples
            prediction_normed_samples = pd.Series([
                logistic.sample_mixture(prediction)
                for _ in range(0, num_samples)
            ])
        else:
            if isinstance(samples, list):
                samples = pd.Series(samples)
            if not isinstance(samples, (pd.Series, np.ndarray)):
                raise ValueError(
                    "Samples should be a list, numpy array or pandas series")
            num_samples = samples.shape[0]
            prediction_normed_samples = self.normalize_samples(samples)

        title_name = (
            f"Q: {self.name}" if self.name else "\n".join(
                textwrap.wrap(self.data["title"], 60))  # type: ignore
        )

        if show_community:
            df = pd.DataFrame(
                data={
                    "community": [  # type: ignore
                        self.sample_normalized_community()
                        for _ in range(0, num_samples)
                    ],
                    "prediction":
                    prediction_normed_samples,  # type: ignore
                })
            # get domain for graph given the percentage of distribution kept
            (_xmin,
             _xmax) = self.get_central_quantiles(df,
                                                 percent_kept=percent_kept,
                                                 side_cut_from=side_cut_from)
            _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
            df["prediction"] = self.denormalize_samples(df["prediction"])
            df["community"] = self.denormalize_samples(df["community"])

            df = pd.melt(df, var_name="sources",
                         value_name="samples")  # type: ignore
            return (ggplot(df, aes("samples", fill="sources")) +
                    scale_fill_brewer(type="qual", palette="Pastel1") +
                    geom_histogram(position="identity", alpha=0.9) +
                    scale_x_datetime(limits=(_xmin, _xmax)) +
                    facet_wrap("sources", ncol=1) + labs(
                        x="Prediction",
                        y="Counts",
                        title=title_name,
                    ) + guides(fill=False) + ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
        else:
            (_xmin, _xmax) = self.get_central_quantiles(
                prediction_normed_samples,
                percent_kept=percent_kept,
                side_cut_from=side_cut_from,
            )
            _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
            df = pd.DataFrame(data={
                "prediction":
                self.denormalize_samples(prediction_normed_samples)
            })
            return (ggplot(df, aes("prediction")) +
                    geom_histogram(fill="#b3cde3", bins=bins)
                    # + coord_cartesian(xlim = (_xmin,_xmax))
                    + scale_x_datetime(limits=(_xmin, _xmax)) +
                    labs(x="Prediction", y="Counts", title=title_name) +
                    ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
Example #13
    def plot_abs_dataframe(self, df: pd.DataFrame) -> p9.ggplot:
        facets = []
        n_per_facet = {}
        print(df)
        for col in df.columns:
            try:
                n_values = df[col].nunique()
                if n_values == 1 and col not in [
                        "TIME_PERIOD",
                        "value",
                        "Measure",
                        "OBS_COMMENT",
                ]:
                    self.fixed_datapoints.add(f"{col}={df.at[0, col]}")
                elif n_values > 1 and col not in [
                        "value",
                        "TIME_PERIOD",
                        "OBS_COMMENT",
                ]:
                    facets.append(col)
                    n_per_facet[col] = n_values
            except Exception:
                print(f"Ignoring unusable column: {col}")
                continue

        extra_args = {}
        need_shape = False
        if len(facets) > 2:
            # only two variables can be used as plotting facets; the third value will be used as a group on each plot
            # more facets than that are not supported at this stage
            sorted_facets = sorted(n_per_facet.keys(),
                                   key=lambda k: n_per_facet[k])
            # print(n_per_facet)
            # print(sorted_facets)
            facets = sorted_facets[-2:]
            extra_args.update({
                "group": sorted_facets[0],
                "color": facets[0],
                "shape": sorted_facets[0],
            })
            need_shape = True
            print(f"Using {facets} as facets, {extra_args} as series")
        else:
            if len(facets) > 0:
                extra_args.update({"color": facets[0]})

        # compute figure size to give enough room for each plot
        mult = 1
        for facet in facets:
            mult *= n_per_facet[facet]
        mult /= len(facets)
        nrow = int(mult + 1)

        # facet column names must not have spaces in them as this is not permitted by plotnine facet formulas
        if len(facets) > 0:
            new_facets = []
            for f in facets:
                if " " in f:
                    new_name = f.replace(" ", "_")
                    df = df.rename(columns={f: new_name})
                    new_facets.append(new_name)
                else:
                    new_facets.append(f)
            facets = new_facets
            if "color" in extra_args:
                extra_args.update({"color": facets[0]})
            print(f"Renamed facet columns due to whitespace: {facets}")

        plot = p9.ggplot(df, p9.aes(x="TIME_PERIOD", y="value", **
                                    extra_args)) + p9.geom_point(size=3)

        if len(facets) > 0 and len(facets) <= 2:
            facet_str = "~" + " + ".join(facets[:2])
            print(f"Using facet formula: {facet_str}")
            plot += p9.facet_wrap(facet_str, ncol=len(facets), scales="free_y")

        plot_theme = {
            "figure_size": (12, int(nrow * 1.5)),
        }
        if len(facets) == 2:
            # two columns of plots? if so, make sure there is space for axis labels
            plot_theme.update({"subplots_adjust": {"wspace": 0.2}})
        if need_shape:
            plot += p9.scale_shape(guide="legend")
            plot += p9.guides(
                colour=False
            )  # colour legend is not useful since it is included in the facet title
            plot_theme.update({"legend_position": "right"})
        return user_theme(plot, **plot_theme)
Example #14
color_map = {
    "before": mcolors.to_hex(np.array([178, 223, 138, 255]) / 255),
    "after": mcolors.to_hex(np.array([31, 120, 180, 255]) / 255)
}


# In[14]:


g = (
    p9.ggplot(calibration_df, p9.aes(x="predicted", y="actual", color="model_calibration"))
    + p9.geom_point()
    + p9.geom_path()
    + p9.geom_abline(slope=1, intercept=0, linetype='dashed', color='black')
    + p9.scale_color_manual(values={
        "before":color_map["before"],
        "after":color_map["after"]
    })
    + p9.facet_wrap("relation")
    + p9.labs(
        x="Predicted",
        y="Actual"
    )
    + p9.guides(color=p9.guide_legend(title="Model Calibration"))
    + p9.theme_bw()
)
print(g)
g.save(filename="../model_calibration.png", dpi=300)

Example #15
def density_plot(df,
                 x,
                 group=None,
                 facet_x=None,
                 facet_y=None,
                 position='overlay',
                 sort_groups=True,
                 base_size=10,
                 figure_size=(6, 3),
                 **stat_kwargs):
    '''
    Plot a 1-d density plot

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    position : str
      if groups are present, choose between `stack` or `overlay`
    sort_groups : bool
      reverse the order of the fill legend
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size
    stat_kwargs : kwargs
      kwargs for the density stat

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    if position not in ['overlay', 'stack']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=False)
    gdata = gdata[[
        c for c in ['x', 'group', 'facet_x', 'facet_y'] if c in gdata.columns
    ]]

    # start plotting
    g = EZPlot(gdata)

    # get the color palette for the groups
    colors = ez_colors(g.n_groups('group'))

    # set groups
    if group is None:
        g += p9.geom_density(p9.aes(x="x"),
                             stat=p9.stats.stat_density(**stat_kwargs),
                             colour=ez_colors(1)[0],
                             fill=ez_colors(1)[0],
                             **POSITION_KWARGS[position])
    else:
        g += p9.geom_density(p9.aes(x="x",
                                    group="factor(group)",
                                    colour="factor(group)",
                                    fill="factor(group)"),
                             stat=p9.stats.stat_density(**stat_kwargs),
                             **POSITION_KWARGS[position])
        g += p9.scale_fill_manual(values=colors, reverse=False)
        g += p9.scale_color_manual(values=colors, reverse=False)

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab('Density')

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    if sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True))

    return g
Example #16
fig = pn.ggplot(normalized_all_data_UMAPencoded_df, pn.aes(x="1", y="2"))
fig += pn.geom_point(pn.aes(color="sample group"), alpha=0.4)
fig += pn.labs(x="UMAP 1",
               y="UMAP 2",
               title="Gene expression data in gene space")
fig += pn.theme_bw()
fig += pn.theme(
    legend_title_align="center",
    plot_background=pn.element_rect(fill="white"),
    legend_key=pn.element_rect(fill="white", colour="white"),
    legend_title=pn.element_text(family="sans-serif", size=15),
    legend_text=pn.element_text(family="sans-serif", size=12),
    plot_title=pn.element_text(family="sans-serif", size=15),
    axis_text=pn.element_text(family="sans-serif", size=12),
    axis_title=pn.element_text(family="sans-serif", size=15),
)
fig += pn.scale_color_manual(["#bdbdbd", "red", "blue"])
fig += pn.guides(colour=pn.guide_legend(override_aes={"alpha": 1}))

fig += pn.scales.xlim(9, 10)
print(fig)
# -

# Based on a UMAP of the normalized gene expression data, it looks like there isn't a clear separation between WT and mutant samples, though there are only 2 samples per group so this type of clustering observation is limited.
#
# **Takeaway:**
#
# In trying to understand why there are these flat-tops to some of the volcano plots and why some volcano plots are completely flat, we found:
# 1. This behavior is _not_ a result of how we are plotting in python (there was some speculation about there being an issue with the numpy library used)
# 2. The latent space shifting we're doing seems to roughly preserve differences between groups (as seen in [this notebook](https://github.com/greenelab/simulate-expression-compendia/blob/master/Pseudo_experiments/create_heatmap.ipynb), where the structure of the samples is preserved but a different set of related genes is DE; more information can be found in Figure 3D of [this paper](https://academic.oup.com/gigascience/article/9/11/giaa117/5952607)), but this signal can be muddled/noisy depending on where the experiment was shifted to (i.e. the representation found at that location can compress the difference between groups), as seen in the heatmaps.
# The heatmap of the two simulation experiments shows that one experiment has a noisier distinction between groups (WT vs mutant) whereas the other has a more distinct difference, with cleaner within-group structure.
# This definitely points to the need to understand how this simulation process works and how biology is represented in the latent space, which will be a project for the future. For now we at least have an explanation for why we are observing these shapes in the volcano plots.
Example #17
cv_results_summary = (cv_results_df
    .groupby(['classify__alpha', 'feature_set'])['mean_test_score']
    .max()
    .reset_index())


# In[17]:

(gg.ggplot(cv_results_summary, gg.aes(x='classify__alpha',
                                      y='mean_test_score',
                                      color='feature_set'))
 + gg.geom_jitter(size=4, alpha=0.8, height=0, width=0.05)
 + gg.scale_x_log10()
 + gg.labs(x='Regularization strength multiplier (log alpha)',
           y='CV AUROC')
 + gg.guides(color=gg.guide_legend(title="Feature Set"))
 + gg.aes(ymin=min([0.5, cv_results_summary['mean_test_score'].min()]), ymax=1)
 + theme_cognoma()
)


# ## Use optimal hyperparameters to output ROC curve

# In[18]:

y_pred_dict = {
    model: {
        'train': pipeline.decision_function(X_train),
        'test':  pipeline.decision_function(X_test)
    } for model, pipeline in cv_pipelines.items()
}
Example #18
def area_plot(df,
              x,
              y,
              group=None,
              facet_x=None,
              facet_y=None,
              aggfun='sum',
              fill=False,
              sort_groups=True,
              base_size=10,
              figure_size=(6, 3)):
    '''
    Aggregates data in df and plots as a stacked area chart.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    aggfun : str or fun
      function to be used for aggregating (eg sum, mean, median ...)
    fill : bool
      plot shares for each group instead of absolute values
    sort_groups : bool
      sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, aggfun, fill_groups=True)
    gdata['y'] = gdata['y'].fillna(0)
    gdata = gdata[[
        c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]

    if fill:
        groups_to_normalize = [
            c for c in ['x', 'facet_x', 'facet_y'] if c in gdata.columns
        ]
        total_values = gdata \
            .groupby(groups_to_normalize)['y'] \
            .sum() \
            .reset_index() \
            .rename(columns = {'y':'tot_y'})
        gdata = pd.merge(gdata, total_values, on=groups_to_normalize)
        gdata['y'] = gdata['y'] / (gdata['tot_y'] + EPSILON)
        gdata.drop('tot_y', axis=1, inplace=True)
        ylabeller = percent_labels
    else:
        ylabeller = ez_labels

    # get plot object
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if sort_groups:
        sort_data_groups(g)

    # get colors
    colors = np.flip(ez_colors(g.n_groups('group')))

    # set groups
    if group is None:
        g += p9.geom_area(p9.aes(x="x", y="y"),
                          colour=None,
                          fill=ez_colors(1)[0],
                          na_rm=True)
    else:
        g += p9.geom_area(p9.aes(x="x",
                                 y="y",
                                 group="factor(group)",
                                 fill="factor(group)"),
                          colour=None,
                          na_rm=True)
        g += p9.scale_fill_manual(values=colors)

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ylabeller,
                               expand=[0, 0, 0.1 * (not fill) + 0.03, 0])

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    if sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True),
                       color=p9.guide_legend(reverse=True))

    return g
Example #19
def variable_histogram(df,
                       x,
                       group=None,
                       facet_y=None,
                       w='1',
                       bins=21,
                       bin_width=None,
                       position='stack',
                       normalize=False,
                       base_size=10,
                       figure_size=(6, 3)):
    '''
    Plot a 1-d histogram

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str or list
      quoted expressions to be plotted on the x axis
    group : str
      quoted expression to be used as group (ie color)
    facet_y : str
      quoted expression to be used as facet
    w : str
      quoted expression representing histogram weights (default is 1)
    bins : int or tuple
      number of bins to be used
    bin_width : float or tuple
      bin width to be used
    position : str
      if groups are present, choose between `stack`, `overlay` or `dodge`
    normalize : bool
      normalize histogram counts
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    # TODO: performance improvement
    # TODO: add support for categorical variables in x

    if position not in ['overlay', 'stack', 'dodge']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    if (bins is None) and (bin_width is None):
        log.error("Either bins or bin_width should be defined")
        raise ValueError("Either bins or bin_width should be defined")

    if (bins is not None) and (bin_width is not None):
        log.error("Only one of bins or bin_width should be defined")
        raise ValueError("Only one of bins or bin_width should be defined")

    if isinstance(x, str):
        x = [x]

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['group', 'facet_y'], [group, facet_y]):
        names[label], groups[label] = unname(var)
    xs = []
    for i, var in enumerate(x):
        xs.append('x_{}'.format(i))
        names['x_{}'.format(i)], groups['x_{}'.format(i)] = unname(var)
    names['w'], variables['w'] = unname(w)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {
        c: c
        for c in tmp_df.columns if c in ['group', 'facet_y'] + xs
    }
    non_x_groups = [g for g in new_groups.keys() if g not in xs]

    # bin data (if necessary)
    bins_x = {}
    bin_width_x = {}
    for x in xs:
        if tmp_df[x].dtypes != np.dtype('O'):
            tmp_df[x], bins_x[x], bin_width_x[x] = bin_data(
                tmp_df[x], bins, bin_width)
        else:
            bin_width_x[x] = 1

    # aggregate data and reorder columns
    df_ls = []
    for x in xs:
        # aggregate data
        groups = {g: g for g in non_x_groups}
        groups[x] = x
        single_df = agg_data(tmp_df,
                             variables,
                             groups,
                             'sum',
                             fill_groups=True)
        single_df.fillna(0, inplace=True)
        single_df['facet_x'] = names[x]
        single_df.rename(columns={x: 'x'}, inplace=True)

        # normalize
        if normalize:
            if len(non_x_groups) == 0:
                single_df['w'] = single_df['w'] / (single_df['w'].sum() *
                                                   bin_width_x[x])
            else:
                single_df['w'] = single_df.groupby(non_x_groups)['w'].apply(
                    lambda z: z / (z.sum() * bin_width_x[x]))

        df_ls.append(single_df)
    gdata = pd.concat(df_ls)
    gdata = gdata[[
        c for c in ['x', 'w', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]

    # start plotting
    g = EZPlot(gdata)

    # set groups
    for single_df in df_ls:
        if group is None:
            g += p9.geom_bar(p9.aes(x="x", y="w"),
                             data=single_df,
                             stat='identity',
                             colour=None,
                             fill=ez_colors(1)[0])
        else:
            g += p9.geom_bar(p9.aes(x="x",
                                    y="w",
                                    group="factor(group)",
                                    fill="factor(group)"),
                             data=single_df,
                             colour=None,
                             stat='identity',
                             **POSITION_KWARGS[position])
            g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_y is None:
        g += p9.facet_wrap('~facet_x', scales='free')
    else:
        g += p9.facet_grid('facet_y~facet_x', scales='free')

    # set x scale
    g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab('Value') + \
        p9.ylab('Counts')

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    g += p9.guides(fill=p9.guide_legend(reverse=True))

    return g
Example #20
def plot_umi_mt_density(df_plot,
                        output_file='plot_umi_mt_density',
                        facet_column='none',
                        color_var='density',
                        density_contour=False):
    """Plot plot_umi_mt_density to png.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        DataFrame with the following keys 'total_counts', 'pct_counts_gene_group__mito_transcript'.
    output_file : string
        Basename of output file.
    facet_column : string
        Column to facet the output by.

    Returns
    -------
    None
    """
    if color_var == 'density':
        color_title = 'Density\n'
        # Also calculate density using a gaussian 2d kernel -- use a random
        # name for the plot column
        color_var = "1251234_density"
        df_plot[color_var] = calculate_density(df_plot, facet_column)
    elif color_var == 'pct_counts_gene_group__mito_transcript':
        color_title = '% MT\n'
    elif color_var == 'cell_passes_qc':
        color_title = 'Cell passed QC\n'
    else:
        color_title = color_var
    gplt = plt9.ggplot(
        df_plot,
        plt9.aes(x='total_counts',
                 y='pct_counts_gene_group__mito_transcript',
                 color=color_var))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_point(alpha=0.5, size=0.8)
    gplt = gplt + plt9.scale_x_continuous(
        trans='log10', labels=comma_labels, minor_breaks=0)
    if color_var == 'pct_counts_gene_group__mito_transcript':
        gplt = gplt + plt9.scale_color_gradient2(low='#3B9AB2',
                                                 mid='#EBCC2A',
                                                 high='#F21A00',
                                                 midpoint=50,
                                                 limits=[0, 100])
        gplt = gplt + plt9.guides(color=plt9.guide_colorbar(ticks=False))
    elif color_var == 'cell_passes_qc':
        gplt = gplt + plt9.scale_colour_brewer(type='qual', palette='Dark2')
    elif color_var == '1251234_density':
        gplt = gplt + plt9.scale_color_cmap(cmap_name='viridis')

    if density_contour:
        gplt = gplt + plt9.geom_density_2d(alpha=0.5)
    gplt = gplt + plt9.labs(x='Number of molecules',
                            y='Percent of molecules from MT genes',
                            title='',
                            color=color_title)
    if facet_column != 'none':
        gplt = gplt + plt9.facet_wrap('~ {}'.format(facet_column), ncol=5)
        n_samples = df_plot[facet_column].nunique()
        gplt.save('{}.png'.format(output_file),
                  dpi=300,
                  width=4 * (n_samples / 2),
                  height=4 * (n_samples / 4),
                  limitsize=False)
    else:
        gplt.save('{}.png'.format(output_file), dpi=300, width=4, height=4)
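calculate_density and comma_labels are module helpers not shown here. With a trivial stand-in for comma_labels (plotnine accepts any callable mapping breaks to label strings), the non-density color path can be exercised on synthetic data; every name below is hypothetical:

import numpy as np
import pandas as pd
import plotnine as plt9

def comma_labels(breaks):
    # stand-in for the module helper: format breaks with thousands separators
    return ['{:,.0f}'.format(b) for b in breaks]

rng = np.random.default_rng(2)
df_plot = pd.DataFrame({
    'total_counts': rng.lognormal(8, 1, 500),
    'pct_counts_gene_group__mito_transcript': rng.uniform(0, 100, 500),
    'cell_passes_qc': rng.choice(['Pass', 'Fail'], 500),
})
plot_umi_mt_density(df_plot, output_file='umi_mt_density',
                    color_var='cell_passes_qc')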
Example #21
edges_df = pd.DataFrame.from_records(datarows)
edges_df

# In[11]:

import math
g = (p9.ggplot(edges_df, p9.aes(x="relation", y="edges", fill="in_hetionet")) +
     p9.geom_col(position="dodge") +
     p9.scale_fill_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"]
     }) + p9.geom_text(p9.aes(label=(
         edges_df.apply(lambda x: f"{x['edges']}\n({x['recall']*100:.0f}%)"
                        if not math.isnan(x['recall']) else f"{x['edges']}",
                        axis=1))),
                       position=p9.position_dodge(width=0.9),
                       size=9,
                       va="bottom") + p9.scale_y_log10() +
     p9.labs(y="# of Edges",
             x="Relation Type",
             title="Reconstructing Edges in Hetionet") +
     p9.guides(fill=p9.guide_legend(title="In Hetionet?")) + p9.theme(
         axis_text_y=p9.element_blank(),
         axis_ticks_major=p9.element_blank(),
         rect=p9.element_blank(),
     ))
print(g)
g.save(filename="../edges_added.png", dpi=300)
    x="median expression of PAO1-only genes (TPM)",
    y="median expression of PA14-only genes (TPM)",
    title="TPM of accessory genes in binned PAO1 compendium",
)
fig1 += pn.theme_bw()
fig1 += pn.theme(
    legend_title_align="center",
    plot_background=pn.element_rect(fill="white"),
    legend_key=pn.element_rect(fill="white", colour="white"),
    legend_title=pn.element_text(family="sans-serif", size=15),
    legend_text=pn.element_text(family="sans-serif", size=12),
    plot_title=pn.element_text(family="sans-serif", size=15),
    axis_text=pn.element_text(family="sans-serif", size=10),
    axis_title=pn.element_text(family="sans-serif", size=12),
)
fig1 += pn.guides(colour=pn.guide_legend(override_aes={"alpha": 1}))

print(fig1)

# +
# Plot accessory gene expression in PA14 compendium
fig2 = pn.ggplot(
    pao1_pa14_acc_pa14_compendium_label,
    pn.aes(x="median acc expression_pao1", y="median acc expression_pa14"),
)
fig2 += pn.geom_point(pn.aes(color="Strain type_pa14"), alpha=0.4)
fig2 += pn.labs(
    x="median expression of PAO1-only genes (TPM)",
    y="median expression of PA14-only genes (TPM)",
    title="TPM of accessory genes in binned PA14 compendium",
)
Example #23
def plot_xbs(df, group, var, n_side=9, n_delta=6):
    r"""Construct Xbar and S chart

    Construct an Xbar and S chart to assess the state of statistical control of
    a dataset.

    Args:
        df (DataFrame): Data to analyze
        group (str): Variable for grouping
        var (str): Variable to study

    Keyword args:
        n_side (int): Number of consecutive runs above/below centerline to flag
        n_delta (int): Number of consecutive runs increasing/decreasing to flag

    Returns:
        plotnine object: Xbar and S chart

    Examples::

        import grama as gr
        DF = gr.Intention()

        from grama.data import df_shewhart
        (
            df_shewhart
            >> gr.tf_mutate(idx=DF.index // 10)
            >> gr.pt_xbs("idx", "tensile_strength")
        )

    """
    ## Prepare the data
    DF = Intention()
    df_batched = (df >> tf_group_by(group) >> tf_summarize(
        X=mean(DF[var]),
        S=sd(DF[var]),
        n=nfcn(DF.index),
    ) >> tf_ungroup())

    df_stats = (df_batched >> tf_summarize(
        X_center=mean(DF.X),
        S_biased=mean(DF.S),
        n=mean(DF.n),
    ))
    n = df_stats.n[0]
    df_stats["S_center"] = df_stats.S_biased / c_sd(n)
    df_stats["X_LCL"] = df_stats.X_center - 3 * df_stats.S_center / sqrt(n)
    df_stats["X_UCL"] = df_stats.X_center + 3 * df_stats.S_center / sqrt(n)
    df_stats["S_LCL"] = B3(n) * df_stats.S_center
    df_stats["S_UCL"] = B4(n) * df_stats.S_center

    ## Reshape for plotting
    df_stats_long = (df_stats >> tf_pivot_longer(
        columns=["X_LCL", "X_center", "X_UCL", "S_LCL", "S_center", "S_UCL"],
        names_to=["_var", "_stat"],
        names_sep="_",
        values_to="_value",
    ))
    # Fake group value to avoid issue with discrete group variable
    df_stats_long[group] = [df_batched[group].values[0]] * df_stats_long.shape[0]

    df_batched_long = (
        df_batched >> tf_pivot_longer(
            columns=["X", "S"],
            names_to="_var",
            values_to="_value",
        )
        ## Flag patterns
        >> tf_left_join(
            df_stats >> tf_pivot_longer(
                columns=[
                    "X_LCL", "X_center", "X_UCL", "S_LCL", "S_center", "S_UCL"
                ],
                names_to=["_var", ".value"],
                names_sep="_",
            ),
            by="_var",
        ) >> tf_group_by("_var") >> tf_mutate(
            outlier_below=(DF._value < DF.LCL),  # Outside control limits
            outlier_above=(DF.UCL < DF._value),
            below=consec(DF._value < DF.center, i=n_side),  # Below mean
            above=consec(DF.center < DF._value, i=n_side),  # Above mean
        ) >> tf_mutate(
            decreasing=consec((lead(DF._value) - DF._value) < 0, i=n_delta - 1)
            |  # Decreasing
            consec((DF._value - lag(DF._value)) < 0, i=n_delta - 1),
            increasing=consec(0 < (lead(DF._value) - DF._value), i=n_delta - 1)
            |  # Increasing
            consec(0 < (DF._value - lag(DF._value)), i=n_delta - 1),
        ) >> tf_mutate(
            sign=case_when([DF.outlier_below, "-2"], [DF.outlier_above, "+2"],
                           [DF.below | DF.decreasing, "-1"],
                           [DF.above | DF.increasing, "+1"], [True, "0"]),
            glyph=case_when(
                [DF.outlier_below, "Below Limit"],
                [DF.outlier_above, "Above Limit"],
                [DF.below, "Low Run"],
                [DF.above, "High Run"],
                [DF.increasing, "Increasing Run"],
                [DF.decreasing, "Decreasing Run"],
                [True, "None"],
            )) >> tf_ungroup())

    ## Visualize
    return (df_batched_long >> ggplot(aes(x=group)) + geom_hline(
        data=df_stats_long,
        mapping=aes(yintercept="_value", linetype="_stat"),
    ) + geom_line(aes(y="_value", group="_var"), size=0.2) + geom_point(
        aes(y="_value", color="sign", shape="glyph"),
        size=3,
    ) + scale_color_manual(values={
        "-2": "blue",
        "-1": "darkturquoise",
        "0": "black",
        "+1": "salmon",
        "+2": "red"
    }, ) + scale_shape_manual(
        name="Patterns",
        values={
            "Below Limit": "s",
            "Above Limit": "s",
            "Low Run": "X",
            "High Run": "X",
            "Increasing Run": "^",
            "Decreasing Run": "v",
            "None": "."
        },
    ) + scale_linetype_manual(
        name="Guideline",
        values=dict(LCL="dashed", UCL="dashed", center="solid"),
    ) + guides(color=None) + facet_grid(
        "_var~.",
        scales="free_y",
        labeller=labeller(dict(X="Mean", S="Variability")),
    ) + labs(
        x="Group variable ({})".format(group),
        y="Value ({})".format(var),
    ))
Example #25
def hist_plot(df,
              x,
              y=None,
              group = None,
              facet_x = None,
              facet_y = None,
              w='1',
              bins=21,
              bin_width = None,
              position = 'stack',
              normalize = False,
              sort_groups=True,
              base_size=10,
              figure_size=(6, 3)):

    '''
    Plot a 1-d or 2-d histogram

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis. If this is specified the histogram will be 2-d.
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    w : str
      quoted expression representing histogram weights (default is 1)
    bins : int or tuple
      number of bins to be used
    bin_width : float or tuple
      bin width to be used
    position : str
      if groups are present, choose between `stack`, `overlay` or `dodge`
    normalize : bool
      normalize histogram counts
    sort_groups : bool
      sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    if position not in ['overlay', 'stack', 'dodge']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    if (bins is None) and (bin_width is None):
        log.error("Either bins or bin_width should be defined")
        raise ValueError("Either bins or bin_width should be defined")

    if (bins is not None) and (bin_width is not None):
        log.error("Only one of bins or bin_width should be defined")
        raise ValueError("Only one of bins or bin_width should be defined")

    if (y is not None) and (group is not None):
        log.error("y and group cannot be requested at the same time")
        raise ValueError("y and group cannot be requested at the same time")

    if y is None:
        bins = (bins, bins)
        bin_width = (bin_width, bin_width)
    else:
        if type(bins) not in [tuple, list]:
            bins = (bins, bins)
        if type(bin_width) not in [tuple, list]:
            bin_width = (bin_width, bin_width)

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'y', 'group', 'facet_x', 'facet_y'], [x, y, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['w'], variables['w'] = unname(w)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {c:c for c in tmp_df.columns if c in ['x', 'y', 'group', 'facet_x', 'facet_y']}
    non_xy_groups = [g for g  in new_groups.keys() if g not in ['x', 'y']]
    new_variables = {'w':'w'}

    # bin data (if necessary)
    if tmp_df['x'].dtypes != np.dtype('O'):
        tmp_df['x'], bins_x, bin_width_x = bin_data(tmp_df['x'], bins[0], bin_width[0])
    else:
        bin_width_x = 1
    if y is not None:
        if tmp_df['y'].dtypes != np.dtype('O'):
            tmp_df['y'], bins_y, bin_width_y = bin_data(tmp_df['y'], bins[1], bin_width[1])
        else:
            bin_width_y = 1
    else:
        bin_width_y = 1

    # aggregate data and reorder columns
    gdata = agg_data(tmp_df, new_variables, new_groups, 'sum', fill_groups=True)
    gdata.fillna(0, inplace=True)
    gdata = gdata[[c for c in ['x', 'y', 'w', 'group', 'facet_x', 'facet_y'] if c in gdata.columns]]

    # normalize
    if normalize:
        if len(non_xy_groups)==0:
            gdata['w'] = gdata['w']/(gdata['w'].sum()*bin_width_x*bin_width_y)
        else:
            gdata['w'] = gdata.groupby(non_xy_groups)['w'].apply(lambda x: x/(x.sum()*bin_width_x*bin_width_y))

    # start plotting
    g = EZPlot(gdata)
    # determine order and create a categorical type
    if (group is not None) and sort_groups:
        if g.column_is_categorical('x'):
            g.sort_group('x', 'w', ascending=False)
        g.sort_group('group', 'w')
        g.sort_group('facet_x', 'w', ascending=False)
        g.sort_group('facet_y', 'w', ascending=False)
        if groups:
            colors = np.flip(ez_colors(g.n_groups('group')))
    elif (group is not None):
        colors = ez_colors(g.n_groups('group'))

    if y is None:
        # set groups
        if group is None:
            g += p9.geom_bar(p9.aes(x="x", y="w"),
                             stat = 'identity',
                             colour = None,
                             fill = ez_colors(1)[0])
        else:
            g += p9.geom_bar(p9.aes(x="x", y="w",
                                    group="factor(group)",
                                    fill="factor(group)"),
                             colour=None,
                             stat = 'identity',
                             **POSITION_KWARGS[position])
            g += p9.scale_fill_manual(values=colors)

        # set facets
        if facet_x is not None and facet_y is None:
            g += p9.facet_wrap('~facet_x')
        if facet_x is not None and facet_y is not None:
            g += p9.facet_grid('facet_y~facet_x')

        # set x scale
        if g.column_is_categorical('x'):
            g += p9.scale_x_discrete()
        else:
            g += p9.scale_x_continuous(labels=ez_labels)

        # set y scale
        g += p9.scale_y_continuous(labels=ez_labels)

        # set axis labels
        g += \
            p9.xlab(names['x']) + \
            p9.ylab('Counts')

        # set theme
        g += theme_ez(figure_size=figure_size,
                      base_size=base_size,
                      legend_title=p9.element_text(text=names['group'], size=base_size))

        if sort_groups:
            g += p9.guides(fill=p9.guide_legend(reverse=True))

    else:
        g += p9.geom_tile(p9.aes(x="x", y="y", fill='w'),
                          stat = 'identity',
                          colour = None)

        # set facets
        if facet_x is not None and facet_y is None:
            g += p9.facet_wrap('~facet_x')
        if facet_x is not None and facet_y is not None:
            g += p9.facet_grid('facet_y~facet_x')

        # set x scale
        if g.column_is_categorical('x'):
            g += p9.scale_x_discrete()
        else:
            g += p9.scale_x_continuous(labels=ez_labels)

        # set y scale
        if g.column_is_categorical('y'):
            g += p9.scale_y_discrete()
        else:
            g += p9.scale_y_continuous(labels=ez_labels)

        # set axis labels
        g += \
            p9.xlab(names['x']) + \
            p9.ylab(names['y'])

        # set theme
        g += theme_ez(figure_size=figure_size,
                      base_size=base_size,
                      legend_title=p9.element_text(text='Counts', size=base_size))

    return g
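
The binning above delegates to a bin_data helper defined elsewhere in the module. As a rough illustration of what it appears to do at this call site (equal-width binning returning the binned values together with the bin edges and the bin width, so that the normalization w / (sum(w) * bin_width_x * bin_width_y) yields a proper density), here is a hypothetical sketch; the name bin_data_sketch and its internals are assumptions, only the return shape is taken from the call above.

import numpy as np
import pandas as pd

def bin_data_sketch(values, bins=21, bin_width=None):
    # Hypothetical stand-in for the bin_data helper used above: cut a numeric
    # series into equal-width bins and return (binned values, edges, width).
    values = pd.Series(values).astype(float)
    if bin_width is None:
        edges = np.linspace(values.min(), values.max(), bins + 1)
        bin_width = edges[1] - edges[0]
    else:
        edges = np.arange(values.min(), values.max() + bin_width, bin_width)
    centers = (edges[:-1] + edges[1:]) / 2
    binned = pd.cut(values, edges, labels=centers, include_lowest=True)
    return binned.astype(float), edges, bin_width

# Counts divided by (total * bin width) integrate to 1, i.e. a density.
x = pd.Series(np.random.default_rng(0).normal(size=1000))
binned, edges, width = bin_data_sketch(x, bins=20)
w = binned.value_counts().sort_index()
density = w / (w.sum() * width)
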
def generate_map(data,
                 region,
                 value_field,
                 iso_field='iso',
                 scale_params=None,
                 plot_na_dots=False,
                 tolerance=None,
                 plot_size=8,
                 out_region_color='#f0f0f0',
                 na_color='#aaaaaa',
                 line_color='#666666',
                 projection=None):
    """
    This function returns a map plot with the specified options.

    :param pandas.DataFrame data: Data to be plotted.
    :param str region: Region to center the map around. Countries outside
        the chosen region will be obscured.
    :param str value_field: Column of *data* with the values to be plotted.
    :param str iso_field: Column of *data* with the ISO3 codes for each
        country.
    :param dict scale_params: Dictionary of parameters to be passed to the
        ggplot corresponding color scale (continuous or discrete).
    :param bool plot_na_dots: Whether to plot the dots for small countries
        if said country doesn't have data available.
    :param int tolerance: Coordinate tolerance for polygon simplification,
        a higher number will result in simpler polygons and faster
        rendering (see DEFAULT_TOLERANCES).
    :param int plot_size: Size of the plot, which determines the relative sizes
        of the elements within.
    :param str out_region_color: Hex color of the countries that are out of the
        specified region.
    :param str na_color: Hex color of the countries with no data available.
    :param str line_color: Color of the country borders.
    :param str projection: Kind of map projection to be used in the map.
        Currently, Oceania (XOX) is only available in EPSG:4326 to enable
        wrapping.
    :returns: a ggplot-like plot with the map
    :rtype: plotnine.ggplot
    """
    if projection is None:
        if region == 'XOX':
            projection = 'epsg4326'
        else:
            projection = 'robinson'

    if projection not in PROJECTION_DICT.keys():
        raise ValueError('Projection "{}" not valid'.format(projection))

    if scale_params is None:
        scale_params = {}

    if region not in REGION_BOUNDS[projection]:
        raise ValueError(
            '"region" not available. Valid regions are: {}'.format(', '.join(
                REGION_BOUNDS[projection].keys())))

    if tolerance is None:
        tolerance = DEFAULT_TOLERANCES[projection][region]

    countries = GeoDataFrame.from_file(
        os.path.join(os.path.dirname(__file__), 'data/world-countries.shp'))

    # To plot Oceania we need the original EPSG:4326 to wrap around the 180°
    # longitude. In other cases transform to the desired projection.
    if region == 'XOX':
        countries.crs['lon_wrap'] = '180'  # Wrap around longitude 180°

        XOX_countries = countries['continent'] == 'XOX'
        countries[XOX_countries] = countries[XOX_countries].to_crs(
            countries.crs)
        centroids = countries[XOX_countries].apply(
            lambda row: row['geometry'].centroid, axis=1)
        countries.loc[XOX_countries, 'lon'] = [c.x for c in centroids]
        countries.loc[XOX_countries, 'lat'] = [c.y for c in centroids]
    else:
        if projection != 'epsg4326':
            countries = countries.to_crs(PROJECTION_DICT[projection])
            centroids = countries.apply(lambda row: row['geometry'].centroid,
                                        axis=1)
            countries['lon'] = [c.x for c in centroids]
            countries['lat'] = [c.y for c in centroids]

    countries['geometry'] = countries['geometry'].simplify(tolerance)

    upper_left, lower_right = REGION_BOUNDS[projection][region]
    limits_x = [upper_left[0], lower_right[0]]
    limits_y = [lower_right[1], upper_left[1]]
    ratio = (limits_x[1] - limits_x[0]) / (limits_y[1] - limits_y[0])

    plot_data = pd.merge(countries,
                         data,
                         how='left',
                         left_on='iso',
                         right_on=iso_field)
    map_bounds = REGION_BOUNDS['epsg4326'][region]
    map_area = ((map_bounds[1][0] - map_bounds[0][0]) *
                (map_bounds[0][1] - map_bounds[1][1]))
    plot_data['plot_dot'] = (plot_data['pol_area'] < DOT_THRESHOLD * map_area)

    if not plot_na_dots:
        plot_data['plot_dot'] &= ~pd.isnull(plot_data[value_field])

    if region != 'XWX':
        in_region = ((~pd.isnull(plot_data[value_field])) &
                     (plot_data['continent'] == region))
        in_region_missing = ((pd.isnull(plot_data[value_field])) &
                             (plot_data['continent'] == region))
        out_region = plot_data['continent'] != region
    else:
        in_region = ~pd.isnull(plot_data[value_field])
        in_region_missing = pd.isnull(plot_data[value_field])
        out_region = np.repeat(False, len(plot_data))

    if plot_data[value_field].dtype == 'object':
        # Assume discrete values
        fill_scale = scale_fill_brewer(**scale_params, drop=False)
    else:
        # Assume continuous values
        fill_scale = scale_fill_gradient(**scale_params)

    plot_data_values = plot_data[in_region]
    plot_data_missing = plot_data[in_region_missing]
    plot_data_out_region = plot_data[out_region]

    dots_region = plot_data_values[plot_data_values['plot_dot']]
    dots_region_missing = plot_data_missing[plot_data_missing['plot_dot']]
    dots_out_region = plot_data_out_region[plot_data_out_region['plot_dot']]

    plt = (
        ggplot() + geom_map(plot_data_values,
                            aes(fill=value_field),
                            color=line_color,
                            size=0.3) +
        geom_map(
            plot_data_missing, aes(color='plot_dot'), fill=na_color,
            size=0.3) + geom_map(plot_data_out_region,
                                 fill=out_region_color,
                                 color=line_color,
                                 size=0.3) +
        geom_point(dots_region,
                   aes(x='lon', y='lat', fill=value_field),
                   size=3,
                   stroke=.1,
                   color=line_color) + geom_point(dots_region_missing,
                                                  aes(x='lon', y='lat'),
                                                  fill=na_color,
                                                  size=3,
                                                  stroke=.1,
                                                  color=line_color) +
        geom_point(dots_out_region,
                   aes(x='lon', y='lat'),
                   fill=out_region_color,
                   size=3,
                   stroke=.1,
                   color=line_color) +
        scale_x_continuous(breaks=[], limits=limits_x) +
        scale_y_continuous(breaks=[], limits=limits_y) + theme(
            figure_size=(plot_size * ratio, plot_size),
            panel_background=element_rect(fill='white', color='black'),
            #  panel_border=element_rect(fill='white',
            #                            color='black',
            #                            size=.1),
            legend_background=element_rect(
                fill="white", color='black', size=.5),
            legend_box_just='left') + xlab('') + ylab(''))

    if len(plot_data_values.index) > 0:
        plt += fill_scale

    plt += scale_color_manual(name=' ',
                              values=[line_color],
                              breaks=[False],
                              labels=['No data available'])

    if plot_data[value_field].dtype == 'object':
        plt += guides(fill=guide_legend(override_aes={'shape': None}))

    return {
        'plot': plt,
        'ratio': ratio,
    }
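
A minimal usage sketch for the function above. It assumes the module's data files and constants (REGION_BOUNDS, PROJECTION_DICT, DEFAULT_TOLERANCES, DOT_THRESHOLD) are importable alongside generate_map; the region code 'XWX' (whole world) is inferred from the check inside the function, and the input frame is made up for illustration.

import pandas as pd

df = pd.DataFrame({
    'iso': ['ESP', 'FRA', 'DEU'],    # ISO3 country codes
    'value': [1.2, 3.4, 2.1],        # continuous values -> gradient scale
})

result = generate_map(df,
                      region='XWX',
                      value_field='value',
                      iso_field='iso')
# The returned ratio keeps the region's aspect ratio when saving.
result['plot'].save('world_map.png',
                    width=8 * result['ratio'],
                    height=8)
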
Example No. 27
def no_colorbar_ticks():
    return pn.guides(color=pn.guide_colorbar(ticks=False))
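
A short usage sketch (the plot here is made up): the helper simply appends a guides() spec, so it composes with any plot that has a continuous color scale.

import pandas as pd
import plotnine as pn

df = pd.DataFrame({'x': range(10), 'y': range(10), 'z': range(10)})
p = (pn.ggplot(df, pn.aes('x', 'y', color='z'))
     + pn.geom_point()
     + no_colorbar_ticks())  # colorbar legend without tick marks
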
Example No. 28
def plot_empirical_buzz():
    proto_df = pd.read_hdf(
        "data/external/datasets/protobowl/protobowl-042818.log.h5")
    dataset = read_json(QANTA_MAPPED_DATASET_PATH)
    questions = {q["qanta_id"]: q for q in dataset["questions"]}
    folds = {
        q["proto_id"]: q["fold"]
        for q in questions.values() if q["proto_id"] is not None
    }
    proto_df["fold"] = proto_df["qid"].map(lambda x: folds[x]
                                           if x in folds else None)
    proto_df["n"] = 1
    buzztest_df = proto_df[proto_df.fold == "buzztest"]
    play_counts = (
        buzztest_df.groupby("qid").count().reset_index().sort_values(
            "fold", ascending=False))
    qid_to_counts = {r.qid: r.n for r in play_counts.itertuples()}
    popular_questions = play_counts.qid.tolist()
    curve = CurveScore()
    x = np.linspace(0, 1, 100)
    y = [curve.get_weight(n) for n in x]
    curve_df = pd.DataFrame({"buzzing_position": x, "result": y})
    curve_df["qid"] = "Expected Wins Curve Score"
    curve_df["source"] = "Curve Score | Average"
    proto_ids = popular_questions[:10]
    frames = []
    for proto_id in proto_ids:
        plays = buzztest_df[buzztest_df.qid == proto_id].sort_values(
            "buzzing_position")
        plays = plays[plays.result != "prompt"]
        plays["result"] = plays["result"].astype(int)
        frames.append(plays)
    sample_df = pd.concat(frames)

    rows = []
    for qid, group_df in sample_df.groupby("qid"):
        n_opp_correct = 0
        n_opp_total = 0
        n = qid_to_counts[qid]
        rows.append({
            "buzzing_position": 0,
            "n_opp_correct": 0,
            "n_opp_total": 1,
            "qid": f"Question with {n} Plays",
            "source": "Single Question",
            "n_plays": n,
        })
        for r in group_df.itertuples():
            if r.result == 1:
                n_opp_correct += 1
            n_opp_total += 1
            rows.append({
                "buzzing_position": r.buzzing_position,
                "n_opp_correct": n_opp_correct,
                "n_opp_total": n_opp_total,
                "qid": f"Question with {n} Plays",
                "source": "Single Question",
                "n_plays": n,
            })
    n_opp_correct = 0
    n_opp_total = 0
    for r in sample_df.sort_values("buzzing_position").itertuples():
        if r.result == 1:
            n_opp_correct += 1
        n_opp_total += 1
        rows.append({
            "buzzing_position": r.buzzing_position,
            "n_opp_correct": n_opp_correct,
            "n_opp_total": n_opp_total,
            "qid": "Average of Most Played",
            "source": "Curve Score | Average",
        })

    df = pd.DataFrame(rows)
    df["p_opp_correct"] = df["n_opp_correct"] / df["n_opp_total"]
    df["p_win"] = 1 - df["p_opp_correct"]
    df["result"] = df["p_win"]

    def order(c):
        if c.startswith("Expected"):
            return -1000
        elif c.startswith("Average"):
            return -999
        elif c.startswith("Question with"):
            return -int(c.split()[2])
        else:
            return 1000

    categories = list(set(df.qid.tolist()) | set(curve_df.qid.tolist()))
    categories = sorted(categories, key=order)
    categories = pd.CategoricalDtype(categories, ordered=True)
    df["qid"] = df["qid"].astype(categories)
    cmap = plt.get_cmap("tab20")
    colors = [matplotlib.colors.to_hex(c) for c in cmap.colors]
    filter_df = df[df.n_opp_total > 4]
    chart = (p9.ggplot(
        filter_df,
        p9.aes(x="buzzing_position", y="result", color="qid"),
    ) + p9.geom_line(
        p9.aes(linetype="source"),
        data=filter_df[filter_df.source.map(lambda s: s.startswith("Curve"))],
        size=2,
    ) + p9.geom_line(
        p9.aes(linetype="source"),
        data=filter_df[filter_df.source.map(
            lambda s: not s.startswith("Curve"))],
        size=0.5,
    ) + p9.geom_line(
        p9.aes(x="buzzing_position", y="result", linetype="source"),
        data=curve_df,
        size=2,
    ) + p9.labs(
        x="Position in Question (%)",
        y="Empirical Probability of Winning",
        linetype="Data Type",
        color="Data Source",
    ) + p9.guides(size=False) + p9.scale_color_manual(values=colors) +
             theme_pedroai() + p9.theme(legend_position="right"))
    chart.save("output/empirical_buzz.pdf")
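
The order helper above does the real work for the legend: plotnine lays out a discrete color legend in the category order of an ordered categorical, so sorting the labels with a custom key and casting through pd.CategoricalDtype pins the legend sequence. A stripped-down illustration of the same trick on made-up labels:

import pandas as pd

labels = ["Question with 12 Plays", "Average of Most Played",
          "Question with 80 Plays", "Expected Wins Curve Score"]

def order(c):
    # Curve first, then the average, then questions by descending play count.
    if c.startswith("Expected"):
        return -1000
    elif c.startswith("Average"):
        return -999
    elif c.startswith("Question with"):
        return -int(c.split()[2])
    else:
        return 1000

cat_type = pd.CategoricalDtype(sorted(labels, key=order), ordered=True)
s = pd.Series(labels).astype(cat_type)
print(list(s.cat.categories))
# ['Expected Wins Curve Score', 'Average of Most Played',
#  'Question with 80 Plays', 'Question with 12 Plays']
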
fig += pn.labs(x='UMAP 1',
               y='UMAP 2',
               title='Gene expression data in gene space')
fig += pn.theme_bw()
fig += pn.theme(
    legend_title_align = "center",
    plot_background=pn.element_rect(fill='white'),
    legend_key=pn.element_rect(fill='white', colour='white'), 
    legend_title=pn.element_text(family='sans-serif', size=15),
    legend_text=pn.element_text(family='sans-serif', size=12),
    plot_title=pn.element_text(family='sans-serif', size=15),
    axis_text=pn.element_text(family='sans-serif', size=12),
    axis_title=pn.element_text(family='sans-serif', size=15)
    )
fig += pn.scale_color_manual(['#bdbdbd', 'red', 'blue'])
fig += pn.guides(colour=pn.guide_legend(override_aes={'alpha': 1}))

print(fig)


# ## PCA in latent space

# In[21]:


# Model files
model_encoder_filename = glob.glob(os.path.join(vae_model_dir, "*_encoder_model.h5"))[0]
weights_encoder_filename = glob.glob(os.path.join(vae_model_dir, "*_encoder_weights.h5"))[0]
model_decoder_filename = glob.glob(os.path.join(vae_model_dir, "*_decoder_model.h5"))[0]
weights_decoder_filename = glob.glob(os.path.join(vae_model_dir, "*_decoder_weights.h5"))[0]
# Plot
fig = ggplot(input_data_UMAPencoded_df, aes(x='1', y='2'))
fig += geom_point(aes(color='dataset'), alpha=0.2)
fig += labs(x='UMAP 1',
            y='UMAP 2',
            title='UMAP of normalized compendium')
fig += theme_bw()
fig += theme(
    legend_title_align = "center",
    plot_background=element_rect(fill='white'),
    legend_key=element_rect(fill='white', colour='white'), 
    legend_title=element_text(family='sans-serif', size=15),
    legend_text=element_text(family='sans-serif', size=12),
    plot_title=element_text(family='sans-serif', size=15),
    axis_text=element_text(family='sans-serif', size=12),
    axis_title=element_text(family='sans-serif', size=15)
    )
fig += guides(colour=guide_legend(override_aes={'alpha': 1}))
fig += scale_color_manual(['#ff6666', '#add8e6'])

print(fig)


# **Observations:**
# * There looks to be a good amount of variance in the compendium overall.
# * Using a split of 25% seems to get a similar distribution of data between training and validation sets.
# * Remember, the dataset is in 17K dimensional space, which will make the small clusters difficult to represent during training
# 
# Overall, having so many features in our dataset points to the need for more samples to represent the structure in the compendium. For now, we are limited by memory to only select a subset of recount2, but in a future iteration perhaps this will be updated.
Example No. 31
def plot_sinew_outputs(
    df, var=None, out=None, sweep_ind="sweep_ind", sweep_var="sweep_var"
):
    r"""Construct sinew plot

    Create a relational lineplot with hues for each sweep. Often used to
    visualize the outputs of a sinew design.

    Usually called as a dispatch from plot_auto().

    Args:
        df (Pandas DataFrame): Input design data with output results
        var (list of strings): Variables to plot
        out (list of strings): Outputs to plot
        sweep_ind (string): Sweep index column in df
        sweep_var (string): Swept variable column in df

    Returns:
        A plotnine faceted line plot (one panel per output/input pair)

    Examples:

        >>> import grama as gr
        >>> import matplotlib.pyplot as plt
        >>> from grama.models import make_cantilever_beam
        >>> md = make_cantilever_beam()
        >>> ## Dispatch from autoplotter
        >>> (
        >>>     md
        >>>     >> gr.ev_sinews(df_det="swp")
        >>>     >> gr.pt_auto()
        >>> )
        >>> ## Re-create without metadata
        >>> (
        >>>     md
        >>>     >> gr.ev_sinews(df_det="swp")
        >>>     >> gr.pt_sinew_inputs(var=md.var, out=md.out)
        >>> )

    """
    if var is None:
        raise ValueError("Must provide input columns list as keyword arg var")
    if out is None:
        raise ValueError("Must provide output columns list as keyword arg out")

    ## Prepare data
    # Gather inputs
    id_vars = [col for col in df.columns if col not in var]
    df_tmp = melt(df, id_vars, var, "_var", "_x")

    # Gather outputs
    id_vars = [col for col in df_tmp.columns if col not in out]
    df_plot = melt(df_tmp, id_vars, out, "_out", "_y")

    # Filter off-sweep values
    df_plot = df_plot[df_plot[sweep_var] == df_plot["_var"]]

    breaks_min = lambda lims: (lims[0], 0.5 * (lims[0] + lims[1]), lims[1])
    return (
        df_plot
        >> ggplot(aes(
            "_x",
            "_y",
            color="factor(" + sweep_ind + ")",
            group="factor(" + sweep_ind + ")",
        ))
        + geom_line()
        + facet_grid("_out~_var", scales="free")

        + scale_x_continuous(
            breaks=breaks_min,
            labels=_sci_format,
        )
        + scale_y_continuous(
            breaks=breaks_min,
            labels=_sci_format,
        )
        + guides(color=None)
        + theme_minimal()
        + theme(
            strip_text_y=element_text(angle=0),
            panel_border=element_rect(color="black", size=0.5),
        )
        + labs(
            x="Input Value",
            y="Output Value",
        )
    )
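
The double melt in the data-preparation step is the heart of this plot: inputs are first gathered into (_var, _x) pairs, outputs into (_out, _y), and the resulting cross-product is filtered so each facet only shows the variable that was actually swept. A small self-contained illustration with plain pandas (column names mirror the function above; the data is made up):

import pandas as pd

df = pd.DataFrame({
    "x1": [0.0, 0.5, 1.0], "x2": [1.0, 1.0, 1.0],
    "f": [0.1, 0.4, 0.9],
    "sweep_var": ["x1", "x1", "x1"], "sweep_ind": [0, 0, 0],
})

# Gather inputs into long form: one row per (input variable, value)
id_vars = [c for c in df.columns if c not in ["x1", "x2"]]
tmp = df.melt(id_vars=id_vars, value_vars=["x1", "x2"],
              var_name="_var", value_name="_x")

# Gather outputs the same way
id_vars = [c for c in tmp.columns if c not in ["f"]]
long_df = tmp.melt(id_vars=id_vars, value_vars=["f"],
                   var_name="_out", value_name="_y")

# Keep only the rows where the melted input is the swept one
long_df = long_df[long_df["sweep_var"] == long_df["_var"]]
print(long_df[["_var", "_x", "_out", "_y"]])
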