コード例 #1
0
ファイル: _cell_cycle_phase.py プロジェクト: csgroen/scycle
def cell_cycle_phase_barplot(adata, palette='Set2'):
    """Draws a horizontal barplot of the cells found in each cell cycle phase

    See also: cell_cycle_phase_pieplot for the matplotlib pie chart


    Parameters
    -----------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.annotate_cell_cycle`; the `cell_cycle_phase` column
        of `adata.obs` is what gets plotted.
    palette: str
        Name of the qualitative palette handed to `scale_fill_brewer`.

    Returns
    -----------
    A plotnine barplot with the total counts of cells in each phase of the
    cell cycle.

    """
    # Fixed display order of the phases along the (flipped) axis.
    phase_order = ['G1 post-mitotic', 'G1 pre-replication', 'S/G2/M']

    # Work on a copy so the categorical conversion never touches adata.obs.
    obs = adata.obs.copy()
    obs['cell_cycle_phase'] = pd.Categorical(obs['cell_cycle_phase'],
                                             categories=phase_order)

    # Only the x grid lines are kept, drawn thicker than the default.
    grid_theme = theme(panel_grid_major_y=element_blank(),
                       panel_grid_minor_y=element_blank(),
                       panel_grid_major_x=element_line(size=1.5),
                       panel_grid_minor_x=element_line(size=1.5))

    bars = ggplot(obs, aes('cell_cycle_phase', fill='cell_cycle_phase'))
    bars = bars + geom_bar()
    bars = bars + coord_flip()
    # The fill legend would only repeat the axis labels.
    bars = bars + guides(fill=False)
    bars = bars + labs(y='', x='Cell cycle phase')
    bars = bars + theme_light()
    bars = bars + grid_theme
    bars = bars + scale_fill_brewer(type='qual', palette=palette)
    return bars
コード例 #2
0
ファイル: continuous.py プロジェクト: fagan2888/ergo
 def comparison_plot(self,
                     df: pd.DataFrame,
                     xmin=None,
                     xmax=None,
                     bw="normal_reference",
                     **kwargs):
     """Overlay one density curve per group found in ``df``.

     The first column of ``df`` selects the fill colour (the group) and the
     second column holds the values whose density is drawn.

     :param df: frame whose first two columns are (group, value)
     :param xmin: lower x limit, forwarded to ``self._scale_x``
     :param xmax: upper x limit, forwarded to ``self._scale_x``
     :param bw: bandwidth setting forwarded to ``geom_density``
     :param kwargs: accepted but not used by this implementation
     :return: ggplot graphics object
     """
     group_col = df.columns[0]
     value_col = df.columns[1]

     density = ggplot(df, aes(value_col, fill=group_col))
     density = density + scale_fill_brewer(type="qual", palette="Pastel1")
     density = density + geom_density(bw=bw, alpha=0.8)
     density = density + ggtitle(self.plot_title)
     density = density + self._scale_x(xmin, xmax)
     return density + ergo_theme
コード例 #3
0
def all_stack(fold=BUZZER_DEV_FOLD):
    """Save a stacked-area plot of buzzing frequency, one facet per buzzer.

    The per-model tables returned by ``stack`` for the RNN, MLP and Threshold
    buzzers are concatenated (in that order) and drawn as filled areas of
    ``Frequency`` over ``Position``, coloured by ``Buzzing`` and faceted by
    ``Model``. The figure is written to ``output/buzzer/<fold>_stack.pdf``;
    nothing is returned.

    Args:
        fold: Name of the data fold; passed to ``stack`` and used in the
            output file name.
    """
    # Local import: pd.concat replaces DataFrame.append, which was removed in
    # pandas 2.0, without relying on a module-level pandas alias.
    import pandas as pd

    buzzers = [
        ('output/buzzer/RNNBuzzer', 'RNN'),
        ('output/buzzer/MLPBuzzer', 'MLP'),
        ('output/buzzer/ThresholdBuzzer', 'Threshold'),
    ]
    frames = [stack(path, label, fold) for path, label in buzzers]
    df = pd.concat(frames, ignore_index=True)

    # An explicit category order fixes the left-to-right order of the facets.
    model_type = CategoricalDtype(categories=['Threshold', 'MLP', 'RNN'])
    df['Model'] = df['Model'].astype(model_type)

    p = (ggplot(df) +
         geom_area(aes(x='Position', y='Frequency', fill='Buzzing')) +
         facet_grid('~ Model') + theme_fs() + theme(aspect_ratio=1) +
         scale_fill_brewer(type='div', palette=7))
    p.save('output/buzzer/{}_stack.pdf'.format(fold))
コード例 #4
0
ファイル: lineardate.py プロジェクト: chrisorwa/ergo
    def comparison_plot(  # type: ignore
            self,
            df: pd.DataFrame,
            xmin=None,
            xmax=None,
            bins: int = 50,
            **kwargs):
        """Draw one histogram per group found in ``df``, stacked vertically.

        The first column of ``df`` names the group (used for both the fill
        colour and the facet) and the second column holds the values that are
        binned.

        :param df: frame whose first two columns are (group, value)
        :param xmin: lower x limit, forwarded to ``self._scale_x``
        :param xmax: upper x limit, forwarded to ``self._scale_x``
        :param bins: number of histogram bins
        :param kwargs: accepted but not used by this implementation
        :return: ggplot graphics object
        """
        group_col = df.columns[0]
        value_col = df.columns[1]

        hist = ggplot(df, aes(value_col, fill=group_col))
        hist = hist + scale_fill_brewer(type="qual", palette="Pastel1")
        hist = hist + geom_histogram(
            position="identity", alpha=0.9, bins=bins)
        hist = hist + self._scale_x(xmin, xmax)
        hist = hist + facet_wrap(group_col, ncol=1)
        # Each facet already names its group, so the fill legend is dropped.
        hist = hist + guides(fill=False)
        hist = hist + ergo_theme
        return hist + theme(axis_text_x=element_text(rotation=45, hjust=1))
def plot_preprocessing_boxplot_bymodel(dataframe,
                                       models_labels,
                                       metrics_labels,
                                       groups_labels,
                                       figure_size=(14, 4)):
    """
    Build a stack of dodged boxplots, one facet row per model.

    The plot reads four columns of ``dataframe``: ``variable`` (x axis),
    ``value`` (y axis), ``group`` (fill colour) and ``model`` (facet row).
    The three ``*_labels`` arguments are indexed with the raw values of the
    corresponding column to obtain the text displayed on the plot.

    Parameters
    ----------
    dataframe
        Long-format data with the columns listed above.
    models_labels
        Lookup from a ``model`` value to its facet label.
    metrics_labels
        Lookup from a ``variable`` value to its x tick label.
    groups_labels
        Lookup from a ``group`` value to its legend label.
    figure_size
        Figure size handed to the plotnine theme.

    Returns
    -------
    The assembled plotnine plot.
    """

    def metric_names(breaks):
        return [metrics_labels[key] for key in breaks]

    def two_decimals(breaks):
        return ['{:.2f}'.format(tick) for tick in breaks]

    def group_names(breaks):
        return [groups_labels[key] for key in breaks]

    def model_name(key):
        return f'{models_labels[key]}'

    figure = p9.ggplot(dataframe,
                       p9.aes(x='variable', y='value', fill='group'))
    figure = figure + p9.geom_boxplot(position='dodge')
    # Axis scales: readable metric names on x, fixed two-decimal ticks on y.
    figure = figure + p9.scale_x_discrete(name='Metric', labels=metric_names)
    figure = figure + p9.scale_y_continuous(name='Value',
                                            expand=(0, 0.05),
                                            labels=two_decimals)
    # Set2 is chosen so the groups stay distinguishable for color-blind
    # readers.
    figure = figure + p9.scale_fill_brewer(name='Group',
                                           labels=group_names,
                                           type='qual',
                                           palette='Set2')
    # One row per model, each with its own y range.
    figure = figure + p9.facet_grid('model ~ .',
                                    scales='free_y',
                                    labeller=p9.labeller(rows=model_name))
    # No axis titles, small tick fonts, untitled legend on top.
    figure = figure + p9.theme(
        axis_title_x=p9.element_blank(),
        axis_title_y=p9.element_blank(),
        axis_text_x=p9.element_text(size=7),
        axis_text_y=p9.element_text(size=7),
        legend_title=p9.element_blank(),
        legend_position='top',
        legend_box_margin=2,
        figure_size=figure_size,
    )
    return figure
コード例 #6
0
ファイル: plot.py プロジェクト: NPSDC/qb
def all_stack(fold=BUZZER_DEV_FOLD):
    """Write the per-model stacked-area buzzing plot for one fold.

    ``stack`` is called once per buzzer (RNN, MLP, Threshold); the three
    results are concatenated and plotted as areas of ``Frequency`` against
    ``Position``, filled by ``Buzzing``, with one facet column per ``Model``.
    The plot is saved to ``output/buzzer/<fold>_stack.pdf`` and the function
    returns ``None``.

    Args:
        fold: Name of the data fold; forwarded to ``stack`` and embedded in
            the output path.
    """
    # DataFrame.append no longer exists in pandas >= 2.0; concat is the
    # supported equivalent. Imported locally so the function does not depend
    # on a module-level ``pd`` name.
    import pandas as pd

    df = pd.concat(
        [
            stack("output/buzzer/RNNBuzzer", "RNN", fold),
            stack("output/buzzer/MLPBuzzer", "MLP", fold),
            stack("output/buzzer/ThresholdBuzzer", "Threshold", fold),
        ],
        ignore_index=True,
    )

    # The categorical order determines the order of the facets.
    model_type = CategoricalDtype(categories=["Threshold", "MLP", "RNN"])
    df["Model"] = df["Model"].astype(model_type)

    p = (
        ggplot(df)
        + geom_area(aes(x="Position", y="Frequency", fill="Buzzing"))
        + facet_grid("~ Model")
        + theme_fs()
        + theme(aspect_ratio=1)
        + scale_fill_brewer(type="div", palette=7)
    )
    p.save("output/buzzer/{}_stack.pdf".format(fold))
コード例 #7
0
def _make_plots(df_plt, out_file_base, y='AUC', facet_grid='', h_line=''):
    """Save a boxplot of ``y`` against ``resolution`` as a PNG file.

    When ``df_plt`` has a ``sparsity_l1`` column with more than one distinct
    value, boxes and jittered points are additionally split and coloured by
    that value (shown as ``Sparsity``). The caller's DataFrame is left
    unmodified.

    Parameters
    ----------
    df_plt : pandas.DataFrame
        Data to plot. Must contain ``resolution`` and the column named by
        ``y``; ``sparsity_l1`` and the ``facet_grid`` column are optional.
    out_file_base : str
        Prefix of the output path; the file is written to
        ``<out_file_base>-resolution__<y with '-' replaced by '_'>.png``.
    y : str
        Column drawn on the y axis.
    facet_grid : str
        Column used for facet rows; an empty string disables faceting.
    h_line : str or number
        y position of a dash-dotted horizontal reference line; an empty
        string disables it.

    Returns
    -------
    None
    """
    len_x = len(np.unique(df_plt['resolution']))
    if 'sparsity_l1' in df_plt.columns:
        # assign() returns a new frame, so the display column is not written
        # back into the DataFrame owned by the caller.
        df_plt = df_plt.assign(Sparsity=df_plt['sparsity_l1'])
        len_x2 = len(np.unique(df_plt['Sparsity']))
    else:
        len_x2 = 0

    if len_x2 > 1:
        gplt = plt9.ggplot(
            df_plt, plt9.aes(fill='Sparsity', x='resolution', y=y))
        gplt = gplt + plt9.geom_boxplot(alpha=0.8, outlier_alpha=0)
        gplt = gplt + plt9.geom_jitter(
            plt9.aes(color='Sparsity'), alpha=0.25, width=0.2)
    else:
        gplt = plt9.ggplot(df_plt, plt9.aes(x='resolution', y=y))
        gplt = gplt + plt9.geom_boxplot(alpha=0.8, outlier_alpha=0)
        gplt = gplt + plt9.geom_jitter(alpha=0.25, width=0.2)
    gplt = gplt + plt9.theme_bw(base_size=12)

    if facet_grid != '':
        gplt = gplt + plt9.facet_grid('{} ~ .'.format(facet_grid))

    # Known metric names keep a fixed spelling; anything else is prettified.
    if y == 'f1-score':
        y_label = 'F1 score'
    elif y in ['AUC', 'MCC']:
        y_label = y
    else:
        y_label = y.capitalize().replace('_', ' ')
    gplt = gplt + plt9.labs(x='Resolution', y=y_label, title='')

    gplt = gplt + plt9.theme(
        axis_text_x=plt9.element_text(angle=-45, hjust=0))

    # The Dark2 palette is applied only when there are fewer than 9 sparsity
    # levels.
    if len_x2 != 0 and len_x2 < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    if h_line != '':
        gplt = gplt + plt9.geom_hline(plt9.aes(yintercept=h_line),
                                      linetype='dashdot')

    # The width grows by one unit per distinct resolution / sparsity value
    # (same value as the former ``4 * ((len_x + len_x2) / 4)``).
    gplt.save('{}-resolution__{}.png'.format(out_file_base,
                                             y.replace('-', '_')),
              dpi=300,
              width=float(len_x + len_x2),
              height=5,
              limitsize=False)
コード例 #8
0
ファイル: plot.py プロジェクト: NPSDC/qb
def protobowl(fold=BUZZER_DEV_FOLD):
    """Save a bar chart of protobowl outcome counts for the three buzzers.

    For each of the RNN, MLP and Threshold buzzers the pickled table
    ``output/buzzer/<Name>Buzzer/<fold>_protobowl.pkl`` is loaded, its rows
    are counted per (``Possibility``, ``Outcome``) pair, and the counts are
    drawn as horizontal bars filled by ``Outcome`` with one facet row per
    ``Model``. The figure is written to
    ``output/buzzer/<fold>_protobowl.pdf``; nothing is returned.

    Args:
        fold: Name of the data fold used to build the input and output paths.
    """

    def load_counts(buzzer_dir, model_name):
        # Outcome counts of one buzzer, tagged with the model name.
        path = "output/buzzer/{}/{}_protobowl.pkl".format(buzzer_dir, fold)
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # presumably these are locally generated outputs — confirm they are
        # never taken from an untrusted source.
        with open(path, "rb") as f:  # closed even if unpickling fails
            table = pickle.load(f)
        counts = table.groupby(["Possibility", "Outcome"]).size()
        counts = counts.reset_index().rename(columns={0: "Count"})
        counts["Model"] = model_name
        return counts

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(
        [
            load_counts("RNNBuzzer", "RNN"),
            load_counts("MLPBuzzer", "MLP"),
            load_counts("ThresholdBuzzer", "Threshold"),
        ],
        ignore_index=True,
    )

    # Explicit category orders fix the fill order and the facet order.
    outcome_type = CategoricalDtype(categories=[15, 10, 5, 0, -5, -10, -15])
    df["Outcome"] = df["Outcome"].astype(outcome_type)
    model_type = CategoricalDtype(categories=["Threshold", "MLP", "RNN"])
    df["Model"] = df["Model"].astype(model_type)

    p = (
        ggplot(df)
        + geom_col(aes(x="Possibility", y="Count", fill="Outcome"), width=0.7)
        + facet_grid("Model ~")
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=0.17)
        + scale_fill_brewer(type="div", palette=7)
    )

    p.save("output/buzzer/{}_protobowl.pdf".format(fold))
コード例 #9
0
ファイル: plot.py プロジェクト: Pinafore/qb
def all_stack(fold=BUZZER_DEV_FOLD):
    """Plot buzzing frequency by position for every buzzer and save it.

    The tables produced by ``stack`` for the RNN, MLP and Threshold buzzers
    are joined into one frame, then rendered as stacked areas (``Frequency``
    over ``Position``, filled by ``Buzzing``) with a facet per ``Model``.
    Output goes to ``output/buzzer/<fold>_stack.pdf``; the function returns
    ``None``.

    Args:
        fold: Name of the data fold; given to ``stack`` and used in the
            output file name.
    """
    # Function-scope import keeps this block self-contained; pd.concat is
    # needed because DataFrame.append was removed in pandas 2.0.
    import pandas as pd

    df_rnn = stack('output/buzzer/RNNBuzzer', 'RNN', fold)
    df_mlp = stack('output/buzzer/MLPBuzzer', 'MLP', fold)
    df_thr = stack('output/buzzer/ThresholdBuzzer', 'Threshold', fold)
    df = pd.concat([df_rnn, df_mlp, df_thr], ignore_index=True)

    # Facets follow the category order given here.
    model_type = CategoricalDtype(
        categories=['Threshold', 'MLP', 'RNN'])
    df['Model'] = df['Model'].astype(model_type)

    p = (
        ggplot(df)
        + geom_area(aes(x='Position', y='Frequency', fill='Buzzing'))
        + facet_grid('~ Model')
        + theme_fs()
        + theme(
            aspect_ratio=1,
        )
        + scale_fill_brewer(type='div', palette=7)
    )
    p.save('output/buzzer/{}_stack.pdf'.format(fold))
コード例 #10
0
ファイル: plot.py プロジェクト: Pinafore/qb
def protobowl(fold=BUZZER_DEV_FOLD):
    """Plot protobowl outcome counts per buzzer model and save the figure.

    Reads ``output/buzzer/<Name>Buzzer/<fold>_protobowl.pkl`` for the RNN,
    MLP and Threshold buzzers, counts rows per (``Possibility``, ``Outcome``)
    pair, and draws the counts as horizontal bars coloured by ``Outcome``
    with one facet row per ``Model``. The result is saved to
    ``output/buzzer/<fold>_protobowl.pdf``; the function returns ``None``.

    Args:
        fold: Name of the data fold used in the input and output file names.
    """

    def _counts_for(buzzer_dir, label):
        # Load one buzzer's results and reduce them to outcome counts.
        pkl_path = 'output/buzzer/{}/{}_protobowl.pkl'.format(
            buzzer_dir, fold)
        # NOTE(review): unpickling runs code embedded in the file; presumably
        # acceptable because the file is produced locally — confirm.
        with open(pkl_path, 'rb') as handle:  # no leaked file descriptor
            results = pickle.load(handle)
        grouped = results.groupby(['Possibility', 'Outcome'])
        counts = grouped.size().reset_index().rename(columns={0: 'Count'})
        counts['Model'] = label
        return counts

    buzzers = [
        ('RNNBuzzer', 'RNN'),
        ('MLPBuzzer', 'MLP'),
        ('ThresholdBuzzer', 'Threshold'),
    ]
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat([_counts_for(d, name) for d, name in buzzers],
                   ignore_index=True)

    # Category order controls both the fill legend and the facet order.
    outcome_type = CategoricalDtype(categories=[15, 10, 5, 0, -5, -10, -15])
    df['Outcome'] = df['Outcome'].astype(outcome_type)
    model_type = CategoricalDtype(
        categories=['Threshold', 'MLP', 'RNN'])
    df['Model'] = df['Model'].astype(model_type)

    p = (
        ggplot(df)
        + geom_col(aes(x='Possibility', y='Count', fill='Outcome'),
                   width=0.7)
        + facet_grid('Model ~')
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=0.17)
        + scale_fill_brewer(type='div', palette=7)
    )

    p.save('output/buzzer/{}_protobowl.pdf'.format(fold))
コード例 #11
0
    def _make_p9_plot(self):
        '''
        Build a ggplot2-style stacked barplot of the cutting patterns.

        One bar is drawn per 'Pattern'; its segments have height 'Length',
        are filled by 'Length Cat' (legend title 'Cut Type') and are
        labelled with the text in the 'Annotate' column.

        Assumes self.plot_df has already been built (presumably by
        make_plot_df) -- TODO confirm.

        Returns
        -------
        g : plotnine ggplot.

        '''
        bar_aes = p9.aes(x='Pattern', y='Length', fill='Length Cat')
        label_aes = p9.aes(y='Length', label='Annotate')

        g = p9.ggplot(data=self.plot_df, mapping=bar_aes)
        g = g + p9.geom_bar(position='stack', stat='identity', color='black')
        g = g + p9.scale_fill_brewer(type='qual', palette=2, name='Cut Type')
        # Labels are stacked the same way as the bars.
        g = g + p9.geom_text(mapping=label_aes, position='stack')
        g = g + p9.ggtitle('Pattern Cuts')
        return g
コード例 #12
0
def protobowl(fold=BUZZER_DEV_FOLD):
    """Render the protobowl outcome comparison of the three buzzers to PDF.

    The pickled result table of each buzzer (RNN, MLP, Threshold), found at
    ``output/buzzer/<Name>Buzzer/<fold>_protobowl.pkl``, is reduced to the
    number of rows per (``Possibility``, ``Outcome``) pair. The combined
    counts are shown as horizontal bars filled by ``Outcome``, one facet row
    per ``Model``, and saved to ``output/buzzer/<fold>_protobowl.pdf``.
    Nothing is returned.

    Args:
        fold: Name of the data fold used in the input and output file names.
    """

    def outcome_counts(buzzer_dir, model_name):
        # Count (Possibility, Outcome) pairs for a single buzzer.
        pkl_path = 'output/buzzer/{}/{}_protobowl.pkl'.format(
            buzzer_dir, fold)
        # NOTE(review): pickle.load trusts the file content completely;
        # presumably fine for locally produced results — confirm.
        with open(pkl_path, 'rb') as pkl_file:  # handle is always released
            frame = pickle.load(pkl_file)
        counts = frame.groupby(['Possibility', 'Outcome']).size()
        counts = counts.reset_index().rename(columns={0: 'Count'})
        counts['Model'] = model_name
        return counts

    df_rnn = outcome_counts('RNNBuzzer', 'RNN')
    df_mlp = outcome_counts('MLPBuzzer', 'MLP')
    df_thr = outcome_counts('ThresholdBuzzer', 'Threshold')
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    df = pd.concat([df_rnn, df_mlp, df_thr], ignore_index=True)

    # Fix the display order of outcomes and models.
    outcome_type = CategoricalDtype(categories=[15, 10, 5, 0, -5, -10, -15])
    df['Outcome'] = df['Outcome'].astype(outcome_type)
    model_type = CategoricalDtype(categories=['Threshold', 'MLP', 'RNN'])
    df['Model'] = df['Model'].astype(model_type)

    p = (ggplot(df) +
         geom_col(aes(x='Possibility', y='Count', fill='Outcome'), width=0.7) +
         facet_grid('Model ~') + coord_flip() + theme_fs() +
         theme(aspect_ratio=0.17) + scale_fill_brewer(type='div', palette=7))

    p.save('output/buzzer/{}_protobowl.pdf'.format(fold))
コード例 #13
0
    "output/comparison_stats/corpora_kl_divergence.tsv", sep="\t")
kl_divergence_df.head()

# Plot KL divergence against the number of terms, one dashed line per corpus
# pair. The raw comparison keys are swapped for display names and the legend
# column is capitalised before plotting.
g = p9.ggplot(
    kl_divergence_df.replace(
        {
            "biorxiv_vs_pmc": "bioRxiv-PMC",
            "biorxiv_vs_nytac": "bioRxiv-NYTAC",
            "pmc_vs_nytac": "PMC-NYTAC",
        }
    ).rename(index=str, columns={"comparison": "Comparison"})
)
g = g + p9.aes(
    x="factor(num_terms)",
    y="KL_divergence",
    fill="Comparison",
    color="Comparison",
    group="Comparison",
)
g = g + p9.geom_point(size=2)
g = g + p9.geom_line(linetype="dashed")
# Fill and outline use the same reversed "Paired" palette.
g = g + p9.scale_fill_brewer(type="qual", palette="Paired", direction=-1)
g = g + p9.scale_color_brewer(type="qual", palette="Paired", direction=-1)
g = g + p9.labs(
    x="Number of terms evaluated",
    y="Kullback–Leibler Divergence",
)
g = g + p9.theme_seaborn(context="paper", style="ticks", font_scale=1.8)
g = g + p9.theme(
    figure_size=(11, 8.5),
    text=p9.element_text(family="Arial"),
)

# Export as SVG and as a 500 dpi PNG, then display the figure.
g.save("output/svg_files/corpora_kl_divergence.svg")
g.save("output/figures/corpora_kl_divergence.png", dpi=500)
print(g)
コード例 #14
0
ファイル: utils.py プロジェクト: cookesd/mcda
def plot_alt_benefit(plot_df,
                     title='Benefit by Alternative',
                     which='both',
                     sensitivity=False,
                     legend=True):
    '''Builds a stacked bar chart of the alternative benefits
    @ param plot_df: The df containing benefits for each alt by the criteria
    and total benefit. The columns 'Alternative', 'Benefit', 'Criterion',
    'type' and 'print_value' are read ('Criterion Weight' too when
    sensitivity is set)
    @ param title: The title for the graph
    @ param which: which parts to plot. Acceptable values are
    'total' for just total value.
    'criteria' for just criteria level stacked bars
    'both' for total and criteria. The graphs will be faceted in this case
    Any other value prints a message and makes the function return None
    @ param sensitivity: if True, the graph is additionally faceted by the
    'Criterion Weight' column
    @ param legend: if False, the fill legend is switched off

    Returns the ggplot graph to be displayed elsewhere'''

    if which not in ('both', 'total', 'criteria'):
        print(
            which,
            'is not an approved value for which.\n Enter "total", "criteria", or "both"'
        )
        return None

    # 'both' keeps every row; the other two keep a single value type.
    if which == 'total':
        plot_df = plot_df.loc[plot_df['type'] == 'Total Value']
    elif which == 'criteria':
        plot_df = plot_df.loc[plot_df['type'] == 'Weighted Criterion Value']
    facet_by_type = which == 'both'

    # Stacked bars coloured with a qualitative palette.
    g = p9.ggplot(plot_df,
                  p9.aes(x='Alternative', y='Benefit', fill='Criterion'))
    if legend:
        g = g + p9.geom_col(stat='identity',
                            position=p9.position_stack(vjust=.5))
        g = g + p9.scale_fill_brewer(type='qual', palette='Paired')
    else:
        # NOTE(review): show_legend is passed inside aes() here; it looks
        # like it was meant as a geom argument -- confirm against plotnine.
        g = g + p9.geom_col(p9.aes(show_legend=False),
                            stat='identity',
                            position=p9.position_stack(vjust=.5))
        g = g + p9.scale_fill_brewer(type='qual',
                                     palette='Paired',
                                     guide=False)
        g = g + p9.theme(legend_position=None)

    # Weighted values printed inside the bars, title, rotated x labels.
    g = g + p9.geom_text(p9.aes(label='print_value'),
                         position=p9.position_stack(vjust=.5),
                         size=6,
                         hjust='center')
    g = g + p9.ggtitle(title)
    g = g + p9.theme(axis_text_x=p9.element_text(rotation=45, hjust=1))

    # Pick the facet layout, if any.
    if sensitivity:
        if facet_by_type:
            facet_spec = 'type~Criterion Weight'
        else:
            facet_spec = 'Criterion Weight~'
    elif facet_by_type:
        facet_spec = '~type'
    else:
        return g
    return g + p9.facet_grid(facet_spec)
コード例 #15
0
def generate_map(data,
                 region,
                 value_field,
                 iso_field='iso',
                 scale_params=None,
                 plot_na_dots=False,
                 tolerance=None,
                 plot_size=8,
                 out_region_color='#f0f0f0',
                 na_color='#aaaaaa',
                 line_color='#666666',
                 projection=None):
    """
    Build a map plot of *value_field* for the chosen region.

    Country polygons are read from the bundled ``data/world-countries.shp``
    file and left-joined with *data* on the ISO code. Countries in the region
    are filled by value, countries without a value use *na_color* and
    countries outside the region use *out_region_color*. Countries whose
    ``pol_area`` is below ``DOT_THRESHOLD`` times the region's area are
    additionally drawn as dots.

    :param pandas.DataFrame data: Data to be plotted.
    :param str region: Region to center the map around. Countries outside
        the chosen region will be obscured.
    :param str value_field: Column of *data* with the values to be plotted.
        An ``object`` dtype column is treated as discrete, anything else as
        continuous.
    :param str iso_field: Column of *data* with the ISO3 codes for each
        country.
    :param dict scale_params: Dictionary of parameters to be passed to the
        ggplot corresponding color scale (continuous or discrete).
    :param bool plot_na_dots: Whether to plot the dots for small countries
        if said country doesn't have data available.
    :param int tolerance: Coordinate tolerance for polygon simplification,
        a higher number will result in simpler polygons and faster
        rendering (see DEFAULT_TOLERANCES).
    :param int plot_size: Size of the plot, which determines the relative sizes
        of the elements within.
    :param str out_region_color: Hex color of the countries that are out of the
        specified region.
    :param str na_color: Hex color of the countries with no data available.
    :param str line_color: Color of the country borders.
    :param str projection: Kind of map projection to be used in the map.
        Defaults to ``'epsg4326'`` for Oceania (XOX) and ``'robinson'``
        otherwise. Currently, Oceania (XOX) is only available in EPSG:4326
        to enable wrapping.
    :raises ValueError: if *projection* is not a key of ``PROJECTION_DICT``
        or *region* is not available for the chosen projection.
    :returns: a dictionary with two entries: ``'plot'``, the ggplot-like
        plot with the map, and ``'ratio'``, the width/height ratio of the
        region bounds.
    :rtype: dict
    """
    # Pick the default projection for the region when none is requested.
    if projection is None:
        if region == 'XOX':
            projection = 'epsg4326'
        else:
            projection = 'robinson'

    if projection not in PROJECTION_DICT.keys():
        raise ValueError('Projection "{}" not valid'.format(projection))

    if scale_params is None:
        scale_params = {}

    if region not in REGION_BOUNDS[projection]:
        raise ValueError(
            '"region" not available. Valid regions are: {}'.format(', '.join(
                REGION_BOUNDS[projection].keys())))

    if tolerance is None:
        tolerance = DEFAULT_TOLERANCES[projection][region]

    # Country polygons shipped next to this module.
    countries = GeoDataFrame.from_file(
        os.path.join(os.path.dirname(__file__), 'data/world-countries.shp'))

    # To plot Oceania we need the original EPSG:4326 to wrap around the 180º
    # longitude. In other cases transform to the desired projection.
    if region == 'XOX':
        # NOTE(review): this mutates the CRS in place and assumes
        # ``countries.crs`` is dict-like -- confirm with the geopandas
        # version in use.
        countries.crs['lon_wrap'] = '180'  # Wrap around longitude 180º

        # Only the Oceania rows are re-projected and get new centroids.
        XOX_countries = countries['continent'] == 'XOX'
        countries[XOX_countries] = countries[XOX_countries].to_crs(
            countries.crs)
        centroids = countries[XOX_countries].apply(
            lambda row: row['geometry'].centroid, axis=1)
        countries.loc[XOX_countries, 'lon'] = [c.x for c in centroids]
        countries.loc[XOX_countries, 'lat'] = [c.y for c in centroids]
    else:
        # For EPSG:4326 nothing is recomputed here; 'lon'/'lat' presumably
        # already exist in the shapefile -- TODO confirm.
        if projection != 'epsg4326':
            countries = countries.to_crs(PROJECTION_DICT[projection])
            centroids = countries.apply(lambda row: row['geometry'].centroid,
                                        axis=1)
            countries['lon'] = [c.x for c in centroids]
            countries['lat'] = [c.y for c in centroids]

    countries['geometry'] = countries['geometry'].simplify(tolerance)

    # Axis limits and aspect ratio come from the region's bounding box in
    # the chosen projection.
    upper_left, lower_right = REGION_BOUNDS[projection][region]
    limits_x = [upper_left[0], lower_right[0]]
    limits_y = [lower_right[1], upper_left[1]]
    ratio = (limits_x[1] - limits_x[0]) / (limits_y[1] - limits_y[0])

    # Left join keeps every country, with missing values where *data* has
    # no row for it.
    plot_data = pd.merge(countries,
                         data,
                         how='left',
                         left_on='iso',
                         right_on=iso_field)
    # The dot threshold is always computed from the EPSG:4326 bounds,
    # whatever projection is used for drawing.
    map_bounds = REGION_BOUNDS['epsg4326'][region]
    map_area = ((map_bounds[1][0] - map_bounds[0][0]) *
                (map_bounds[0][1] - map_bounds[1][1]))
    plot_data['plot_dot'] = (plot_data['pol_area'] < DOT_THRESHOLD * map_area)

    if not plot_na_dots:
        plot_data['plot_dot'] &= ~pd.isnull(plot_data[value_field])

    # Split the rows into: in region with a value, in region without a
    # value, and outside the region. For 'XWX' no continent filter is
    # applied (presumably the whole world -- TODO confirm).
    if region != 'XWX':
        in_region = ((~pd.isnull(plot_data[value_field])) &
                     (plot_data['continent'] == region))
        in_region_missing = ((pd.isnull(plot_data[value_field])) &
                             (plot_data['continent'] == region))
        out_region = plot_data['continent'] != region
    else:
        in_region = ~pd.isnull(plot_data[value_field])
        in_region_missing = pd.isnull(plot_data[value_field])
        out_region = np.repeat(False, len(plot_data))

    if plot_data[value_field].dtype == 'object':
        # Assume discrete values
        fill_scale = scale_fill_brewer(**scale_params, drop=False)
    else:
        # Assume continuous values
        fill_scale = scale_fill_gradient(**scale_params)

    plot_data_values = plot_data[in_region]
    plot_data_missing = plot_data[in_region_missing]
    plot_data_out_region = plot_data[out_region]

    # Rows of each group that are also drawn as dots.
    dots_region = plot_data_values[plot_data_values['plot_dot']]
    dots_region_missing = plot_data_missing[plot_data_missing['plot_dot']]
    dots_out_region = plot_data_out_region[plot_data_out_region['plot_dot']]

    # Layer order: polygons (values, missing, out of region), then the dots
    # for the same three groups, then axes and theme.
    plt = (
        ggplot() + geom_map(plot_data_values,
                            aes(fill=value_field),
                            color=line_color,
                            size=0.3) +
        geom_map(
            plot_data_missing, aes(color='plot_dot'), fill=na_color,
            size=0.3) + geom_map(plot_data_out_region,
                                 fill=out_region_color,
                                 color=line_color,
                                 size=0.3) +
        geom_point(dots_region,
                   aes(x='lon', y='lat', fill=value_field),
                   size=3,
                   stroke=.1,
                   color=line_color) + geom_point(dots_region_missing,
                                                  aes(x='lon', y='lat'),
                                                  fill=na_color,
                                                  size=3,
                                                  stroke=.1,
                                                  color=line_color) +
        geom_point(dots_out_region,
                   aes(x='lon', y='lat'),
                   fill=out_region_color,
                   size=3,
                   stroke=.1,
                   color=line_color) +
        scale_x_continuous(breaks=[], limits=limits_x) +
        scale_y_continuous(breaks=[], limits=limits_y) + theme(
            figure_size=(plot_size * ratio, plot_size),
            panel_background=element_rect(fill='white', color='black'),
            #  panel_border=element_rect(fill='white',
            #                            color='black',
            #                            size=.1),
            legend_background=element_rect(
                fill="white", color='black', size=.5),
            legend_box_just='left') + xlab('') + ylab(''))

    # The fill scale is only added when at least one country has a value.
    if len(plot_data_values.index) > 0:
        plt += fill_scale

    # The color mapped from 'plot_dot' on the missing-data polygons is used
    # to get a 'No data available' legend entry.
    plt += scale_color_manual(name=' ',
                              values=[line_color],
                              breaks=[False],
                              labels=['No data available'])

    if plot_data[value_field].dtype == 'object':
        plt += guides(fill=guide_legend(override_aes={'shape': None}))

    return {
        'plot': plt,
        'ratio': ratio,
    }
コード例 #16
0
    def show_prediction(
        self,
        samples,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        show_community: bool = False,
        num_samples: int = 1000,
        bins: int = 50,
    ):
        """Plot prediction on the true question scale from samples or a submission object. Optionally compare prediction against a sample from the distribution of community predictions

        :param samples: samples from a distribution answering the prediction question (true scale) or a prediction object
        :param percent_kept: percentage of sample distrubtion to keep
        :param side_cut_from: which side to cut tails from, either 'both','lower', or 'upper'
        :param show_community: boolean indicating whether comparison to community predictions should be made
        :param num_samples: number of samples from the community; when raw samples are passed it is replaced by the number of samples given
        :param bins: The number of bins in the histogram, the more bins, the more 'fine grained' the graph. Fewer bins results in more aggregation
        :raises ValueError: if samples is neither a submission object, a list, a numpy array nor a pandas series
        :return: ggplot graphics object
        """

        if isinstance(samples, SubmissionMixtureParams):
            # A submission object is sampled on the normalized scale.
            prediction = samples
            prediction_normed_samples = pd.Series([
                logistic.sample_mixture(prediction)
                for _ in range(0, num_samples)
            ])
        else:
            if isinstance(samples, list):
                samples = pd.Series(samples)
            # isinstance (rather than an exact type match) also accepts
            # subclasses of Series / ndarray.
            if not isinstance(samples, (pd.Series, np.ndarray)):
                raise ValueError(
                    "Samples should be a list, numpy arrray or pandas series")
            # Draw as many community samples as there are prediction samples.
            num_samples = samples.shape[0]
            prediction_normed_samples = self.normalize_samples(samples)

        title_name = (
            f"Q: {self.name}" if self.name else "\n".join(
                textwrap.wrap(self.data["title"], 60))  # type: ignore
        )

        if show_community:
            df = pd.DataFrame(
                data={
                    "community": [  # type: ignore
                        self.sample_normalized_community()
                        for _ in range(0, num_samples)
                    ],
                    "prediction":
                    prediction_normed_samples,  # type: ignore
                })
            # get domain for graph given the percentage of distribution kept
            (_xmin,
             _xmax) = self.get_central_quantiles(df,
                                                 percent_kept=percent_kept,
                                                 side_cut_from=side_cut_from)
            # Quantiles are taken on the normalized scale; everything is
            # converted back to the true scale for plotting.
            _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
            df["prediction"] = self.denormalize_samples(df["prediction"])
            df["community"] = self.denormalize_samples(df["community"])

            df = pd.melt(df, var_name="sources",
                         value_name="samples")  # type: ignore
            return (ggplot(df, aes("samples", fill="sources")) +
                    scale_fill_brewer(type="qual", palette="Pastel1") +
                    # bins is honoured here too, as in the single-histogram
                    # branch below.
                    geom_histogram(
                        position="identity", alpha=0.9, bins=bins) +
                    scale_x_datetime(limits=(_xmin, _xmax)) +
                    facet_wrap("sources", ncol=1) + labs(
                        x="Prediction",
                        y="Counts",
                        title=title_name,
                    ) + guides(fill=False) + ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
        else:
            (_xmin, _xmax) = self.get_central_quantiles(
                prediction_normed_samples,
                percent_kept=percent_kept,
                side_cut_from=side_cut_from,
            )
            _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
            df = pd.DataFrame(data={
                "prediction":
                self.denormalize_samples(prediction_normed_samples)
            })
            return (ggplot(df, aes("prediction")) +
                    geom_histogram(fill="#b3cde3", bins=bins) +
                    scale_x_datetime(limits=(_xmin, _xmax)) +
                    labs(x="Prediction", y="Counts", title=title_name) +
                    ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
コード例 #17
0
    def show_prediction(
        self,
        samples,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        show_community: bool = False,
        num_samples: int = 1000,
    ):
        """Plot prediction on the true question scale from samples or a submission object. Optionally compare prediction against a sample from the distribution of community predictions

        :param samples: samples from a distribution answering the prediction question (true scale) or a prediction object
        :param percent_kept: percentage of sample distrubtion to keep
        :param side_cut_from: which side to cut tails from, either 'both','lower', or 'upper'
        :param show_community: boolean indicating whether comparison to community predictions should be made
        :param num_samples: number of samples from the community; when raw samples are passed it is replaced by the number of samples given
        :raises ValueError: if samples is neither a submission object, a list, a numpy array nor a pandas series
        :return: ggplot graphics object
        """

        if isinstance(samples, SubmissionMixtureParams):
            # A submission object is sampled on the normalized scale and
            # then converted to the true scale.
            prediction = samples
            prediction_normed_samples = pd.Series([
                logistic.sample_mixture(prediction)
                for _ in range(0, num_samples)
            ])
            prediction_true_scale_samples = self.denormalize_samples(
                prediction_normed_samples)
        else:
            if isinstance(samples, list):
                samples = pd.Series(samples)
            # isinstance (rather than an exact type match) also accepts
            # subclasses of Series / ndarray.
            if not isinstance(samples, (pd.Series, np.ndarray)):
                raise ValueError(
                    "Samples should be a list, numpy arrray or pandas series")
            # Draw as many community samples as there are prediction samples.
            num_samples = samples.shape[0]
            prediction_true_scale_samples = samples

        title_name = (
            f"Q: {self.name}" if self.name else "\n".join(
                textwrap.wrap(self.data["title"], 60))  # type: ignore
        )

        if show_community:
            df = pd.DataFrame(
                data={
                    "community": [  # type: ignore
                        self.sample_community() for _ in range(0, num_samples)
                    ],
                    "prediction":
                    prediction_true_scale_samples,
                })
            # get domain for graph given the percentage of distribution kept
            (_xmin,
             _xmax) = self.get_central_quantiles(df,
                                                 percent_kept=percent_kept,
                                                 side_cut_from=side_cut_from)
            df = pd.melt(df, var_name="sources",
                         value_name="samples")  # type: ignore
            return (ggplot(df, aes("samples", fill="sources")) +
                    scale_fill_brewer(type="qual", palette="Pastel1") +
                    geom_density(alpha=0.8) + xlim(_xmin, _xmax) +
                    self._scale_x() +
                    labs(x="Prediction", y="Density", title=title_name) +
                    ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
        else:
            df = pd.DataFrame(
                data={"prediction": prediction_true_scale_samples})
            # get domain for graph given the percentage of distribution kept
            (_xmin,
             _xmax) = self.get_central_quantiles(df,
                                                 percent_kept=percent_kept,
                                                 side_cut_from=side_cut_from)

            # NOTE(review): the fill scale and the second geom_density look
            # like leftovers from the community branch (no fill is mapped
            # here); kept as-is so the rendered plot does not change.
            return (ggplot(df, aes("prediction")) +
                    geom_density(fill="#b3cde3", alpha=0.8) +
                    scale_fill_brewer(type="qual", palette="Pastel1") +
                    geom_density(alpha=0.8) + xlim(_xmin, _xmax) +
                    self._scale_x() +
                    labs(x="Prediction", y="Density", title=title_name) +
                    ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
コード例 #18
0
ファイル: plot_utils.py プロジェクト: laashub-soa/community-4
def make_likert_chart_multi_year(
    survey_data,
    topic,
    labels,
    facet_by=[],
    five_is_high=False,
    exclude_new_contributors=False,
):
    """Make an offset stacked barchart of the ratings for a topic across years.

    Shows the proportion of respondents at each rank or value for all
    columns in the topic. Each column in the topic is a facet, with the
    years displayed along the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
             to the numerical values given by the respondents.
        facet_by (list, optional): List of columns use for grouping. May
            contain the ``"."`` placeholder, which is never used as a column
            name. The list passed in is copied and never modified.
        five_is_high (bool, optional): Defaults to False. If True,
            five is considered the highest value in a ranking, otherwise
            it is taken as the lowest value.
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            do not include any responses from contributors with less than
            one year of experience

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """

    # Work on a private copy: the list is edited below, and neither the
    # caller's list nor the shared default argument may be touched.
    facet_by = copy(facet_by)
    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    show_legend = True

    topic_data_long = get_multi_year_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors
    )

    # When five is the lowest rank the ratings are negated, so the colour
    # scale limits further down run from -5 to -1.
    if not five_is_high:
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)

    mid_point = 3 if five_is_high else -3
    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    # Remember whether any facets were requested before dropping the "."
    # placeholder, which is not a real column and cannot be grouped on.
    had_facets = bool(facet_by)
    if "." in facet_by:
        facet_by.remove(".")
    group_cols = facet_by + ["year"]

    # Row counts per group; computed once and shared by both halves.
    totals = topic_data_long.groupby(group_cols).count().reset_index()

    def _to_proportion(scores, sign):
        """Scale the level_1 values of ``scores`` by the per-group totals."""
        merged = scores.merge(totals, on=group_cols).rename(
            columns={"rating_x": "rating", "level_0_x": "level_0"}
        )
        # The totals cover every topic column, hence the division by the
        # number of topic columns.
        return merged.assign(
            level_1=merged.level_1_x * sign / (merged.level_1_y / len(og_cols))
        )

    # The bottom half is negated so that its bars extend below the zero line.
    top_scores = _to_proportion(top_scores, 1)
    bottom_scores = _to_proportion(bottom_scores, -1)

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="factor(year)", fill="factor(rating)", color="factor(rating)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
    )

    if five_is_high:
        limits = [1, 2, 3, 4, 5]
        text_theme = p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
    else:
        limits = [-5, -4, -3, -2, -1]
        text_theme = p9.theme(strip_text_y=p9.element_text(angle=0, ha="left"))

    vp = (
        vp
        + p9.scale_color_brewer("div", "RdBu", limits=limits, labels=labels)
        + p9.scale_fill_brewer("div", "RdBu", limits=limits, labels=labels)
        + text_theme
    )

    # Facet on the requested columns plus the individual questions
    # ("level_0"). Without any requested facets the "." placeholder takes
    # their place. The previous unconditional facet_by.remove(".") raised
    # ValueError whenever facets were given without the placeholder.
    grid_facets = (facet_by if had_facets else ["."]) + ["level_0"]

    vp = (
        vp
        + p9.facet_grid(
            grid_facets,
            labeller=lambda x: "\n".join(
                wrap(
                    x.replace(topic, "").replace("_", " ").replace("/", "/ ").strip(),
                    15,
                )
            ),
        )
        + p9.theme(
            strip_text_x=p9.element_text(wrap=True, ma="left"), panel_spacing_x=0.1
        )
    )

    return vp
コード例 #19
0
ファイル: plot_utils.py プロジェクト: laashub-soa/community-4
def make_likert_chart(
    survey_data,
    topic,
    labels,
    facet_by=[],
    max_value=5,
    max_is_high=False,
    wrap_facets=True,
    sort_x=False,
):
    """Make an offset stacked barchart of the ratings for a topic.

    Shows the number of respondents at each rank or value for all columns
    in the topic. Each column in the original data is a tick on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
             to the numerical values given by the respondents.
        facet_by (list, optional): List of columns use for grouping. May
            contain the ``"."`` placeholder. The list passed in is copied
            and never modified.
        max_value (int, optional): Defaults to 5. The maximum value a
            respondent can assign.
        max_is_high (bool, optional): Defaults to False. If True,
            the max_value is considered the highest value in a ranking,
            otherwise it is taken as the lowest value.
        wrap_facets (bool, optional): Defaults to True. If True, the facet
            labels are wrapped
        sort_x (bool, optional): Defaults to False. If True, the x-axis is
            sorted by the mean rating of each column in the original data

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """

    mid_point = math.ceil(max_value / 2)

    # Private copy: the list is reordered below, which used to leak into the
    # caller's list.
    facet_by = list(facet_by)

    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    show_legend = True

    topic_data_long = get_single_year_data_subset(survey_data, topic, facet_by)

    # When the max value is the lowest rank, ratings and mid point are
    # negated; the colour scale limits further down are negative to match.
    if not max_is_high:
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)

        mid_point = -1 * mid_point

    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    if facet_by:
        # "." is a facet placeholder, not a column: take it out while
        # grouping and put it back afterwards (it ends up last in the list).
        had_placeholder = "." in facet_by
        if had_placeholder:
            facet_by.remove(".")

        # Row counts per facet group; computed once and shared by both halves.
        totals = topic_data_long.groupby(facet_by).count().reset_index()
        renames = {"rating_x": "rating", "level_0_x": "level_0"}

        top_scores = top_scores.merge(totals, on=facet_by).rename(columns=renames)
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(totals, on=facet_by).rename(
            columns=renames
        )
        # Negated so that the bottom half is drawn below the zero line.
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )

        if had_placeholder:
            facet_by.append(".")

    else:
        bottom_scores = bottom_scores.assign(level_1=bottom_scores.level_1 * -1)

    if sort_x:
        # Only the rating column is averaged: taking the mean of every column
        # raises TypeError on pandas >= 2.0 when string columns are present.
        x_sort_order = (
            topic_data_long.groupby("level_0")[["rating"]]
            .mean()
            .sort_values("rating")
            .reset_index()["level_0"]
            .values.tolist()
        )
        x_sort_order.reverse()
    else:
        x_sort_order = topic_data_long["level_0"].unique().tolist()

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="level_0", fill="factor(rating)", color="factor(rating)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=x_sort_order,
            # Drop the topic prefix and keep at most two wrapped lines.
            labels=[
                "\n".join(
                    textwrap.wrap(x.replace(topic, "").replace("_", " "), width=35)[0:2]
                )
                for x in x_sort_order
            ],
        )
    )

    if max_is_high:
        limits = list(range(1, max_value + 1))
        vp = (
            vp
            + p9.scale_color_brewer("div", "RdBu", limits=limits, labels=labels)
            + p9.scale_fill_brewer("div", "RdBu", limits=limits, labels=labels)
        )

    else:
        limits = list(reversed(range(-max_value, 0)))
        vp = (
            vp
            + reverse_scale_fill_brewer("div", "RdBu", limits=limits, labels=labels)
            + reverse_scale_color_brewer("div", "RdBu", limits=limits, labels=labels)
        )

    if facet_by:
        if wrap_facets:
            vp = (
                vp
                + p9.facet_grid(facet_by, labeller=lambda x: "\n".join(wrap(x, 15)))
                + p9.theme(
                    strip_text_x=p9.element_text(
                        wrap=True, va="bottom", margin={"b": -0.5}
                    )
                )
            )
        else:
            vp = vp + p9.facet_grid(facet_by, space="free", labeller=lambda x: x)
    return vp
コード例 #20
0
ファイル: plot_utils.py プロジェクト: laashub-soa/community-4
def make_single_likert_chart(survey_data, column, facet, labels, five_is_high=False):
    """Make an offset stacked barchart for a single column of the survey.

    Shows the proportion of respondents at each rank or value of ``column``.
    Each value of ``facet`` is shown as a tick on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Name of the column holding the ratings to plot
        facet (str): Column used for grouping
        labels (list): List of strings to use as labels, corresponding
             to the numerical values given by the respondents.
        five_is_high (bool, optional): Defaults to False. If True,
            5 is considered the highest value in a ranking, otherwise
            it is taken as the lowest value.

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    show_legend = True
    long_data = make_long(survey_data[[column, facet]], facet)

    # When five is the lowest rank, ratings and mid point are negated.
    if five_is_high:
        mid_point = 3
    else:
        long_data = long_data.assign(rating=long_data.rating * -1.0)
        mid_point = -3

    # Group on every column except the two "level_*" columns.
    group_cols = long_data.columns.tolist()
    group_cols.remove("level_1")
    group_cols.remove("level_0")

    # Row counts per facet value, used to turn counts into proportions.
    facet_totals = long_data.groupby(facet).count().reset_index()

    # Upper half: ratings at or above the mid point.
    top_scores = long_data[long_data["rating"] >= mid_point]
    top_scores = top_scores.groupby(group_cols).count().reset_index()
    top_scores = top_scores.sort_index(ascending=False)
    # The mid point belongs to both halves, so each half gets half of it.
    top_is_mid = top_scores["rating"] == mid_point
    top_scores.loc[top_is_mid, "level_1"] = top_scores.loc[top_is_mid, "level_1"] / 2.0
    top_scores = top_scores.merge(facet_totals, on=facet)
    top_scores = top_scores.assign(level_1=top_scores.level_1_x / top_scores.level_1_y)

    # Lower half: ratings at or below the mid point, negated so that the
    # bars are drawn below the zero line.
    bottom_scores = long_data[long_data["rating"] <= mid_point]
    bottom_scores = bottom_scores.groupby(group_cols).count().reset_index()
    bottom_is_mid = bottom_scores["rating"] == mid_point
    bottom_scores.loc[bottom_is_mid, "level_1"] = (
        bottom_scores.loc[bottom_is_mid, "level_1"] / 2.0
    )
    bottom_scores = bottom_scores.merge(facet_totals, on=facet)
    bottom_scores = bottom_scores.assign(
        level_1=bottom_scores.level_1_x * -1 / bottom_scores.level_1_y
    )

    facet_values = long_data[facet].unique().tolist()
    wrapped_labels = ["\n".join(wrap(label, 15)) for label in labels]

    # The merges above suffix the rating column, hence "rating_x" here.
    base = p9.ggplot(
        long_data,
        p9.aes(x=facet, fill="factor(rating_x)", color="factor(rating_x)"),
    )
    upper_bars = p9.geom_col(
        data=top_scores,
        mapping=p9.aes(y="level_1"),
        show_legend=show_legend,
        size=0.25,
        position=p9.position_stack(reverse=True),
    )
    lower_bars = p9.geom_col(
        data=bottom_scores,
        mapping=p9.aes(y="level_1"),
        show_legend=show_legend,
        size=0.25,
    )
    vp = (
        base
        + upper_bars
        + lower_bars
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=facet_values,
            labels=[value.replace("_", " ") for value in facet_values],
        )
    )

    if five_is_high:
        return (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=wrapped_labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=wrapped_labels
            )
        )

    return (
        vp
        + reverse_scale_fill_brewer(
            "div", "RdBu", limits=[-1, -2, -3, -4, -5], labels=wrapped_labels
        )
        + reverse_scale_color_brewer(
            "div", "RdBu", limits=[-1, -2, -3, -4, -5], labels=wrapped_labels
        )
    )
コード例 #21
0
def main():
    """Run CLI.

    Reads the per-experiment filter statistics from ``--tsv_file`` and saves
    four bar plots (PNG) comparing cell counts before and after filtering.
    Returns early, without plotting, when no filter removed any cell.
    """
    parser = argparse.ArgumentParser(description="""
            Filter and merge 10x data. Save to AnnData object.
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument('--tsv_file',
                        action='store',
                        dest='tsv',
                        required=True,
                        help='cell_filtered_per_experiment tsv file.')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output png file. Will have .png appended.\
            (default: %(default)s)')

    options = parser.parse_args()

    # Get basename of the output file
    out_file_base = options.of
    if out_file_base == '':
        # Default to the input file name without its .tsv / .tsv.gz suffix.
        # str.rstrip cannot be used for this: it strips a *set of characters*
        # and would also eat trailing letters of the name itself.
        out_file_base = os.path.basename(options.tsv)
        for suffix in ('.tsv.gz', '.tsv'):
            if out_file_base.endswith(suffix):
                out_file_base = out_file_base[:-len(suffix)]
                break

    # Load the data
    df = pd.read_csv(options.tsv, sep='\t')

    # Get the total number of input cells per sample
    df_before_filters = df[df.filter_type.isin(['before_filters'])]
    df_before_filters = df_before_filters.set_index('experiment_id')

    # Check if any difference between before and after filters. If not,
    # return early.
    df_after_filters = df[df.filter_type.isin(['after_filters'])]
    filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[
        df_after_filters.experiment_id, 'n_cells_left_in_adata'].values
    if all(filt):
        print("No difference detected before and after filters. No plots.")
        return ()

    # Set some plotting parameters
    plt_height = 16  # 1.5 * df.experiment_id.nunique()

    # Plot the number of cells before and after all filters across experiments
    df_plt = df[df.filter_type.isin(['before_filters', 'after_filters'])]
    gplt = plt9.ggplot(
        df_plt,
        plt9.aes(
            x='experiment_id',
            y='n_cells_left_in_adata',
            # label='n_cells',
            fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    # gplt = gplt + plt9.geom_text(vjust=1.6, color='white', size=3.5)
    gplt = gplt + plt9.scale_y_continuous(
        trans='log10', labels=comma_labels, minor_breaks=0)
    gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(title='', y='Number of cells', x='', fill='')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='horizontal',
        legend_title=plt9.element_blank())
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-n_cells_before_after.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)

    # Plot the final fraction of cells filtered per experiment
    df_plt = df_after_filters.copy()
    # Invert the numbers, so instead of the number of cells that pass, get
    # the number of cells that fail at each filter.
    df_plt.n_cells_left_in_adata = df_before_filters.loc[
        df_plt.experiment_id,
        'n_cells_left_in_adata'].values - df_plt.n_cells_left_in_adata
    # Now calculate the fraction removed
    df_plt['fraction_cells'] = df_plt.n_cells_left_in_adata / \
        df_before_filters.loc[
            df_plt.experiment_id,
            'n_cells_left_in_adata'
        ].values
    gplt = plt9.ggplot(
        df_plt,
        plt9.aes(x='experiment_id', y='fraction_cells', fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    # The Dark2 palette is only applied when there are fewer than 9 filters.
    if df_plt.filter_type.nunique() < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(
        title='', y='Fraction of total cells excluded', x='', fill='Filter')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='vertical')
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-fraction_before_after.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)

    # Plot the number of cells falling into each filter across experiments.
    # NOTE: cells can fall into multiple filters.
    # Remove the rows that we do not want
    df_plt = df[~df.filter_type.isin(['before_filters', 'after_filters'])]
    # Explicit copy: columns of this subset are assigned to below.
    df_plt = df_plt[~df_plt.filter_type.str.contains('after_filter')].copy()
    # Invert the numbers, so instead of the number of cells that pass, get
    # the number of cells that fail at each filter.
    df_plt.n_cells_left_in_adata = df_before_filters.loc[
        df_plt.experiment_id,
        'n_cells_left_in_adata'].values - df_plt.n_cells_left_in_adata
    gplt = plt9.ggplot(
        df_plt,
        plt9.aes(x='experiment_id',
                 y='n_cells_left_in_adata',
                 fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    if df_plt.filter_type.nunique() < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(
        title='', y='Number of cells excluded', x='', fill='Filter')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='vertical')
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-n_cells_excluded.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)

    # Plot the ratio of the total number of cells removed in each filter across
    # experiments.
    # NOTE: cells can fall into multiple filters.
    df_plt['fraction_cells'] = df_plt.n_cells_left_in_adata / \
        df_before_filters.loc[
            df_plt.experiment_id,
            'n_cells_left_in_adata'
        ].values
    gplt = plt9.ggplot(
        df_plt,
        plt9.aes(x='experiment_id', y='fraction_cells', fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    if df_plt.filter_type.nunique() < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(
        title='', y='Fraction of total cells excluded', x='', fill='Filter')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='vertical')
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-fraction_cells_excluded.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)
コード例 #22
0
    "scala": "Scala",
    "C": "C",
    "sas": "SAS"
}

# Keep only the rows whose attribute is one of the languages, replace the raw
# values with their display names and sort on the attribute column.
skills_summary_lang = sort_df(
    skills_summary_df[skills_summary_df.attribute.isin(languages)].replace(
        to_replace=lang_clean),
    var_col="attribute",
)

# Horizontal bar chart of value per language, one panel per type.
# NOTE(review): show_legend=False is passed to aes() as if it were an
# aesthetic; presumably it was meant for geom_col() -- confirm.
lang_plot = p9.ggplot(
    skills_summary_lang,
    p9.aes('attribute', 'value', fill='type', show_legend=False),
)
lang_plot = lang_plot + p9.geom_col() + p9.coord_flip()
lang_plot = lang_plot + p9.scale_y_continuous(expand=[0, 0])
lang_plot = lang_plot + p9.labs(y="Frequency", x="Language", fill="")
lang_plot = lang_plot + p9.scale_fill_brewer(palette="Blues")
lang_plot = lang_plot + p9.facet_wrap('~type')

lang_plot.save(
    filename='figs/lang_plot.png',
    height=5,
    width=5,
    units='in',
    dpi=1000,
)
# Bare expression so the plot is displayed when run interactively.
lang_plot

#Software
programs = ["tableau", "docker", "bigquery", "jira", "spark", "hadoop"]

prog_clean = {
    "tableau": "Tableau",
    "docker": "Docker",
    "bigquery": "Google BigQuery",
    "jira": "Jira",
コード例 #23
0
ファイル: 0047-lisi.py プロジェクト: ckrilow/dev-ckrilow
def main():
    """Run CLI.

    Computes LISI for every reduced-dimension file over the requested
    categorical metadata columns, writes all values to
    ``<out_file_base>.tsv.gz`` and saves one violin plot (PNG) and one ECDF
    plot (PDF) per metadata column.
    """
    parser = argparse.ArgumentParser(description="""
            Calcualte and compare LISI across a series of reduced dims and
            categorical variables.
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    # parser.add_argument(
    #     '-h5', '--h5_anndata',
    #     action='store',
    #     dest='h5',
    #     required=True,
    #     help='H5 AnnData file.'
    # )

    parser.add_argument(
        '-rf',
        '--reduced_dims_tsv',
        action='store',
        dest='reduced_dims',
        required=True,
        help='List of tab-delimited files of reduced dimensions (e.g., PCs)\
            for each cell. First column is cell_barcode. List should be\
            split by "::" (e.g. file1.tsv.gz::file2.tsv.gz).')

    parser.add_argument(
        '-lbl',
        '--reduced_dims_tsv_labels',
        action='store',
        dest='reduced_dims_labels',
        required=True,
        help='String of labels for each reduced_dims_tsv file. List should be\
            split by "::".')

    parser.add_argument(
        '-mf',
        '--metadata_tsv',
        action='store',
        dest='metadata_tsv',
        required=True,
        help='Tab-delimited file of metadata for each cell. First column\
            is cell_barcode.')

    parser.add_argument(
        '-mv',
        '--metadata_columns',
        action='store',
        dest='metadata_columns',
        default='experiment_id',
        help='Comma separated string of categorical variables to calculate\
            LISI with.\
            (default: %(default)s)')

    parser.add_argument('-p',
                        '--perplexity',
                        action='store',
                        dest='perplexity',
                        default=30.0,
                        type=float,
                        help='Perplexity.\
            (default: %(default)s)')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: <metadata_tsv>-lisi)')

    options = parser.parse_args()

    # Fixed settings.
    # verbose = True

    # Get the out file base.
    out_file_base = options.of
    if out_file_base == '':
        # Default to the metadata file name without its .tsv / .tsv.gz
        # suffix. str.rstrip cannot be used for this: it strips a *set of
        # characters* and would also eat trailing letters of the name itself.
        meta_name = os.path.basename(options.metadata_tsv)
        for suffix in ('.tsv.gz', '.tsv'):
            if meta_name.endswith(suffix):
                meta_name = meta_name[:-len(suffix)]
                break
        out_file_base = '{}-lisi'.format(meta_name)

    # Get the columns to use
    lisi_columns = options.metadata_columns.split(',')
    # lisi_columns = ['experiment_id', 'batch']
    # Read every LISI column as a pandas categorical.
    lisi_columns_dtype = dict(
        zip(lisi_columns, ['category'] * len(lisi_columns)))

    # Load the metadata file
    file_meta = options.metadata_tsv
    df_meta = pd.read_csv(file_meta,
                          sep='\t',
                          index_col='cell_barcode',
                          dtype=lisi_columns_dtype)

    # Load the reduced dims.
    files = options.reduced_dims.split('::')
    labels = options.reduced_dims_labels.split('::')
    assert len(files) == len(labels), 'ERROR: check files and labels input'

    # Make a dict of theoretical maximum LISI value for each label: the
    # number of categories of the column.
    lisi_limit = {
        col: len(df_meta[col].cat.categories) for col in lisi_columns
    }

    list_lisi = []
    for file_path, label in zip(files, labels):
        df_reduced_dims = pd.read_csv(file_path,
                                      sep='\t',
                                      index_col='cell_barcode')

        # Run lisi and save results to dataframe. The reduced dims are
        # reordered to match the metadata index first.
        _df_lisi = pd.DataFrame(hm.compute_lisi(
            df_reduced_dims.loc[df_meta.index, :], df_meta[lisi_columns],
            lisi_columns),
                                columns=lisi_columns)
        _df_lisi['file'] = file_path
        _df_lisi['label'] = label
        _df_lisi['cell_barcode'] = df_meta.index
        list_lisi.append(_df_lisi)

    # Make one long dataframe.
    df_lisi = pd.concat(list_lisi)
    # Make cell_barcode the first column.
    cols = list(df_lisi.columns)
    cols = [cols[-1]] + cols[:-1]

    # Save the results
    df_lisi[cols].to_csv('{}.tsv.gz'.format(out_file_base),
                         sep='\t',
                         index=False,
                         quoting=csv.QUOTE_NONNUMERIC,
                         na_rep='',
                         compression='gzip')

    # Compare the lisi distributions
    n_labels = len(labels)
    for lisi_column in lisi_columns:
        # Make violin plot with a narrow boxplot overlaid.
        gplt = plt9.ggplot(df_lisi,
                           plt9.aes(
                               fill='label',
                               x='label',
                               y=lisi_column,
                           ))
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.geom_violin(alpha=0.9)
        gplt = gplt + plt9.geom_boxplot(
            group='label',
            position=plt9.position_dodge(width=.9),
            width=.1,
            fill='white',
            outlier_alpha=0  # Do not know how to totally remove outliers.
        )
        # Add a line at the theoretical maximum
        gplt = gplt + plt9.geom_hline(
            plt9.aes(yintercept=lisi_limit[lisi_column]))
        # gplt = gplt + plt9.facet_grid('{} ~ .'.format(label))
        gplt = gplt + plt9.labs(x='Reduced dimensions', y='LISI', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt = gplt + plt9.theme(legend_position='none')
        # The Dark2 palette is only applied when there are fewer than 9 labels.
        if n_labels != 0 and n_labels < 9:
            gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
        gplt.save(
            '{}-{}-violin.png'.format(out_file_base, lisi_column),
            dpi=300,
            width=4 * (n_labels / 4),
            height=10,
            # height=4*(n_samples/4),
            limitsize=False)

        # Make ecdf.
        gplt = plt9.ggplot(df_lisi, plt9.aes(
            x=lisi_column,
            color='label',
        ))
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.stat_ecdf(alpha=0.8)
        gplt = gplt + plt9.labs(
            x='LISI',
            y='Cumulative density',
            # color='Reduction',
            title='')
        if n_labels != 0 and n_labels < 9:
            gplt = gplt + plt9.scale_color_brewer(palette='Dark2', type='qual')
        gplt.save('{}-{}-ecdf.pdf'.format(out_file_base, lisi_column),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)