Example #1
def plot_compare_accuracy(self, expo=False):
    if expo:
        return (
            ggplot(self.acc_df) + facet_wrap('position')
            + aes(x='guesser', y='accuracy', fill='Dataset')
            + geom_bar(stat='identity', position='dodge')
            + xlab('Guessing Model')
            + ylab('Accuracy')
        )
    else:
        return (
            ggplot(self.acc_df) + facet_wrap('position')
            + aes(x='guesser', y='accuracy')
            + geom_bar(stat='identity')
        )
Example #2
def test_ribbon_facetting():
    p = (ggplot(df, aes('x', ymin='ymin', ymax='ymax',
                        fill='factor(z)')) +
         geom_ribbon() +
         facet_wrap('~ z')
         )

    assert p + _theme == 'ribbon_facetting'
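(`df` and `_theme` above are fixtures from plotnine's own test suite, whose harness interprets a plot-to-string comparison as an image-diff against a stored baseline.) A self-contained sketch of the same ribbon-plus-facets pattern, with an invented DataFrame standing in for the fixture:

import numpy as np
import pandas as pd
from plotnine import ggplot, aes, geom_ribbon, facet_wrap

# Invented data standing in for the test fixture `df`:
# one ribbon per value of z, drawn in its own panel.
x = np.arange(10)
df = pd.DataFrame({
    'x': np.tile(x, 2),
    'ymin': np.concatenate([x - 1, x - 2]),
    'ymax': np.concatenate([x + 1, x + 2]),
    'z': np.repeat([1, 2], 10),
})

p = (ggplot(df, aes('x', ymin='ymin', ymax='ymax', fill='factor(z)'))
     + geom_ribbon()
     + facet_wrap('~ z'))
print(p)  # or p.save('ribbon_facets.png')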
Example #3
def plot_char_percent_vs_accuracy_histogram(self, category=False):
    if category:
        return (
            ggplot(self.char_plot_df) + facet_wrap('category_jmlr')
            + aes(x='char_percent', fill='Outcome')
            + geom_histogram(binwidth=.05)
        )
    else:
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', fill='Outcome')
            + geom_histogram(binwidth=.05)
        )
Example #4
def create_confidence_plot(conf_df):
    plt = (
        ggplot(conf_df)
        + aes(x='x', color='Method', fill='Method')
        + geom_density(alpha=.45)
        + facet_wrap('Task', nrow=4)
        + xlab('Confidence')
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
        + theme_fs()
        + theme(
            axis_text_y=element_blank(),
            axis_ticks_major_y=element_blank(),
            axis_title_y=element_blank(),
            legend_title=element_blank(),
            legend_position='top',
            legend_box='horizontal',
        )
    )
    return plt
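create_confidence_plot assumes two project globals that are not shown here, COLORS and theme_fs. A hedged way to exercise it stand-alone, with an invented palette, stand-in theme, and fake data:

import numpy as np
import pandas as pd
# create_confidence_plot above also assumes these plotnine names are imported:
from plotnine import (ggplot, aes, geom_density, facet_wrap, xlab,
                      scale_color_manual, scale_fill_manual, theme,
                      element_blank, theme_minimal)

COLORS = ['#1b9e77', '#d95f02', '#7570b3']  # any palette of matching length

def theme_fs():
    # stand-in for the project's figure-style theme
    return theme_minimal()

conf_df = pd.DataFrame({
    'x': np.random.rand(300),
    'Method': np.tile(['A', 'B', 'C'], 100),
    'Task': np.repeat(list('abcd'), 75),
})
create_confidence_plot(conf_df).save('confidence.png', width=6, height=8)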
Example #5
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    mean_len_df = len_df.groupby(['Task', 'Method']).mean().reset_index()
    mean_len_df[' '] = 'Mean Length'

    plt = (
        ggplot(len_df)
        + aes(x='x', fill='Method', y='..density..')
        + geom_histogram(binwidth=2, position='identity', alpha=.6)
        + geom_text(
            aes(x='x', y=.22, label='x', color='Method'),
            mean_len_df,
            inherit_aes=False,
            format_string='{:.1f}',
            show_legend=False
        )
        + geom_segment(
            aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
            mean_len_df,
            inherit_aes=False, color='black'
        )
        + scale_linetype_manual(['dashed'])
        + facet_wrap('Task')
        + xlim(0, 20) + ylim(0, .23)
        + xlab('Example Length') + ylab('Frequency')
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
        + theme_fs()
        + theme(
            aspect_ratio=1,
            legend_title=element_blank(),
            legend_position=legend_position,
            legend_box=legend_box,
        )
    )

    return plt
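The notable trick in create_length_plot is that geom_text and geom_segment each receive their own data (mean_len_df) with inherit_aes=False, so the summary layers do not inherit the histogram's plot-level mapping. A stripped-down sketch of that pattern, with invented data:

import pandas as pd
from plotnine import ggplot, aes, geom_histogram, geom_vline

lens = pd.DataFrame({'x': [1, 2, 2, 3, 3, 3, 8, 9],
                     'Method': ['a'] * 4 + ['b'] * 4})
# Per-method means become a tiny second DataFrame for an annotation layer
means = lens.groupby('Method', as_index=False)['x'].mean()

p = (ggplot(lens)
     + aes(x='x', fill='Method')
     + geom_histogram(binwidth=2, position='identity', alpha=.6)
     + geom_vline(aes(xintercept='x', color='Method'), means,
                  inherit_aes=False))
print(p)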
Example #6
def test_facet_wrap_one_var():
    p = g + facet_wrap('~var1')
    p2 = g + facet_wrap('~class')  # python keyword in formula
    assert p == 'facet_wrap_one_var'
    assert p2 == 'facet_wrap_one_var'
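`g` is a module-level base plot defined elsewhere in the test file. A plausible reconstruction that also reproduces the keyword-in-formula case (all names illustrative):

import pandas as pd
from plotnine import ggplot, aes, geom_point, facet_wrap

df = pd.DataFrame({
    'x': range(6),
    'y': range(6),
    'var1': [1, 2, 3, 1, 2, 3],
    'class': list('aabbcc'),  # deliberately a Python keyword
})
g = ggplot(df, aes('x', 'y')) + geom_point()

# plotnine parses the facet formula itself, so reserved words
# such as `class` are fine as column names.
print(g + facet_wrap('~class'))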
Example #7
def test_aslabeller_func_hashtag():
    func = as_labeller(lambda s: '#{}'.format(s))
    p = g + facet_wrap('~ gear + am', labeller=func)

    assert p == 'aslabeller_func_hashtagit'
Example #8
def test_label_context_wrap2vars():
    p = g + facet_wrap('~ gear + am', labeller='label_context')

    assert p == 'label_context_wrap2vars'
Example #9
def test_non_mapped_facetting():
    p = (g + geom_abline(intercept=0, slope=1, size=1) + facet_wrap('var1'))
    assert p == 'non_mapped_facetting'
Example #10
File: jmlr.py Project: Pinafore/qb
def syntactic_diversity_plots():
    with open('data/external/syntactic_diversity_table.json') as f:
        rows = json.load(f)
    parse_df = pd.DataFrame(rows)
    parse_df['parse_ratio'] = parse_df['unique_parses'] / parse_df['parses']
    melt_df = pd.melt(
        parse_df,
        id_vars=['dataset', 'depth', 'overlap', 'parses'],
        value_vars=['parse_ratio', 'unique_parses'],
        var_name='metric',
        value_name='y'
    )

    def label_facet(name):
        if name == 'parse_ratio':
            return 'Average Unique Parses per Instance'
        elif name == 'unique_parses':
            return 'Count of Unique Parses'

    def label_y(ys):
        formatted_ys = []
        for y in ys:
            y = str(y)
            if y.endswith('000.0'):
                formatted_ys.append(y[:-5] + 'K')
            else:
                formatted_ys.append(y)
        return formatted_ys
    p = (
        ggplot(melt_df)
        + aes(x='depth', y='y', color='dataset')
        + facet_wrap('metric', scales='free_y', nrow=2, labeller=label_facet)
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth') + ylab('')
        + scale_color_discrete(name='Dataset')
        + scale_y_continuous(labels=label_y)
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + theme_fs()
    )
    p.save(path.join(output_path, 'syn_div_plot.pdf'))
    p = (
        ggplot(parse_df)
        + aes(x='depth', y='unique_parses', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Count of Unique Parses')
        + scale_color_discrete(name='Dataset')
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + theme_fs()
    )
    p.save(path.join(output_path, 'n_unique_parses.pdf'))
    p = (
        ggplot(parse_df)
        + aes(x='depth', y='parse_ratio', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Average Unique Parses per Instance')
        + scale_color_discrete(name='Dataset')
        + scale_x_continuous(breaks=list(range(1, 11)), minor_breaks=list(range(1, 11)), limits=[1, 10])
        + scale_y_continuous(limits=[0, 1])
        + theme_fs()
    )
    p.save(path.join(output_path, 'parse_ratio.pdf'))
Example #11
def test_facet_wrap_direction_v():
    p = g + facet_wrap('~var1', dir='v')
    assert p == 'facet_wrap_direction_v'
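dir='v' lays panels down the columns first instead of across the rows. A stand-alone version, substituting mtcars for the test fixture `g`:

from plotnine import ggplot, aes, geom_point, facet_wrap
from plotnine.data import mtcars

g = ggplot(mtcars, aes('wt', 'mpg')) + geom_point()
print(g + facet_wrap('~cyl', dir='v'))  # vertical fill order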
Example #12
def test_facet_wrap_expression():
    p = g + facet_wrap('pd.cut(var1, (0, 2, 4), include_lowest=True)')
    assert p == 'facet_wrap_expression'
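Facet specifications can be arbitrary expressions over the data, not just column names. The same idea against mtcars (binning gears is an invented stand-in for `var1`):

import pandas as pd
from plotnine import ggplot, aes, geom_point, facet_wrap
from plotnine.data import mtcars

# Each panel is a bin computed on the fly; `pd` just has to be
# importable from the calling namespace.
p = (ggplot(mtcars, aes('wt', 'mpg'))
     + geom_point()
     + facet_wrap('pd.cut(gear, (2, 3, 4, 5), include_lowest=True)'))
print(p)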
Example #13
    def plot_tag_repartition(self, data, options):
        tag_df = data["tags"]
        if "background" not in tag_df.columns:
            tag_df["background"] = False
        test = tag_df[["tag", "matched", "background", "id"]].copy()
        test.loc[:, "prop_matched"] = -1
        test.loc[:, "prop_background"] = -1
        test.loc[:, "lbl_matched"] = ""
        test.loc[:, "lbl_background"] = ""
        test.loc[:, "n_tags"] = -1

        n_total = test.shape[0]
        n_matched = test.matched.value_counts()

        tags_summary = (
            test.groupby("tag")
            .apply(self.get_proportions, by=["matched", "background"])
            .reset_index(drop=True)
        )
        tags_summary = tags_summary.sort_values(["tag", "matched", "background"])

        plt = ggplot(
            data=tags_summary,
            mapping=aes(
                x="tag",  # "factor(species, ordered=False)",
                y="n_tags",
                fill="background",
                ymax=max(tags_summary.n_tags) + 35,  # "factor(species, ordered=False)",
            ),
        )
        plot_width = 10 + len(tags_summary.tag.unique()) * 0.75
        plt = (
            plt
            + geom_bar(stat="identity", show_legend=True, position=position_dodge())
            + facet_wrap(
                "matched",
                nrow=1,
                ncol=2,
                scales="fixed",
                labeller=(lambda x: self.get_matched_label(x, n_total, n_matched)),
            )
            + xlab("Species")
            + ylab("Number of annotations")
            + geom_text(
                mapping=aes(label="lbl_background"), position=position_dodge(width=0.9),
            )
            + geom_text(
                mapping=aes(y=max(tags_summary.n_tags) + 30, label="lbl_matched",)
            )
            + theme_classic()
            + theme(
                axis_text_x=element_text(angle=90, vjust=1, hjust=1, margin={"r": -30}),
                plot_title=element_text(
                    weight="bold", size=14, margin={"t": 10, "b": 10}
                ),
                figure_size=(plot_width, 10),
                text=element_text(size=12, weight="bold"),
            )
            + ggtitle(
                (
                    "Tag repartition for model {}, database {}, class {}\n"
                    + "with detector options {}"
                ).format(
                    options["scenario_info"]["model"],
                    options["scenario_info"]["database"],
                    options["scenario_info"]["class"],
                    options,
                )
            )
        )

        return plt
Example #14
    .mean()
    .reset_index()
)

for plate in platelist:
    os.makedirs(output_figuresdir, exist_ok=True)
    by_well_gg = (
        gg.ggplot(
            cell_count_totalcells_df.loc[
                cell_count_totalcells_df["site"].str.contains(plate)
            ],
            gg.aes(x="x_loc", y="y_loc"),
        )
        + gg.geom_point(gg.aes(fill="total_cell_count"), shape="s", size=6)
        + gg.geom_text(gg.aes(label="site_location"), color="lightgrey", size=6)
        + gg.facet_wrap("~well")
        + gg.coord_fixed()
        + gg.theme_bw()
        + gg.ggtitle(f"Total Cells/Well\n{plate}")
        + gg.theme(
            axis_text=gg.element_blank(),
            axis_title=gg.element_blank(),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        )
        + gg.labs(fill="Cells")
        + gg.scale_fill_cmap(name="Number of Cells")
    )

    output_file = pathlib.Path(
        output_figuresdir, f"plate_layout_cells_count_per_well_{plate}.png"
    )
Example #15
knnResultsSimplified = pd.DataFrame(
    [(x['p'], x['k'], x['cvAccuracy'], x['testAccuracy'])
     for x in repeatedKnnResults],
    columns=['p', 'k', 'cvAccuracy', 'testAccuracy'])

ggdata = pd.concat([
    pd.DataFrame({
        'p': knnResultsSimplified.p,
        'k': knnResultsSimplified.k.apply(int),
        'type': 'cv',
        'Accuracy': knnResultsSimplified.cvAccuracy
    }),
    pd.DataFrame({
        'p': knnResultsSimplified.p,
        'k': knnResultsSimplified.k.apply(int),
        'type': 'test',
        'Accuracy': knnResultsSimplified.testAccuracy
    })
], axis=0)

ggo = gg.ggplot(
    ggdata,
    gg.aes(x='p', y='Accuracy', color='type', group='type', linetype='type'))
ggo += gg.scale_x_log10()
ggo += gg.geom_point(alpha=0.6)
ggo += gg.stat_smooth()
ggo += gg.facet_wrap('~ k')
ggo += gg.theme_bw()
print(ggo)
Example #16
# In[46]:

all_data_df.columns = [
    'PC1', 'PC2', 'num_partitions', 'comparison', 'No. of partitions',
    'Comparison'
]

# In[52]:

# Plot all comparisons in one figure
panel_B = ggplot(all_data_df[all_data_df['Comparison'] != '1'],
                 aes(x='PC1', y='PC2')) \
    + geom_point(aes(color='No. of partitions'),
                 alpha=0.2) \
    + facet_wrap('~Comparison') \
    + labs(x = "PC 1",
           y = "PC 2",
           title = "PCA of partition 1 vs multiple partitions") \
    + theme_bw() \
    + theme(
        legend_title_align = "center",
        plot_background=element_rect(fill='white'),
        legend_key=element_rect(fill='white', colour='white'),
        legend_text=element_text(family='sans-serif', size=12),
        plot_title=element_text(family='sans-serif', size=15),
        axis_text=element_text(family='sans-serif', size=12),
        axis_title=element_text(family='sans-serif', size=15)
    ) \
    + guides(colour=guide_legend(override_aes={'alpha': 1})) \
    + scale_color_manual(['#bdbdbd', '#b3e5fc'])
Example #17
    std_accuracy=pd.NamedAgg('accuracy', np.std),
)
summary.reset_index(inplace=True)
print('Summary for all participants')
print(summary)

summary_per_participant = trials.groupby(
    by=['participant_id', 'freq_category']).agg(np.mean)
summary_per_participant.reset_index(inplace=True)
print('Summary per participant')
print(summary_per_participant)

plot = (
    ggplot(gg.aes(x='freq_category', y='reaction_time'), data=trials) +
    gg.geom_boxplot(gg.aes(fill='freq_category')) +  # boxplot instead of jitter
    gg.facet_wrap('participant_id'))
plot.draw()
plt.show()

plot = (ggplot(gg.aes(x="freq_category", weight='accuracy'),
               summary_per_participant) + gg.geom_bar() +
        gg.facet_wrap(['participant_id']))
plot.draw()
plt.show()

plot = (
    ggplot(gg.aes(x='freq_category', y='reaction_time'), data=trials) +
    gg.geom_boxplot(gg.aes(fill='freq_category'))  # boxplot instead of jitter
)
plot.draw()
plt.show()
Example #18
from plotnine import (ggplot, aes, geom_point, facet_wrap,
                      stat_smooth, theme_xkcd)
from plotnine.data import mtcars

kwargs = dict(width=6, height=4)

p1 = (ggplot(mtcars, aes('wt', 'mpg'))
      + geom_point())
p1.save('readme-image-1.png', **kwargs)

p2 = p1 + aes(color='factor(gear)')
p2.save('readme-image-2.png', **kwargs)

p3 = p2 + stat_smooth(method='lm')
p3.save('readme-image-3.png', **kwargs)

p4 = p3 + facet_wrap('~gear')
p4.save('readme-image-4.png', **kwargs)

p5 = p4 + theme_xkcd()
p5.save('readme-image-5.png', **kwargs)
Example #19
    def plot_char_percent_vs_accuracy_smooth(self,
                                             expo=False,
                                             no_models=False,
                                             columns=False):
        if self.y_max is not None:
            limits = [0, float(self.y_max)]
            eprint(f'Setting limits to: {limits}')
        else:
            limits = [0, 1]
        if expo:
            if os.path.exists('data/external/all_human_gameplay.json'
                              ) and not self.no_humans:
                with open('data/external/all_human_gameplay.json') as f:
                    all_gameplay = json.load(f)
                    frames = []
                    for event, name in [('parents', 'Intermediate'),
                                        ('maryland', 'Expert'),
                                        ('live', 'National')]:
                        if self.merge_humans:
                            name = 'Human'
                        gameplay = all_gameplay[event]
                        if event != 'live':
                            control_correct_positions = gameplay[
                                'control_correct_positions']
                            control_wrong_positions = gameplay[
                                'control_wrong_positions']
                            control_positions = control_correct_positions + control_wrong_positions
                            control_positions = np.array(control_positions)
                            control_result = np.array(
                                len(control_correct_positions) * [1] +
                                len(control_wrong_positions) * [0])
                            argsort_control = np.argsort(control_positions)
                            control_x = control_positions[argsort_control]
                            control_sorted_result = control_result[
                                argsort_control]
                            control_y = control_sorted_result.cumsum(
                            ) / control_sorted_result.shape[0]
                            control_df = pd.DataFrame({
                                'correct': control_y,
                                'char_percent': control_x
                            })
                            control_df['Dataset'] = 'Regular Test'
                            control_df['Guessing_Model'] = f' {name}'
                            frames.append(control_df)

                        adv_correct_positions = gameplay[
                            'adv_correct_positions']
                        adv_wrong_positions = gameplay['adv_wrong_positions']
                        adv_positions = adv_correct_positions + adv_wrong_positions
                        adv_positions = np.array(adv_positions)
                        adv_result = np.array(
                            len(adv_correct_positions) * [1] +
                            len(adv_wrong_positions) * [0])
                        argsort_adv = np.argsort(adv_positions)
                        adv_x = adv_positions[argsort_adv]
                        adv_sorted_result = adv_result[argsort_adv]
                        adv_y = adv_sorted_result.cumsum(
                        ) / adv_sorted_result.shape[0]
                        adv_df = pd.DataFrame({
                            'correct': adv_y,
                            'char_percent': adv_x
                        })
                        adv_df['Dataset'] = 'IR Adversarial'
                        adv_df['Guessing_Model'] = f' {name}'
                        frames.append(adv_df)

                        if len(gameplay['advneural_correct_positions']) > 0:
                            adv_correct_positions = gameplay[
                                'advneural_correct_positions']
                            adv_wrong_positions = gameplay[
                                'advneural_wrong_positions']
                            adv_positions = adv_correct_positions + adv_wrong_positions
                            adv_positions = np.array(adv_positions)
                            adv_result = np.array(
                                len(adv_correct_positions) * [1] +
                                len(adv_wrong_positions) * [0])
                            argsort_adv = np.argsort(adv_positions)
                            adv_x = adv_positions[argsort_adv]
                            adv_sorted_result = adv_result[argsort_adv]
                            adv_y = adv_sorted_result.cumsum(
                            ) / adv_sorted_result.shape[0]
                            adv_df = pd.DataFrame({
                                'correct': adv_y,
                                'char_percent': adv_x
                            })
                            adv_df['Dataset'] = 'RNN Adversarial'
                            adv_df['Guessing_Model'] = f' {name}'
                            frames.append(adv_df)

                    human_df = pd.concat(frames)
                    human_vals = sort_humans(
                        list(human_df['Guessing_Model'].unique()))
                    human_dtype = CategoricalDtype(human_vals, ordered=True)
                    human_df['Guessing_Model'] = human_df[
                        'Guessing_Model'].astype(human_dtype)
                    dataset_dtype = CategoricalDtype(
                        ['Regular Test', 'IR Adversarial', 'RNN Adversarial'],
                        ordered=True)
                    human_df['Dataset'] = human_df['Dataset'].astype(
                        dataset_dtype)

            if no_models:
                p = ggplot(human_df) + geom_point(shape='.')
            else:
                df = self.char_plot_df
                if 1 not in self.rounds:
                    df = df[df['Dataset'] != 'Round 1 - IR Adversarial']
                if 2 not in self.rounds:
                    df = df[df['Dataset'] != 'Round 2 - IR Adversarial']
                    df = df[df['Dataset'] != 'Round 2 - RNN Adversarial']
                p = ggplot(df)
                if self.save_df is not None:
                    eprint(f'Saving df to: {self.save_df}')
                    df.to_json(self.save_df)

                if os.path.exists('data/external/all_human_gameplay.json'
                                  ) and not self.no_humans:
                    eprint('Loading human data')
                    p = p + geom_line(data=human_df)

            if columns:
                facet_conf = facet_wrap('Guessing_Model', ncol=1)
            else:
                facet_conf = facet_wrap('Guessing_Model', nrow=1)

            if not no_models:
                if self.mvg_avg_char:
                    chart = stat_smooth(method='mavg',
                                        se=False,
                                        method_args={'window': 400})
                else:
                    chart = stat_summary_bin(fun_data=mean_no_se,
                                             bins=20,
                                             shape='.',
                                             linetype='None',
                                             size=0.5)
            else:
                chart = None

            p = (p + facet_conf +
                 aes(x='char_percent', y='correct', color='Dataset'))
            if chart is not None:
                p += chart
            p = (
                p + scale_y_continuous(breaks=np.linspace(0, 1, 6)) +
                scale_x_continuous(breaks=[0, .5, 1]) +
                coord_cartesian(ylim=limits) +
                xlab('Percent of Question Revealed') + ylab('Accuracy') +
                theme(
                    #legend_position='top', legend_box_margin=0, legend_title=element_blank(),
                    strip_text_x=element_text(margin={
                        't': 6,
                        'b': 6,
                        'l': 1,
                        'r': 5
                    })) + scale_color_manual(
                        values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'],
                        name='Questions'))
            if self.title != '':
                p += ggtitle(self.title)

            return p
        else:
            if self.save_df is not None:
                eprint(f'Saving df to: {self.save_df}')
                self.char_plot_df.to_json(self.save_df)
            return (ggplot(self.char_plot_df) + aes(
                x='char_percent', y='correct', color='Guessing_Model') +
                    stat_smooth(
                        method='mavg', se=False, method_args={'window': 500}) +
                    scale_y_continuous(breaks=np.linspace(0, 1, 6)) +
                    coord_cartesian(ylim=limits))
Example #20
# In[11]:

# Side by side original input vs simulated data

# Add label for input or simulated dataset
input_data_UMAPencoded_df['dataset'] = 'original'
simulated_data_UMAPencoded_df['dataset'] = 'simulated'

# Concatenate input and simulated dataframes together
combined_data_df = pd.concat(
    [input_data_UMAPencoded_df, simulated_data_UMAPencoded_df])

# Plot
ggplot(combined_data_df, aes(
    x='1', y='2')) + geom_point(alpha=0.3) + facet_wrap('~dataset') + labs(
        x="UMAP 1", y="UMAP 2", title="UMAP of original and simulated data")

# In[12]:

# Overlay original input vs simulated data

# Add label for input or simulated dataset
input_data_UMAPencoded_df['dataset'] = 'original'
simulated_data_UMAPencoded_df['dataset'] = 'simulated'

# Concatenate input and simulated dataframes together
combined_data_df = pd.concat(
    [input_data_UMAPencoded_df, simulated_data_UMAPencoded_df])

# Plot
Example #21
def test_non_mapped_facetting():
    p = (g
         + geom_abline(intercept=0, slope=1, size=1)
         + facet_wrap('var1')
         )
    assert p == 'non_mapped_facetting'
Example #22
# In[12]:

for level in levels:
    all_feature_results_subset_df = all_feature_results_df.query(
        "level == @level").reset_index(drop=True)

    output_dir = pathlib.Path(f"{output_fig_dir}/{level}")
    output_dir.mkdir(exist_ok=True)

    # Figure 1 - Per plate feature differences
    per_plate_feature_gg = (
        gg.ggplot(all_feature_results_subset_df,
                  gg.aes(x="plate", y="metric_value")) +
        gg.geom_point(size=0.1, alpha=0.5) +
        gg.facet_wrap("~metric", scales="free", nrow=len(metrics)) +
        gg.xlab("Plate") + gg.ylab("Feature Difference\nBetween Tools") +
        gg.ggtitle(f"Plate Summary\n{level}") + theme_summary)

    output_file = pathlib.Path(f"{output_dir}/{level}_metrics_per_plate.png")
    per_plate_feature_gg.save(output_file, dpi=dpi, height=height, width=width)

    print(per_plate_feature_gg)
    del per_plate_feature_gg

# In[13]:

for level in levels:
    all_feature_results_subset_df = all_feature_results_df.query(
        "level == @level").reset_index(drop=True)
Example #23
def test_facet_wrap_label_both():
    p = g + facet_wrap('~var1+var2', labeller='label_both')
    assert p == 'facet_wrap_label_both'
Example #24
# print(aci)
res
# res.to_feather("data_glm.feather")


def label_x(dates):
    res = [(datetime.datetime(2018, 1, 1) +
            datetime.timedelta(x)).strftime("%d-%m") for x in dates]
    print(res)
    return res


(ggplot(data=res, mapping=aes(x='julian', y='ACI_mean', colour='site')) +
 xlab("Day") + ylab("Mean daily ACI (standardized)")
 # + facet_grid("site~", scales="free")
 + facet_wrap("site", nrow=2, ncol=3) + geom_point() +
 geom_errorbar(aes(ymin="ACI_mean - ACI_std", ymax="ACI_mean + ACI_std")) +
 geom_smooth(method="mavg",
             se=False,
             method_args={
                 "window": 4,
                 "center": True,
                 "min_periods": 1
             }) + scale_colour_manual(values=cbbPalette, guide=False) +
 scale_x_continuous(labels=label_x)).save("figs/ACI_all_testfacet3.png",
                                          height=10,
                                          width=16,
                                          dpi=150)

#################
### Denoising ###
Example #25
def plot_portfolio(portfolio_df,
                   figure_size=(12, 4),
                   line_size=1.5,
                   date_text_size=7):
    """
    Given a daily snapshot of virtual purchases plot both overall and per-stock
    performance. Return a tuple of figures representing the performance as inline data.
    """
    assert portfolio_df is not None
    #print(portfolio_df)
    portfolio_df['date'] = pd.to_datetime(portfolio_df['date'])
    avg_profit_over_period = portfolio_df.filter(
        items=['stock', 'stock_profit']).groupby('stock').mean()
    avg_profit_over_period['contribution'] = [
        'positive' if profit >= 0.0 else 'negative'
        for profit in avg_profit_over_period.stock_profit
    ]
    avg_profit_over_period = avg_profit_over_period.drop(
        'stock_profit',
        axis='columns')  # don't want to overwrite actual profit with the average
    portfolio_df = portfolio_df.merge(avg_profit_over_period,
                                      left_on='stock',
                                      right_index=True,
                                      how='inner')
    #print(portfolio_df)

    # 1. overall performance
    df = portfolio_df.filter(items=[
        'portfolio_cost', 'portfolio_worth', 'portfolio_profit', 'date'
    ])
    df = df.melt(id_vars=['date'], var_name='field')
    plot = (
        p9.ggplot(df, p9.aes('date', 'value', group='field', color='field')) +
        p9.labs(x='', y='$ AUD') + p9.geom_line(size=1.5) +
        p9.facet_wrap('~ field', nrow=3, ncol=1, scales='free_y') +
        p9.theme(axis_text_x=p9.element_text(angle=30, size=date_text_size),
                 figure_size=figure_size,
                 legend_position='none'))
    overall_figure = plot_as_inline_html_data(plot)

    df = portfolio_df.filter(
        items=['stock', 'date', 'stock_profit', 'stock_worth', 'contribution'])
    melted_df = df.melt(id_vars=['date', 'stock', 'contribution'],
                        var_name='field')
    all_dates = sorted(melted_df['date'].unique())
    df = melted_df[melted_df['date'] == all_dates[-1]]
    df = df[df['field'] == 'stock_profit']  # only latest profit is plotted
    df['contribution'] = [
        'positive' if profit >= 0.0 else 'negative' for profit in df['value']
    ]

    # 2. plot contributors ie. winners and losers
    plot = (p9.ggplot(df, p9.aes('stock', 'value', fill='stock')) +
            p9.geom_bar(stat='identity') + p9.labs(x='', y='$ AUD') +
            p9.facet_grid('contribution ~ field') +
            p9.theme(legend_position='none', figure_size=figure_size))
    profit_contributors = plot_as_inline_html_data(plot)

    # 3. per purchased stock performance
    plot = (
        p9.ggplot(melted_df,
                  p9.aes('date', 'value', group='stock', colour='stock')) +
        p9.xlab('') + p9.geom_line(size=1.0) +
        p9.facet_grid('field ~ contribution', scales="free_y") + p9.theme(
            axis_text_x=p9.element_text(angle=30, size=date_text_size),
            figure_size=figure_size,
            panel_spacing=0.5,  # more space between plots to avoid tick mark overlap
            subplots_adjust={'right': 0.8}))
    stock_figure = plot_as_inline_html_data(plot)
    return overall_figure, stock_figure, profit_contributors
Example #26
def test_label_both():
    p = g + facet_wrap('~ gear', labeller='label_both')

    assert p == 'label_both'
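The built-in labellers can be named by string: 'label_value' (the default) shows just the value, 'label_both' prefixes the variable name, and 'label_context' picks between the two depending on how many variables are faceted. A stand-alone version against mtcars (a plausible stand-in for `g`):

from plotnine import ggplot, aes, geom_point, facet_wrap
from plotnine.data import mtcars

g = ggplot(mtcars, aes('wt', 'mpg')) + geom_point()
print(g + facet_wrap('~ gear', labeller='label_both'))  # strips read "gear: 3" etc.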
Example #27
def plot_cellranger_vs_cellbender(samplename, raw_cellranger_mtx,
                                  filtered_cellranger_mtx,
                                  cellbender_unfiltered_h5, fpr,
                                  n_expected_cells, n_total_droplets_included,
                                  out_dir):
    """compare cellranger raw vs cellranger filtered vs cellbender outputs"""
    logging.info('samplename ' + str(samplename))
    logging.info('raw_cellranger_mtx ' + str(raw_cellranger_mtx))
    logging.info('filtered_cellranger_mtx ' + str(filtered_cellranger_mtx))
    logging.info('cellbender_unfiltered_h5 ' + str(cellbender_unfiltered_h5))
    logging.info('fpr ' + str(fpr))
    logging.info('n_expected_cells ' + str(n_expected_cells))
    logging.info('n_total_droplets_included ' + str(n_total_droplets_included))
    logging.info('out_dir ' + str(out_dir))

    # Make the output directory if it does not exist.
    if out_dir == '':
        out_dir = os.getcwd()
    else:
        os.makedirs(out_dir, exist_ok=True)
        out_dir = out_dir + '/fpr_' + fpr
        os.makedirs(out_dir, exist_ok=True)
        os.makedirs(out_dir + '/' + samplename, exist_ok=True)
        logging.info(out_dir)
    # logging.info(df.head())

    # Get compression opts for pandas
    compression_opts = 'gzip'
    if LooseVersion(pd.__version__) > '1.0.0':
        compression_opts = dict(method='gzip', compresslevel=9)

    # read cellranger raw
    adata_cellranger_raw = sc.read_10x_mtx(raw_cellranger_mtx,
                                           var_names='gene_symbols',
                                           make_unique=True,
                                           cache=False,
                                           cache_compression=compression_opts)

    # First filter out any cells that have 0 total counts
    zero_count_cells_cellranger_raw = adata_cellranger_raw.obs_names[np.where(
        adata_cellranger_raw.X.sum(axis=1) == 0)[0]]
    # sc.pp.filter_cells(adata, min_counts=1, inplace=True) # Minimum number of counts required for a cell to pass filtering.
    logging.info(
        "_cellranger_raw: Filtering {}/{} cells with 0 counts.".format(
            len(zero_count_cells_cellranger_raw), adata_cellranger_raw.n_obs))
    adata_cellranger_raw = adata_cellranger_raw[
        adata_cellranger_raw.obs_names.difference(
            zero_count_cells_cellranger_raw, sort=False)]

    sc.pp.calculate_qc_metrics(adata_cellranger_raw, inplace=True)

    logging.info('cellranger raw n barcodes(.obs) x genes(.var) .X.shape:')
    logging.info(adata_cellranger_raw.X.shape)
    logging.info('cellranger raw .obs:')
    logging.info(adata_cellranger_raw.obs)
    logging.info('cellranger raw .var:')
    logging.info(adata_cellranger_raw.var)

    df_total_counts = pd.DataFrame(data=adata_cellranger_raw.obs.sort_values(
        by=['total_counts'], ascending=False).total_counts)
    df_total_counts['barcode_row_number'] = df_total_counts.reset_index(
    ).index + 1
    df_total_counts['barcodes'] = df_total_counts.index
    df_total_counts_cellranger_raw = df_total_counts
    df_total_counts_cellranger_raw['dataset'] = 'Cellranger Raw'

    logging.info(df_total_counts)
    # read cellranger filtered
    adata_cellranger_filtered = sc.read_10x_mtx(
        filtered_cellranger_mtx,
        var_names='gene_symbols',
        make_unique=True,
        cache=False,
        cache_compression=compression_opts)

    # First filter out any cells that have 0 total counts
    zero_count_cells_cellranger_filtered = adata_cellranger_filtered.obs_names[
        np.where(adata_cellranger_filtered.X.sum(axis=1) == 0)[0]]
    # sc.pp.filter_cells(adata, min_counts=1, inplace=True) # Minimum number of counts required for a cell to pass filtering.
    logging.info(
        "_cellranger_filtered: Filtering {}/{} cells with 0 counts.".format(
            len(zero_count_cells_cellranger_filtered),
            adata_cellranger_filtered.n_obs))
    adata_cellranger_filtered = adata_cellranger_filtered[
        adata_cellranger_filtered.obs_names.difference(
            zero_count_cells_cellranger_filtered, sort=False)]

    sc.pp.calculate_qc_metrics(adata_cellranger_filtered, inplace=True)

    logging.info(
        'cellranger filtered n barcodes(.obs) x genes(.var) .X.shape:')
    logging.info(adata_cellranger_filtered.X.shape)
    logging.info('cellranger filtered .obs:')
    logging.info(adata_cellranger_filtered.obs.columns)
    logging.info(adata_cellranger_filtered.obs)
    logging.info('cellranger filtered .var:')
    logging.info(adata_cellranger_filtered.var)

    df_total_counts = pd.DataFrame(
        data=adata_cellranger_filtered.obs.sort_values(
            by=['total_counts'], ascending=False).total_counts)
    df_total_counts['barcodes'] = df_total_counts.index
    df_total_counts['barcode_row_number'] = df_total_counts.reset_index(
    ).index + 1
    df_total_counts_cellranger_filtered = df_total_counts
    df_total_counts_cellranger_filtered['dataset'] = 'Cellranger Filtered'

    logging.info(df_total_counts)
    # read cellbender output
    adata_cellbender = anndata_from_h5(cellbender_unfiltered_h5,
                                       analyzed_barcodes_only=True)

    # First filter out any cells that have 0 total counts
    zero_count_cells_cellbender_filtered = adata_cellbender.obs_names[np.where(
        adata_cellbender.X.sum(axis=1) == 0)[0]]
    # sc.pp.filter_cells(adata, min_counts=1, inplace=True) # Minimum number of counts required for a cell to pass filtering.
    logging.info(
        "_cellbender_filtered: Filtering {}/{} cells with 0 counts.".format(
            len(zero_count_cells_cellbender_filtered), adata_cellbender.n_obs))
    adata_cellbender = adata_cellbender[adata_cellbender.obs_names.difference(
        zero_count_cells_cellbender_filtered, sort=False)]

    sc.pp.calculate_qc_metrics(adata_cellbender, inplace=True)

    logging.info(
        'cellbender n barcodes(.obs) x genes(.var) .X.shape:')
    logging.info(adata_cellbender.X.shape)
    logging.info('cellbender cellbender.obs:')
    logging.info(adata_cellbender.obs)
    logging.info('cellbender cellbender.var:')
    logging.info(adata_cellbender.var)

    df_total_counts = pd.DataFrame(data=adata_cellbender.obs.sort_values(
        by=['total_counts'], ascending=False).total_counts)
    df_total_counts['barcodes'] = df_total_counts.index
    df_total_counts['barcode_row_number'] = df_total_counts.reset_index(
    ).index + 1
    df_total_counts_cellbender = df_total_counts
    df_total_counts_cellbender['dataset'] = 'Cellbender'

    logging.info(df_total_counts)

    # df_total_counts_cellranger_filtered.rename(columns={"total_counts": "cellranger_filtered_total_counts"})
    df_cellranger_cellbender = pd.merge(
        df_total_counts_cellranger_filtered,
        df_total_counts_cellbender,
        how='outer',
        left_index=True,
        right_index=True,
        suffixes=('_cellranger',
                  '_cellbender')).sort_values(by=['total_counts_cellbender'],
                                              ascending=False)
    logging.info(df_cellranger_cellbender)
    df_cellranger_cellbender[['cellranger', 'cellbender']] = np.where(
        df_cellranger_cellbender[[
            'total_counts_cellranger', 'total_counts_cellbender'
        ]].isnull(), 0, 1)

    #df_cellranger_cellbender.to_csv('df_cellranger_cellbender.csv', index=True, index_label='barcode')

    grouped = df_cellranger_cellbender[['cellranger', 'cellbender']].groupby(
        ["cellranger", "cellbender"]).size().reset_index(name='counts')
    logging.info(grouped.columns)
    #grouped.to_csv('cellranger_cellbender.csv', index=False)

    df_cellranger_cellbender[
        'barcode_row_number'] = df_cellranger_cellbender.reset_index(
        ).index + 1

    ### plot UMI counts descending order
    df_merged = pd.concat([
        df_total_counts_cellranger_raw, df_total_counts_cellranger_filtered,
        df_total_counts_cellbender
    ])
    #df_merged.to_csv('df_merged.csv', index=True, index_label='barcode')

    df_vline = pd.DataFrame(
        data={
            'x': [int(n_expected_cells),
                  int(n_total_droplets_included)],
            'color': ['expected-cells', 'total-droplets-included']
        })

    gplt = ggplot(df_merged, aes(x='barcode_row_number', y='total_counts')) \
        + geom_point() \
        + geom_vline(df_vline, aes(xintercept='x', color='color')) \
        + theme_bw() + facet_wrap('dataset') \
        + labs(x='Barcodes (ordered by descending cell total counts)', color='Cellbender input',
               y='Cell total counts', title='Cells filtered out by Cellranger or Cellbender') \
        + scale_y_continuous(trans='log10',minor_breaks=0) + scale_x_continuous(trans='log10',minor_breaks=0)
    gplt.save(out_dir + '/' + samplename + '/barcode_vs_total_counts.png',
              width=12,
              height=5,
              dpi=300)  # dpi=300,

    df_cellranger_cellbender_count = grouped  # pd.read_csv('cellranger_cellbender.csv')

    df = pd.merge(df_merged,
                  df_cellranger_cellbender[['cellranger', 'cellbender']],
                  how='left',
                  left_index=True,
                  right_index=True)
    df = pd.merge(df,
                  df_cellranger_cellbender_count,
                  how='left',
                  left_on=['cellranger', 'cellbender'],
                  right_on=['cellranger', 'cellbender'])
    df["counts"].fillna(df['counts'].isnull().sum(), inplace=True)
    df["counts"] = df["counts"].astype(int)
    # df.replace({"counts": {""}  }, inplace=True)

    df["filtered"] = df["cellranger"].astype(
        str) + '-' + df["cellbender"].astype(str)
    df.replace(
        {
            "filtered": {
                "nan-nan": 'Cellranger Raw only',
                "1.0-1.0": "Cellranger Filtered + Cellbender",
                "1.0-0.0": "Cellranger Filtered only",
                "0.0-1.0": "Cellbender only",
                "0.0-0.0": "0.0-0.0"
            }
        },
        inplace=True)
    df["filtered"] = df["filtered"] + ', n=' + df["counts"].astype(str)
    df['filtered'].value_counts()
    df.replace(
        {
            "dataset": {
                "cellbender": "Cellbender output",
                "cellranger_raw": "Cellranger Raw output",
                "cellranger_filtered": "Cellranger Filtered output"
            }
        },
        inplace=True)


    gplt = ggplot(df, aes(x='filtered', y='total_counts', color='filtered')) \
        + geom_boxplot() \
        + theme_bw() \
        + facet_wrap('dataset') \
        + theme(axis_text_x=element_blank()) \
        + scale_y_continuous(trans='log10',minor_breaks=0) \
        + labs(color='n cells in intersection of datasets', x='', y='Cell total counts', title='Total cell counts compared across datasets (facets)')
    gplt.save(out_dir + '/' + samplename +
              '/boxplots_cellranger_vs_cellbender.png',
              width=12,
              height=5,
              dpi=300)  # dpi=300,

    # plot difference cellbender filtered vs cellranger filtered for common cells between the 2 datasets
    df_cellranger_cellbender = df_cellranger_cellbender[
        df_cellranger_cellbender['cellranger'] == 1]
    df_cellranger_cellbender = df_cellranger_cellbender[
        df_cellranger_cellbender['cellbender'] == 1]

    # Subset the datasets to the relevant barcodes.
    adata_cellbender_common = adata_cellbender[
        df_cellranger_cellbender.index.values]
    adata_cellranger_filtered_common = adata_cellranger_filtered[
        df_cellranger_cellbender.index.values]
    # Put count matrices into 'layers' in anndata for clarity.
    adata = adata_cellbender_common
    adata.layers['counts_cellbender'] = adata_cellbender_common.X.copy()
    adata.layers['counts_raw'] = adata_cellranger_filtered_common.X.copy()
    # Get the differences in counts per cell
    X_raw_minus_cb = adata.layers['counts_raw'] - adata.layers[
        'counts_cellbender']
    X_raw_minus_cb = abs(X_raw_minus_cb)
    # Get the top most different genes
    df_diff_genes = pd.DataFrame(data=adata.var.gene_symbols.values)
    df_diff_genes['ensembl_id'] = adata.var.index
    df_diff_genes['gene_symbol'] = adata.var.gene_symbols.values
    df_diff_genes['dif_across_cells'] = np.asarray(
        X_raw_minus_cb.sum(axis=0)).reshape(-1)
    df_diff_genes = df_diff_genes.sort_values('dif_across_cells',
                                              ascending=False).head(n=100)
    #df_diff_genes.to_csv('df_diff_genes.csv', index=True)
    top_genes = df_diff_genes['ensembl_id']
    top_genes_symbols = df_diff_genes['gene_symbol']
    logging.info('top_genes:')
    logging.info(top_genes)

    logging.info(adata_cellbender_common.var.index)
    adata_cellbender_common = adata_cellbender[
        df_cellranger_cellbender.index.values, top_genes].to_df()
    adata_cellbender_common['barcode'] = adata_cellbender_common.index
    adata_cellbender_common = pd.melt(adata_cellbender_common,
                                      ignore_index=True,
                                      id_vars=['barcode'],
                                      var_name='ensembl_id',
                                      value_name='count')
    adata_cellbender_common = pd.merge(
        adata_cellbender_common,
        df_diff_genes[['ensembl_id', 'gene_symbol']],
        how='left',
        left_on='ensembl_id',
        right_on='ensembl_id')
    adata_cellbender_common = adata_cellbender_common.sort_values(
        by=['barcode', 'ensembl_id'], ascending=False)
    adata_cellbender_common['dataset'] = 'Cellbender'
    #adata_cellbender_common.to_csv('adata_cellbender_common.csv', index=True)

    logging.info(adata_cellranger_filtered.var.index)
    adata_cellranger_filtered_common = adata_cellranger_filtered[
        df_cellranger_cellbender.index.values, top_genes_symbols].to_df()
    adata_cellranger_filtered_common[
        'barcode'] = adata_cellranger_filtered_common.index
    adata_cellranger_filtered_common = pd.melt(
        adata_cellranger_filtered_common,
        ignore_index=True,
        id_vars=['barcode'],
        var_name='gene_symbol',
        value_name='count')
    adata_cellranger_filtered_common = pd.merge(
        adata_cellranger_filtered_common,
        df_diff_genes[['ensembl_id', 'gene_symbol']],
        how='left',
        left_on='gene_symbol',
        right_on='gene_symbol')
    adata_cellranger_filtered_common['dataset'] = 'Cellranger Filtered'
    adata_cellranger_filtered_common = adata_cellranger_filtered_common.sort_values(
        by=['barcode', 'ensembl_id'], ascending=False)
    adata_cellranger_filtered_common = adata_cellranger_filtered_common[
        adata_cellbender_common.columns]
    #adata_cellranger_filtered_common.to_csv('adata_cellranger_filtered_common.csv', index=True)

    logging.info(adata_cellranger_raw.var.index)
    adata_cellranger_raw_common = adata_cellranger_raw[
        df_cellranger_cellbender.index.values, top_genes_symbols].to_df()
    adata_cellranger_raw_common['barcode'] = adata_cellranger_raw_common.index
    adata_cellranger_raw_common = pd.melt(adata_cellranger_raw_common,
                                          ignore_index=True,
                                          id_vars=['barcode'],
                                          var_name='gene_symbol',
                                          value_name='count')
    adata_cellranger_raw_common = pd.merge(
        adata_cellranger_raw_common,
        df_diff_genes[['ensembl_id', 'gene_symbol']],
        how='left',
        left_on='gene_symbol',
        right_on='gene_symbol')
    adata_cellranger_raw_common['dataset'] = 'Cellranger Raw'
    adata_cellranger_raw_common = adata_cellranger_raw_common.sort_values(
        by=['barcode', 'ensembl_id'], ascending=False)
    adata_cellranger_raw_common = adata_cellranger_raw_common[
        adata_cellbender_common.columns]
    #adata_cellranger_raw_common.to_csv('adata_cellranger_raw_common.csv', index=True)

    logging.info(adata_cellranger_raw_common['gene_symbol'] ==
                 adata_cellbender_common['gene_symbol'])
    logging.info(adata_cellranger_raw_common['ensembl_id'] ==
                 adata_cellbender_common['ensembl_id'])

    adata_filtered_cellbender_diff = adata_cellbender_common.copy()
    adata_filtered_cellbender_diff['count'] = adata_cellranger_filtered_common[
        'count'] - adata_cellbender_common['count']
    adata_filtered_cellbender_diff[
        'dataset'] = 'Cellranger Filtered - Cellbender'

    adata_raw_cellbender_diff = adata_cellbender_common.copy()
    adata_raw_cellbender_diff['count'] = adata_cellranger_raw_common[
        'count'] - adata_cellbender_common['count']
    adata_raw_cellbender_diff['dataset'] = 'Cellranger Raw - Cellbender'

    df_merged = pd.concat([
        adata_cellbender_common, adata_cellranger_filtered_common,
        adata_cellranger_raw_common, adata_filtered_cellbender_diff,
        adata_raw_cellbender_diff
    ],
                          ignore_index=True)

    gplt = ggplot(df_merged, aes(x='gene_symbol', y='count')) \
        + geom_boxplot() \
        + theme_bw() \
        + theme(axis_text_x=element_text(angle=90, hjust=1, size=6)) \
        + facet_wrap('dataset', scales='free', ncol=1) \
        + labs(x='Genes (top 100 genes most different between Cellranger Filtered and Cellbender filtered counts)', y='Cell total counts', title='Total cell counts compared across most different genes (x-axis) and datasets (facets)')
    gplt.save(out_dir + '/' + samplename +
              '/boxplot_topgenes_cellranger_vs_cellbender.png',
              width=10,
              height=20,
              dpi=300)  # dpi=300,
    logging.info('script done.')
Example #28
def test_facet_wrap_two_vars():
    p = g + facet_wrap('~var1+var2')
    p2 = g + facet_wrap('~class+var2')  # python keyword in formula
    assert p == 'facet_wrap_two_vars'
    assert p2 == 'facet_wrap_two_vars'
Example #29
# Add label for input or simulated dataset
input_data_UMAPencoded_df['dataset'] = 'original'
simulated_data_UMAPencoded_df['dataset'] = 'simulated'

# Concatenate input and simulated dataframes together
combined_data_df = pd.concat(
    [input_data_UMAPencoded_df, simulated_data_UMAPencoded_df])

# Plot sequentially
#backgrd_data = combined_data_df[combined_data_df['experiment_id'] == 'Not selected']
#select_data = combined_data_df[combined_data_df['experiment_id'] != 'Not selected']

# Plot
ggplot(combined_data_df, aes(x='1', y='2')) + geom_point(
    aes(color='experiment_id'),
    alpha=0.3) + facet_wrap('~dataset') + xlab('UMAP 1') + ylab(
        'UMAP 2') + ggtitle('UMAP of original and simulated data (gene space)')
#+ xlim(3,12) \
#+ ylim(-7,10) \
#+ scale_colour_manual(values=["blue", "purple", "orange", "red", "magenta", "lightgrey"]) \

# In[12]:

# Overlay original and simulated data
ggplot(combined_data_df, aes(x='1', y='2')) + geom_point(
    aes(color='dataset'), alpha=0.3) + scale_colour_manual(
        values=["grey", "blue"]) + xlab('UMAP 1') + ylab('UMAP 2') + ggtitle(
            'UMAP of original and simulated data (gene space)')

# ## Visualize simulated data (gene space) projected into PCA space
Example #30
    def plot_abs_dataframe(self, df: pd.DataFrame) -> p9.ggplot:
        facets = []
        n_per_facet = {}
        print(df)
        for col in df.columns:
            try:
                n_values = df[col].nunique()
                if n_values == 1 and col not in [
                        "TIME_PERIOD",
                        "value",
                        "Measure",
                        "OBS_COMMENT",
                ]:
                    self.fixed_datapoints.add(f"{col}={df.at[0, col]}")
                elif n_values > 1 and col not in [
                        "value",
                        "TIME_PERIOD",
                        "OBS_COMMENT",
                ]:
                    facets.append(col)
                    n_per_facet[col] = n_values
            except Exception:
                print(f"Ignoring unusable column: {col}")
                continue

        extra_args = {}
        need_shape = False
        if len(facets) > 2:
            # Only two variables can be used as plotting facets; the third
            # becomes a group (and shape) series on each plot. More facets
            # than that are not supported at this stage.
            sorted_facets = sorted(n_per_facet.keys(),
                                   key=lambda k: n_per_facet[k])
            # print(n_per_facet)
            # print(sorted_facets)
            facets = sorted_facets[-2:]
            extra_args.update({
                "group": sorted_facets[0],
                "color": facets[0],
                "shape": sorted_facets[0],
            })
            need_shape = True
            print(f"Using {facets} as facets, {extra_args} as series")
        else:
            if len(facets) > 0:
                extra_args.update({"color": facets[0]})

        # compute figure size to give enough room for each plot
        mult = 1
        for facet in facets:
            mult *= n_per_facet[facet]
        mult /= len(facets)
        nrow = int(mult + 1)

        # facet column names must not have spaces in them as this is not permitted by plotnine facet formulas
        if len(facets) > 0:
            new_facets = []
            for f in facets:
                if " " in f:
                    new_name = f.replace(" ", "_")
                    df = df.rename(columns={f: new_name})
                    new_facets.append(new_name)
                else:
                    new_facets.append(f)
            facets = new_facets
            if "color" in extra_args:
                extra_args.update({"color": facets[0]})
            print(f"Renamed facet columns due to whitespace: {facets}")

        plot = (p9.ggplot(df, p9.aes(x="TIME_PERIOD", y="value", **extra_args))
                + p9.geom_point(size=3))

        if len(facets) > 0 and len(facets) <= 2:
            facet_str = "~" + " + ".join(facets[:2])
            print(f"Using facet formula: {facet_str}")
            plot += p9.facet_wrap(facet_str, ncol=len(facets), scales="free_y")

        plot_theme = {
            "figure_size": (12, int(nrow * 1.5)),
        }
        if len(facets) == 2:
            # two columns of plots? if so, make sure there is space for axis labels
            plot_theme.update({"subplots_adjust": {"wspace": 0.2}})
        if need_shape:
            plot += p9.scale_shape(guide="legend")
            plot += p9.guides(
                colour=False
            )  # colour legend is not useful since it is included in the facet title
            plot_theme.update({"legend_position": "right"})
        return user_theme(plot, **plot_theme)
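Building the facet formula from discovered column names, as plot_abs_dataframe does, is a reusable pattern on its own. A minimal sketch with invented columns:

import pandas as pd
import plotnine as p9

df = pd.DataFrame({
    'TIME_PERIOD': [2019, 2020, 2021] * 4,
    'value': range(12),
    'state': (['NSW'] * 3 + ['VIC'] * 3) * 2,
    'sex': ['M'] * 6 + ['F'] * 6,
})
facets = ['state', 'sex']
facet_str = '~' + ' + '.join(facets)   # -> '~state + sex'
plot = (p9.ggplot(df, p9.aes(x='TIME_PERIOD', y='value'))
        + p9.geom_point(size=3)
        + p9.facet_wrap(facet_str, ncol=len(facets), scales='free_y'))
print(plot)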
Example #31
def test_label_value():
    p = g + facet_wrap('~ gear', labeller='label_value')

    assert p == 'label_value'
Example #32
def test_facet_wrap_one_var():
    p = g + facet_wrap('~var1')
    assert p == 'facet_wrap_one_var'
Example #33
def test_label_context():
    p = g + facet_wrap('~ gear', labeller='label_context')

    assert p == 'label_context'
Example #34
def test_facet_wrap_expression():
    p = g + facet_wrap('pd.cut(var1, (0, 2, 4), include_lowest=True)')
    assert p == 'facet_wrap_expression'
Example #35
def test_labeller_cols_both_wrap():
    p = g + facet_wrap('~ gear + am', labeller=labeller_cols_both)

    assert p == 'labeller_cols_both_wrap'
Example #36
def test_facet_wrap_two_vars():
    p = g + facet_wrap('~var1+var2')
    assert p == 'facet_wrap_two_vars'
Example #37
def test_aslabeller_dict_0tag():
    func = as_labeller({'0': '<tag>0</tag>'})
    p = g + facet_wrap('~ gear + am', labeller=func)

    assert p == 'aslabeller_dict_0tag'
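as_labeller wraps a plain function or a dict into a labeller, so strip labels can be remapped arbitrarily. A stand-alone sketch (import path per the plotnine facets API and possibly version-dependent; mtcars stands in for the fixture `g`):

from plotnine import ggplot, aes, geom_point, facet_wrap
from plotnine.facets import as_labeller  # may also be re-exported at top level
from plotnine.data import mtcars

g = ggplot(mtcars, aes('wt', 'mpg')) + geom_point()

# A dict labeller remaps specific values; a function labeller
# transforms every label it is given.
relabel = as_labeller({'0': 'automatic', '1': 'manual'})
print(g + facet_wrap('~ am', labeller=relabel))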
Example #38
def test_facet_wrap_label_both():
    p = g + facet_wrap('~var1+var2', labeller='label_both')
    assert p == 'facet_wrap_label_both'
Example #39
    def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False, columns=False):
        if self.y_max is not None:
            limits = [0, float(self.y_max)]
            eprint(f'Setting limits to: {limits}')
        else:
            limits = [0, 1]
        if expo:
            if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
                with open('data/external/all_human_gameplay.json') as f:
                    all_gameplay = json.load(f)
                    frames = []
                    for event, name in [('parents', 'Intermediate'), ('maryland', 'Expert'), ('live', 'National')]:
                        if self.merge_humans:
                            name = 'Human'
                        gameplay = all_gameplay[event]
                        if event != 'live':
                            control_correct_positions = gameplay['control_correct_positions']
                            control_wrong_positions = gameplay['control_wrong_positions']
                            control_positions = control_correct_positions + control_wrong_positions
                            control_positions = np.array(control_positions)
                            control_result = np.array(len(control_correct_positions) * [1] + len(control_wrong_positions) * [0])
                            argsort_control = np.argsort(control_positions)
                            control_x = control_positions[argsort_control]
                            control_sorted_result = control_result[argsort_control]
                            control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0]
                            control_df = pd.DataFrame({'correct': control_y, 'char_percent': control_x})
                            control_df['Dataset'] = 'Regular Test'
                            control_df['Guessing_Model'] = f' {name}'
                            frames.append(control_df)

                        adv_correct_positions = gameplay['adv_correct_positions']
                        adv_wrong_positions = gameplay['adv_wrong_positions']
                        adv_positions = adv_correct_positions + adv_wrong_positions
                        adv_positions = np.array(adv_positions)
                        adv_result = np.array(len(adv_correct_positions) * [1] + len(adv_wrong_positions) * [0])
                        argsort_adv = np.argsort(adv_positions)
                        adv_x = adv_positions[argsort_adv]
                        adv_sorted_result = adv_result[argsort_adv]
                        adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                        adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x})
                        adv_df['Dataset'] = 'IR Adversarial'
                        adv_df['Guessing_Model'] = f' {name}'
                        frames.append(adv_df)

                        if len(gameplay['advneural_correct_positions']) > 0:
                            adv_correct_positions = gameplay['advneural_correct_positions']
                            adv_wrong_positions = gameplay['advneural_wrong_positions']
                            adv_positions = adv_correct_positions + adv_wrong_positions
                            adv_positions = np.array(adv_positions)
                            adv_result = np.array(len(adv_correct_positions) * [1] + len(adv_wrong_positions) * [0])
                            argsort_adv = np.argsort(adv_positions)
                            adv_x = adv_positions[argsort_adv]
                            adv_sorted_result = adv_result[argsort_adv]
                            adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                            adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x})
                            adv_df['Dataset'] = 'RNN Adversarial'
                            adv_df['Guessing_Model'] = f' {name}'
                            frames.append(adv_df)

                    human_df = pd.concat(frames)
                    human_vals = sort_humans(list(human_df['Guessing_Model'].unique()))
                    human_dtype = CategoricalDtype(human_vals, ordered=True)
                    human_df['Guessing_Model'] = human_df['Guessing_Model'].astype(human_dtype)
                    dataset_dtype = CategoricalDtype(['Regular Test', 'IR Adversarial', 'RNN Adversarial'], ordered=True)
                    human_df['Dataset'] = human_df['Dataset'].astype(dataset_dtype)

            if no_models:
                # human_df is built above; this branch assumes the gameplay file existed
                p = ggplot(human_df) + geom_point(shape='.')
            else:
                df = self.char_plot_df
                if 1 not in self.rounds:
                    df = df[df['Dataset'] != 'Round 1 - IR Adversarial']
                if 2 not in self.rounds:
                    df = df[df['Dataset'] != 'Round 2 - IR Adversarial']
                    df = df[df['Dataset'] != 'Round 2 - RNN Adversarial']
                p = ggplot(df)
                if self.save_df is not None:
                    eprint(f'Saving df to: {self.save_df}')
                    df.to_json(self.save_df)

                if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
                    eprint('Loading human data')
                    p = p + geom_line(data=human_df)

            if columns:
                facet_conf = facet_wrap('Guessing_Model', ncol=1)
            else:
                facet_conf = facet_wrap('Guessing_Model', nrow=1)

            if not no_models:
                if self.mvg_avg_char:
                    chart = stat_smooth(method='mavg', se=False, method_args={'window': 400})
                else:
                    chart = stat_summary_bin(fun_data=mean_no_se, bins=20, shape='.', linetype='None', size=0.5)
            else:
                chart = None

            p = (
                p + facet_conf
                + aes(x='char_percent', y='correct', color='Dataset')
            )
            if chart is not None:
                p += chart
            p = (
                p
                + scale_y_continuous(breaks=np.linspace(0, 1, 6))
                + scale_x_continuous(breaks=[0, .5, 1])
                + coord_cartesian(ylim=limits)
                + xlab('Percent of Question Revealed')
                + ylab('Accuracy')
                + theme(
                    #legend_position='top', legend_box_margin=0, legend_title=element_blank(),
                    strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5})
                )
                + scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'], name='Questions')
            )
            if self.title != '':
                p += ggtitle(self.title)

            return p
        else:
            if self.save_df is not None:
                eprint(f'Saving df to: {self.save_df}')
                self.char_plot_df.to_json(self.save_df)
            return (
                ggplot(self.char_plot_df)
                + aes(x='char_percent', y='correct', color='Guessing_Model')
                + stat_smooth(method='mavg', se=False, method_args={'window': 500})
                + scale_y_continuous(breaks=np.linspace(0, 1, 6))
                + coord_cartesian(ylim=limits)
            )
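The human-gameplay blocks above repeat one idiom: pool correct and wrong buzz positions, sort them, and take a running mean of correctness to get cumulative accuracy versus question position. A standalone sketch of that step (the function name is illustrative):

import numpy as np
import pandas as pd

def cumulative_accuracy(correct_positions, wrong_positions):
    # 1 for each correct answer, 0 for each wrong one, ordered by position.
    positions = np.array(correct_positions + wrong_positions)
    results = np.array([1] * len(correct_positions) + [0] * len(wrong_positions))
    order = np.argsort(positions)
    running = results[order].cumsum() / len(results)
    return pd.DataFrame({'char_percent': positions[order], 'correct': running})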
Example #40
0
def test_facet_wrap_not_as_table():
    p = g + facet_wrap('~var1', as_table=False)
    assert p == 'facet_wrap_not_as_table'
Example #41
0
 def plot_n_train_vs_accuracy(self):
     return (
         ggplot(self.combined_df) + facet_wrap('seen')
         + aes(x='n_train', fill='Outcome')
         + geom_histogram(binwidth=1)
     )
Example #42
0
def test_facet_wrap_direction_v():
    p = g + facet_wrap('~var1', dir='v')
    assert p == 'facet_wrap_direction_v'
Example #43
0
def test_facet_wrap_one_var():
    p = g + facet_wrap('~var1')
    p2 = g + facet_wrap('~class')  # python keyword in formula
    assert p == 'facet_wrap_one_var'
    assert p2 == 'facet_wrap_one_var'
Example #44
0
def test_facet_wrap_not_as_table_direction_v():
    p = g + facet_wrap('~var1', as_table=False, dir='v')
    assert p == 'facet_wrap_not_as_table_direction_v'
Example #45
0
def test_facet_wrap_two_vars():
    p = g + facet_wrap('~var1+var2')
    p2 = g + facet_wrap('~class+var2')  # python keyword in formula
    assert p == 'facet_wrap_two_vars'
    assert p2 == 'facet_wrap_two_vars'
Example #46
0
dfwords['Date'] = X_train_all.Date.values
dfwords.Speaker = [to_speaker_dict[spkr] for spkr in dfwords.Speaker]
pres_Nelson = (dfwords.query(f"Speaker in {first_presidency}").groupby(
    [dfwords['Date'].map(lambda x: x.year),
     'Speaker']).mean()[interesting_words].reset_index())
pres_Nelson['combined'] = [str(dt)
                           for dt in pres_Nelson.Date] + pres_Nelson.Speaker
pres_Nelson = (pres_Nelson.drop(
    columns=['Date', 'Speaker']).set_index('combined').unstack().reset_index())
pres_Nelson['Date'] = [int(comb[:4]) for comb in pres_Nelson.combined]
pres_Nelson['Speaker'] = [comb[4:] for comb in pres_Nelson.combined]
pres_Nelson = pres_Nelson.drop(columns='combined')
pres_Nelson.columns = ['Word', 'Mean TF-IDF Score', 'Date', 'Speaker']
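The 'combined'-column string surgery above can be sidestepped; a sketch of the same wide-to-long reshape using melt (variable names come from the snippet itself):

pres_long = (dfwords.query(f"Speaker in {first_presidency}")
             .groupby([dfwords['Date'].map(lambda x: x.year), 'Speaker'])
             .mean()[interesting_words]
             .reset_index()
             .melt(id_vars=['Date', 'Speaker'],
                   var_name='Word', value_name='Mean TF-IDF Score'))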

(ggplot(pres_Nelson, aes(x='Date', y='Mean TF-IDF Score', color='Word')) +
 geom_line() + facet_wrap('Speaker', scales='free', nrow=3) +
 labs(title='Word Usage Change over Time in First Presidency'))

int_words = (dfwords.groupby(dfwords['Date'].map(lambda x: x.year)).mean()
             [interesting_words].unstack().reset_index())
int_words.columns = ['Word', 'Date', 'Mean TF-IDF Score']
(ggplot(int_words, aes(x='Date', y='Mean TF-IDF Score', color='Word')) +
 geom_line() +
 labs(title='Word Usage Change over Time in First Presidency and the 12'))

missionary_temple = (dfwords.groupby(
    dfwords['Date'].map(lambda x: x.year)).mean()[[
        'missionary work', 'family history'
    ]].unstack().reset_index())
missionary_temple.columns = ['Word', 'Date', 'Mean TF-IDF Score']
(ggplot(missionary_temple, aes(x='Date', y='Mean TF-IDF Score',
Example #47
0
def test_facet_wrap_not_as_table():
    p = g + facet_wrap('~var1', as_table=False)
    assert p == 'facet_wrap_not_as_table'
Example #48
0
    def plot_char_percent_vs_accuracy_smooth(self, expo=False):
        if expo:
            p = (ggplot(self.char_plot_df) +
                 facet_wrap('Guessing_Model', nrow=1) +
                 aes(x='char_percent', y='correct', color='Dataset') +
                 stat_smooth(
                     method='mavg', se=False, method_args={'window': 200}) +
                 scale_y_continuous(breaks=np.linspace(0, 1, 11)) +
                 scale_x_continuous(breaks=[0, .5, 1]) +
                 xlab('Percent of Question Revealed') + ylab('Accuracy') +
                 theme(legend_position='top'))
            if os.path.exists('data/external/human_gameplay.json'):
                with open('data/external/human_gameplay.json') as f:
                    gameplay = json.load(f)
                    control_correct_positions = gameplay[
                        'control_correct_positions']
                    control_wrong_positions = gameplay[
                        'control_wrong_positions']
                    control_positions = control_correct_positions + control_wrong_positions
                    control_positions = np.array(control_positions)
                    control_result = np.array(
                        len(control_correct_positions) * [1] +
                        len(control_wrong_positions) * [0])
                    argsort_control = np.argsort(control_positions)
                    control_x = control_positions[argsort_control]
                    control_sorted_result = control_result[argsort_control]
                    control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0]
                    control_df = pd.DataFrame({
                        'correct': control_y,
                        'char_percent': control_x
                    })
                    control_df['Dataset'] = 'Test Questions'
                    control_df['Guessing_Model'] = ' Human'

                    adv_correct_positions = gameplay['adv_correct_positions']
                    adv_wrong_positions = gameplay['adv_wrong_positions']
                    adv_positions = adv_correct_positions + adv_wrong_positions
                    adv_positions = np.array(adv_positions)
                    adv_result = np.array(
                        len(adv_correct_positions) * [1] +
                        len(adv_wrong_positions) * [0])
                    argsort_adv = np.argsort(adv_positions)
                    adv_x = adv_positions[argsort_adv]
                    adv_sorted_result = adv_result[argsort_adv]
                    adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                    adv_df = pd.DataFrame({
                        'correct': adv_y,
                        'char_percent': adv_x
                    })
                    adv_df['Dataset'] = 'Challenge Questions'
                    adv_df['Guessing_Model'] = ' Human'

                    human_df = pd.concat([control_df, adv_df])
                    p = p + (geom_line(data=human_df))

            return p
        else:
            return (
                ggplot(self.char_plot_df) +
                aes(x='char_percent', y='correct', color='Guessing_Model') +
                stat_smooth(
                    method='mavg', se=False, method_args={'window': 500}) +
                scale_y_continuous(breaks=np.linspace(0, 1, 21)))
Example #49
0
def test_facet_wrap_not_as_table_direction_v():
    p = g + facet_wrap('~var1', as_table=False, dir='v')
    assert p == 'facet_wrap_not_as_table_direction_v'
Example #50
0
 def plot_n_train_vs_accuracy(self):
     return (ggplot(self.combined_df) + facet_wrap('seen') +
             aes(x='n_train', fill='Outcome') + geom_histogram(binwidth=1))
Example #51
0
#-----------------------------
#libraries https://pythonplot.com/#bar-count

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# pip install plotnine  # similar to R's ggplot2
# https://plotnine.readthedocs.io/en/stable/index.html
import plotnine  # ggplot-style plotting

from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
help(plotnine.facet_wrap)
from plotnine.data import mtcars, mpg  # mpg is used in the bar chart below

(ggplot(mtcars, aes('wt', 'mpg', color='factor(gear)')) + geom_point() +
 stat_smooth(method='lm') + facet_wrap('~gear'))

from plotnine import *
(ggplot(mtcars, aes('factor(cyl)', fill='factor(am)')) +
 geom_bar(position='fill'))

(ggplot(mtcars, aes('factor(cyl)', fill='factor(am)')) +
 geom_bar(position='fill') +
 geom_text(aes(label='stat(count)'), stat='count', position='fill'))

(ggplot(mpg) + aes(x='manufacturer') + geom_bar(size=20) + coord_flip() +
 labs(y='Count', x='Manufacturer', title='Number of Cars by Make'))
#https://plotnine.readthedocs.io/en/stable/tutorials/miscellaneous-order-plot-series.html

from pydataset import data
data()
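Building on the wildcard import above, one natural extension (a sketch, not in the original snippet) is to facet the proportional bar chart by gear:

(ggplot(mtcars, aes('factor(cyl)', fill='factor(am)'))
 + geom_bar(position='fill')
 + facet_wrap('~gear')
 + labs(x='Cylinders', y='Proportion'))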
Example #52
0
def control_list(in_file=None,
                 out_dir=None,
                 reference_gene_file=None,
                 log2=False,
                 page_width=None,
                 page_height=None,
                 user_img_file=None,
                 page_format=None,
                 pseudo_count=1,
                 set_colors=None,
                 dpi=300,
                 rug=False,
                 jitter=False,
                 skip_first=False):
    # -------------------------------------------------------------------------
    #
    # Check in_file content
    #
    # -------------------------------------------------------------------------

    for p, line in enumerate(in_file):

        line = chomp(line)
        line = line.split("\t")

        if len(line) != 2:
            message("Need a two-column file.",
                    type="ERROR")
        if skip_first:
            if p == 0:
                continue
        try:
            fl = float(line[1])
        except ValueError:
            msg = "It seems that column 2 of input file"
            msg += " contains non numeric values. "
            msg += "Check that no header is present and that "
            msg += "columns are ordered properly. "
            msg += "Or use '--skip-first'. "
            message(msg, type="ERROR")

        if log2:
            fl = fl + pseudo_count
            if fl <= 0:
                message("Can not log transform negative/zero values. Add a pseudo-count.",
                        type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Check colors
    #
    # -------------------------------------------------------------------------

    set_colors = set_colors.split(",")

    if len(set_colors) != 2:
        message("Need two colors. Please fix.", type="ERROR")

    mcolors_name = mcolors.cnames

    for i in set_colors:
        if i not in mcolors_name:
            if not is_hex_color(i):
                message(i + " is not a valid color. Please fix.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Preparing output files
    #
    # -------------------------------------------------------------------------

    # Preparing pdf file name
    file_out_list = make_outdir_and_file(out_dir, ["control_list.txt",
                                                   "reference_list.txt",
                                                   "diagnostic_diagrams." + page_format],
                                         force=True)

    control_file, reference_file_out, img_file = file_out_list

    if user_img_file is not None:

        os.unlink(img_file.name)
        img_file = user_img_file

        if not img_file.name.endswith(page_format):
            msg = "Image format should be: {f}. Please fix.".format(f=page_format)
            message(msg, type="ERROR")

        test_path = os.path.abspath(img_file.name)
        test_path = os.path.dirname(test_path)

        if not os.path.exists(test_path):
            os.makedirs(test_path)

    # -------------------------------------------------------------------------
    #
    # Read the reference list
    #
    # -------------------------------------------------------------------------

    try:
        reference_genes = pd.read_csv(reference_gene_file.name, sep="\t", header=None)
    except pd.errors.EmptyDataError:
        message("No genes in --reference-gene-file.", type="ERROR")

    reference_genes.rename(columns={reference_genes.columns.values[0]: 'gene'}, inplace=True)

    # -------------------------------------------------------------------------
    #
    # Delete duplicates
    #
    # -------------------------------------------------------------------------

    before = len(reference_genes)
    reference_genes = reference_genes.drop_duplicates(['gene'])
    after = len(reference_genes)

    msg = "%d duplicate lines have been deleted in reference file."
    message(msg % (before - after))

    # -------------------------------------------------------------------------
    #
    # Read expression data and add the pseudo_count
    #
    # -------------------------------------------------------------------------

    if skip_first:
        exp_data = pd.read_csv(in_file.name, sep="\t",
                               header=None, index_col=None,
                               skiprows=[0], names=['exprs'])
    else:

        exp_data = pd.read_csv(in_file.name, sep="\t", names=['exprs'], index_col=0)

    exp_data.exprs = exp_data.exprs.values + pseudo_count

    # -------------------------------------------------------------------------
    #
    # log transformation
    #
    # -------------------------------------------------------------------------

    ylabel = 'Expression'

    if log2:
        if len(exp_data.exprs.values[exp_data.exprs.values == 0]):
            message("Can't use log transformation on zero or negative values. Use -p.",
                    type="ERROR")
        else:
            exp_data.exprs = np.log2(exp_data.exprs.values)
            ylabel = 'log2(Expression)'

    # -------------------------------------------------------------------------
    #
    # Are reference gene found in control list
    #
    # -------------------------------------------------------------------------

    # Sort in increasing order
    exp_data = exp_data.sort_values('exprs')

    #  Vector with positions indicating which in the
    # expression data list are found in reference_gene

    reference_genes_found = [x for x in reference_genes['gene'] if x in exp_data.index]

    msg = "Found %d genes of the reference in the provided signal file" % len(reference_genes_found)
    message(msg)

    not_found = [x for x in reference_genes['gene'] if x not in exp_data.index]

    if len(not_found):
        if len(not_found) == len(reference_genes):
            message("Genes from reference file where not found in signal file (n=%d)." % len(not_found), type="ERROR")
        else:
            message("List of reference genes not found :%s" % not_found)
    else:
        message("All reference genes were found.")

    # -------------------------------------------------------------------------
    #
    # Search for genes with matched signal
    #
    # -------------------------------------------------------------------------

    exp_data_save = exp_data.copy()

    control_list = list()

    nb_candidate_left = exp_data.shape[0] - len(reference_genes_found)

    message("Searching for genes with matched signal.")

    if nb_candidate_left < len(reference_genes_found):
        message("Not enough element to perform selection. Exiting", type="ERROR")

    for i in reference_genes_found:
        not_candidates = reference_genes_found + control_list
        not_candidates = list(set(not_candidates))

        diff = abs(exp_data.loc[i] - exp_data)
        control_list.extend(diff.loc[np.setdiff1d(diff.index, not_candidates)].idxmin(axis=0, skipna=True).tolist())

    # -------------------------------------------------------------------------
    #
    # Prepare a dataframe for plotting
    #
    # -------------------------------------------------------------------------

    message("Preparing a dataframe for plotting.")

    reference = exp_data_save.loc[reference_genes_found].sort_values('exprs')
    reference = reference.assign(genesets=['Reference'] * reference.shape[0])

    control = exp_data_save.loc[control_list].sort_values('exprs')
    control = control.assign(genesets=['Control'] * control.shape[0])

    data = pd.concat([reference, control])
    data['sets'] = pd.Series(['sets' for x in data.index.tolist()], index=data.index)
    data['genesets'] = Categorical(data['genesets'])

    # -------------------------------------------------------------------------
    #
    # Diagnostic plots
    #
    # -------------------------------------------------------------------------

    p = ggplot(data, aes(x='sets', y='exprs', fill='genesets'))

    p += scale_fill_manual(values=dict(zip(['Reference', 'Control'], set_colors)))

    p += geom_violin(color=None)

    p += xlab('Gene sets') + ylab(ylabel)

    p += facet_wrap('~genesets')

    if rug:
        p += geom_rug()

    if jitter:
        p += geom_jitter()

    p += theme_bw()
    p += theme(axis_text_x=element_blank())

    # -------------------------------------------------------------------------
    # Turn warnings off. Both pandas and plotnine use warnings for deprecated
    # functions. I need to turn them off although I'm not really satisfied with
    # this solution...
    # -------------------------------------------------------------------------

    def fxn():
        warnings.warn("deprecated", DeprecationWarning)

    # -------------------------------------------------------------------------
    #
    # Saving
    #
    # -------------------------------------------------------------------------

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fxn()
        message("Saving diagram to file : " + img_file.name)
        message("Be patient. This may be long for large datasets.")

        try:
            p.save(filename=img_file.name, width=page_width, height=page_height, dpi=dpi, limitsize=False)
        except PlotnineError as err:
            message("Plotnine message: " + err.message)
            message("Plotnine encountered an error.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # write results
    #
    # -------------------------------------------------------------------------

    exp_data_save.loc[reference_genes_found].sort_values('exprs').to_csv(reference_file_out.name, sep="\t")
    exp_data_save.loc[control_list].sort_values('exprs').to_csv(control_file.name, sep="\t")
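A minimal, self-contained sketch of the diagnostic plot assembled above, on synthetic data (column names mirror the function's):

import numpy as np
import pandas as pd
from plotnine import (ggplot, aes, geom_violin, facet_wrap, xlab, ylab,
                      theme_bw, theme, element_blank)

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    'exprs': np.concatenate([rng.normal(5, 1, 100), rng.normal(5, 1, 100)]),
    'genesets': ['Reference'] * 100 + ['Control'] * 100,
    'sets': 'sets',  # single dummy x position, as in control_list
})

p = (ggplot(toy, aes(x='sets', y='exprs', fill='genesets'))
     + geom_violin(color=None)
     + facet_wrap('~genesets')
     + xlab('Gene sets') + ylab('Expression')
     + theme_bw()
     + theme(axis_text_x=element_blank()))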
Example #53
0
        ('probability', pipeline.predict_proba(X)[:, 1])
    ])
    predict_df = predict_df.append(df)

predict_df['probability_str'] = predict_df['probability'].apply('{:.1%}'.format)


# In[27]:

# Top predictions amongst negatives (potential hidden responders to a targeted cancer therapy)
(predict_df
    .sort_values('decision_function', ascending=False)
    .query("status == 0 and feature_set == 'full'")
    .head(10)
)


# In[28]:

predict_df['status_'] = predict_df['status'].map(
    lambda x: 'negative' if x == 0 else 'positive')

(gg.ggplot(predict_df, gg.aes(x='probability', 
                              fill='status_'))
 + gg.geom_density(alpha=0.6)
 + gg.facet_wrap('~feature_set', ncol=1)
 + gg.labs(x='probability', y='density')
 + gg.guides(fill=gg.guide_legend(title=""))
 + theme_cognoma())

Example #54
0
    def plot(df: 'DataFrame',
             group_colname: str = None,
             time_colname: str = None,
             max_num_groups: int = 1,
             split_dt: Optional[np.datetime64] = None,
             **kwargs) -> 'DataFrame':
        """
        :param df: The output of `.to_dataframe()`.
        :param group_colname: The name of the group-column.
        :param time_colname: The name of the time-column.
        :param max_num_groups: Max. number of groups to plot; if the number of groups in the dataframe is greater than
        this, a random subset will be taken.
        :param split_dt: If supplied, will draw a vertical line at this date (useful for showing pre/post validation).
        :param kwargs: Further keyword arguments to pass to `plotnine.theme` (e.g. `figure_size=(x,y)`)
        :return: A plot of the predicted and actual values.
        """

        from plotnine import (
            ggplot, aes, geom_line, geom_ribbon, facet_grid, facet_wrap, theme_bw, theme, ylab, geom_vline
        )

        is_components = ('process' in df.columns and 'state_element' in df.columns)

        if group_colname is None:
            group_colname = 'group'
            if group_colname not in df.columns:
                raise TypeError("Please specify group_colname")
        if time_colname is None:
            time_colname = 'time'
            if 'time' not in df.columns:
                raise TypeError("Please specify time_colname")

        df = df.copy()
        if df[group_colname].nunique() > max_num_groups:
            subset_groups = df[group_colname].drop_duplicates().sample(max_num_groups).tolist()
            if len(subset_groups) < df[group_colname].nunique():
                print("Subsetting to groups: {}".format(subset_groups))
            df = df.loc[df[group_colname].isin(subset_groups), :]
        num_groups = df[group_colname].nunique()

        aes_kwargs = {'x': time_colname}
        if is_components:
            aes_kwargs['group'] = 'state_element'

        plot = (
                ggplot(df, aes(**aes_kwargs)) +
                geom_line(aes(y='mean'), color='#4C6FE7', size=1.5, alpha=.75) +
                geom_ribbon(aes(ymin='lower', ymax='upper'), color=None, alpha=.25) +
                ylab("")
        )

        if is_components:
            num_processes = df['process'].nunique()
            if num_groups > 1 and num_processes > 1:
                raise ValueError("Cannot plot components for > 1 group and > 1 processes.")
            elif num_groups == 1:
                plot = plot + facet_wrap(f"~ measure + process", scales='free_y', labeller='label_both')
                if 'figure_size' not in kwargs:
                    from plotnine.facets.facet_wrap import n2mfrow
                    nrow, _ = n2mfrow(len(df[['process', 'measure']].drop_duplicates().index))
                    kwargs['figure_size'] = (12, nrow * 2.5)
            else:
                plot = plot + facet_grid(f"{group_colname} ~ measure", scales='free_y', labeller='label_both')
                if 'figure_size' not in kwargs:
                    kwargs['figure_size'] = (12, num_groups * 2.5)

            if (df.groupby('measure')['process'].nunique() <= 1).all():
                plot = plot + geom_line(aes(y='mean', color='state_element'), size=1.5)

        else:
            if 'actual' in df.columns:
                plot = plot + geom_line(aes(y='actual'))
            if num_groups > 1:
                plot = plot + facet_grid(f"{group_colname} ~ measure", scales='free_y', labeller='label_both')
            else:
                plot = plot + facet_wrap("~measure", scales='free_y', labeller='label_both')

            if 'figure_size' not in kwargs:
                kwargs['figure_size'] = (12, 5)

        if split_dt:
            plot = plot + geom_vline(xintercept=np.datetime64(split_dt), linetype='dashed')

        return plot + theme_bw() + theme(**kwargs)
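A hedged usage sketch of this method, assuming it lives on a model class (here called StateSpaceModel, purely illustrative) and that df carries the documented columns (group, time, mean, lower, upper, actual):

import numpy as np

df = predictions.to_dataframe()  # 'predictions' is a stand-in for the real forecast object
p = StateSpaceModel.plot(df,
                         group_colname='group',
                         time_colname='time',
                         max_num_groups=4,
                         split_dt=np.datetime64('2020-06-01'),
                         figure_size=(12, 10))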
Example #55
0
                                   x['k'],
                                   x['resubAccuracy'],
                                   x['testAccuracy'])
                                  for x in repeatedKnnResults],
                                 columns = ['p',
                                            'k',
                                            'resubAccuracy',
                                            'testAccuracy'])

ggdata = pd.concat(
    [DataFrame({'p' : knnResultsSimplified.p,
                'k' : knnResultsSimplified.k.apply(int),
                'type' : 'resub',
                'Accuracy' : knnResultsSimplified.resubAccuracy}),
     DataFrame({'p' : knnResultsSimplified.p,
                'k' : knnResultsSimplified.k.apply(int),
                'type' : 'test',
                'Accuracy' : knnResultsSimplified.testAccuracy})],
    axis = 0
)

plt.close()
ggo = gg.ggplot(ggdata, gg.aes(x='p', y='Accuracy',
                               color='type', group='type', linetype='type'))
ggo += gg.facet_wrap('~ k')
ggo += gg.scale_x_log10()
ggo += gg.geom_point(alpha=0.6)
ggo += gg.stat_smooth()
ggo += gg.theme_bw()
print(ggo)
Example #56
0
p9.ggplot(data=df_p4k, mapping=p9.aes(x="score")) + p9.geom_density()

# In[6]:

df_p4k_sum = df_p4k.groupby("genre").mean(numeric_only=True)
df_p4k_sum

# In[25]:

df_p4k_best = df_p4k[df_p4k['best'] == 1]
p9.ggplot(data=df_p4k_best, mapping=p9.aes(x="score")) + p9.geom_density()

# In[26]:

p9.ggplot(data=df_p4k, mapping=p9.aes(
    x="score")) + p9.facet_wrap("~genre") + p9.geom_density()

# Word Clouds of the review

# In[23]:

wc_text = " ".join(df_p4k['review'].head(10).as_matrix().astype('str'))
wc_text = " ".join(stripNonAlphaNum(wc_text))
p4k_wordcloud = WordCloud().generate(wc_text)
wordcloud = WordCloud(max_font_size=40).generate(wc_text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")

# Split word clouds by genre

# In[8]: