def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Build a per-task density histogram of column ``x`` with mean markers.

    ``len_df`` is read through the columns ``Task``, ``Method`` and ``x``.
    One panel is drawn per ``Task``; within a panel the histograms of the
    different ``Method`` values are overlaid.  For every (Task, Method)
    pair the mean of ``x`` is marked with a dashed vertical segment and
    printed above it with one decimal.

    ``legend_position`` and ``legend_box`` are forwarded to ``theme``.
    Returns the assembled ggplot object.
    """
    # Mean of every remaining column per (Task, Method) pair.
    group_means = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # Constant column mapped to the linetype aesthetic of the mean marker.
    group_means[' '] = 'Mean Length'

    histogram_layer = geom_histogram(
        binwidth=2, position='identity', alpha=.6)
    mean_label_layer = geom_text(
        aes(x='x', y=.22, label='x', color='Method'),
        group_means,
        inherit_aes=False,
        format_string='{:.1f}',
        show_legend=False,
    )
    mean_marker_layer = geom_segment(
        aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
        group_means,
        inherit_aes=False,
        color='black',
    )

    figure = ggplot(len_df) + aes(x='x', fill='Method', y='..density..')
    figure = figure + histogram_layer + mean_label_layer + mean_marker_layer
    figure = figure + scale_linetype_manual(['dashed']) + facet_wrap('Task')
    figure = figure + xlim(0, 20) + ylim(0, .23)
    figure = figure + xlab('Example Length') + ylab('Frequency')
    figure = (figure
              + scale_color_manual(values=COLORS)
              + scale_fill_manual(values=COLORS))
    figure = figure + theme_fs() + theme(
        aspect_ratio=1,
        legend_title=element_blank(),
        legend_position=legend_position,
        legend_box=legend_box,
    )
    return figure
def show_community_prediction(
    self,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    num_samples: int = 1000,
):
    """
    Plot samples from the community prediction on this question

    :param percent_kept: percentage of sample distribution to keep
    :param side_cut_from: which side to cut tails from,
        either 'both','lower', or 'upper'
    :param num_samples: number of samples from the community
    :return: ggplot graphics object
    """
    community_samples = pd.DataFrame(
        data={
            "samples": [self.sample_community() for _ in range(0, num_samples)]
        }  # type: ignore
    )

    # Plot domain that keeps the requested share of the samples.
    (_xmin, _xmax) = self.get_central_quantiles(
        community_samples,
        percent_kept=percent_kept,
        side_cut_from=side_cut_from,
    )

    # Use the short name when there is one, otherwise the wrapped full title.
    question_title = (
        f"Q: {self.name}"
        if self.name
        else "\n".join(textwrap.wrap(self.data["title"], 60))  # type: ignore
    )
    # The blank-line separator must follow BOTH title variants.  It used to
    # be attached to the ``else`` arm only (a conditional expression binds
    # looser than ``+``), so named questions were rendered as
    # "Q: <name>Community Predictions".
    plot_title = question_title + "\n\n" + "Community Predictions"

    return (
        ggplot(community_samples, aes("samples"))
        + geom_density(fill="#b3cde3", alpha=0.8)
        + xlim(_xmin, _xmax)
        + self._scale_x()
        + labs(x="Prediction", y="Density", title=plot_title)
        + ergo_theme
    )
def test_changing_xlim_in_stat_density():
    """Building a density plot with x limits narrower than the data works."""
    lower, upper = 5, 10
    num_points = 100
    # The data deliberately extends one unit beyond each limit.
    xs = np.linspace(lower - 1, upper + 1, num_points)
    data = pd.DataFrame({'x': xs})

    plot = ggplot(data, aes('x'))
    plot = plot + stat_density()
    plot = plot + xlim(lower, upper)

    # No exceptions
    plot._build()
def test_coord_trans_backtransforms():
    """Line with infinite x values, limited to [1, 2] under a log10 coord.

    The final comparison against a string presumably relies on the test
    suite's image-comparison override of ``==`` -- confirm in conftest.
    """
    data = pd.DataFrame({'x': [-np.inf, np.inf], 'y': [1, 2]})

    plot = ggplot(data, aes('x', 'y'))
    plot = plot + geom_line(size=2)
    plot = plot + xlim(1, 2)
    plot = plot + coord_trans(x='log10')

    assert plot == 'coord_trans_backtransform'
def test_changing_xlim_in_stat_density():
    """Density plot with x limits narrower than the data builds with a warning."""
    lower, upper = 5, 10
    num_points = 100
    # The data deliberately extends one unit beyond each limit.
    data = pd.DataFrame(
        {'x': np.linspace(lower - 1, upper + 1, num_points)})

    plot = ggplot(data, aes('x'))
    plot = plot + stat_density()
    plot = plot + xlim(lower, upper)

    # No exceptions
    with pytest.warns(PlotnineWarning):
        # warns about removed points.
        plot._build()
def test_changing_xlim_in_stat_density():
    """stat_density must build when xlim cuts off part of the data."""
    limits = (5, 10)
    sample_count = 100
    # One unit of data lies outside the limits on either side.
    start = limits[0] - 1
    stop = limits[1] + 1
    frame = pd.DataFrame({'x': np.linspace(start, stop, sample_count)})

    density_plot = (
        ggplot(frame, aes('x'))
        + stat_density()
        + xlim(limits[0], limits[1])
    )

    # No exceptions
    density_plot._build()
def scatterplot(cls, df):
    """Save faceted scatterplots of ``Mean`` with 95% confidence error bars.

    Rows whose ``index`` is 'Overall' or 'No ROI' are dropped, the rest is
    sorted by ``Mean`` within each (config.table_cols, config.table_rows)
    group, and up to two figures are written to ``Figures/Scatterplots``:

    * ``roi_ordered``  -- rows ordered by the ``index`` values in order of
      first appearance in the sorted table;
    * ``stat_ordered`` -- rows ordered individually inside each facet.

    ``config.table_row_order`` restricts the output: 'roi' keeps only the
    ROI-ordered figure, 'statorder' keeps only the stat-ordered one.
    """
    Utils.check_and_make_dir("Figures/Scatterplots")

    # Remove No ROI and Overall rows
    df = df[(df['index'] != 'Overall') & (df['index'] != 'No ROI')]
    # Group by parameters and sort
    df = df.groupby([config.table_cols, config.table_rows]).apply(
        lambda x: x.sort_values(['Mean']))
    # Reset index to remove grouping
    df = df.reset_index(drop=True)

    scatterplots = ['roi_ordered', 'stat_ordered']
    if config.table_row_order == 'roi':
        # Was remove('stat'), which is not in the list and raised ValueError.
        scatterplots.remove('stat_ordered')
    elif config.table_row_order == 'statorder':
        scatterplots.remove('roi_ordered')

    for plot_name in scatterplots:
        if config.verbose:
            print(f"Saving {plot_name} scatterplot!")

        if plot_name == 'roi_ordered':
            # Order rows based on first facet
            roi_ord = pd.Categorical(df['index'],
                                     categories=df['index'].unique())
        else:
            # Order each facet individually
            # NOTE(review): the grouping columns are hard-coded here while
            # the facets use config.table_rows/config.table_cols -- confirm
            # they are always 'MB' and 'SENSE'.
            roi_ord = pd.Categorical(df.groupby(['MB', 'SENSE']).cumcount())

        figure_table = (
            pltn.ggplot(df, pltn.aes(x="Mean", y=roi_ord))
            + pltn.geom_point(na_rm=True, size=1)
            + pltn.geom_errorbarh(
                pltn.aes(xmin="Mean-Conf_Int_95", xmax="Mean+Conf_Int_95"),
                na_rm=True,
                height=None)
            + pltn.xlim(0, None)
            + pltn.scale_y_discrete(labels=[])
            + pltn.ylab(config.table_y_label)
            + pltn.xlab(config.table_x_label)
            + pltn.facet_grid(
                '{rows}~{cols}'.format(rows=config.table_rows,
                                       cols=config.table_cols),
                drop=True,
                labeller="label_both")
            + pltn.theme_538()  # Set theme
            + pltn.theme(
                panel_grid_major_y=pltn.themes.element_line(alpha=0),
                panel_grid_major_x=pltn.themes.element_line(alpha=1),
                panel_background=pltn.element_rect(fill="gray", alpha=0.1),
                dpi=config.plot_dpi))

        figure_table.save(
            f"Figures/Scatterplots/{plot_name}_scatterplot.png",
            height=config.plot_scale,
            width=config.plot_scale * 3,
            verbose=False,
            limitsize=False)
def density_plot1(num_matches_per_round: int,
                  match_lengths_from_one_round: list):
    """
    Density plot for match lengths, new rules, no blowouts, 85 matches/round

    Writes the figure to 'figures/match_length_density_plot.png'.
    ``num_matches_per_round`` is accepted but not used by this function.
    """
    lengths_frame = pd.DataFrame(
        {'Match length': match_lengths_from_one_round})

    figure = plt.ggplot(lengths_frame, plt.aes(x='Match length'))
    figure = figure + plt.geom_density()
    # Thick reference line at a match length of 50.
    figure = figure + plt.geom_vline(xintercept=50, color='black', size=2)
    figure = figure + plt.theme_classic()
    figure = figure + plt.xlim([0, 55])

    figure.save(filename='figures/match_length_density_plot.png')
def ggimg(image, mapping=None, data=None, dpi=80):
    """Return a ggplot canvas whose axes span the size of ``image``.

    ``image.size`` is unpacked as ``(width, height)``.  The y axis is
    reversed and runs from 0 to the height, the x axis runs from 0 to the
    width, and a black unfilled rectangle frames the whole area.
    ``mapping`` and ``data`` are passed to ``ggplot``; ``dpi`` is passed
    to ``theme_image``.
    """
    width, height = image.size

    canvas = ggplot(mapping=mapping, data=data)
    canvas = canvas + scale_y_reverse(limits=(0, height))
    canvas = canvas + xlim(0, width)
    # removes legend for line color
    canvas = canvas + scale_color_discrete(guide=False)
    canvas = canvas + theme_image(width, height, dpi=dpi)
    # box around image
    canvas = canvas + annotate(
        "rect",
        xmin=0,
        xmax=width,
        ymin=0,
        ymax=height,
        color="black",
        fill=None,
    )
    return canvas
def plot_continuous_distribution(data_table, continuous_metric_name,
                                 segment_name, title, xlim=None):
    """Plot one density curve of a continuous metric per segment.

    Rows where ``continuous_metric_name`` is null are dropped before
    plotting.  The curves are coloured by ``segment_name``.

    :param data_table: DataFrame holding the metric and segment columns
    :param continuous_metric_name: column plotted on the x axis
    :param segment_name: column mapped to the line colour
    :param title: plot title
    :param xlim: optional x-axis limits passed to ``plot.xlim``; ``None``
        (or a scalar null such as NaN) leaves the axis unrestricted
    :return: ggplot graphics object
    """
    # The original filter AND-ed this mask with an identical copy of itself.
    # NOTE(review): the second operand was possibly meant to test
    # ``segment_name``; rows with a null segment are NOT dropped here, as
    # before -- confirm the intent.
    metric_present = pd.notnull(data_table[continuous_metric_name])
    filtered_data = data_table[metric_present]

    # NOTE(review): the legend label is set for ``fill`` although the
    # aesthetic in use is ``color`` -- kept unchanged, confirm.
    result = (
        plot.ggplot(data=filtered_data)
        + plot.aes(x=continuous_metric_name, color=segment_name)
        + plot.geom_density()
        + plot.labs(x=continuous_metric_name, title=title, fill=segment_name)
    )

    # ``if pd.notnull(xlim)`` raised "truth value of an array is ambiguous"
    # for list/array limits.  Test for None explicitly and keep the old
    # behaviour of ignoring a scalar null.
    limits_given = xlim is not None and not (
        pd.api.types.is_scalar(xlim) and pd.isnull(xlim))
    if limits_given:
        result = result + plot.xlim(xlim)

    return result
def create(self, file_path: str) -> None:
    """Render the faceted histogram of ``self._data`` and save it.

    One histogram of the ``value`` column is drawn per ``variable`` value
    (three panels per row, free scales) and written to ``file_path`` with
    a width and height of 24.
    """
    figure = ggplot(self._data, aes("value"))
    figure = figure + geom_histogram(bins=100, fill="#1e4f79")
    figure = figure + facet_wrap(facets="variable", scales="free", ncol=3)
    figure = figure + xlim(0, 1)
    figure = figure + scale_y_continuous(labels=comma_format())
    figure = figure + ggtitle("Intensity of Design Pattern Use")
    figure = figure + xlab(
        "Percentage of Classes Participating in Design Pattern")
    figure = figure + ylab("Number of Projects")
    figure = figure + theme_classic(base_size=32, base_family="Helvetica")
    figure = figure + theme(
        text=element_text(size=32),
        axis_title_y=element_text(margin={"r": 40}),
        subplots_adjust={"wspace": 0.3, "hspace": 0.5},
    )

    figure.save(file_path, width=24, height=24)
def test_inplace_add():
    """``+=`` must modify the plot (or theme) in place, never rebind it."""
    p = _p = ggplot(df)

    components_before_xlim = (
        aes('x', 'y'),
        geom_point(),
        stat_identity(),
        scale_x_continuous(),
    )
    for component in components_before_xlim:
        p += component
        assert p is _p

    with pytest.warns(PlotnineWarning):
        # Warning for; replacing existing scale added above
        p += xlim(0, 10)
    assert p is _p

    components_after_xlim = (
        lims(y=(0, 10)),
        labs(x='x'),
        coord_trans(),
        facet_null(),
        annotate('point', 5, 5, color='red', size=5),
        guides(),
        theme_gray(),
    )
    for component in components_after_xlim:
        p += component
        assert p is _p

    # The same identity guarantee holds for themes.
    th = _th = theme_gray()
    th += theme(aspect_ratio=1)
    assert th is _th
def gene_log_HR_plot(inFile, pcaFile=None, model=None):
    """Project per-component log hazard ratios back to gene space and plot.

    The ``logHR`` parameters read from ``inFile`` are split in half: the
    first half of the rows is used as the tumor components, the second
    half as the non-tumor components.  Means are mapped to gene space with
    ``pca.inverse_transform``; standard deviations are propagated through
    ``pca.components_`` as a root sum of squares.

    :param inFile: parameter file handed to ``get_params``
    :param pcaFile: pickled PCA object; defaults to ``inFile`` with
        "_params.hdf5" replaced by "_pca.pkl"
    :param model: optional object whose ``counts.index`` labels the genes;
        without it genes are labelled "1".."n"
    :return: ``(plot, logHR_df)`` -- the scatter plot of non-tumor versus
        tumor logHR and the per-gene table (logHR, sd, Z, two-sided
        p-value for both groups)
    """
    # get logHRs
    par = get_params(inFile)
    pca_components = par["means"]["logHR"].shape[0] >> 1  # half the rows
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)
    t_logHR = par["means"]["logHR"][components, 0]
    tf_logHR = par["means"]["logHR"][tf_components, 0]
    t_logHR_sd = par["stds"]["logHR"][components, 0]
    tf_logHR_sd = par["stds"]["logHR"][tf_components, 0]

    # get pca
    if pcaFile is None:
        pcaFile = inFile.replace("_params.hdf5", "_pca.pkl")
    # NOTE(review): pickle.load executes arbitrary code from the file;
    # only point this at PCA files you trust.
    with open(pcaFile, "rb") as buff:
        pca = pickle.load(buff)

    # prep dataframe
    n_genes = pca.components_.shape[1]
    if model is None:
        logHR_df = pd.DataFrame(index=[f"{i+1}" for i in range(n_genes)])
    else:
        logHR_df = pd.DataFrame(index=model.counts.index)
    logHR_df["tumor logHR"] = pca.inverse_transform(t_logHR)
    logHR_df["non-tumor logHR"] = pca.inverse_transform(tf_logHR)
    logHR_df["tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * t_logHR_sd[:, None])**2, axis=0))
    logHR_df["non-tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * tf_logHR_sd[:, None])**2, axis=0))
    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    # Each Z score uses its own standard deviation; the non-tumor score
    # used to be divided by the tumor sd.
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["non-tumor logHR sd"])
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2

    # make plot -- both axes share one range
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(pn.aes("non-tumor logHR", "tumor logHR"), logHR_df) +
          pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.theme_minimal() +
          pn.geom_point(alpha=0.3, color="red") + pn.geom_abline())
    return pl, logHR_df
def main(argv: List[str]) -> None:
    """Simulate ability rolls, print summary statistics and save a bar plot.

    :param argv: command-line arguments: the positional ``roll_rule`` plus
        the optional ``--num_iterations``, ``--seed`` and ``--plot_file``
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("roll_rule", type=RollRule, choices=list(RollRule))
    parser.add_argument("--num_iterations", type=int, default=10000)
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--plot_file", default="ability_roll_distribution.png")
    args = parser.parse_args(argv)

    if args.seed is not None:
        random.seed(args.seed)

    # Run the simulation and process the data
    roll_counts = simulate(args.roll_rule, args.num_iterations)
    data = process_data(roll_counts)

    # Calculate statistics ("percent" is on a 0-100 scale)
    mean = sum(data["value"] * data["percent"] / 100.0)
    # idxmax() returns an index LABEL, so it must be looked up with .loc.
    # The previous .iloc lookup was only right for a default 0..n-1 index.
    mode = data.loc[data["count"].idxmax(), "value"]
    stddev = math.sqrt(
        sum(data["percent"] / 100.0 * (data["value"] - mean)**2.0))
    skewness = pearson_first_skewness(mean, mode, stddev)

    # Print out result information
    print(data)
    print()
    print("Mean:", mean)
    print("Mode:", mode)
    print("Standard deviation:", stddev)
    print("Skewness:", skewness)

    # Plot the data
    plot = (plt9.ggplot(data, plt9.aes("value", "percent")) +
            plt9.geom_bar(stat="identity") +
            plt9.geom_vline(xintercept=mean, color="black") +
            plt9.xlim(0, 21) +
            plt9.ylab("Chance (%)") +
            plt9.xlab("Ability Score") +
            plt9.ggtitle("Ability Score Distribution ({} iterations)".format(
                args.num_iterations)))
    plot.save(args.plot_file, dpi=300)
    print("Wrote plot image to:", args.plot_file)
def density_plot2(num_matches_per_round: int,
                  match_lengths_from_one_round: list,
                  match_lengths_from_one_round_with_blowouts: list):
    """
    Density plot for match lengths, new rules, blowouts vs. no blowouts,
    85 matches/round

    The first ``num_matches_per_round`` rows are labelled 'No' and the next
    ``num_matches_per_round`` rows 'Yes'.  The figure is written to
    'figures/match_length_with_blowout_density_plot.png'.
    """
    all_lengths = np.concatenate([
        match_lengths_from_one_round,
        match_lengths_from_one_round_with_blowouts,
    ])
    # 'No' repeated n times followed by 'Yes' repeated n times.
    blowout_flags = np.repeat(['No', 'Yes'], num_matches_per_round)
    match_lengths_blowout = pd.DataFrame({
        'Match length': all_lengths,
        'Blowouts': blowout_flags,
    })

    figure = plt.ggplot(match_lengths_blowout,
                        plt.aes(x='Match length', color='Blowouts'))
    figure = figure + plt.geom_density()
    # Thick reference line at a match length of 50.
    figure = figure + plt.geom_vline(xintercept=50, color='black', size=2)
    figure = figure + plt.xlim([0, 55])
    figure = figure + plt.theme_classic()

    figure.save(
        filename='figures/match_length_with_blowout_density_plot.png')
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Return a faceted density histogram of ``x`` annotated with means.

    The frame is read through its ``Task``, ``Method`` and ``x`` columns.
    Each ``Task`` gets its own panel with overlaid per-``Method``
    histograms; the per-(Task, Method) mean of ``x`` is drawn as a dashed
    segment with the value printed above it.  ``legend_position`` and
    ``legend_box`` are handed to ``theme``.
    """
    means = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # Constant column used as the linetype mapping of the mean segment.
    means[' '] = 'Mean Length'

    label_mapping = aes(x='x', y=.22, label='x', color='Method')
    segment_mapping = aes(x='x', xend='x', y=0, yend=.205, linetype=' ')

    layers = [
        aes(x='x', fill='Method', y='..density..'),
        geom_histogram(binwidth=2, position='identity', alpha=.6),
        geom_text(label_mapping, means, inherit_aes=False,
                  format_string='{:.1f}', show_legend=False),
        geom_segment(segment_mapping, means, inherit_aes=False,
                     color='black'),
        scale_linetype_manual(['dashed']),
        facet_wrap('Task'),
        xlim(0, 20),
        ylim(0, .23),
        xlab('Example Length'),
        ylab('Frequency'),
        scale_color_manual(values=COLORS),
        scale_fill_manual(values=COLORS),
        theme_fs(),
        theme(
            aspect_ratio=1,
            legend_title=element_blank(),
            legend_position=legend_position,
            legend_box=legend_box,
        ),
    ]

    chart = ggplot(len_df)
    for layer in layers:
        chart = chart + layer
    return chart
def log_HR_plot(inFile, label_unit=10, log_scale_color=True):
    """Plot tumor versus non-tumor log hazard ratios per PCA component.

    The ``logHR`` parameters read from ``inFile`` are split in half: the
    first half of the rows is used as the tumor components, the second
    half as the non-tumor components.

    :param inFile: parameter file handed to ``get_params``
    :param label_unit: components with a zero-based position up to and
        including this value get a text label; the others are unlabelled
    :param log_scale_color: apply a log transform to the colour and fill
        scales of the first plot
    :return: ``(pl, pl_p, logHR_df)`` -- the logHR scatter plot, the
        -log10(p-value) scatter plot and the per-component table
    """
    par = get_params(inFile)
    pca_components = par["means"]["logHR"].shape[0] >> 1  # half the rows
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)

    logHR_df = pd.DataFrame(index=[f"{i+1}" for i in components])
    logHR_df["tumor logHR"] = par["means"]["logHR"][components, 0]
    logHR_df["non-tumor logHR"] = par["means"]["logHR"][tf_components, 0]
    logHR_df["component"] = components
    logHR_df["label"] = [
        logHR_df.index[i] if i <= label_unit else "" for i in components
    ]
    logHR_df["tumor logHR sd"] = par["stds"]["logHR"][components, 0]
    logHR_df["non-tumor logHR sd"] = par["stds"]["logHR"][tf_components, 0]
    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    # Each Z score uses its own standard deviation; the non-tumor score
    # used to be divided by the tumor sd.
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["non-tumor logHR sd"])
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2
    logHR_df["tumor -log10(p-value)"] = -np.log10(logHR_df["tumor p-value"])
    logHR_df["non-tumor -log10(p-value)"] = -np.log10(
        logHR_df["non-tumor p-value"])

    # First plot: logHR against logHR, both axes on one shared range.
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(
        pn.aes(
            "non-tumor logHR",
            "tumor logHR",
            color="non-tumor p-value",
            fill="tumor p-value",
            label="label",
        ),
        logHR_df,
    ) + pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.geom_abline() +
          pn.geom_point() + pn.theme_minimal() +
          pn.geom_text(ha="left", va="bottom", color="black"))
    if log_scale_color:
        pl += pn.scale_color_cmap(trans="log")
        pl += pn.scale_fill_cmap(trans="log")

    # Second plot: -log10(p-value) against -log10(p-value).
    lb = min(
        logHR_df["non-tumor -log10(p-value)"].min(),
        logHR_df["tumor -log10(p-value)"].min(),
    )
    ub = max(
        logHR_df["non-tumor -log10(p-value)"].max(),
        logHR_df["tumor -log10(p-value)"].max(),
    )
    pl_p = (pn.ggplot(
        pn.aes(
            "non-tumor -log10(p-value)",
            "tumor -log10(p-value)",
            color="component",
            label="label",
        ),
        logHR_df,
    ) + pn.geom_point() + pn.xlim(lb, ub) + pn.ylim(lb, ub) +
            pn.theme_minimal() +
            pn.geom_text(ha="left", va="bottom", color="black"))
    return pl, pl_p, logHR_df
def search_room(dataframe: pd.DataFrame) -> bool:
    """Draw the sidebar filters and plot buttons for the apartment table.

    The sidebar offers a "top 100" checkbox, a price range slider and a
    column selector.  The filtered table is passed to ``main_room_type``
    and then drives whichever plot button is pressed.

    Returns True right after the scatterplot or the latitude/longitude
    plot has been rendered; in every other case (including after the
    barplot, histogram or density plot) it returns False.
    """
    sidebar = st.sidebar

    def render_ggplot(figure):
        # Shared rendering call for every plotnine figure below.
        st.pyplot(
            pn.ggplot.draw(figure),
            clear_figure=True,
            width=100,
            height=200,
            dpi=600,
        )

    # Search top 100
    top100 = sidebar.checkbox(
        "Filter top 100 apartments",
        help="filter only the top 100 apartments by price",
    )

    # Search by price
    lowest_price = min(dataframe.price)
    highest_price = max(dataframe.price)
    min_price, max_price = sidebar.slider(
        "Search apartments by price",
        lowest_price,
        highest_price,
        (lowest_price, highest_price),
        help="Insert the min and max price",
    )

    # Filters not implemented yet: review_scores_rating, room type, beds,
    # bathrooms, accommodates.

    # Select columns for plot (every column is preselected)
    all_columns = list(dataframe.columns)
    to_select = sidebar.multiselect(
        "Seleziona le colonne che vuoi visualizzare",
        all_columns,
        list(all_columns),
        help="Seleziona le colonne che vuoi considerare",
    )

    if top100:
        # NOTE(review): this keeps the first 100 rows of EVERY distinct
        # price, not the 100 highest-priced rows -- confirm the intent.
        dataframe = dataframe.groupby("price").head(100)

    in_price_range = dataframe.price.between(min_price, max_price)
    dataframe_filtered = dataframe[to_select].loc[in_price_range]

    # Launch the data visualization
    main_room_type(dataframe_filtered)

    sidebar.markdown("Select plot axis")
    plottable_columns = list(dataframe_filtered.columns)
    axis1 = sidebar.selectbox("Select first axis", plottable_columns)
    axis2 = sidebar.selectbox("Select second axis", plottable_columns)

    if sidebar.button(
        "Scatterplot", key="bscatterplot", help="Launch the scatterplot"
    ):
        fig = px.scatter(dataframe_filtered, x=axis1, y=axis2)
        st.markdown(f"Plot with: {axis1}, {axis2}")
        st.plotly_chart(fig)
        st.markdown("Raw data used")
        styled_table = dataframe_filtered.style.highlight_max(axis=0)
        styled_table = styled_table.format({axis2: "{:.2%}"})
        styled_table = styled_table.highlight_null(null_color="red")
        styled_table = styled_table.set_caption(
            "Result table with all the data filtered"
        )
        st.dataframe(styled_table)
        return True

    if sidebar.button("Barplot", key="bggplot", help="Launch the ggplot"):
        st.markdown(
            "To launch this plot please remember to select all the columns in the data"
        )
        fig = (
            pn.ggplot(dataframe_filtered)
            + pn.aes(x=axis1, fill=axis2)
            + pn.geom_bar()
            + pn.theme(axis_text_x=pn.element_text(angle=45, hjust=1))
        )
        st.markdown("### Barplot")
        st.markdown(f"Displaying: {axis1} over {axis2}")
        render_ggplot(fig)

    if sidebar.button(
        "Histogram", key="bp9histogram", help="Launch the ggplot histogram"
    ):
        fig = (
            pn.ggplot(dataframe_filtered)
            + pn.aes(x="price")
            + pn.geom_histogram(fill="blue", colour="black", bins=30)
            + pn.xlim(0, 200)
        )
        st.markdown("### Histogram")
        # NOTE(review): the caption names the selected axes although the
        # histogram always shows "price".
        st.markdown(f"Displaying: {axis1} over {axis2}")
        render_ggplot(fig)

    if sidebar.button(
        "Density", key="bp9density", help="Launch the ggplot density"
    ):
        # Only the first 1000 rows are used for the density estimate.
        fig = (
            pn.ggplot(dataframe_filtered.head(1000))
            + pn.aes(x="price")
            + pn.geom_density(fill="blue", colour="black", alpha=0.5)
            + pn.xlim(0, 200)
        )
        st.markdown("### Density Plot")
        render_ggplot(fig)

    if sidebar.button(
        "Latitude-Longitude",
        key="bp9latlon",
        help="Launch the ggplot latitude and longitude categorical comparison",
    ):
        # color categorical variable
        fig = pn.ggplot(
            dataframe_filtered,
            pn.aes(x="latitude", y="longitude", colour="room_type"),
        ) + pn.geom_point(alpha=0.5)
        st.markdown("### Color categorical variable")
        render_ggplot(fig)
        return True

    return False
# Scaled detection curves, one frame per classifier.
# (An 'LDA' predictor was tried at some point and is currently left out.)
sv = scale_predictors(df, predictor='SVC')
nb = scale_predictors(df, predictor='naive_bayes')
rn = scale_predictors(df, predictor='Random')
ac = scale_predictors(df, predictor='acg_ip_risk')
rf = scale_predictors(df, predictor='RandmForest')
ct = scale_predictors(df, predictor='cheating')

df2 = pd.concat([nb, rn, ac, rf, sv, ct])
print(df2.head(20))
print(df2.describe())

# Full-range step plot: one line per classifier.
p = pn.ggplot(
    df2,
    pn.aes(x='num_examined',
           y='num_detected',
           group='classifier',
           colour='classifier'),
)
p = p + pn.geom_step()
p = p + pn.ggtitle(
    "How Many ppl would we need to intervene on to prevent Y hospitalizations?")

# Same plot truncated to the first 300 examined / detected.
p2 = p + pn.xlim(0, 300) + pn.ylim(0, 300)

p.save(HOME_DIR + 'all_together_d.png',
       height=8, width=10, units='in', verbose=False)
p2.save(HOME_DIR + 'all_together_trunc.png',
        height=8, width=10, units='in', verbose=False)
print("Finished!")
# Scatter plot of the clusters in the 2-D T-SNE embedding, saved to disk
# and displayed.
plot_title = 'SHAP-Based Clusters in T-SNE SHAP Space'
x_axis_label = 'T-SNE Component 1'
y_axis_label = 'T-SNE Component 2'

# Axis limits are the observed range of the first two columns.
first_component = tsne_results_df.iloc[:, 0]
second_component = tsne_results_df.iloc[:, 1]
xlim = [first_component.min(), first_component.max()]
ylim = [second_component.min(), second_component.max()]

cluster_mapping = p9.aes(
    x=tsne_results_df.columns[0],
    y=tsne_results_df.columns[1],
    group=clusters_colname,
    color=clusters_colname,
)
plot = p9.ggplot(tsne_results_df, cluster_mapping)
plot = plot + p9.geom_point(size=2) + p9.geom_rug() + p9.stat_ellipse()
plot = plot + p9.xlim(*xlim) + p9.ylim(*ylim)
# Colour scale left at its default; a blue-to-yellow gradient and a manual
# scale were tried and are disabled.
plot = plot + p9.theme_light(base_size=18)
plot = plot + p9.ggtitle(plot_title)
plot = plot + p9.labs(y=y_axis_label, x=x_axis_label)

plot_filename = 'shap_clusters.png'
plot.save(plot_filename, width=10, height=10)

from IPython.display import Image
Image(filename=plot_filename)

# + [markdown]
def show_prediction(
    self,
    samples,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    show_community: bool = False,
    num_samples: int = 1000,
):
    """Plot prediction on the true question scale from samples or a submission object.
    Optionally compare prediction against a sample from the distribution
    of community predictions

    :param samples: samples from a distribution answering the prediction
        question (true scale) or a prediction object
    :param percent_kept: percentage of sample distribution to keep
    :param side_cut_from: which side to cut tails from,
        either 'both','lower', or 'upper'
    :param show_community: boolean indicating whether comparison
        to community predictions should be made
    :param num_samples: number of samples from the community
    :raises ValueError: if ``samples`` is neither a prediction object,
        a list, a numpy array nor a pandas series
    :return: ggplot graphics object
    """
    if isinstance(samples, SubmissionMixtureParams):
        # A prediction object: draw normalized samples from the mixture
        # and map them back to the question's true scale.
        prediction = samples
        prediction_normed_samples = pd.Series([
            logistic.sample_mixture(prediction)
            for _ in range(0, num_samples)
        ])
        prediction_true_scale_samples = self.denormalize_samples(
            prediction_normed_samples)
    else:
        if isinstance(samples, list):
            samples = pd.Series(samples)
        # isinstance (rather than an exact type() match) also accepts
        # subclasses of Series / ndarray.
        if not isinstance(samples, (pd.Series, np.ndarray)):
            raise ValueError(
                "Samples should be a list, numpy arrray or pandas series")
        # Draw as many community samples as there are prediction samples.
        num_samples = samples.shape[0]
        prediction_true_scale_samples = samples

    title_name = (
        f"Q: {self.name}"
        if self.name
        else "\n".join(
            textwrap.wrap(self.data["title"], 60))  # type: ignore
    )

    if show_community:
        df = pd.DataFrame(
            data={
                "community": [  # type: ignore
                    self.sample_community() for _ in range(0, num_samples)
                ],
                "prediction": prediction_true_scale_samples,
            })
        # get domain for graph given the percentage of distribution kept
        (_xmin, _xmax) = self.get_central_quantiles(
            df, percent_kept=percent_kept, side_cut_from=side_cut_from)
        # Long format: one row per sample, tagged with its source.
        df = pd.melt(df, var_name="sources",
                     value_name="samples")  # type: ignore
        return (ggplot(df, aes("samples", fill="sources")) +
                scale_fill_brewer(type="qual", palette="Pastel1") +
                geom_density(alpha=0.8) + xlim(_xmin, _xmax) +
                self._scale_x() +
                labs(x="Prediction", y="Density", title=title_name) +
                ergo_theme +
                theme(axis_text_x=element_text(rotation=45, hjust=1)))
    else:
        df = pd.DataFrame(
            data={"prediction": prediction_true_scale_samples})
        # get domain for graph given the percentage of distribution kept
        (_xmin, _xmax) = self.get_central_quantiles(
            df, percent_kept=percent_kept, side_cut_from=side_cut_from)
        # NOTE(review): the fill scale and the second density layer look
        # like leftovers from the community branch; kept unchanged.
        return (ggplot(df, aes("prediction")) +
                geom_density(fill="#b3cde3", alpha=0.8) +
                scale_fill_brewer(type="qual", palette="Pastel1") +
                geom_density(alpha=0.8) + xlim(_xmin, _xmax) +
                self._scale_x() +
                labs(x="Prediction", y="Density", title=title_name) +
                ergo_theme +
                theme(axis_text_x=element_text(rotation=45, hjust=1)))
sensitivities.append(0)
# Closing point, so that the plotted curve ends on the diagonal.
especifities_1.append(0)

# Now draw the curve.
import matplotlib.pyplot as plt
"""%matplotlib inline
plt.plot(especifities_1,sensitivities, marker="o", linestyle="--", color="r")
x=[i*0.01 for i in range(100)]
y=[i*0.01 for i in range(100)]
plt.plot(x,y) #pinto la diagonal (el peor modelo que existe)
plt.xlabel("1-Especificidad")
plt.ylabel("Sensibilidad")
plt.title("Curva ROC") #recordemos que mi seleccion de variables era una mierda absoluta
"""

# The larger the area between the curve and the diagonal, the better the
# predictive model.
from sklearn import metrics
# To import everything, use * instead of the explicit list.
from plotnine import ggplot, aes, geom_line, geom_area, ggtitle, xlim, ylim

espec_1, sensit, _ = metrics.roc_curve(Y_test, prob)
df = pd.DataFrame({"x": espec_1, "y": sensit})
auc = metrics.auc(espec_1, sensit)  # area under the curve
print(df.head())

# ROC curve drawn on slightly padded unit axes.
roc_line_plot = ggplot(df, aes(x="x", y="y"))
roc_line_plot = roc_line_plot + geom_line()
roc_line_plot = roc_line_plot + geom_line(linetype="dashed")
roc_line_plot = roc_line_plot + xlim(-0.01, 1.01) + ylim(-0.01, 1.01)
print(roc_line_plot)

# ROC curve with the area under it shaded and the AUC in the title.
roc_area_plot = ggplot(df, aes(x="x", y="y"))
roc_area_plot = roc_area_plot + geom_area(alpha=0.25)
roc_area_plot = roc_area_plot + geom_line(aes(y="y"))
roc_area_plot = roc_area_plot + ggtitle("Curva ROC y AUC=%s " % str(auc))
print(roc_area_plot)
# Vote share centred on the 0.5 cutoff.
lmb_data['demvoteshare_c'] = lmb_data['demvoteshare'] - 0.5
# drop missing values
lmb_data = lmb_data[pd.notnull(lmb_data.demvoteshare_c)]
lmb_data['demvoteshare_sq'] = lmb_data['demvoteshare_c'].pow(2)

# aggregating the data: keep vote shares between .45 and .55 and average
# the score inside 100 equal-width bins of the lagged vote share
lmb_data = lmb_data[lmb_data.demvoteshare.between(.45, .55)]
categories = lmb_data.lagdemvoteshare
lmb_data['lagdemvoteshare_100'] = pd.cut(lmb_data.lagdemvoteshare, 100)
agg_lmb_data = (
    lmb_data.groupby('lagdemvoteshare_100')['score'].mean().reset_index()
)
# 1 on the right-hand side of the cutoff, 0 otherwise.
lmb_data['gg_group'] = [int(x > .5) for x in lmb_data.lagdemvoteshare]
agg_lmb_data['lagdemvoteshare'] = np.arange(0.01, 1.01, .01)

# plotting: the three figures share everything except the smoother
binned_means = p.geom_point(
    p.aes(x='lagdemvoteshare', y='score'), data=agg_lmb_data)
x_range = p.xlim(0, 1)
y_range = p.ylim(0, 100)
cutoff_line = p.geom_vline(xintercept=0.5)

# quadratic fit on each side of the cutoff
(
    p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score'))
    + binned_means
    + p.stat_smooth(p.aes('lagdemvoteshare', 'score', group='gg_group'),
                    data=lmb_data, method="lm", formula='y ~ x + I(x**2)')
    + x_range
    + y_range
    + cutoff_line
)

# lowess fit on each side of the cutoff
(
    p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score'))
    + binned_means
    + p.stat_smooth(p.aes('lagdemvoteshare', 'score', group='gg_group'),
                    data=lmb_data, method="lowess")
    + x_range
    + y_range
    + cutoff_line
)

# linear fit on each side of the cutoff
(
    p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score'))
    + binned_means
    + p.stat_smooth(p.aes('lagdemvoteshare', 'score', group='gg_group'),
                    data=lmb_data, method="lm")
    + x_range
    + y_range
    + cutoff_line
)
["Metadata_cell_line", "Metadata_gene_name", "replicate_type"] ), gg.aes(x="correlation_guide")) + \ gg.geom_density(gg.aes(fill="Metadata_cell_line"), alpha=0.4) + \ gg.geom_rug(gg.aes(color="Metadata_cell_line"), show_legend={'color': False}) + \ gg.theme_bw() + \ gg.theme( subplots_adjust={"wspace": 0.2}, axis_text=gg.element_text(size=7), axis_title=gg.element_text(size=9), strip_text=gg.element_text(size=6, color="black"), strip_background=gg.element_rect(colour="black", fill="#fdfff4"), ) + \ gg.xlim([-0.5, 1]) + \ gg.xlab("Median Correlation of All Guides Across Genes") + \ gg.ylab("Density") + \ gg.facet_wrap("~replicate_type", nrow=2, scales="free") + \ gg.scale_fill_manual(name="Cell Line", values=["#1b9e77", "#d95f02", "#7570b3"]) + \ gg.scale_color_manual(name="Cell Line", values=["#1b9e77", "#d95f02", "#7570b3"]) ) file = os.path.join("figures", "median-guide-correlation-density") for extension in ['.png', '.pdf']: gg.ggsave(cor_density_gg, filename='{}{}'.format(file, extension), dpi=500, height=2,
def read_data(file):
    """Load the Stata file ``file`` from the mixtape GitHub repository."""
    base_url = "https://raw.github.com/scunning1975/mixtape/master/"
    return pd.read_stata(base_url + file)


# 2500 simulated people with independent standard-normal beauty and talent.
start_is_born = pd.DataFrame({
    'beauty': np.random.normal(size=2500),
    'talent': np.random.normal(size=2500),
})
start_is_born['score'] = start_is_born['beauty'] + start_is_born['talent']
# 85th percentile of the score, repeated on every row.
start_is_born['c85'] = np.percentile(start_is_born['score'], q=85)
# A "star" is anyone whose score lies above that percentile.
start_is_born['star'] = 0
above_cutoff = start_is_born['score'] > start_is_born['c85']
start_is_born.loc[above_cutoff, 'star'] = 1
start_is_born.head()

lm = sm.OLS.from_formula('beauty ~ talent', data=start_is_born).fit()

# everyone
(
    p.ggplot(start_is_born, p.aes(x='talent', y='beauty'))
    + p.geom_point(size=0.5)
    + p.xlim(-4, 4)
    + p.ylim(-4, 4)
)

# stars only
(
    p.ggplot(start_is_born[start_is_born.star == 1],
             p.aes(x='talent', y='beauty'))
    + p.geom_point(size=0.5)
    + p.xlim(-4, 4)
    + p.ylim(-4, 4)
)

# non-stars only
(
    p.ggplot(start_is_born[start_is_born.star == 0],
             p.aes(x='talent', y='beauty'))
    + p.geom_point(size=0.5)
    + p.xlim(-4, 4)
    + p.ylim(-4, 4)
)
# Insertions / deletions between every pair of consecutive commits, taken
# from the summary line printed by `git diff --shortstat`.
sizes = []
for older_sha, newer_sha in zip(commits, commits[1:]):
    completed = subprocess.run(
        ['git', 'diff', '--shortstat', older_sha, newer_sha],
        stdout=subprocess.PIPE)
    tokens = completed.stdout.decode().split()

    # Each count is the token right before its "insertion"/"deletion"
    # word; a missing word leaves the count at 0.
    counts = {'insertions': 0, 'deletions': 0}
    for position, token in enumerate(tokens):
        if 'insertion' in token:
            counts['insertions'] = int(tokens[position - 1])
        if 'deletion' in token:
            counts['deletions'] = int(tokens[position - 1])
    sizes.append(counts)

df = pandas.DataFrame(sizes)
df['newlines'] = df.insertions - df.deletions
df.describe()

# show some basic stat: share of commit pairs beyond each threshold
total_rows = len(df)
for n in (-500, -100):
    share = len(df[df.newlines < n]) / total_rows
    print('<', n, round(share * 100, 2), '%')
for n in (0, 100, 500, 1000, 2000):
    share = len(df[df.newlines > n]) / total_rows
    print('>', n, round(share * 100, 2), '%')

# draw charts: density of the net line change, shrinking and growing side
shrink_chart = gg.ggplot(df, gg.aes(x='newlines')) + gg.geom_density()
(shrink_chart + gg.xlim(-2000, 0))
growth_chart = gg.ggplot(df, gg.aes(x='newlines')) + gg.geom_density()
(growth_chart + gg.xlim(0, 2000))
targene_geo_wt = output[output['status_sign'] == -1] # Output t-test results t_results_geo_targene = ttest_ind(a = targene_geo_mutant['weight'], b = targene_geo_wt['weight'], equal_var = False) print('Statistic = {:.2f}, p = {:.2E}'.format(t_results_geo_targene[0], Decimal(t_results_geo_targene[1]))) # graphical output for predictions p = (gg.ggplot(output, gg.aes(x='weight', y='dummy_y', color='factor(status_sign)')) + gg.geom_hline(gg.aes(yintercept=0), linetype='solid') + gg.geom_point(size=4) + gg.scale_color_manual(values=["#377eb8", "#ff7f00"], labels=['WT', 'Mutant']) + gg.ylim([-0.1, 0.1]) + gg.xlim([-0.001, 1.001]) + gg.theme_seaborn(style='whitegrid') + gg.xlab('Targene Classifier Score') + gg.ylab('') + gg.labs(color='Sample_status') + gg.ggtitle('Mutant vs WT \n') + gg.theme( plot_title=gg.element_text(size=22), axis_title_x=gg.element_text(size=16), axis_text_x=gg.element_text(size=16), axis_text_y=gg.element_blank(), axis_ticks_length=4, axis_ticks_major_y=gg.element_blank(), axis_ticks_minor_y=gg.element_blank(), axis_ticks_minor_x=gg.element_blank(), legend_position=(1.02, 0.8),
def histogram_make(roi, combined_raw_df, list_rois, config, xlimit,
                   save_function, find_xlim_function):
    """Build and save the faceted voxel-value histogram for one ROI.

    :param roi: position of the ROI name inside ``list_rois``
    :param combined_raw_df: data to plot; read through the ``voxel_value``
        column and, when mean/median lines are enabled, ``stat_value`` and
        ``Statistic``
    :param list_rois: ROI names
    :param config: settings object (bin width, colours, labels, facets,
        dpi, ``use_same_axis_limits``, ...)
    :param xlimit: upper x limit to enforce; a falsy value leaves the upper
        limit open
    :param save_function: called as
        ``save_function(figure, roi_name, config, folder, 'histogram')``
    :param find_xlim_function: called as
        ``find_xlim_function(roi_name, figure, 'xaxis')`` to obtain the
        x limit of the figure
    :return: ``None`` when the data frame is empty; otherwise the x limit
        found by ``find_xlim_function`` (0 when it was not queried).  With
        ``config.use_same_axis_limits == 'Same limits'`` and ``xlimit == 0``
        the limit is returned WITHOUT saving the figure.
    """
    if combined_raw_df.empty:
        if config.verbose:
            print('INFO: Histograms cannot be made for the No ROI category.')
        return

    thisroi = list_rois[roi]

    figure = (
        pltn.ggplot(combined_raw_df, pltn.aes(x="voxel_value"))
        + pltn.theme_538()
        # Boundary centers the bars, na_rm cancels error from setting an
        # xlimit
        + pltn.geom_histogram(binwidth=config.histogram_binwidth,
                              fill=config.histogram_fig_colour,
                              boundary=0,
                              na_rm=True)
        + pltn.facet_grid(
            f"{config.histogram_fig_y_facet}~{config.histogram_fig_x_facet}",
            drop=True,
            labeller="label_both")
        + pltn.labs(x=config.histogram_fig_label_x,
                    y=config.histogram_fig_label_y)
        + pltn.theme(
            panel_grid_minor_x=pltn.themes.element_line(alpha=0),
            panel_grid_major_x=pltn.themes.element_line(alpha=1),
            panel_grid_major_y=pltn.element_line(alpha=0),
            plot_background=pltn.element_rect(fill="white"),
            panel_background=pltn.element_rect(fill="gray", alpha=0.1),
            axis_title_x=pltn.element_text(
                weight='bold', color='black', size=20),
            axis_title_y=pltn.element_text(
                weight='bold', color='black', size=20),
            strip_text_x=pltn.element_text(
                weight='bold', size=10, color='black'),
            strip_text_y=pltn.element_text(
                weight='bold', size=10, color='black'),
            axis_text_x=pltn.element_text(size=10, color='black'),
            axis_text_y=pltn.element_text(size=10, color='black'),
            dpi=config.plot_dpi))

    # Display mean or median as vertical lines on plot
    if config.histogram_show_mean or config.histogram_show_median:
        figure += pltn.geom_vline(
            pltn.aes(xintercept="stat_value", color="Statistic"),
            size=config.histogram_stat_line_size)
        figure += pltn.scale_color_manual(values=[
            config.colorblind_friendly_plot_colours[3],
            config.colorblind_friendly_plot_colours[1]
        ])

    # Display legend for mean and median
    if not config.histogram_show_legend:
        figure += pltn.theme(legend_position='none')

    if xlimit:
        # Set x limit of figure (used to make it the same for every
        # histogram)
        figure += pltn.xlim(-1, xlimit)
        thisroi += '_same_xlim'
    else:
        figure += pltn.xlim(-1, None)

    returned_xlim = 0
    if (config.use_same_axis_limits in ('Same limits', 'Create both')
            and xlimit == 0):
        returned_xlim = find_xlim_function(thisroi, figure, 'xaxis')

    if config.use_same_axis_limits == 'Same limits' and xlimit == 0:
        # Only the limit is wanted on this pass; nothing is saved.
        return returned_xlim
    elif xlimit != 0:
        folder = 'Same_xaxis'
    else:
        folder = 'Different_xaxis'

    # Suppress Pandas warning about alignment of non-concatenation axis.
    # catch_warnings restores the caller's filters afterwards -- also when
    # saving raises -- instead of forcing FutureWarning back to 'default'.
    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=FutureWarning)
        save_function(figure, thisroi, config, folder, 'histogram')

    return returned_xlim
def concurrent_agents_plot(experiment_name='graph_indep_concurrent',
                           data_path=_DEFAULT_DATA_PATH,
                           paper_version=True):
    '''Plot mean instant regret per action and per time period.

    Passing paper_version=True should be used to reproduce Fig. 14 of the
    paper for K = 1,10,20,50,100. In this case, the labels in the legend
    are manually ordered by the values of K. Otherwise, the labels are
    ordered alphabetically.

    Returns a dict with the keys 'per_action_plot' and 'per_period_plot'.
    '''
    df = load_data(data_path, experiment_name)

    mean_regret = {'instant_regret': np.mean}
    plt_df_per_action = (
        df.groupby(['agent', 't', 'agent_id', 'action_id'])
        .agg(mean_regret)
        .reset_index())
    plt_df_per_period = (
        df.groupby(['agent', 't']).agg(mean_regret).reset_index())

    # The two variants differ only in the colour column, the colour scale
    # and the y label of the per-action plot.
    if paper_version:
        for frame in (plt_df_per_action, plt_df_per_period):
            frame['agent_id'] = frame.agent.apply(get_agent_id)
        colour_column = 'agent_id'
        per_action_ylabel = 'per-action regret'
        custom_labels = ['K = 1', 'K = 10', 'K = 20', 'K = 50', 'K = 100']
        custom_colors = ["#E41A1C", "#377EB8", "#4DAF4A", "#984EA3",
                         "#FF7F00"]

        def colour_scale():
            return gg.scale_color_manual(
                name='agent', labels=custom_labels, values=custom_colors)
    else:
        colour_column = 'agent'
        per_action_ylabel = 'per-period regret'

        def colour_scale():
            return gg.scale_colour_brewer(
                name='agent', type='qual', palette='Set1')

    # x range of the per-action plot: 2.5 times the number of distinct t.
    action_axis_max = 2.5 * len(plt_df_per_period.groupby('t'))

    p_per_action = (
        gg.ggplot(plt_df_per_action)
        + gg.aes('action_id', 'instant_regret', colour=colour_column)
        + gg.geom_line()
        + gg.geom_line(size=1.25, alpha=0.75)
        + gg.xlim(0, action_axis_max)
        + colour_scale()
        + gg.labels.xlab('number of actions')
        + gg.labels.ylab(per_action_ylabel))

    p_per_period = (
        gg.ggplot(plt_df_per_period)
        + gg.aes('t', 'instant_regret', colour=colour_column)
        + gg.geom_line()
        + gg.geom_line(size=1.25, alpha=0.75)
        + colour_scale()
        + gg.labels.xlab('time period (t)')
        + gg.labels.ylab('per-period regret'))

    return {
        'per_action_plot': p_per_action,
        'per_period_plot': p_per_period,
    }
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
import plotnine as p

# read data
import ssl
# NOTE(review): this turns off HTTPS certificate verification for the
# whole process -- acceptable for a teaching script, unsafe elsewhere.
ssl._create_default_https_context = ssl._create_unverified_context


def read_data(file):
    """Load the Stata file ``file`` from the mixtape GitHub repository."""
    base_url = "https://raw.github.com/scunning1975/mixtape/master/"
    return pd.read_stata(base_url + file)


# 20 observations with d = 0 followed by 20 observations with d = 1.
control_y = (0.22, -0.87, -2.39, -1.79, 0.37, -1.54, 1.28, -0.31, -0.74,
             1.72, 0.38, -0.17, -0.62, -1.10, 0.30, 0.15, 2.30, 0.19,
             -0.50, -0.9)
treatment_y = (-5.13, -2.19, 2.43, -3.83, 0.5, -3.25, 4.32, 1.63, 5.18,
               -0.43, 7.11, 4.87, -3.10, -5.81, 3.76, 6.31, 2.58, 0.07,
               5.76, 3.50)
tb = pd.DataFrame({
    'd': np.repeat([0, 1], 20),
    'y': control_y + treatment_y,
})

# One density curve of y per value of d.
(
    p.ggplot()
    + p.geom_density(tb, p.aes(x='y', color='factor(d)'))
    + p.xlim(-7, 8)
    + p.labs(title="Kolmogorov-Smirnov Test")
    + p.scale_color_discrete(labels=("Control", "Treatment"))
)