def go_to_time_plot3(large_go_to_time_probs_new: list,
                     large_go_to_time_probs_old: list,
                     average_minutes_per_game_values: list):
    """
    Plot go-to-time probability, old vs. new rules, no blowouts, 300 matches/round

    The two probability series are stacked into one long-format table
    (new rules first, then old rules), drawn as lines with points on a
    y-axis fixed to [0, 1], and written to
    ``figures/go_to_time_300_matches_prob_plot.png``.

    Args:
        large_go_to_time_probs_new: probabilities under the new rules, one
            per entry of ``average_minutes_per_game_values``.
        large_go_to_time_probs_old: probabilities under the old rules, one
            per entry of ``average_minutes_per_game_values``.
        average_minutes_per_game_values: x-axis values shared by both series.

    Returns:
        None. The chart is saved to disk.
    """
    n_points = len(average_minutes_per_game_values)

    # The x values are repeated once per rule set so all columns line up.
    minutes_column = np.concatenate([average_minutes_per_game_values] * 2)
    probability_column = np.concatenate(
        [large_go_to_time_probs_new, large_go_to_time_probs_old])
    rules_column = np.concatenate(
        [np.repeat(rule_label, n_points) for rule_label in ('New', 'Old')])

    large_time_prob_data = pd.DataFrame({
        'Average minutes per game': minutes_column,
        'P(Go to time)': probability_column,
        'Rules': rules_column,
    })

    chart = plt.ggplot(
        large_time_prob_data,
        plt.aes(x='Average minutes per game',
                y='P(Go to time)',
                color='Rules'))
    for component in (plt.geom_line(),
                      plt.geom_point(),
                      plt.ylim([0, 1]),
                      plt.theme_classic()):
        chart = chart + component

    chart.save(filename='figures/go_to_time_300_matches_prob_plot.png')
def plot_vs_discrete(data_table,
                     discrete_metric_name,
                     metric_name,
                     segment_name,
                     title,
                     ylim=None,
                     aggregate="mean"
                     ):
    """Bar-chart an aggregated metric per discrete value, dodged by segment.

    Rows where either ``metric_name`` or ``discrete_metric_name`` is null are
    dropped, the metric is cast to float, aggregated per
    (discrete value, segment) pair and rounded to 3 decimals. The rounded
    value is also drawn as a text label on each bar.

    Args:
        data_table: DataFrame containing the three named columns.
        discrete_metric_name: column used for the x-axis and grouping.
        metric_name: numeric column that is aggregated and plotted on y.
        segment_name: column used for the bar fill and grouping.
        title: plot title.
        ylim: optional y-axis limits passed to ``plot.ylim``; ``None``
            (or a scalar NaN) leaves the axis unrestricted.
        aggregate: aggregation passed to ``DataFrame.agg``; it is also
            concatenated into the y-axis label, so it must be a string.

    Returns:
        The assembled ggplot object.
    """
    has_both = (pd.notnull(data_table[metric_name])
                & pd.notnull(data_table[discrete_metric_name]))
    # Explicit copy: the float cast below must write to an independent
    # frame, not to a chained slice of ``data_table``.
    data_filtered = data_table.loc[
        has_both, [discrete_metric_name, metric_name, segment_name]].copy()
    data_filtered[[metric_name]] = data_filtered[[metric_name]].astype(float)

    result = (data_filtered
              .groupby([discrete_metric_name, segment_name])
              .agg({metric_name: aggregate})
              .reset_index())
    result[metric_name] = round(result[metric_name], 3)

    gg_result = (plot.ggplot(result)
                 + plot.aes(x=discrete_metric_name,
                            y=metric_name,
                            fill=segment_name,
                            label=metric_name)
                 + plot.geom_bar(stat="identity", position="dodge")
                 + plot.geom_text(position=plot.position_dodge(width=.9),
                                  size=8)
                 + plot.labs(x=discrete_metric_name,
                             y=aggregate + "(" + metric_name + ")",
                             title=title))

    # ``pd.notnull`` returns an array for list/tuple limits, which cannot be
    # used as an ``if`` condition; test for "not provided" explicitly.
    # A scalar NaN is still treated as "no limit", as before.
    ylim_missing = ylim is None or (
        pd.api.types.is_scalar(ylim) and pd.isnull(ylim))
    if not ylim_missing:
        gg_result = gg_result + plot.ylim(ylim)

    return gg_result
def go_to_time_plot2(go_to_time_probs_new: list,
                     go_to_time_probs_old: list,
                     go_to_time_blowout_probs_new: list,
                     go_to_time_blowout_probs_old: list,
                     average_minutes_per_game_values: list):
    """
    Plot go-to-time probability, new vs. old rules, blowouts vs. no blowouts, 85 matches/round

    The four probability series are stacked into one long-format table in
    the order new/no blowouts, old/no blowouts, new/blowouts, old/blowouts,
    drawn as lines with points on a y-axis fixed to [0, 1], and written to
    ``figures/go_to_time_prob_with_blowouts_plot.png``.

    Args:
        go_to_time_probs_new: probabilities, new rules, no blowouts.
        go_to_time_probs_old: probabilities, old rules, no blowouts.
        go_to_time_blowout_probs_new: probabilities, new rules, blowouts.
        go_to_time_blowout_probs_old: probabilities, old rules, blowouts.
        average_minutes_per_game_values: x-axis values shared by all series.

    Returns:
        None. The chart is saved to disk.
    """
    n_points = len(average_minutes_per_game_values)

    # Each label is paired with the series it describes; order matters
    # because the columns are concatenated positionally.
    labelled_series = [
        ('New, no blowouts', go_to_time_probs_new),
        ('Old, no blowouts', go_to_time_probs_old),
        ('New, blowouts', go_to_time_blowout_probs_new),
        ('Old, blowouts', go_to_time_blowout_probs_old),
    ]

    time_prob_blowout_data = pd.DataFrame({
        'Average minutes per game': np.concatenate(
            [average_minutes_per_game_values] * len(labelled_series)),
        'P(Go to time)': np.concatenate(
            [series for _, series in labelled_series]),
        'Rules': np.concatenate(
            [np.repeat(label, n_points) for label, _ in labelled_series]),
    })

    chart = plt.ggplot(
        time_prob_blowout_data,
        plt.aes(x='Average minutes per game',
                y='P(Go to time)',
                color='Rules'))
    for component in (plt.geom_line(),
                      plt.geom_point(),
                      plt.ylim([0, 1]),
                      plt.theme_classic()):
        chart = chart + component

    chart.save(filename='figures/go_to_time_prob_with_blowouts_plot.png')
def plot_ci_eval(df):
    """Return a line chart of the three method columns against sample size.

    ``df`` must contain ``sample_size`` plus the columns ``bootstrap``,
    ``ztest`` and ``ttest``. The method columns are melted into long format
    (default ``variable``/``value`` column names) and drawn one line per
    method, with a log10 x-axis and the y-axis fixed to [0, 1].
    """
    method_columns = ['bootstrap', 'ztest', 'ttest']
    long_df = pd.melt(df, id_vars=['sample_size'], value_vars=method_columns)

    chart = ggplot(long_df,
                   aes(x='sample_size', y='value', color='variable'))
    chart = chart + geom_line()
    chart = chart + scale_x_log10()
    chart = chart + ylim(0, 1)
    return chart
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Return per-task density histograms of ``x`` with a marked mean.

    ``len_df`` must provide the columns ``Task``, ``Method`` and ``x``.
    For every (Task, Method) pair the mean is drawn as a vertical segment
    with its value printed above it (one decimal place).

    Args:
        len_df: DataFrame with one row per example.
        legend_position: forwarded to ``theme(legend_position=...)``.
        legend_box: forwarded to ``theme(legend_box=...)``.

    Returns:
        The assembled ggplot object.
    """
    mean_len_df = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # Constant column mapped to linetype below (its name is a single space).
    mean_len_df[' '] = 'Mean Length'

    mean_value_text = geom_text(
        aes(x='x', y=.22, label='x', color='Method'),
        mean_len_df,
        inherit_aes=False,
        format_string='{:.1f}',
        show_legend=False)
    mean_marker = geom_segment(
        aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
        mean_len_df,
        inherit_aes=False,
        color='black')
    legend_theme = theme(
        aspect_ratio=1,
        legend_title=element_blank(),
        legend_position=legend_position,
        legend_box=legend_box,
    )

    # Components are added strictly in this order.
    components = [
        aes(x='x', fill='Method', y='..density..'),
        geom_histogram(binwidth=2, position='identity', alpha=.6),
        mean_value_text,
        mean_marker,
        scale_linetype_manual(['dashed']),
        facet_wrap('Task'),
        xlim(0, 20),
        ylim(0, .23),
        xlab('Example Length'),
        ylab('Frequency'),
        scale_color_manual(values=COLORS),
        scale_fill_manual(values=COLORS),
        theme_fs(),
        legend_theme,
    ]

    length_plot = ggplot(len_df)
    for component in components:
        length_plot = length_plot + component
    return length_plot
def plot_fees(fees, title, y_axis, years, filename):
    """Plot fees per year for each conference and save the figure.

    Args:
        fees: DataFrame with at least ``year``, ``conference`` and the
            column named by ``y_axis``.
        title: plot title.
        y_axis: name of the column drawn on the y-axis.
        years: values used as x-axis breaks.
        filename: output path handed to ``ggplot.save``.

    Returns:
        None. The figure is written at 6x3 (width x height) and 300 dpi,
        with the y-axis fixed to [0, 1000].
    """
    base = pn.ggplot(
        fees,
        pn.aes('year', y_axis, color='conference', shape='conference'))

    components = [
        pn.geom_point(),
        pn.geom_line(),
        pn.labs(title=title, x='Year', y='Fee (€)'),
        pn.ylim(0, 1000),
        pn.theme_light(),
        pn.scale_x_continuous(breaks=years),
        # Same legend name for both scales.
        pn.scale_colour_discrete(name='Conference'),
        pn.scale_shape_discrete(name='Conference'),
    ]

    chart = base
    for component in components:
        chart = chart + component

    chart.save(filename, width=6, height=3, dpi=300)
def plot_action_proportion(df_agent):
    """Plot the action proportion for the sub-dataframe for a single agent.

    ``df_agent`` must have columns ``t`` and ``action``. For every action id
    in ``0 .. max(action)`` the share of rows choosing that action is
    computed per value of ``t`` and drawn as one line per action.

    Returns:
        The assembled ggplot object.
    """
    n_action = np.max(df_agent.action) + 1

    def share_of(action_id):
        # Aggregator giving the fraction of a group's rows equal to action_id.
        return lambda actions: np.mean(actions == action_id)

    per_action_frames = [
        df_agent.groupby('t')
        .agg({'action': share_of(action_id)})
        .rename(columns={'action': 'action_' + str(action_id)})
        for action_id in range(n_action)
    ]

    # One column per action, then long format for plotting.
    wide_df = pd.concat(per_action_frames, axis=1).reset_index()
    long_df = pd.melt(wide_df, id_vars='t')

    chart = gg.ggplot(long_df)
    chart = chart + gg.aes('t', 'value', colour='variable', group='variable')
    chart = chart + gg.geom_line(size=1.25, alpha=0.75)
    chart = chart + gg.xlab('Timestep (t)')
    chart = chart + gg.ylab('Action probability')
    chart = chart + gg.ylim(0, 1)
    chart = chart + gg.scale_colour_brewer(
        name='Variable', type='qual', palette='Set1')
    return chart
def make_plot(name):
    """Render the results CSV for ``name`` as a PNG line chart.

    Reads ``small_n/results/{name}.csv`` (columns ``sample_size``,
    ``bootstrap``, ``ztest``, ``ttest``), draws one line per method on a
    log10 x-axis with y fixed to [0, 1] and a dotted horizontal line at
    0.95, and saves it to ``slides/static/plots/{name}.png`` (10 x 7 inches).

    Returns:
        None.
    """
    results = pd.read_csv(f'small_n/results/{name}.csv')
    long_results = pd.melt(
        results,
        id_vars=['sample_size'],
        value_vars=['bootstrap', 'ztest', 'ttest'],
        var_name='method',
        value_name='success',
    )

    reference_line = geom_hline(
        yintercept=0.95, linetype='dotted', color='#FF5500', size=3)

    chart = ggplot(long_results,
                   aes(x='sample_size', y='success', color='method'))
    chart = chart + geom_line(size=1)
    chart = chart + scale_x_log10()
    chart = chart + ylim(0, 1)
    chart = chart + reference_line

    chart.save(f'slides/static/plots/{name}.png',
               height=7.0, width=10, units='in')
def grid_search_models(X,y):
    """Tune three classifiers and save a box plot of their CV accuracies.

    Columns 3..11 of ``X`` are used as features. An SVM and a logistic
    regression are tuned with ``GridSearchCV`` and a decision tree with
    ``RandomizedSearchCV`` on a 70 % training split; each best estimator is
    then scored with 10-fold cross-validation on that same training split.
    The box plot is written to
    ``./plots/tumor_genotype_prediction/accuracy-model.png``.

    Args:
        X: 2-D feature array (sliced with ``X[:, 3:12]``).
        y: target labels.

    Returns:
        None.
    """
    n_folds = 10

    # get only exons 4-12
    exon_features = X[:, 3:12]
    # The held-out 30 % is not used anywhere in this function.
    X_train, _X_test, y_train, _y_test = train_test_split(
        exon_features, y, test_size=0.3)

    # SVM
    svc_grid = {
        'C': [0.5, 1, 2, 3, 5, 6, 7, 8, 9, 10],
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        'degree': [2, 3, 4, 5, 6],
    }
    grid_search_svc = GridSearchCV(SVC(), svc_grid, scoring='accuracy')
    grid_search_svc.fit(X_train, y_train)

    # logistic regression
    # NOTE(review): whether the 'l1' candidates can be fitted depends on the
    # default solver of the installed scikit-learn - confirm.
    lr_grid = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2, 3, 4, 5, 8, 10]}
    grid_search_lr = GridSearchCV(
        LogisticRegression(), lr_grid, scoring='accuracy')
    grid_search_lr.fit(X_train, y_train)

    # decision tree
    dt_grid = {
        'max_depth': [3, 10, 20, 30],
        'max_leaf_nodes': [2, 4, 6, 8],
        'min_samples_leaf': [1, 2, 3],
        'min_samples_split': [2, 4, 6],
    }
    grid_search_dt = RandomizedSearchCV(
        DecisionTreeClassifier(), dt_grid, cv=n_folds, scoring='accuracy')
    grid_search_dt.fit(X_train, y_train)

    # plot performances: one accuracy per fold and model
    tuned_models = [
        ('SVM', grid_search_svc),
        ('LogisticRegression', grid_search_lr),
        ('DecisionTree', grid_search_dt),
    ]
    model_labels = []
    fold_accuracies = []
    for label, search in tuned_models:
        model_labels.extend([label] * n_folds)
        fold_accuracies.extend(
            cross_val_score(search.best_estimator_, X_train, y_train,
                            cv=n_folds))

    data = pd.DataFrame({'Model': model_labels, 'Accuracy': fold_accuracies})
    # Ordered categorical fixes the left-to-right order of the boxes.
    data['Model'] = pd.Categorical(
        data['Model'],
        categories=[label for label, _ in tuned_models],
        ordered=True)

    p = pn.ggplot(data, pn.aes('Model', 'Accuracy'))
    p = p + pn.geom_boxplot()
    p = p + pn.ylim(0, 1)
    p.save('./plots/tumor_genotype_prediction/accuracy-model.png')
def gene_log_HR_plot(inFile, pcaFile=None, model=None):
    """Project component-level log hazard ratios back to genes and plot them.

    The ``logHR`` parameter block read from ``inFile`` is split in half: the
    first half is used as the tumor coefficients, the second half as the
    non-tumor coefficients. Means are mapped to gene space with
    ``pca.inverse_transform``; standard deviations are propagated through
    ``pca.components_`` as the root of the summed squared contributions.

    Args:
        inFile: parameter file passed to ``get_params``.
        pcaFile: pickled PCA object; defaults to ``inFile`` with
            ``"_params.hdf5"`` replaced by ``"_pca.pkl"``.
        model: optional object whose ``counts.index`` supplies the gene
            labels; without it genes are labelled "1".."n".

    Returns:
        ``(plot, logHR_df)`` - a scatter of tumor vs. non-tumor logHR with
        an identity line, and the per-gene table with logHR, sd, Z and
        two-sided p-value for both groups.
    """
    # get logHRs
    par = get_params(inFile)
    # ">> 1" halves the row count: rows are [tumor..., non-tumor...]
    pca_components = par["means"]["logHR"].shape[0] >> 1
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)
    t_logHR = par["means"]["logHR"][components, 0]
    tf_logHR = par["means"]["logHR"][tf_components, 0]
    t_logHR_sd = par["stds"]["logHR"][components, 0]
    tf_logHR_sd = par["stds"]["logHR"][tf_components, 0]

    # get pca
    if pcaFile is None:
        pcaFile = inFile.replace("_params.hdf5", "_pca.pkl")
    # NOTE: unpickling executes code stored in the file; only load PCA files
    # from a trusted source.
    with open(pcaFile, "rb") as buff:
        pca = pickle.load(buff)

    # prep dataframe
    n_genes = pca.components_.shape[1]
    if model is None:
        logHR_df = pd.DataFrame(index=[f"{i+1}" for i in range(n_genes)])
    else:
        logHR_df = pd.DataFrame(index=model.counts.index)
    logHR_df["tumor logHR"] = pca.inverse_transform(t_logHR)
    logHR_df["non-tumor logHR"] = pca.inverse_transform(tf_logHR)
    logHR_df["tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * t_logHR_sd[:, None])**2, axis=0))
    logHR_df["non-tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * tf_logHR_sd[:, None])**2, axis=0))
    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    # Each Z-score is normalised by the sd of its own group (the non-tumor
    # score was previously divided by the tumor sd).
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["non-tumor logHR sd"])
    # two-sided p-values under a standard normal
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2

    # make plot: identical limits on both axes so the identity line is
    # directly comparable
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(pn.aes("non-tumor logHR", "tumor logHR"), logHR_df) +
          pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.theme_minimal() +
          pn.geom_point(alpha=0.3, color="red") + pn.geom_abline())
    return pl, logHR_df
def plot_cor(df):
    """Plot each column pair's correlation with its interval as a band.

    ``df`` must provide ``col_1``, ``col_2``, ``corr``, ``lower``, ``upper``
    and ``p_value``. Rows without a correlation are dropped. Each remaining
    pair gets a rectangle from ``lower`` to ``upper`` and a segment at
    ``corr``; the rectangle fill is ``'#abaeb3'`` when ``p_value > 0.05``
    and ``'b'`` otherwise. Axes are flipped, so pairs run along the
    vertical axis.

    Returns:
        The assembled ggplot object.
    """
    # keep only rows that have a correlation
    out = df[df['corr'].notnull()]

    # label for the pair of columns
    out = out.assign(pair=out.col_1 + '&' + out.col_2)

    # textual sign of the correlation (zero counts as 'Negative')
    out['sign'] = ['Positive' if value > 0 else 'Negative'
                   for value in out['corr']]

    # position index counting down from the number of rows to 1
    n_rows = out.shape[0]
    out['ind'] = list(range(n_rows, 0, -1))

    # geometry shared by the band and the correlation marker
    band_start = out.ind.values - 0.4
    band_end = out.ind.values + 0.4
    corr_values = out['corr'].values
    band_fill = ['#abaeb3' if p_val > 0.05 else 'b' for p_val in out.p_value]

    zero_line = p9.geom_hline(
        yintercept=0, linetype="dashed", color="#c2c6cc")
    interval_band = p9.geom_rect(
        alpha=0.4,
        xmin=band_start,
        xmax=band_end,
        ymin=out.lower.values,
        ymax=out.upper.values,
        fill=band_fill)
    corr_marker = p9.geom_segment(
        x=band_start, y=corr_values, xend=band_end, yend=corr_values)

    ggplt = p9.ggplot(data=out, mapping=p9.aes(x='pair', y='corr'))
    for component in (
            zero_line,
            interval_band,
            corr_marker,
            p9.coord_flip(),
            p9.ylim(np.min(out.lower.values), np.max(out.upper.values)),
            p9.labs(x="", y="Correlation")):
        ggplt = ggplt + component

    return ggplt
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Build the faceted length histogram with per-method mean markers.

    ``len_df`` needs the columns ``Task``, ``Method`` and ``x``. One panel
    is drawn per ``Task``; inside each panel the density histogram of ``x``
    is overlaid per ``Method`` and the group mean is marked with a vertical
    segment and a one-decimal text label.

    Args:
        len_df: DataFrame with one row per example.
        legend_position: value for ``theme(legend_position=...)``.
        legend_box: value for ``theme(legend_box=...)``.

    Returns:
        The assembled ggplot object.
    """
    group_means = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # Constant column (named with a single space) mapped to linetype below.
    group_means[' '] = 'Mean Length'

    histogram_layers = [
        aes(x='x', fill='Method', y='..density..'),
        geom_histogram(binwidth=2, position='identity', alpha=.6),
    ]
    mean_layers = [
        geom_text(
            aes(x='x', y=.22, label='x', color='Method'),
            group_means,
            inherit_aes=False,
            format_string='{:.1f}',
            show_legend=False,
        ),
        geom_segment(
            aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
            group_means,
            inherit_aes=False,
            color='black',
        ),
        scale_linetype_manual(['dashed']),
    ]
    layout_and_axes = [
        facet_wrap('Task'),
        xlim(0, 20),
        ylim(0, .23),
        xlab('Example Length'),
        ylab('Frequency'),
    ]
    colours_and_theme = [
        scale_color_manual(values=COLORS),
        scale_fill_manual(values=COLORS),
        theme_fs(),
        theme(
            aspect_ratio=1,
            legend_title=element_blank(),
            legend_position=legend_position,
            legend_box=legend_box,
        ),
    ]

    # sum() with an explicit start folds "+" left to right, so components
    # are added in exactly the listed order.
    ordered_components = (histogram_layers + mean_layers
                          + layout_and_axes + colours_and_theme)
    return sum(ordered_components, ggplot(len_df))
print("\n\nThe predicted acceptable range at age ", str(age), " is from ", str(min_acceptable_range), " to ", str(max_acceptable_range), "\n\n") # save csv file outlierfile = filename.replace('.csv', '_outliers.csv') data_output.to_csv(outlierfile, index=False) # plot overlay of IQR and mod-Z score outliers p = ( p9.ggplot(data=data_output, mapping=p9.aes(x='age_rounded', y='value', group='age_rounded')) + p9.geom_jitter(mapping=p9.aes(color='z_outlier', outlier_alpha=0.1)) + p9.geom_boxplot(outlier_size=0, outlier_stroke=0) + p9.ggtitle( "Outliers detected via the IQR method (boxplot)\nand modified z-score method (dotplot)" ) + p9.ylim(-10, 175)) print(p) plotfile = filename.replace('.csv', '_outlierplot') p9.ggsave(plot=p, filename=plotfile) # plot regression x = data_stats_regression['age_rounded'] y = data_stats_regression['median'] plt.plot(x, y, 'o') plt.plot(x, r.func_linear(x, *linear_coeff)) plt.plot(x, r.func_log(x, *log10_coeff)) plt.plot(x, r.func_ln(x, *ln_coeff)) plt.title( "Regression performed on medians of age 1, 3 and 5\ndata with outliers removed" ) plt.show()
def log_HR_plot(inFile, label_unit=10, log_scale_color=True):
    """Plot tumor vs. non-tumor log hazard ratios per PCA component.

    The ``logHR`` parameter block read from ``inFile`` is split in half: the
    first half is used as the tumor coefficients, the second half as the
    non-tumor coefficients. For every component the table holds logHR, sd,
    Z-score, two-sided p-value and -log10(p-value) for both groups.

    Args:
        inFile: parameter file passed to ``get_params``.
        label_unit: components with 0-based position ``<= label_unit`` get
            a text label (their 1-based number); the rest are unlabelled.
        log_scale_color: if true, colour and fill scales of the first plot
            use a log transform.

    Returns:
        ``(pl, pl_p, logHR_df)`` - the logHR scatter (coloured by non-tumor
        p-value, filled by tumor p-value), the -log10(p-value) scatter
        (coloured by component), and the per-component table.
    """
    par = get_params(inFile)
    # ">> 1" halves the row count: rows are [tumor..., non-tumor...]
    pca_components = par["means"]["logHR"].shape[0] >> 1
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)

    logHR_df = pd.DataFrame(index=[f"{i+1}" for i in components])
    logHR_df["tumor logHR"] = par["means"]["logHR"][components, 0]
    logHR_df["non-tumor logHR"] = par["means"]["logHR"][tf_components, 0]
    logHR_df["component"] = components
    logHR_df["label"] = [
        logHR_df.index[i] if i <= label_unit else "" for i in components
    ]
    logHR_df["tumor logHR sd"] = par["stds"]["logHR"][components, 0]
    logHR_df["non-tumor logHR sd"] = par["stds"]["logHR"][tf_components, 0]
    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    # Each Z-score is normalised by the sd of its own group (the non-tumor
    # score was previously divided by the tumor sd).
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["non-tumor logHR sd"])
    # two-sided p-values under a standard normal
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2
    logHR_df["tumor -log10(p-value)"] = -np.log10(logHR_df["tumor p-value"])
    logHR_df["non-tumor -log10(p-value)"] = -np.log10(
        logHR_df["non-tumor p-value"])

    # First plot: identical limits on both axes so the identity line is
    # directly comparable.
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(
        pn.aes(
            "non-tumor logHR",
            "tumor logHR",
            color="non-tumor p-value",
            fill="tumor p-value",
            label="label",
        ),
        logHR_df,
    ) + pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.geom_abline() +
          pn.geom_point() + pn.theme_minimal() +
          pn.geom_text(ha="left", va="bottom", color="black"))
    if log_scale_color:
        pl += pn.scale_color_cmap(trans="log")
        pl += pn.scale_fill_cmap(trans="log")

    # Second plot: same idea on the -log10(p-value) scale.
    lb = min(
        logHR_df["non-tumor -log10(p-value)"].min(),
        logHR_df["tumor -log10(p-value)"].min(),
    )
    ub = max(
        logHR_df["non-tumor -log10(p-value)"].max(),
        logHR_df["tumor -log10(p-value)"].max(),
    )
    pl_p = (pn.ggplot(
        pn.aes(
            "non-tumor -log10(p-value)",
            "tumor -log10(p-value)",
            color="component",
            label="label",
        ),
        logHR_df,
    ) + pn.geom_point() + pn.xlim(lb, ub) + pn.ylim(lb, ub) +
            pn.theme_minimal() +
            pn.geom_text(ha="left", va="bottom", color="black"))
    return pl, pl_p, logHR_df
def gene_profile(genes: list,
                 weights: pd.DataFrame,
                 stddev: pd.DataFrame=None,
                 y_axis_label: str=None,
                 highlight_n: int=None,
                 highlight_anno: list=None,
                 figsize: tuple=None,
                 ylim: tuple=None) -> p9.ggplot:
    """
    Plot per-annotation weights for the requested genes, one facet per gene.

    Parameters
    ----------
    genes : a single str or list of genes to include in plot as facets
        (matched case-insensitively against the index of `weights`)
    weights : DataFrame of ES weights (genes as index, annotations as columns)
    stddev : DataFrame, optional (default: None)
        Same layout as `weights`; when given, error bars of
        weight +/- stddev are drawn.
    y_axis_label : str, optional (default: "Expression Specificity")
    highlight_n : number of highest ESw to highlight
        (default 5 when `highlight_anno` is not given either)
    highlight_anno : specific annotations to highlight
        (takes precedence over `highlight_n`)
    figsize : (float, float), optional (default: None)
        Specify width and height of plot. Defaults to (15, 5 * n_genes).
    ylim : (float, float), optional (default: (-1, 1))

    Returns
    -------
    g : ggplot

    Raises
    ------
    AssertionError
        If none of `genes` is found in `weights`, or - when `stddev` is
        given - none is found in `stddev`.

    Todo:
    * find a better way for sorting cell-types along x-axis
    * report if gene in genes is not found in df
    * report if duplicate genes
    * replace hacky x-axis labelling
    """

    ### Reduce dataframe to genes of interest
    # A bare string would otherwise be iterated character by character.
    if isinstance(genes, str):
        genes = [genes]
    genes = [str.upper(s) for s in genes]
    idx = np.char.upper(weights.index.values.astype(str))
    mask = np.isin(idx, genes)
    df_tidy = weights[mask]
    n_genes = len(df_tidy)
    assert (n_genes >= 1), "No matching genes found in dataframe."

    stddev_tidy = None
    if stddev is not None:
        idx = np.char.upper(stddev.index.values.astype(str))
        mask = np.isin(idx, genes)
        stddev_tidy = stddev[mask]
        # Check the stddev selection itself (the weights selection was
        # already validated above).
        assert (len(stddev_tidy) >= 1), \
            "No matching genes found in stddev dataframe."

    # Constants, height and width of plot.
    if figsize is None:
        H = 5*n_genes
        W = 15
    else:
        W, H = figsize

    if ylim is None:
        ylim = (-1,1)

    if y_axis_label is None:
        y_axis_label = "Expression Specificity"

    ### Convert to tidy / long format if necessary
    # Org:
    #        ABC   ACBG  ACMB
    # POMC   0.0   0.5   0.9
    # AGRP   0.2   0.0   0.0
    # LEPR   0.1   0.1   0.4

    # Tidy:
    #   gene_name annotation  es_weight
    # 1 POMC      ABC         0.0
    # 2 AGRP      ABC         0.6
    # 3 LEPR      ABC         1.0
    df_tidy.index.name = None # ensure that index name is none, so "index" is used for id_vars
    df_tidy = pd.melt(df_tidy.reset_index(), id_vars="index", var_name="annotation", value_name="weight")

    if stddev_tidy is not None:
        stddev_tidy.index.name = None
        stddev_tidy = pd.melt(stddev_tidy.reset_index(), id_vars="index", var_name="annotation", value_name="stddev")
        df_tidy = df_tidy.merge(stddev_tidy, on=["index", "annotation"])

    ### Sort values by gene_name and es_weight and add order
    # Sorted:
    #   gene_name annotation  es_weight x_order
    # 1 AGRP      MOL2        0.0       1
    # 2 AGRP      ACNT1       0.1       2
    # 3 AGRP      MOL1        0.2       3
    df_tidy = df_tidy.sort_values(by=["index", "weight"])
    df_tidy["order"] = np.arange(len(df_tidy)) + 1

    ### Generate highlight
    # Default: highlight top 5
    if ((highlight_n is None) and (highlight_anno is None)):
        highlight_n = 5

    # highlight list of annotations, else the top-n weights per gene
    if (highlight_anno is not None):
        df_tidy["highlight"] = df_tidy["annotation"].isin(highlight_anno)
    elif (highlight_n is not None):
        # "order" increases with weight inside a gene, so a descending rank
        # of order picks the highest weights.
        df_tidy["highlight"] = df_tidy.groupby("index")["order"].rank("first", ascending=False) <= highlight_n
    else:
        df_tidy["highlight"] = np.array([False] * len(df_tidy))
    df_highlight = df_tidy[df_tidy["highlight"]]

    ### Plot
    # linear function to compute x_axis text-size.
    # Mainly depends on number of genes in df per facet, i.e. len(df_tidy) / len(genes).
    SIZE_TEXT_X_AXIS = 10.161 - 0.023 * (len(df_tidy) / len(genes))

    # Limits of the order for each index gene / facet, e.g. [0, 266, 531]
    # These limits are necessary to only plot the labels
    order_lims = [0, *(df_tidy.groupby("index")["order"].max().values)]

    def find_nearest(array,value):
        # Return the element of `array` closest to `value`.
        array = np.asarray(array)
        idx = (np.abs(array - value)).argmin()
        return array[idx]

    def getbreaks(lims):
        # function defined for use in debugging
        # Snap the facet's axis limits to the nearest per-gene order limits
        # and place one break per integer in between.
        l = find_nearest(order_lims, lims[0])
        r = find_nearest(order_lims, lims[1])
        breaks = np.arange(l, r)
        return breaks

    def getlbls(idx):
        # function defined for use in debugging
        # NOTE(review): "order" is 1-based while iloc is 0-based, so the
        # label at break k is the annotation of the row with order k + 1 -
        # confirm whether this one-position shift is intended.
        lbls = df_tidy["annotation"].iloc[idx].values
        return lbls

    p = (
        ### data
        p9.ggplot(data=df_tidy, mapping=p9.aes(x="order", y="weight", label="annotation"))

        ### theming
        + p9.theme_classic()
        + p9.theme(
            figure_size = (W,H),
            axis_ticks_major_x = p9.element_blank(),
            axis_text_x = p9.element_text(rotation=75, hjust=0, size=SIZE_TEXT_X_AXIS),
            # axis_text_y = p9.element_text(size=W),
            panel_spacing = 1,
            strip_background = p9.element_blank()
        )
        + p9.ylim(ylim[0],ylim[1])
        + p9.labs(
            x="", # e.g. "Cell-type"
            y=y_axis_label, # e.g. "ES weight"
        )

        ### viz
        # all
        + p9.geom_segment(mapping=p9.aes(x="order", xend="order", y=0, yend="weight"),
                          color="grey",
                          alpha=0.3,
                          show_legend=False
                          )
        + p9.geom_point(mapping=p9.aes(size=2),
                        color="grey",
                        show_legend=False
                        )
        # highlight
        + p9.geom_point(data=df_highlight,
                        mapping=p9.aes(size=2),
                        color="dodgerblue",
                        show_legend=False
                        )
        + p9.geom_segment(data=df_highlight,
                          mapping=p9.aes(x="order", xend="order", y=0, yend="weight"),
                          color="dodgerblue",
                          alpha=0.3,
                          show_legend=False
                          )
        + p9.facet_wrap("index",
                        scales="free",
                        nrow=n_genes
                        )
        + p9.scale_x_continuous(
            # order_scale is continuous across all annotations
            # so the scale will look weird for each facet, e.g.
            # facet 1 may have order 1-7, and facet 2 has order 8-14.
            # therefore we must use a labeller function to get the
            # correct labels for each interval of order.
            breaks = lambda lims: getbreaks(lims),
            labels = lambda idx: getlbls(idx)
        )
    )

    if stddev_tidy is not None:
        p = p + p9.geom_errorbar(mapping=p9.aes(ymin="weight-stddev", ymax="weight+stddev"),
                                 color="grey",
                                 width=0.1)\
              + p9.geom_errorbar(data=df_highlight,
                                 mapping=p9.aes(ymin="weight-stddev", ymax="weight+stddev"),
                                 color="dodgerblue",
                                 width=0.1)

    # add labels last for them to be on top
    p = p + p9.geom_label(data=df_highlight,
                          color = "dodgerblue",
                          adjust_text = {'expand_points': (2,2)}
                          )

    return p
targene_geo_mutant = output[output['status_sign'] == 1] targene_geo_wt = output[output['status_sign'] == -1] # Output t-test results t_results_geo_targene = ttest_ind(a = targene_geo_mutant['weight'], b = targene_geo_wt['weight'], equal_var = False) print('Statistic = {:.2f}, p = {:.2E}'.format(t_results_geo_targene[0], Decimal(t_results_geo_targene[1]))) # graphical output for predictions p = (gg.ggplot(output, gg.aes(x='weight', y='dummy_y', color='factor(status_sign)')) + gg.geom_hline(gg.aes(yintercept=0), linetype='solid') + gg.geom_point(size=4) + gg.scale_color_manual(values=["#377eb8", "#ff7f00"], labels=['WT', 'Mutant']) + gg.ylim([-0.1, 0.1]) + gg.xlim([-0.001, 1.001]) + gg.theme_seaborn(style='whitegrid') + gg.xlab('Targene Classifier Score') + gg.ylab('') + gg.labs(color='Sample_status') + gg.ggtitle('Mutant vs WT \n') + gg.theme( plot_title=gg.element_text(size=22), axis_title_x=gg.element_text(size=16), axis_text_x=gg.element_text(size=16), axis_text_y=gg.element_blank(), axis_ticks_length=4, axis_ticks_major_y=gg.element_blank(), axis_ticks_minor_y=gg.element_blank(), axis_ticks_minor_x=gg.element_blank(),
index=image_meta_col_list + ["Ch"], columns=["type"]).reset_index() cp_sat_df.columns = image_meta_col_list + [ "Ch", "PercentMax", "StdIntensity" ] cp_saturation_ymax = max(cp_sat_df.PercentMax) if cp_saturation_ymax < 1: cp_saturation_ymax = 1 cp_saturation_gg = ( gg.ggplot( cp_sat_df, gg.aes(x="StdIntensity", y="PercentMax", label=image_cols["site"]), ) + gg.coord_fixed(ratio=0.25) + gg.geom_text(size=6) + gg.ylim([0, cp_saturation_ymax]) + gg.facet_wrap(["Ch", image_cols["well"]], nrow=len(painting_image_names), scales="free") + gg.theme_bw() + gg.ggtitle(f"Cell Painting Image Saturation \n {plate}") + gg.theme( strip_background=gg.element_rect(colour="black", fill="#fdfff4"), strip_text=gg.element_text(size=7), axis_text=gg.element_text(size=6), subplots_adjust={"wspace": 0.2}, )) output_file = pathlib.Path(output_figuresdir, "cp_saturation.png") if check_if_write(output_file, force, throw_warning=True): cp_saturation_gg.save( output_file, dpi=300, width=(len(cp_sat_df[image_cols["well"]].unique()) + 2),
def barchart_make(roi, df, list_rois, config, ylimit, save_function, find_ylim_function):
    """Build the bar chart for one ROI and save it unless only a limit is wanted.

    Args:
        roi: position of the ROI name inside ``list_rois``.
        df: DataFrame with an ``index`` column holding ROI names plus
            ``Mean``, ``Conf_Int_95`` and the columns named in ``config``.
        list_rois: sequence of ROI names.
        config: settings object; the ``single_roi_fig_*`` attributes,
            ``colorblind_friendly_plot_colours``, ``plot_dpi`` and
            ``use_same_axis_limits`` are read.
        ylimit: upper y limit to apply; a falsy value applies none.
        save_function: called as
            ``save_function(figure, name, config, folder, 'barchart')``.
        find_ylim_function: called as
            ``find_ylim_function(name, figure, 'yaxis')`` to obtain this
            figure's y limit.

    Returns:
        The value from ``find_ylim_function`` when it was called, else 0.
    """
    thisroi = list_rois[roi]
    x_column = config.single_roi_fig_x_axis

    current_df = df.loc[df['index'] == thisroi]
    current_df = current_df.sort_values([x_column])
    current_df = current_df.reset_index(drop=True)  # Reset index to remove grouping

    # Categorical in order of appearance keeps the bars in sorted order.
    current_df[x_column] = pd.Categorical(
        current_df[x_column], categories=current_df[x_column].unique())

    def dodge():
        # Each geom gets its own position object, as separate instances.
        return pltn.position_dodge(preserve='single', width=0.8)

    base = pltn.ggplot(
        current_df,
        pltn.aes(x=x_column,
                 y='Mean',
                 ymin="Mean-Conf_Int_95",
                 ymax="Mean+Conf_Int_95",
                 fill='factor({colour})'.format(
                     colour=config.single_roi_fig_colour)))

    text_style = pltn.theme(
        panel_grid_major_x=pltn.element_line(alpha=0),
        axis_title_x=pltn.element_text(weight='bold', color='black', size=20),
        axis_title_y=pltn.element_text(weight='bold', color='black', size=20),
        axis_text_y=pltn.element_text(size=20, color='black'),
        legend_title=pltn.element_text(size=20, color='black'),
        legend_text=pltn.element_text(size=18, color='black'),
        subplots_adjust={'right': 0.85},
        legend_position=(0.9, 0.8),
        dpi=config.plot_dpi)

    figure = (
        base
        + pltn.theme_538()
        + pltn.geom_col(position=dodge(), width=0.8, na_rm=True)
        + pltn.geom_errorbar(size=1, position=dodge())
        + pltn.labs(x=config.single_roi_fig_label_x,
                    y=config.single_roi_fig_label_y,
                    fill=config.single_roi_fig_label_fill)
        # Axis tick labels are blanked; the x value is drawn as text at
        # y = -0.7 by the geom_text below instead.
        + pltn.scale_x_discrete(labels=[])
        + text_style
        + pltn.geom_text(pltn.aes(y=-.7, label=x_column),
                         color='black', size=20, va='top')
        + pltn.scale_fill_manual(
            values=config.colorblind_friendly_plot_colours))

    if ylimit:
        # Set y limit of figure (used to make it the same for every barchart)
        figure += pltn.ylim(None, ylimit)
        thisroi += '_same_ylim'

    returned_ylim = 0
    limits_mode = config.use_same_axis_limits

    if limits_mode in ('Same limits', 'Create both') and ylimit == 0:
        returned_ylim = find_ylim_function(thisroi, figure, 'yaxis')

    if limits_mode == 'Same limits' and ylimit == 0:
        # Only the limit is reported in this case; nothing is saved.
        return returned_ylim

    folder = 'Same_yaxis' if ylimit != 0 else 'Different_yaxis'
    save_function(figure, thisroi, config, folder, 'barchart')
    return returned_ylim
cp_sat_df, index=image_meta_col_list + ["Ch"], columns=["type"] ).reset_index() cp_sat_df.columns = image_meta_col_list + ["Ch", "PercentMax", "StdIntensity"] cp_saturation_ymax = max(cp_sat_df.PercentMax) if cp_saturation_ymax < 1: cp_saturation_ymax = 1 cp_saturation_gg = ( gg.ggplot( cp_sat_df, gg.aes(x="StdIntensity", y="PercentMax", label=image_cols["site"]), ) + gg.coord_fixed(ratio=0.25) + gg.geom_text(size=6) + gg.ylim([0, cp_saturation_ymax]) + gg.facet_wrap( ["Ch", image_cols["well"]], nrow=len(painting_image_names), scales="free" ) + gg.theme_bw() + gg.ggtitle(f"Cell Painting Image Saturation \n {plate}") + gg.theme( strip_background=gg.element_rect(colour="black", fill="#fdfff4"), strip_text=gg.element_text(size=7), axis_text=gg.element_text(size=6), subplots_adjust={"wspace": 0.2}, ) ) output_file = pathlib.Path(output_figuresdir, "cp_saturation.png") if check_if_write(output_file, force, throw_warning=True): cp_saturation_gg.save(
out_i = pandas.DataFrame(sim_res_fwd[i], columns=out.columns[3:]) out_i['time'] = t out_i['signal'] = C3_scan[i] out_i['dir'] = 'fwd' out = pandas.concat([out, out_i[out.columns]]) for i in range(len(sim_res_rev)): out_i = pandas.DataFrame(sim_res_rev[i], columns=out.columns[3:]) out_i['time'] = t out_i['signal'] = numpy.flip(C3_scan)[i] out_i['dir'] = 'rev' out = pandas.concat([out, out_i[out.columns]]) out.to_csv("sim.txt", sep="\t", index=False) ###################### plotting ################################## g = (ggplot(out, aes('time', 's2', group='signal', color='signal')) + geom_line(size=0.5) + ylim(0, 20000) + scale_color_distiller(palette='RdYlBu', type="diverging") + facet_wrap('~dir') + theme_bw()) g.save(filename="./num_cont_graphs/sim_fwd_rev.png", format="png", width=8, height=4, units='in', verbose=False) eq = out[out.time == max(out.time)] g = (ggplot(eq) + aes(x='signal', y='s2', color='dir') + geom_path(size=2, alpha=0.5) + geom_point(color="black") + theme_bw()) g.save(filename="./num_cont_graphs/sim_bif_diag.png", format="png", width=8,
sensitivities.append(0)
especifities_1.append(0)  # so that the plotted curve ends on the diagonal

# now draw the curve
import matplotlib.pyplot as plt
# The string below is a bare expression statement with no effect; it holds a
# disabled matplotlib version of the ROC plot.
"""%matplotlib inline
plt.plot(especifities_1,sensitivities, marker="o", linestyle="--", color="r")
x=[i*0.01 for i in range(100)]
y=[i*0.01 for i in range(100)]
plt.plot(x,y) #pinto la diagonal (el peor modelo que existe)
plt.xlabel("1-Especificidad")
plt.ylabel("Sensibilidad")
plt.title("Curva ROC") #recordemos que mi seleccion de variables era una mierda absoluta
"""

# the larger the area between the curve and the diagonal, the better the
# predictive model
from sklearn import metrics
from plotnine import ggplot, aes, geom_line, geom_area, ggtitle, xlim, ylim  # to import everything, use * instead

# First two outputs of roc_curve become the x and y of the curve; the third
# output is discarded.
espec_1, sensit, _ = metrics.roc_curve(Y_test, prob)
df = pd.DataFrame({"x": espec_1, "y": sensit})
auc = metrics.auc(espec_1, sensit)  # area under the curve
print(df.head())

# ROC curve with both axes padded slightly beyond [0, 1].
# NOTE(review): the second geom_line uses the same data and mapping as the
# first, so it redraws the curve dashed rather than adding a diagonal -
# confirm intent.
print(
    ggplot(df, aes(x="x", y="y")) + geom_line() +
    geom_line(linetype="dashed") + xlim(-0.01, 1.01) + ylim(-0.01, 1.01))

# Same curve with the area beneath it shaded and the AUC in the title.
print(
    ggplot(df, aes(x="x", y="y")) + geom_area(alpha=0.25) +
    geom_line(aes(y="y")) + ggtitle("Curva ROC y AUC=%s " % str(auc)))
lmb_data['demvoteshare_c'] = lmb_data['demvoteshare'] - 0.5 # drop missing values lmb_data = lmb_data[~pd.isnull(lmb_data.demvoteshare_c)] lmb_data['demvoteshare_sq'] = lmb_data['demvoteshare_c']**2 #aggregating the data lmb_data = lmb_data[lmb_data.demvoteshare.between(.45, .55)] categories = lmb_data.lagdemvoteshare lmb_data['lagdemvoteshare_100'] = pd.cut(lmb_data.lagdemvoteshare, 100) agg_lmb_data = lmb_data.groupby('lagdemvoteshare_100')['score'].mean().reset_index() lmb_data['gg_group'] = [1 if x>.5 else 0 for x in lmb_data.lagdemvoteshare] agg_lmb_data['lagdemvoteshare'] = np.arange(0.01, 1.01, .01) # plotting p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score')) + p.geom_point(p.aes(x = 'lagdemvoteshare', y = 'score'), data = agg_lmb_data) + p.stat_smooth(p.aes('lagdemvoteshare', 'score', group = 'gg_group'), data=lmb_data, method = "lm", formula = 'y ~ x + I(x**2)') +\ p.xlim(0,1) + p.ylim(0,100) +\ p.geom_vline(xintercept = 0.5) p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score')) + p.geom_point(p.aes(x = 'lagdemvoteshare', y = 'score'), data = agg_lmb_data) + p.stat_smooth(p.aes('lagdemvoteshare', 'score', group = 'gg_group'), data=lmb_data, method = "lowess") +\ p.xlim(0,1) + p.ylim(0,100) +\ p.geom_vline(xintercept = 0.5) p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score')) + p.geom_point(p.aes(x = 'lagdemvoteshare', y = 'score'), data = agg_lmb_data) + p.stat_smooth(p.aes('lagdemvoteshare', 'score', group = 'gg_group'), data=lmb_data, method = "lm")+\ p.xlim(0,1) + p.ylim(0,100) +\ p.geom_vline(xintercept = 0.5)
x_axis_label = 'T-SNE Component 1' y_axis_label = 'T-SNE Component 2' xlim = [tsne_results_df.iloc[:, 0].min(), tsne_results_df.iloc[:, 0].max()] ylim = [tsne_results_df.iloc[:, 1].min(), tsne_results_df.iloc[:, 1].max()] plot = (p9.ggplot(tsne_results_df, p9.aes(y=tsne_results_df.columns[1], x=tsne_results_df.columns[0], group=clusters_colname, color=clusters_colname )) + p9.geom_point(size=2) + p9.geom_rug() + p9.stat_ellipse() + p9.xlim(xlim[0], xlim[1]) + p9.ylim(ylim[0], ylim[1]) #+ p9.scale_color_gradient(low='blue', high='yellow') #+ p9.scale_color_manual(values=colors) + p9.theme_light(base_size=18) + p9.ggtitle(plot_title) + p9.labs(y=y_axis_label, x=x_axis_label) ) plot_filename = 'shap_clusters.png' plot.save(plot_filename, width=10, height=10) from IPython.display import Image Image(filename=plot_filename) # + [markdown] '''
overall_preprint_survival = kmf.survival_function_.reset_index().assign( label="all_papers" ) overall_preprint_survival.head() g = ( p9.ggplot( overall_preprint_survival.assign( timeline=lambda x: pd.to_timedelta(x.timeline, "D") ), p9.aes(x="timeline", y="KM_estimate", color="label"), ) + p9.scale_x_timedelta(labels=timedelta_format("d")) + p9.geom_line() + p9.ylim(0, 1) ) print(g) # # Calculate Category Survival Function # This section measures how long it takes for certain categories to get preprints published. entire_preprint_df = pd.DataFrame([], columns=["timeline", "KM_estimate", "category"]) half_life = [] for cat, grouped_df in preprints_w_published_dates.groupby("category"): temp_df = preprints_w_published_dates.query(f"category=='{cat}'") kmf.fit( temp_df["time_to_published"].dt.total_seconds() / 60 / 60 / 24, event_observed=~temp_df["published_doi"].isna(), )
def main():
    """Draw a violin plot from one column of each input file.

    Options come from ``UserInput()``. The attributes read here are
    ``infile``, ``sep``, ``col`` (1-based column index), ``col_names``,
    ``x_name``, ``y_name``, ``y_lim``, ``size``, ``title``, ``use_p9``,
    ``outpref``, ``img`` and ``dpi``.

    Each file is read as delimited text, the selected column is melted to
    long form, and all files are stacked into one frame with columns
    ``x_name`` and ``y_name``. The plot is drawn with plotnine when
    ``use_p9`` is set, otherwise with seaborn, and written to
    ``<outpref>.violin.<img>``.

    When no size is supplied the plotting library's default figure size is
    used instead of failing on a missing value.
    """
    args = UserInput()

    y_lim = np.array(args.y_lim, dtype=np.float32) if args.y_lim else None
    # A missing or empty size becomes None so both back-ends can fall back
    # to their defaults.
    size = np.array(args.size, dtype=np.float32) if args.size else None
    if size is not None:
        width, height = size[0], size[1]
    else:
        width = height = None

    ###################################
    df_list = [
        pd.read_csv(f, sep=args.sep, skipinitialspace=True)
        for f in args.infile
    ]

    ## keep only the 1-based column `args.col` of every input file
    lg_list = []
    for idx, df in enumerate(df_list):
        xdf = pd.DataFrame(df.iloc[:, int(args.col) - 1])
        if args.col_names:
            xdf.columns = [args.col_names[idx]]
        # melt gives (variable, value) rows: column name and measurement
        lg_list.append(pd.melt(xdf))

    lg_df = pd.concat(lg_list)
    lg_df.columns = [args.x_name, args.y_name]
    print(lg_df)

    out_name = '{0}.violin.{1}'.format(args.outpref, args.img)

    ## plotnine method
    if args.use_p9:
        import plotnine as p9

        quantiles = [.25, .5, .75]
        if y_lim is not None:
            set_ylim = p9.ylim(y_lim)
        else:
            # No explicit limits: span the observed data range.
            set_ylim = p9.ylim(
                [lg_df[args.y_name].min(), lg_df[args.y_name].max()])

        df_plot = (
            p9.ggplot(
                lg_df,
                p9.aes(x=args.x_name, y=args.y_name, fill=args.x_name))
            + p9.geom_violin(
                width=.75, draw_quantiles=quantiles, show_legend=False)
            + p9.ggtitle(args.title)
            + p9.theme_classic()
            + set_ylim
            + p9.scale_x_discrete(limits=args.col_names)
            + p9.theme(
                text=p9.element_text(size=12, color='black'),
                axis_text_x=p9.element_text(angle=33),
                panel_grid_major_y=p9.element_line(color='gray', alpha=.5)))

        # ggplot.save is the method the ggsave helper wraps; calling it
        # directly avoids depending on the helper's keyword handling.
        df_plot.save(filename=out_name,
                     dpi=int(args.dpi),
                     format=args.img,
                     width=width,
                     height=height,
                     units='in',
                     verbose=False)
    else:
        ## Seaborn method
        import seaborn as sns
        sns.set(style='whitegrid')

        # savefig() has no figsize option, so the size must be set on the
        # figure itself before anything is drawn on it.
        if size is not None:
            plt.figure(figsize=(width, height))

        ax = sns.violinplot(x=args.x_name,
                            y=args.y_name,
                            data=lg_df,
                            linewidth=1,
                            inner='box')
        if args.title:
            ax.set_title(args.title)
        if y_lim is not None:
            ax.set(ylim=y_lim)

        plt.savefig(out_name, format=args.img, dpi=int(args.dpi))
        plt.clf()
# Run scale_predictors once per predictor label, SVC first.
# (An 'LDA' predictor is currently left out of the comparison.)
sv, nb, rn, ac, rf, ct = [
    scale_predictors(df, predictor=label)
    for label in ('SVC', 'naive_bayes', 'Random', 'acg_ip_risk',
                  'RandmForest', 'cheating')
]

# Stack the per-predictor frames into one long frame for plotting.
# (Drop `sv` from this list to leave the SVC curve out.)
df2 = pd.concat([nb, rn, ac, rf, sv, ct])

print(df2.head(20))
print(df2.describe())

# Step curve of num_detected against num_examined, one line per classifier.
p = (
    pn.ggplot(
        df2,
        pn.aes(x='num_examined', y='num_detected',
               group='classifier', colour='classifier'),
    )
    + pn.geom_step()
    + pn.ggtitle("How Many ppl would we need to intervene on to prevent Y hospitalizations?")
)
# Optional, currently disabled: pn.scales.scale_x_reverse()
p.save(HOME_DIR + 'all_together_d.png',
       height=8, width=10, units='in', verbose=False)

# Same figure with both axes limited to 0-300.
p2 = p + pn.xlim(0, 300) + pn.ylim(0, 300)
# Optional, currently disabled: pn.scales.scale_x_reverse()
p2.save(HOME_DIR + 'all_together_trunc.png',
        height=8, width=10, units='in', verbose=False)

print("Finished!")
def read_data(file):
    """Read the Stata file named ``file`` from the mixtape GitHub repository."""
    base_url = "https://raw.github.com/scunning1975/mixtape/master/"
    return pd.read_stata(base_url + file)


# Simulate 2500 rows with independent standard-normal `beauty` and `talent`.
# `score` is their sum, `c85` holds the 85th percentile of `score` on every
# row, and `star` is 1 where `score` is above that percentile, otherwise 0.
start_is_born = pd.DataFrame({
    'beauty': np.random.normal(size=2500),
    'talent': np.random.normal(size=2500),
}).assign(
    score=lambda d: d['beauty'] + d['talent'],
    c85=lambda d: np.percentile(d['score'], q=85),
    star=lambda d: (d['score'] > d['c85']).astype('int64'),
)
start_is_born.head()

# Regress beauty on talent over the full sample.
lm = sm.OLS.from_formula('beauty ~ talent', data=start_is_born).fit()

# Everyone.
(
    p.ggplot(start_is_born, p.aes(x='talent', y='beauty'))
    + p.geom_point(size=0.5)
    + p.xlim(-4, 4)
    + p.ylim(-4, 4)
)

# Only the rows with star == 1.
(
    p.ggplot(start_is_born.loc[start_is_born['star'] == 1],
             p.aes(x='talent', y='beauty'))
    + p.geom_point(size=0.5)
    + p.xlim(-4, 4)
    + p.ylim(-4, 4)
)

# Only the rows with star == 0.
(
    p.ggplot(start_is_born.loc[start_is_born['star'] == 0],
             p.aes(x='talent', y='beauty'))
    + p.geom_point(size=0.5)
    + p.xlim(-4, 4)
    + p.ylim(-4, 4)
)
def graph_kda(df):
    """Plot KDA against ``games_ago`` with a colour-coded linear trend line.

    A linear regression of ``kda`` on ``games_ago`` is fitted first. Its
    slope picks the colour of the smoothing line: green when negative, red
    when positive, black otherwise. The x axis is reversed.
    NOTE(review): green on a negative slope presumably means "KDA improving
    towards the most recent game" - confirm the meaning of ``games_ago``.

    Args:
        df: frame with ``games_ago`` and ``kda`` columns; the point layer
            also maps colour to a ``lane`` column.

    Returns:
        The plot object, or ``None`` when the input cannot be used for the
        regression (missing column, wrong type, or data the fit rejects).
        A failure used to surface as ``UnboundLocalError`` from the final
        ``return``, because every exception was swallowed first.
    """
    try:
        # Read the columns before anything else so that unusable input is
        # rejected up front.
        games_ago = df['games_ago'].values.reshape(-1, 1)
        kda_values = df['kda'].values.reshape(-1, 1)
        lm = LinearRegression()
        lm.fit(games_ago, kda_values)
    except (AttributeError, KeyError, TypeError, ValueError):
        return None

    # Single-feature fit: the first coefficient is the slope.
    slope = float(lm.coef_.ravel()[0])
    if slope < 0:
        lm_color = 'green'
    elif slope > 0:
        lm_color = 'red'
    else:
        lm_color = 'black'

    return (
        ggplot(df)
        + aes(x='games_ago', y='kda')
        + geom_line(color="black", linetype='dashed')
        + geom_point(aes(color='lane', size=3), show_legend={'size': False})
        + ylim(0, 30)
        + scale_x_reverse()
        + geom_smooth(method="lm", color=lm_color)
        + xlab('Games Ago')
        + ylab('KDA')
        + ggtitle("KDA Over the Last 5 Games")
    )
roll_credits_times_2 += collection.autorelease() case.record(pokemon, collection, results, slot_machine) print( f"{collection.num_unique()} / {len(pokemon)}, ({len(collection.pokemon)})" ) # Output simulation results data = simulation_data.to_data_frame() num_unique_pokemon_plot = ( plt9.ggplot(data, plt9.aes("roll_num", "num_unique_pokemon", color="case_id")) + plt9.geom_line() + plt9.geom_hline(yintercept=len(pokemon)) + plt9.ylim(0, len(pokemon)) ) num_unique_pokemon_plot.save(args.num_unique_pokemon_plot, dpi=300) print("Output:", args.num_unique_pokemon_plot) data_2 = simulation_data.to_num_missing_data_frame() num_missing_pokemon_plot = ( plt9.ggplot( data_2[data_2["case_id"] == 0], plt9.aes("roll_num", "num_missing", fill="rarity"), ) + plt9.geom_area() + plt9.geom_hline(yintercept=len(pokemon)) + plt9.ylim(0, len(pokemon))
out_i = pandas.DataFrame(sim_res_fwd[i], columns=out.columns[3:]) out_i['time'] = t out_i['signal'] = C3_scan[i] out_i['dir'] = 'Low $[S^{**}]$' out = pandas.concat([out, out_i[out.columns]]) for i in range(len(sim_res_rev)): out_i = pandas.DataFrame(sim_res_rev[i], columns=out.columns[3:]) out_i['time'] = t out_i['signal'] = numpy.flip(C3_scan)[i] out_i['dir'] = 'High $[S^{**}]$' out = pandas.concat([out, out_i[out.columns]]) out.to_csv("./num_cont_graphs/sim2.txt", sep="\t", index=False) ###################### plotting ################################## g = (ggplot(out, aes('time', response, group='signal', color='signal')) + geom_line(size=0.5) + ylim(0, 202) + labs(x="time", y="$[S^{**}]$") + scale_color_distiller( palette='RdYlBu', type="diverging", name="$B_{tot}$") + facet_wrap('~dir') + theme_bw()) g.save(filename="./num_cont_graphs/sim_fwd_rev2.png", format="png", width=8, height=4, units='in', verbose=False) eq = out[out.time == max(out.time)] g = (ggplot(eq) + aes(x='signal', y=response, color='dir') + labs(x="$B_{tot}$", y="$[S^{**}]$", color="") + geom_path(size=2, alpha=0.5) + geom_point(color="black") + theme_bw() +
median_ci_l, median_ci_u # In[9]: overall_preprint_survival = kmf.survival_function_.reset_index().assign( label="all_papers") overall_preprint_survival.head() # In[10]: g = (p9.ggplot( overall_preprint_survival.assign( timeline=lambda x: pd.to_timedelta(x.timeline, "D")), p9.aes(x="timeline", y="KM_estimate", color="label"), ) + p9.scale_x_timedelta(labels=timedelta_format("d")) + p9.geom_line() + p9.ylim(0, 1)) print(g) # # Calculate Category Survival Function # This section measures how long it takes for certain categories to get preprints published. # In[11]: entire_preprint_df = pd.DataFrame( [], columns=["timeline", "KM_estimate", "category"]) half_life = [] for cat, grouped_df in preprints_w_published_dates.groupby("category"): temp_df = preprints_w_published_dates.query(f"category=='{cat}'") kmf.fit( temp_df["time_to_published"].dt.total_seconds() / 60 / 60 / 24,