def test_errorbar_aesthetics():
    """Stack five error-bar layers, each exercising a different aesthetic.

    Every layer is shifted one unit further along x so the layers do not
    overlap. The themed plot is then compared with the
    'errorbar_aesthetics' reference; what that comparison does is defined
    by the surrounding test harness, not by this function.
    """
    layers = [
        # Plain error bars, fixed size.
        geom_errorbar(aes('x'), size=2),
        # Alpha mapped to z, narrower caps.
        geom_errorbar(aes('x+1', alpha='z'), width=0.2, size=2),
        # Linetype mapped to z as a discrete variable.
        geom_errorbar(aes('x+2', linetype='factor(z)'), size=2),
        # Colour mapped to z.
        geom_errorbar(aes('x+3', color='z'), size=2),
        # Size mapped to z (no fixed size on this layer).
        geom_errorbar(aes('x+4', size='z')),
    ]

    p = ggplot(df, aes(ymin='ymin', ymax='ymax'))
    for layer in layers:
        p = p + layer

    assert p + _theme == 'errorbar_aesthetics'
def test_errorbar_aesthetics():
    """Check error bars drawn with alpha, linetype, color and size mappings.

    ymin/ymax are mapped once on the plot; each layer supplies its own x
    expression (x, x+1, ... x+4) so the five variants sit side by side.
    """
    base = ggplot(df, aes(ymin='ymin', ymax='ymax'))

    # Layers are added left to right, exactly one aesthetic varied per layer.
    p = base + geom_errorbar(aes('x'), size=2)
    p = p + geom_errorbar(aes('x+1', alpha='z'), width=0.2, size=2)
    p = p + geom_errorbar(aes('x+2', linetype='factor(z)'), size=2)
    p = p + geom_errorbar(aes('x+3', color='z'), size=2)
    p = p + geom_errorbar(aes('x+4', size='z'))

    assert p + _theme == 'errorbar_aesthetics'
# Plot lst_num_partitions = list(all_svcca.index) threshold = pd.DataFrame(pd.np.tile(permuted_score, (len(lst_num_partitions), 1)), index=lst_num_partitions, columns=['score']) panel_A = ggplot(all_svcca) + geom_line(all_svcca, aes(x=lst_num_partitions, y='score', color='Group'), size=1.5) \ + geom_point(aes(x=lst_num_partitions, y='score'), color ='darkgrey', size=0.5) \ + geom_errorbar(all_svcca, aes(x=lst_num_partitions, ymin='ymin', ymax='ymax'), color='darkgrey') \ + geom_line(threshold, aes(x=lst_num_partitions, y='score'), linetype='dashed', size=1, color="darkgrey", show_legend=False) \ + labs(x = "Number of Partitions", y = "Similarity score (SVCCA)", title = "Similarity across varying numbers of partitions") \ + theme( plot_background=element_rect(fill="white"), panel_background=element_rect(fill="white"), panel_grid_major_x=element_line(color="lightgrey"), panel_grid_major_y=element_line(color="lightgrey"),
## use pandas functionality to compute stat transformations
## (per-class mean and standard error of the mean for Gad1)
gse75386means = (
    gse75386[['class', 'Gad1']]
    .groupby('class')
    .agg(np.mean)
    .iloc[:, 0]
)
gse75386ses = (
    gse75386[['class', 'Gad1']]
    .groupby('class')
    .agg(lambda x: x.std() / np.sqrt(len(x)))
    .iloc[:, 0]
)

## one row per class: mean, SE and the mean +/- SE bounds for the error bars
gse75386stats = pd.DataFrame({
    'Gad1 (Mean)': gse75386means,
    'SE': gse75386ses,
    'ymin': gse75386means - gse75386ses,
    'ymax': gse75386means + gse75386ses,
    'class': gse75386means.index.values
})

## horizontal bars (coord_flip) with near-zero-width error bars (no caps)
ggbarse = (
    ggplot(gse75386stats, gg.aes(x='class', y='Gad1 (Mean)'))
    + gg.geom_bar(alpha=0.6, stat='identity')
    + gg.geom_errorbar(mapping=gg.aes(ymin='ymin', ymax='ymax'),
                       width=0.0001)
    + gg.coord_flip()
)
print(ggbarse)
# ggbarse.save('gse75386_gad1_barchart_stat.pdf', format='pdf',
#              height=1, width=6)

## mean bars +/- standard error using seaborn
plt.close()
# plt.figure(figsize=(6, 1))
sns.barplot(data=gse75386, y='class', x='Gad1',
            color='slategray', ci=68)
# plt.savefig('gse75386_gad1_barchart_stat.pdf',
#             format='pdf', bbox_inches='tight')

## -----------------------------------------------------------------
## GSE75386 boxplot + stripchart
## -----------------------------------------------------------------
lambda x: x.auroc_mean - (critical_val * x.auroc_std) / pd.np.sqrt(x.lf_num_len), 'aupr_upper': lambda x: x.aupr_mean + (critical_val * x.aupr_std) / pd.np.sqrt(x.lf_num_len), 'aupr_lower': lambda x: x.aupr_mean - (critical_val * x.aupr_std) / pd.np.sqrt(x.lf_num_len) })) dev_set_stats_df # In[9]: (p9.ggplot(dev_set_stats_df, p9.aes(x="factor(lf_num)", y="auroc_mean", color="model")) + p9.geom_point() + p9.geom_line(p9.aes(group="model")) + p9.geom_errorbar( p9.aes(ymin="auroc_lower", ymax="auroc_upper", group="model")) + p9.theme_seaborn() + p9.labs(title="CtD Tune Set AUROC", color="Model") + p9.scale_color_manual({ "disc_model": "blue", "gen_model": "orange" })) # In[10]: (p9.ggplot(dev_set_stats_df, p9.aes(x="factor(lf_num)", y="aupr_mean", color="model")) + p9.geom_point() + p9.geom_line(p9.aes(group="model")) + p9.geom_errorbar( p9.aes(ymin="aupr_lower", ymax="aupr_upper", group="model")) + p9.theme_seaborn() + p9.labs(title="CtD Tune Set AUPR", color="Model") + p9.scale_color_manual({ "disc_model": "blue",
max_depth=5, min_samples_split=2, max_features=5, n_jobs=n_threads) sklearn_forest.fit(X, y) current_timing = (time.time() - start_time) if n >= n_burn_in: timing_data['implementation'].append('scikit-learn 0.23.1') timing_data['threads'].append(n_threads) timing_data['timing'].append(current_timing) df = pd.DataFrame(data=timing_data) df = df.groupby(['implementation', 'threads']).agg(['mean', 'std']).reset_index() df.columns = ['Implementation', 'threads', 'mean', 'std'] print(df) df['error_min'] = df['mean'] - df['std'] df['error_max'] = df['mean'] + df['std'] p = (ggplot( df, aes(x='threads', y='mean', group='Implementation', color='Implementation')) + geom_line() + geom_point() + geom_errorbar(aes(ymin='error_min', ymax='error_max'), width=.2, position=position_dodge(0.05)) + labs(x="Number of threads", y="timing [s]")) p.save(filename='benchmark.png')
# Plot - uncorrected only lst_num_experiments = list(all_svcca.index[0:int(len(all_svcca.index) / 2)]) threshold = pd.DataFrame(pd.np.tile(permuted_score, (len(lst_num_experiments), 1)), index=lst_num_experiments, columns=['score']) g = ggplot(all_svcca[all_svcca['Group'] == 'uncorrected']) + geom_line(all_svcca[all_svcca['Group'] == 'uncorrected'], aes(x=lst_num_experiments, y='score', color='Group'), size=1.5) \ + geom_point(aes(x=lst_num_experiments, y='score'), color ='darkgrey', size=0.5) \ + geom_errorbar(all_svcca[all_svcca['Group'] == 'uncorrected'], aes(x=lst_num_experiments, ymin='ymin', ymax='ymax'), color='darkgrey') \ + geom_line(threshold, aes(x=lst_num_experiments, y='score'), linetype='dashed', size=1, color="darkgrey", show_legend=False) \ + labs(x = "Number of Partitions", y = "Similarity score (SVCCA)", title = "Similarity across varying numbers of partitions") \ + theme(plot_title=element_text(weight='bold'), plot_background=element_rect(fill="white"), panel_background=element_rect(fill="white"), panel_grid_major_x=element_line(color="lightgrey"), panel_grid_major_y=element_line(color="lightgrey"),
def gene_profile(genes: list, weights: pd.DataFrame, stddev: pd.DataFrame=None,
                 y_axis_label: str=None, highlight_n: int=None, highlight_anno: list=None,
                 figsize: tuple=None, ylim: tuple=None) -> p9.ggplot:
    """Plot the weight of every annotation for selected genes, one facet per gene.

    Within each facet the annotations are ordered by ascending weight and
    drawn as a lollipop (segment from 0 plus a point). A subset of
    annotations is highlighted and labelled.

    Parameters
    ----------
    genes : str or list of str
        Gene name(s) to include in the plot as facets. Matching against the
        index of ``weights`` is case-insensitive.
    weights : DataFrame
        ES weights in wide format: genes as index, annotations as columns.
    stddev : DataFrame, optional (default: None)
        Same layout as ``weights``. When given, error bars from
        ``weight - stddev`` to ``weight + stddev`` are added.
    y_axis_label : str, optional (default: "Expression Specificity")
        Label of the y-axis.
    highlight_n : int, optional
        Number of highest-weight annotations to highlight per gene. Only
        used when ``highlight_anno`` is None. If neither highlight option
        is given, the top 5 are highlighted.
    highlight_anno : list, optional
        Specific annotations to highlight; takes precedence over
        ``highlight_n``.
    figsize : (float, float), optional (default: None)
        Width and height of the plot. Defaults to a width of 15 and a
        height of 5 per matched gene.
    ylim : (float, float), optional (default: (-1, 1))
        Limits of the y-axis.

    Returns
    -------
    g : ggplot

    Raises
    ------
    AssertionError
        If none of ``genes`` is found in ``weights``, or ``stddev`` is given
        and none of ``genes`` is found in it.

    Todo:
        * find a better way for sorting cell-types along x-axis
        * report if gene in genes is not found in df
        * report if duplicate genes
        * replace hacky x-axis labelling
    """
    ### Reduce dataframe to genes of interest
    # A bare string would otherwise be iterated character by character,
    # contradicting the documented "single str" input.
    if isinstance(genes, str):
        genes = [genes]
    genes = [str.upper(s) for s in genes]

    idx = np.char.upper(weights.index.values.astype(str))
    mask = np.isin(idx, genes)
    df_tidy = weights[mask]
    n_genes = len(df_tidy)
    assert (n_genes >= 1), "No matching genes found in dataframe."

    stddev_tidy = None
    if stddev is not None:
        idx = np.char.upper(stddev.index.values.astype(str))
        mask = np.isin(idx, genes)
        stddev_tidy = stddev[mask]
        # Check the stddev selection itself (the original re-checked df_tidy,
        # so this assertion could never fire) and leave n_genes untouched:
        # it drives the figure height and the number of facet rows.
        assert (len(stddev_tidy) >= 1), "No matching genes found in stddev dataframe."

    # Constants, height and width of plot.
    if figsize is None:
        H = 5*n_genes
        W = 15
    else:
        W, H = figsize

    if ylim is None:
        ylim = (-1, 1)

    if y_axis_label is None:
        y_axis_label = "Expression Specificity"

    ### Convert to tidy / long format if necessary
    # Org:
    #        ABC  ACBG  ACMB
    # POMC   0.0  0.5   0.9
    # AGRP   0.2  0.0   0.0
    # LEPR   0.1  0.1   0.4

    # Tidy:
    #    gene_name  annotation  es_weight
    # 1  POMC       ABC         0.0
    # 2  AGRP       ABC         0.6
    # 3  LEPR       ABC         1.0

    # Ensure that the index name is None, so "index" is the column name
    # produced by reset_index(). rename_axis returns a new frame instead of
    # mutating index metadata in place.
    df_tidy = df_tidy.rename_axis(None)
    df_tidy = pd.melt(df_tidy.reset_index(), id_vars="index",
                      var_name="annotation", value_name="weight")

    if stddev_tidy is not None:
        stddev_tidy = stddev_tidy.rename_axis(None)
        stddev_tidy = pd.melt(stddev_tidy.reset_index(), id_vars="index",
                              var_name="annotation", value_name="stddev")
        df_tidy = df_tidy.merge(stddev_tidy, on=["index", "annotation"])

    ### Sort values by gene_name and es_weight and add order
    # Sorted:
    #    gene_name  annotation  es_weight  x_order
    # 1  AGRP       MOL2        0.0        1
    # 2  AGRP       ACNT1       0.1        2
    # 3  AGRP       MOL1        0.2        3
    df_tidy = df_tidy.sort_values(by=["index", "weight"])
    df_tidy["order"] = np.arange(len(df_tidy)) + 1

    ### Generate highlight
    # Default: highlight top 5
    if (highlight_n is None) and (highlight_anno is None):
        highlight_n = 5

    # After the default above at least one option is set, so no third
    # "nothing highlighted" branch is needed.
    if highlight_anno is not None:
        # highlight an explicit list of annotations
        df_tidy["highlight"] = df_tidy["annotation"].isin(highlight_anno)
    else:
        # highlight the highlight_n largest weights within each gene
        df_tidy["highlight"] = (
            df_tidy.groupby("index")["order"].rank("first", ascending=False)
            <= highlight_n
        )

    df_highlight = df_tidy[df_tidy["highlight"]]

    ### Plot
    # Linear function to compute x_axis text-size.
    # Depends on the number of rows per facet; the facet count is the number
    # of matched genes (n_genes), not the number of requested genes.
    SIZE_TEXT_X_AXIS = 10.161 - 0.023 * (len(df_tidy) / n_genes)

    # Limits of the order for each index gene / facet, e.g. [0, 266, 531]
    # These limits are necessary to only plot the labels
    order_lims = [0, *(df_tidy.groupby("index")["order"].max().values)]

    def find_nearest(array, value):
        # Return the element of `array` closest to `value`.
        array = np.asarray(array)
        idx = (np.abs(array - value)).argmin()
        return array[idx]

    def getbreaks(lims):
        # function defined for use in debugging
        # Snap the facet's axis limits to the nearest per-gene order limits
        # and place one break per integer position in between.
        l = find_nearest(order_lims, lims[0])
        r = find_nearest(order_lims, lims[1])
        breaks = np.arange(l, r)
        return breaks

    def getlbls(idx):
        # function defined for use in debugging
        # Positional lookup of the annotation names for the given breaks.
        lbls = df_tidy["annotation"].iloc[idx].values
        return lbls

    p = (
        ### data
        p9.ggplot(data=df_tidy,
                  mapping=p9.aes(x="order", y="weight", label="annotation"))

        ### theming
        + p9.theme_classic()
        + p9.theme(
            figure_size=(W, H),
            axis_ticks_major_x=p9.element_blank(),
            axis_text_x=p9.element_text(rotation=75, hjust=0,
                                        size=SIZE_TEXT_X_AXIS),
            # axis_text_y = p9.element_text(size=W),
            panel_spacing=1,
            strip_background=p9.element_blank()
        )
        + p9.ylim(ylim[0], ylim[1])
        + p9.labs(
            x="",            # e.g. "Cell-type"
            y=y_axis_label,  # e.g. "ES weight"
        )

        ### viz
        # all
        + p9.geom_segment(
            mapping=p9.aes(x="order", xend="order", y=0, yend="weight"),
            color="grey",
            alpha=0.3,
            show_legend=False
        )
        + p9.geom_point(
            mapping=p9.aes(size=2),
            color="grey",
            show_legend=False
        )
        # highlight
        + p9.geom_point(
            data=df_highlight,
            mapping=p9.aes(size=2),
            color="dodgerblue",
            show_legend=False
        )
        + p9.geom_segment(
            data=df_highlight,
            mapping=p9.aes(x="order", xend="order", y=0, yend="weight"),
            color="dodgerblue",
            alpha=0.3,
            show_legend=False
        )
        + p9.facet_wrap(
            "index",
            scales="free",
            nrow=n_genes
        )
        + p9.scale_x_continuous(
            # order_scale is continuous across all annotations
            # so the scale will look weird for each facet, e.g.
            # facet 1 may have order 1-7, and facet 2 has order 8-14.
            # therefore we must use a labeller function to get the
            # correct labels for each interval of order.
            breaks=lambda lims: getbreaks(lims),
            labels=lambda idx: getlbls(idx)
        )
    )

    if stddev_tidy is not None:
        p = (
            p
            + p9.geom_errorbar(
                mapping=p9.aes(ymin="weight-stddev", ymax="weight+stddev"),
                color="grey", width=0.1)
            + p9.geom_errorbar(
                data=df_highlight,
                mapping=p9.aes(ymin="weight-stddev", ymax="weight+stddev"),
                color="dodgerblue", width=0.1)
        )

    # add labels last for them to be on top
    p = p + p9.geom_label(
        data=df_highlight,
        color="dodgerblue",
        adjust_text={'expand_points': (2, 2)}
    )

    return p
.wls(formula, data=abortion_bf15, weights=abortion_bf15.totpop.values) .fit( cov_type='cluster', cov_kwds={'groups': abortion_bf15.fip.values}, method='pinv') ) reg.summary() abortion_plot = pd.DataFrame( { 'sd': reg.bse['C(repeal)[T.1.0]:C(year)[T.1986.0]':'C(repeal)[T.1.0]:C(year)[T.2000.0]'], 'mean': reg.params['C(repeal)[T.1.0]:C(year)[T.1986.0]':'C(repeal)[T.1.0]:C(year)[T.2000.0]'], 'year': np.arange(1986, 2001) }) abortion_plot['lb'] = abortion_plot['mean'] - abortion_plot['sd']*1.96 abortion_plot['ub'] = abortion_plot['mean'] + abortion_plot['sd']*1.96 ( p.ggplot(abortion_plot, p.aes(x = 'year', y = 'mean')) + p.geom_rect(p.aes(xmin=1985, xmax=1992, ymin=-np.inf, ymax=np.inf), fill="cyan", alpha = 0.01) + p.geom_point() + p.geom_text(p.aes(label = 'year'), ha='right') + p.geom_hline(yintercept = 0) + p.geom_errorbar(p.aes(ymin = 'lb', ymax = 'ub'), width = 0.2, position = p.position_dodge(0.05)) + p.labs(title= "Estimated effect of abortion legalization on gonorrhea") )
# Plot lst_num_partitions = list(summary_df.index) threshold = pd.DataFrame(permuted_uncorrected_scores, index=num_simulated_experiments, columns=['score']) panel_A = ggplot(summary_df) + geom_line(summary_df, aes(x=lst_num_partitions, y='SVCCA score', color='group'), size=1.5) \ + geom_point(aes(x=lst_num_partitions, y='SVCCA score'), color ='darkgrey', size=0.5) \ + geom_errorbar(summary_df, aes(x=lst_num_partitions, ymin='ymin', ymax='ymax'), color='darkgrey') \ + geom_line(threshold, aes(x=num_simulated_experiments, y='score'), linetype='dashed', size=1, color="darkgrey", show_legend=False) \ + labs(x = "Number of Experiments", y = "Similarity score (SVCCA)", title = "Similarity across varying numbers of experiments") \ + theme( plot_background=element_rect(fill="white"), panel_background=element_rect(fill="white"), panel_grid_major_x=element_line(color="lightgrey"), panel_grid_major_y=element_line(color="lightgrey"),
def barchart_make(roi, df, list_rois, config, ylimit, save_function, find_ylim_function):
    """Build the dodged bar chart (with error bars) for one ROI and save it.

    Parameters
    ----------
    roi : index into ``list_rois`` selecting the ROI to plot.
    df : DataFrame whose rows are filtered on ``df['index'] == list_rois[roi]``;
        the 'Mean' and 'Conf_Int_95' columns are referenced by the plot
        mappings.
    list_rois : sequence of ROI names.
    config : object providing the ``single_roi_fig_*`` settings,
        ``colorblind_friendly_plot_colours``, ``plot_dpi`` and
        ``use_same_axis_limits``.
    ylimit : upper y limit to apply; a falsy value (0) leaves the axis free.
    save_function : called as
        ``save_function(figure, roi_name, config, folder, 'barchart')``.
    find_ylim_function : called as
        ``find_ylim_function(roi_name, figure, 'yaxis')`` to obtain a y
        limit when shared limits are requested and ``ylimit`` is 0.

    Returns
    -------
    The value from ``find_ylim_function`` when it was consulted, otherwise 0.
    With ``use_same_axis_limits == 'Same limits'`` and ``ylimit == 0`` the
    function returns before anything is saved.
    """
    x_col = config.single_roi_fig_x_axis
    thisroi = list_rois[roi]

    # Rows for this ROI, sorted along the x variable; the fresh index removes
    # any leftover grouping.
    current_df = (
        df.loc[df['index'] == thisroi]
        .sort_values([x_col])
        .reset_index(drop=True)
    )
    # Categorical in order of appearance so the plot keeps the sorted order.
    current_df[x_col] = pd.Categorical(
        current_df[x_col], categories=current_df[x_col].unique())

    def make_dodge():
        # Bars and error bars each get their own, identically configured dodge.
        return pltn.position_dodge(preserve='single', width=0.8)

    mapping = pltn.aes(
        x=x_col,
        y='Mean',
        ymin="Mean-Conf_Int_95",
        ymax="Mean+Conf_Int_95",
        fill='factor({colour})'.format(colour=config.single_roi_fig_colour))

    bars = pltn.geom_col(position=make_dodge(), width=0.8, na_rm=True)
    error_bars = pltn.geom_errorbar(size=1, position=make_dodge())

    axis_labels = pltn.labs(
        x=config.single_roi_fig_label_x,
        y=config.single_roi_fig_label_y,
        fill=config.single_roi_fig_label_fill)

    theme_overrides = pltn.theme(
        panel_grid_major_x=pltn.element_line(alpha=0),
        axis_title_x=pltn.element_text(weight='bold', color='black', size=20),
        axis_title_y=pltn.element_text(weight='bold', color='black', size=20),
        axis_text_y=pltn.element_text(size=20, color='black'),
        legend_title=pltn.element_text(size=20, color='black'),
        legend_text=pltn.element_text(size=18, color='black'),
        subplots_adjust={'right': 0.85},
        legend_position=(0.9, 0.8),
        dpi=config.plot_dpi)

    # Tick labels are blanked; the x value is written as text at y=-.7.
    x_value_text = pltn.geom_text(
        pltn.aes(y=-.7, label=x_col), color='black', size=20, va='top')

    figure = (
        pltn.ggplot(current_df, mapping)
        + pltn.theme_538()
        + bars
        + error_bars
        + axis_labels
        + pltn.scale_x_discrete(labels=[])
        + theme_overrides
        + x_value_text
        + pltn.scale_fill_manual(
            values=config.colorblind_friendly_plot_colours)
    )

    if ylimit:
        # Set y limit of figure (used to make it the same for every barchart)
        figure += pltn.ylim(None, ylimit)
        thisroi += '_same_ylim'

    returned_ylim = 0
    if config.use_same_axis_limits in ('Same limits', 'Create both') and ylimit == 0:
        returned_ylim = find_ylim_function(thisroi, figure, 'yaxis')

    # Limit-finding pass for 'Same limits': report the limit, save nothing.
    if config.use_same_axis_limits == 'Same limits' and ylimit == 0:
        return returned_ylim

    folder = 'Same_yaxis' if ylimit != 0 else 'Different_yaxis'
    save_function(figure, thisroi, config, folder, 'barchart')

    return returned_ylim
(temp_df["time_to_published"].dt.total_seconds() / 60 / 60 / 24).max()) category_half_life # In[14]: g = (p9.ggplot( category_half_life.query("category!='none'").assign( half_life_time=lambda x: pd.to_timedelta(x.half_life_time, "D"), half_life_ci_l=lambda x: pd.to_timedelta(x.half_life_ci_l, "D"), half_life_ci_u=lambda x: pd.to_timedelta(x.half_life_ci_u, "D"), ), p9.aes(x="category", y="half_life_time", ymin="half_life_ci_l", ymax="half_life_ci_u"), ) + p9.geom_col(fill="#1f78b4") + p9.geom_errorbar() + p9.scale_x_discrete( limits=(category_half_life.query("category!='none'").sort_values( "half_life_time").category.tolist()[::-1]), ) + p9.scale_y_timedelta(labels=timedelta_format("d")) + p9.coord_flip() + p9.labs( x="Preprint Categories", y="Time Until 50% of Preprints are Published", title="Preprint Category Half-Life", ) + p9.theme_seaborn(context="paper", style="white", font_scale=1.2) + p9.theme(axis_ticks_minor_x=p9.element_blank(), )) g.save("output/preprint_category_halflife.svg", dpi=250) g.save("output/preprint_category_halflife.png", dpi=250) print(g) # Take home Results: # 1. The average amount of time for half of all preprints to be published is 348 days (~1 year)
g = ( p9.ggplot( category_half_life.query("category!='none'").assign( half_life_time=lambda x: pd.to_timedelta(x.half_life_time, "D"), half_life_ci_l=lambda x: pd.to_timedelta(x.half_life_ci_l, "D"), half_life_ci_u=lambda x: pd.to_timedelta(x.half_life_ci_u, "D"), ), p9.aes( x="category", y="half_life_time", ymin="half_life_ci_l", ymax="half_life_ci_u", ), ) + p9.geom_col(fill="#1f78b4") + p9.geom_errorbar() + p9.scale_x_discrete( limits=( category_half_life.query("category!='none'") .sort_values("half_life_time") .category.tolist()[::-1] ), ) + p9.scale_y_timedelta(labels=timedelta_format("d")) + p9.coord_flip() + p9.labs( x="Preprint Categories", y="Time Until 50% of Preprints are Published", title="Preprint Category Half-Life", ) + p9.theme_seaborn(context="paper", style="white", font_scale=1, font="Arial")
# Indicator columns: bm is set where wht == 0 and male == 1,
# bf where wht == 0 and male == 0.
abortion.loc[abortion.wht.eq(0) & abortion.male.eq(1), 'bm'] = 1
abortion['bf'] = 0
abortion.loc[abortion.wht.eq(0) & abortion.male.eq(0), 'bf'] = 1

# Keep the bf == 1 rows whose age is 15 or 25.
abortion_filt = abortion[abortion.bf.eq(1) & abortion.age.isin([15, 25])]

# Weighted least squares (weights: totpop) with standard errors clustered
# on fip.
reg = (
    smf
    .wls(
        """lnr ~ C(repeal)*C(year) + C(younger)*C(repeal) + C(younger)*C(year) + C(yr)*C(year) + C(fip)*t + acc + ir + pi + alcohol + crack + poverty + income + ur""",
        data=abortion_filt,
        weights=abortion_filt.totpop.values,
    )
    .fit(
        cov_type='cluster',
        cov_kwds={'groups': abortion_filt.fip.values},
        method='pinv',
    )
)

# Coefficients and standard errors of the yr x year interactions, 1986-2000.
abortion_plot = pd.DataFrame({
    'sd': reg.bse['C(yr)[T.1]:C(year)[T.1986.0]':'C(yr)[T.1]:C(year)[T.2000.0]'],
    'mean': reg.params['C(yr)[T.1]:C(year)[T.1986.0]':'C(yr)[T.1]:C(year)[T.2000.0]'],
    'year': np.arange(1986, 2001),
})
# Bounds at mean +/- 1.96 * sd.
abortion_plot['lb'] = abortion_plot['mean'] - abortion_plot['sd']*1.96
abortion_plot['ub'] = abortion_plot['mean'] + abortion_plot['sd']*1.96

(
    p.ggplot(abortion_plot, p.aes(x='year', y='mean'))
    # Shaded band over 1986-1991, spanning the full y range.
    + p.geom_rect(
        p.aes(xmin=1986, xmax=1991, ymin=-np.inf, ymax=np.inf),
        fill="cyan", alpha=0.01)
    + p.geom_point()
    + p.geom_text(p.aes(label='year'), ha='right')
    + p.geom_hline(yintercept=0)
    + p.geom_errorbar(
        p.aes(ymin='lb', ymax='ub'),
        width=0.2, position=p.position_dodge(0.05))
    + p.labs(title="Estimated effect of abortion legalization on gonorrhea")
)
def main(): """Run CLI.""" parser = argparse.ArgumentParser(description=""" Fits logistic regression to predict labels.' """) parser.add_argument( '-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__)) parser.add_argument( '-h5', '--h5_anndata', action='store', dest='h5', required=True, help='H5 AnnData file where clusters have been saved to cluster slot.') # parser.add_argument( # '-ncpu', '--number_cpu', # action='store', # dest='number_cpu', # default=50, # type=int, # help='Number of CPUs to use. Since we are testing the dask backend,\ # this corresponds to the number of CPUs available across all of\ # the worker jobs we spin out.\ # (default: %(default)s)' # ) parser.add_argument('-s', '--sparsity_l1', action='store', dest='sparsity_l1', default=0.0001, type=float, help='Smaller values specify stronger regularization.\ (default: %(default)s)') parser.add_argument('-nepoch', '--number_epoch', action='store', dest='number_epoch', default=25, type=int, help='Number of epochs.\ (default: %(default)s)') parser.add_argument( '-bs', '--batch_size', action='store', dest='batch_size', default=32, type=int, help='Batch size. Divides the dataset into n batches and updates the\ weights at the end of each one.\ (default: %(default)s)') parser.add_argument( '-tsc', '--train_size_cells', action='store', dest='train_size_cells', default=0, type=int, help='Number of cells to use for training set. If > 0 all\ remaining cells not randomly selected for training will be used\ for the test set. 
Overrides <train_size_fraction>.\ (default: %(default)s)') parser.add_argument('-tsf', '--train_size_fraction', action='store', dest='train_size_fraction', default=0.67, type=float, help='Fraction of the data to use for training set.\ (default: %(default)s)') parser.add_argument( '--dict_add', action='store', dest='dict_add', default='', type=str, help='Additional information to add to output model_report.\ Format: key::value:::key2::value2.\ Example: method::leiden:::resolution::3.0\ (default: %(default)s)') parser.add_argument('--grid_search', action='store_true', dest='grid_search', default=False, help='Run a grid search of hyperparameters.\ (default: %(default)s)') parser.add_argument('--memory_limit', action='store', dest='memory_limit', default=50, type=int, help='Memory limit in Gb.\ (default: %(default)s)') parser.add_argument( '-of', '--output_file', action='store', dest='of', default='', help='Basename of output files, assuming output in current working \ directory.\ (default: keras_model-<params>)') options = parser.parse_args() verbose = True # Set GPU memory limits gpus = tf.config.list_physical_devices('GPU') print(gpus) if gpus: # For TF v1 # config = tf.ConfigProto() # config.gpu_options.allow_growth = True # session = tf.Session(config=config) # For TF v2 try: # Method 1: # Currently, memory growth needs to be the same across GPUs for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) # Method 2: # Restrict TensorFlow to only allocate 1GB of memory on the first # GPU # tf.config.experimental.set_virtual_device_configuration( # gpus[0], # [tf.config.experimental.VirtualDeviceConfiguration( # memory_limit=options.memory_limit*1024 # )]) # logical_gpus = tf.config.list_logical_devices('GPU') # print( # len(gpus), # "Physical GPUs,", # len(logical_gpus), # "Logical GPUs" # ) except RuntimeError as e: # Virtual devices must be set before GPUs have been initialized print(e) else: raise Exception('ERROR: no GPUs detected.') # Get additional 
data we are going to append to the output model info dict_add = {} if options.dict_add != '': for item in options.dict_add.split(':::'): _tmp = item.split('::') if len(_tmp) != 2: raise Exception('ERROR: check dict_add.') else: dict_add[_tmp[0]] = _tmp[1] print(dict_add) # Load the AnnData file. # This file should already have clusters identified and saved to the # clusters slot. adata = sc.read_h5ad(filename=options.h5) # Set X to cp10k # adata.X = np.expm1(adata.layers['log1p_cp10k']) # Set X to ln(cp10k+1) # NOTE: Testing with 100k TI dataset, we were able to achieve higher # accuracy with log1p_cp10k - likely becuase better spread in distribution. adata.X = adata.layers['log1p_cp10k'] # Set X to raw counts # adata.X = adata.layers['counts'] # Add some info from adata to dict_add for key, value in adata.uns['neighbors']['params'].items(): dict_add['neighbors__{}'.format(key)] = value for key, value in adata.uns['cluster']['params'].items(): dict_add['cluster__{}'.format(key)] = value # If train_size_cells, override the fraction so that the total number of # cells in the training set will be equal to train_size_cells. 
train_size_fraction = options.train_size_fraction if options.train_size_cells > 0: if options.train_size_cells >= adata.n_obs: raise Exception('Invalid train_size_cells.') train_size_fraction = ( 1 - ((adata.n_obs - options.train_size_cells) / adata.n_obs)) if verbose: print( 'Set train_size_fraction to: {}.'.format(train_size_fraction)) if verbose: print('Number cells training ({}) and testing ({}).'.format( int(train_size_fraction * adata.n_obs), int((1 - train_size_fraction) * adata.n_obs))) # Set X and y X = adata.X y = adata.obs['cluster'].values # Set other variables sparsity_l1 = options.sparsity_l1 n_epochs = options.number_epoch batch_size = options.batch_size # Center and scale the data if sp.sparse.issparse(X): X = X.todense() X_std = X scaler = preprocessing.StandardScaler(with_mean=True, with_std=True) X_std = scaler.fit_transform(X) if verbose: print('center={} scale={}'.format(True, True)) # One hot encode y (the cell type classes) # encode class values as integers encoder = preprocessing.LabelEncoder() encoder.fit(y) print('Found {} clusters'.format(len(encoder.classes_))) # Define the model # NOTE: Defaults determined via grid search of 160k TI single cells def classification_model(optimizer='sgd', activation='softmax', loss='categorical_crossentropy', sparsity_l1__activity=0.0001, sparsity_l2__activity=0.0, sparsity_l1__kernel=0.0, sparsity_l2__kernel=0.0, sparsity_l1__bias=0.0, sparsity_l2__bias=0.0): # create model model = Sequential() # Use a “softmax” activation function in the output layer. This is to # ensure the output values are in the range of 0 and 1 and may be used # as predicted probabilities. # # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax # Softmax assigns decimal probabilities to each class in a multi-class # problem. Those decimal probabilities must add up to 1.0. This # additional constraint helps training converge more quickly than it # otherwise would. 
Softmax is implemented through a neural network # layer just before the output layer. The Softmax layer must have the # same number of nodes as the output layer. # Softmax assumes that each example is a member of exactly one class. # # Softmax should be used for multi-class prediction with single label # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/video-lecture # NOTE: input dimension = number of features your data has model.add( Dense( len(encoder.classes_), # output dim is number of classes use_bias=True, # intercept activation=activation, # softmax, sigmoid activity_regularizer=L1L2(l1=sparsity_l1__activity, l2=sparsity_l2__activity), kernel_regularizer=L1L2(l1=sparsity_l1__kernel, l2=sparsity_l2__kernel), bias_regularizer=L1L2(l1=sparsity_l1__bias, l2=sparsity_l2__bias), input_dim=X.shape[1])) # Example of adding additional layers # model.add(Dense(8, input_dim=4, activation='relu')) # model.add(Dense(3, activation='softmax')) # Metrics to check out over training epochs mets = [ # loss, keras.metrics.CategoricalAccuracy(name='categorical_accuracy'), # keras.metrics.TruePositives(name='tp'), # keras.metrics.FalsePositives(name='fp'), # keras.metrics.TrueNegatives(name='tn'), # keras.metrics.FalseNegatives(name='fn'), # keras.metrics.Precision(name='precision'), # keras.metrics.Recall(name='recall'), # keras.metrics.AUC(name='auc'), keras.metrics.BinaryAccuracy(name='accuracy') ] # Use Adam gradient descent optimization algorithm with a logarithmic # loss function, which is called “categorical_crossentropy” in Keras. # UPDATE: sgd works better emperically. model.compile( optimizer=optimizer, # adam, sgd loss=loss, metrics=mets) return model # Now, either call a grid search or specific model fit if options.grid_search: # Get the out file base. 
out_file_base = options.of if out_file_base == '': out_file_base = 'keras_model' out_file_base = '{}-grid_search'.format(out_file_base) # Call grid search of various parameters grid_result, df_grid_result = keras_grid( model_function=classification_model, encoder=encoder, X_std=X_std, y=y, n_epochs=n_epochs, batch_size=batch_size) # NOTE: This will fail because can't pickle KerasClassifier. This is # fine though becuase results are saved in tsv.gz format below. # Save the results # out_f = '{}-grid_result.gz'.format(out_file_base) # joblib.dump( # grid_result, # out_f, # compress=('gzip', 3) # ) # Load the model # lr = joblib.load( # 'test-lr_model.joblib.gz' # ) # print(lr) # Save the results of our search to tsv out_f = '{}-grid_result.tsv.gz'.format(out_file_base) df_grid_result.to_csv(out_f, sep='\t', index=False, quoting=csv.QUOTE_NONNUMERIC, na_rep='', compression=compression_opts) # Add a single columns that summarizes params param_columns = [ col for col in df_grid_result.columns if 'param__' in col ] df_grid_result['params'] = df_grid_result[param_columns].astype( str).apply(lambda x: '-'.join(x), axis=1) # Plot the distribution of accuracy across folds split_columns = [ col for col in df_grid_result.columns if 'split' in col ] split_columns = [col for col in split_columns if '_test_score' in col] df_plt = pd.melt(df_grid_result, id_vars=['params'], value_vars=split_columns) gplt = plt9.ggplot(df_plt, plt9.aes(x='params', y='value')) gplt = gplt + plt9.theme_bw() gplt = gplt + plt9.geom_boxplot(alpha=0.8) gplt = gplt + plt9.geom_jitter(alpha=0.75) gplt = gplt + plt9.scale_y_continuous( # trans='log10', # labels=comma_labels, minor_breaks=0 # limits=[0, 1] ) gplt = gplt + plt9.labs(x='Parameters', y='Score', title='') gplt = gplt + plt9.theme( axis_text_x=plt9.element_text(angle=-45, hjust=0)) gplt.save('{}-score.png'.format(out_file_base), dpi=300, width=10, height=4, limitsize=False) # Plot the mean time and std err for fitting results gplt = 
plt9.ggplot(df_grid_result, plt9.aes(x='params', y='mean_fit_time')) gplt = gplt + plt9.theme_bw() gplt = gplt + plt9.geom_point() gplt = gplt + plt9.geom_errorbar(plt9.aes( ymin='mean_fit_time-std_fit_time', ymax='mean_fit_time+std_fit_time'), width=0.2, position=plt9.position_dodge(0.05)) gplt = gplt + plt9.scale_y_continuous( # trans='log10', # labels=comma_labels, minor_breaks=0) gplt = gplt + plt9.labs(x='Parameters', y='Mean fit time', title='') gplt = gplt + plt9.theme( axis_text_x=plt9.element_text(angle=-45, hjust=0)) gplt.save('{}-fit_time.png'.format(out_file_base), dpi=300, width=10, height=4, limitsize=False) else: # Get the out file base. out_file_base = options.of if out_file_base == '': out_file_base = 'keras_model' # out_file_base = '{}-center={}-scale={}'.format( # out_file_base, # center, # scale # ) out_file_base = '{}-batch_size={}-epochs={}'.format( out_file_base, batch_size, n_epochs) out_file_base = '{}-sparsity_l1={}-train_size_fraction={}'.format( out_file_base, str(sparsity_l1).replace('.', 'pt'), str(train_size_fraction).replace('.', 'pt')) # Fit the specific model and save the results model, model_report, y_prob_df, history = fit_model_keras( model_function=classification_model, encoder=encoder, X_std=X_std, y=y, sparsity_l1=sparsity_l1, sparsity_l2=0.0, n_epochs=n_epochs, batch_size=batch_size, train_size_fraction=train_size_fraction) # Save the model, weights (coefficients), and bias (intercept) model.save('{}.h5'.format(out_file_base), overwrite=True, include_optimizer=True) # Save the model and weights (coefficients) seperately # open('{}.json'.format(out_file_base), 'w').write(model.to_json()) open('{}.yml'.format(out_file_base), 'w').write(model.to_yaml()) model.save_weights('{}-weights.h5'.format(out_file_base)) # Example read functions # model = model_from_yaml(open('my_model_architecture.yaml').read()) # model.load_weights('my_model_weights.h5') # Save the model report # Add column telling us if this is cluster or summary value 
is_cluster = [] for i in model_report.index: if i in encoder.classes_: is_cluster.append(True) else: is_cluster.append(False) model_report['is_cluster'] = is_cluster # Add in extra data model_report['sparsity_l1'] = sparsity_l1 if dict_add: for key, value in dict_add.items(): model_report[key] = value print(model_report) out_f = '{}-model_report.tsv.gz'.format(out_file_base) model_report.to_csv(out_f, sep='\t', index=True, index_label='cell_label', quoting=csv.QUOTE_NONNUMERIC, na_rep='', compression=compression_opts) if verbose: print('Completed: save {}.'.format(out_f)) # Save the test results - each row is a cell and the columns are the # prob of that cell belonging to a particular class. # Add in extra data y_prob_df['sparsity_l1'] = sparsity_l1 if dict_add: for key, value in dict_add.items(): y_prob_df[key] = value out_f = '{}-test_result.tsv.gz'.format(out_file_base) y_prob_df.to_csv( out_f, sep='\t', index=False, # NOTE: Not adding the label to test_result index. # index_label='cell_label', quoting=csv.QUOTE_NONNUMERIC, na_rep='', compression=compression_opts) if verbose: print('Completed: save {}.'.format(out_f)) # Make a matrix of weights per gene # Columns = genes tested and rows = cell type label weight, bias = model.layers[-1].get_weights() # weight, bias = model.get_layer("output").get_weights() df_weights = pd.DataFrame.from_records( weight, index=adata.var.index, # index is gene columns=encoder.classes_) # Save the weights dataframe. out_f = '{}-weights.tsv.gz'.format(out_file_base) df_weights.to_csv(out_f, sep='\t', index=True, index_label='ensembl_gene_id', quoting=csv.QUOTE_NONNUMERIC, na_rep='', compression=compression_opts) if verbose: print('Completed: save {}.'.format(out_f)) # Plot the number of features with non-zero coefficients in each # cluster. 
# Count, for every class (column of df_weights), how many features have a
# non-zero weight.
out_f = '{}-n_features.png'.format(out_file_base)
df_plt = pd.DataFrame({
    'classes': df_weights.columns,
    'features': (df_weights != 0).sum(axis=0)
})
df_plt = df_plt.set_index('classes')
# print(df_plt)

# Add in categories with no predictive model (e.g., because they were
# too few in training) as rows with a count of 0.
# DataFrame.append was removed in pandas 2.0, so the missing rows are built
# once and joined with a single pd.concat instead of one append per row.
missing = [
    i for i in adata.obs['cluster'].cat.categories
    if i not in df_plt.index
]
if missing:
    df_plt = pd.concat([
        df_plt,
        pd.DataFrame(
            0,
            index=pd.Index(missing, name=df_plt.index.name),
            columns=df_plt.columns
        )
    ])

# Bar plot of the counts; the figure widens with the number of clusters
# but is never narrower than 5.
fig = plt.figure(figsize=(max(0.5 * len(df_plt.index), 5), 4))
# plt.bar(lr.classes_, n_features)
plt.bar(df_plt.index, df_plt['features'])
plt.xlabel('Cluster')
plt.ylabel('Features with coefficient != 0')
plt.xticks(rotation=90)
# Write the count on top of each bar.
for i in df_plt.index:
    plt.annotate(
        str(df_plt.loc[i, 'features']),
        xy=(i, df_plt.loc[i, 'features'])
    )
fig.savefig(out_f, dpi=300, bbox_inches='tight')
plt.close(fig)

# Plot ROC of the test and truth.
out_f = '{}-roc.png'.format(out_file_base)
fig = plt.figure()
# NOTE: y_prob_df is modified in place from here on: the truth column is
# popped and every column without 'class__' in its name is deleted.
cell_label_true = y_prob_df.pop('cell_label_true')
# Drop columns that are not cell type labels. Iterate over a snapshot of
# the column labels because columns are deleted inside the loop.
for i in list(y_prob_df.columns):
    if 'class__' not in i:
        del y_prob_df[i]
plot_roc(y_prob_df.values, cell_label_true.values, y_prob_df.columns)
fig.savefig(out_f, dpi=300, bbox_inches='tight')
plt.close(fig)
if verbose:
    print('Completed: save {}.'.format(out_f))

# Plot metrics vs cluster size to see if smaller clusters have poorer
# metric measures.
# Keep only the report rows whose label is an encoder class, with missing
# values replaced by 0. A boolean mask is used instead of dropping labels
# one at a time, which raises KeyError when an index label is repeated.
df_plt = model_report.fillna(0)
df_plt = df_plt.loc[[i in encoder.classes_ for i in df_plt.index]]

for i in ['AUC', 'f1-score', 'average_precision_score', 'MCC']:
    out_f = '{}-cluster_size_{}.png'.format(out_file_base, i)
    fig = plt.figure()
    plt.scatter(df_plt['n_cells_full_dataset'], df_plt[i], alpha=0.5)
    plt.xlabel('Number of cells in cluster (full dataset)')
    plt.ylabel(i)
    # Fix the y axis to the limits used for each metric.
    if i in ['AUC', 'f1-score', 'average_precision_score']:
        plt.ylim(0, 1)
    elif i == 'MCC':
        plt.ylim(-1, 1)
    # Add annotation of the cluster
    for index, row in df_plt.iterrows():
        if row['n_cells_full_dataset'] == 0:
            print('ERROR: n_cells_full_dataset = 0 for {}.'.format(
                index))
        plt.annotate(
            index,  # this is the text
            (row['n_cells_full_dataset'], row[i]),  # point to label
            textcoords='offset points',  # how to position the text
            xytext=(0, 10),  # distance from text to points (x,y)
            ha='center'  # horiz alignment can be left, right, center
        )
    fig.savefig(out_f, dpi=300, bbox_inches='tight')
    # Save a second copy of the same figure with a log10 x axis.
    # The `basex` keyword was removed in Matplotlib 3.5; base 10 is the
    # default of the log scale, so no base argument is needed.
    plt.xscale('log')
    fig.savefig(
        '{}-cluster_size_{}_log10.png'.format(out_file_base, i),
        dpi=300,
        bbox_inches='tight'
    )
    plt.close(fig)
    if verbose:
        print('Completed: save {}.'.format(out_f))

# Plot history of metrics over epochs, one figure per recorded entry of
# history.history.
for dat_i in history.history.keys():
    fig = plt.figure()
    plt.plot(history.history[dat_i])
    plt.ylabel(dat_i)
    plt.xlabel('Epoch')
    fig.savefig(
        '{}-model_iter_{}.png'.format(out_file_base, dat_i),
        dpi=300,
        bbox_inches='tight'
    )
    plt.close(fig)
threshold = pd.DataFrame( pd.np.tile( permuted_score, (len(lst_num_experiments), 1)), index=lst_num_experiments, columns=['score']) panel_A = ggplot(all_svcca) + geom_line(all_svcca, aes(x=lst_num_experiments, y='score', color='Group'), size=1.5) \ + geom_point(aes(x=lst_num_experiments, y='score'), color ='darkgrey', size=0.5) \ + geom_errorbar(all_svcca, aes(x=lst_num_experiments, ymin='ymin', ymax='ymax'), color='darkgrey') \ + geom_line(threshold, aes(x=lst_num_experiments, y='score'), linetype='dashed', size=1.5, color="darkgrey", show_legend=False) \ + labs(x = "Number of Partitions", y = "Similarity score (SVCCA)", title = "Similarity across varying numbers of partitions") \ + theme(plot_title=element_text(weight='bold'), plot_background=element_rect(fill="white"), panel_background=element_rect(fill="white"), panel_grid_major_x=element_line(color="lightgrey"), panel_grid_major_y=element_line(color="lightgrey"),
# + geom_line(data = inc, mapping=aes(x="julian", y="uniqueID"), colour="black") # + geom_smooth(data=inc, mapping=aes(x="julian", y="uniqueID"), colour="black", method="mavg", se=False, method_args={"window": 4, "center": True, "min_periods": 1}) + scale_x_continuous(labels=label_x, limits=[xmin, xmax])).save("figs/ACI_BARW_incend.png", height=8, width=8, dpi=150) inc = barw_nest.groupby("julian", as_index=False).uniqueID.count().reset_index() (ggplot(data=inc, mapping=aes(x="julian", y="uniqueID")) + xlab("Day") + ylab("Number of nest initiation/hatch") + geom_smooth(method="mavg", se=False, method_args={"window": 4, "center": True, "min_periods": 1}) + annotate("rect", xmin=[inc_start, hatch_start], xmax=[inc_end, hatch_end], ymin=-math.inf, ymax=math.inf, alpha=0.1, fill=["red", "blue"]) + annotate("text", x=[inc_lbl_pos, hatch_lbl_pos], y=4.5, label=["incubation", "hatch"]) + scale_x_continuous(labels=label_x, limits=[xmin, xmax])).save("figs/Nest_BARW_incend.png", height=8, width=8, dpi=150) res3 = aci.loc[aci.site == "Barrow"] res3 = res3.groupby(["plot"], as_index=False).apply(check_dates, site_data) res3.reset_index() res3 = res3.groupby(["plot", "julian"], as_index=False).agg({"ACI": ["mean", "std"], "lat": "mean", "lon": "mean"}) res3.columns = pd.Index(join_tuple(i, "_") for i in res3.columns) res3 (ggplot(data=res3, mapping=aes(x='julian', y='ACI_mean', colour='plot')) + xlab("Day") + ylab("Mean daily ACI (standardized)") + facet_grid("plot~", scales="free") + geom_point() + geom_errorbar(aes(ymin="ACI_mean - ACI_std", ymax="ACI_mean + ACI_std")) + geom_smooth(method="mavg", se=False, method_args={"window": 4, "center": True, "min_periods": 1}) + scale_x_continuous(labels=label_x)) # .save("figs/ACI_BARW_plots2.png", height=12, width=8, dpi=150)
'auroc_upper': lambda x: x.auroc_mean + (critical_val * x.auroc_std)/pd.np.sqrt(x.lf_num_len), 'auroc_lower': lambda x: x.auroc_mean - (critical_val * x.auroc_std)/pd.np.sqrt(x.lf_num_len), 'aupr_upper': lambda x: x.aupr_mean + (critical_val * x.aupr_std)/pd.np.sqrt(x.lf_num_len), 'aupr_lower': lambda x: x.aupr_mean - (critical_val * x.aupr_std)/pd.np.sqrt(x.lf_num_len) }) ) dev_disc_df.head(2) # In[7]: g = ( p9.ggplot(dev_disc_df, p9.aes(x="factor(lf_num)", y="auroc_mean", linetype="model", color="relation")) + p9.geom_point() + p9.geom_errorbar(p9.aes(ymin="auroc_lower", ymax="auroc_upper")) + p9.geom_line(p9.aes(group="model")) + p9.scale_x_discrete(limits=[0, 1, 6, 11, 16, 'All']) + p9.scale_color_manual(values={ "DaG": mcolors.to_hex(color_map["DaG"]), 'CtD': mcolors.to_hex(color_map["CtD"]), "CbG": mcolors.to_hex(color_map["CbG"]), "GiG": mcolors.to_hex(color_map["GiG"]), }, guide=False) + p9.facet_wrap("relation") + p9.labs( title="Disc Model Performance (Tune Set)", ) + p9.xlab("Number of Label Functions") + p9.ylab("AUROC") + p9.theme_bw()