def zero_center_fucci(green_fucci, red_fucci, u_plate, well_plate, plate):
    '''Zero center and rescale FUCCI data in the log space.

    Each channel is log10-transformed, shifted by the log10 of the median raw
    intensity of the cell's plate, and then rescaled as
    ``(x - min(x)) / max(x)``.

    Args:
        green_fucci, red_fucci: per-cell intensities; indexed with boolean masks.
        u_plate: the distinct plate identifiers; ``str(p)`` is used as the key.
        well_plate: per-cell identifiers whose second "_"-separated field is
            looked up among the ``str(p)`` keys.
        plate: per-cell plate identifiers, compared against each ``u_plate`` entry.

    Returns:
        (log_green_fucci, log_red_fucci, log_green_fucci_zeroc,
        log_red_fucci_zeroc, log_green_fucci_zeroc_rescale,
        log_red_fucci_zeroc_rescale, fucci_data), where ``fucci_data`` is the
        column stack of the rescaled green and red values.

    Side effects:
        Saves the zero-centered arrays, the rescaled arrays and ``fucci_data``
        under ``output/pickles/`` through ``utils.np_save_overwriting``.
    '''
    # Boolean mask of the cells on each plate, keyed by the plate's string form.
    cells_on_plate = {str(p): plate == p for p in u_plate}
    # Plate key of every cell, taken from its well_plate identifier.
    plate_key_per_cell = [wp.split("_")[1] for wp in well_plate]

    def _center_and_rescale(intensity):
        '''Return (log10, zero-centered, rescaled) versions of one channel.'''
        log_intensity = np.log10(intensity)
        log_median_by_plate = {
            key: np.log10(np.median(intensity[mask]))
            for key, mask in cells_on_plate.items()
        }
        log_median_per_cell = np.array(
            [log_median_by_plate[key] for key in plate_key_per_cell])
        zero_centered = np.array(log_intensity) - log_median_per_cell
        rescaled = (zero_centered - np.min(zero_centered)) / np.max(zero_centered)
        return log_intensity, zero_centered, rescaled

    log_green, green_zeroc, green_rescale = _center_and_rescale(green_fucci)
    log_red, red_zeroc, red_rescale = _center_and_rescale(red_fucci)
    fucci_data = np.column_stack([green_rescale, red_rescale])

    # Pickle the results
    for path, values in (
            ("output/pickles/log_green_fucci_zeroc.npy", green_zeroc),
            ("output/pickles/log_red_fucci_zeroc.npy", red_zeroc),
            ("output/pickles/log_green_fucci_zeroc_rescale.npy", green_rescale),
            ("output/pickles/log_red_fucci_zeroc_rescale.npy", red_rescale),
            ("output/pickles/fucci_data.npy", fucci_data)):
        utils.np_save_overwriting(path, values)

    return (log_green, log_red, green_zeroc, red_zeroc,
            green_rescale, red_rescale, fucci_data)
def general_plots(u_plates):
    '''Make plots to illustrate the results of the scRNA-Seq analysis.

    Reads the counts with ``read_counts_and_phases``, writes a
    highest-expressed-genes plot, filters with ``qc_filtering``, then writes
    the highly-variable-genes plot, a UMAP coloured by "phase" and a
    distribution plot of ``adata.X``. The Louvain labels are saved to
    ``output/pickles/louvain.npy``.

    Args:
        u_plates: forwarded unchanged to ``read_counts_and_phases``.
    '''
    counts_kind = "Tpms"
    with_spikeins = False
    biotype = "protein_coding"
    adata, _phases = read_counts_and_phases(
        counts_kind, with_spikeins, biotype, u_plates)

    # QC plots before filtering
    sc.pl.highest_expr_genes(adata, n_top=20, show=False, save="AllCells.pdf")

    # Post filtering QC
    log_normalize = True
    remove_blob = False
    adata, _phases_filtered = qc_filtering(adata, log_normalize, remove_blob)
    # NOTE(review): the zero_center_fucci visible in this file takes five
    # positional arguments, but this call passes only adata -- confirm which
    # definition is in scope here.
    adata = zero_center_fucci(adata)
    sc.pp.highly_variable_genes(
        adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pl.highly_variable_genes(adata, show=False, save="AllCells.pdf")

    # Louvain clustering and UMAP plots
    # Idea: based on the expression of all genes, do the cell cycle phases
    #   cluster together?
    # Execution: scanpy methods: neighbourhood graph first, then clustering
    #   and the UMAP embedding
    # Output: UMAP plots
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    sc.tl.louvain(adata)
    utils.np_save_overwriting(
        "output/pickles/louvain.npy", adata.obs["louvain"])
    sc.tl.umap(adata)
    plt.rcParams['figure.figsize'] = (10, 10)
    sc.pl.umap(
        adata, color=["phase"], show=False, save="AllCellsSeqCenterPhase.pdf")

    # General display of RNA abundances in TPMs
    all_abundances = np.concatenate(adata.X)
    sbn.displot(all_abundances, color="tab:orange")
    plt.xlabel("TPM")
    plt.ylabel("Density")
    plt.savefig("figures/rna_abundance_density.pdf")
    plt.close()
def metacompartments(u_well_plates, compartment_dict, my_df_filtered_variation):
    '''Get the compartments for the unique wellplates.

    A well plate is flagged as cell / nucleus / cytoplasm when its entry in
    ``compartment_dict``, lower-cased, starts with "cell" / "nuc" / "cyto".
    Well plates missing from the dictionary get ``False`` for all three flags
    and their rows are dropped from the data frame.

    Args:
        u_well_plates: array of unique well-plate identifiers (it is indexed
            with a boolean mask, so it must support that).
        compartment_dict: mapping from well plate to compartment string.
        my_df_filtered_variation: data frame with a ``well_plate`` column.

    Returns:
        (wp_iscell, wp_isnuc, wp_iscyto, my_df_filtered_compartmentvariation):
        three boolean arrays aligned with ``u_well_plates`` and the data frame
        restricted to well plates that have compartment information.

    Side effects:
        Saves the three boolean arrays under ``output/pickles/`` and prints
        the number of samples and cells affected by the filter.
    '''
    def _compartment_startswith(prefix):
        '''Flag the well plates whose compartment string starts with prefix.'''
        # dtype=bool keeps the `~` below valid even when u_well_plates is empty.
        return np.asarray([
            wp in compartment_dict
            and compartment_dict[wp].lower().startswith(prefix)
            for wp in u_well_plates
        ], dtype=bool)

    wp_iscell = _compartment_startswith("cell")
    wp_isnuc = _compartment_startswith("nuc")
    wp_iscyto = _compartment_startswith("cyto")

    # Pickle the results
    utils.np_save_overwriting("output/pickles/wp_iscell.npy", wp_iscell)
    utils.np_save_overwriting("output/pickles/wp_isnuc.npy", wp_isnuc)
    utils.np_save_overwriting("output/pickles/wp_iscyto.npy", wp_iscyto)

    wp_nocompartmentinfo = ~wp_iscell & ~wp_isnuc & ~wp_iscyto
    print(
        f"{sum(wp_nocompartmentinfo)}: samples without compartment information; to be filtered since they're biologically defined as CCD and not included in the analysis"
    )
    print(
        f"{len(my_df_filtered_variation)}: number of cells before filtering for compartment information"
    )
    has_compartment_info = ~np.isin(
        my_df_filtered_variation.well_plate,
        u_well_plates[wp_nocompartmentinfo])
    my_df_filtered_compartmentvariation = my_df_filtered_variation[
        has_compartment_info]
    # This count is taken after the filter; the message previously said "before".
    print(
        f"{len(my_df_filtered_compartmentvariation)}: number of cells after filtering for compartment information"
    )
    return wp_iscell, wp_isnuc, wp_iscyto, my_df_filtered_compartmentvariation
def pseudotime_protein(fucci_data, ab_nuc, ab_cyto, ab_cell, mt_cell,
                       area_cell, area_nuc, well_plate, well_plate_imgnb,
                       well_plate_imgnb_objnb, log_red_fucci_zeroc_rescale,
                       log_green_fucci_zeroc_rescale, mockbulk_phases):
    '''Generate a polar coordinate model of cell cycle progression based on the FUCCI intensities.

    The two columns of ``fucci_data`` are passed to ``fucci_polar_coords``;
    the per-cell arrays are then reordered with ``pol_sort``, two diagnostic
    plots are produced, and the sorted arrays are saved under
    ``output/pickles/``.

    Returns:
        (pol_sort_well_plate, pol_sort_norm_rev, pol_sort_well_plate_imgnb,
        pol_sort_well_plate_imgnb_objnb, pol_sort_ab_nuc, pol_sort_ab_cyto,
        pol_sort_ab_cell, pol_sort_mt_cell, pol_sort_area_cell,
        pol_sort_area_nuc, pol_sort_centered_data1, pol_sort_centered_data0,
        pol_sort_mockbulk_phases) -- note that centered_data1 precedes
        centered_data0 in the returned tuple.
    '''
    # Generate model
    (pol_sort_norm_rev, centered_data, pol_sort_centered_data0,
     pol_sort_centered_data1, pol_sort_inds, _pol_sort_inds_reorder,
     more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt,
     cart_data_ur, R_2, start_phi) = fucci_polar_coords(
         fucci_data[:, 0], fucci_data[:, 1], "Protein")

    # Sort results by pseudotime
    (pol_sort_well_plate, pol_sort_well_plate_imgnb,
     pol_sort_well_plate_imgnb_objnb, pol_sort_ab_nuc, pol_sort_ab_cyto,
     pol_sort_ab_cell, pol_sort_mt_cell, pol_sort_area_cell,
     pol_sort_area_nuc, pol_sort_fred, pol_sort_fgreen,
     pol_sort_mockbulk_phases) = pol_sort(
         pol_sort_inds, more_than_start, less_than_start,
         well_plate, well_plate_imgnb, well_plate_imgnb_objnb,
         ab_nuc, ab_cyto, ab_cell, mt_cell, area_cell, area_nuc,
         log_red_fucci_zeroc_rescale, log_green_fucci_zeroc_rescale,
         mockbulk_phases)

    # Generate some plots
    fucci_hist2d(
        centered_data, cart_data_ur, start_pt, g1_end_pt, g1s_end_pt,
        "Protein", R_2, start_phi, 200, True,
        pol_sort_well_plate, pol_sort_ab_nuc,
        pol_sort_centered_data0, pol_sort_centered_data1)
    plot_fucci_intensities_on_pseudotime(
        pol_sort_norm_rev, pol_sort_centered_data1, pol_sort_centered_data0)

    # pickle the results
    to_save = (
        ("output/pickles/pol_sort_well_plate.npy", pol_sort_well_plate),
        ("output/pickles/pol_sort_well_plate_imgnb.npy", pol_sort_well_plate_imgnb),
        ("output/pickles/pol_sort_well_plate_imgnb_objnb.npy", pol_sort_well_plate_imgnb_objnb),
        ("output/pickles/pol_sort_norm_rev.npy", pol_sort_norm_rev),
        ("output/pickles/pol_sort_ab_nuc.npy", pol_sort_ab_nuc),
        ("output/pickles/pol_sort_ab_cyto.npy", pol_sort_ab_cyto),
        ("output/pickles/pol_sort_ab_cell.npy", pol_sort_ab_cell),
        ("output/pickles/pol_sort_mt_cell.npy", pol_sort_mt_cell),
        ("output/pickles/pol_sort_area_cell.npy", pol_sort_area_cell),
        ("output/pickles/pol_sort_area_nuc.npy", pol_sort_area_nuc),
        ("output/pickles/pol_sort_fred.npy", pol_sort_fred),
        ("output/pickles/pol_sort_fgreen.npy", pol_sort_fgreen),
        ("output/pickles/pol_sort_centered_data0.npy", pol_sort_centered_data0),
        ("output/pickles/pol_sort_centered_data1.npy", pol_sort_centered_data1),
    )
    for path, values in to_save:
        utils.np_save_overwriting(path, values)

    return (pol_sort_well_plate, pol_sort_norm_rev,
            pol_sort_well_plate_imgnb, pol_sort_well_plate_imgnb_objnb,
            pol_sort_ab_nuc, pol_sort_ab_cyto, pol_sort_ab_cell,
            pol_sort_mt_cell, pol_sort_area_cell, pol_sort_area_nuc,
            pol_sort_centered_data1, pol_sort_centered_data0,
            pol_sort_mockbulk_phases)
def gaussian_clustering_analysis(alpha_gauss, doGeneratePlots, g1, sph, g2,
                                 wp_ensg, well_plate, u_well_plates,
                                 ab_cell, ab_nuc, ab_cyto, mt_cell,
                                 wp_iscell, wp_isnuc, wp_iscyto):
    '''Analyze the results of Gaussian clustering of FUCCI data for each protein antibody staining.

    For every well plate a Kruskal-Wallis test compares the intensities of
    the cells flagged ``g1``, ``sph`` and ``g2``, separately for the cell,
    nucleus, cytoplasm and microtubule measurements. The p-values of the
    annotated compartment and of the microtubules are then corrected with
    ``utils.benji_hoch`` at ``alpha_gauss``.

    Args:
        alpha_gauss: significance level passed to ``utils.benji_hoch``.
        doGeneratePlots: when true, box plots are written for every well plate.
        g1, sph, g2: per-cell boolean arrays marking the phase of each cell.
        wp_ensg: per-well-plate gene identifiers (used for plot file names).
        well_plate: per-cell well-plate identifiers.
        u_well_plates: unique well-plate identifiers, in analysis order.
        ab_cell, ab_nuc, ab_cyto, mt_cell: per-cell intensity arrays.
        wp_iscell, wp_isnuc, wp_iscyto: per-well-plate compartment flags;
            cytoplasm is the fallback when neither cell nor nucleus is set.

    Returns:
        (wp_comp_kruskal_gaussccd_adj, wp_pass_kruskal_gaussccd_bh_comp,
        wp_mt_kruskal_gaussccd_adj, wp_pass_gaussccd_bh_mt)

    Side effects:
        Saves the corrected p-values, pass flags and phase labels under
        ``output/pickles/`` and prints a short summary.
    '''
    wp_cell_kruskal, wp_nuc_kruskal, wp_cyto_kruskal, wp_mt_kruskal = [], [], [], []
    curr_wp_phases = []
    # Object dtype on purpose: an array built from " " has a fixed width of one
    # character, so assigning longer phase labels into it would silently
    # truncate them. The labels are converted back to a string array on save.
    mockbulk_phases = np.full(len(ab_cell), " ", dtype=object)

    # "<gene>_<k>", where k is the number of earlier well plates with that gene.
    occurrences = {}
    prefixes = []
    for ensg in wp_ensg:
        earlier = occurrences.get(ensg, 0)
        prefixes.append(f"{ensg}_{earlier}")
        occurrences[ensg] = earlier + 1
    fileprefixes = np.array(prefixes)

    for iii, wp in enumerate(u_well_plates):
        curr_well_inds = well_plate == wp
        curr_wp_g1 = curr_well_inds & g1
        curr_wp_sph = curr_well_inds & sph
        curr_wp_g2 = curr_well_inds & g2

        curr_wp_phase_list = get_phase_strings(
            g1[curr_well_inds], sph[curr_well_inds], g2[curr_well_inds])
        mockbulk_phases[curr_well_inds] = np.asarray(curr_wp_phase_list)
        curr_wp_phases.append(curr_wp_phase_list)

        # Index 1 of the kruskal result is the p-value.
        for intensities, pvalues in (
                (ab_cell, wp_cell_kruskal),
                (ab_nuc, wp_nuc_kruskal),
                (ab_cyto, wp_cyto_kruskal),
                (mt_cell, wp_mt_kruskal)):
            pvalues.append(scipy.stats.kruskal(
                intensities[curr_wp_g1],
                intensities[curr_wp_sph],
                intensities[curr_wp_g2])[1])

        # Intensities of the compartment this antibody is annotated to.
        if wp_iscell[iii]:
            ab_comp = ab_cell
        elif wp_isnuc[iii]:
            ab_comp = ab_nuc
        else:
            ab_comp = ab_cyto
        max_val_for_norm = np.max(ab_comp[curr_well_inds])
        max_mt_for_norm = np.max(mt_cell[curr_well_inds])

        if doGeneratePlots:
            gaussian_boxplot_result(
                ab_comp[curr_wp_g1] / max_val_for_norm,
                ab_comp[curr_wp_sph] / max_val_for_norm,
                ab_comp[curr_wp_g2] / max_val_for_norm,
                "figures/GaussianBoxplots",
                fileprefixes[iii])
            gaussian_boxplot_result(
                mt_cell[curr_wp_g1] / max_mt_for_norm,
                mt_cell[curr_wp_sph] / max_mt_for_norm,
                mt_cell[curr_wp_g2] / max_mt_for_norm,
                "figures/GaussianBoxplots_mt",
                f"{fileprefixes[iii]}_mt")

    # multiple testing correction for protein of interest
    wp_comp_kruskal_gaussccd_p = utils.values_comp(
        wp_cell_kruskal, wp_nuc_kruskal, wp_cyto_kruskal,
        wp_iscell, wp_isnuc, wp_iscyto)
    wp_comp_kruskal_gaussccd_adj, wp_pass_kruskal_gaussccd_bh_comp = utils.benji_hoch(
        alpha_gauss, wp_comp_kruskal_gaussccd_p)
    utils.np_save_overwriting(
        "output/pickles/wp_comp_kruskal_gaussccd_adj.npy",
        wp_comp_kruskal_gaussccd_adj)
    utils.np_save_overwriting(
        "output/pickles/wp_pass_kruskal_gaussccd_bh_comp.npy",
        wp_pass_kruskal_gaussccd_bh_comp)

    # multiple testing correction for microtubules
    wp_mt_kruskal_gaussccd_adj, wp_pass_gaussccd_bh_mt = utils.benji_hoch(
        alpha_gauss, wp_mt_kruskal)
    utils.np_save_overwriting(
        "output/pickles/wp_mt_kruskal_gaussccd_adj.npy",
        wp_mt_kruskal_gaussccd_adj)
    utils.np_save_overwriting(
        "output/pickles/wp_pass_gaussccd_bh_mt.npy", wp_pass_gaussccd_bh_mt)

    # save the phase information
    utils.np_save_overwriting(
        "output/pickles/curr_wp_phases.npy",
        np.array(curr_wp_phases, dtype=object))
    # astype(str) yields a string array wide enough for the longest label, so
    # the saved file does not need pickling to be read back.
    utils.np_save_overwriting(
        "output/pickles/mockbulk_phases.npy", mockbulk_phases.astype(str))

    print(f"{len(wp_pass_kruskal_gaussccd_bh_comp)}: number of genes tested")
    print(f"{sum(wp_pass_kruskal_gaussccd_bh_comp)}: number of passing genes at {alpha_gauss*100}% FDR in compartment")

    return (wp_comp_kruskal_gaussccd_adj, wp_pass_kruskal_gaussccd_bh_comp,
            wp_mt_kruskal_gaussccd_adj, wp_pass_gaussccd_bh_mt)
def analyze_ccd_variation_by_mvavg_rna(adata, wp_ensg, ccd_comp, bioccd,
                                       adata_nonccdproteins,
                                       adata_regevccdgenes, biotype_to_use,
                                       use_isoforms=False,
                                       make_mvavg_plots_isoforms=False):
    '''Test each variable of adata for variation explained by a moving average over FUCCI time.

    The expression matrix is scaled per column by its maximum, the cells are
    ordered by ``adata.obs["fucci_time"]``, and the variance of the moving
    average is divided by the total variance. The same ratio is computed for
    random permutations of the cells; a variable is called a CCD transcript
    when its mean difference from the permuted ratios exceeds
    ``MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM``.

    Args:
        adata: annotated data object; ``X``, ``obs["fucci_time"]`` and
            ``var_names`` are read.
        wp_ensg: per-well-plate gene identifiers from the protein analysis.
        ccd_comp: boolean mask over ``wp_ensg`` selecting CCD proteins.
        bioccd: gene identifiers annotated as mitotic CCD.
        adata_nonccdproteins: boolean mask over ``adata.var_names``.
        adata_regevccdgenes: boolean mask over ``adata.var_names``.
        biotype_to_use: used only in the name of the output CSV.
        use_isoforms: selects the "Isoforms" output names and
            ``PERMUTATIONS_ISOFORMS`` permutations.
        make_mvavg_plots_isoforms: when ``use_isoforms`` is set, the per-gene
            plots and folder copies are made only if this is true as well.

    Returns:
        (percent_ccd_variance, total_gini, mean_diff_from_rng, pass_meandiff,
        eq_percvar_adj, fucci_time_inds, norm_exp_sort, moving_averages,
        mvavg_xvals, perms, ccdtranscript, ccdprotein, mvpercs)

    Side effects:
        Reads ``input/RNAData/IdsToNames.csv.gz``; writes pickles and CSV
        files under ``output/``, copies per-gene PDFs between figure folders,
        and appends to ``output/figuresofmerit.txt``.
    '''
    suffix = "Isoforms" if use_isoforms else ""

    expression_data = adata.X  # log normalized
    # Divide every column by its own maximum.
    normalized_exp_data = (expression_data.T / np.max(expression_data, axis=0)[:, None]).T
    fucci_time_inds = np.argsort(adata.obs["fucci_time"])
    norm_exp_sort = np.take(normalized_exp_data, fucci_time_inds, axis=0)
    moving_averages = np.apply_along_axis(MovingAverages.mvavg, 0, norm_exp_sort, WINDOW)
    mvavg_xvals = MovingAverages.mvavg(adata.obs["fucci_time"][fucci_time_inds], WINDOW)
    cell_cycle_variance = np.var(moving_averages, 0)
    total_variance = np.var(norm_exp_sort, 0)
    total_gini = np.apply_along_axis(utils.gini, 0, norm_exp_sort)
    percent_ccd_variance = cell_cycle_variance / total_variance

    # randomize and calculate the mean difference in percent variances from random
    # NOTE: the permutations are always drawn (they are returned), even when the
    # permuted ratios are loaded from an existing pickle; in that case the
    # returned perms are not the ones the cached values were computed from.
    perms = np.asarray([
        np.random.permutation(len(adata.obs))
        for _ in np.arange(PERMUTATIONS if not use_isoforms else PERMUTATIONS_ISOFORMS)
    ])
    picklePath = f"output/pickles/percent_ccd_variance_rng{suffix}.npy"
    meandiffPath = f"output/pickles/mean_diff_from_rng{suffix}.npy"
    if not os.path.exists(picklePath):
        percent_ccd_variance_rng = []
        for iii, perm in enumerate(perms):
            if iii % 50 == 0:
                print(f"permutation {iii}")
            norm_exp_sort_perm = np.take(normalized_exp_data, perm, axis=0)
            moving_averages_perm = np.apply_along_axis(
                MovingAverages.mvavg, 0, norm_exp_sort_perm, WINDOW)
            percent_ccd_variance_rng.append(
                np.var(moving_averages_perm, axis=0)
                / np.var(norm_exp_sort_perm, axis=0))
        utils.np_save_overwriting(picklePath, percent_ccd_variance_rng)
    else:
        percent_ccd_variance_rng = np.load(picklePath, allow_pickle=True)
    percent_ccd_variance_rng = np.asarray(percent_ccd_variance_rng)
    diff_from_rng = (percent_ccd_variance - percent_ccd_variance_rng).T
    mean_diff_from_rng = np.mean(diff_from_rng, 1)
    utils.np_save_overwriting(meandiffPath, mean_diff_from_rng)

    # Statistical testing based on randomization analysis
    alpha_ccd = 0.01
    pass_meandiff = mean_diff_from_rng > MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM
    # Positional arguments after the data are wilcoxon's
    # y, zero_method, correction and alternative; row 1 is the p-value.
    ccd_var_comp_rng_wilcoxp = np.apply_along_axis(
        scipy.stats.wilcoxon, 1, diff_from_rng,
        None, "wilcox", False, "greater").T[1].T
    eq_percvar_adj, _pass_eq_percvar_adj = utils.bonf(alpha_ccd, ccd_var_comp_rng_wilcoxp)

    is_pseudotime_ccd = np.isin(adata.var_names, wp_ensg[ccd_comp])
    is_mitotic_ccd = np.isin(adata.var_names, bioccd)
    ccdprotein = np.isin(adata.var_names, np.concatenate((wp_ensg[ccd_comp], bioccd)))

    gene_info = pd.read_csv(
        "input/RNAData/IdsToNames.csv.gz", index_col=False, header=None,
        names=["gene_id", "name", "biotype", "description"])
    gene_id_name = dict(zip(gene_info["gene_id"], gene_info["name"]))

    # The width is set explicitly: an array built from "No " alone would be
    # three characters wide and silently truncate the longer labels below.
    ccdstring = np.full(len(ccdprotein), "No ", dtype=f"<U{len('Pseudotime&Mitotic')}")
    ccdstring[is_pseudotime_ccd] = "Pseudotime"
    ccdstring[is_mitotic_ccd] = "Mitotic"
    ccdstring[is_pseudotime_ccd & is_mitotic_ccd] = "Pseudotime&Mitotic"

    percent_variance_tests = pd.DataFrame({
        "gene": adata.var_names,
        "name": [gene_id_name.get(x, "") for x in adata.var_names],
        "ccd_transcript": pass_meandiff,
        "regev_ccd": adata_regevccdgenes,
        "ccd_protein": ccdstring,
        "nonccd_protein": adata_nonccdproteins,
        "mean_diff_from_rng": mean_diff_from_rng,
        "-log10 CCD FDR": -np.log10(eq_percvar_adj)})
    percent_variance_tests.to_csv(
        f"output/transcript_regulation{biotype_to_use}{suffix}.csv", index=False)

    # And keep track of the ccd genes with and without transcript regulation
    ccdtranscript = pass_meandiff
    ccdprotein_transcript_regulated = ccdprotein & pass_meandiff
    ccdprotein_nontranscript_regulated = ccdprotein & ~pass_meandiff
    var_names = np.array(adata.var_names)
    ccdtranscript_names = var_names[ccdtranscript]
    proteinccd_transcript_regulated_names = var_names[ccdprotein_transcript_regulated]
    proteinccd_nontranscript_regulated_names = var_names[ccdprotein_nontranscript_regulated]
    # ccdprotein: pseudotime/mitotic ccd, might not have all the proteins,
    # since this only has proteins not filtered in RNA-Seq analysis
    utils.np_save_overwriting(f"output/pickles/ccdprotein{suffix}.npy", ccdprotein)
    utils.np_save_overwriting(f"output/pickles/ccdtranscript{suffix}.npy", ccdtranscript)
    utils.np_save_overwriting(
        f"output/pickles/ccdprotein_transcript_regulated{suffix}.npy",
        ccdprotein_transcript_regulated)
    utils.np_save_overwriting(
        f"output/pickles/ccdprotein_nontranscript_regulated{suffix}.npy",
        ccdprotein_nontranscript_regulated)
    pd.DataFrame({"gene": ccdtranscript_names}).to_csv(
        f"output/all_ccdtranscript_names{suffix}.csv")
    pd.DataFrame({"gene": proteinccd_transcript_regulated_names}).to_csv(
        f"output/proteinccd_transcript_regulated_names{suffix}.csv")
    pd.DataFrame({"gene": proteinccd_nontranscript_regulated_names}).to_csv(
        f"output/proteinccd_nontranscript_regulated_names{suffix}.csv")
    pd.DataFrame({"gene": adata.var_names}).to_csv(
        f"output/gene_names{suffix}.csv")

    # Per-gene plots and the folders they are sorted into
    make_plots = not use_isoforms or make_mvavg_plots_isoforms
    mvpercs = mvavg_plots_pergene(
        adata, fucci_time_inds, norm_exp_sort, moving_averages,
        mvavg_xvals, use_isoforms) if make_plots else []
    if make_plots:
        root = "f:/CellCycle/" if use_isoforms else ""
        folder = f"{root}figures/RNAPseudotimes{suffix}"
        ccdtransccdprotfolder = f"{root}figures/RNA_CCDTranscriptCCDProtein{suffix}"
        ccdtransnonccdprotfolder = f"{root}figures/RNA_CCDTranscriptNonCCDProtein{suffix}"
        nonccdtransccdprotfolder = f"{root}figures/RNA_NonCCDTranscriptCCDProtein{suffix}"
        nonccdfolder = f"{root}figures/RNA_NonCCD"
        for destination in (ccdtransccdprotfolder, ccdtransnonccdprotfolder,
                            nonccdtransccdprotfolder, nonccdfolder):
            if not os.path.exists(destination):
                os.mkdir(destination)
        # Each group's plots are copied from `folder` into the group's folder.
        copy_plan = (
            # CCD transcript & not CCD protein
            (ccdtranscript & ~ccdprotein, ccdtransnonccdprotfolder),
            # CCD transcript & CCD Protein
            (ccdprotein_transcript_regulated, ccdtransccdprotfolder),
            # Not CCD transcript & CCD Protein
            (ccdprotein_nontranscript_regulated, nonccdtransccdprotfolder),
            # Non-CCD
            (~ccdtranscript & ~ccdprotein, nonccdfolder),
        )
        for selection, destination in copy_plan:
            for ensg in adata.var_names[selection]:
                shutil.copy(os.path.join(folder, ensg + '_mvavg.pdf'),
                            os.path.join(destination, ensg + '_mvavg.pdf'))

    # Figures of merit
    # The unit follows what was analyzed; the two labels were previously swapped.
    unit = 'transcript isoforms' if use_isoforms else 'genes'
    with open("output/figuresofmerit.txt", "a") as file:
        fom = "--- RNA pseudotime\n\n"
        fom += f"We identified {sum(ccdtranscript)} {unit} of {len(ccdtranscript)} protein-coding {unit} analyzed ({100 * sum(ccdtranscript) / len(ccdtranscript)}%) to have variance in expression levels correlated to cell cycle progression" + "\n\n"
        if not use_isoforms:
            fom += f"We can attribute only {100 * sum(ccdprotein_transcript_regulated) / sum(ccdprotein)}% of proteomic cell cycle regulation to transcriptomic cycling with single-cell RNA sequencing" + "\n\n"
            fom += f"This includes {100 * sum(np.isin(adata.var_names[mean_diff_from_rng > MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM], adata.var_names[adata_regevccdgenes])) / sum(adata_regevccdgenes)}% of known CCD transcripts. Of these, {sum(ccdprotein_transcript_regulated)} were also cell cycle dependent proteins ({100 * sum(ccdprotein_transcript_regulated) / sum(ccdprotein)}%). Of the {sum(ccdprotein)} CCD proteins, {sum(ccdprotein_nontranscript_regulated)} did not have CCD transcripts, including DUSP18 (Figure 2E). There were {sum(ccdtranscript & adata_nonccdproteins)} CCD transcripts that were Non-CCD as proteins." + "\n\n"
        fom += "\n\n"
        print(fom)
        file.write(fom)

    return (percent_ccd_variance, total_gini, mean_diff_from_rng,
            pass_meandiff, eq_percvar_adj, fucci_time_inds, norm_exp_sort,
            moving_averages, mvavg_xvals, perms, ccdtranscript, ccdprotein,
            mvpercs)
def read_sample_info(df):
    '''Get the metadata for all the samples.

    Per-cell columns are pulled out of ``df``, composite identifiers are
    built, and the two staining-summary CSV files are read to map each well
    plate to its gene, antibody, previous result and compartment. Only the
    first summary file provides the previous result.

    Returns:
        (plate, u_plate, well_plate, well_plate_imgnb, u_well_plates,
        ab_objnum, area_cell, area_nuc, area_cyto, ensg_dict, ab_dict,
        result_dict, compartment_dict, ENSG, antibody, result, compartment)

    Side effects:
        Creates ``output/``, ``output/pickles/`` and ``figures/`` when they
        are missing and saves the per-cell arrays under ``output/pickles/``.
    '''
    plate = np.asarray(df.plate)
    u_plate = np.unique(plate)
    well_plate = np.asarray(df.well_plate)
    imgnb = np.asarray(df.ImageNumber)
    well_plate_imgnb = np.asarray(
        [f"{wp}_{img}" for wp, img in zip(well_plate, imgnb)])
    u_well_plates = np.unique(well_plate)
    ab_objnum = np.asarray(df.ObjectNumber)
    well_plate_imgnb_objnb = np.asarray(
        [f"{wp}_{img}_{obj}"
         for wp, img, obj in zip(well_plate, imgnb, ab_objnum)])
    area_cell = np.asarray(df.Area_cell)
    area_nuc = np.asarray(df.AreaShape_Area)
    area_cyto = np.asarray(df.Area_cyto)

    first_plates = pd.read_csv(
        "input/ProteinData/FucciStainingSummaryFirstPlates.csv")
    wells_first = list(first_plates["well_plate"])
    ensg_first = list(first_plates["ENSG"])
    ab_first = list(first_plates["Antibody"])
    results_first = list(first_plates["Results_final_update"])
    comp_first = list(first_plates["Compartment"])

    second_plates = pd.read_csv(
        "input/ProteinData/FucciStainingSummarySecondPlates.csv")
    wells_second = list(second_plates["well_plate"])
    ensg_second = list(second_plates["ENSG"])
    ab_second = list(second_plates["Antibody"])
    comp_second = list(second_plates["Compartment"])

    all_wells = wells_first + wells_second
    ensg_dict = dict(zip(all_wells, ensg_first + ensg_second))
    ab_dict = dict(zip(all_wells, ab_first + ab_second))
    # Previous results exist for the first set of plates only.
    result_dict = dict(zip(wells_first, results_first))
    compartment_dict = dict(zip(all_wells, comp_first + comp_second))

    # Per-cell annotations; cells on unknown well plates get an empty string.
    ENSG = np.asarray([ensg_dict.get(wp, "") for wp in well_plate])
    antibody = np.asarray([ab_dict.get(wp, "") for wp in well_plate])
    result = np.asarray([result_dict.get(wp, "") for wp in well_plate])
    compartment = np.asarray(
        [compartment_dict.get(wp, "") for wp in well_plate])

    # Pickle the results
    for directory in ("output/", "output/pickles/", "figures/"):
        if not os.path.exists(directory):
            os.mkdir(directory)
    for path, values in (
            ("output/pickles/plate.npy", plate),
            ("output/pickles/u_plate.npy", u_plate),
            ("output/pickles/u_well_plates.npy", u_well_plates),
            ("output/pickles/area_cell.npy", area_cell),
            ("output/pickles/area_nuc.npy", area_nuc),
            ("output/pickles/area_cyto.npy", area_cyto),
            ("output/pickles/well_plate.npy", well_plate),
            ("output/pickles/well_plate_imgnb.npy", well_plate_imgnb),
            ("output/pickles/well_plate_imgnb_objnb.npy", well_plate_imgnb_objnb)):
        utils.np_save_overwriting(path, values)

    return (plate, u_plate, well_plate, well_plate_imgnb, u_well_plates,
            ab_objnum, area_cell, area_nuc, area_cyto, ensg_dict, ab_dict,
            result_dict, compartment_dict, ENSG, antibody, result,
            compartment)
def read_sample_data(df):
    '''Read antibody intensity data for each sample and save it to a file for later use.

    For each of the nucleus, cytoplasm, cell and microtubule measurements,
    ``INTENSITY_SWITCH`` picks one of three candidates by index: the mean
    intensity, the integrated intensity, or the integrated intensity divided
    by ``df.AreaShape_Area``.

    Returns:
        (ab_nuc, ab_cyto, ab_cell, mt_cell, green_fucci, red_fucci)

    Side effects:
        Saves all six arrays under ``output/pickles/``.
    '''
    # NOTE(review): every measurement is normalised by AreaShape_Area, also
    # the cytoplasm and whole-cell ones -- confirm that this is intended.
    def _select(mean_intensity, integrated_intensity):
        '''Build the three candidates and return the configured one.'''
        candidates = (
            mean_intensity,
            integrated_intensity,
            integrated_intensity / df.AreaShape_Area,
        )
        return np.asarray(candidates[INTENSITY_SWITCH])

    # Antibody data (mean intensity)
    ab_nuc = _select(df.Intensity_MeanIntensity_ResizedAb,
                     df.Intensity_IntegratedIntensity_ResizedAb)
    ab_cyto = _select(df.Mean_ab_Cyto, df.Integrated_ab_cyto)
    ab_cell = _select(df.Mean_ab_cell, df.Integrated_ab_cell)
    mt_cell = _select(df.Mean_mt_cell, df.Integrated_mt_cell)

    # Fucci data (mean intensity)
    green_fucci = np.asarray(df.Intensity_MeanIntensity_CorrResizedGreenFUCCI)
    red_fucci = np.asarray(df.Intensity_MeanIntensity_CorrResizedRedFUCCI)

    # Pickle the results
    for path, values in (
            ("output/pickles/ab_nuc.npy", ab_nuc),
            ("output/pickles/ab_cyto.npy", ab_cyto),
            ("output/pickles/ab_cell.npy", ab_cell),
            ("output/pickles/mt_cell.npy", mt_cell),
            ("output/pickles/green_fucci.npy", green_fucci),
            ("output/pickles/red_fucci.npy", red_fucci)):
        utils.np_save_overwriting(path, values)

    return ab_nuc, ab_cyto, ab_cell, mt_cell, green_fucci, red_fucci
def previous_results(u_well_plates, result_dict, ensg_dict, ab_dict):
    '''Process the results metadata into lists of previously annotated CCD proteins.

    A well plate counts as previously "ccd", "notccd" or "negative" when it
    is present in ``result_dict`` and its entry starts with that word.

    Returns:
        (wp_ensg, wp_ab, wp_prev_ccd, wp_prev_notccd, wp_prev_negative,
        prev_ccd_ensg, prev_notccd_ensg, prev_negative_ensg): the first five
        are aligned with ``u_well_plates``; the last three are the gene
        identifiers selected by the corresponding flags.

    Side effects:
        Saves all eight arrays under ``output/pickles/``.
    '''
    # Well plates without an entry get an empty string.
    wp_ensg = np.asarray([ensg_dict.get(wp, "") for wp in u_well_plates])
    wp_ab = np.asarray([ab_dict.get(wp, "") for wp in u_well_plates])

    def _previously(prefix):
        '''Flag the well plates whose previous result starts with prefix.'''
        flags = []
        for wp in u_well_plates:
            flags.append(
                wp in result_dict and result_dict[wp].startswith(prefix))
        return np.asarray(flags)

    wp_prev_ccd = _previously("ccd")
    wp_prev_notccd = _previously("notccd")
    wp_prev_negative = _previously("negative")

    prev_ccd_ensg = wp_ensg[wp_prev_ccd]
    prev_notccd_ensg = wp_ensg[wp_prev_notccd]
    prev_negative_ensg = wp_ensg[wp_prev_negative]

    # Pickle the results
    for path, values in (
            ("output/pickles/wp_ensg.npy", wp_ensg),
            ("output/pickles/wp_ab.npy", wp_ab),
            ("output/pickles/wp_prev_ccd.npy", wp_prev_ccd),
            ("output/pickles/wp_prev_notccd.npy", wp_prev_notccd),
            ("output/pickles/wp_prev_negative.npy", wp_prev_negative),
            ("output/pickles/prev_ccd_ensg.npy", prev_ccd_ensg),
            ("output/pickles/prev_notccd_ensg.npy", prev_notccd_ensg),
            ("output/pickles/prev_negative_ensg.npy", prev_negative_ensg)):
        utils.np_save_overwriting(path, values)

    return (wp_ensg, wp_ab, wp_prev_ccd, wp_prev_notccd, wp_prev_negative,
            prev_ccd_ensg, prev_notccd_ensg, prev_negative_ensg)
def calculate_variation(use_log, u_well_plates, wp_iscell, wp_isnuc, wp_iscyto,
                        pol_sort_well_plate, pol_sort_ab_cell, pol_sort_ab_nuc,
                        pol_sort_ab_cyto, pol_sort_mt_cell,
                        pol_sort_well_plate_imgnb):
    '''Calculate overall variation of protein staining intensity in single cells.

    For every well plate the variance, coefficient of variation, Gini index
    and mean of the cell, nucleus, cytoplasm and microtubule intensities are
    computed; variance, CV and Gini are also computed per image for the three
    protein measurements. The values of the annotated compartment are picked
    with ``utils.values_comp`` and several diagnostic plots are written.

    Args:
        use_log: when true, intensities are log10-transformed first.
        u_well_plates: unique well-plate identifiers, in analysis order.
        wp_iscell, wp_isnuc, wp_iscyto: per-well-plate compartment flags.
        pol_sort_well_plate: per-cell well-plate identifiers.
        pol_sort_ab_cell, pol_sort_ab_nuc, pol_sort_ab_cyto, pol_sort_mt_cell:
            per-cell intensity arrays.
        pol_sort_well_plate_imgnb: per-cell image identifiers.

    Returns:
        (mean_mean_comp, var_comp, gini_comp, cv_comp, var_cell, gini_cell,
        cv_cell, var_mt, gini_mt, cv_mt)

    Side effects:
        Writes figures under ``figures/``, prints progress and outlier
        images, and saves seven arrays under ``output/pickles/``.
    '''
    # Channel order used in every 4-element list below:
    # cell, nucleus, cytoplasm, microtubules.
    channels = (pol_sort_ab_cell, pol_sort_ab_nuc, pol_sort_ab_cyto,
                pol_sort_mt_cell)
    protein_channels = channels[:3]

    def _intensities(mask, sources):
        '''Select the masked cells of each source, log10-transformed if use_log.'''
        selected = [src[mask] for src in sources]
        return [np.log10(vals) for vals in selected] if use_log else selected

    well_var = [[], [], [], []]   # intensity variances per antibody
    well_cv = [[], [], [], []]
    well_gini = [[], [], [], []]  # intensity ginis per antibody
    well_mean = [[], [], [], []]  # mean intensity per antibody
    wpi_img = []
    # Per-image statistics (cell, nucleus, cytoplasm): one list per well plate.
    img_var, img_gini, img_cv = [[], [], []], [[], [], []], [[], [], []]

    # The variance needs to be calculated separately for each well because
    # they all have different numbers of cells
    for well in u_well_plates:
        curr_well_inds = pol_sort_well_plate == well
        for k, values in enumerate(_intensities(curr_well_inds, channels)):
            well_var[k].append(np.var(values))
            well_cv[k].append(scipy.stats.variation(values))
            well_gini[k].append(utils.gini(values))
            well_mean[k].append(np.mean(values))

        curr_wpi_img = []
        curr_var = [[], [], []]
        curr_gini = [[], [], []]
        curr_cv = [[], [], []]
        for wpi in np.unique(pol_sort_well_plate_imgnb[curr_well_inds]):
            curr_wpis = pol_sort_well_plate_imgnb == wpi
            curr_wpi_img.append(wpi)
            for k, values in enumerate(
                    _intensities(curr_wpis, protein_channels)):
                curr_var[k].append(np.var(values))
                curr_gini[k].append(utils.gini(values))
                curr_cv[k].append(scipy.stats.variation(values))
        wpi_img.append(curr_wpi_img)
        for k in range(3):
            img_var[k].append(curr_var[k])
            img_gini[k].append(curr_gini[k])
            img_cv[k].append(curr_cv[k])

    mean_mean_cell, mean_mean_nuc, mean_mean_cyto, mean_mean_mt = well_mean
    var_cell_img, var_nuc_img, var_cyto_img = img_var
    gini_cell_img, gini_nuc_img, gini_cyto_img = img_gini
    cv_cell_img, cv_nuc_img, cv_cyto_img = img_cv

    print("Plotting average intensities of proteins and microtubules by batch.")
    plot_average_intensities_by_batch(
        u_well_plates, mean_mean_cell, mean_mean_nuc, mean_mean_cyto,
        mean_mean_mt, wp_iscell, wp_isnuc, wp_iscyto)

    print("Making general plots for variance, CV, and gini by compartment")
    var_cell, var_nuc, var_cyto, var_mt = (np.array(v) for v in well_var)
    gini_cell, gini_nuc, gini_cyto, gini_mt = (np.array(v) for v in well_gini)
    cv_cell, cv_nuc, cv_cyto, cv_mt = (np.array(v) for v in well_cv)
    intensity_kind = 'log' if use_log else 'natural'
    utils.general_boxplot(
        (var_cell, var_cyto, var_nuc, var_mt),
        ("var_cell", "var_cyto", "var_nuc", "var_mt"),
        "Metacompartment",
        f"Variance using {intensity_kind} intensity values",
        "", True, "figures/VarianceBoxplot.png")
    utils.general_boxplot(
        (cv_cell, cv_cyto, cv_nuc, cv_mt),
        ("cv_cell", "cv_cyto", "cv_nuc", "cv_mt"),
        "Metacompartment",
        f"Coeff. of Var. using {intensity_kind} intensity values",
        "", True, "figures/CVBoxplot.png")
    utils.general_boxplot(
        (gini_cell, gini_cyto, gini_nuc, gini_mt),
        ("gini_cell", "gini_cyto", "gini_nuc", "gini_mt"),
        "Metacompartment",
        f"Gini using {intensity_kind} intensity values",
        "", True, "figures/GiniBoxplot.png")

    print(
        "Making general plots for variance, CV, and gini in the compartment the protein localizes to"
    )
    flags = (wp_iscell, wp_isnuc, wp_iscyto)
    mean_mean_comp = utils.values_comp(
        mean_mean_cell, mean_mean_nuc, mean_mean_cyto, *flags)
    cv_comp = utils.values_comp(cv_cell, cv_nuc, cv_cyto, *flags)
    gini_comp = utils.values_comp(gini_cell, gini_nuc, gini_cyto, *flags)
    var_comp = utils.values_comp(var_cell, var_nuc, var_cyto, *flags)
    utils.general_scatter(var_comp, var_mt, "var_comp", "var_mt",
                          "figures/var_comp_mt.png")
    utils.general_scatter(cv_comp, cv_mt, "cv_comp", "cv_mt",
                          "figures/cv_comp_mt.png")
    utils.general_scatter(gini_comp, gini_mt, "gini_comp", "gini_mt",
                          "figures/gini_comp_mt.png")
    utils.general_scatter(
        var_comp, mean_mean_comp, "var_comp",
        f"{'log10' if use_log else 'natural'} intensity",
        "figures/VarianceVsIntensityComp.png")

    print("Comparing image to sample variance")
    var_comp_img = utils.values_comp(
        var_cell_img, var_nuc_img, var_cyto_img, *flags)
    gini_comp_img = utils.values_comp(
        gini_cell_img, gini_nuc_img, gini_cyto_img, *flags)
    cv_comp_img = utils.values_comp(
        cv_cell_img, cv_nuc_img, cv_cyto_img, *flags)

    def _repeat_per_image(sample_values, per_image_values):
        '''Repeat each sample-level value once per image of that sample.'''
        return np.concatenate(
            [[sample_values[i]] * len(vvv)
             for i, vvv in enumerate(per_image_values)])

    utils.general_scatter(
        _repeat_per_image(var_comp, var_comp_img),
        np.concatenate(var_comp_img),
        "variance within compartment", "variance for each image",
        "figures/VarianceByImage.png")
    utils.general_scatter(
        _repeat_per_image(gini_comp, gini_comp_img),
        np.concatenate(gini_comp_img),
        "gini within compartment", "gini for each image",
        "figures/GiniByImage.png")
    utils.general_scatter(
        _repeat_per_image(cv_comp, cv_comp_img),
        np.concatenate(cv_comp_img),
        "cv within compartment", "cv for each image",
        "figures/CVByImage.png")

    all_images = np.concatenate(wpi_img)
    print(all_images[np.argmax(np.concatenate(var_comp_img))]
          + ": the image with the max variance")

    # np.asarray makes the arithmetic below valid whether values_comp hands
    # back the per-image values as arrays or as plain lists.
    # NOTE: the two histograms are drawn and closed without being saved.
    plt.hist(np.concatenate(
        [np.asarray(vvv) / var_comp[i] for i, vvv in enumerate(var_comp_img)]))
    plt.close()
    high_var_img = all_images[np.concatenate(
        [np.asarray(vvv) > 4 * var_comp[i]
         for i, vvv in enumerate(var_comp_img)])]
    print(
        f"{high_var_img}: the images with greater than 4x the variance of the whole sample"
    )

    norm_cv_img = np.concatenate(
        [np.asarray(vvv) / cv_comp[i] for i, vvv in enumerate(cv_comp_img)])
    plt.hist(norm_cv_img)
    plt.close()
    cutoff = np.mean(norm_cv_img) + 3 * np.std(norm_cv_img)
    high_cv_img = all_images[norm_cv_img > cutoff]
    # The message now matches the cutoff used; it previously repeated the
    # "4x the variance" wording of the variance check above.
    print(
        f"{high_cv_img}: the images whose CV relative to the whole sample is more than 3 standard deviations above the mean"
    )

    # Pickle and return main results
    for path, values in (
            ("output/pickles/mean_mean_comp.npy", mean_mean_comp),
            ("output/pickles/cv_comp.npy", cv_comp),
            ("output/pickles/gini_comp.npy", gini_comp),
            ("output/pickles/var_comp.npy", var_comp),
            ("output/pickles/cv_cell.npy", cv_cell),
            ("output/pickles/gini_cell.npy", gini_cell),
            ("output/pickles/var_cell.npy", var_cell)):
        utils.np_save_overwriting(path, values)

    return (mean_mean_comp, var_comp, gini_comp, cv_comp, var_cell,
            gini_cell, cv_cell, var_mt, gini_mt, cv_mt)