def statistical_properties_table(self):
    """Bonferroni-correct the stored test p-values and export a summary table.

    Reads ``self.test_results``, a mapping from 2-tuple keys to 7-tuple values
    laid out as ``(valuename, valueA, valueB, valueA_count, valueB_count,
    testName, testPvalue)``.

    Side effects:
        * sets ``self.pvalues``, ``self.pvalues_bonf`` and ``self.pass_bonf``
        * writes ``output/PropertyStatistics.csv``
    """
    # Walk the mapping once; every column below is derived from these two lists.
    entries = list(self.test_results.items())
    keys = [key for key, _ in entries]
    records = [record for _, record in entries]

    self.pvalues = np.array([record[6] for record in records], dtype=float)
    n_tests = len(self.pvalues)
    self.pvalues_bonf = np.zeros(n_tests)
    self.pass_bonf = np.zeros(n_tests)

    # Correct the equal-variance tests together as one family.
    testnames = np.array([record[5] for record in records])
    is_levene = testnames == "EqualVarianceLevene"
    levene_adj, levene_pass = utils.bonf(0.05, self.pvalues[is_levene])
    self.pvalues_bonf[is_levene] = levene_adj
    self.pass_bonf[is_levene] = levene_pass

    # Correct the tests property by property.
    # NOTE(review): every test belongs to exactly one property, so this loop
    # also rewrites the entries filled in by the Levene step above -- confirm
    # that this is the intended final correction.
    # NOTE(review): the first key element is exported as "Property" and the
    # second as "Comparison"; an earlier comment described the key as
    # (comparisonName, propertyName) -- verify against the producer.
    properties = np.array([key[0] for key in keys])
    for prop in np.unique(properties):
        in_family = properties == prop
        family_adj, family_pass = utils.bonf(0.05, self.pvalues[in_family])
        self.pvalues_bonf[in_family] = family_adj
        self.pass_bonf[in_family] = family_pass

    summary = pd.DataFrame({
        "Property": properties,
        "Comparison": [key[1] for key in keys],
        "ValueName": [record[0] for record in records],
        "ValueA": [record[1] for record in records],
        "ValueB": [record[2] for record in records],
        "TestName": testnames,
        "PValue": self.pvalues,
        "PValue_Bonf": self.pvalues_bonf,
        "PValue_BonfAlpha0.05Pass": self.pass_bonf,
    })
    summary.to_csv("output/PropertyStatistics.csv", index=False)
def analyze_ccd_variation_by_phase_rna(adata, normalized_exp_data, biotype_to_use):
    """Test each gene for expression differences between the G1, S-ph and G2M groups.

    Cells are grouped by ``adata.obs["phase"]``; for every column of
    ``normalized_exp_data`` a Kruskal-Wallis test compares the three groups.
    The p-values are passed to ``utils.benji_hoch`` and ``utils.bonf`` (both
    called with 0.01 as first argument), and the combined table is written to
    ``output/phase_clustered_transcript_CCD_analysis_{biotype_to_use}.csv``.

    Returns:
        The ``pandas.DataFrame`` that was written, one row per entry of
        ``adata.var_names``.
    """
    phase_labels = np.array(adata.obs["phase"])

    def cells_in_phase(phase):
        # Rows of the expression matrix whose phase label matches.
        row_indices = np.nonzero(phase_labels == phase)[0]
        return np.take(normalized_exp_data, row_indices, axis=0)

    g1_cells = cells_in_phase("G1")
    s_cells = cells_in_phase("S-ph")
    g2m_cells = cells_in_phase("G2M")

    n_genes = len(g1_cells[0, :])
    kruskal_results = [
        scipy.stats.kruskal(g1_cells[:, col], s_cells[:, col], g2m_cells[:, col])
        for col in range(n_genes)
    ]
    raw_pvalues = [pvalue for (_statistic, pvalue) in kruskal_results]

    bh_adjusted, bh_reject = utils.benji_hoch(0.01, raw_pvalues)
    bonf_adjusted, bonf_reject = utils.bonf(0.01, raw_pvalues)

    bulk_phase_tests = pd.DataFrame({
        "gene": adata.var_names,
        "pvalue": raw_pvalues,
        "pvaladj_BH": bh_adjusted,
        "reject_BH": bh_reject,
        "pvaladj_B": bonf_adjusted,
        "reject_B": bonf_reject,
    })
    bulk_phase_tests.to_csv(
        f"output/phase_clustered_transcript_CCD_analysis_{biotype_to_use}.csv")
    return bulk_phase_tests
def analyze_ccd_variation_by_mvavg_rna(adata, wp_ensg, ccd_comp, bioccd,
                                       adata_nonccdproteins, adata_regevccdgenes,
                                       biotype_to_use, use_isoforms=False,
                                       make_mvavg_plots_isoforms=False):
    """Score each transcript by the share of its variance captured by a moving average.

    Cells are ordered by ``adata.obs["fucci_time"]``. For every column of the
    max-normalised expression matrix, the variance of the moving average
    (window ``WINDOW``) is divided by the total variance. The same ratio is
    computed for random cell orderings, and a transcript is flagged when its
    mean excess over the randomised ratios is greater than
    ``MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM``.

    Args:
        adata: annotated expression object; ``X``, ``obs["fucci_time"]`` and
            ``var_names`` are read.
        wp_ensg: array of gene ids, indexed with ``ccd_comp``.
        ccd_comp: index/mask into ``wp_ensg`` (labelled "Pseudotime" in the output).
        bioccd: gene ids labelled "Mitotic" in the output.
        adata_nonccdproteins: per-transcript mask, exported as ``nonccd_protein``.
        adata_regevccdgenes: per-transcript mask, exported as ``regev_ccd``.
        biotype_to_use: string embedded in the main output file name.
        use_isoforms: selects ``PERMUTATIONS_ISOFORMS`` instead of
            ``PERMUTATIONS``, the ``Isoforms`` file-name suffix and the
            ``f:/CellCycle/`` figure root.
        make_mvavg_plots_isoforms: when ``use_isoforms`` is set, controls
            whether per-transcript plots are produced and sorted into folders.

    Returns:
        Tuple ``(percent_ccd_variance, total_gini, mean_diff_from_rng,
        pass_meandiff, eq_percvar_adj, fucci_time_inds, norm_exp_sort,
        moving_averages, mvavg_xvals, perms, ccdtranscript, ccdprotein,
        mvpercs)``.

    Side effects:
        Writes several files under ``output/`` and ``output/pickles/``,
        appends to ``output/figuresofmerit.txt`` and, when plots are made,
        copies per-transcript PDFs into category folders.
    """
    suffix = "" if not use_isoforms else "Isoforms"

    expression_data = adata.X  # log normalized
    normalized_exp_data = (expression_data.T / np.max(expression_data, axis=0)[:, None]).T
    fucci_time_inds = np.argsort(adata.obs["fucci_time"])
    norm_exp_sort = np.take(normalized_exp_data, fucci_time_inds, axis=0)
    moving_averages = np.apply_along_axis(MovingAverages.mvavg, 0, norm_exp_sort, WINDOW)
    mvavg_xvals = MovingAverages.mvavg(adata.obs["fucci_time"][fucci_time_inds], WINDOW)
    cell_cycle_variance = np.var(moving_averages, 0)
    total_variance = np.var(norm_exp_sort, 0)
    total_gini = np.apply_along_axis(utils.gini, 0, norm_exp_sort)
    percent_ccd_variance = cell_cycle_variance / total_variance

    # Randomize the cell order and recompute the variance ratio per permutation.
    # The permutations are always drawn because they are part of the return value.
    perms = np.asarray([
        np.random.permutation(len(adata.obs))
        for _ in np.arange(PERMUTATIONS if not use_isoforms else PERMUTATIONS_ISOFORMS)
    ])
    picklePath = f"output/pickles/percent_ccd_variance_rng{suffix}.npy"
    meandiffPath = f"output/pickles/mean_diff_from_rng{suffix}.npy"
    if not os.path.exists(picklePath):
        percent_ccd_variance_rng = []
        for iii, perm in enumerate(perms):
            if iii % 50 == 0:
                print(f"permutation {iii}")
            norm_exp_sort_perm = np.take(normalized_exp_data, perm, axis=0)
            moving_averages_perm = np.apply_along_axis(
                MovingAverages.mvavg, 0, norm_exp_sort_perm, WINDOW)
            percent_ccd_variance_rng.append(
                np.var(moving_averages_perm, axis=0) / np.var(norm_exp_sort_perm, axis=0))
        utils.np_save_overwriting(picklePath, percent_ccd_variance_rng)
    else:
        # NOTE(review): a cached result is reused as-is; it was not computed
        # from the `perms` drawn above and is not checked against the current
        # transcript set -- delete the file to force a recomputation.
        percent_ccd_variance_rng = np.load(picklePath, allow_pickle=True)
    percent_ccd_variance_rng = np.asarray(percent_ccd_variance_rng)

    # Rows: transcripts, columns: permutations.
    diff_from_rng = (percent_ccd_variance - percent_ccd_variance_rng).T
    mean_diff_from_rng = np.mean(diff_from_rng, 1)
    utils.np_save_overwriting(meandiffPath, mean_diff_from_rng)

    # Statistical testing based on randomization analysis
    alpha_ccd = 0.01
    pass_meandiff = mean_diff_from_rng > MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM
    # One-sided Wilcoxon signed-rank test per transcript; index 1 of each
    # result is the p-value.
    ccd_var_comp_rng_wilcoxp = np.apply_along_axis(
        scipy.stats.wilcoxon, 1, diff_from_rng,
        y=None, zero_method="wilcox", correction=False, alternative="greater").T[1].T
    eq_percvar_adj, _ = utils.bonf(alpha_ccd, ccd_var_comp_rng_wilcoxp)

    is_pseudotime_ccd = np.isin(adata.var_names, wp_ensg[ccd_comp])
    is_mitotic_ccd = np.isin(adata.var_names, bioccd)
    ccdprotein = np.isin(adata.var_names, np.concatenate((wp_ensg[ccd_comp], bioccd)))

    gene_info = pd.read_csv("input/RNAData/IdsToNames.csv.gz", index_col=False, header=None,
                            names=["gene_id", "name", "biotype", "description"])
    gene_id_name = dict(zip(gene_info["gene_id"], gene_info["name"]))

    # Object dtype is required here: a fixed-width string array built from
    # "No " would silently truncate the longer labels to three characters.
    ccdstring = np.full(len(ccdprotein), "No ", dtype=object)
    ccdstring[is_pseudotime_ccd] = "Pseudotime"
    ccdstring[is_mitotic_ccd] = "Mitotic"
    ccdstring[is_pseudotime_ccd & is_mitotic_ccd] = "Pseudotime&Mitotic"

    percent_variance_tests = pd.DataFrame({
        "gene": adata.var_names,
        "name": [gene_id_name.get(x, "") for x in adata.var_names],
        "ccd_transcript": pass_meandiff,
        "regev_ccd": adata_regevccdgenes,
        "ccd_protein": ccdstring,
        "nonccd_protein": adata_nonccdproteins,
        "mean_diff_from_rng": mean_diff_from_rng,
        "-log10 CCD FDR": -np.log10(eq_percvar_adj),
    })
    percent_variance_tests.to_csv(
        f"output/transcript_regulation{biotype_to_use}{suffix}.csv", index=False)

    # And keep track of the ccd genes with and without transcript regulation
    ccdtranscript = pass_meandiff
    ccdprotein_transcript_regulated = ccdprotein & pass_meandiff
    ccdprotein_nontranscript_regulated = ccdprotein & ~pass_meandiff
    var_names = np.array(adata.var_names)
    ccdtranscript_names = var_names[ccdtranscript]
    proteinccd_transcript_regulated_names = var_names[ccdprotein_transcript_regulated]
    proteinccd_nontranscript_regulated_names = var_names[ccdprotein_nontranscript_regulated]
    # pseudotime/mitotic ccd, might not have all the proteins, since this only
    # has proteins not filtered in RNA-Seq analysis
    utils.np_save_overwriting(f"output/pickles/ccdprotein{suffix}.npy", ccdprotein)
    utils.np_save_overwriting(f"output/pickles/ccdtranscript{suffix}.npy", ccdtranscript)
    utils.np_save_overwriting(
        f"output/pickles/ccdprotein_transcript_regulated{suffix}.npy",
        ccdprotein_transcript_regulated)
    utils.np_save_overwriting(
        f"output/pickles/ccdprotein_nontranscript_regulated{suffix}.npy",
        ccdprotein_nontranscript_regulated)
    pd.DataFrame({"gene": ccdtranscript_names}).to_csv(
        f"output/all_ccdtranscript_names{suffix}.csv")
    pd.DataFrame({"gene": proteinccd_transcript_regulated_names}).to_csv(
        f"output/proteinccd_transcript_regulated_names{suffix}.csv")
    pd.DataFrame({"gene": proteinccd_nontranscript_regulated_names}).to_csv(
        f"output/proteinccd_nontranscript_regulated_names{suffix}.csv")
    pd.DataFrame({"gene": adata.var_names}).to_csv(f"output/gene_names{suffix}.csv")

    # Per-transcript plots are skipped for isoforms unless explicitly requested.
    make_plots = not use_isoforms or make_mvavg_plots_isoforms
    mvpercs = mvavg_plots_pergene(adata, fucci_time_inds, norm_exp_sort, moving_averages,
                                  mvavg_xvals, use_isoforms) if make_plots else []
    if make_plots:
        root = "f:/CellCycle/" if use_isoforms else ""
        folder = f"{root}figures/RNAPseudotimes{suffix}"
        ccdtransccdprotfolder = f"{root}figures/RNA_CCDTranscriptCCDProtein{suffix}"
        ccdtransnonccdprotfolder = f"{root}figures/RNA_CCDTranscriptNonCCDProtein{suffix}"
        nonccdtransccdprotfolder = f"{root}figures/RNA_NonCCDTranscriptCCDProtein{suffix}"
        nonccdfolder = f"{root}figures/RNA_NonCCD"

        # Destination folder -> which transcripts get their plot copied there.
        destinations = [
            (ccdtransnonccdprotfolder, ccdtranscript & ~ccdprotein),  # CCD transcript & not CCD protein
            (ccdtransccdprotfolder, ccdprotein_transcript_regulated),  # CCD transcript & CCD protein
            (nonccdtransccdprotfolder, ccdprotein_nontranscript_regulated),  # not CCD transcript & CCD protein
            (nonccdfolder, ~ccdtranscript & ~ccdprotein),  # Non-CCD
        ]
        for dest_folder, _ in destinations:
            os.makedirs(dest_folder, exist_ok=True)
        for dest_folder, selected in destinations:
            for ensg in adata.var_names[selected]:
                pdf_name = ensg + '_mvavg.pdf'
                shutil.copy(os.path.join(folder, pdf_name),
                            os.path.join(dest_folder, pdf_name))

    # Figures of merit
    unit = "transcript isoforms" if use_isoforms else "genes"
    fom = "--- RNA pseudotime\n\n"
    fom += f"We identified {sum(ccdtranscript)} {unit} of {len(ccdtranscript)} protein-coding {unit} analyzed ({100 * sum(ccdtranscript) / len(ccdtranscript)}%) to have variance in expression levels correlated to cell cycle progression" + "\n\n"
    if not use_isoforms:
        # NOTE(review): both protein-related sentences are treated as
        # gene-level only; confirm the second one was not meant to be emitted
        # for isoform runs as well.
        fom += f"We can attribute only {100 * sum(ccdprotein_transcript_regulated) / sum(ccdprotein)}% of proteomic cell cycle regulation to transcriptomic cycling with single-cell RNA sequencing" + "\n\n"
        fom += f"This includes {100 * sum(np.isin(adata.var_names[ccdtranscript], adata.var_names[adata_regevccdgenes])) / sum(adata_regevccdgenes)}% of known CCD transcripts. Of these, {sum(ccdprotein_transcript_regulated)} were also cell cycle dependent proteins ({100 * sum(ccdprotein_transcript_regulated) / sum(ccdprotein)}%). Of the {sum(ccdprotein)} CCD proteins, {sum(ccdprotein_nontranscript_regulated)} did not have CCD transcripts, including DUSP18 (Figure 2E). There were {sum(ccdtranscript & adata_nonccdproteins)} CCD transcripts that were Non-CCD as proteins." + "\n\n"
    fom += "\n\n"
    with open("output/figuresofmerit.txt", "a") as file:
        print(fom)
        file.write(fom)

    return (percent_ccd_variance, total_gini, mean_diff_from_rng, pass_meandiff,
            eq_percvar_adj, fucci_time_inds, norm_exp_sort, moving_averages,
            mvavg_xvals, perms, ccdtranscript, ccdprotein, mvpercs)
def kinase_families(self):
    """Investigate whether there are differences in upstream kinases from mapped proteins.

    Annotates the kinase-substrate table with a kinase family, keeps the human
    substrates whose kinase has a known family, and runs
    ``self.proportion_test`` for four protein name sets (``names_ccdprotein``,
    ``names_nonccdprotein``, ``names_ccdprotein_transcript_regulated`` and
    ``names_ccdprotein_nontranscript_regulated``). The Fisher p-values of all
    four groups are corrected together with ``utils.bonf`` and the combined
    table is written to ``output/upstreamKinaseResults.csv``.
    """
    print("Running kinase family analysis")
    kinaseFams = pd.read_csv("input/ProteinProperties/KinHubKinaseFamilies.csv")
    # Columns are addressed by position: column 7 holds the key that is looked
    # up with KIN_ACC_ID below, column 4 the family assigned to it.
    kinFamDict = dict(zip(kinaseFams.iloc[:, 7], kinaseFams.iloc[:, 4]))
    phosphositeplus = pd.read_csv(
        "input/ProteinProperties/KinaseSubstrateDatasetWithoutHeader.txt.gz", sep="\t")
    phosphositeplus["KINASE_FAMILY"] = [
        kinFamDict.get(x, "Other") for x in phosphositeplus["KIN_ACC_ID"]
    ]
    # drop Other, as it's not an actual grouping
    humanPhosphoInFamily = (phosphositeplus["SUB_ORGANISM"] == "human") & (
        phosphositeplus["KINASE_FAMILY"] != "Other")
    phosHuman = phosphositeplus[humanPhosphoInFamily]

    # Are there any overrepresented kinase families upstream phosphosites on
    # CCD or non-CCD proteins?
    groups = [
        ("ccd", self.names_ccdprotein),
        ("nonccd", self.names_nonccdprotein),
        ("transregCCD", self.names_ccdprotein_transcript_regulated),
        ("nontransregCCD", self.names_ccdprotein_nontranscript_regulated),
    ]
    # Each result is (labels, counts, values, counts_mappedminus,
    # values_mappedminus, fisher), in the order proportion_test returns them.
    results = [self.proportion_test(phosHuman, names) for _, names in groups]

    def combined(field):
        # Stack one field of the per-group results in group order.
        return np.concatenate([result[field] for result in results])

    # Every group label is repeated once per row of *its own* result set, so
    # the label column always lines up with the concatenated data columns.
    group_column = np.concatenate(
        [[group_name] * len(result[0]) for (group_name, _), result in zip(groups, results)])

    fisher_pvalues = np.concatenate([result[5][:, 1] for result in results])
    allfisher = np.array(fisher_pvalues, dtype=float)
    pvals_corrected_bonf, reject_bonf = utils.bonf(0.05, allfisher)

    pd.DataFrame({
        "Group": group_column,
        "KinaseFamily": combined(0),
        "PhosphositeCount_MappedProteome": combined(3),
        "FractionPhosphositesDownstreamOfKinase_MappedProteome": combined(4),
        "PhosphositeCount": combined(1),
        "FractionPhosphositesDownstreamOfKinase": combined(2),
        "FisherPValue": fisher_pvalues,
        "FisherPValue_BonfCorrected": pvals_corrected_bonf,
        "FisherPValue_BonfAlpha0.05Pass": reject_bonf,
    }).to_csv("output/upstreamKinaseResults.csv", index=False)