def statistical_properties_table(self):
        '''Bonferroni-correct the collected test p-values and export a summary table.

        Reads ``self.test_results``, a mapping from 2-tuple keys to result
        tuples. By position, each result tuple holds the value name (0),
        value A (1), value B (2), the test name (5) and the test p-value (6).

        Sets ``self.pvalues``, ``self.pvalues_bonf`` and ``self.pass_bonf``
        (one entry per test, in mapping order) and writes
        "output/PropertyStatistics.csv". Returns None.
        '''
        # Materialize the mapping once so every column is built from the same ordering.
        entries = list(self.test_results.items())
        keys = [key for key, _ in entries]
        records = [record for _, record in entries]

        self.pvalues = np.array([record[6] for record in records], dtype=float)
        n_tests = len(self.pvalues)
        self.pvalues_bonf = np.zeros(n_tests)
        self.pass_bonf = np.zeros(n_tests)  # float array: pass flags are stored as 0.0 / 1.0

        # Correct the equal-variance (Levene) tests as one family
        testnames = np.array([record[5] for record in records])
        is_levene = testnames == "EqualVarianceLevene"
        levene_adj, levene_pass = utils.bonf(0.05, self.pvalues[is_levene])
        self.pvalues_bonf[is_levene] = levene_adj
        self.pass_bonf[is_levene] = levene_pass

        # Correct the tests one property at a time
        # NOTE(review): every entry belongs to some property, so this loop appears to
        # overwrite the Levene-specific correction written just above -- confirm intent.
        # NOTE(review): key[0] is exported as "Property" and key[1] as "Comparison";
        # an earlier comment described the key as (comparisonName, propertyName) -- verify.
        properties = np.array([key[0] for key in keys])
        for prop in np.unique(properties):
            in_family = properties == prop
            family_adj, family_pass = utils.bonf(0.05, self.pvalues[in_family])
            self.pvalues_bonf[in_family] = family_adj
            self.pass_bonf[in_family] = family_pass

        summary = pd.DataFrame({
            "Property": properties,
            "Comparison": [key[1] for key in keys],
            "ValueName": [record[0] for record in records],
            "ValueA": [record[1] for record in records],
            "ValueB": [record[2] for record in records],
            "TestName": testnames,
            "PValue": self.pvalues,
            "PValue_Bonf": self.pvalues_bonf,
            "PValue_BonfAlpha0.05Pass": self.pass_bonf
        })
        summary.to_csv("output/PropertyStatistics.csv", index=False)
def analyze_ccd_variation_by_phase_rna(adata, normalized_exp_data,
                                       biotype_to_use):
    '''Test each gene for expression differences between the G1, S-ph and G2M phase groups.

    Rows of ``normalized_exp_data`` are split by the labels in
    ``adata.obs["phase"]``; a Kruskal-Wallis test is run per column, and the
    p-values are adjusted with ``utils.benji_hoch`` and ``utils.bonf`` at 0.01.

    The resulting table is written to
    "output/phase_clustered_transcript_CCD_analysis_<biotype_to_use>.csv"
    and returned as a DataFrame with one row per entry of ``adata.var_names``.
    '''
    phase_labels = np.array(adata.obs["phase"])

    def rows_in_phase(label):
        # Select the rows (cells) whose phase label matches
        row_idx = np.nonzero(phase_labels == label)[0]
        return np.take(normalized_exp_data, row_idx, axis=0)

    g1_exp = rows_in_phase("G1")
    s_exp = rows_in_phase("S-ph")
    g2_exp = rows_in_phase("G2M")

    # One Kruskal-Wallis test per column; only the p-value is kept
    pvals = []
    for gene_idx in range(len(g1_exp[0, :])):
        _, pvalue = scipy.stats.kruskal(g1_exp[:, gene_idx],
                                        s_exp[:, gene_idx],
                                        g2_exp[:, gene_idx])
        pvals.append(pvalue)

    bh_adjusted, bh_reject = utils.benji_hoch(0.01, pvals)
    bonf_adjusted, bonf_reject = utils.bonf(0.01, pvals)

    out_path = f"output/phase_clustered_transcript_CCD_analysis_{biotype_to_use}.csv"
    bulk_phase_tests = pd.DataFrame({
        "gene": adata.var_names,
        "pvalue": pvals,
        "pvaladj_BH": bh_adjusted,
        "reject_BH": bh_reject,
        "pvaladj_B": bonf_adjusted,
        "reject_B": bonf_reject
    })
    bulk_phase_tests.to_csv(out_path)
    return bulk_phase_tests
# Example #3
0
def analyze_ccd_variation_by_mvavg_rna(adata, wp_ensg, ccd_comp, bioccd, adata_nonccdproteins, adata_regevccdgenes, 
               biotype_to_use, use_isoforms=False, make_mvavg_plots_isoforms=False):
    '''Compare moving-average variance along pseudotime against randomly permuted cell orderings.

    Steps performed:
      1. Scale ``adata.X`` by its per-column maximum and sort rows by
         ``adata.obs["fucci_time"]``.
      2. Compute, per column, the variance of the moving average (window
         ``WINDOW``) divided by the total variance.
      3. Repeat that ratio for random row permutations. The permuted ratios are
         cached under "output/pickles/"; an existing cache file is loaded
         instead of being recomputed.
      4. Flag columns whose mean difference from the permuted ratios exceeds
         ``MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM``, and compute one-sided Wilcoxon
         p-values adjusted with ``utils.bonf`` at alpha 0.01.
      5. Write result tables and arrays under "output/", optionally sort the
         per-gene moving-average PDFs into category folders, and append a
         summary to "output/figuresofmerit.txt".

    Parameters
    ----------
    adata : object exposing ``X``, ``obs`` and ``var_names``
        (presumably an AnnData object -- confirm against the caller).
    wp_ensg, ccd_comp : ``wp_ensg[ccd_comp]`` gives the ids labelled "Pseudotime".
    bioccd : ids labelled "Mitotic".
    adata_nonccdproteins, adata_regevccdgenes : per-gene masks aligned with
        ``adata.var_names`` (assumed boolean arrays -- TODO confirm).
    biotype_to_use : string embedded in the regulation CSV file name.
    use_isoforms : selects the "Isoforms" file suffix, ``PERMUTATIONS_ISOFORMS``
        and the isoform figure location.
    make_mvavg_plots_isoforms : when ``use_isoforms`` is set, controls whether
        per-gene plots are made and sorted into folders.

    Returns
    -------
    tuple
        (percent_ccd_variance, total_gini, mean_diff_from_rng, pass_meandiff,
        eq_percvar_adj, fucci_time_inds, norm_exp_sort, moving_averages,
        mvavg_xvals, perms, ccdtranscript, ccdprotein, mvpercs)
    '''
    suffix = 'Isoforms' if use_isoforms else ''

    expression_data = adata.X # log normalized
    normalized_exp_data = (expression_data.T / np.max(expression_data, axis=0)[:,None]).T
    fucci_time_inds = np.argsort(adata.obs["fucci_time"])
    norm_exp_sort = np.take(normalized_exp_data, fucci_time_inds, axis=0)
    moving_averages = np.apply_along_axis(MovingAverages.mvavg, 0, norm_exp_sort, WINDOW)
    mvavg_xvals = MovingAverages.mvavg(adata.obs["fucci_time"][fucci_time_inds], WINDOW)
    cell_cycle_variance = np.var(moving_averages, 0)
    total_variance = np.var(norm_exp_sort, 0)
    total_gini = np.apply_along_axis(utils.gini, 0, norm_exp_sort)
    percent_ccd_variance = cell_cycle_variance / total_variance

    # randomize and calculate the mean difference in percent variances from random
    n_perms = PERMUTATIONS_ISOFORMS if use_isoforms else PERMUTATIONS
    perms = np.asarray([np.random.permutation(len(adata.obs)) for _ in np.arange(n_perms)])
    picklePath = f"output/pickles/percent_ccd_variance_rng{suffix}.npy"
    meandiffPath = f"output/pickles/mean_diff_from_rng{suffix}.npy"
    if not os.path.exists(picklePath):
        percent_ccd_variance_rng = []
        for iii, perm in enumerate(perms):
            if iii % 50 == 0: print(f"permutation {iii}")
            norm_exp_sort_perm = np.take(normalized_exp_data, perm, axis=0)
            moving_averages_perm = np.apply_along_axis(MovingAverages.mvavg, 0, norm_exp_sort_perm, WINDOW)
            percent_ccd_variance_rng.append(
                    np.var(moving_averages_perm, axis=0) / np.var(norm_exp_sort_perm, axis=0))
        utils.np_save_overwriting(picklePath, percent_ccd_variance_rng)
    else: 
        # NOTE(review): a cached file was produced from earlier permutations, not the
        # `perms` generated (and returned) by this call.
        percent_ccd_variance_rng = np.load(picklePath, allow_pickle=True)
    percent_ccd_variance_rng = np.asarray(percent_ccd_variance_rng)
    diff_from_rng = percent_ccd_variance - percent_ccd_variance_rng
    mean_diff_from_rng = np.mean(diff_from_rng.T, 1)
    utils.np_save_overwriting(meandiffPath, mean_diff_from_rng)

    # Statistical testing based on randomization analysis
    alpha_ccd = 0.01
    pass_meandiff = mean_diff_from_rng > MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM
    # Extra positional arguments are forwarded to wilcoxon as (y, zero_method, correction, alternative);
    # index 1 of each result is the p-value.
    ccd_var_comp_rng_wilcoxp = np.apply_along_axis(scipy.stats.wilcoxon, 1, diff_from_rng.T, None, "wilcox", False, "greater").T[1].T
    eq_percvar_adj, pass_eq_percvar_adj = utils.bonf(alpha_ccd, ccd_var_comp_rng_wilcoxp)

    ccdprotein = np.isin(adata.var_names, np.concatenate((wp_ensg[ccd_comp], bioccd)))
    gene_info = pd.read_csv("input/RNAData/IdsToNames.csv.gz", index_col=False, header=None, names=["gene_id", "name", "biotype", "description"])
    gene_id_name = dict(zip(gene_info["gene_id"], gene_info["name"]))
    is_pseudotime_ccd = np.isin(adata.var_names, wp_ensg[ccd_comp])
    is_mitotic_ccd = np.isin(adata.var_names, bioccd)
    # The padded "No" literal fixes the array's string width; presumably padded so the
    # longer labels assigned below are not truncated.
    ccdstring = np.array(["No                 "] * len(ccdprotein))
    ccdstring[is_pseudotime_ccd] = "Pseudotime"
    ccdstring[is_mitotic_ccd] = "Mitotic"
    ccdstring[is_pseudotime_ccd & is_mitotic_ccd] = "Pseudotime&Mitotic"
    percent_variance_tests = pd.DataFrame(
        {"gene" : adata.var_names, 
        "name" : [gene_id_name.get(x, "") for x in adata.var_names],
        "ccd_transcript" : pass_meandiff, 
        "regev_ccd" : adata_regevccdgenes,
        "ccd_protein" : ccdstring,
        "nonccd_protein" : adata_nonccdproteins,
        "mean_diff_from_rng":mean_diff_from_rng,
        "-log10 CCD FDR":-np.log10(eq_percvar_adj)})
    percent_variance_tests.to_csv(f"output/transcript_regulation{biotype_to_use}{suffix}.csv", index=False)

    # And keep track of the ccd genes with and without transcript regulation
    ccdtranscript = pass_meandiff
    ccdprotein_transcript_regulated = ccdprotein & pass_meandiff
    ccdprotein_nontranscript_regulated = ccdprotein & ~pass_meandiff
    ccdtranscript_names = np.array(adata.var_names)[ccdtranscript]
    proteinccd_transcript_regulated_names = np.array(adata.var_names)[ccdprotein_transcript_regulated]
    proteinccd_nontranscript_regulated_names = np.array(adata.var_names)[ccdprotein_nontranscript_regulated]
    utils.np_save_overwriting(f"output/pickles/ccdprotein{suffix}.npy", ccdprotein) # pseudotime/mitotic ccd, might not have all the proteins, since this only has proteins not filtered in RNA-Seq analysis
    utils.np_save_overwriting(f"output/pickles/ccdtranscript{suffix}.npy", ccdtranscript)
    utils.np_save_overwriting(f"output/pickles/ccdprotein_transcript_regulated{suffix}.npy", ccdprotein_transcript_regulated)
    utils.np_save_overwriting(f"output/pickles/ccdprotein_nontranscript_regulated{suffix}.npy", ccdprotein_nontranscript_regulated)
    pd.DataFrame({"gene" : ccdtranscript_names}).to_csv(f"output/all_ccdtranscript_names{suffix}.csv")
    pd.DataFrame({"gene" : proteinccd_transcript_regulated_names}).to_csv(f"output/proteinccd_transcript_regulated_names{suffix}.csv")
    pd.DataFrame({"gene" : proteinccd_nontranscript_regulated_names}).to_csv(f"output/proteinccd_nontranscript_regulated_names{suffix}.csv")
    pd.DataFrame({"gene" : adata.var_names}).to_csv(f"output/gene_names{suffix}.csv")
    
    # make folders
    make_plots = not use_isoforms or make_mvavg_plots_isoforms
    mvpercs = mvavg_plots_pergene(adata, fucci_time_inds, norm_exp_sort, moving_averages, mvavg_xvals, use_isoforms) if make_plots else []
    if make_plots:
        fig_root = f"{'f:/CellCycle/' if use_isoforms else ''}figures/"
        folder = f"{fig_root}RNAPseudotimes{suffix}"
        ccdtransccdprotfolder = f"{fig_root}RNA_CCDTranscriptCCDProtein{suffix}"
        ccdtransnonccdprotfolder = f"{fig_root}RNA_CCDTranscriptNonCCDProtein{suffix}"
        nonccdtransccdprotfolder = f"{fig_root}RNA_NonCCDTranscriptCCDProtein{suffix}"
        # NOTE(review): unlike the other folders, this one carries no "Isoforms" suffix -- confirm intent.
        nonccdfolder = f"{fig_root}RNA_NonCCD"
        for dest_folder in [ccdtransccdprotfolder,ccdtransnonccdprotfolder,nonccdtransccdprotfolder,nonccdfolder]:
            # makedirs also creates missing parents and tolerates an existing folder
            os.makedirs(dest_folder, exist_ok=True)
        # Each gene's moving-average PDF is copied into the folder for its category
        copy_plan = [
            (ccdtranscript & ~ccdprotein, ccdtransnonccdprotfolder),      # CCD transcript & not CCD protein
            (ccdprotein_transcript_regulated, ccdtransccdprotfolder),     # CCD transcript & CCD Protein
            (ccdprotein_nontranscript_regulated, nonccdtransccdprotfolder), # Not CCD transcript & CCD Protein
            (~ccdtranscript & ~ccdprotein, nonccdfolder),                 # Non-CCD
        ]
        for mask, dest_folder in copy_plan:
            for ensg in adata.var_names[mask]:
                pdf_name = ensg + '_mvavg.pdf'
                shutil.copy(os.path.join(folder, pdf_name), os.path.join(dest_folder, pdf_name))

    # Figures of merit
    # The unit label follows the analysis mode: gene-level unless isoforms were requested.
    unit = 'transcript isoforms' if use_isoforms else 'genes'
    with open("output/figuresofmerit.txt", "a") as fom_file:
        fom = "--- RNA pseudotime\n\n"
        fom += f"We identified {sum(ccdtranscript)} {unit} of {len(ccdtranscript)} protein-coding {unit} analyzed ({100 * sum(ccdtranscript) / len(ccdtranscript)}%) to have variance in expression levels correlated to cell cycle progression" + "\n\n"
        if not use_isoforms:
            fom += f"We can attribute only {100 * sum(ccdprotein_transcript_regulated) / sum(ccdprotein)}% of proteomic cell cycle regulation to transcriptomic cycling with single-cell RNA sequencing" + "\n\n"
            fom += f"This includes {100 * sum(np.isin(adata.var_names[mean_diff_from_rng > MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM], adata.var_names[adata_regevccdgenes])) / sum(adata_regevccdgenes)}% of known CCD transcripts. Of these, {sum(ccdprotein_transcript_regulated)} were also cell cycle dependent proteins ({100 * sum(ccdprotein_transcript_regulated) / sum(ccdprotein)}%). Of the {sum(ccdprotein)} CCD proteins, {sum(ccdprotein_nontranscript_regulated)} did not have CCD transcripts, including DUSP18 (Figure 2E). There were {sum(ccdtranscript & adata_nonccdproteins)} CCD transcripts that were Non-CCD as proteins." + "\n\n"
        fom += "\n\n"
        print(fom)
        fom_file.write(fom)
    
    return percent_ccd_variance, total_gini, mean_diff_from_rng, pass_meandiff, eq_percvar_adj, fucci_time_inds, norm_exp_sort, moving_averages, mvavg_xvals, perms, ccdtranscript, ccdprotein, mvpercs
    def kinase_families(self):
        '''Investigate whether there are differences in upstream kinases from mapped proteins

        Reads the kinase family table and the kinase-substrate dataset from
        "input/ProteinProperties/", labels each substrate row with the family of
        its kinase ("Other" when the kinase is not in the family table), and keeps
        rows whose SUB_ORGANISM is "human" and whose family is not "Other".

        ``self.proportion_test`` is then run on those rows for four protein name
        sets (``self.names_ccdprotein``, ``self.names_nonccdprotein``,
        ``self.names_ccdprotein_transcript_regulated`` and
        ``self.names_ccdprotein_nontranscript_regulated``). All Fisher p-values
        are Bonferroni-adjusted together via ``utils.bonf`` at 0.05 and the
        combined table is written to "output/upstreamKinaseResults.csv".
        Returns None.
        '''
        print("Running kinase family analysis")
        kinaseFams = pd.read_csv(
            "input/ProteinProperties/KinHubKinaseFamilies.csv")
        # Columns are addressed by position: column 7 is the key matched against
        # KIN_ACC_ID below, column 4 is the value stored as the kinase family.
        kinFamDict = dict(zip(kinaseFams.iloc[:, 7], kinaseFams.iloc[:, 4]))
        phosphositeplus = pd.read_csv(
            "input/ProteinProperties/KinaseSubstrateDatasetWithoutHeader.txt.gz",
            sep="\t")
        phosphositeplus["KINASE_FAMILY"] = [
            kinFamDict.get(x, "Other") for x in phosphositeplus["KIN_ACC_ID"]
        ]
        humanPhosphoInFamily = (phosphositeplus["SUB_ORGANISM"] == "human") & (
            phosphositeplus["KINASE_FAMILY"] != "Other"
        )  # drop Other, as it's not an actual grouping
        phosHuman = phosphositeplus[humanPhosphoInFamily]

        # Are there any overrepresented kinase families upstream phosphosites on CCD or non-CCD proteins?
        protein_groups = (
            ("ccd", self.names_ccdprotein),
            ("nonccd", self.names_nonccdprotein),
            ("transregCCD", self.names_ccdprotein_transcript_regulated),
            ("nontransregCCD", self.names_ccdprotein_nontranscript_regulated),
        )
        group_names, labels, counts, values = [], [], [], []
        counts_mappedminus, values_mappedminus, fisher_pvalues = [], [], []
        for group, protein_names in protein_groups:
            (labels_comb, counts_comb, values_comb, counts_comb_mappedminus,
             values_comb_mappedminus, fisher_comb) = self.proportion_test(
                 phosHuman, protein_names)
            # Each group's label is repeated once per row of that same group's result
            group_names.append([group] * len(labels_comb))
            labels.append(labels_comb)
            counts.append(counts_comb)
            values.append(values_comb)
            counts_mappedminus.append(counts_comb_mappedminus)
            values_mappedminus.append(values_comb_mappedminus)
            fisher_pvalues.append(fisher_comb[:, 1])  # column 1 is used as the p-value

        fisher_all = np.concatenate(fisher_pvalues)
        # Separate float copy for the correction; the exported raw column keeps its original dtype
        allfisher = np.array(fisher_all, dtype=float)
        pvals_corrected_bonf, reject_bonf = utils.bonf(0.05, allfisher)
        pd.DataFrame({
            "Group":
            np.concatenate(group_names),
            "KinaseFamily":
            np.concatenate(labels),
            "PhosphositeCount_MappedProteome":
            np.concatenate(counts_mappedminus),
            "FractionPhosphositesDownstreamOfKinase_MappedProteome":
            np.concatenate(values_mappedminus),
            "PhosphositeCount":
            np.concatenate(counts),
            "FractionPhosphositesDownstreamOfKinase":
            np.concatenate(values),
            "FisherPValue":
            fisher_all,
            "FisherPValue_BonfCorrected":
            pvals_corrected_bonf,
            "FisherPValue_BonfAlpha0.05Pass":
            reject_bonf,
        }).to_csv("output/upstreamKinaseResults.csv", index=False)