def ccd_analysis_of_spikeins(adata_spikeins, perms):
    '''ERCC spikeins were used as an internal control. We can use them to get an idea of the noise for this analysis.

    For every spike-in (column of `adata_spikeins.X`), computes the fraction of
    its variance that is retained after a moving average over cells ordered by
    `obs["fucci_time"]`, and compares that fraction against the same statistic
    computed on permuted cell orderings. Summary statistics are printed and a
    boxplot is written to figures/RNASpikeinVarianceBoxplot.png.

    Parameters
    ----------
    adata_spikeins : object exposing `.X` (cells x spike-ins) and `.obs["fucci_time"]`
        `.X` is treated as log-normalized expression, per the original inline comment.
    perms : iterable of index arrays
        Each entry is passed to `np.take(..., axis=0)` to reorder the cells.

    Returns
    -------
    None
    '''
    expression_data_spike = adata_spikeins.X  # log normalized
    # Scale every spike-in by its maximum over all cells
    normalized_exp_data_spike = (expression_data_spike.T / np.max(expression_data_spike, axis=0)[:, None]).T

    # Order the cells by their position in the cell cycle
    fucci_time_inds_spike = np.argsort(adata_spikeins.obs["fucci_time"])
    norm_exp_sort_spike = np.take(normalized_exp_data_spike, fucci_time_inds_spike, axis=0)

    # Use the module-level WINDOW here as well: the permutation null below is
    # smoothed with WINDOW, and the observed statistic is only comparable to it
    # when both use the same window (this used to be a hard-coded 100).
    moving_averages_spike = np.apply_along_axis(MovingAverages.mvavg, 0, norm_exp_sort_spike, WINDOW)
    # Same variance call as in the permutation loop, so both are computed identically
    cell_cycle_variance_spike = np.var(moving_averages_spike, axis=0)
    total_variance_spike = np.var(norm_exp_sort_spike, axis=0)
    percent_ccd_variance_spike = cell_cycle_variance_spike / total_variance_spike

    print("Percent variance of spike-in:")
    print(f"mean +/- stdev of spike-in variance explained by cell cycle: {np.mean(percent_ccd_variance_spike)} +/- {np.std(percent_ccd_variance_spike)}")
    print(f"median of spike-in variance explained by cell cycle: {np.median(percent_ccd_variance_spike)}")

    # Null distribution: the same statistic on permuted cell orderings
    percent_ccd_variance_rng_spike = []
    for iii, perm in enumerate(perms):
        if iii % 1000 == 0:
            print(f"permutation {iii}")
        norm_exp_sort_perm_spike = np.take(normalized_exp_data_spike, perm, axis=0)
        moving_averages_perm_spike = np.apply_along_axis(MovingAverages.mvavg, 0, norm_exp_sort_perm_spike, WINDOW)
        percent_ccd_variance_rng_spike.append(
            np.var(moving_averages_perm_spike, axis=0) / np.var(norm_exp_sort_perm_spike, axis=0))
    percent_ccd_variance_rng_spike = np.asarray(percent_ccd_variance_rng_spike)

    # Per spike-in: mean over permutations of (observed - permuted)
    mean_diff_from_rng_spike = np.mean((percent_ccd_variance_spike - percent_ccd_variance_rng_spike).T, 1)
    print("Percent additional variance CCD than random of spike-in")
    print(f"mean +/- stdev of spike-in mean additional percent variance from random: {np.mean(mean_diff_from_rng_spike)} +/- {np.std(mean_diff_from_rng_spike)}")
    print(f"median of spike-in addtional variance explained by cell cycle than random: {np.median(mean_diff_from_rng_spike)}")

    utils.general_boxplot(
        (percent_ccd_variance_spike, mean_diff_from_rng_spike),
        ("Percent Variance\nCCD Spike-In", "Percent Additional\nCCD Variance Spike-In"),
        "", "Percent Variance CCD", "", True, "figures/RNASpikeinVarianceBoxplot.png")
def _gene_in_cns_rows(gene_names, cns_gene_lists):
    '''Return a boolean matrix [gene, CNS row]: True where the gene is an exact entry of that row's comma-separated gene list.'''
    # Split once into sets; a plain `name in "A,B,C"` would be a substring test
    # and would let a short symbol match a longer one that contains it.
    row_gene_sets = [set(genelist.split(',')) for genelist in cns_gene_lists]
    return np.array([[name in genes for genes in row_gene_sets] for name in gene_names])


def analyze_cnv_calls(adata, ccdtranscript):
    '''Take results from cnvkit calls to analyze effects of copy number variation

    Reads input/RNAData/CnsCallSummary.tsv, which must have a "gene" column of
    comma-separated gene names and one column per cell named by its
    "Well_Plate" identifier. A value of -5 is treated as missing data and 1 as
    the neutral copy-number state (see the comparisons below).

    Writes figures/CnvConsistency.pdf, figures/CnvConsistencyPhases.pdf,
    figures/CnvCorrelation.pdf and figures/CNVStateBoxplot.pdf and prints
    summary statistics.

    Parameters
    ----------
    adata : AnnData-like object with `.X`, `.var_names`, and `.obs` columns
        "Well_Plate", "phase" and "fucci_time".
    ccdtranscript : boolean array over `adata.var_names` used to select CCD transcripts.

    Returns
    -------
    None
    '''
    cnsresults = pd.read_csv("input/RNAData/CnsCallSummary.tsv", sep="\t")
    cnsresults_gene = cnsresults["gene"]
    cnsresults_allgenes = np.concatenate([g.split(',') for g in cnsresults_gene])
    genenamedict = utils.getGeneNameDict()

    # ---- CCD transcripts, cells from all phases ----
    adata_names = np.array(utils.ccd_gene_names_gapped(adata.var_names[ccdtranscript], genenamedict))
    cell_has_cnv_call = np.isin(adata.obs["Well_Plate"], cnsresults.columns)
    ccd_gene_idx = np.arange(len(ccdtranscript))[ccdtranscript][np.isin(adata_names, cnsresults_allgenes)]
    adata_ccd_isInCns = adata[cell_has_cnv_call, ccd_gene_idx]
    adata_ccd_isInCns_names = utils.ccd_gene_names_gapped(adata_ccd_isInCns.var_names, genenamedict)
    cnsresultIdx = _gene_in_cns_rows(adata_ccd_isInCns_names, cnsresults_gene)
    # Keep only genes that map to exactly one row of the CNS summary
    geneInJustOneList = np.array([sum(x) == 1 for x in cnsresultIdx])
    adata_ccd_isInCns_inJustOneList = adata_ccd_isInCns[:, geneInJustOneList]
    cnsResultsCellData = np.array(cnsresults)[:, np.isin(cnsresults.columns, adata_ccd_isInCns_inJustOneList.obs["Well_Plate"])]

    # evaluate consistency of CNVs
    cnv_by_cell = cnsResultsCellData.T
    heatmap = np.zeros(cnv_by_cell.shape)
    heatmap[cnv_by_cell == -5] = -1
    heatmap[(cnv_by_cell > -5) & (cnv_by_cell < 1)] = 0
    heatmap[cnv_by_cell == 1] = 1
    heatmap[cnv_by_cell == 2] = 2
    heatmap[cnv_by_cell > 2] = 3
    # NOTE(review): the last 8 columns are dropped without explanation — confirm which rows of the CNS summary these are
    clustergrid = sbn.clustermap(heatmap[:, :-8], col_cluster=False)
    plt.savefig("figures/CnvConsistency.pdf")
    plt.close()

    # heatmaps for phases
    # NOTE(review): rows of `heatmap` follow the column order of the CNS file, while
    # `adata_ccd_isInCns.obs` follows the cell order of `adata`; this (and the
    # regression below) assumes the two orders agree — confirm.
    row_order = np.asarray(clustergrid.dendrogram_row.reordered_ind)
    phases_reordered = adata_ccd_isInCns.obs["phase"][row_order]
    sbn.heatmap([phases_reordered == "G1",
                 phases_reordered == "S-ph",
                 phases_reordered == "G2M"],
                yticklabels=["G1", "S", "G2"])
    plt.savefig("figures/CnvConsistencyPhases.pdf")
    plt.close()

    # is there enrichment for phase in the highly amplified genes?
    # yes, so is there correlation?
    x = adata_ccd_isInCns.obs["fucci_time"]
    y = np.mean(cnsResultsCellData, axis=0)
    linearModel = scipy.stats.linregress(np.asarray(x).astype(float), np.asarray(y).astype(float))
    fitted = linearModel.intercept + x * linearModel.slope
    plt.scatter(x * fucci.TOT_LEN, y)
    plt.scatter(x * fucci.TOT_LEN, fitted)
    plt.xlabel("Cell Division Time, hrs")
    plt.ylabel("Mean CNV of All Chromosome Arms")
    plt.savefig("figures/CnvCorrelation.pdf")
    plt.close()
    print(f"{linearModel.pvalue}: p-value for nonzero slope by two-sided t test")
    residuals = y - fitted
    residualLinearModel = scipy.stats.linregress(np.asarray(x).astype(float), np.asarray(residuals).astype(float))
    residualNormality = scipy.stats.normaltest(np.asarray(residuals))
    print(f"{residualLinearModel.pvalue}: p-value for nonzero slope of residuals by two-sided t-test")
    print(f"{residualNormality.pvalue}: p-value for normality of residuals")

    # ---- what if we only look at one phase? G1 before doubling? for all genes? ----
    adata_names = np.array(utils.ccd_gene_names_gapped(adata.var_names, genenamedict))
    adata_ccd_isInCns = adata[
        np.isin(adata.obs["Well_Plate"], cnsresults.columns) & (adata.obs["phase"] == "G1"),
        np.arange(len(adata_names))[np.isin(adata_names, cnsresults_allgenes)]]
    adata_ccd_isInCns_names = utils.ccd_gene_names_gapped(adata_ccd_isInCns.var_names, genenamedict)
    cnsresultIdx = _gene_in_cns_rows(adata_ccd_isInCns_names, cnsresults_gene)
    geneInJustOneList = np.array([sum(x) == 1 for x in cnsresultIdx])
    adata_ccd_isInCns_inJustOneList = adata_ccd_isInCns[:, geneInJustOneList]
    cnsresultIdx_inJustOneList = cnsresultIdx[geneInJustOneList]
    cnsResultsCellData = np.array(cnsresults)[:, np.isin(cnsresults.columns, adata_ccd_isInCns_inJustOneList.obs["Well_Plate"])]

    cnvAmplified, cnvPvalOneSided = [], []
    cnvDeleted, cnvPvalOneSidedDeleted = [], []
    amplifiedTpmsAll, neutralTpmsAll, deletionTpmsAll = [], [], []
    for ii, tpm in enumerate(adata_ccd_isInCns.X.T[geneInJustOneList]):
        # CNV state of this gene's single CNS row, one value per cell
        cnv = np.concatenate(cnsResultsCellData[cnsresultIdx_inJustOneList[ii], :])
        hasData = ~(cnv == -5)
        amplifiedTpms = tpm[hasData & (cnv > 1)]
        neutralTpms = tpm[hasData & (cnv == 1)]
        deletionTpms = tpm[hasData & (cnv < 1)]
        overallMedian = np.median(tpm[hasData])
        cnvAmplified.append(np.median(amplifiedTpms) > overallMedian)
        # NOTE(review): kruskal returns a non-directional p-value; doubling it does
        # not make it one-sided (it is only more conservative). Left unchanged
        # because it feeds the BH correction below — confirm the intended test.
        cnvPvalOneSided.append(scipy.stats.kruskal(amplifiedTpms, neutralTpms)[1] * 2)
        cnvDeleted.append(np.median(deletionTpms) < overallMedian)
        cnvPvalOneSidedDeleted.append(scipy.stats.kruskal(deletionTpms, neutralTpms)[1] * 2)
        amplifiedTpmsAll.extend(amplifiedTpms)
        neutralTpmsAll.extend(neutralTpms)
        deletionTpmsAll.extend(deletionTpms)
    cnvAmplified = np.asarray(cnvAmplified)
    cnvDeleted = np.asarray(cnvDeleted)

    cnvTestPvals_BH, cnvTestPvals_rejectBH = utils.benji_hoch(0.01, cnvPvalOneSided)
    cnvTestPvalsDel_BH, cnvTestPvalsDel_rejectBH = utils.benji_hoch(0.01, cnvPvalOneSidedDeleted)
    print(f"{sum(cnvAmplified & cnvTestPvals_rejectBH)}: number of novel CCD with significantly higher expression with amplified CNVs than neutral")
    # This line previously repeated the amplified message for the deletion result
    print(f"{sum(cnvDeleted & cnvTestPvalsDel_rejectBH)}: number of novel CCD with significantly lower expression with deleted CNVs than neutral")
    utils.general_boxplot([amplifiedTpmsAll, neutralTpmsAll, deletionTpmsAll], ["amplified", "neutral", "deletion"],
                          "", "logTPMs", "", False, "figures/CNVStateBoxplot.pdf")
    print(f"Of {len(cnvAmplified)} genes:")
    print(f"{scipy.stats.kruskal(amplifiedTpmsAll, neutralTpmsAll, deletionTpmsAll)[1]}: kruskal two sided pval that there's a difference between the three")
    print(f"{scipy.stats.kruskal(amplifiedTpmsAll, neutralTpmsAll)[1]}: kruskal two sided pval that there's a difference between amplified/neutral")
def _masked_intensities(channels, mask, use_log):
    '''Return each channel restricted to `mask`, log10-transformed when `use_log` is set.'''
    return [np.log10(channel[mask]) if use_log else channel[mask] for channel in channels]


def calculate_variation(use_log, u_well_plates, wp_iscell, wp_isnuc, wp_iscyto,
                        pol_sort_well_plate, pol_sort_ab_cell, pol_sort_ab_nuc,
                        pol_sort_ab_cyto, pol_sort_mt_cell, pol_sort_well_plate_imgnb):
    '''Calculate overall variation of protein staining intensity in single cells

    For every well in `u_well_plates`, computes the variance, coefficient of
    variation, gini index and mean of the per-cell intensities in each channel
    (cell, nucleus, cytoplasm, microtubules), and the same dispersion measures
    per field of view for the three protein channels. Writes several figures
    under figures/, saves the main arrays under output/pickles/, and prints
    which images are outliers relative to their well.

    Parameters
    ----------
    use_log : bool
        When true, intensities are log10-transformed before any statistic.
    u_well_plates : iterable of well identifiers compared against `pol_sort_well_plate`.
    wp_iscell, wp_isnuc, wp_iscyto : passed through to `utils.values_comp` and
        `plot_average_intensities_by_batch` to pick the compartment per well.
    pol_sort_well_plate : array with the well identifier of each cell.
    pol_sort_ab_cell, pol_sort_ab_nuc, pol_sort_ab_cyto, pol_sort_mt_cell :
        per-cell intensity arrays, indexable by a boolean mask over cells.
    pol_sort_well_plate_imgnb : array with the image (field of view) identifier of each cell.

    Returns
    -------
    tuple
        (mean_mean_comp, var_comp, gini_comp, cv_comp, var_cell, gini_cell,
        cv_cell, var_mt, gini_mt, cv_mt)
    '''
    # Channel order used throughout: cell, nucleus, cytoplasm, microtubules
    channels = (pol_sort_ab_cell, pol_sort_ab_nuc, pol_sort_ab_cyto, pol_sort_mt_cell)
    # Only the protein channels are analysed per image; microtubule per-image
    # statistics were computed before but never read.
    protein_channels = channels[:3]

    # Per-well statistics, one value per well
    var_cell, var_nuc, var_cyto, var_mt = [], [], [], []
    cv_cell, cv_nuc, cv_cyto, cv_mt = [], [], [], []
    gini_cell, gini_nuc, gini_cyto, gini_mt = [], [], [], []
    mean_mean_cell, mean_mean_nuc, mean_mean_cyto, mean_mean_mt = [], [], [], []
    well_var = (var_cell, var_nuc, var_cyto, var_mt)
    well_cv = (cv_cell, cv_nuc, cv_cyto, cv_mt)
    well_gini = (gini_cell, gini_nuc, gini_cyto, gini_mt)
    well_mean = (mean_mean_cell, mean_mean_nuc, mean_mean_cyto, mean_mean_mt)

    # Per-image statistics, one list of image values per well
    wpi_img = []
    var_cell_img, var_nuc_img, var_cyto_img = [], [], []
    gini_cell_img, gini_nuc_img, gini_cyto_img = [], [], []
    cv_cell_img, cv_nuc_img, cv_cyto_img = [], [], []
    img_var = (var_cell_img, var_nuc_img, var_cyto_img)
    img_gini = (gini_cell_img, gini_nuc_img, gini_cyto_img)
    img_cv = (cv_cell_img, cv_nuc_img, cv_cyto_img)

    # The variance needs to be calculated separately for each well because they all have different numbers of cells
    for well in u_well_plates:
        curr_well_inds = pol_sort_well_plate == well
        well_values = _masked_intensities(channels, curr_well_inds, use_log)
        for values, var_list, cv_list, gini_list, mean_list in zip(
                well_values, well_var, well_cv, well_gini, well_mean):
            var_list.append(np.var(values))
            cv_list.append(scipy.stats.variation(values))
            gini_list.append(utils.gini(values))
            mean_list.append(np.mean(values))  # Save the mean mean intensities

        # Same dispersion measures for each field of view of this well
        curr_wpi_img = list(np.unique(pol_sort_well_plate_imgnb[curr_well_inds]))
        curr_var = [[] for _ in protein_channels]
        curr_gini = [[] for _ in protein_channels]
        curr_cv = [[] for _ in protein_channels]
        for wpi in curr_wpi_img:
            curr_wpis = pol_sort_well_plate_imgnb == wpi
            for k, values in enumerate(_masked_intensities(protein_channels, curr_wpis, use_log)):
                curr_var[k].append(np.var(values))
                curr_gini[k].append(utils.gini(values))
                curr_cv[k].append(scipy.stats.variation(values))
        wpi_img.append(curr_wpi_img)
        for k in range(len(protein_channels)):
            img_var[k].append(curr_var[k])
            img_gini[k].append(curr_gini[k])
            img_cv[k].append(curr_cv[k])

    print("Plotting average intensities of proteins and microtubules by batch.")
    plot_average_intensities_by_batch(u_well_plates, mean_mean_cell, mean_mean_nuc,
                                      mean_mean_cyto, mean_mean_mt, wp_iscell, wp_isnuc, wp_iscyto)

    print("Making general plots for variance, CV, and gini by compartment")
    var_cell, var_nuc, var_cyto, var_mt = (np.array(v) for v in well_var)
    gini_cell, gini_nuc, gini_cyto, gini_mt = (np.array(g) for g in well_gini)
    cv_cell, cv_nuc, cv_cyto, cv_mt = (np.array(c) for c in well_cv)
    utils.general_boxplot(
        (var_cell, var_cyto, var_nuc, var_mt),
        ("var_cell", "var_cyto", "var_nuc", "var_mt"), "Metacompartment",
        f"Variance using {'log' if use_log else 'natural'} intensity values",
        "", True, "figures/VarianceBoxplot.png")
    utils.general_boxplot(
        (cv_cell, cv_cyto, cv_nuc, cv_mt),
        ("cv_cell", "cv_cyto", "cv_nuc", "cv_mt"), "Metacompartment",
        f"Coeff. of Var. using {'log' if use_log else 'natural'} intensity values",
        "", True, "figures/CVBoxplot.png")
    utils.general_boxplot(
        (gini_cell, gini_cyto, gini_nuc, gini_mt),
        ("gini_cell", "gini_cyto", "gini_nuc", "gini_mt"), "Metacompartment",
        f"Gini using {'log' if use_log else 'natural'} intensity values",
        "", True, "figures/GiniBoxplot.png")

    print("Making general plots for variance, CV, and gini in the compartment the protein localizes to")
    mean_mean_comp = utils.values_comp(mean_mean_cell, mean_mean_nuc, mean_mean_cyto, wp_iscell, wp_isnuc, wp_iscyto)
    cv_comp = utils.values_comp(cv_cell, cv_nuc, cv_cyto, wp_iscell, wp_isnuc, wp_iscyto)
    gini_comp = utils.values_comp(gini_cell, gini_nuc, gini_cyto, wp_iscell, wp_isnuc, wp_iscyto)
    var_comp = utils.values_comp(var_cell, var_nuc, var_cyto, wp_iscell, wp_isnuc, wp_iscyto)
    utils.general_scatter(var_comp, var_mt, "var_comp", "var_mt", "figures/var_comp_mt.png")
    utils.general_scatter(cv_comp, cv_mt, "cv_comp", "cv_mt", "figures/cv_comp_mt.png")
    utils.general_scatter(gini_comp, gini_mt, "gini_comp", "gini_mt", "figures/gini_comp_mt.png")
    utils.general_scatter(var_comp, mean_mean_comp, "var_comp",
                          f"{'log10' if use_log else 'natural'} intensity",
                          "figures/VarianceVsIntensityComp.png")

    print("Comparing image to sample variance")
    var_comp_img = utils.values_comp(var_cell_img, var_nuc_img, var_cyto_img, wp_iscell, wp_isnuc, wp_iscyto)
    gini_comp_img = utils.values_comp(gini_cell_img, gini_nuc_img, gini_cyto_img, wp_iscell, wp_isnuc, wp_iscyto)
    cv_comp_img = utils.values_comp(cv_cell_img, cv_nuc_img, cv_cyto_img, wp_iscell, wp_isnuc, wp_iscyto)
    # x: the well-level value repeated once per image of that well; y: the image-level value
    utils.general_scatter(
        np.concatenate([[var_comp[i]] * len(vvv) for i, vvv in enumerate(var_comp_img)]),
        np.concatenate(var_comp_img),
        "variance within compartment", "variance for each image", "figures/VarianceByImage.png")
    utils.general_scatter(
        np.concatenate([[gini_comp[i]] * len(vvv) for i, vvv in enumerate(gini_comp_img)]),
        np.concatenate(gini_comp_img),
        "gini within compartment", "gini for each image", "figures/GiniByImage.png")
    utils.general_scatter(
        np.concatenate([[cv_comp[i]] * len(vvv) for i, vvv in enumerate(cv_comp_img)]),
        np.concatenate(cv_comp_img),
        "cv within compartment", "cv for each image", "figures/CVByImage.png")

    all_wpi_img = np.concatenate(wpi_img)
    print(all_wpi_img[np.argmax(np.concatenate(var_comp_img))] + ": the image with the max variance")

    # Histogram is drawn and discarded; uncomment plt.show() to inspect it interactively
    plt.hist(np.concatenate([vvv / var_comp[i] for i, vvv in enumerate(var_comp_img)]))
    # plt.show()
    plt.close()
    high_var_img = all_wpi_img[np.concatenate(
        [vvv > 4 * var_comp[i] for i, vvv in enumerate(var_comp_img)])]
    print(f"{high_var_img}: the images with greater than 4x the variance of the whole sample")

    # Image CV relative to the CV of the whole well
    norm_cv_img = np.concatenate([vvv / cv_comp[i] for i, vvv in enumerate(cv_comp_img)])
    plt.hist(norm_cv_img)
    # plt.show()
    plt.close()
    cutoff = np.mean(norm_cv_img) + 3 * np.std(norm_cv_img)
    high_cv_img = all_wpi_img[norm_cv_img > cutoff]
    # This message previously repeated the "4x the variance" text although the
    # cutoff here is mean + 3 standard deviations of the normalized CV.
    print(f"{high_cv_img}: the images with normalized CV more than 3 standard deviations above the mean")

    # Pickle and return main results
    utils.np_save_overwriting("output/pickles/mean_mean_comp.npy", mean_mean_comp)
    utils.np_save_overwriting("output/pickles/cv_comp.npy", cv_comp)
    utils.np_save_overwriting("output/pickles/gini_comp.npy", gini_comp)
    utils.np_save_overwriting("output/pickles/var_comp.npy", var_comp)
    utils.np_save_overwriting("output/pickles/cv_cell.npy", cv_cell)
    utils.np_save_overwriting("output/pickles/gini_cell.npy", gini_cell)
    utils.np_save_overwriting("output/pickles/var_cell.npy", var_cell)

    return mean_mean_comp, var_comp, gini_comp, cv_comp, var_cell, gini_cell, cv_cell, var_mt, gini_mt, cv_mt
def compare_peak_expression_prot_v_rna(adata, wp_ensg, ccd_comp, ccdtranscript,
                                       wp_max_pol, wp_max_pol_ccd, sorted_maxpol_array,
                                       max_moving_avg_pol, sorted_max_moving_avg_pol_ccd):
    '''Compare the time of peak expression of protein and RNA

    Intersects the CCD proteins (`wp_ensg[ccd_comp]`) with the CCD transcripts
    (`adata.var_names[ccdtranscript]`), computes the protein-minus-RNA
    difference in peak position for the shared genes, draws the comparison
    figures, writes three CSV tables under output/, and appends a text summary
    to output/figuresofmerit.txt.

    Parameters
    ----------
    adata : AnnData-like object; only `.var_names` is read here.
    wp_ensg : array of gene identifiers, one per protein sample.
    ccd_comp : mask/index into `wp_ensg` selecting CCD proteins.
    ccdtranscript : mask/index into `adata.var_names` selecting CCD transcripts.
    wp_max_pol : peak position per protein sample (same length as `wp_ensg`);
        multiplied by `fucci.TOT_LEN` to report hours.
    wp_max_pol_ccd : peak position per CCD protein, indexed like `wp_ensg[ccd_comp]`.
    sorted_maxpol_array : protein peak positions used for the "peaked after G1" figure of merit.
    max_moving_avg_pol : peak position per transcript (same length as `adata.var_names`).
    sorted_max_moving_avg_pol_ccd : peak position per CCD transcript,
        indexed like `adata.var_names[ccdtranscript]`.

    Returns
    -------
    None
    '''
    prot_ccd_ensg = list(wp_ensg[ccd_comp])
    rna_ccd_ensg = list(adata.var_names[ccdtranscript])
    both_ccd_ensg = np.intersect1d(prot_ccd_ensg, rna_ccd_ensg)
    # list.index picks the first occurrence when a gene appears more than once
    both_prot_ccd_idx = np.array([prot_ccd_ensg.index(ensg) for ensg in both_ccd_ensg])
    both_rna_ccd_idx = np.array([rna_ccd_ensg.index(ensg) for ensg in both_ccd_ensg])
    insct_prot_max_pol_ccd = wp_max_pol_ccd[both_prot_ccd_idx]
    insct_rna_max_pol_ccd = sorted_max_moving_avg_pol_ccd[both_rna_ccd_idx]
    diff_max_pol = insct_prot_max_pol_ccd - insct_rna_max_pol_ccd

    # Sanity check: double check that the names line up
    prot_names = np.array(prot_ccd_ensg)[both_prot_ccd_idx]
    rna_names = np.array(rna_ccd_ensg)[both_rna_ccd_idx]
    print(f"The name arrays are the same: {all(prot_names == rna_names)}")

    # Alluvial plot showing RNA-protein phase of peak expression for each genes
    peak_expression_alluvial(diff_max_pol, insct_rna_max_pol_ccd, insct_prot_max_pol_ccd)

    # Histogram for peak expression
    utils.general_histogram(
        diff_max_pol * fucci.TOT_LEN,
        "Delay in peak protein expression from peak RNA expression, hrs",
        "Count of CCD Proteins", 0.5, "figures/DelayPeakProteinRNA.pdf")

    # Scatter for peak expression with colorbar for the delay
    peak_expression_delay_scatter(insct_rna_max_pol_ccd, insct_prot_max_pol_ccd, diff_max_pol)

    # Boxplot for delay of peak expression
    utils.general_boxplot(
        (insct_prot_max_pol_ccd * fucci.TOT_LEN, insct_rna_max_pol_ccd * fucci.TOT_LEN),
        ("Protein", "RNA"), "", "Peak Expression, hrs", "", True,
        "figures/DelayPeakProteinRNA_boxplot.png")

    median_delay_hrs = fucci.TOT_LEN * np.median(diff_max_pol)
    print(f"Count of prot CCD genes: {len(prot_ccd_ensg)}")
    print(f"Count of CCD RNA genes: {len(rna_ccd_ensg)}")
    print(f"Count of intersection betweeen CCD prot and CCD RNA: {len(both_ccd_ensg)}")
    print(f"Median delay of RNA and protein expression time for CCD proteins: {median_delay_hrs}")
    print(f"Median RNA expression time for CCD proteins: {fucci.TOT_LEN * np.median(insct_rna_max_pol_ccd)}")
    print(f"Median protein expression time for CCD proteins: {fucci.TOT_LEN * np.median(insct_prot_max_pol_ccd)}")

    # NOTE(review): kruskal's statistic carries no direction, so its p-value cannot
    # be made one-sided by scaling; 2*p is kept as before — confirm the intended test.
    _, p = scipy.stats.kruskal(insct_rna_max_pol_ccd, insct_prot_max_pol_ccd)
    print(f"One-sided kruskal for median protein expression time higher than median RNA expression time: {2*p}")

    # ttest_1samp returns the two-sided p-value. For the alternative "mean > 0"
    # the one-sided value is p/2 when t > 0 and 1 - p/2 otherwise (this used to
    # be reported as 2*p, which can exceed 1).
    t, p = scipy.stats.ttest_1samp(diff_max_pol, 0)
    p_one_sided = p / 2 if t > 0 else 1 - p / 2
    print(f"One-sided, one-sample t-test for mean delay in protein expression larger than zero: {p_one_sided}")

    # Output tables
    pd.DataFrame({
        "gene": wp_ensg,
        "max_pol_protein": wp_max_pol,
        "max_time_protein": wp_max_pol * fucci.TOT_LEN,
    }).to_csv("output/max_pol_protein.csv", index=False)
    pd.DataFrame({
        "gene": adata.var_names,
        "max_pol_rna": max_moving_avg_pol,
        "max_time_rna": max_moving_avg_pol * fucci.TOT_LEN,
    }).to_csv("output/max_pol_rna.csv", index=False)
    pd.DataFrame({
        "gene": both_ccd_ensg,
        "insct_prot_max_pol_ccd": insct_prot_max_pol_ccd,
        "insct_rna_max_pol_ccd": insct_rna_max_pol_ccd,
        "diff_max_pol": diff_max_pol,
    }).to_csv("output/diff_max_pol.csv", index=False)

    # Figures of merit
    peaked_after_g1_prot = sorted_maxpol_array * fucci.TOT_LEN > fucci.G1_LEN
    pct_rna_peak_in_g1 = 100 * sum(sorted_max_moving_avg_pol_ccd * fucci.TOT_LEN <= fucci.G1_LEN) / len(sorted_max_moving_avg_pol_ccd)
    pct_prot_peak_after_g1 = 100 * sum(peaked_after_g1_prot) / len(np.unique(wp_ensg[ccd_comp]))
    with open("output/figuresofmerit.txt", "a") as file:
        fom = "--- temporal delay\n\n"
        fom += f"significant delay in peak protein expression compared to transcript expression, {median_delay_hrs} hours on average" + "\n\n"
        fom += f"G1 is the longest period of the cell cycle, in which the majority of RNAs ({pct_rna_peak_in_g1}%) peak in expression" + "\n\n"
        fom += f"However, the majority ({pct_prot_peak_after_g1}%) of the proteins peaked towards the end of the cell cycle corresponding to the S&G2 phases" + "\n\n"
        # Report the actual size of the intersection instead of a hard-coded 50
        fom += f"The delay between peak RNA and protein expression for the {len(both_ccd_ensg)} CCD proteins that also had CCD transcripts was {median_delay_hrs} hrs on average " + "\n\n"
        # NOTE(review): the constant 12 is unexplained here — confirm what it represents
        fom += f"this delay indicates that it may take a little less than the same amount of time ({12 - median_delay_hrs} hrs) to produce a target metabolite after peak expression of an enzyme." + "\n\n"
        # Three trailing blank paragraphs, as the original empty entries produced
        fom += "\n\n" * 3
        print(fom)
        file.write(fom)