Ejemplo n.º 1
0
def zero_center_fucci(green_fucci, red_fucci, u_plate, well_plate, plate):
    '''
    Log-transform the FUCCI intensities, center them per plate, and rescale them.

    For each channel: take log10, subtract the log10 of the median intensity of
    the plate each sample belongs to, then shift by the minimum and divide by
    the maximum of the centered values. The centered arrays, the rescaled
    arrays, and the stacked two-column array are saved under output/pickles/.

    Returns a 7-tuple:
        (log green, log red, centered green, centered red,
         rescaled green, rescaled red, column-stacked [green, red] rescaled data)
    '''
    # Boolean sample mask per plate, keyed by the plate's string form
    plate_masks = {str(p): plate == p for p in u_plate}
    # The plate key of a sample is the second "_"-separated field of its well_plate value
    sample_plate_keys = [wp.split("_")[1] for wp in well_plate]

    def center_and_rescale(intensities):
        logged = np.log10(intensities)
        plate_logmedians = {
            key: np.log10(np.median(intensities[mask]))
            for key, mask in plate_masks.items()
        }
        per_sample_logmedian = np.array([plate_logmedians[key] for key in sample_plate_keys])
        centered = np.array(logged) - per_sample_logmedian
        # NOTE(review): the divisor is the maximum of the centered values rather than
        # the max-min range, so the result is not confined to [0, 1] -- confirm intended.
        rescaled = (centered - np.min(centered)) / np.max(centered)
        return logged, centered, rescaled

    log_green, green_centered, green_rescaled = center_and_rescale(green_fucci)
    log_red, red_centered, red_rescaled = center_and_rescale(red_fucci)
    stacked = np.column_stack([green_rescaled, red_rescaled])

    # Pickle the results
    for path, values in (
            ("output/pickles/log_green_fucci_zeroc.npy", green_centered),
            ("output/pickles/log_red_fucci_zeroc.npy", red_centered),
            ("output/pickles/log_green_fucci_zeroc_rescale.npy", green_rescaled),
            ("output/pickles/log_red_fucci_zeroc_rescale.npy", red_rescaled),
            ("output/pickles/fucci_data.npy", stacked)):
        utils.np_save_overwriting(path, values)

    return (log_green, log_red,
            green_centered, red_centered,
            green_rescaled, red_rescaled,
            stacked)
def general_plots(u_plates):
    '''
    Make plots to illustrate the results of the scRNA-Seq analysis.

    Reads protein-coding TPM values for the given plates, writes QC plots before
    and after filtering, runs neighbor-graph/Louvain/UMAP steps through scanpy,
    and writes an RNA abundance distribution figure. Nothing is returned; the
    outputs are figure files and output/pickles/louvain.npy.
    '''
    valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding"
    # `phases` is not used further in this function
    adata, phases = read_counts_and_phases(valuetype, use_spikeins, biotype_to_use, u_plates)

    # QC plots before filtering
    sc.pl.highest_expr_genes(adata, n_top=20, show=False, save="AllCells.pdf")

    # Post filtering QC
    do_log_normalization = True
    do_remove_blob = False
    # `phasesfilt` is not used further in this function
    adata, phasesfilt = qc_filtering(adata, do_log_normalization, do_remove_blob)
    # NOTE(review): this call passes a single argument, whereas the zero_center_fucci
    # defined in this file takes five (green_fucci, red_fucci, u_plate, well_plate, plate);
    # presumably a different, adata-based function of the same name is intended -- confirm.
    adata = zero_center_fucci(adata)
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pl.highly_variable_genes(adata, show=False, save="AllCells.pdf")

    # Louvain clustering and UMAP plots
    # Idea: based on the expression of all genes, do the cell cycle phases cluster together?
    # Execution: scanpy methods: UMAP statistics first, then make UMAP
    # Output: UMAP plots
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    sc.tl.louvain(adata)
    # Persist the cluster labels that sc.tl.louvain stored in adata.obs["louvain"]
    utils.np_save_overwriting("output/pickles/louvain.npy", adata.obs["louvain"])
    sc.tl.umap(adata)
    # Changes the global matplotlib default figure size, not just this plot's
    plt.rcParams['figure.figsize'] = (10, 10)
    sc.pl.umap(adata, color=["phase"], show=False, save="AllCellsSeqCenterPhase.pdf")

    # General display of RNA abundances in TPMs
    # NOTE(review): do_log_normalization is True above, so adata.X is presumably
    # log-normalized here and the "TPM" axis label may be inaccurate -- confirm.
    sbn.displot(np.concatenate(adata.X), color="tab:orange")
    plt.xlabel("TPM")
    plt.ylabel("Density")
    plt.savefig("figures/rna_abundance_density.pdf")
    plt.close()
def metacompartments(u_well_plates, compartment_dict,
                     my_df_filtered_variation):
    '''
    Flag each unique well_plate by its annotated compartment and drop unannotated samples.

    Parameters:
        u_well_plates: array of unique well_plate identifiers (indexed with a
            boolean mask below, so it must support NumPy mask indexing).
        compartment_dict: mapping from well_plate to a compartment string; the
            string is lower-cased and matched on the prefixes "cell", "nuc", "cyto".
        my_df_filtered_variation: table with a `well_plate` attribute; rows whose
            well_plate has no recognized compartment are removed.

    Returns:
        (wp_iscell, wp_isnuc, wp_iscyto, filtered table), where the first three are
        boolean arrays aligned with u_well_plates. The three masks are also saved
        under output/pickles/.
    '''
    def compartment_startswith(prefix):
        # Wells missing from compartment_dict are False for every compartment.
        # dtype=bool keeps the mask invertible with ~ even when u_well_plates is empty.
        return np.asarray([
            wp in compartment_dict
            and compartment_dict[wp].lower().startswith(prefix)
            for wp in u_well_plates
        ], dtype=bool)

    wp_iscell = compartment_startswith("cell")
    wp_isnuc = compartment_startswith("nuc")
    wp_iscyto = compartment_startswith("cyto")

    # Pickle the results
    utils.np_save_overwriting("output/pickles/wp_iscell.npy", wp_iscell)
    utils.np_save_overwriting("output/pickles/wp_isnuc.npy", wp_isnuc)
    utils.np_save_overwriting("output/pickles/wp_iscyto.npy", wp_iscyto)

    wp_nocompartmentinfo = ~wp_iscell & ~wp_isnuc & ~wp_iscyto
    print(
        f"{sum(wp_nocompartmentinfo)}: samples without compartment information; to be filtered since they're biologically defined as CCD and not included in the analysis"
    )
    print(
        f"{len(my_df_filtered_variation)}: number of cells before filtering for compartment information"
    )
    my_df_filtered_compartmentvariation = my_df_filtered_variation[
        ~np.isin(my_df_filtered_variation.well_plate,
                 u_well_plates[wp_nocompartmentinfo])]
    # This count is taken after the filter; the message previously said "before"
    print(
        f"{len(my_df_filtered_compartmentvariation)}: number of cells after filtering for compartment information"
    )
    return wp_iscell, wp_isnuc, wp_iscyto, my_df_filtered_compartmentvariation
def pseudotime_protein(fucci_data, ab_nuc, ab_cyto, ab_cell, mt_cell, area_cell, area_nuc, well_plate, well_plate_imgnb, well_plate_imgnb_objnb,
                        log_red_fucci_zeroc_rescale, log_green_fucci_zeroc_rescale, mockbulk_phases):
    '''
    Build the polar-coordinate model from the FUCCI data and reorder the per-cell arrays by it.

    The two columns of fucci_data are passed to fucci_polar_coords; the resulting
    sort indices are used by pol_sort to reorder the sample identifiers, antibody
    and microtubule intensities, areas, rescaled FUCCI intensities and mock-bulk
    phases. Two plots are generated and the sorted arrays are saved under
    output/pickles/.

    Returns a 13-tuple of the sorted arrays in the order:
        well_plate, pol_sort_norm_rev, well_plate_imgnb, well_plate_imgnb_objnb,
        ab_nuc, ab_cyto, ab_cell, mt_cell, area_cell, area_nuc,
        centered data column 1, centered data column 0, mock-bulk phases.
    '''
    # Generate model
    (norm_rev, centered, sorted_centered0, sorted_centered1,
     sort_inds, _sort_inds_reorder, more_than_start, less_than_start,
     start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, r_2, start_phi,
     ) = fucci_polar_coords(fucci_data[:, 0], fucci_data[:, 1], "Protein")

    # Sort results by pseudotime
    (s_well_plate, s_well_plate_imgnb, s_well_plate_imgnb_objnb,
     s_ab_nuc, s_ab_cyto, s_ab_cell, s_mt_cell,
     s_area_cell, s_area_nuc, s_fred, s_fgreen, s_mockbulk_phases,
     ) = pol_sort(
        sort_inds, more_than_start, less_than_start,
        well_plate, well_plate_imgnb, well_plate_imgnb_objnb,
        ab_nuc, ab_cyto, ab_cell, mt_cell, area_cell, area_nuc,
        log_red_fucci_zeroc_rescale, log_green_fucci_zeroc_rescale, mockbulk_phases)

    # Generate some plots
    fucci_hist2d(
        centered, cart_data_ur, start_pt, g1_end_pt, g1s_end_pt, "Protein",
        r_2, start_phi, 200, True,
        s_well_plate, s_ab_nuc, sorted_centered0, sorted_centered1)
    plot_fucci_intensities_on_pseudotime(norm_rev, sorted_centered1, sorted_centered0)

    # Pickle the results, in the same order as before
    to_save = [
        ("output/pickles/pol_sort_well_plate.npy", s_well_plate),
        ("output/pickles/pol_sort_well_plate_imgnb.npy", s_well_plate_imgnb),
        ("output/pickles/pol_sort_well_plate_imgnb_objnb.npy", s_well_plate_imgnb_objnb),
        ("output/pickles/pol_sort_norm_rev.npy", norm_rev),
        ("output/pickles/pol_sort_ab_nuc.npy", s_ab_nuc),
        ("output/pickles/pol_sort_ab_cyto.npy", s_ab_cyto),
        ("output/pickles/pol_sort_ab_cell.npy", s_ab_cell),
        ("output/pickles/pol_sort_mt_cell.npy", s_mt_cell),
        ("output/pickles/pol_sort_area_cell.npy", s_area_cell),
        ("output/pickles/pol_sort_area_nuc.npy", s_area_nuc),
        ("output/pickles/pol_sort_fred.npy", s_fred),
        ("output/pickles/pol_sort_fgreen.npy", s_fgreen),
        ("output/pickles/pol_sort_centered_data0.npy", sorted_centered0),
        ("output/pickles/pol_sort_centered_data1.npy", sorted_centered1),
    ]
    for path, values in to_save:
        utils.np_save_overwriting(path, values)

    return (s_well_plate, norm_rev, s_well_plate_imgnb, s_well_plate_imgnb_objnb,
            s_ab_nuc, s_ab_cyto, s_ab_cell, s_mt_cell,
            s_area_cell, s_area_nuc, sorted_centered1, sorted_centered0, s_mockbulk_phases)
Ejemplo n.º 5
0
def gaussian_clustering_analysis(alpha_gauss, doGeneratePlots, g1, sph, g2,
             wp_ensg, well_plate, u_well_plates, ab_cell, ab_nuc, ab_cyto, mt_cell, wp_iscell, wp_isnuc, wp_iscyto):
    '''
    Analyze the results of Gaussian clustering of FUCCI data for each protein antibody staining.

    For every unique well_plate, a Kruskal-Wallis test compares the intensities of
    the cells flagged g1, sph and g2, separately for the cell, nucleus, cytosol and
    microtubule measurements. P-values are corrected with utils.benji_hoch at
    alpha_gauss, once for the annotated compartment of each well and once for the
    microtubules. Results and phase labels are saved under output/pickles/.

    Parameters:
        alpha_gauss: significance level passed to utils.benji_hoch.
        doGeneratePlots: when true, write box plots for each well.
        g1, sph, g2: per-cell boolean masks (combined with `&` below).
        wp_ensg: gene identifier per unique well_plate; used for plot file prefixes.
        well_plate: per-cell well_plate identifiers.
        u_well_plates: unique well_plate identifiers.
        ab_cell, ab_nuc, ab_cyto, mt_cell: per-cell intensity arrays.
        wp_iscell, wp_isnuc, wp_iscyto: per-well compartment flags.

    Returns:
        (adjusted compartment p-values, compartment pass flags,
         adjusted microtubule p-values, microtubule pass flags)
    '''
    wp_cell_kruskal, wp_nuc_kruskal, wp_cyto_kruskal, wp_mt_kruskal = [], [], [], []
    curr_wp_phases = []
    # NOTE(review): the two-character template fixes the string width of this array,
    # so phase labels longer than two characters would be truncated on assignment --
    # confirm get_phase_strings only returns labels of at most two characters.
    mockbulk_phases = np.array(["  "] * len(ab_cell))

    # File prefix per well: "<ensg>_<k>", where k counts the earlier wells with the
    # same ensg. A running count replaces re-scanning the prefix of wp_ensg for
    # every element (quadratic) and yields the same strings.
    earlier_occurrences = {}
    prefix_list = []
    for ensg in wp_ensg:
        seen = earlier_occurrences.get(ensg, 0)
        prefix_list.append(f"{ensg}_{seen}")
        earlier_occurrences[ensg] = seen + 1
    fileprefixes = np.array(prefix_list)

    for iii, wp in enumerate(u_well_plates):
        curr_well_inds = well_plate == wp
        curr_wp_g1 = curr_well_inds & g1
        curr_wp_sph = curr_well_inds & sph
        curr_wp_g2 = curr_well_inds & g2
        curr_wp_phase_list = get_phase_strings(g1[curr_well_inds], sph[curr_well_inds], g2[curr_well_inds])
        mockbulk_phases[curr_well_inds] = np.asarray(curr_wp_phase_list)
        curr_wp_phases.append(curr_wp_phase_list)

        # Index [1] of the Kruskal-Wallis result is the p-value
        wp_cell_kruskal.append(scipy.stats.kruskal(ab_cell[curr_wp_g1], ab_cell[curr_wp_sph], ab_cell[curr_wp_g2])[1])
        wp_nuc_kruskal.append(scipy.stats.kruskal(ab_nuc[curr_wp_g1], ab_nuc[curr_wp_sph], ab_nuc[curr_wp_g2])[1])
        wp_cyto_kruskal.append(scipy.stats.kruskal(ab_cyto[curr_wp_g1], ab_cyto[curr_wp_sph], ab_cyto[curr_wp_g2])[1])
        wp_mt_kruskal.append(scipy.stats.kruskal(mt_cell[curr_wp_g1], mt_cell[curr_wp_sph], mt_cell[curr_wp_g2])[1])

        # Intensities of the compartment annotated for this well: cell, else nucleus, else cytosol
        ab_comp = ab_cell if wp_iscell[iii] else ab_nuc if wp_isnuc[iii] else ab_cyto
        max_val_for_norm = np.max(ab_comp[curr_well_inds])
        max_mt_for_norm = np.max(mt_cell[curr_well_inds])
        if doGeneratePlots:
            # Values are scaled by the well's maximum before plotting
            gaussian_boxplot_result(
                ab_comp[curr_wp_g1] / max_val_for_norm,
                ab_comp[curr_wp_sph] / max_val_for_norm,
                ab_comp[curr_wp_g2] / max_val_for_norm,
                "figures/GaussianBoxplots", fileprefixes[iii])
            gaussian_boxplot_result(
                mt_cell[curr_wp_g1] / max_mt_for_norm,
                mt_cell[curr_wp_sph] / max_mt_for_norm,
                mt_cell[curr_wp_g2] / max_mt_for_norm,
                "figures/GaussianBoxplots_mt", f"{fileprefixes[iii]}_mt")

    # multiple testing correction for protein of interest
    wp_comp_kruskal_gaussccd_p = utils.values_comp(wp_cell_kruskal, wp_nuc_kruskal, wp_cyto_kruskal, wp_iscell, wp_isnuc, wp_iscyto)
    wp_comp_kruskal_gaussccd_adj, wp_pass_kruskal_gaussccd_bh_comp = utils.benji_hoch(alpha_gauss, wp_comp_kruskal_gaussccd_p)
    utils.np_save_overwriting("output/pickles/wp_comp_kruskal_gaussccd_adj.npy", wp_comp_kruskal_gaussccd_adj)
    utils.np_save_overwriting("output/pickles/wp_pass_kruskal_gaussccd_bh_comp.npy", wp_pass_kruskal_gaussccd_bh_comp)

    # multiple testing correction for microtubules
    wp_mt_kruskal_gaussccd_adj, wp_pass_gaussccd_bh_mt = utils.benji_hoch(alpha_gauss, wp_mt_kruskal)
    utils.np_save_overwriting("output/pickles/wp_mt_kruskal_gaussccd_adj.npy", wp_mt_kruskal_gaussccd_adj)
    utils.np_save_overwriting("output/pickles/wp_pass_gaussccd_bh_mt.npy", wp_pass_gaussccd_bh_mt)

    # save the phase information (object dtype because wells differ in cell count)
    utils.np_save_overwriting("output/pickles/curr_wp_phases.npy", np.array(curr_wp_phases, dtype=object))
    utils.np_save_overwriting("output/pickles/mockbulk_phases.npy", np.array(mockbulk_phases))

    print(f"{len(wp_pass_kruskal_gaussccd_bh_comp)}: number of genes tested")
    print(f"{sum(wp_pass_kruskal_gaussccd_bh_comp)}: number of passing genes at {alpha_gauss*100}% FDR in compartment")

    return wp_comp_kruskal_gaussccd_adj, wp_pass_kruskal_gaussccd_bh_comp, wp_mt_kruskal_gaussccd_adj, wp_pass_gaussccd_bh_mt
Ejemplo n.º 6
0
def analyze_ccd_variation_by_mvavg_rna(adata, wp_ensg, ccd_comp, bioccd, adata_nonccdproteins, adata_regevccdgenes,
               biotype_to_use, use_isoforms=False, make_mvavg_plots_isoforms=False):
    '''
    Test each transcript for cell-cycle-dependent (CCD) variation along FUCCI pseudotime.

    Expression values in adata.X are divided by each column's maximum, cells are
    ordered by adata.obs["fucci_time"], and the variance of the moving average is
    divided by the total variance. The same ratio is computed for random cell
    orderings; a transcript is called CCD when the mean difference between its
    ratio and the randomized ratios exceeds MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM.
    Tables, pickles, per-category copies of the moving-average plots, and a
    figures-of-merit summary are written to disk.

    Parameters:
        adata: object providing .X, .obs (with "fucci_time") and .var_names.
        wp_ensg: array of gene ids per well, indexed by ccd_comp.
        ccd_comp: index/mask into wp_ensg selecting pseudotime-CCD proteins.
        bioccd: gene ids labelled "Mitotic" in the output table.
        adata_nonccdproteins, adata_regevccdgenes: presumably boolean masks aligned
            with adata.var_names (they are combined with `&` and used as indices).
        biotype_to_use: string inserted into the output CSV file name.
        use_isoforms: switches the permutation count and adds an "Isoforms"
            suffix to output names.
        make_mvavg_plots_isoforms: when use_isoforms is set, controls whether
            per-gene plots are made and sorted into folders.

    Returns:
        (percent_ccd_variance, total_gini, mean_diff_from_rng, pass_meandiff,
         eq_percvar_adj, fucci_time_inds, norm_exp_sort, moving_averages,
         mvavg_xvals, perms, ccdtranscript, ccdprotein, mvpercs)

    NOTE(review): the randomized ratios are loaded from disk when the cache file
    exists, while `perms` is always freshly drawn, so the returned `perms` need
    not be the permutations behind the cached values.
    '''
    suffix = '' if not use_isoforms else 'Isoforms'

    expression_data = adata.X # log normalized
    normalized_exp_data = (expression_data.T / np.max(expression_data, axis=0)[:,None]).T
    fucci_time_inds = np.argsort(adata.obs["fucci_time"])
    norm_exp_sort = np.take(normalized_exp_data, fucci_time_inds, axis=0)
    moving_averages = np.apply_along_axis(MovingAverages.mvavg, 0, norm_exp_sort, WINDOW)
    mvavg_xvals = MovingAverages.mvavg(adata.obs["fucci_time"][fucci_time_inds], WINDOW)
    cell_cycle_variance = np.var(moving_averages, 0)
    total_variance = np.var(norm_exp_sort, 0)
    total_gini = np.apply_along_axis(utils.gini, 0, norm_exp_sort)
    percent_ccd_variance = cell_cycle_variance / total_variance

    # randomize and calculate the mean difference in percent variances from random
    percent_ccd_variance_rng = []
    perms = np.asarray([np.random.permutation(len(adata.obs)) for nnn in np.arange(PERMUTATIONS if not use_isoforms else PERMUTATIONS_ISOFORMS)])
    picklePath = f"output/pickles/percent_ccd_variance_rng{suffix}.npy"
    meandiffPath = f"output/pickles/mean_diff_from_rng{suffix}.npy"
    if not os.path.exists(picklePath):
        for iii, perm in enumerate(perms):
            if iii % 50 == 0: print(f"permutation {iii}")
            norm_exp_sort_perm = np.take(normalized_exp_data, perm, axis=0)
            moving_averages_perm = np.apply_along_axis(MovingAverages.mvavg, 0, norm_exp_sort_perm, WINDOW)
            percent_ccd_variance_rng.append(
                    np.var(moving_averages_perm, axis=0) / np.var(norm_exp_sort_perm, axis=0))
        utils.np_save_overwriting(picklePath, percent_ccd_variance_rng)
    else:
        # allow_pickle=True is only appropriate because this file is a cache this
        # function wrote itself; do not point it at untrusted data.
        percent_ccd_variance_rng = np.load(picklePath, allow_pickle=True)
    percent_ccd_variance_rng = np.asarray(percent_ccd_variance_rng)
    mean_diff_from_rng = np.mean((percent_ccd_variance - percent_ccd_variance_rng).T, 1)
    utils.np_save_overwriting(meandiffPath, mean_diff_from_rng)

    # Statistical testing based on randomization analysis
    alpha_ccd = 0.01
    pass_meandiff = mean_diff_from_rng > MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM
    # Extra positional arguments are forwarded to scipy.stats.wilcoxon as
    # (y, zero_method, correction, alternative); element [1] of each result is the p-value.
    ccd_var_comp_rng_wilcoxp = np.apply_along_axis(scipy.stats.wilcoxon, 1, (percent_ccd_variance - percent_ccd_variance_rng).T, None, "wilcox", False, "greater").T[1].T
    eq_percvar_adj, _ = utils.bonf(alpha_ccd, ccd_var_comp_rng_wilcoxp)

    is_pseudotime_ccd = np.isin(adata.var_names, wp_ensg[ccd_comp])
    is_mitotic_ccd = np.isin(adata.var_names, bioccd)
    ccdprotein = np.isin(adata.var_names, np.concatenate((wp_ensg[ccd_comp], bioccd)))
    gene_info = pd.read_csv("input/RNAData/IdsToNames.csv.gz", index_col=False, header=None, names=["gene_id", "name", "biotype", "description"])
    gene_id_name = dict(zip(gene_info["gene_id"], gene_info["name"]))
    # The padded template fixes the string width of the array so that the longest
    # label assigned below ("Pseudotime&Mitotic") is not truncated.
    ccdstring = np.array(["No                 "] * len(ccdprotein))
    ccdstring[is_pseudotime_ccd] = "Pseudotime"
    ccdstring[is_mitotic_ccd] = "Mitotic"
    ccdstring[is_pseudotime_ccd & is_mitotic_ccd] = "Pseudotime&Mitotic"
    percent_variance_tests = pd.DataFrame(
        {"gene" : adata.var_names,
        "name" : [gene_id_name.get(x, "") for x in adata.var_names],
        "ccd_transcript" : pass_meandiff,
        "regev_ccd" : adata_regevccdgenes,
        "ccd_protein" : ccdstring,
        "nonccd_protein" : adata_nonccdproteins,
        "mean_diff_from_rng":mean_diff_from_rng,
        "-log10 CCD FDR":-np.log10(eq_percvar_adj)})
    percent_variance_tests.to_csv(f"output/transcript_regulation{biotype_to_use}{suffix}.csv", index=False)

    # And keep track of the ccd genes with and without transcript regulation
    ccdtranscript = pass_meandiff
    ccdprotein_transcript_regulated = ccdprotein & pass_meandiff
    ccdprotein_nontranscript_regulated = ccdprotein & ~pass_meandiff
    ccdtranscript_names = np.array(adata.var_names)[ccdtranscript]
    proteinccd_transcript_regulated_names = np.array(adata.var_names)[ccdprotein_transcript_regulated]
    proteinccd_nontranscript_regulated_names = np.array(adata.var_names)[ccdprotein_nontranscript_regulated]
    utils.np_save_overwriting(f"output/pickles/ccdprotein{suffix}.npy", ccdprotein) # pseudotime/mitotic ccd, might not have all the proteins, since this only has proteins not filtered in RNA-Seq analysis
    utils.np_save_overwriting(f"output/pickles/ccdtranscript{suffix}.npy", ccdtranscript)
    utils.np_save_overwriting(f"output/pickles/ccdprotein_transcript_regulated{suffix}.npy", ccdprotein_transcript_regulated)
    utils.np_save_overwriting(f"output/pickles/ccdprotein_nontranscript_regulated{suffix}.npy", ccdprotein_nontranscript_regulated)
    pd.DataFrame({"gene" : ccdtranscript_names}).to_csv(f"output/all_ccdtranscript_names{suffix}.csv")
    pd.DataFrame({"gene" : proteinccd_transcript_regulated_names}).to_csv(f"output/proteinccd_transcript_regulated_names{suffix}.csv")
    pd.DataFrame({"gene" : proteinccd_nontranscript_regulated_names}).to_csv(f"output/proteinccd_nontranscript_regulated_names{suffix}.csv")
    pd.DataFrame({"gene" : adata.var_names}).to_csv(f"output/gene_names{suffix}.csv")

    # Make the per-gene plots, then sort copies of them into one folder per category
    mvpercs = [] if use_isoforms and not make_mvavg_plots_isoforms else mvavg_plots_pergene(adata, fucci_time_inds, norm_exp_sort, moving_averages, mvavg_xvals, use_isoforms)
    if not use_isoforms or make_mvavg_plots_isoforms:
        # NOTE(review): isoform figures go to a hard-coded f:/CellCycle/ location.
        root = 'f:/CellCycle/' if use_isoforms else ''
        folder = f"{root}figures/RNAPseudotimes{suffix}"
        ccdtransccdprotfolder = f"{root}figures/RNA_CCDTranscriptCCDProtein{suffix}"
        ccdtransnonccdprotfolder = f"{root}figures/RNA_CCDTranscriptNonCCDProtein{suffix}"
        nonccdtransccdprotfolder = f"{root}figures/RNA_NonCCDTranscriptCCDProtein{suffix}"
        nonccdfolder = f"{root}figures/RNA_NonCCD"
        # (category mask, destination folder); `folder` itself is only read from here
        destinations = [
            (ccdtranscript & ~ccdprotein, ccdtransnonccdprotfolder),        # CCD transcript & not CCD protein
            (ccdprotein_transcript_regulated, ccdtransccdprotfolder),        # CCD transcript & CCD protein
            (ccdprotein_nontranscript_regulated, nonccdtransccdprotfolder),  # Not CCD transcript & CCD protein
            (~ccdtranscript & ~ccdprotein, nonccdfolder),                    # Non-CCD
        ]
        for _, destination in destinations:
            # Also creates missing parent folders and tolerates existing ones
            os.makedirs(destination, exist_ok=True)
        for mask, destination in destinations:
            for ensg in adata.var_names[mask]:
                shutil.copy(os.path.join(folder, ensg+'_mvavg.pdf'), os.path.join(destination, ensg+'_mvavg.pdf'))

    # Figures of merit
    # The unit label follows use_isoforms (it was previously inverted)
    unit = 'transcript isoforms' if use_isoforms else 'genes'
    with open("output/figuresofmerit.txt", "a") as file:
        fom = "--- RNA pseudotime\n\n"
        fom += f"We identified {sum(ccdtranscript)} {unit} of {len(ccdtranscript)} protein-coding {unit} analyzed ({100 * sum(ccdtranscript) / len(ccdtranscript)}%) to have variance in expression levels correlated to cell cycle progression" + "\n\n"
        if not use_isoforms:
            fom += f"We can attribute only {100 * sum(ccdprotein_transcript_regulated) / sum(ccdprotein)}% of proteomic cell cycle regulation to transcriptomic cycling with single-cell RNA sequencing" + "\n\n"
            fom += f"This includes {100 * sum(np.isin(adata.var_names[mean_diff_from_rng > MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM], adata.var_names[adata_regevccdgenes])) / sum(adata_regevccdgenes)}% of known CCD transcripts. Of these, {sum(ccdprotein_transcript_regulated)} were also cell cycle dependent proteins ({100 * sum(ccdprotein_transcript_regulated) / sum(ccdprotein)}%). Of the {sum(ccdprotein)} CCD proteins, {sum(ccdprotein_nontranscript_regulated)} did not have CCD transcripts, including DUSP18 (Figure 2E). There were {sum(ccdtranscript & adata_nonccdproteins)} CCD transcripts that were Non-CCD as proteins." + "\n\n"
        fom += "\n\n"
        print(fom)
        file.write(fom)

    return percent_ccd_variance, total_gini, mean_diff_from_rng, pass_meandiff, eq_percvar_adj, fucci_time_inds, norm_exp_sort, moving_averages, mvavg_xvals, perms, ccdtranscript, ccdprotein, mvpercs
def read_sample_info(df):
    '''
    Collect per-sample metadata from df and the two staining-summary CSV files.

    Sample identifiers and areas are taken from df; gene, antibody, result and
    compartment annotations are looked up by well_plate in the summary tables
    (an empty string is used where a well_plate has no entry). The output
    folders are created if missing and the df-derived arrays are saved under
    output/pickles/.

    Returns a 17-tuple:
        plate, u_plate, well_plate, well_plate_imgnb, u_well_plates, ab_objnum,
        area_cell, area_nuc, area_cyto, ensg_dict, ab_dict, result_dict,
        compartment_dict, ENSG, antibody, result, compartment
    '''
    plate = np.asarray(df.plate)
    u_plate = np.unique(plate)
    well_plate = np.asarray(df.well_plate)
    imgnb = np.asarray(df.ImageNumber)
    well_plate_imgnb = np.asarray(
        [f"{wp}_{img}" for wp, img in zip(well_plate, imgnb)])
    u_well_plates = np.unique(well_plate)
    ab_objnum = np.asarray(df.ObjectNumber)
    well_plate_imgnb_objnb = np.asarray(
        [f"{wp}_{img}_{obj}" for wp, img, obj in zip(well_plate, imgnb, ab_objnum)])
    area_cell = np.asarray(df.Area_cell)
    area_nuc = np.asarray(df.AreaShape_Area)
    area_cyto = np.asarray(df.Area_cyto)

    # First set of plates: the only table that carries the "Results_final_update" column
    first_plates = pd.read_csv(
        "input/ProteinData/FucciStainingSummaryFirstPlates.csv")
    first_wells = list(first_plates["well_plate"])
    first_ensg = list(first_plates["ENSG"])
    first_ab = list(first_plates["Antibody"])
    first_results = list(first_plates["Results_final_update"])
    first_comp = list(first_plates["Compartment"])

    # Second set of plates
    second_plates = pd.read_csv(
        "input/ProteinData/FucciStainingSummarySecondPlates.csv")
    second_wells = list(second_plates["well_plate"])
    second_ensg = list(second_plates["ENSG"])
    second_ab = list(second_plates["Antibody"])
    second_comp = list(second_plates["Compartment"])

    all_wells = first_wells + second_wells
    ensg_dict = dict(zip(all_wells, first_ensg + second_ensg))
    ab_dict = dict(zip(all_wells, first_ab + second_ab))
    # Results exist for the first plates only
    result_dict = dict(zip(first_wells, first_results))
    compartment_dict = dict(zip(all_wells, first_comp + second_comp))

    ENSG = np.asarray([ensg_dict.get(wp, "") for wp in well_plate])
    antibody = np.asarray([ab_dict.get(wp, "") for wp in well_plate])
    result = np.asarray([result_dict.get(wp, "") for wp in well_plate])
    compartment = np.asarray([compartment_dict.get(wp, "") for wp in well_plate])

    # Pickle the results
    for directory in ("output/", "output/pickles/", "figures/"):
        if not os.path.exists(directory):
            os.mkdir(directory)
    for path, values in (
            ("output/pickles/plate.npy", plate),
            ("output/pickles/u_plate.npy", u_plate),
            ("output/pickles/u_well_plates.npy", u_well_plates),
            ("output/pickles/area_cell.npy", area_cell),
            ("output/pickles/area_nuc.npy", area_nuc),
            ("output/pickles/area_cyto.npy", area_cyto),
            ("output/pickles/well_plate.npy", well_plate),
            ("output/pickles/well_plate_imgnb.npy", well_plate_imgnb),
            ("output/pickles/well_plate_imgnb_objnb.npy", well_plate_imgnb_objnb)):
        utils.np_save_overwriting(path, values)

    return (plate, u_plate, well_plate, well_plate_imgnb, u_well_plates,
            ab_objnum, area_cell, area_nuc, area_cyto,
            ensg_dict, ab_dict, result_dict, compartment_dict,
            ENSG, antibody, result, compartment)
def read_sample_data(df):
    '''
    Extract the antibody, microtubule and FUCCI intensity arrays from df and save them.

    For each antibody/microtubule channel, INTENSITY_SWITCH indexes the list
    [mean intensity, integrated intensity, integrated intensity / AreaShape_Area].
    The FUCCI channels always use the mean intensity columns. All six arrays are
    saved under output/pickles/.

    Returns:
        (ab_nuc, ab_cyto, ab_cell, mt_cell, green_fucci, red_fucci)
    '''
    def select_measure(mean_intensity, integrated_intensity):
        # All three candidates are built before INTENSITY_SWITCH picks one.
        # NOTE(review): every channel is divided by df.AreaShape_Area, including the
        # cytosol/cell/microtubule ones -- confirm this is the intended area.
        candidates = [
            mean_intensity,
            integrated_intensity,
            integrated_intensity / df.AreaShape_Area,
        ]
        return np.asarray(candidates[INTENSITY_SWITCH])

    # Antibody data
    ab_nuc = select_measure(df.Intensity_MeanIntensity_ResizedAb,
                            df.Intensity_IntegratedIntensity_ResizedAb)
    ab_cyto = select_measure(df.Mean_ab_Cyto, df.Integrated_ab_cyto)
    ab_cell = select_measure(df.Mean_ab_cell, df.Integrated_ab_cell)
    mt_cell = select_measure(df.Mean_mt_cell, df.Integrated_mt_cell)

    # Fucci data (mean intensity)
    green_fucci = np.asarray(df.Intensity_MeanIntensity_CorrResizedGreenFUCCI)
    red_fucci = np.asarray(df.Intensity_MeanIntensity_CorrResizedRedFUCCI)

    # Pickle the results
    for path, values in (
            ("output/pickles/ab_nuc.npy", ab_nuc),
            ("output/pickles/ab_cyto.npy", ab_cyto),
            ("output/pickles/ab_cell.npy", ab_cell),
            ("output/pickles/mt_cell.npy", mt_cell),
            ("output/pickles/green_fucci.npy", green_fucci),
            ("output/pickles/red_fucci.npy", red_fucci)):
        utils.np_save_overwriting(path, values)

    return ab_nuc, ab_cyto, ab_cell, mt_cell, green_fucci, red_fucci
def previous_results(u_well_plates, result_dict, ensg_dict, ab_dict):
    '''
    Derive per-well annotations and the gene lists of previously annotated results.

    For every unique well_plate, looks up its gene id and antibody (empty string
    when absent) and flags whether its entry in result_dict starts with "ccd",
    "notccd" or "negative". The gene ids selected by each flag are collected,
    and all eight arrays are saved under output/pickles/.

    Returns:
        (wp_ensg, wp_ab, wp_prev_ccd, wp_prev_notccd, wp_prev_negative,
         prev_ccd_ensg, prev_notccd_ensg, prev_negative_ensg)
    '''
    def previously_annotated(prefix):
        # Wells without an entry in result_dict are flagged False
        return np.asarray([
            wp in result_dict and result_dict[wp].startswith(prefix)
            for wp in u_well_plates
        ])

    wp_ensg = np.asarray([ensg_dict.get(wp, "") for wp in u_well_plates])
    wp_ab = np.asarray([ab_dict.get(wp, "") for wp in u_well_plates])
    wp_prev_ccd = previously_annotated("ccd")
    wp_prev_notccd = previously_annotated("notccd")
    wp_prev_negative = previously_annotated("negative")

    prev_ccd_ensg = wp_ensg[wp_prev_ccd]
    prev_notccd_ensg = wp_ensg[wp_prev_notccd]
    prev_negative_ensg = wp_ensg[wp_prev_negative]

    # Pickle the results
    for path, values in (
            ("output/pickles/wp_ensg.npy", wp_ensg),
            ("output/pickles/wp_ab.npy", wp_ab),
            ("output/pickles/wp_prev_ccd.npy", wp_prev_ccd),
            ("output/pickles/wp_prev_notccd.npy", wp_prev_notccd),
            ("output/pickles/wp_prev_negative.npy", wp_prev_negative),
            ("output/pickles/prev_ccd_ensg.npy", prev_ccd_ensg),
            ("output/pickles/prev_notccd_ensg.npy", prev_notccd_ensg),
            ("output/pickles/prev_negative_ensg.npy", prev_negative_ensg)):
        utils.np_save_overwriting(path, values)

    return (wp_ensg, wp_ab, wp_prev_ccd, wp_prev_notccd, wp_prev_negative,
            prev_ccd_ensg, prev_notccd_ensg, prev_negative_ensg)
def calculate_variation(use_log, u_well_plates, wp_iscell, wp_isnuc, wp_iscyto,
                        pol_sort_well_plate, pol_sort_ab_cell, pol_sort_ab_nuc,
                        pol_sort_ab_cyto, pol_sort_mt_cell,
                        pol_sort_well_plate_imgnb):
    '''Calculate overall variation of protein staining intensity in single cells.

    For every well in u_well_plates, the variance, coefficient of variation
    (scipy.stats.variation), gini index (utils.gini) and mean are computed for
    the cell, nucleus, cytosol and microtubule intensities of the cells whose
    pol_sort_well_plate entry equals that well. Variance, gini and CV are also
    computed per image (unique pol_sort_well_plate_imgnb value seen in the
    well) for the three antibody channels.

    Parameters:
        use_log: when truthy, all statistics use log10 of the intensities.
        u_well_plates: well identifiers to process, in output order.
        wp_iscell, wp_isnuc, wp_iscyto: forwarded unchanged to
            utils.values_comp and plot_average_intensities_by_batch
            (presumably per-well compartment flags aligned with
            u_well_plates -- confirm against those helpers).
        pol_sort_well_plate: per-cell well identifier.
        pol_sort_ab_cell, pol_sort_ab_nuc, pol_sort_ab_cyto: per-cell antibody
            intensities for the three compartments.
        pol_sort_mt_cell: per-cell microtubule intensity.
        pol_sort_well_plate_imgnb: per-cell image identifier.

    Side effects: prints progress and outlier-image reports, writes plots under
    figures/ and arrays under output/pickles/.

    Returns:
        (mean_mean_comp, var_comp, gini_comp, cv_comp, var_cell, gini_cell,
        cv_cell, var_mt, gini_mt, cv_mt), where the *_comp values are whatever
        utils.values_comp selects from the cell/nuc/cyto statistics.
    '''
    channels = ("cell", "nuc", "cyto", "mt")
    # Image-level microtubule statistics are never read below, so only the
    # antibody channels are summarized per image.
    img_channels = ("cell", "nuc", "cyto")
    comp_flags = (wp_iscell, wp_isnuc, wp_iscyto)
    scale = "log" if use_log else "natural"

    # Apply the optional log transform once rather than per well and per image
    intensity = dict(
        zip(channels, (pol_sort_ab_cell, pol_sort_ab_nuc, pol_sort_ab_cyto,
                       pol_sort_mt_cell)))
    if use_log:
        intensity = {ch: np.log10(vals) for ch, vals in intensity.items()}

    # Per-well statistics: one value per well for each channel
    var_well = {ch: [] for ch in channels}
    cv_well = {ch: [] for ch in channels}
    gini_well = {ch: [] for ch in channels}
    mean_well = {ch: [] for ch in channels}

    # Per-image statistics: one list of image values per well for each channel
    wpi_img = []
    var_img = {ch: [] for ch in img_channels}
    gini_img = {ch: [] for ch in img_channels}
    cv_img = {ch: [] for ch in img_channels}

    # The statistics are calculated separately for each well because the wells
    # all have different numbers of cells
    for well in u_well_plates:
        in_well = pol_sort_well_plate == well
        for ch in channels:
            vals = intensity[ch][in_well]
            var_well[ch].append(np.var(vals))
            cv_well[ch].append(scipy.stats.variation(vals))
            gini_well[ch].append(utils.gini(vals))
            mean_well[ch].append(np.mean(vals))

        curr_wpi_img = []
        curr_var = {ch: [] for ch in img_channels}
        curr_gini = {ch: [] for ch in img_channels}
        curr_cv = {ch: [] for ch in img_channels}
        for wpi in np.unique(pol_sort_well_plate_imgnb[in_well]):
            # The image mask spans all cells, not only those of this well
            in_image = pol_sort_well_plate_imgnb == wpi
            curr_wpi_img.append(wpi)
            for ch in img_channels:
                vals = intensity[ch][in_image]
                curr_var[ch].append(np.var(vals))
                curr_gini[ch].append(utils.gini(vals))
                curr_cv[ch].append(scipy.stats.variation(vals))

        wpi_img.append(curr_wpi_img)
        for ch in img_channels:
            var_img[ch].append(curr_var[ch])
            gini_img[ch].append(curr_gini[ch])
            cv_img[ch].append(curr_cv[ch])

    print(
        "Plotting average intensities of proteins and microtubules by batch.")
    plot_average_intensities_by_batch(u_well_plates, mean_well["cell"],
                                      mean_well["nuc"], mean_well["cyto"],
                                      mean_well["mt"], *comp_flags)

    print("Making general plots for variance, CV, and gini by compartment")
    var_cell, var_nuc, var_cyto, var_mt = (
        np.array(var_well[ch]) for ch in channels)
    gini_cell, gini_nuc, gini_cyto, gini_mt = (
        np.array(gini_well[ch]) for ch in channels)
    cv_cell, cv_nuc, cv_cyto, cv_mt = (
        np.array(cv_well[ch]) for ch in channels)
    utils.general_boxplot(
        (var_cell, var_cyto, var_nuc, var_mt),
        ("var_cell", "var_cyto", "var_nuc", "var_mt"), "Metacompartment",
        f"Variance using {scale} intensity values",
        "", True, "figures/VarianceBoxplot.png")
    utils.general_boxplot(
        (cv_cell, cv_cyto, cv_nuc, cv_mt),
        ("cv_cell", "cv_cyto", "cv_nuc", "cv_mt"), "Metacompartment",
        f"Coeff. of Var. using {scale} intensity values",
        "", True, "figures/CVBoxplot.png")
    utils.general_boxplot(
        (gini_cell, gini_cyto, gini_nuc, gini_mt),
        ("gini_cell", "gini_cyto", "gini_nuc", "gini_mt"), "Metacompartment",
        f"Gini using {scale} intensity values", "",
        True, "figures/GiniBoxplot.png")

    print(
        "Making general plots for variance, CV, and gini in the compartment the protein localizes to"
    )
    mean_mean_comp = utils.values_comp(mean_well["cell"], mean_well["nuc"],
                                       mean_well["cyto"], *comp_flags)
    cv_comp = utils.values_comp(cv_cell, cv_nuc, cv_cyto, *comp_flags)
    gini_comp = utils.values_comp(gini_cell, gini_nuc, gini_cyto, *comp_flags)
    var_comp = utils.values_comp(var_cell, var_nuc, var_cyto, *comp_flags)
    utils.general_scatter(var_comp, var_mt, "var_comp", "var_mt",
                          "figures/var_comp_mt.png")
    utils.general_scatter(cv_comp, cv_mt, "cv_comp", "cv_mt",
                          "figures/cv_comp_mt.png")
    utils.general_scatter(gini_comp, gini_mt, "gini_comp", "gini_mt",
                          "figures/gini_comp_mt.png")
    utils.general_scatter(var_comp, mean_mean_comp, "var_comp",
                          f"{'log10' if use_log else 'natural'} intensity",
                          "figures/VarianceVsIntensityComp.png")

    print("Comparing image to sample variance")
    var_comp_img = utils.values_comp(var_img["cell"], var_img["nuc"],
                                     var_img["cyto"], *comp_flags)
    gini_comp_img = utils.values_comp(gini_img["cell"], gini_img["nuc"],
                                      gini_img["cyto"], *comp_flags)
    cv_comp_img = utils.values_comp(cv_img["cell"], cv_img["nuc"],
                                    cv_img["cyto"], *comp_flags)
    image_vs_sample = (
        (var_comp, var_comp_img, "variance within compartment",
         "variance for each image", "figures/VarianceByImage.png"),
        (gini_comp, gini_comp_img, "gini within compartment",
         "gini for each image", "figures/GiniByImage.png"),
        (cv_comp, cv_comp_img, "cv within compartment",
         "cv for each image", "figures/CVByImage.png"),
    )
    for sample_vals, img_vals, xlabel, ylabel, figure_path in image_vs_sample:
        # Repeat each well-level value once per image so both axes line up
        utils.general_scatter(
            np.concatenate([[sample_vals[i]] * len(vvv)
                            for i, vvv in enumerate(img_vals)]),
            np.concatenate(img_vals), xlabel, ylabel, figure_path)

    all_wpi_img = np.concatenate(wpi_img)
    max_var_img = all_wpi_img[np.argmax(np.concatenate(var_comp_img))]
    print(f"{max_var_img}: the image with the max variance")

    # The histograms below are drawn and closed without being saved or shown
    plt.hist(
        np.concatenate(
            [vvv / var_comp[i] for i, vvv in enumerate(var_comp_img)]))
    plt.close()
    high_var_img = all_wpi_img[np.concatenate(
        [vvv > 4 * var_comp[i] for i, vvv in enumerate(var_comp_img)])]
    print(
        f"{high_var_img}: the images with greater than 4x the variance of the whole sample"
    )

    norm_cv_img = np.concatenate(
        [vvv / cv_comp[i] for i, vvv in enumerate(cv_comp_img)])
    plt.hist(norm_cv_img)
    plt.close()
    cutoff = np.mean(norm_cv_img) + 3 * np.std(norm_cv_img)
    high_cv_img = all_wpi_img[norm_cv_img > cutoff]
    # Message now describes the cutoff actually applied above
    print(
        f"{high_cv_img}: the images with normalized CV more than 3 standard deviations above the mean"
    )

    # Pickle and return main results
    utils.np_save_overwriting("output/pickles/mean_mean_comp.npy",
                              mean_mean_comp)
    utils.np_save_overwriting("output/pickles/cv_comp.npy", cv_comp)
    utils.np_save_overwriting("output/pickles/gini_comp.npy", gini_comp)
    utils.np_save_overwriting("output/pickles/var_comp.npy", var_comp)
    utils.np_save_overwriting("output/pickles/cv_cell.npy", cv_cell)
    utils.np_save_overwriting("output/pickles/gini_cell.npy", gini_cell)
    utils.np_save_overwriting("output/pickles/var_cell.npy", var_cell)

    return mean_mean_comp, var_comp, gini_comp, cv_comp, var_cell, gini_cell, cv_cell, var_mt, gini_mt, cv_mt