def build_cytoplasmic_statistics(analysis_repo, statistics_type, molecule_type, genes, keyorder):
    """Compute per-gene cytoplasmic centrality or spread statistics.

    For every gene an ``ImageSet`` is built from ``'<molecule_type>/<gene>/'``
    and the statistic selected by ``statistics_type`` is computed. mRNA uses
    the spot-based ``ImageSet`` methods; any other molecule type uses the
    intensity-based ones.

    Args:
        analysis_repo: Repository handle, passed through to ``ImageSet``.
        statistics_type: ``'centrality'`` or ``'spread'``. Any other value
            computes nothing and four empty mappings are returned.
        molecule_type: ``'mrna'`` selects the spot-based statistics; every
            other value selects the intensity-based statistics.
        genes: Iterable of gene names. May be empty, in which case four
            empty mappings are returned for either statistics type.
        keyorder: Sequence defining the order of the returned mappings. Each
            processed gene is located with ``keyorder.index(gene)``, so every
            processed gene has to be present in it.

    Returns:
        A tuple ``(gene2median, gene2stat, gene2error,
        gene2confidence_interval)`` of ``collections.OrderedDict`` objects
        keyed by gene and sorted according to ``keyorder``. For ``'spread'``
        the values stored in ``gene2stat`` are divided by the largest value
        observed across all genes before the summaries are computed.
    """
    gene2stat, gene2median, gene2error, gene2confidence_interval = {}, {}, {}, {}

    def _summarize(gene, center):
        # Stores the central value, the standard error and the confidence
        # interval of the gene's current values in gene2stat.
        values = gene2stat[gene]
        gene2median[gene] = center(values)
        gene2error[gene] = helpers.sem(values, factor=0)
        lower, higher = helpers.median_confidence_interval(values)
        gene2confidence_interval[gene] = [lower, higher]

    def _in_key_order(mapping):
        # Returns the mapping as an OrderedDict sorted by position in keyorder.
        return collections.OrderedDict(
            sorted(mapping.items(), key=lambda item: keyorder.index(item[0])))

    for gene in genes:
        logger.info("Running {} cytoplasmic {} analysis for {}", molecule_type, statistics_type, gene)
        image_set = ImageSet(analysis_repo, ['{0}/{1}/'.format(molecule_type, gene)])
        if statistics_type == 'centrality':
            if molecule_type == 'mrna':
                gene2stat[gene] = image_set.compute_cytoplasmic_spots_centrality()
            else:
                gene2stat[gene] = image_set.compute_cytoplasmic_intensities_centrality()
            # NOTE(review): the centrality summary kept in gene2median is a
            # mean, while the spread summary below is a median -- confirm that
            # this difference is intentional.
            _summarize(gene, np.mean)
        elif statistics_type == 'spread':
            if molecule_type == 'mrna':
                gene2stat[gene] = image_set.compute_cytoplasmic_spots_spread()
            else:
                gene2stat[gene] = image_set.compute_intensities_cytoplasmic_spread()

    # Spread values are normalised by the global maximum, which is only known
    # once every gene has been processed. The emptiness guard avoids calling
    # np.max on an empty list (a ValueError) when no gene was supplied.
    if statistics_type == 'spread' and gene2stat:
        max_entropy = np.max([np.max(values) for values in gene2stat.values()])
        for gene in gene2stat:
            gene2stat[gene] = gene2stat[gene] / max_entropy
            _summarize(gene, np.median)

    gene2stat = _in_key_order(gene2stat)
    gene2median = _in_key_order(gene2median)
    gene2error = _in_key_order(gene2error)
    gene2confidence_interval = _in_key_order(gene2confidence_interval)
    return gene2median, gene2stat, gene2error, gene2confidence_interval
def compute_degree_of_clustering(genes_list, analysis_repo, molecule_type):
    """Compute log degree-of-clustering values and their summaries per gene.

    Args:
        genes_list: Gene names; an ``ImageSet`` is built for each one from
            ``'<molecule_type>/<gene>/'``.
        analysis_repo: Repository handle, passed through to ``ImageSet``.
        molecule_type: Molecule name used as the first path component.

    Returns:
        A tuple of four dicts keyed by gene: the natural log of the degree of
        clustering values, their median, their standard error
        (``helpers.sem`` with ``factor=0``) and a ``[lower, higher]`` list
        from ``helpers.median_confidence_interval``.
    """
    # First pass: load the raw values of every gene before any summary is
    # computed.
    raw_values = [
        np.array(
            ImageSet(analysis_repo, ['{0}/{1}/'.format(molecule_type, gene)])
            .compute_degree_of_clustering()
        )
        for gene in genes_list
    ]

    log_by_gene, median_by_gene, error_by_gene, interval_by_gene = {}, {}, {}, {}
    # Second pass: log-transform and summarise each gene's values.
    for gene, raw in zip(genes_list, raw_values):
        logged = np.log(raw)
        log_by_gene[gene] = logged
        median_by_gene[gene] = np.median(logged)
        # Standard error and confidence interval of the log values
        error_by_gene[gene] = helpers.sem(logged, factor=0)
        lower, higher = helpers.median_confidence_interval(logged)
        interval_by_gene[gene] = [lower, higher]

    return log_by_gene, median_by_gene, error_by_gene, interval_by_gene
def plot_dynamic_barplot(analysis_repo):
    """Format the per-timepoint clustering data and call the plotting function.

    One figure is produced per gene listed in
    ``constants.analysis_config['PROTEINS']``. For each gene, one row per
    molecule (``"mrna"``, ``"protein"``) and per timepoint of the union of the
    mRNA and protein timepoints is collected. Timepoints that a molecule does
    not have are filled with zero placeholders so both molecules cover the
    same timepoints.

    Args:
        analysis_repo: Repository handle, passed through to ``ImageSet``.
    """
    plot_colors = constants.analysis_config['PLOT_COLORS']

    # Paired mRNA-protein barplots, so we go through proteins (there are fewer
    # proteins than mRNAs).
    tp_mrna = constants.dataset_config['TIMEPOINTS_MRNA']
    tp_proteins = constants.dataset_config['TIMEPOINTS_PROTEIN']
    all_timepoints = np.sort(list(set(tp_mrna) | set(tp_proteins)))
    columns = ["Molecule", "Timepoint", "d_of_c", "error", "CI"]

    for i, gene in enumerate(constants.analysis_config['PROTEINS']):
        # Rows are accumulated in a list and turned into a DataFrame once:
        # DataFrame.append no longer exists in pandas >= 2.0 and copied the
        # whole frame on every call.
        rows = []
        for molecule, timepoints in zip(["mrna", "protein"], [tp_mrna, tp_proteins]):
            for tp in all_timepoints:
                if tp not in timepoints:
                    # Placeholder row for a timepoint this molecule lacks.
                    rows.append({
                        "Molecule": molecule,
                        "Timepoint": tp,
                        "error": 0,
                        "CI": [0, 0],
                        "d_of_c": 0
                    })
                    continue
                image_set = ImageSet(
                    analysis_repo, ["{0}/{1}/{2}/".format(molecule, gene, tp)])
                degree_of_clustering = np.log(
                    image_set.compute_degree_of_clustering())
                err = helpers.sem(degree_of_clustering, factor=6)
                lower, higher = helpers.median_confidence_interval(
                    degree_of_clustering)
                rows.append({
                    "Molecule": molecule,
                    "Timepoint": tp,
                    "error": err,
                    "CI": [lower, higher],
                    "d_of_c": degree_of_clustering
                })

        df = pd.DataFrame(rows, columns=columns)
        df = df.sort_values('Timepoint')
        df = df.groupby('Molecule').apply(mean_column)

        # The protein bars use a variant of the gene's mRNA colour.
        my_pal = {
            "mrna": str(plot_colors[i]),
            "protein": str(color_variant(plot_colors[i], +80))
        }
        tgt_image_name = constants.analysis_config[
            'DYNAMIC_FIGURE_NAME_FORMAT'].format(gene=gene)
        tgt_fp = pathlib.Path(
            constants.analysis_config['FIGURE_OUTPUT_PATH'].format(
                root_dir=global_root_dir), tgt_image_name)
        plot.bar_profile_median_timepoints(df,
                                           palette=my_pal,
                                           figname=tgt_fp,
                                           fixed_yscale=15)
def mrna_cytoplasmic_total_count(analysis_repo, keyorder):
    """Plot the cytoplasmic mRNA spot counts of every configured mRNA gene.

    For each gene in ``constants.analysis_config['MRNA_GENES']`` the
    cytoplasmic spot counts are computed and summarised (median, standard
    error, confidence interval). A bar plot and a violin plot are then
    written to the configured figure output path.

    Args:
        analysis_repo: Repository handle, passed through to ``ImageSet``.
        keyorder: Sequence defining the gene order of the bar-plot inputs.
            Each gene is located with ``keyorder.index(gene)``, so every
            configured mRNA gene has to be present in it.
    """
    def ordered_by_key(mapping):
        # OrderedDict view of ``mapping`` sorted by position in keyorder.
        ranked = sorted(mapping.items(), key=lambda pair: keyorder.index(pair[0]))
        return collections.OrderedDict(ranked)

    def figure_path(name_format_key):
        # Builds the target path of the mRNA figure for the given name format.
        image_name = constants.analysis_config[name_format_key].format(molecule_type="mrna")
        output_dir = constants.analysis_config['FIGURE_OUTPUT_PATH'].format(root_dir=global_root_dir)
        return pathlib.Path(output_dir, image_name)

    image_sets, counts, medians, errors, intervals = {}, {}, {}, {}, {}
    for gene in constants.analysis_config['MRNA_GENES']:
        logger.info("Running mrna cytoplasmic total count analysis for {}", gene)
        current_set = ImageSet(analysis_repo, ['mrna/%s/' % gene])
        image_sets[gene] = current_set
        spot_counts = current_set.compute_cytoplasmic_spots_counts()
        counts[gene] = spot_counts
        medians[gene] = np.median(spot_counts)
        errors[gene] = helpers.sem(spot_counts, factor=0)
        lower, higher = helpers.median_confidence_interval(spot_counts)
        intervals[gene] = [lower, higher]

    # Bar plot: the summaries are re-ordered by keyorder first. The raw counts
    # mapping keeps the order of the configured gene list.
    medians = ordered_by_key(medians)
    errors = ordered_by_key(errors)
    intervals = ordered_by_key(intervals)
    xlabels = constants.analysis_config['MRNA_GENES_LABEL']
    bar_target = figure_path('FIGURE_NAME_FORMAT')
    plot.bar_profile_median(medians,
                            errors.values(),
                            'mrna',
                            xlabels,
                            bar_target,
                            intervals,
                            annot=False,
                            data_to_annot=counts)

    # Violin plot of the raw counts.
    violin_target = figure_path('FIGURE_NAME_VIOLIN_FORMAT')
    plot.violin_profile(counts, violin_target, xlabels, rotation=0, annot=False)
def test_median_confidence_interval(self):
    # Checks the bounds returned by helpers.median_confidence_interval for a
    # fixed nine-value sample at cutoff=0.8.
    sample = np.array([24, 38, 61, 22, 16, 57, 31, 29, 35])
    bounds = helpers.median_confidence_interval(sample, cutoff=0.8)
    lower, upper = bounds
    # Both expected bounds are values taken from the sample itself.
    self.assertEqual(lower, 29)
    self.assertEqual(upper, 57)