def compute_stats(values, concentrations, background_std, clean=True):
    """Compute per-concentration summary statistics of *values*.

    Nearby concentrations (within 5% of each other on a log scale) are first
    merged in place in *concentrations*, then mean / error / std / degrees of
    freedom are computed for each surviving unique concentration.

    :param values: 3-D array indexed as (?, concentration, replicate) —
        axis semantics inferred from the ``values[:, mask, :]`` slice; TODO confirm
    :param concentrations: 1-D array of concentrations, one per column of
        *values*; MUTATED in place by the collapsing step
    :param background_std: measurement-noise std folded into the reported std
    :param clean: NOTE(review): currently unused — confirm whether it was
        meant to gate the concentration collapsing below
    :return: tuple (means, errs, stds, freedom_degs, unique_values), one
        entry per unique (post-collapse) concentration
    """
    def preprocess_concentrations():
        # Merge concentrations that differ by less than 5%: replicate plates
        # often encode the "same" dose with slightly different numbers.
        u_concentrations = np.unique(concentrations)[1:]  # [1:] drops the smallest (presumably the 0 dose) — TODO confirm
        re_concentrations = np.log(u_concentrations)
        _5_p = np.log(1.05)  # 5% relative difference threshold, in log space
        # Pairwise log-distances between unique concentrations.
        backbone = squareform(pdist(re_concentrations[:, np.newaxis]))
        msk = np.array((backbone < _5_p).nonzero()).T
        collapse = []
        for i, j in msk.tolist():
            # i > j keeps one triangle of the symmetric matrix (skips the
            # diagonal), so each close pair is recorded once.
            if i > j:
                collapse.append((u_concentrations[i], u_concentrations[j]))
        for c1, c2 in collapse:
            # Rewrite every occurrence of c2 as c1 (in-place side effect).
            concentrations[concentrations == c2] = c1
    preprocess_concentrations()
    unique_values = np.unique(concentrations)
    # Output arrays share the dtype of unique_values (zeros_like).
    means = np.zeros_like(unique_values)
    errs = np.zeros_like(unique_values)
    stds = np.zeros_like(unique_values)
    freedom_degs = np.zeros_like(unique_values)
    for i, val in enumerate(unique_values):
        mask = concentrations == val
        # rm_nans presumably flattens and drops NaNs — TODO confirm against helper.
        vals = rm_nans(values[:, mask, :])
        means[i] = np.mean(vals)
        # Combine sample spread with instrument noise in quadrature.
        stds[i] = np.sqrt(np.std(vals) ** 2 + background_std ** 2)
        # At least 1 degree of freedom, even for a single observation.
        freedom_degs[i] = np.max((vals.shape[0] - 1, 1))
        errs[i] = get_t_distro_outlier_bound_estimation(vals, background_std) / freedom_degs[i]
    return means, errs, stds, freedom_degs, unique_values
def preformat(means_accumulator, errs_accumulator, all_cell_lines_arr, names_accumulator):
    """Filter and reshape the means/errs tables prior to plotting/analysis.

    Builds a "WT_proxy" pseudo cell line as the nan-mean of lines 184A1 and
    184B5, drops drugs the proxy has no data for, NaN-masks upper outliers
    (Tukey fence), then removes cell lines with <= 10 valid drugs (plus an
    explicit ban list) and drugs with <= 20 valid cell lines.

    :param means_accumulator: 2-D array, cell lines x drugs, of means
    :param errs_accumulator: 2-D array, cell lines x drugs, of errors
    :param all_cell_lines_arr: 1-D array of cell-line names
    :param names_accumulator: 1-D sequence of drug names (columns)
    :return: tuple (means, errs, cell_lines, names) after filtering
    :raises ValueError: if "184A1" or "184B5" is absent from the cell lines
    """
    means_accumulator = means_accumulator.tolist()
    errs_accumulator = errs_accumulator.tolist()
    # Rows are assumed to follow the sorted cell-line order — TODO confirm
    # against the caller; .index() below relies on it.
    all_cell_lines = np.sort(all_cell_lines_arr).tolist()
    idx1 = all_cell_lines.index("184A1")
    idx2 = all_cell_lines.index("184B5")
    # Proxy wild-type = nan-mean of the two near-normal breast lines.
    mean_for_proxy_wt = np.nanmean(np.array(means_accumulator)[[idx1, idx2], :], axis=0)
    errs_for_proxy_wt = np.nanmean(np.array(errs_accumulator)[[idx1, idx2], :], axis=0)
    all_cell_lines.append("WT_proxy")
    means_accumulator.append(mean_for_proxy_wt.tolist())
    errs_accumulator.append(errs_for_proxy_wt.tolist())
    means_accumulator = np.array(means_accumulator)
    errs_accumulator = np.array(errs_accumulator)
    # Keep only drugs for which the WT proxy (last row) has data.
    support = np.logical_not(np.isnan(means_accumulator[-1, :]))
    names_accumulator = np.array(names_accumulator)
    all_cell_lines_arr = np.array(all_cell_lines)
    # Tukey upper fence over all finite means: q3 + 1.5 * IQR.
    tmp_calc = rm_nans(means_accumulator)
    q1 = np.percentile(tmp_calc, 25)
    q3 = np.percentile(tmp_calc, 75)
    ub = q3 + (q3 - q1) * 1.5
    # BUGFIX: compute the outlier mask ONCE before mutating means_accumulator.
    # The original NaN'd the means first and then re-evaluated
    # `means_accumulator > ub` for the errs — by then the outliers were
    # already NaN (NaN > ub is False), so errs were never masked.
    outlier_mask = means_accumulator > ub
    means_accumulator[outlier_mask] = np.nan
    errs_accumulator[outlier_mask] = np.nan
    means_accumulator = means_accumulator[:, support]
    errs_accumulator = errs_accumulator[:, support]
    names_accumulator = names_accumulator[support]
    # Drop cell lines with too little data (<= 10 valid drugs).
    line_wise_support = np.sum(np.logical_not(np.isnan(means_accumulator)), axis=1)
    cell_lines_support_filter = line_wise_support > 10
    # Explicitly exclude the non-tumorigenic control lines.
    ban_list = ["MCF10A", "MCF10F", "MCF12A"]
    for item in ban_list:
        # Guard: a banned line that is absent from the data is simply skipped
        # (the bare .index() would raise ValueError).
        if item in all_cell_lines:
            idx = all_cell_lines.index(item)
            cell_lines_support_filter[idx] = False
    means_accumulator = means_accumulator[cell_lines_support_filter, :]
    errs_accumulator = errs_accumulator[cell_lines_support_filter, :]
    all_cell_lines_arr = all_cell_lines_arr[cell_lines_support_filter]
    # Drop drugs with too little data (<= 20 valid cell lines).
    column_wise_support = np.sum(np.logical_not(np.isnan(means_accumulator)), axis=0)
    drugs_support_filter = column_wise_support > 20
    means_accumulator = means_accumulator[:, drugs_support_filter]
    errs_accumulator = errs_accumulator[:, drugs_support_filter]
    names_accumulator = names_accumulator[drugs_support_filter]
    return means_accumulator, errs_accumulator, all_cell_lines_arr, names_accumulator
def get_t_distro_outlier_bound_estimation(array, background_std):
    """Half-width of the 95% Student-t confidence interval around the mean.

    The sample variance is inflated by the instrument noise
    (``background_std ** 2``) before building the interval; the larger of
    the two (symmetric up to rounding) half-widths is returned.

    :param array: array of observations; NaNs are removed by ``rm_nans``
    :param background_std: measurement-noise std added in quadrature
    :return: max distance from the sample mean to either interval bound
    """
    nums_only_array = rm_nans(array)
    # Hoisted: the original recomputed np.mean(nums_only_array) three times.
    center = np.mean(nums_only_array)
    low, up = t.interval(
        0.95,
        nums_only_array.shape[0] - 1,  # degrees of freedom: n - 1
        center,
        np.sqrt(np.var(nums_only_array) + background_std ** 2),
    )
    return max(up - center, center - low)
def errplot_with_selectors(table, errtable):
    """Error-bar plot of each selected row of *table* against *errtable*.

    Relies on module-level globals: ``selector``, ``pre_selector``,
    ``condition_index``, ``condition_names`` and the ``gini_coeff`` helper —
    presumably set up by the calling script; TODO confirm.

    :param table: DataFrame of values (first column is the index after reset)
    :param errtable: DataFrame of matching error bars
    """
    for i in range(0, len(selector)):
        v1 = table.reset_index().values[:, 1:][i, :].flatten()
        v2 = errtable.reset_index().values[:, 1:][i, :].flatten()
        # BUGFIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin float is the documented replacement (same dtype).
        v1 = v1.astype(float)
        nl = np.sum(np.logical_not(np.isnan(v1)))  # count of valid points
        # Gini coefficient of the inverse values (NaNs removed first).
        gni = gini_coeff(1./rm_nans(v1))
        plt.errorbar(condition_index, v1, v2, fmt='.',
                     label='%s; gini: %.2f, valid: %s' % (pre_selector[i], gni, nl))
    plt.xticks(condition_index, condition_names, rotation='vertical')
    # NOTE(review): 'nonposy' was renamed to 'nonpositive' in Matplotlib 3.3;
    # kept as-is to stay compatible with the project's pinned Matplotlib.
    plt.gca().set_yscale("log", nonposy='clip')
    plt.legend(loc='upper left', prop={'size': 10})
    plt.show()
def logistic_regression(tf, t0, concentrations, background_std):
    """Estimate per-well logistic growth rates (alphas) from t0/tf readings.

    A carrying capacity is estimated from the zero-concentration wells (upper
    99% t-bound, inflated by 5%), then both time points are mapped through
    the logit-like transform ``-log2(max_capacity / x - 1)`` and differenced.

    :param tf: 3-D array of final readings, indexed (?, concentration,
        replicate) — inferred from the ``tf[:, mask, :]`` slice; TODO confirm
    :param t0: 2-D array of initial readings (gains a replicate axis via
        ``[:, :, np.newaxis]`` below)
    :param concentrations: 1-D array of concentrations matching tf's axis 1
    :param background_std: instrument-noise std folded into the bound estimate
    :return: array of alphas, same shape as *tf*
    """
    def get_1p_bounds(mean, std, dof):
        # Two-sided 99% Student-t interval around `mean`.
        return t.interval(0.99, dof, mean, std)
    mask = concentrations == 0.0
    # All control (zero-dose) measurements, NaNs removed.
    vals_at_conc_0 = rm_nans(tf[:, mask, :])
    max_capacity = (
        get_1p_bounds(
            np.mean(vals_at_conc_0),
            # Sample variance plus instrument noise, in quadrature.
            np.sqrt(np.var(vals_at_conc_0) + background_std ** 2),
            # NOTE(review): dof passed as n rather than n - 1 — confirm intended.
            vals_at_conc_0.shape[0]
        )[1] * 1.05  # upper bound, padded 5% so no reading exceeds capacity
    )
    # Logit-style transform; values above max_capacity would produce NaNs here.
    compensation_t0 = -np.log2(max_capacity / t0 - 1)[:, :, np.newaxis]
    compensation_tf = -np.log2(max_capacity / tf - 1)
    # Growth over the experiment in "doublings" space.
    alphas = compensation_tf - compensation_t0
    return alphas
def __init__(self, pth, fle, alpha_bound_percentile=5):
    """Load a tab-separated screening file and build the raw data tensors.

    First pass indexes the cell lines, drugs (name + concentration) and
    plates; second pass fills NaN-initialized tensors with readings and
    backgrounds, then derives noise bounds and blank-subtracted data.

    NOTE(review): this is Python 2 code (``rdr.next()``, ``.iteritems()``) —
    it will not run on Python 3 without porting.

    :param pth: directory containing the data file
    :param fle: file name of the tab-separated ('excel-tab') data file
    :param alpha_bound_percentile: percentile used for the significance
        bound on the background noise (default 5 -> 95th percentile)
    """
    # --- pass 1: collect the index sets -------------------------------
    cells = []
    drugs = []
    drug_versions = defaultdict(list)
    plates = []
    with open(path.join(pth, fle)) as src:
        rdr = reader(src, dialect='excel-tab')
        header = rdr.next()
        for row in rdr:
            # A "drug" is identified by its name AND its concentration (col 47).
            expanded_drug_name = (row[1], float(row[47]))
            cells.append(row[0])
            drug_versions[row[1]].append(expanded_drug_name)
            drugs.append(expanded_drug_name)
            plates.append(row[2])
    # Map each distinct cell line / drug / plate to an integer index.
    cell_idx = supporting_functions.index(set(cells))
    drug_idx = supporting_functions.index(set(drugs))
    plates_idx = supporting_functions.index(set(plates))
    # Deduplicate the per-name concentration versions.
    drug_versions = dict([(key, list(set(values))) for key, values in drug_versions.iteritems()])
    # Reverse maps: index -> name.
    cell_idx_rv = dict([(value, key) for key, value in cell_idx.iteritems()])
    drug_idx_rv = dict([(value, key) for key, value in drug_idx.iteritems()])
    plates_idx_rv = dict([(value, key) for key, value in plates_idx.iteritems()])
    cells_no = len(cell_idx)
    drugs_no = len(drug_idx)
    plates_no = len(plates_idx)
    # --- NaN-initialized storage tensors ------------------------------
    depth_limiter = 7  # max replicates kept per (cell line, drug) pair
    storage = np.empty((cells_no, drugs_no, depth_limiter, 10, 3))
    storage.fill(np.NaN)
    background = np.empty((cells_no, drugs_no, depth_limiter, 4))
    background.fill(np.NaN)
    t0_median = np.empty((cells_no, drugs_no, depth_limiter))
    t0_median.fill(np.NaN)
    t0_background = np.empty((cells_no, drugs_no, depth_limiter))
    t0_background.fill(np.NaN)
    tf_background = np.empty((cells_no, drugs_no, depth_limiter))
    tf_background.fill(np.NaN)
    background_noise = np.empty((plates_no, 2))
    background_noise.fill(np.NaN)
    cl_drug_replicates = np.zeros((cells_no, drugs_no))
    # --- pass 2: fill the tensors -------------------------------------
    with open(path.join(pth, fle)) as src:
        rdr = reader(src, dialect='excel-tab')
        test_array = rdr.next()  # header re-read; broadcast call presumably validates the layout — TODO confirm
        supporting_functions.broadcast(test_array[6:36])
        for row in rdr:
            cell_no = cell_idx[row[0]]
            drug_no = drug_idx[(row[1], float(row[47]))]
            plate_no = plates_idx[row[2]]
            # Next free replicate slot, capped at depth_limiter-1 (extra
            # replicates overwrite the last slot).
            # NOTE(review): this is a float (cl_drug_replicates is a float
            # array) used as an index — deprecated in modern NumPy; confirm.
            depth_index = min(cl_drug_replicates[cell_no, drug_no], depth_limiter-1)
            # Columns 6..35 hold the 10x3 concentration-by-replicate grid.
            storage[cell_no, drug_no, depth_index, :, :] = supporting_functions.broadcast(row[6:36])
            # Columns 4,5 = T0 backgrounds; 36,37 = T_final backgrounds.
            background[cell_no, drug_no, depth_index, :] = supporting_functions.lgi(row, [4, 5, 36, 37])
            t0_median[cell_no, drug_no, depth_index] = row[38]
            t0_background[cell_no, drug_no, depth_index] = np.mean(
                supporting_functions.lgi(row, [4, 5]).astype(np.float64)).tolist()
            tf_background[cell_no, drug_no, depth_index] = np.mean(
                supporting_functions.lgi(row, [36, 37]).astype(np.float64)).tolist()
            # Per-plate noise estimate: |difference between duplicate blanks|.
            background_noise[plate_no, :] = np.abs(
                supporting_functions.lgi(row, [4, 36]).astype(np.float64) - supporting_functions.lgi(row, [5, 37]).astype(
                    np.float64))
            cl_drug_replicates[cell_no, drug_no] += 1
    # Pairs never seen become NaN rather than 0.
    cl_drug_replicates[cl_drug_replicates < 1] = np.nan
    # Significance bound = high percentile of the blank-vs-blank noise.
    alpha_bound = np.percentile(rm_nans(background_noise), 100 - alpha_bound_percentile)
    std_of_tools = np.percentile(rm_nans(background_noise), 66)
    # Winsorize backgrounds at the 0.5th / 99.5th percentile.
    background = supporting_functions.p_stabilize(background, 0.5)
    t0_background = supporting_functions.p_stabilize(t0_background, 0.5)
    tf_background = supporting_functions.p_stabilize(tf_background, 0.5)
    # Blank-subtract the raw readings with the final-time background.
    storage_dblanc = storage - tf_background[:, :, :, np.newaxis, np.newaxis]
    self.header_line = header
    self.cell_line_2_idx = cell_idx
    self.drug_2_idx = drug_idx
    self.idx_2_cell_line = cell_idx_rv
    self.idx_2_drug = drug_idx_rv
    self.raw_data = storage_dblanc  # cell_line, drug, concentration -> 3 replicates
    self.background = background  # cell_line, drug -> (T0_1, T0_2, T_final, T_final) backgrounds
    self.t0_background = t0_background
    self.t_f_background = tf_background
    self.t0_median = t0_median  # for each cell_line and drug contains T0
    self.alpha_bound = alpha_bound  # lower significance bound
    self.std_of_tools = std_of_tools
    self.background_noise = background_noise
    self.drug_versions = dict(drug_versions)  # drug names + concentrations versions
    self.cl_drug_replicates = cl_drug_replicates
def p_stabilize(array, percentile):
    """Winsorize *array* in place at symmetric percentile bounds.

    Values below the `percentile`-th percentile are raised to it and values
    above the `(100 - percentile)`-th percentile are lowered to it; the
    percentiles themselves are computed over the NaN-free values only.

    :param array: numpy array to clamp (MUTATED in place)
    :param percentile: tail size in percent (e.g. 0.5 clamps both 0.5% tails)
    :return: the same, now-clamped array
    """
    finite_values = rm_nans(array)
    lower_bound = np.percentile(finite_values, percentile)
    upper_bound = np.percentile(finite_values, 100 - percentile)
    # In-place clamping; NaNs fail both comparisons and are left untouched.
    np.copyto(array, lower_bound, where=array < lower_bound)
    np.copyto(array, upper_bound, where=array > upper_bound)
    return array
def retrieve_normalization_factor(T0_median_array):
    """Return, per row, the first NaN-free entry along axis 1.

    :param T0_median_array: 2-D (or higher) array of T0 medians
    :return: array of first valid values, one per axis-1 slice
    :raises IndexError: if a slice contains no valid (non-NaN) value
    """
    def first_valid(slice_1d):
        # First value that survives NaN removal.
        return rm_nans(slice_1d)[0]
    return np.apply_along_axis(first_valid, 1, T0_median_array)
def quick_hist(data):
    """Show a quick 20-bin histogram of log10 of the NaN-free values.

    :param data: array-like; NaNs are stripped before the log transform
    """
    log_values = np.log10(rm_nans(data))
    plt.hist(log_values, bins=20)
    plt.show()