from collections import defaultdict
from csv import reader
from os import path

import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from scipy.stats import t

import supporting_functions
# rm_nans and gini_coeff are project helpers; they are assumed to live in
# supporting_functions alongside the index/broadcast/lgi utilities used below.
from supporting_functions import rm_nans, gini_coeff


def compute_stats(values, concentrations, background_std, clean=True):
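    """
    Summary statistics per unique concentration.

    Near-duplicate concentrations are merged first; for each surviving value
    the function returns the mean, an error estimate (t-distribution outlier
    bound scaled by the degrees of freedom), the standard deviation inflated
    by the instrument noise, the degrees of freedom, and the concentration
    values themselves. The `clean` argument is currently unused.
    """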
    def preprocess_concentrations():
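        # np.unique(...)[1:] drops the lowest unique concentration (typically
        # the zero control) before merging. Concentrations that differ by less
        # than 5% are then collapsed: pairwise distances between
        # log-concentrations below log(1.05) flag near-duplicates, and each
        # flagged pair is renamed in place onto a single representative value.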
        u_concentrations = np.unique(concentrations)[1:]

        re_concentrations = np.log(u_concentrations)
        _5_p = np.log(1.05)
        backbone = squareform(pdist(re_concentrations[:, np.newaxis]))

        msk = np.array((backbone < _5_p).nonzero()).T
        collapse = []

        for i, j in msk.tolist():
            if i > j:
                collapse.append((u_concentrations[i], u_concentrations[j]))

        for c1, c2 in collapse:
            concentrations[concentrations == c2] = c1

    preprocess_concentrations()
    unique_values = np.unique(concentrations)

    # Accumulators are forced to float so fractional statistics are not
    # truncated if the concentration array happens to be integer-typed.
    means = np.zeros_like(unique_values, dtype=np.float64)
    errs = np.zeros_like(unique_values, dtype=np.float64)
    stds = np.zeros_like(unique_values, dtype=np.float64)
    freedom_degs = np.zeros_like(unique_values, dtype=np.float64)

    for i, val in enumerate(unique_values):
        mask = concentrations == val
        vals = rm_nans(values[:, mask, :])
        means[i] = np.mean(vals)
        stds[i] = np.sqrt(np.std(vals) ** 2 + background_std ** 2)
        freedom_degs[i] = np.max((vals.shape[0] - 1, 1))
        errs[i] = get_t_distro_outlier_bound_estimation(vals, background_std) / freedom_degs[i]

    return means, errs, stds, freedom_degs, unique_values
def preformat(means_accumulator, errs_accumulator, all_cell_lines_arr, names_accumulator):
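    """
    Prepare the means/errors tables for plotting: average 184A1 and 184B5 into
    a wild-type proxy row, censor upper outliers, and keep only the cell lines
    and drugs with sufficient non-NaN support.
    """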

    means_accumulator = means_accumulator.tolist()
    errs_accumulator = errs_accumulator.tolist()
    all_cell_lines = np.sort(all_cell_lines_arr).tolist()

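    # 184A1 and 184B5 are non-tumorigenic mammary epithelial lines; their
    # row-wise average serves as the wild-type proxy.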
    idx1 = all_cell_lines.index("184A1")
    idx2 = all_cell_lines.index("184B5")

    mean_for_proxy_wt = np.nanmean(np.array(means_accumulator)[[idx1, idx2], :], axis=0)
    errs_for_proxy_wt = np.nanmean(np.array(errs_accumulator)[[idx1, idx2], :], axis=0)

    all_cell_lines.append("WT_proxy")
    means_accumulator.append(mean_for_proxy_wt.tolist())
    errs_accumulator.append(errs_for_proxy_wt.tolist())

    means_accumulator = np.array(means_accumulator)
    errs_accumulator = np.array(errs_accumulator)
    support = np.logical_not(np.isnan(means_accumulator[-1, :]))
    names_accumulator = np.array(names_accumulator)
    all_cell_lines_arr = np.array(all_cell_lines)

    # Censor upper outliers with the Tukey fence (Q3 + 1.5 * IQR). The mask is
    # computed once so the matching error entries are blanked too; masking
    # means_accumulator first would erase the mask for errs_accumulator.
    tmp_calc = rm_nans(means_accumulator)
    q1 = np.percentile(tmp_calc, 25)
    q3 = np.percentile(tmp_calc, 75)
    ub = q3 + (q3 - q1) * 1.5
    outlier_mask = means_accumulator > ub
    means_accumulator[outlier_mask] = np.nan
    errs_accumulator[outlier_mask] = np.nan

    means_accumulator = means_accumulator[:, support]
    errs_accumulator = errs_accumulator[:, support]
    names_accumulator = names_accumulator[support]

    line_wise_support = np.sum(np.logical_not(np.isnan(means_accumulator)), axis=1)
    cell_lines_support_filter = line_wise_support > 10

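    # Exclude the remaining immortalized, non-tumorigenic reference lines
    # regardless of how well they are supported.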
    ban_list = ["MCF10A", "MCF10F", "MCF12A"]

    for item in ban_list:
        idx = all_cell_lines.index(item)
        cell_lines_support_filter[idx] = False

    means_accumulator = means_accumulator[cell_lines_support_filter, :]
    errs_accumulator = errs_accumulator[cell_lines_support_filter, :]
    all_cell_lines_arr = all_cell_lines_arr[cell_lines_support_filter]

    column_wise_support = np.sum(np.logical_not(np.isnan(means_accumulator)), axis=0)

    drugs_support_filter = column_wise_support > 20
    means_accumulator = means_accumulator[:, drugs_support_filter]
    errs_accumulator = errs_accumulator[:, drugs_support_filter]
    names_accumulator = names_accumulator[drugs_support_filter]

    return means_accumulator, errs_accumulator, all_cell_lines_arr, names_accumulator
def get_t_distro_outlier_bound_estimation(array, background_std):
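    """
    Half-width of the 95% t confidence interval around the mean of `array`
    (NaNs dropped), with the variance inflated by the instrument noise.
    """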
    nums_only_array = rm_nans(array)

    center = np.mean(nums_only_array)
    low, up = t.interval(
        0.95,
        nums_only_array.shape[0] - 1,
        center,
        np.sqrt(np.var(nums_only_array) + background_std ** 2),
    )

    return max(up - center, center - low)
def errplot_with_selectors(table, errtable, selector, pre_selector,
                           condition_index, condition_names):
    # selector, pre_selector, condition_index and condition_names were
    # module-level globals in the original; they are taken as explicit
    # arguments here so the function is self-contained.
    for i in range(len(selector)):
        v1 = table.reset_index().values[:, 1:][i, :].flatten()
        v2 = errtable.reset_index().values[:, 1:][i, :].flatten()
        v1 = v1.astype(float)  # np.float was removed in NumPy 1.24
        nl = np.sum(np.logical_not(np.isnan(v1)))
        gni = gini_coeff(1. / rm_nans(v1))
        plt.errorbar(condition_index, v1, v2, fmt='.',
                     label='%s; gini: %.2f, valid: %s' % (pre_selector[i], gni, nl))
    plt.xticks(condition_index, condition_names, rotation='vertical')
    # 'nonposy' was renamed to 'nonpositive' in Matplotlib 3.3.
    plt.gca().set_yscale("log", nonpositive='clip')
    plt.legend(loc='upper left', prop={'size': 10})
    plt.show()
def logistic_regression(tf, t0, concentrations, background_std):
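    """
    Per-well growth effects in logit space.

    The carrying capacity is estimated as 105% of the upper 99% t-bound of the
    untreated (concentration 0) wells; both the T0 and Tf readouts are then
    mapped through -log2(max_capacity / x - 1) and their difference returned.
    """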
    def get_1p_bounds(mean, std, dof):
        return t.interval(0.99, dof, mean, std)

    mask = concentrations == 0.0
    vals_at_conc_0 = rm_nans(tf[:, mask, :])
    max_capacity = (
        get_1p_bounds(
            np.mean(vals_at_conc_0), np.sqrt(np.var(vals_at_conc_0) + background_std ** 2), vals_at_conc_0.shape[0]
        )[1]
        * 1.05
    )

    compensation_t0 = -np.log2(max_capacity / t0 - 1)[:, :, np.newaxis]
    compensation_tf = -np.log2(max_capacity / tf - 1)

    alphas = compensation_tf - compensation_t0

    return alphas
class RawReader(object):
    # NOTE: the enclosing class statement is missing from this excerpt; the
    # name "RawReader" is an assumption.

    def __init__(self, pth, fle, alpha_bound_percentile=5):
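        """
        Parse a tab-separated raw readout file into per-(cell line, drug,
        replicate) tensors, plus background and plate-noise estimates.
        """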

        cells = []
        drugs = []
        drug_versions = defaultdict(list)
        plates = []
        with open(path.join(pth, fle)) as src:
            rdr = reader(src, dialect='excel-tab')
            header = next(rdr)  # Python 3: csv readers have no .next() method
            for row in rdr:
                expanded_drug_name = (row[1], float(row[47]))
                cells.append(row[0])
                drug_versions[row[1]].append(expanded_drug_name)
                drugs.append(expanded_drug_name)
                plates.append(row[2])

        cell_idx = supporting_functions.index(set(cells))
        drug_idx = supporting_functions.index(set(drugs))
        plates_idx = supporting_functions.index(set(plates))
        drug_versions = dict([(key, list(set(values)))
                              for key, values in drug_versions.items()])

        cell_idx_rv = dict([(value, key) for key, value in cell_idx.items()])
        drug_idx_rv = dict([(value, key) for key, value in drug_idx.items()])
        plates_idx_rv = dict([(value, key) for key, value in plates_idx.items()])

        cells_no = len(cell_idx)
        drugs_no = len(drug_idx)
        plates_no = len(plates_idx)

        depth_limiter = 7  # store at most 7 replicates per (cell line, drug) pair

        # Raw readouts: cell line x drug x replicate depth x 10 concentrations x 3 repeats.
        storage = np.full((cells_no, drugs_no, depth_limiter, 10, 3), np.nan)

        # Per-well backgrounds: (T0_1, T0_2, T_final_1, T_final_2).
        background = np.full((cells_no, drugs_no, depth_limiter, 4), np.nan)

        t0_median = np.full((cells_no, drugs_no, depth_limiter), np.nan)

        t0_background = np.full((cells_no, drugs_no, depth_limiter), np.nan)

        tf_background = np.full((cells_no, drugs_no, depth_limiter), np.nan)

        # Per-plate |difference| between duplicate background wells at T0 and T_final.
        background_noise = np.full((plates_no, 2), np.nan)

        cl_drug_replicates = np.zeros((cells_no, drugs_no))

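        # Second pass over the file: route every row into the tensors above.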
        with open(path.join(pth, fle)) as src:
            rdr = reader(src, dialect='excel-tab')
            test_array = next(rdr)  # header row
            supporting_functions.broadcast(test_array[6:36])
            for row in rdr:
                cell_no = cell_idx[row[0]]
                drug_no = drug_idx[(row[1], float(row[47]))]
                plate_no = plates_idx[row[2]]
                # Replicate counters are floats; indices must be ints in Python 3.
                depth_index = int(min(cl_drug_replicates[cell_no, drug_no], depth_limiter - 1))
                storage[cell_no, drug_no, depth_index, :, :] = supporting_functions.broadcast(row[6:36])
                background[cell_no, drug_no, depth_index, :] = supporting_functions.lgi(row, [4, 5, 36, 37])
                t0_median[cell_no, drug_no, depth_index] = float(row[38])
                t0_background[cell_no, drug_no, depth_index] = np.mean(
                    supporting_functions.lgi(row, [4, 5]).astype(np.float64)).tolist()
                tf_background[cell_no, drug_no, depth_index] = np.mean(
                    supporting_functions.lgi(row, [36, 37]).astype(np.float64)).tolist()
                background_noise[plate_no, :] = np.abs(
                    supporting_functions.lgi(row, [4, 36]).astype(np.float64)
                    - supporting_functions.lgi(row, [5, 37]).astype(np.float64))
                cl_drug_replicates[cell_no, drug_no] += 1

        cl_drug_replicates[cl_drug_replicates < 1] = np.nan

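        # alpha_bound: upper (100 - alpha_bound_percentile)th percentile of the
        # plate noise; std_of_tools: its 66th percentile, used downstream as a
        # robust instrument-noise scale (interpretation based on the names).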
        alpha_bound = np.percentile(rm_nans(background_noise), 100 - alpha_bound_percentile)
        std_of_tools = np.percentile(rm_nans(background_noise), 66)

        background = supporting_functions.p_stabilize(background, 0.5)
        t0_background = supporting_functions.p_stabilize(t0_background, 0.5)
        tf_background = supporting_functions.p_stabilize(tf_background, 0.5)

        storage_dblanc = storage - tf_background[:, :, :, np.newaxis, np.newaxis]

        self.header_line = header
        self.cell_line_2_idx = cell_idx
        self.drug_2_idx = drug_idx
        self.idx_2_cell_line = cell_idx_rv
        self.idx_2_drug = drug_idx_rv
        self.raw_data = storage_dblanc           # cell_line, drug, concentration -> 3 replicates
        self.background = background             # cell_line, drug -> (T0_1, T0_2, T_final_1, T_final_2) backgrounds
        self.t0_background = t0_background
        self.t_f_background = tf_background
        self.t0_median = t0_median              # T0 median for each (cell_line, drug, replicate)
        self.alpha_bound = alpha_bound          # lower significance bound
        self.std_of_tools = std_of_tools
        self.background_noise = background_noise
        self.drug_versions = dict(drug_versions)      # drug names + concentrations versions
        self.cl_drug_replicates = cl_drug_replicates
def p_stabilize(array, percentile):
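    """
    Winsorize in place: clamp everything below the `percentile`-th and above
    the (100 - `percentile`)-th percentile (NaNs ignored when computing them).
    """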
    p_low = np.percentile(rm_nans(array), percentile)
    p_high = np.percentile(rm_nans(array), 100 - percentile)
    array[array < p_low] = p_low
    array[array > p_high] = p_high
    return array
def retrieve_normalization_factor(T0_median_array):
    # For every row, keep the first surviving (non-NaN) entry as the
    # normalization factor.
    def redux_function(x):
        return rm_nans(x)[0]

    retour = np.apply_along_axis(redux_function, 1, T0_median_array)
    return retour
def quick_hist(data):
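    # log10 histogram of all finite values, for quick visual inspection.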
    plt.hist(np.log10(rm_nans(data)), bins=20)
    plt.show()