def get_stats_for_one_edge(row, segs, gm_df, num_subsamples, use_only_two_rep_segs):
    # The difference here is that replicate-level s measurements are included in the model instead of the mean s
    if use_only_two_rep_segs:  # only include segregants with s measured in both replicates
        measured = [seg for seg in segs if row[seg + '.rep1.cbcs'] >= 2 and row[seg + '.rep2.cbcs'] >= 2]
    else:
        measured = [seg for seg in segs if row[seg + '.rep1.cbcs'] >= 2 or row[seg + '.rep2.cbcs'] >= 2]
    reps = ['rep1', 'rep2']
    measured_by_rep = {r: [seg for seg in segs if row[seg + '.' + r + '.cbcs'] >= 2] for r in reps}
    stat_dict = {'num.measured': len(measured)}
    if len(measured) >= 10:
        pvals = [row[seg + '.pval'] for seg in measured]
        pval_sig_boolean = benjamini_hochberg(pvals)[0]  # B/H with alpha=0.05 by default
        sig = [measured[s] for s in range(len(measured)) if pval_sig_boolean[s]]
        stat_dict['num.sig'] = len(sig)
        means = [row[seg + '.mean.s'] for seg in measured]
        variances = [row[seg + '.stderr.s']**2 for seg in measured]
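        # H2 here is the fraction of the variance in mean s across segregants (Vg) that is
        # not accounted for by measurement-error variance (Ve, the mean squared standard error)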
        Vg = np.var(means)
        Ve = np.mean(variances)
        stat_dict['H2'] = (Vg - Ve) / Vg
        # doing n/2 sub-samplings to get error on that
        sub_H2 = []
        seg_indices = [i for i in range(len(measured))]
        for b in range(num_subsamples):
            segs_chosen = np.random.choice(seg_indices, size=int(np.floor(len(measured)/2)), replace=False)
            Vg = np.var([means[seg_ind] for seg_ind in segs_chosen])
            Ve = np.mean([variances[seg_ind] for seg_ind in segs_chosen])
            sub_H2.append((Vg - Ve) / Vg)
        stat_dict['H2_95_conf_low'] = np.percentile(sub_H2, 2.5)
        stat_dict['H2_95_conf_high'] = np.percentile(sub_H2, 97.5)
        rep_means = [row[seg + '.rep1.s'] for seg in measured_by_rep['rep1']] + [row[seg + '.rep2.s'] for seg in measured_by_rep['rep2']]
        stat_dict.update(analyze_determinants(measured_by_rep['rep1']+measured_by_rep['rep2'], rep_means, gm_df, num_subsamples))
    return stat_dict
def get_stats_for_one_edge(row, segs, gm_df, num_subsamples):
    measured = [seg for seg in segs if pd.notnull(row[seg + '.mean.s'])]  # must have at least 2 cbcs in at least one replicate
    reps = ['rep1', 'rep2']
    stat_dict = {'num.measured': len(measured)}
    if len(measured) > 0:
        pvals = [row[seg + '.pval'] for seg in measured]
        pval_sig_boolean = benjamini_hochberg(pvals)[0]  # B/H with alpha=0.05 by default
        sig = [measured[s] for s in range(len(measured)) if pval_sig_boolean[s]]
        stat_dict['num.sig'] = len(sig)
        stat_dict['avg_s'] = np.nanmean([row[seg + '.mean.s'] for seg in measured])
    if len(measured) >= 10:
        means = [row[seg + '.mean.s'] for seg in measured]
        stat_dict.update(analyze_determinants(measured, means, gm_df, num_subsamples))
    return stat_dict
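# The benjamini_hochberg helper used throughout these examples is not shown here. Based on
# how it is called, it appears to take a list of p-values (alpha=0.05 by default) and return
# a tuple of (boolean significance flags, BH-adjusted p-values), both in the input order.
# A minimal sketch under that assumption:
import numpy as np

def benjamini_hochberg(pvals, alpha=0.05):
    pvals = np.asarray(pvals, dtype=float)
    m = len(pvals)
    order = np.argsort(pvals)
    ranked = pvals[order]
    # adjusted p_(i) = min over j >= i of p_(j) * m / j, capped at 1
    adjusted = np.minimum.accumulate((ranked * m / np.arange(1, m + 1))[::-1])[::-1]
    adjusted = np.clip(adjusted, 0, 1)
    adjusted_original_order = np.empty(m)
    adjusted_original_order[order] = adjusted
    return adjusted_original_order <= alpha, adjusted_original_order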
Example no. 3
cols_to_analyze = ['background.fitness', 'mean', 'median', 'variance', 'skew', 'kurtosis', 'significant.beneficial.mutations', 'significant.deleterious.mutations']

seg_to_fit = {i[0]: i[1] for i in pd.read_csv('../accessory_files/Clones_For_Tn96_Experiment.csv')[['segregant', 'initial fitness, YPD 30C']].to_numpy()}

dfes = defaultdict(lambda: defaultdict(dict))
dfes_sig = defaultdict(lambda: defaultdict(dict))
exps = {'BT': 'MM', 'TP': 'FM'}
exp_segs = {exp: [i.split('.')[0] for i in dats[exp].columns if '.mean.s' in i] for exp in dats}
for exp in exps:
    segs = exp_segs[exp]
    d = dats[exp]
    for seg in segs:
        measured = d.loc[pd.notnull(d[seg + '.mean.s'])]
        dfes[exp][seg] = list(measured[seg + '.mean.s'])
        pvals = list(measured[seg + '.pval'])
        sig = measured.loc[benjamini_hochberg(pvals)[0]] # B/H with alpha=0.05 by default
        dfes_sig[exp][seg] = list(sig[seg + '.mean.s'])

for exp in exps:
    td = dfes[exp]
    td_sig = dfes_sig[exp]
    segs = [s for s in td if len(td[s]) >= 50]
    tmp_dict = dict()
    for seg in segs:
        sub_means = [np.nanmean(np.random.choice(td[seg], size=int(len(td[seg])/2), replace=False)) for i in range(NUM_SUBSAMPLES)]
        sub_medians = [np.nanmedian(np.random.choice(td[seg], size=int(len(td[seg])/2), replace=False)) for i in range(NUM_SUBSAMPLES)]
        sub_variances = [np.nanvar(np.random.choice(td[seg], size=int(len(td[seg])/2), replace=False)) for i in range(NUM_SUBSAMPLES)]
        sub_skews = [sci_stats.skew(np.random.choice(td[seg], size=int(len(td[seg])/2), replace=False)) for i in range(NUM_SUBSAMPLES)]
        sub_kurtosis = [sci_stats.kurtosis(np.random.choice(td[seg], size=int(len(td[seg])/2), replace=False)) for i in range(NUM_SUBSAMPLES)]
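        # note: each statistic above is computed on its own independently drawn half-sample,
        # so within one iteration the mean, median, variance, skew, and kurtosis do not come
        # from the same subsample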
        tmp_dict[seg] = {
            'background.fitness': seg_to_fit[seg],
Example no. 4
def do_modeling(in_s, in_x, outfile, outfile2, cbc_cutoff=5):

    vtn_s = expand_df(pd.read_csv(in_s))
    vtn_x = expand_df(pd.read_csv(in_x))
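    # baseline: mean fitness at generation 70 for each condition; Fitness_sub_70 below is
    # each sample's fitness relative to this baseline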
    g70_fits = {
        i[0]: i[1]
        for i in np.array(vtn_x[vtn_x.Gen == 70][['Cond', 'Fitness']].groupby(
            'Cond').mean().reset_index()[['Cond', 'Fitness']])
    }
    vtn_x['Fitness_sub_70'] = vtn_x.apply(
        lambda row: row['Fitness'] - g70_fits[row['Cond']], axis=1)
    vtn_s = vtn_s.merge(
        vtn_x[['Sample', 'Fitness', 'Fitness_sub_70', 'Fitness_std']],
        on='Sample',
        how='left')

    # making a bunch of indicator variables that say: this is at or after timepoint X in population Y
    gens = [70, 1410, 2640, 5150, 7530, 10150]
    for pop in p1_pops + p3_pops:
        for gen in gens[:-1]:  # no 10K indicators; an indicator for the last generation would fit only one point
            vtn_s['ind_' + pop + '_' + str(gen)] = vtn_s.apply(
                lambda r: indicator(r, gen, pop), axis=1)

    indicators = [i for i in vtn_s if i[:3] == 'ind']
    all_edges = sorted(set(vtn_s['Edge']))
    conditions = ['P1_YPD_30C', 'P3_SC_37C', 'P1_SC_37C', 'P3_bad_SC_37C']

    # Getting ancestor
    anc_s = {c: dict() for c in conditions}
    for cond in conditions:
        for edge in all_edges:
            td = vtn_s[(vtn_s.Edge == edge) & (vtn_s.Cond == cond) &
                       (vtn_s.num_cbcs >= cbc_cutoff) & (vtn_s.Gen == 70)]
            anc_s[cond][edge] = np.nanmean(td['s'])

    vtn_s['g70_s'] = vtn_s.apply(lambda row: anc_s[row.Cond][row.Edge], axis=1)
    vtn_s['s_sub_g70_s'] = vtn_s['s'] - vtn_s['g70_s']

    # Changes:
    # don't allow 10K indicators or any that only fit one point (e.g. a 7.5k indicator, but there is no 10K measurement for that pop)
    # don't allow more than one indicator param per population
    edge_results = {c: {'no_x': dict(), 'x': dict()} for c in conditions}
    s_var = 's_sub_g70_s'
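    # 'dummy' is an all-zero column: the '~ dummy - 1' formula below acts as a null model
    # (it always predicts 0) and provides the baseline BIC for the forward selection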
    vtn_s['dummy'] = [0] * len(vtn_s)
    for cond in conditions:
        print(cond)
        cc = 0
        for edge in all_edges:
            cc += 1
            if cc % 10 == 0:
                print(cc)
            td = vtn_s[(vtn_s.Edge == edge) & (vtn_s.Cond == cond) &
                       (vtn_s.num_cbcs >= cbc_cutoff) &
                       (vtn_s['s_sub_g70_s'].notnull())]
            if len(td) >= 20:
                for base_case in [['no_x', []], ['x', ['Fitness_sub_70']]]:
                    params = base_case[1]
                    if base_case[0] == 'x':
                        if cond == 'P3_bad_SC_37C':
                            break
                        results = [
                            smf.ols(formula=s_var + ' ~ Fitness_sub_70 -1',
                                    data=td).fit()
                        ]
                    else:
                        results = [
                            smf.ols(formula=s_var + ' ~ dummy -1',
                                    data=td).fit()
                        ]
                    ind_use = indicators
                    while True:
                        rec = {
                            ind: smf.ols(formula=s_var + ' ~ ' +
                                         ' + '.join([ind] + params) + '-1',
                                         data=td).fit()
                            for ind in ind_use
                        }
                        best = sorted(rec.keys(), key=lambda x: rec[x].bic)
                        if rec[best[0]].bic - results[-1].bic >= -2:
                            break
                        params.append(best[0])
                        results.append(rec[best[0]])
                        pops_w_params = set([
                            ind.split('_')[1] for ind in params
                            if ind != 'Fitness_sub_70'
                        ])
                        # this implements the criteria in the comment at the top
                        ind_use = [
                            ind for ind in ind_use
                            if ind.split('_')[1] not in pops_w_params
                            and len(td[td[ind] == 1]) > 1
                        ]
                        if len(ind_use) == 0:
                            break
                    edge_results[cond][base_case[0]][edge] = [params, results]

    model_fixer = {'x': 'FM', 'no_x': 'IM'}
    mat = []
    for cond in conditions:
        for base in ['x', 'no_x']:
            td = edge_results[cond][base]
            for edge in all_edges:
                if edge in td:
                    er = td[edge]
                    full_model = er[1][-1]
                    coeffs = dict(full_model.params)
                    pvals = dict(full_model.pvalues)
                    coeff_list = [c for c in coeffs]
                    # Using 1 - full_model.ssr/full_model.centered_tss for R2 because, in these no-intercept fits, the default R2
                    # compares the predictions to the sum of squared deviations from the fixed intercept (the mean gen-70 s)
                    # rather than from the mean, which inflates R2
                    mat.append([
                        edge, cond, model_fixer[base],
                        1 - full_model.ssr / full_model.centered_tss,
                        full_model.llf, full_model.bic, ';'.join(coeff_list),
                        ';'.join([str(coeffs[c]) for c in coeff_list]),
                        ';'.join([str(pvals[c]) for c in coeff_list])
                    ])
                    if base == 'x':
                        x_model = er[1][0]
                        coeffs = dict(x_model.params)
                        pvals = dict(x_model.pvalues)
                        coeff_list = [c for c in coeffs]
                        mat.append([
                            edge, cond, 'XM',
                            1 - x_model.ssr / x_model.centered_tss,
                            x_model.llf, x_model.bic, ';'.join(coeff_list),
                            ';'.join([str(coeffs[c]) for c in coeff_list]),
                            ';'.join([str(pvals[c]) for c in coeff_list])
                        ])

    modeling = pd.DataFrame(mat,
                            columns=[
                                'Edge', 'Cond', 'Model', 'R2', 'LLF', 'BIC',
                                'Params', 'Coeffs', 'Pvalues'
                            ])
    # see note above about using centered_tss for the R2 calculation - this can yield an R2 below zero, which we clip into [0, 1]
    modeling['R2'] = np.clip(modeling['R2'], 0, 1)
    modeling.to_csv(outfile, index=False)

    # Reformatting modeling data
    modeling['Cmodel'] = modeling['Cond'] + '_' + modeling['Model']
    vcols = ['R2', 'LLF', 'BIC', 'Params', 'Coeffs', 'Pvalues']
    dats = [
        modeling.pivot(index='Edge', columns='Cmodel', values=v).reset_index()
        for v in vcols
    ]
    edge_models = dats[0]
    base_cols = [i for i in edge_models if i != 'Edge']
    for i in range(1, len(dats)):
        edge_models = edge_models.merge(dats[i],
                                        on='Edge',
                                        how='outer',
                                        suffixes=('', '_' + vcols[i]))
    edge_models = edge_models.rename(columns={i: i + '_R2' for i in base_cols})

    # Gene annotation data etc.
    edge_info = pd.read_csv('../accessory_files/TP_data_by_edge.csv')
    e2g = {i[0]: i[1] for i in np.array(edge_info[['Edge', 'Gene.Use']])}

    mat = []
    for edge in set(vtn_s['Edge']):
        tmp = [edge]
        dfs = [byrm_s] + [vtn_s[vtn_s.Cond == cond] for cond in conditions]
        things = ['BYxRM'] + conditions
        i = 0
        for df in dfs:
            # For each condition
            # Filtering for num_cbcs >= cbc_cutoff (>= 5 cBCs here, or >= 3 cBCs for clone modeling)
            td = df[(pd.notnull(df['s'])) & (df['Edge'] == edge) &
                    (df['num_cbcs'] >= cbc_cutoff)]
            if len(td) >= 20:  # recording mean s and variance of s
                tmp += [np.mean(td['s']), np.var(td['s'])]
            else:
                tmp += [np.nan, np.nan]
            td = df[(pd.notnull(df['Fitness'])) & (pd.notnull(df['s'])) &
                    (df['Edge'] == edge) & (df['num_cbcs'] >= cbc_cutoff)]
            if len(td) >= 20:  # recording regression results
                lr = sci_stats.linregress(td['Fitness'], td['s'])
                tmp += [lr[0], lr[3], lr[2]**2]  # slope, P, R^2
            else:
                #print(things[i], e2g[edge], edge)
                tmp += [np.nan, np.nan, np.nan]
            i += 1
        mat.append(tmp)
    cols = ['Edge']
    for c in ['BYxRM'] + conditions:
        cols += [
            c + '_s_mean', c + '_s_var', c + '_slope', c + '_p', c + '_x_R2'
        ]
    # turning it into a dataframe
    edge_stats = pd.DataFrame(mat, columns=cols)
    for cond in ['BYxRM'] + conditions:
        edge_stats[cond + '_call'] = edge_stats.apply(
            lambda row: call_slope(row, cond), axis=1)
        td = edge_stats[edge_stats[cond + '_p'].notnull()]
        edges = list(td['Edge'])
        corrected_ps = benjamini_hochberg(list(td[cond + '_p']))[1]
        p_dict = {edges[i]: corrected_ps[i] for i in range(len(edges))}
        edge_stats[cond + '_bh_p'] = edge_stats['Edge'].apply(
            lambda e: p_dict.get(e, np.nan))

    edge_short = edge_info[[
        'Edge', 'chromosome', 'Type', 'Gene.Use', 'briefDescription',
        'insertion_edge', 'phenotypeSummary', 'phenotypeSummary.nearby'
    ]].rename(columns={'Gene.Use': 'Gene_Use'})
    edge_stats = edge_stats.merge(edge_short, on='Edge',
                                  how='left')  # adding Gene annotations etc.
    edge_stats = edge_stats.merge(edge_models, on='Edge',
                                  how='outer')  # adding modeling data
    edge_stats.to_csv(outfile2, index=False)
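# A hypothetical invocation (paths are illustrative; only VTn_s.csv appears elsewhere in
# these examples as the tidy fitness-effect table):
# do_modeling('../../output/VTn_s.csv', 'path/to/tidy_fitness_x.csv',
#             'modeling_by_edge.csv', 'edge_stats.csv', cbc_cutoff=5)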
Example no. 5
        ]) / len(gene_to_sim_ll_ratios[g][i])
        tmp.append(percentile)
    orfs_to_results[orf_names[g]] = tmp

for i in range(3):
    multi_hit_data['model' + str(i + 2) + '_LL_ratio'] = multi_hit_data['ORF'].apply(
        lambda g: orfs_to_results.get(g, [np.nan] * 6)[i])
    multi_hit_data['model' + str(i + 2) + '_LL_ratio_p'] = multi_hit_data['ORF'].apply(
        lambda g: orfs_to_results.get(g, [np.nan] * 6)[i + 3])

all_pvals = list(multi_hit_data['model2_LL_ratio_p']) + list(
    multi_hit_data['model3_LL_ratio_p']) + list(
        multi_hit_data['model4_LL_ratio_p'])
corrected_sig_test = benjamini_hochberg(all_pvals, alpha=0.05)
for i in range(3):
    multi_hit_data['model' + str(i + 2) + '_LL_ratio_p_corrected'] = corrected_sig_test[1][
        i * len(multi_hit_data):(i + 1) * len(multi_hit_data)]

# AIC = 2k - 2LL where k is num parameters and LL is log likelihood. Model 4 has 9 parameters, vs. 3 in model 2 or 3
# AIC_4_vs_2 = 18 - 2LL4 - (6 - 2LL2) = 12 - 2(LL4-LL2)
# LL4-LL2 is the same as model4_LL_ratio-model2_LL_ratio (since both are just the LL minus LL1)
# AIC_4_vs_3 calculated the same way. If this AIC comparison is less than 0, model 4 has a lower AIC and is favored
for i in range(2):
    multi_hit_data['AIC_model4_v_' + str(i + 2)] = multi_hit_data.apply(
        lambda r: 12 - 2 * (r['model4_LL_ratio'] - r['model' + str(i + 2) + '_LL_ratio']),
        axis=1)
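# Worked example of the comparison above: if model4_LL_ratio - model2_LL_ratio = 8, then
# AIC_model4_v_2 = 12 - 2*8 = -4 < 0, so model 4 is favored despite its 6 extra parameters.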
Example no. 6
                                                      pvals[cols + ['pval']],
                                                      on=['Edge', 'Sample'],
                                                      how='inner')

seg_fits = pd.read_csv('../accessory_files/Clones_For_Tn96_Experiment.csv')
byrm_x = seg_fits[seg_fits['segregant'].isin(set(
    byrm_s['Sample']))][['segregant', 'initial fitness, YPD 30C',
                         'std err']].rename(
                             columns={
                                 'segregant': 'Sample',
                                 'initial fitness, YPD 30C': 'Fitness',
                                 'std err': 'Fitness_std'
                             })

# Benjamini-Hochberg correction for p values
byrm_s['sig'] = benjamini_hochberg(byrm_s['pval'])[0]
vtn_s['sig'] = benjamini_hochberg(vtn_s['pval'])[0]

# Outputting tidy fitness effect (s) data
vtn_s.to_csv('../../output/VTn_s.csv', index=False)
byrm_s = byrm_s[pd.notnull(byrm_s['s'])]  # no need to do this for vtn_s because it has no nulls
byrm_s.to_csv('../../output/BYxRM_s.csv', index=False)
byrm_x.to_csv('../../output/BYxRM_x.csv', index=False)

# Randomly shuffling data
vtn_s['Cond'] = vtn_s.apply(
    lambda r: r['Sample'].split('_')[1][:2] + '_' + r['Sample'].split('-')[1],
    axis=1)
dfs = []
for i in vtn_s.groupby(['Cond', 'Edge']):