def get_stats_for_one_edge(row, segs, gm_df, num_subsamples, use_only_two_rep_segs):
    """Summarize one edge (one row) across segregants, using per-replicate s values.

    Unlike the mean-s variant, the determinant analysis here is fed the
    individual replicate measurements (``<seg>.rep1.s`` / ``<seg>.rep2.s``)
    instead of ``<seg>.mean.s``.

    Parameters
    ----------
    row : mapping
        Indexable by column name; read keys are ``<seg>.rep1.cbcs``,
        ``<seg>.rep2.cbcs``, ``<seg>.pval``, ``<seg>.mean.s``,
        ``<seg>.stderr.s``, ``<seg>.rep1.s`` and ``<seg>.rep2.s``.
    segs : sequence of str
        Segregant name prefixes.
    gm_df :
        Passed through unchanged to ``analyze_determinants``.
    num_subsamples : int
        Number of half-size sub-samples drawn for the H2 interval; also
        forwarded to ``analyze_determinants``.
    use_only_two_rep_segs : bool
        If true, a segregant counts as measured only when BOTH replicates have
        at least 2 cbcs; otherwise one replicate with at least 2 cbcs suffices.

    Returns
    -------
    dict
        Always contains ``'num.measured'``. When at least 10 segregants are
        measured it also contains ``'num.sig'``, ``'H2'``, ``'H2_95_conf_low'``,
        ``'H2_95_conf_high'`` and whatever ``analyze_determinants`` returns.
    """
    reps = ['rep1', 'rep2']

    def enough_cbcs(seg, rep):
        return row[seg + '.' + rep + '.cbcs'] >= 2

    if use_only_two_rep_segs:
        # only include segregants w s measured in both replicates
        measured = [seg for seg in segs if all(enough_cbcs(seg, r) for r in reps)]
    else:
        measured = [seg for seg in segs if any(enough_cbcs(seg, r) for r in reps)]

    # NOTE(review): this per-replicate list is built from all of `segs`, so it
    # is not restricted by use_only_two_rep_segs -- confirm that is intended.
    measured_by_rep = {r: [seg for seg in segs if enough_cbcs(seg, r)] for r in reps}

    stat_dict = {'num.measured': len(measured)}
    if len(measured) >= 10:
        pvals = [row[seg + '.pval'] for seg in measured]
        pval_sig_boolean = benjamini_hochberg(pvals)[0]  # B/H with alpha=0.05 by default
        sig = [seg for seg, is_sig in zip(measured, pval_sig_boolean) if is_sig]
        stat_dict['num.sig'] = len(sig)

        means = np.asarray([row[seg + '.mean.s'] for seg in measured])
        variances = np.asarray([row[seg + '.stderr.s'] ** 2 for seg in measured])
        # H2 = (variance of the means - mean squared standard error) / variance of the means
        Vg = np.var(means)
        Ve = np.mean(variances)
        stat_dict['H2'] = (Vg - Ve) / Vg

        # doing n/2 sub-samplings (without replacement) to get error on that
        n_measured = len(measured)
        half = n_measured // 2
        sub_H2 = []
        for _ in range(num_subsamples):
            chosen = np.random.choice(n_measured, size=half, replace=False)
            sub_Vg = np.var(means[chosen])
            sub_Ve = np.mean(variances[chosen])
            sub_H2.append((sub_Vg - sub_Ve) / sub_Vg)
        stat_dict['H2_95_conf_low'] = np.percentile(sub_H2, 2.5)
        stat_dict['H2_95_conf_high'] = np.percentile(sub_H2, 97.5)

        # One entry per (segregant, replicate) measurement: rep1 entries first, then rep2.
        rep_segs = measured_by_rep['rep1'] + measured_by_rep['rep2']
        rep_means = ([row[seg + '.rep1.s'] for seg in measured_by_rep['rep1']]
                     + [row[seg + '.rep2.s'] for seg in measured_by_rep['rep2']])
        stat_dict.update(analyze_determinants(rep_segs, rep_means, gm_df, num_subsamples))
    return stat_dict
def get_stats_for_one_edge(row, segs, gm_df, num_subsamples):
    """Summarize one edge (one row) across segregants, using mean s per segregant.

    Parameters
    ----------
    row : mapping
        Indexable by column name; read keys are ``<seg>.mean.s`` and
        ``<seg>.pval``.
    segs : sequence of str
        Segregant name prefixes.
    gm_df :
        Passed through unchanged to ``analyze_determinants``.
    num_subsamples : int
        Forwarded to ``analyze_determinants``.

    Returns
    -------
    dict
        Always contains ``'num.measured'``. With at least one measured
        segregant it also contains ``'num.sig'`` and ``'avg_s'``; with at least
        10 it is further updated with the result of ``analyze_determinants``.
    """
    # A segregant counts as measured when its mean s is non-null
    # (must have at least 2 cbcs in at least one replicate).
    measured = [seg for seg in segs if pd.notnull(row[seg + '.mean.s'])]
    stat_dict = {'num.measured': len(measured)}
    if not measured:
        return stat_dict

    pvals = [row[seg + '.pval'] for seg in measured]
    pval_sig_boolean = benjamini_hochberg(pvals)[0]  # B/H with alpha=0.05 by default
    sig = [seg for seg, is_sig in zip(measured, pval_sig_boolean) if is_sig]
    stat_dict['num.sig'] = len(sig)

    # Built once and reused for both the average and the determinant analysis.
    means = [row[seg + '.mean.s'] for seg in measured]
    stat_dict['avg_s'] = np.nanmean(means)
    if len(measured) >= 10:
        stat_dict.update(analyze_determinants(measured, means, gm_df, num_subsamples))
    return stat_dict
cols_to_analyze = ['background.fitness', 'mean', 'median', 'variance', 'skew', 'kurtosis', 'significant.beneficial.mutations', 'significant.deleterious.mutations'] seg_to_fit = {i[0]: i[1] for i in pd.read_csv('../accessory_files/Clones_For_Tn96_Experiment.csv').as_matrix(['segregant', 'initial fitness, YPD 30C'])} dfes = defaultdict(lambda: defaultdict(dict)) dfes_sig = defaultdict(lambda: defaultdict(dict)) exps = {'BT': 'MM', 'TP': 'FM'} exp_segs = {exp: [i.split('.')[0] for i in dats[exp].columns if '.mean.s' in i] for exp in dats} for exp in exps: segs = exp_segs[exp] d = dats[exp] for seg in segs: measured = d.loc[pd.notnull(d[seg + '.mean.s'])] dfes[exp][seg] = list(measured[seg + '.mean.s']) pvals = list(measured[seg + '.pval']) sig = measured.loc[benjamini_hochberg(pvals)[0]] # B/H with alpha=0.05 by default dfes_sig[exp][seg] = list(sig[seg + '.mean.s']) for exp in exps: td = dfes[exp] td_sig = dfes_sig[exp] segs = [s for s in td if len(td[s]) >= 50] tmp_dict = dict() for seg in segs: sub_means = [np.nanmean(np.random.choice(td[seg], size=int(len(td[seg])/2), replace=False)) for i in range(NUM_SUBSAMPLES)] sub_medians = [np.nanmedian(np.random.choice(td[seg], size=int(len(td[seg])/2), replace=False)) for i in range(NUM_SUBSAMPLES)] sub_variances = [np.nanvar(np.random.choice(td[seg], size=int(len(td[seg])/2), replace=False)) for i in range(NUM_SUBSAMPLES)] sub_skews = [sci_stats.skew(np.random.choice(td[seg], size=int(len(td[seg])/2), replace=False)) for i in range(NUM_SUBSAMPLES)] sub_kurtosis = [sci_stats.kurtosis(np.random.choice(td[seg], size=int(len(td[seg])/2), replace=False)) for i in range(NUM_SUBSAMPLES)] tmp_dict[seg] = { 'background.fitness': seg_to_fit[seg],
def do_modeling(in_s, in_x, outfile, outfile2, cbc_cutoff=5):
    """Fit per-edge models of s and write a model table and an edge-statistics table.

    For every (condition, edge) with at least 20 usable measurements, OLS
    models of ``s_sub_g70_s`` (s minus the mean generation-70 s for that
    condition and edge) are built by forward selection of population/timepoint
    indicator variables, accepting a term only when it lowers BIC by more than
    2. Two starting points are used: no covariate (reported as model ``'IM'``)
    and ``Fitness_sub_70`` as covariate (reported as ``'FM'``; its
    fitness-only starting model is reported as ``'XM'``). The ``'x'`` case is
    skipped for condition ``'P3_bad_SC_37C'``.

    Parameters
    ----------
    in_s : path
        CSV read with ``pd.read_csv`` and passed through ``expand_df``; the
        code reads columns ``Sample``, ``Edge``, ``Cond``, ``Gen``, ``s`` and
        ``num_cbcs`` from the result.
    in_x : path
        CSV read the same way; the code reads ``Sample``, ``Cond``, ``Gen``,
        ``Fitness`` and ``Fitness_std``.
    outfile : path
        Destination CSV for the long-format model table (one row per edge,
        condition and model).
    outfile2 : path
        Destination CSV for per-edge statistics merged with gene annotations
        and the wide-format model table.
    cbc_cutoff : int, optional
        Minimum ``num_cbcs`` for a measurement to be used.

    Notes
    -----
    Relies on names defined outside this function: ``expand_df``,
    ``indicator``, ``call_slope``, ``benjamini_hochberg``, ``p1_pops``,
    ``p3_pops``, ``byrm_s``, ``smf`` and ``sci_stats``. Also reads
    ``'../accessory_files/TP_data_by_edge.csv'``.
    """
    vtn_s = expand_df(pd.read_csv(in_s))
    vtn_x = expand_df(pd.read_csv(in_x))

    # Mean generation-70 fitness per condition; the baseline for Fitness_sub_70.
    g70_fits = vtn_x[vtn_x.Gen == 70].groupby('Cond')['Fitness'].mean().to_dict()
    vtn_x['Fitness_sub_70'] = vtn_x.apply(
        lambda row: row['Fitness'] - g70_fits[row['Cond']], axis=1)
    vtn_s = vtn_s.merge(
        vtn_x[['Sample', 'Fitness', 'Fitness_sub_70', 'Fitness_std']],
        on='Sample', how='left')

    # making a bunch of indicator variables that say: this is at or after timepoint X in population Y
    gens = [70, 1410, 2640, 5150, 7530, 10150]
    for pop in p1_pops + p3_pops:
        for gen in gens[:-1]:  # no 10K indicators, that is fitting one point
            vtn_s['ind_' + pop + '_' + str(gen)] = vtn_s.apply(
                lambda r: indicator(r, gen, pop), axis=1)
    indicators = [col for col in vtn_s if col[:3] == 'ind']

    all_edges = sorted(set(vtn_s['Edge']))
    conditions = ['P1_YPD_30C', 'P3_SC_37C', 'P1_SC_37C', 'P3_bad_SC_37C']

    # Getting ancestor: mean s at generation 70 for each condition and edge
    anc_s = {c: dict() for c in conditions}
    for cond in conditions:
        for edge in all_edges:
            td = vtn_s[(vtn_s.Edge == edge) & (vtn_s.Cond == cond)
                       & (vtn_s.num_cbcs >= cbc_cutoff) & (vtn_s.Gen == 70)]
            anc_s[cond][edge] = np.nanmean(td['s'])
    vtn_s['g70_s'] = vtn_s.apply(lambda row: anc_s[row.Cond][row.Edge], axis=1)
    vtn_s['s_sub_g70_s'] = vtn_s['s'] - vtn_s['g70_s']

    # Changes:
    # don't allow 10K indicators or any that only fit one point (e.g. a 7.5k
    #   indicator, but there is no 10K measurement for that pop)
    # don't allow more than one indicator param per population
    edge_results = {c: {'no_x': dict(), 'x': dict()} for c in conditions}
    s_var = 's_sub_g70_s'
    vtn_s['dummy'] = 0  # all-zero regressor used for the no-covariate starting model
    for cond in conditions:
        print(cond)
        for cc, edge in enumerate(all_edges, start=1):
            if cc % 10 == 0:
                print(cc)
            td = vtn_s[(vtn_s.Edge == edge) & (vtn_s.Cond == cond)
                       & (vtn_s.num_cbcs >= cbc_cutoff)
                       & (vtn_s['s_sub_g70_s'].notnull())]
            if len(td) < 20:
                continue
            for base_name, base_params in (('no_x', []), ('x', ['Fitness_sub_70'])):
                if base_name == 'x' and cond == 'P3_bad_SC_37C':
                    break
                # Copy so that appending selected indicators never touches the template list.
                params = list(base_params)
                if base_name == 'x':
                    results = [smf.ols(formula=s_var + ' ~ Fitness_sub_70 -1', data=td).fit()]
                else:
                    results = [smf.ols(formula=s_var + ' ~ dummy -1', data=td).fit()]

                # Forward selection: each round fits every remaining candidate
                # on top of the current params and keeps the lowest-BIC one if
                # it improves BIC by more than 2.
                # NOTE(review): the "fits more than one point" filter below is
                # only applied after a term has been accepted, so the first
                # round considers every indicator -- confirm this is intended.
                ind_use = indicators
                while ind_use:  # also guards against having no candidates at all
                    rec = {
                        ind: smf.ols(formula=s_var + ' ~ ' + ' + '.join([ind] + params) + '-1',
                                     data=td).fit()
                        for ind in ind_use
                    }
                    best = sorted(rec, key=lambda name: rec[name].bic)[0]
                    if rec[best].bic - results[-1].bic >= -2:
                        break
                    params.append(best)
                    results.append(rec[best])
                    pops_w_params = {ind.split('_')[1] for ind in params
                                     if ind != 'Fitness_sub_70'}
                    # this implements the criteria in the "Changes" comment above
                    ind_use = [
                        ind for ind in ind_use
                        if ind.split('_')[1] not in pops_w_params
                        and len(td[td[ind] == 1]) > 1
                    ]
                edge_results[cond][base_name][edge] = [params, results]

    def summarize(edge, cond, label, model):
        """Return one output row describing a fitted model."""
        coeffs = dict(model.params)
        pvals = dict(model.pvalues)
        coeff_list = list(coeffs)
        # Using 1 - ssr/centered_tss to get R2 because otherwise the
        # fixed-intercept model is comparing our predictions to the sum of
        # squares of differences from the fixed intercept (mean gen 70 s)
        # rather than from the mean (inflating R2)
        return [
            edge, cond, label,
            1 - model.ssr / model.centered_tss,
            model.llf, model.bic,
            ';'.join(coeff_list),
            ';'.join([str(coeffs[c]) for c in coeff_list]),
            ';'.join([str(pvals[c]) for c in coeff_list]),
        ]

    model_fixer = {'x': 'FM', 'no_x': 'IM'}
    mat = []
    for cond in conditions:
        for base in ['x', 'no_x']:
            fitted = edge_results[cond][base]
            for edge in all_edges:
                if edge not in fitted:
                    continue
                models = fitted[edge][1]
                mat.append(summarize(edge, cond, model_fixer[base], models[-1]))
                if base == 'x':
                    # the starting model of the 'x' case: Fitness_sub_70 only
                    mat.append(summarize(edge, cond, 'XM', models[0]))
    modeling = pd.DataFrame(mat, columns=[
        'Edge', 'Cond', 'Model', 'R2', 'LLF', 'BIC', 'Params', 'Coeffs', 'Pvalues'
    ])
    # R2 computed from centered_tss can fall below zero; such values are
    # clipped to 0 here (they are not turned into NaN).
    modeling['R2'] = np.clip(modeling['R2'], 0, 1)
    modeling.to_csv(outfile, index=False)

    # Reformatting modeling data: one row per edge, one column per
    # (condition, model, value) combination
    modeling['Cmodel'] = modeling['Cond'] + '_' + modeling['Model']
    vcols = ['R2', 'LLF', 'BIC', 'Params', 'Coeffs', 'Pvalues']
    pivots = [
        modeling.pivot(index='Edge', columns='Cmodel', values=v).reset_index()
        for v in vcols
    ]
    edge_models = pivots[0]
    base_cols = [c for c in edge_models if c != 'Edge']
    for vcol, pivoted in zip(vcols[1:], pivots[1:]):
        edge_models = edge_models.merge(pivoted, on='Edge', how='outer',
                                        suffixes=('', '_' + vcol))
    # The first pivot (R2) kept the bare names through the merges; label it now.
    edge_models = edge_models.rename(columns={c: c + '_R2' for c in base_cols})

    # Gene annotation data etc.
    edge_info = pd.read_csv('../accessory_files/TP_data_by_edge.csv')

    # Per-dataset frames do not depend on the edge, so build them once.
    dfs = [byrm_s] + [vtn_s[vtn_s.Cond == cond] for cond in conditions]
    mat = []
    # Iterate the sorted edge list so the output row order is reproducible.
    for edge in all_edges:
        tmp = [edge]
        for df in dfs:  # For each condition
            # Filtering on the cBC cutoff and on s being present
            td = df[(pd.notnull(df['s'])) & (df['Edge'] == edge)
                    & (df['num_cbcs'] >= cbc_cutoff)]
            if len(td) >= 20:
                # recording mean s and variance of s
                tmp += [np.mean(td['s']), np.var(td['s'])]
            else:
                tmp += [np.nan, np.nan]
            td = td[pd.notnull(td['Fitness'])]
            if len(td) >= 20:
                # recording regression results
                lr = sci_stats.linregress(td['Fitness'], td['s'])
                tmp += [lr[0], lr[3], lr[2] ** 2]  # slope, P, R^2
            else:
                tmp += [np.nan, np.nan, np.nan]
        mat.append(tmp)

    cols = ['Edge']
    for c in ['BYxRM'] + conditions:
        cols += [c + '_s_mean', c + '_s_var', c + '_slope', c + '_p', c + '_x_R2']
    # turning it into a dataframe
    edge_stats = pd.DataFrame(mat, columns=cols)
    for cond in ['BYxRM'] + conditions:
        edge_stats[cond + '_call'] = edge_stats.apply(
            lambda row: call_slope(row, cond), axis=1)
        # B/H-corrected p values, computed only over edges that have a p value
        tested = edge_stats[edge_stats[cond + '_p'].notnull()]
        corrected_ps = benjamini_hochberg(list(tested[cond + '_p']))[1]
        p_dict = dict(zip(tested['Edge'], corrected_ps))
        edge_stats[cond + '_bh_p'] = edge_stats['Edge'].apply(
            lambda e: p_dict.get(e, np.nan))

    edge_short = edge_info[[
        'Edge', 'chromosome', 'Type', 'Gene.Use', 'briefDescription',
        'insertion_edge', 'phenotypeSummary', 'phenotypeSummary.nearby'
    ]].rename(columns={'Gene.Use': 'Gene_Use'})
    edge_stats = edge_stats.merge(edge_short, on='Edge', how='left')  # adding Gene annotations etc.
    edge_stats = edge_stats.merge(edge_models, on='Edge', how='outer')  # adding modeling data
    edge_stats.to_csv(outfile2, index=False)
]) / len(gene_to_sim_ll_ratios[g][i]) tmp.append(percentile) orfs_to_results[orf_names[g]] = tmp for i in range(3): multi_hit_data['model' + str(i + 2) + '_LL_ratio'] = multi_hit_data['ORF'].apply( lambda g: orfs_to_results.get(g, [np.nan] * 6)[i]) multi_hit_data['model' + str(i + 2) + '_LL_ratio_p'] = multi_hit_data['ORF'].apply( lambda g: orfs_to_results.get(g, [np.nan] * 6)[i + 3]) all_pvals = list(multi_hit_data['model2_LL_ratio_p']) + list( multi_hit_data['model3_LL_ratio_p']) + list( multi_hit_data['model4_LL_ratio_p']) corrected_sig_test = benjamini_hochberg(all_pvals, alpha=0.05) for i in range(3): multi_hit_data['model' + str(i + 2) + '_LL_ratio_p_corrected'] = corrected_sig_test[1][ i * len(multi_hit_data):(i + 1) * len(multi_hit_data)] # AIC = 2k - 2LL where k is num parameters and LL is log likelihood. Model 4 has 9 parameters, vs. 3 in model 2 or 3 # AIC_4_vs_2 = 18 - 2LL4 - (6 - 2LL2) = 12 - 2(LL4-LL2) # LL4-LL2 is the same as model4_LL_ratio-model2_LL_ratio (since both are just the LL minus LL1) # AIC_4_vs_3 calculated the same way. If this AIC comparison is less than 0, model 4 has a lower AIC and is favored for i in range(2): multi_hit_data['AIC_model4_v_' + str(i + 2)] = multi_hit_data.apply( lambda r: 12 - 2 * (r['model4_LL_ratio'] - r['model' + str(i + 2) + '_LL_ratio']), axis=1)
pvals[cols + ['pval']], on=['Edge', 'Sample'], how='inner') seg_fits = pd.read_csv('../accessory_files/Clones_For_Tn96_Experiment.csv') byrm_x = seg_fits[seg_fits['segregant'].isin(set( byrm_s['Sample']))][['segregant', 'initial fitness, YPD 30C', 'std err']].rename( columns={ 'segregant': 'Sample', 'initial fitness, YPD 30C': 'Fitness', 'std err': 'Fitness_std' }) # Benjamini-Hochberg correction for p values byrm_s['sig'] = benjamini_hochberg(byrm_s['pval'])[0] vtn_s['sig'] = benjamini_hochberg(vtn_s['pval'])[0] # Outputting tidy fitness effect (s) data vtn_s.to_csv('../../output/VTn_s.csv', index=False) byrm_s = byrm_s[pd.notnull( byrm_s['s'])] # didn't have to do this for vtn_s bc it has no nulls byrm_s.to_csv('../../output/BYxRM_s.csv', index=False) byrm_x.to_csv('../../output/BYxRM_x.csv', index=False) # Randomly shuffling data vtn_s['Cond'] = vtn_s.apply( lambda r: r['Sample'].split('_')[1][:2] + '_' + r['Sample'].split('-')[1], axis=1) dfs = [] for i in vtn_s.groupby(['Cond', 'Edge']):