def get_poswise_df(data, nm_to_seq, treat_nm):
    """Build a long-format table of non-reference nucleotide counts.

    One row is emitted per (target name, position, non-reference nucleotide).

    data: iterable of target names that can be indexed by name to obtain
      that target's per-position rows of nucleotide counts
    nm_to_seq: target name -> reference sequence
    treat_nm: forwarded to _data.idx_to_pos when converting an index to a
      position

    Returns a pd.DataFrame with the columns Count, Total count, Obs nt,
    Ref nt, Position and Name.
    """
    columns = defaultdict(list)
    timer = util.Timer(total=len(data))
    for nm in data:
        poswise = data[nm]
        ref_seq = nm_to_seq[nm]
        for jdx, row in enumerate(poswise):
            pos = _data.idx_to_pos(jdx, treat_nm)
            ref_nt = ref_seq[jdx]
            ref_idx = nt_to_idx[ref_nt]
            total = sum(row)
            for kdx, count in enumerate(row):
                # Only non-reference observations are tabulated.
                if kdx != ref_idx:
                    columns['Count'].append(count)
                    columns['Total count'].append(total)
                    columns['Obs nt'].append(nts[kdx])
                    columns['Ref nt'].append(ref_nt)
                    columns['Position'].append(pos)
                    columns['Name'].append(nm)
        # Progress advances once per target.
        timer.update()
    return pd.DataFrame(columns)
def gather_stats_binom_control_muts(t, c, seq, treat_nm, nm, decisions):
    """Record binomial statistics for treatment mutations also seen in control.

    Considers every non-reference event that occurs (count > 0) in the
    treatment data and has a control frequency > 0 at the same position.
    For each such event one entry is appended to every list in `decisions`,
    including the p-value binom.sf(t_ct - 1, t_tot, c_fq): the probability of
    observing at least t_ct events in t_tot trials at the control frequency.

    In practice, this step is most effective for control mutations with
    relatively high frequency => relatively high variance.

    This function only gathers statistics; it does not modify `t` or `c`.

    t, c: treatment / control counts, indexed as [position index][nt index]
    seq: reference sequence; seq[jdx] is the reference nt at index jdx
    treat_nm: forwarded to _data.idx_to_pos
    nm: target name stored with each recorded event
    decisions: mapping of column name -> list, appended to in place
      (must create missing lists on access, e.g. a defaultdict(list))

    Returns None.
    """
    for jdx, ref_nt in enumerate(seq):
        ref_idx = nt_to_idx[ref_nt]
        c_tot = sum(c[jdx])
        if c_tot == 0:
            # No control reads at this position: the control frequency is
            # undefined, so no event here can satisfy c_fq > 0. Skipping
            # also avoids a division by zero.
            continue
        t_tot = sum(t[jdx])
        for kdx in range(len(t[jdx])):
            t_ct = t[jdx][kdx]
            if kdx == ref_idx or t_ct == 0:
                continue

            c_ct = c[jdx][kdx]
            c_fq = c_ct / c_tot
            if not c_fq > 0:
                # Event absent from control: nothing is recorded.
                continue

            t_fq = t_ct / t_tot
            # Computed only for events that are recorded.
            pval = binom.sf(t_ct - 1, t_tot, c_fq)

            decisions['obs_nt'].append(nts[kdx])
            decisions['ref_nt'].append(ref_nt)
            decisions['c_fq'].append(c_fq)
            decisions['c_ct'].append(c_ct)
            decisions['t_fq'].append(t_fq)
            decisions['t_ct'].append(t_ct)
            decisions['c_tot'].append(c_tot)
            decisions['t_tot'].append(t_tot)
            decisions['idx'].append(jdx)
            decisions['pos'].append(_data.idx_to_pos(jdx, treat_nm))
            decisions['pval'].append(pval)
            decisions['nm'].append(nm)
    return
def filter_high_control_muts(t, c, seq, treat_nm, nm, decisions):
    """Zero out treatment positions with a very high control mutation freq.

    For every non-reference event with a non-zero treatment count and a
    control frequency > 0, one entry is appended to every list in
    `decisions`. If any such event at a position has a control frequency
    >= 0.05, all treatment counts at that position (reference included) are
    set to 0.

    Only the frequency threshold is applied here; no separate read-count
    support test is performed in this function.

    t, c: treatment / control counts, indexed as [position index][nt index];
      `t` is modified in place
    seq: reference sequence; seq[jdx] is the reference nt at index jdx
    treat_nm: forwarded to _data.idx_to_pos
    nm: target name stored with each recorded event
    decisions: mapping of column name -> list, appended to in place
      (must create missing lists on access, e.g. a defaultdict(list))

    Returns `t` (the same object that was passed in).
    """
    max_control_mut_fq = 0.05
    for jdx, ref_nt in enumerate(seq):
        ref_idx = nt_to_idx[ref_nt]
        c_tot = sum(c[jdx])
        if c_tot == 0:
            # No control reads: the control frequency is undefined, so the
            # position can be neither recorded nor wiped. Skipping also
            # avoids a division by zero.
            continue

        wipe_col = False
        for kdx in range(len(t[jdx])):
            if kdx == ref_idx or t[jdx][kdx] == 0:
                continue

            c_fq = c[jdx][kdx] / c_tot
            if not c_fq > 0:
                continue

            too_high = bool(c_fq >= max_control_mut_fq)
            decisions['obs_nt'].append(nts[kdx])
            decisions['ref_nt'].append(ref_nt)
            decisions['c_fq'].append(c_fq)
            decisions['c_tot'].append(c_tot)
            decisions['idx'].append(jdx)
            decisions['pos'].append(_data.idx_to_pos(jdx, treat_nm))
            decisions['nm'].append(nm)
            decisions['wiped'].append(too_high)
            if too_high:
                wipe_col = True

        if wipe_col:
            # One offending nucleotide invalidates the whole position.
            for kdx in range(len(t[jdx])):
                t[jdx][kdx] = 0
    return t
def gather_stats_illumina_errors(t, c, t_minq, c_minq, seq, treat_nm, nm, decisions):
    """Record statistics for mutations explainable by Illumina sequencing error.

    Filtered at Q30 (1e-3), most columns have minimum Q = 32 (6e-4),
    or Q = 36 (2e-4).

    Considers all events (> 0 count) in the treatment data. For each one a
    p-value is obtained from binom_minus_binom_pval and one entry is appended
    to every list in `decisions`.

    t: treatment counts, indexed as [position index][nt index]
    c: control counts (accepted but not read by this function)
    t_minq, c_minq: per-position quality values Q, converted to
      probabilities as 10^(-Q/10)
    seq: reference sequence; seq[jdx] is the reference nt at index jdx
    treat_nm: forwarded to _data.idx_to_pos
    nm: target name stored with each recorded event
    decisions: mapping of column name -> list, appended to in place

    Returns None.
    """
    fpr_threshold = 0.05  # not read anywhere in this function
    for jdx, ref_nt in enumerate(seq):
        t_row = t[jdx]
        t_tot = np.sum(t_row)
        t_bin_p = 10 ** (-t_minq[jdx] / 10)
        c_bin_p = 10 ** (-c_minq[jdx] / 10)
        for kdx in range(len(t_row)):
            if kdx == nt_to_idx[ref_nt]:
                continue
            t_ct = t_row[kdx]
            if not t_ct > 0:
                continue

            t_fq = t_ct / t_tot
            pval = binom_minus_binom_pval(t_ct, t_bin_p, c_bin_p, t_tot)
            # Insertion order below fixes the key order in `decisions`.
            record = {
                'obs_nt': nts[kdx],
                'ref_nt': ref_nt,
                't_bin_p': t_bin_p,
                'c_bin_p': c_bin_p,
                't_ct': t_ct,
                't_fq': t_fq,
                't_tot': t_tot,
                'idx': jdx,
                'pos': _data.idx_to_pos(jdx, treat_nm),
                'pval': pval,
                'nm': nm,
            }
            for key, value in record.items():
                decisions[key].append(value)
    return
def adjust_treatment_control(treat_nm):
    """Filter treatment mutations that match the unedited background profile.

    g4 format: data is a dict, keys = target site names
      values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations

    Steps:
      1. Load the data from inp_dir + '<treat_nm>.pkl'.
      2. Build a long table of non-reference events and keep rows with
         'Total count' >= 100.
      3. For each (position index, ref nt, obs nt), compute the fraction of
         target sites whose event frequency is exactly zero.
      4. Estimate the background zero-fraction as the mean over the last five
         position indices, restricted to rows with mean activity <= 0.025
         and at least 50 target sites.
      5. Compute pval = binom.cdf(num zero, total, background) per row, pass
         the table to ben_hoch_fdr with threshold 0.001, and write it to
         out_dir + '<treat_nm>_fraczero_dec.csv'.
      6. Pass rows with 'FDR accept' == False to
         filter_freqzero_background_mutations and pickle the result to
         out_dir + '<treat_nm>.pkl'.

    treat_nm: condition name; used for file names and library lookups.

    Returns None.
    Raises ValueError if the background zero-fraction cannot be estimated
    (no row satisfies the baseline criteria).
    """
    # Context manager so the input file handle is closed deterministically.
    with open(inp_dir + '%s.pkl' % (treat_nm), 'rb') as f:
        adj_d = pickle.load(f)

    lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = dict(zip(nms, seqs))

    # Filter treatment mutations that match the unedited background profile
    # using the statistic: fraction of target sites with non-zero event
    # frequency.
    print(
        'Gathering statistics on treatment mutations matching background profile by frequency of zeros...'
    )
    dd = defaultdict(list)
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        timer.update()
        pw = adj_d[nm]
        seq = nm_to_seq[nm]
        for jdx, row in enumerate(pw):
            tot = np.nansum(row)
            ref_nt = seq[jdx]
            ref_idx = nt_to_idx[ref_nt]
            # Depends only on jdx, so computed once per position.
            pos = _data.idx_to_pos(jdx, treat_nm)
            for kdx, count in enumerate(row):
                if kdx == ref_idx:
                    continue
                dd['Count'].append(count)
                dd['Total count'].append(tot)
                dd['Obs nt'].append(nts[kdx])
                dd['Ref nt'].append(ref_nt)
                dd['Frequency'].append(np.nan if tot == 0 else count / tot)
                dd['Position index'].append(jdx)
                dd['Position'].append(pos)
                dd['Name'].append(nm)

    df = pd.DataFrame(dd)
    df = df[df['Total count'] >= 100]

    # Form stats_df and find p for binomial, which is typically ~0.99
    dd = defaultdict(list)
    pos_range = sorted(set(df['Position index']))
    timer = util.Timer(total=len(pos_range))
    for pos_idx in pos_range:
        timer.update()
        df_s1 = df[df['Position index'] == pos_idx]
        for ref_nt in nts:
            df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
            for obs_nt in nts:
                if obs_nt == ref_nt:
                    continue
                dfs_freq = df_s2[df_s2['Obs nt'] == obs_nt]['Frequency']
                total = len(dfs_freq)
                if total == 0:
                    continue
                num_zeros = sum(dfs_freq == 0)

                dd['Num target sites with zero'].append(num_zeros)
                dd['Total num target sites'].append(total)
                dd['Frequency of zero in target sites'].append(num_zeros / total)
                dd['Mean activity'].append(np.mean(dfs_freq))
                dd['Position index'].append(pos_idx)
                dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
                dd['Obs nt'].append(obs_nt)
                dd['Ref nt'].append(ref_nt)
    fz_df = pd.DataFrame(dd)

    # The background is estimated from the five highest position indices.
    baseline_pos_range = pos_range[-5:]
    max_mean_activity = 0.025
    min_num_targets = 50
    crit = (fz_df['Position index'].isin(baseline_pos_range)) & \
           (fz_df['Mean activity'] <= max_mean_activity) & \
           (fz_df['Total num target sites'] >= min_num_targets)
    bg_bin_p = np.mean(fz_df[crit]['Frequency of zero in target sites'])
    if np.isnan(bg_bin_p):
        raise ValueError(
            'Could not estimate the background zero frequency for %s: '
            'no baseline rows satisfy the selection criteria' % (treat_nm)
        )

    # Lower-tail test: few zero-frequency target sites relative to background
    # gives a small p-value.
    pvals = []
    timer = util.Timer(total=len(fz_df))
    for numzero, total in zip(fz_df['Num target sites with zero'],
                              fz_df['Total num target sites']):
        pvals.append(binom.cdf(numzero, total, bg_bin_p))
        timer.update()
    fz_df['pval'] = pvals

    fz_fdr_threshold = 0.001
    fz_df = ben_hoch_fdr(fz_df, fz_fdr_threshold)
    fz_df.to_csv(out_dir + '%s_fraczero_dec.csv' % (treat_nm))

    print(
        'Filtering treatment mutations matching background profile by frequency of zeros...'
    )
    # Explicit comparison kept: the dtype of 'FDR accept' is set by
    # ben_hoch_fdr, which is defined elsewhere.
    to_remove = fz_df[fz_df['FDR accept'] == False]
    adj_d = filter_freqzero_background_mutations(to_remove, adj_d, nm_to_seq)

    ##
    # Write
    ##
    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)

    return
def fig_editing_profiles(treat_nm):
    """Write per-position mutation statistics for on-target sites to CSV.

    g4 format: data is a dict, keys = target site names
      values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations

    Steps:
      1. Load the data from inp_dir + '<treat_nm>.pkl' and keep only names
         returned by _data.get_ontarget_sites.
      2. Build a long table of non-reference events and keep rows with
         'Total count' >= 100.
      3. For each (position index, ref nt, obs nt), compute the zero counts
         and the mean event frequency; write to out_dir + '<treat_nm>.csv'.
      4. For each (ref nt, obs nt), subtract the median 'Mean activity' over
         positions 25..34 (floored at 0) and write the result to
         out_dir + '<treat_nm>_median_bg_adj.csv'.

    treat_nm: condition name; used for file names and library lookups.

    Returns None.
    """
    # Context manager so the input file handle is closed deterministically.
    with open(inp_dir + '%s.pkl' % (treat_nm), 'rb') as f:
        adj_d = pickle.load(f)

    lib_design, seq_col = _data.get_lib_design(treat_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = dict(zip(nms, seqs))

    lib_nm = _data.get_lib_nm(treat_nm)
    ontarget_nms = set(_data.get_ontarget_sites(lib_design, lib_nm))

    print('Forming long df...')
    dd = defaultdict(list)
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        timer.update()
        if nm not in ontarget_nms:
            continue

        pw = adj_d[nm]
        seq = nm_to_seq[nm]
        for jdx, row in enumerate(pw):
            tot = np.nansum(row)
            ref_nt = seq[jdx]
            ref_idx = nt_to_idx[ref_nt]
            # Depends only on jdx, so computed once per position.
            pos = _data.idx_to_pos(jdx, treat_nm)
            for kdx, count in enumerate(row):
                if kdx == ref_idx:
                    continue
                dd['Count'].append(count)
                dd['Total count'].append(tot)
                dd['Obs nt'].append(nts[kdx])
                dd['Ref nt'].append(ref_nt)
                dd['Frequency'].append(np.nan if tot == 0 else count / tot)
                dd['Position index'].append(jdx)
                dd['Position'].append(pos)
                dd['Name'].append(nm)

    df = pd.DataFrame(dd)
    df = df[df['Total count'] >= 100]
    # This is the row count of the filtered long table (one row per
    # target / position / non-reference nt).
    n_targetsites_in_condition = len(df)

    # Form stats_df
    dd = defaultdict(list)
    pos_range = sorted(set(df['Position index']))
    timer = util.Timer(total=len(pos_range))
    for pos_idx in pos_range:
        timer.update()
        df_s1 = df[df['Position index'] == pos_idx]
        for ref_nt in nts:
            df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
            for obs_nt in nts:
                if obs_nt == ref_nt:
                    continue
                dfs_freq = df_s2[df_s2['Obs nt'] == obs_nt]['Frequency']
                total = len(dfs_freq)
                if total == 0:
                    continue
                num_zeros = sum(dfs_freq == 0)

                dd['Num target sites with zero for mutation'].append(num_zeros)
                dd['Total num target sites for mutation'].append(total)
                dd['Frequency of zero in target sites for mutation'].append(num_zeros / total)
                dd['Num target sites in condition'].append(n_targetsites_in_condition)
                dd['Mean activity'].append(np.mean(dfs_freq))
                dd['Position index'].append(pos_idx)
                dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
                dd['Obs nt'].append(obs_nt)
                dd['Ref nt'].append(ref_nt)
    hm_df = pd.DataFrame(dd)
    hm_df.to_csv(out_dir + '%s.csv' % (treat_nm))

    # Median normalize
    background_range = range(25, 34 + 1)
    in_background = hm_df['Position'].isin(background_range)
    for ref_nt in nts:
        for obs_nt in nts:
            if obs_nt == ref_nt:
                continue
            crit = (hm_df['Ref nt'] == ref_nt) & \
                   (hm_df['Obs nt'] == obs_nt) & \
                   (~np.isnan(hm_df['Mean activity']))
            medi = np.nanmedian(hm_df[crit & in_background]['Mean activity'])
            # NOTE(review): if no background rows exist for this pair, medi
            # is NaN and max(0, NaN) evaluates to 0, which zeroes every row
            # of the pair - confirm that this is the intended outcome.
            hm_df.loc[crit, 'Mean activity'] = hm_df.loc[crit, 'Mean activity'].apply(
                lambda x: max(0, x - medi)
            )
    hm_df.to_csv(out_dir + '%s_median_bg_adj.csv' % (treat_nm))

    return
def calculate_statistics(treat_nm):
    """Write per-position, per-mutation summary statistics to CSV.

    g4 format: data is a dict, keys = target site names
      values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations

    The data is loaded via _data.load_data(treat_nm, 'ag4_poswise_be_adjust'),
    flattened into a long table of non-reference events, restricted to rows
    with 'Total count' >= 50, and summarised for every
    (position index, ref nt, obs nt) with at least one row. The summary is
    written to out_dir + '<treat_nm>.csv'.

    Returns None.
    """
    adj_d = _data.load_data(treat_nm, 'ag4_poswise_be_adjust')

    lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
    nm_to_seq = dict(zip(lib_design['Name (unique)'], lib_design[seq_col]))

    # Statistic of interest: fraction of target sites with non-zero event
    # frequency, per position and mutation type.
    print('Gathering statistics...')
    long_cols = defaultdict(list)
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        timer.update()
        poswise = adj_d[nm]
        ref_seq = nm_to_seq[nm]
        for jdx, row in enumerate(poswise):
            tot = sum(row)
            ref_nt = ref_seq[jdx]
            ref_idx = nt_to_idx[ref_nt]
            for kdx, count in enumerate(row):
                if kdx == ref_idx:
                    continue
                if tot == 0:
                    freq = np.nan
                else:
                    freq = count / tot
                # Insertion order below fixes the column order of the table.
                record = {
                    'Count': count,
                    'Total count': tot,
                    'Obs nt': nts[kdx],
                    'Ref nt': ref_nt,
                    'Frequency': freq,
                    'Position index': jdx,
                    'Position': _data.idx_to_pos(jdx, treat_nm),
                    'Name': nm,
                }
                for col, value in record.items():
                    long_cols[col].append(value)

    df = pd.DataFrame(long_cols)
    df = df[df['Total count'] >= 50]

    stat_cols = defaultdict(list)
    pos_range = sorted(set(df['Position index']))
    timer = util.Timer(total=len(pos_range))
    for pos_idx in pos_range:
        timer.update()
        at_pos = df[df['Position index'] == pos_idx]
        for ref_nt in nts:
            with_ref = at_pos[at_pos['Ref nt'] == ref_nt]
            for obs_nt in nts:
                if obs_nt == ref_nt:
                    continue
                freqs = with_ref[with_ref['Obs nt'] == obs_nt]['Frequency']
                num_zeros = sum(freqs == 0)
                total = len(freqs)
                if total == 0:
                    continue

                summary = {
                    'Num target sites with zero': num_zeros,
                    'Total num target sites': total,
                    'Frequency of zero in target sites': num_zeros / total,
                    'Mean activity': np.mean(freqs),
                    'Position index': pos_idx,
                    'Position': _data.idx_to_pos(pos_idx, treat_nm),
                    'Obs nt': obs_nt,
                    'Ref nt': ref_nt,
                }
                for col, value in summary.items():
                    stat_cols[col].append(value)

    stats_df = pd.DataFrame(stat_cols)
    stats_df.to_csv(out_dir + '%s.csv' % (treat_nm))
    return
def form_data(exp_nm):
    """Tabulate the edited fraction at positions 6 and 7 for every target.

    Loads the data via _data.load_data(exp_nm, 'ag4_poswise_be_adjust') and,
    for every target and every index whose position (from _data.idx_to_pos)
    is 6 or 7 and whose reference nt is the editor's target nt ('C' for
    'CtoTeditor', 'A' for 'AtoGeditor'), records the number and fraction of
    non-reference observations together with the target's category,
    subcategory and gRNA match count. The table is written to
    out_dir + '<exp_nm>.csv'.

    exp_nm: experiment name; used for data, library and editor lookups and
      for the output file name.

    Returns None.
    Raises ValueError if the editor type is neither 'CtoTeditor' nor
    'AtoGeditor'.
    Raises AssertionError if a target with an unrecognised design category
    does not have a match count of 20.
    """
    data = _data.load_data(exp_nm, 'ag4_poswise_be_adjust')
    lib_design, seq_col = _data.get_lib_design(exp_nm)

    # Get target nt
    editor_type = _data.get_editor_type(exp_nm)
    if editor_type == 'CtoTeditor':
        target_nt = 'C'
    elif editor_type == 'AtoGeditor':
        target_nt = 'A'
    else:
        # Previously target_nt was left unbound and failed later with an
        # UnboundLocalError at first use.
        raise ValueError(
            'Unsupported editor type %r for %s' % (editor_type, exp_nm)
        )

    nms = lib_design['Name (unique)']
    nm_to_seq = dict(zip(nms, lib_design[seq_col]))
    nm_to_grna = dict(zip(nms, lib_design['gRNA (20nt)']))
    nm_to_design_cat = dict(zip(nms, lib_design['Design category']))

    dd = defaultdict(list)
    timer = util.Timer(total=len(data))
    for nm in data:
        pw = data[nm]
        seq = nm_to_seq[nm]
        grna = nm_to_grna[nm]
        design_cat = nm_to_design_cat[nm]

        # Get category, subcategory, and match count
        match_count = get_match_count(grna, seq)
        if design_cat == 'guideseq':
            category = 'Off-target series'
            subcategory = nm.split('_')[2]  # gene name
        elif design_cat == 'mismatch':
            category = 'Mismatch series'
            subcategory = nm.split('_')[1]  # series number
        elif design_cat == 'chipseq':
            category = 'Chip series'
            # Previously unset in this branch, which either raised
            # UnboundLocalError or silently reused the previous target's
            # subcategory. Mirrors the category, as the 'vivo' and
            # 'On-target' branches do.
            subcategory = 'Chip series'
        elif design_cat == 'vivo':
            category = 'vivo'
            subcategory = 'vivo'
        else:
            assert match_count == 20, 'fail'
            category = 'On-target'
            subcategory = 'On-target'

        for jdx, row in enumerate(pw):
            pos = _data.idx_to_pos(jdx, exp_nm)
            if pos not in (6, 7):
                continue

            ref_nt = seq[jdx]
            if ref_nt != target_nt:
                continue

            ref_idx = nt_to_idx[ref_nt]
            total = sum(row)
            # Every non-reference observation counts as an edit.
            edit_ct = sum(ct for kdx, ct in enumerate(row) if kdx != ref_idx)

            dd['Edited fraction'].append(edit_ct / total if total > 0 else np.nan)
            dd['Edit count'].append(edit_ct)
            dd['Total count'].append(total)
            dd['Position'].append(pos)
            dd['Ref nt'].append(ref_nt)
            dd['Name'].append(nm)
            dd['Match count'].append(int(match_count))
            dd['Category'].append(category)
            dd['Subcategory'].append(subcategory)

        timer.update()

    df = pd.DataFrame(dd)
    df.to_csv(out_dir + '%s.csv' % (exp_nm))
    return