import pickle
from collections import defaultdict

import numpy as np
import pandas as pd

# Module-level names assumed to be provided elsewhere in this codebase:
# inp_dir, out_dir, _data, util, nts, nt_to_idx, train_models, save_results.


def gather_statistics(exp_nm, params):
  (muts, allowed_pos, feature_radius) = params

  # Load data
  data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  # Prepare data: require coverage, restrict to on-target sites and
  # allowed protospacer positions
  data = data[data['Total count'] >= 100].copy()
  data['Frequency'] = data['Count'] / data['Total count']
  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name'].isin(ontarget_sites)]
  data = data[data['Position'].isin(allowed_pos)].copy()
  data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
  data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

  # Annotate with local sequence context
  lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total = len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name']]
    pidx = row['Position'] + lib_zero_idx
    # feature_radius nt on each side of the edited position, excluding it
    local_context = seq[pidx - feature_radius : pidx] + seq[pidx + 1 : pidx + feature_radius + 1]
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]

  # Gather statistics
  for mut_nm in muts:
    print(mut_nm)
    mut = muts[mut_nm]
    if len(mut) == 1:
      d_temp = data[data['Mutation'] == mut[0]].copy()
    else:
      d_temp = data[data['Mutation'].isin(mut)].copy()
    d_temp['Mutation'] = mut_nm
    d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']
    # Sum frequencies of grouped substitutions at the same site and position
    group_cols = [s for s in d_temp.columns if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
    d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()
    for ml_task in ['classify_zero', 'regress_nonzero']:
      print(ml_task)
      results = train_models(exp_nm, d_temp, mut_nm, ml_task)
      save_results(exp_nm, mut_nm, ml_task, results)
  return
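# A minimal sketch of how this variant might be invoked. The mutation groups,
# position window, and radius below are illustrative assumptions, not values
# taken from this repo.
def _example_gather_statistics_call():
  example_muts = {
    'C_T': ['C_T'],          # a single substitution class
    'C_GA': ['C_G', 'C_A'],  # grouped substitutions; frequencies are summed
  }
  example_params = (example_muts, range(3, 8 + 1), 10)
  gather_statistics('example_exp', example_params)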
def gather_statistics(exp_nm):
  # Load data
  data = pd.read_csv(inp_dir + '_batch_adjusted_all_ratios-ps0_1bpcorrect.csv', index_col = 0)
  data = data[data['Condition'] == exp_nm]

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name (unique)'].isin(ontarget_sites)].copy()

  # Annotate with local sequence context
  # lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total = len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name (unique)']]
    lib_zero_idx = _data.pos_to_idx_safe(0, exp_nm, row['Name (unique)'])
    # local_context = row['gRNA (20nt)']
    # 30-nt window: 9 nt upstream of protospacer position 0 through position 20
    local_context = seq[lib_zero_idx - 9 : lib_zero_idx + 20 + 1]
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]
  print(data.shape)

  results = train_models(exp_nm, data, 'Log10 batch-adjusted base edit to indel ratio')
  save_results(exp_nm, results)
  return
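# A toy check of the context window used above: seq[z - 9 : z + 20 + 1]
# spans 30 nt, from 9 nt upstream of protospacer position 0 through position
# 20 inclusive. The sequence and index here are hypothetical.
def _example_local_context_window():
  seq = 'N' * 9 + 'ACGTACGTACGTACGTACGT' + 'G'  # 9 nt flank + 20 nt spacer + 1 nt
  z = 9  # index of protospacer position 0 in this toy sequence
  local_context = seq[z - 9 : z + 20 + 1]
  assert len(local_context) == 30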
def gather_statistics(celltype, lib_nm, editor_nm):
  print(celltype, lib_nm, editor_nm)
  [rep1, rep2] = _data.get_replicates(celltype, lib_nm, editor_nm)
  df1 = pd.read_csv(inp_dir + '%s.csv' % (rep1), index_col = 0)
  df2 = pd.read_csv(inp_dir + '%s.csv' % (rep2), index_col = 0)

  lib_nm = _data.get_lib_nm(rep1)
  lib_design, seq_col = _data.get_lib_design(rep1)
  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)

  # Prepare data
  # data = data[data['Total count'] >= 100]
  df1 = df1[df1['Name (unique)'].isin(ontarget_sites)]
  df2 = df2[df2['Name (unique)'].isin(ontarget_sites)]

  id_cols = [
    'Name (unique)',
    'gRNA (20nt)',
    seq_col,
  ]
  mdf = df1.merge(df2, on = id_cols, suffixes = ['_r1', '_r2'])

  stat_col = 'Fraction edited'
  mdf['absdiff'] = np.abs(mdf['%s_r1' % (stat_col)] - mdf['%s_r2' % (stat_col)])
  mdf['abslfc'] = np.abs(
    np.log2(mdf['%s_r1' % (stat_col)]) - np.log2(mdf['%s_r2' % (stat_col)]))

  n_col = 'Total count'
  mdf['Total n'] = mdf['%s_r1' % (n_col)] + mdf['%s_r2' % (n_col)]

  mdf.to_csv(out_dir + '%s_%s_%s.csv' % (celltype, lib_nm, editor_nm))
  return
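# Sketch of one plausible downstream use of the replicate-concordance table
# written above. The coverage and agreement thresholds are illustrative
# assumptions, not values from this repo.
def _example_filter_concordant_sites(celltype, lib_nm, editor_nm):
  mdf = pd.read_csv(out_dir + '%s_%s_%s.csv' % (celltype, lib_nm, editor_nm), index_col = 0)
  # Keep well-covered target sites whose replicate editing fractions agree
  crit = (mdf['Total n'] >= 200) & (mdf['absdiff'] <= 0.05)
  return mdf[crit]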
def fig_editing_profiles(treat_nm):
  '''
    g4 format: data is a dict
      keys = target site names
      values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations
  '''
  adj_d = pickle.load(open(inp_dir + '%s.pkl' % (treat_nm), 'rb'))

  lib_design, seq_col = _data.get_lib_design(treat_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  lib_nm = _data.get_lib_nm(treat_nm)
  ontarget_nms = set(_data.get_ontarget_sites(lib_design, lib_nm))

  '''
    Filter treatment mutations that match the unedited background profile
    using the statistic: fraction of target sites with non-zero event frequency
  '''
  print('Forming long df...')
  dd = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    timer.update()
    if nm not in ontarget_nms:
      continue
    pw = adj_d[nm]
    seq = nm_to_seq[nm]
    for jdx in range(len(pw)):
      tot = np.nansum(pw[jdx])
      ref_nt = seq[jdx]
      ref_idx = nt_to_idx[ref_nt]
      for kdx in range(len(pw[jdx])):
        if kdx == ref_idx:
          continue
        count = pw[jdx][kdx]
        dd['Count'].append(count)
        dd['Total count'].append(tot)
        dd['Obs nt'].append(nts[kdx])
        dd['Ref nt'].append(ref_nt)
        if tot == 0:
          dd['Frequency'].append(np.nan)
        else:
          dd['Frequency'].append(count / tot)
        dd['Position index'].append(jdx)
        dd['Position'].append(_data.idx_to_pos(jdx, treat_nm))
        dd['Name'].append(nm)

  df = pd.DataFrame(dd)
  df = df[df['Total count'] >= 100]
  n_targetsites_in_condition = len(df)

  # Form stats_df
  dd = defaultdict(list)
  pos_range = sorted(set(df['Position index']))
  timer = util.Timer(total = len(pos_range))
  for pos_idx in pos_range:
    timer.update()
    df_s1 = df[df['Position index'] == pos_idx]
    for ref_nt in nts:
      df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
      for obs_nt in nts:
        if obs_nt == ref_nt:
          continue
        crit = (df_s2['Obs nt'] == obs_nt)
        dfs = df_s2[crit]
        dfs_freq = dfs['Frequency']
        num_zeros = sum(dfs_freq == 0)
        total = len(dfs_freq)
        if total == 0:
          continue
        dd['Num target sites with zero for mutation'].append(num_zeros)
        dd['Total num target sites for mutation'].append(total)
        dd['Frequency of zero in target sites for mutation'].append(num_zeros / total)
        dd['Num target sites in condition'].append(n_targetsites_in_condition)
        dd['Mean activity'].append(np.mean(dfs_freq))
        dd['Position index'].append(pos_idx)
        dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
        dd['Obs nt'].append(obs_nt)
        dd['Ref nt'].append(ref_nt)

  hm_df = pd.DataFrame(dd)
  hm_df.to_csv(out_dir + '%s.csv' % (treat_nm))

  # Median normalize
  background_range = range(25, 34 + 1)
  for ref_nt in nts:
    for obs_nt in nts:
      if obs_nt == ref_nt:
        continue
      crit = (hm_df['Ref nt'] == ref_nt) & (hm_df['Obs nt'] == obs_nt) & (~np.isnan(hm_df['Mean activity']))
      medi = np.nanmedian(hm_df[crit & (hm_df['Position'].isin(background_range))]['Mean activity'])
      hm_df.loc[crit, 'Mean activity'] = hm_df.loc[crit, 'Mean activity'].apply(lambda x: max(0, x - medi))

  hm_df.to_csv(out_dir + '%s_median_bg_adj.csv' % (treat_nm))
  return
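# A toy dict in the g4 format described in the docstring above: keys are
# target site names, values are (site length, 4) int arrays of Q30 counts,
# with column order following the module-level `nts`. The site name, length,
# and counts here are hypothetical.
def _example_g4_data():
  site_len = 55
  pw = np.zeros((site_len, 4), dtype = int)
  pw[6] = [3, 90, 2, 5]  # e.g., observation counts at sequence index 6
  return {'example_site_0': pw}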
def gather_statistics(exp_nm):
  feature_radius = 10
  allowed_pos = range(3, 8 + 1)

  # Load data
  data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  # Prepare data
  data = data[data['Total count'] >= 100].copy()
  data['Frequency'] = data['Count'] / data['Total count']
  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name'].isin(ontarget_sites)]
  data = data[data['Position'].isin(allowed_pos)].copy()
  data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
  # data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

  # Annotate with local sequence context
  lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total = len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name']]
    pidx = row['Position'] + lib_zero_idx
    local_context = seq[pidx - feature_radius : pidx] + seq[pidx + 1 : pidx + feature_radius + 1]
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]

  # # Gather statistics
  # for mut_nm in muts:
  #   print(mut_nm)
  #   mut = muts[mut_nm]
  #   if len(mut) == 1:
  #     d_temp = data[data['Mutation'] == mut[0]]
  #   else:
  #     d_temp = data[data['Mutation'].isin(mut)]
  #   d_temp['Mutation'] = mut_nm
  #   d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']
  #   group_cols = [s for s in d_temp.columns if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
  #   d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

  print(data.columns)
  print(set(data['Mutation']))

  # Pivot to one row per (site, position, context) with a frequency column
  # per C-to-D substitution
  acc_muts = [
    'C_T',
    'C_G',
    'C_A',
  ]
  data = data[data['Mutation'].isin(acc_muts)]
  data = data.drop(columns = ['Count', 'Total count', 'Ref nt', 'Obs nt'])
  data = data.pivot_table(
    index = ['Name', 'Position', 'Local context'],
    columns = 'Mutation',
    values = 'Frequency',
  ).reset_index()
  data = data.fillna(value = 0)

  # Train models on each conditional editing-outcome ratio, specified as
  # (name, numerator columns, denominator columns); D = {T, G, A}
  ratio_specs = [
    ('C_GA_over_C_D', ['C_G', 'C_A'], ['C_T', 'C_G', 'C_A']),
    ('C_T_over_C_D', ['C_T'], ['C_T', 'C_G', 'C_A']),
    ('C_G_over_C_D', ['C_G'], ['C_T', 'C_G', 'C_A']),
    ('C_A_over_C_D', ['C_A'], ['C_T', 'C_G', 'C_A']),
    ('C_G_over_C_GA', ['C_G'], ['C_A', 'C_G']),
  ]
  for mut_name, numer_cols, denom_cols in ratio_specs:
    data['Frequency'] = data[numer_cols].sum(axis = 1) / data[denom_cols].sum(axis = 1)
    data = data.dropna()
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + mut_name
    print(data.shape)
    for ml_task in ['regress_nonzero', 'classify_zero']:
      print(ml_task)
      results = train_models(exp_nm, data, mut_name, ml_task)
      save_results(exp_nm, mut_name, ml_task, results)
  return
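# A toy check of the pivot-and-ratio logic above on one hypothetical row:
# with per-outcome frequencies C_T = 0.30, C_G = 0.05, C_A = 0.05, the
# transversion share C_GA_over_C_D is (0.05 + 0.05) / 0.40 = 0.25.
def _example_ratio_math():
  row = {'C_T': 0.30, 'C_G': 0.05, 'C_A': 0.05}
  denom = row['C_T'] + row['C_G'] + row['C_A']
  assert abs((row['C_G'] + row['C_A']) / denom - 0.25) < 1e-9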
def form_data(exp_nm, start_idx, end_idx):
  '''
    Annotate library design with total count, edited count,
    fraction edited, etc.
  '''
  data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design = lib_design.iloc[start_idx : end_idx + 1]

  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  lib_design = lib_design[lib_design['Name (unique)'].isin(ontarget_sites)]

  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  stats_dd = defaultdict(list)
  new_data = dict()

  nms_shared = [nm for nm in nms if nm in data]
  timer = util.Timer(total = len(nms_shared))
  for iter, nm in enumerate(nms_shared):
    df = data[nm]
    seq = nm_to_seq[nm]

    num_mismatches = lambda x, y: sum([bool(n1 != n2) for n1, n2 in zip(x, y)])
    if 'index' in df.columns:
      df = df[[col for col in df.columns if col != 'index']]

    if len(df) == 0:
      continue

    ## 8/21/19
    '''
      Simulate bystander precision task in 12kChar by using the
      substrate nucleotide closest to the editor-specific center nt
    '''
    editor = _data.get_editor_nm(exp_nm)
    editor_to_central_pos = {
      'ABE': 6,
      'ABE-CP': 6,
      'AID': 6,
      'BE4': 6,
      'BE4-CP': 8,
      'CDA': 5,
      'eA3A': 6,
      'evoAPOBEC': 5,
    }
    if editor in editor_to_central_pos:
      central_pos = editor_to_central_pos[editor]
    else:
      central_pos = 6
    substrate = 'A' if 'ABE' in editor else 'C'

    nt_cols = [f'{substrate}{pos}' for pos in range(-3, 15) if f'{substrate}{pos}' in df.columns]
    central_col = find_central_col(central_pos, nt_cols, substrate)
    if central_col is None:
      continue

    mut_cols = [col for col in df.columns if col != 'Count']
    col_to_ref_nt = {col: col[0] for col in mut_cols}

    df_dd = defaultdict(list)
    for idx, row in df.iterrows():
      df_dd['Num. edits'].append(get_num_edits(row, col_to_ref_nt))
      df_dd['Simulated precise'].append(is_simulated_precise(row, central_col, col_to_ref_nt))
    for col in df_dd:
      df[col] = df_dd[col]

    numer = sum(df[df['Simulated precise'] == True]['Count'])
    denom = sum(df[df['Num. edits'] > 0]['Count'])
    sim_precision = numer / denom if denom > 0 else np.nan

    stats_dd['Simulated bystander precision at editor-specific central nt'].append(sim_precision)
    stats_dd['Simulated bystander position'].append(int(central_col[1:]))
    stats_dd['Simulated bystander position, distance to center'].append(int(central_col[1:]) - central_pos)

    edited_ct = sum(df[df['Num. edits'] > 0]['Count'])
    stats_dd['Edited count'].append(edited_ct)
    stats_dd['Name (unique)'].append(nm)

    timer.update()

  stats_df_collected = pd.DataFrame(stats_dd)
  stats_df = lib_design.merge(
    stats_df_collected,
    on = 'Name (unique)',
    how = 'outer',
  )
  stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' % (exp_nm, start_idx, end_idx))
  return
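# The three helpers called in form_data are not defined in this file.
# Minimal sketches consistent with how they are used above; these are
# assumptions about their behavior, not this repo's implementations, and
# assume wildtype at a position is recorded as the reference nt itself.
def find_central_col(central_pos, nt_cols, substrate):
  # Pick the substrate column (e.g., 'A6' or 'C5') closest to central_pos
  if len(nt_cols) == 0:
    return None
  return min(nt_cols, key = lambda col: abs(int(col[1:]) - central_pos))

def get_num_edits(row, col_to_ref_nt):
  # Count genotype columns whose observed nt differs from the reference nt
  return sum(1 for col, ref_nt in col_to_ref_nt.items()
             if not pd.isna(row[col]) and row[col] != ref_nt)

def is_simulated_precise(row, central_col, col_to_ref_nt):
  # Precise: edited at the central column and nowhere else
  edited_central = (not pd.isna(row[central_col])) and row[central_col] != col_to_ref_nt[central_col]
  return bool(edited_central and get_num_edits(row, col_to_ref_nt) == 1)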