def adjust_treatment_control(treat_nm, control_nm, start_idx, end_idx):
    '''
    Subtract control from treatment for one slice of the library design.

    Treatment data is loaded from 'ah6a1a_hf_bc' and control data from
    'h6_anyindel'. The library design is restricted to positional rows
    start_idx..end_idx (end inclusive). Every target site in that slice
    that is present in both datasets is passed through
    subtract_treatment_control; sites without control data are left out
    of the result. The resulting dict is pickled to out_dir as
    '<treat_nm>_<start_idx>_<end_idx>.pkl'.
    '''
    treatment = _data.load_data(treat_nm, 'ah6a1a_hf_bc')
    control = _data.load_data(control_nm, 'h6_anyindel')

    design, _ = _data.get_lib_design(treat_nm)
    site_names = design.iloc[start_idx:end_idx + 1]['Name (unique)']

    print('Subtracting control from treatment data...')
    present = [name for name in site_names if name in treatment]
    subtracted = {}
    progress = util.Timer(total=len(present))
    for name in present:
        treat_site = treatment[name]
        if name in control:
            subtracted[name] = subtract_treatment_control(
                treat_site, control[name])
            # Progress only advances for sites that were actually adjusted.
            progress.update()

    # Write
    out_fn = out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, end_idx)
    with open(out_fn, 'wb') as f:
        pickle.dump(subtracted, f)
    return
def adjust_treatment_control(treat_nm, control_nm):
    '''
    Write a control-adjusted indel table for one treatment experiment.

    Both experiments are loaded from 'h6_anyindel', rows with
    'Total count' < 100 are dropped from each, and the two tables are
    inner-joined on 'Name' (treatment columns get the suffix '_t',
    control columns '_c').

    For every row:
        Indel count adj    = max(0, indels_t - (indels_c / total_c) * total_t)
        Indel fraction adj = Indel count adj / total_t

    The merged table is written to out_dir as '<treat_nm>.csv'.

    The adjustment is computed column-wise, so an empty merge produces an
    empty CSV with the adjusted columns instead of raising KeyError.
    Missing counts propagate as NaN in the adjusted columns.
    '''
    treat_data = _data.load_data(treat_nm, 'h6_anyindel')
    control_data = _data.load_data(control_nm, 'h6_anyindel')

    treat_data = treat_data[treat_data['Total count'] >= 100]
    control_data = control_data[control_data['Total count'] >= 100]

    mdf = treat_data.merge(
        control_data,
        on=['Name'],
        how='inner',
        suffixes=('_t', '_c'),
    )

    # Indels expected in the treatment sample if it behaved like the control.
    control_rate = mdf['Indel count_c'] / mdf['Total count_c']
    expected_background = control_rate * mdf['Total count_t']

    # Negative adjusted counts are floored at zero.
    mdf['Indel count adj'] = (
        mdf['Indel count_t'] - expected_background).clip(lower=0)
    mdf['Indel fraction adj'] = mdf['Indel count adj'] / mdf['Total count_t']

    mdf.to_csv(out_dir + '%s.csv' % (treat_nm))
    return
def filter_inprofile_batch_effects():
    '''
    Remove listed in-profile batch-effect mutations from every non-Cas9
    treatment, or pass the data through unchanged.

    Reads DATA_DIR/batch_effects.csv. For each treatment in
    treat_control_df whose name does not contain 'Cas9':
      * if its batch (exp_nm_to_batch) appears in the CSV, load the
        'ag4a2_adjust_batch_effects' data, call filter_mutations with the
        CSV rows for that batch, and pickle the result to
        out_dir/<treat_nm>.pkl;
      * otherwise copy inp_dir/<treat_nm>.pkl into out_dir unchanged.

    The pass-through copy uses shutil.copy rather than a shell 'cp'
    command, so paths containing spaces or shell metacharacters are
    handled correctly and no shell is spawned.
    '''
    import shutil

    batch_effects = pd.read_csv(_config.DATA_DIR + 'batch_effects.csv')
    inprofile_batches = set(batch_effects['Batch'])

    be_treatments = [
        s for s in treat_control_df['Treatment'] if 'Cas9' not in s
    ]

    timer = util.Timer(total=len(be_treatments))
    for treat_nm in be_treatments:
        batch = exp_nm_to_batch[treat_nm]

        if batch in inprofile_batches:
            print(treat_nm, batch)
            adj_d = _data.load_data(treat_nm, 'ag4a2_adjust_batch_effects')
            to_remove = batch_effects[batch_effects['Batch'] == batch]

            lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
            nm_to_seq = dict(
                zip(lib_design['Name (unique)'], lib_design[seq_col]))

            adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)
            with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
                pickle.dump(adj_d, f)
        else:
            # Nothing to filter for this batch: copy the input through.
            inp_fn = inp_dir + '%s.pkl' % (treat_nm)
            shutil.copy(inp_fn, out_dir)

        timer.update()

    return
def remove_batch_effects(treat_nm, start_idx, end_idx):
    '''
    Remove batch-effect mutations from one slice of a treatment experiment.

    Reads inp_dir/removed_batch_effects_<lib_nm>.csv. If that table is
    empty, the upstream pickle
    OUT_PLACE/ah6a1b_subtract/<treat_nm>_<start_idx>_<end_idx>.pkl is
    copied to out_dir unchanged. Otherwise the rows matching this
    treatment's batch (exp_nm_to_batch) and the target-site names in
    library-design rows start_idx..end_idx (end inclusive) are passed to
    filter_mutations, and the filtered data is pickled to
    out_dir/<treat_nm>_<start_idx>_<end_idx>.pkl.

    The pass-through copy uses shutil.copy instead of a shell 'cp', and
    the 'ah6a1b_subtract' data is only loaded when it is actually needed.
    '''
    import shutil

    batch_nm = exp_nm_to_batch[treat_nm]

    lib_design, seq_col = _data.get_lib_design(treat_nm)
    lib_nm = _data.get_lib_nm(treat_nm)
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    nms = lib_design['Name (unique)']
    nm_to_seq = dict(zip(nms, lib_design[seq_col]))

    batch_muts_to_remove = pd.read_csv(
        inp_dir + 'removed_batch_effects_%s.csv' % (lib_nm), index_col=0)

    out_pkl = out_dir + f'{treat_nm}_{start_idx}_{end_idx}.pkl'

    if len(batch_muts_to_remove) == 0:
        # No batch effects recorded for this library: pass data through.
        inp_pkl = _config.OUT_PLACE + \
            f'ah6a1b_subtract/{treat_nm}_{start_idx}_{end_idx}.pkl'
        shutil.copy(inp_pkl, out_pkl)
        return

    adj_d = _data.load_data(treat_nm, 'ah6a1b_subtract')

    # Remove mutations
    to_remove = batch_muts_to_remove[batch_muts_to_remove['Batch'] ==
                                     batch_nm]
    to_remove = to_remove[to_remove['Name'].isin(nms)]
    adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)

    with open(out_pkl, 'wb') as f:
        pickle.dump(adj_d, f)

    return
def indel_anyindel(exp_nm):
    '''
    Summarise indel, deletion, insertion and wildtype counts per target site.

    Loads the 'ah6c_reduce_1bp_indel_fq' data for exp_nm (a mapping from
    target-site name to a dataframe with 'Category' and 'Count' columns)
    and writes out_dir/<exp_nm>.csv with one row per target site:
    'Total count', 'Name', and a '<X> count' / '<X> freq' pair for
    X in Indel (Category != 'wildtype'), Del, Ins, Wildtype.
    Frequencies are NaN when the total count is zero.

    If the data cannot be loaded, prints an error and exits with status 1.
    '''
    try:
        data = _data.load_data(exp_nm, 'ah6c_reduce_1bp_indel_fq')
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit pass through.
        print('Error : could not load data')
        sys.exit(1)

    dd = defaultdict(list)
    timer = util.Timer(total=len(data))
    for target_nm in data:
        df = data[target_nm]

        tot_count = sum(df['Count'])
        dd['Total count'].append(tot_count)
        dd['Name'].append(target_nm)

        category = df['Category']
        # Order matters: it fixes the column order of the output CSV.
        selections = (
            ('Indel', category != 'wildtype'),
            ('Del', category == 'del'),
            ('Ins', category == 'ins'),
            ('Wildtype', category == 'wildtype'),
        )
        for label, crit in selections:
            count = sum(df[crit]['Count'])
            dd['%s count' % label].append(count)
            if tot_count != 0:
                dd['%s freq' % label].append(count / tot_count)
            else:
                dd['%s freq' % label].append(np.nan)

        timer.update()

    df = pd.DataFrame(dd)
    df.to_csv(out_dir + '%s.csv' % (exp_nm))
    return
def form_L2_group_ae_newgenotype_Cas9_adjust(group_nm, l1_nms):
    '''
    Pool 'ae_newgenotype_Cas9_adjust' data from several L1 experiments
    into one L2 group and pickle it to out_dir/<group_nm>.pkl.

    g5 format: data is a dict, keys = target site names
      values = dfs, with columns as '%s%s' % (nt, position), and 'Count' column

    Combine: two strategies
      1. Normalize readcount, then add (equal contribution)
      2. Directly add (weighted by readcount)
        * using this strategy

    Experiments whose data loads as None are skipped. The library design
    of the first experiment in l1_nms defines which target sites are
    considered.

    Uses pd.concat for stacking; DataFrame.append was removed in
    pandas 2.0.
    '''
    datas = [
        _data.load_data(l1_nm, 'ae_newgenotype_Cas9_adjust')
        for l1_nm in l1_nms
    ]
    datas = [s for s in datas if s is not None]
    lib_design, seq_col = _data.get_lib_design(l1_nms[0])

    group_ds = dict()
    timer = util.Timer(total=len(lib_design))
    for _, row in lib_design.iterrows():
        nm = row['Name (unique)']
        timer.update()

        group_d = None
        for data in datas:
            if nm not in data:
                continue

            d = data[nm]
            d = d.drop(columns=['_Sequence Context', '_Cutsite', '_ExpDir'])
            nt_cols = [s for s in d.columns if s != 'Count']

            # groupby drops NaN keys, so use '.' as a temporary placeholder.
            d = d.fillna(value='.')

            if group_d is None:
                group_d = d
            else:
                group_d = pd.concat([group_d, d], ignore_index=True)
                group_d = group_d.groupby(nt_cols)['Count'].sum()
                group_d = group_d.reset_index()

        if group_d is not None:
            group_d = group_d.sort_values(by='Count', ascending=False)
            group_d = group_d.reset_index()
            # Restore the placeholder back to NaN.
            group_d = group_d.replace(to_replace='.', value=np.nan)
            group_ds[nm] = group_d

    with open(out_dir + '%s.pkl' % (group_nm), 'wb') as f:
        pickle.dump(group_ds, f)

    return
def adjust_treatment_control(treat_nm, control_nm):
    '''
    Adjust 'e_newgenotype_Cas9' treatment data against its control.

    For every target site in the treatment's library design:
      * no treatment rows -> status 'No treatment', site omitted;
      * no control rows   -> status 'No control', treatment rows are kept
                             unadjusted;
      * otherwise         -> status 'Adjusted', result of
                             build_new_cas(control_rows, treatment_rows).

    Writes out_dir/<treat_nm>.pkl (dict: site name -> dataframe) and
    out_dir/<treat_nm>_stats.csv (per-site status).

    The 'No control' branch stores the site's treatment rows; it
    previously referenced an undefined name and raised NameError.
    '''
    cas_data = _data.load_data(treat_nm, 'e_newgenotype_Cas9')
    lib_data = _data.load_data(control_nm, 'e_newgenotype_Cas9')
    cas_data = cas_data.drop(columns='ULMI count')
    lib_data = lib_data.drop(columns='ULMI count')

    lib_design, seq_col = _data.get_lib_design(treat_nm)

    adj_d = dict()
    stats_dd = defaultdict(list)
    timer = util.Timer(total=len(lib_design))
    for _, row in lib_design.iterrows():
        nm = row['Name (unique)']
        timer.update()

        cas = cas_data[cas_data['_Experiment'] == nm]
        lib = lib_data[lib_data['_Experiment'] == nm]

        stats_dd['Name'].append(nm)

        if len(cas) == 0:
            stats_dd['Status'].append('No treatment')
            continue

        if len(lib) == 0:
            # Nothing to adjust against: keep the treatment data as-is.
            stats_dd['Status'].append('No control')
            adj_d[nm] = cas
            continue

        stats_dd['Status'].append('Adjusted')
        adj_d[nm] = build_new_cas(lib, cas)

    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)

    stats_df = pd.DataFrame(stats_dd)
    stats_df.to_csv(out_dir + '%s_stats.csv' % (treat_nm))
    return
def adjust_treatment_control(treat_nm, control_nm, start_idx, end_idx):
    '''
    Subtract control from treatment for one slice of the library design.

    g5 format: data is a dict, keys = target site names
      values = dfs, with columns as '%s%s' % (nt, position), and 'Count' column

    Treatment data comes from 'ag5a1a_hf_bc', control data from
    'g5_combin_be'. Only library-design rows start_idx..end_idx (end
    inclusive) are processed. Each site found in both datasets is passed,
    with its design sequence, to subtract_treatment_control; sites with no
    control data are left out. The result is pickled to out_dir as
    '<treat_nm>_<start_idx>_<end_idx>.pkl'.
    '''
    treatment = _data.load_data(treat_nm, 'ag5a1a_hf_bc')
    control = _data.load_data(control_nm, 'g5_combin_be')

    design, seq_col = _data.get_lib_design(treat_nm)
    design = design.iloc[start_idx:end_idx + 1]
    site_names = design['Name (unique)']
    seq_by_name = dict(zip(site_names, design[seq_col]))

    print('Subtracting control from treatment data...')
    present = [name for name in site_names if name in treatment]
    subtracted = {}
    progress = util.Timer(total=len(present))
    for name in present:
        treat_site = treatment[name]
        if name in control:
            subtracted[name] = subtract_treatment_control(
                treat_site, control[name], seq_by_name[name])
            # Progress only advances for sites that were actually adjusted.
            progress.update()

    # Write
    out_fn = out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, end_idx)
    with open(out_fn, 'wb') as f:
        pickle.dump(subtracted, f)
    return
def load_Y():
    '''
    Combine data together in human friendly formats.

    For every non-Cas9 treatment in treat_control_df, reads
    OUT_PLACE/c_poswise_basic/<treat_nm>.csv, reduces it with get_means,
    writes out_dir/means_<treat_nm>.csv, and finally pickles the dict
    {treat_nm: means} to out_dir/Y.pkl.

    Before processing, file modification times in c_poswise_basic are
    compared against ag4_poswise_be_adjust; if the oldest c file predates
    the newest ag4 file the user is asked to confirm, and anything other
    than 'y' exits with status 0.

    The per-treatment loads of the ag4 data and library design were only
    needed by a call that is no longer made, so they are not performed.
    '''
    import pickle

    all_means = dict()

    print('WARNING: script depends on c_poswise_basic')
    c_fold = _config.OUT_PLACE + 'c_poswise_basic/'
    c_mtimes = [os.path.getmtime(c_fold + fn) for fn in os.listdir(c_fold)]

    ag4_fold = _config.OUT_PLACE + 'ag4_poswise_be_adjust/'
    ag4_mtimes = [
        os.path.getmtime(ag4_fold + fn) for fn in os.listdir(ag4_fold)
    ]

    # min()/max() raise ValueError on an empty folder listing, so only
    # compare when both folders contain files.
    if c_mtimes and ag4_mtimes and min(c_mtimes) < max(ag4_mtimes):
        prompt = 'The most recent modification to a file in ag4 occurred after the earliest modification in c -- c might not be up to date. Continue? (y) '
        ans = input(prompt)
        if ans != 'y':
            sys.exit(0)

    timer = util.Timer(total=len(treat_control_df))
    for _, row in treat_control_df.iterrows():
        timer.update()
        treat_nm = row['Treatment']
        if 'Cas9' in treat_nm:
            continue

        pw_df = pd.read_csv(_config.OUT_PLACE +
                            'c_poswise_basic/%s.csv' % (treat_nm))

        means = get_means(pw_df)
        means.to_csv(out_dir + 'means_%s.csv' % (treat_nm))
        all_means[treat_nm] = means

    with open(out_dir + 'Y.pkl', 'wb') as f:
        pickle.dump(all_means, f)

    return
def form_L2_group_ag4_poswise_be_adjust(group_nm, l1_nms):
    '''
    Pool 'ag4_poswise_be_adjust' data from several L1 experiments into one
    L2 group and pickle it to out_dir/<group_nm>.pkl.

    g4 format: data is a dict, keys = target site names
      values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations

    Combine: two strategies
      1. Normalize readcount, then add (equal contribution)
      2. Directly add (weighted by readcount)
        * using this strategy

    Experiments whose data loads as None are skipped. The library design
    of the first experiment in l1_nms defines which target sites are
    considered.

    The running sum is built with 'a + b' rather than 'a += b', so the
    array belonging to the first experiment is never modified in place.
    '''
    datas = [
        _data.load_data(l1_nm, 'ag4_poswise_be_adjust') for l1_nm in l1_nms
    ]
    datas = [s for s in datas if s is not None]
    lib_design, seq_col = _data.get_lib_design(l1_nms[0])

    group_ds = dict()
    timer = util.Timer(total=len(lib_design))
    for _, row in lib_design.iterrows():
        nm = row['Name (unique)']
        timer.update()

        group_d = None
        for data in datas:
            if nm not in data:
                continue
            d = data[nm]
            if group_d is None:
                group_d = d
            else:
                # Not '+=': that would write into the first experiment's
                # array, which group_d still aliases at this point.
                group_d = group_d + d

        if group_d is not None:
            group_ds[nm] = group_d

    with open(out_dir + '%s.pkl' % (group_nm), 'wb') as f:
        pickle.dump(group_ds, f)

    return
def adjust_treatment_control(treat_nm, control_nm):
    '''
    Filter 'h6_anyindel' treatment data against its control.

    h6 format: data is a dict, keys = target site names
      values = dfs
        'Category', 'Indel start', 'Indel end', 'Indel length',
        'MH length', 'Inserted bases', 'Count', 'Name',

    Steps:
      1. Filter positions with abnormally high control mutation frequency
         (filter_high_control_muts). Sites without control data are kept
         unadjusted; sites for which the filter returns None are dropped.
      2. Gather per-mutation p-values for treatment mutations that can be
         explained by the control frequency
         (gather_stats_binom_control_muts), flag each as 'FDR accept', and
         pass the non-accepted rows to filter_binom_control_muts.

    Writes to out_dir: '<treat_nm>_stats.csv', '<treat_nm>_hc_dec.csv',
    '<treat_nm>_bc_dec.csv' and the filtered data as '<treat_nm>.pkl'.

    Uses pd.concat for stacking; DataFrame.append was removed in
    pandas 2.0.
    '''
    treat_data = _data.load_data(treat_nm, 'h6_anyindel')
    control_data = _data.load_data(control_nm, 'h6_anyindel')

    lib_design, seq_col = _data.get_lib_design(treat_nm)
    nm_to_seq = dict(zip(lib_design['Name (unique)'], lib_design[seq_col]))

    adj_d = dict()
    stats_dd = defaultdict(list)

    # Filter positions with abnormally high control mut freq.
    hc_decisions = defaultdict(list)
    print('Filtering positions with high frequency control mutations...')
    timer = util.Timer(total=len(lib_design))
    for _, row in lib_design.iterrows():
        timer.update()
        nm = row['Name (unique)']
        seq = row[seq_col]

        stats_dd['Name'].append(nm)

        if nm not in treat_data:
            stats_dd['Status'].append('No treatment')
            continue

        t = treat_data[nm]
        if nm not in control_data:
            stats_dd['Status'].append('No control')
            adj_d[nm] = t
            continue

        stats_dd['Status'].append('Adjusted')
        c = control_data[nm]

        # Adjust
        t = filter_high_control_muts(t, c, seq, treat_nm, nm, hc_decisions)
        if t is not None:
            adj_d[nm] = t

    stats_df = pd.DataFrame(stats_dd)
    stats_df.to_csv(out_dir + '%s_stats.csv' % (treat_nm))

    hc_df = pd.DataFrame(hc_decisions)
    hc_df = hc_df.sort_values(by='max_hf_freq', ascending=False)
    hc_df = hc_df.reset_index(drop=True)
    hc_df.to_csv(out_dir + '%s_hc_dec.csv' % (treat_nm))

    # Filter treatment mutations that can be explained by control freq.
    # In practice, this step is most effective for control mutations with
    # relatively high frequency => relatively high variance
    print('Gathering statistics on treatment mutations explained by control mutations...')
    bc_decisions = defaultdict(list)
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        t = adj_d[nm]
        if nm not in control_data:
            continue
        c = control_data[nm]
        seq = nm_to_seq[nm]

        gather_stats_binom_control_muts(t, c, seq, treat_nm, nm, bc_decisions)
        timer.update()

    # Using global statistics, filter mutations while controlling false
    # discovery rate
    bc_fdr_threshold = 0.05
    bc_df = pd.DataFrame(bc_decisions)
    # .copy() so the column assignment below does not write into a slice.
    other_distribution = bc_df[bc_df['pval'] > 0.995].copy()
    bc_df = bc_df[bc_df['pval'] <= 0.995]
    bc_df = bc_df.sort_values(by='pval')
    bc_df = bc_df.reset_index(drop=True)

    # NOTE(review): each sorted p-value is compared independently with its
    # own rank-scaled critical value. The previous loop carried a
    # 'hit_reject' flag that could never become True, so no cut-off was
    # ever applied after the first failing rank. That executed behaviour
    # is kept here; confirm whether a step-up / step-down rule was
    # intended.
    num_tests = len(bc_df)
    bc_df['FDR accept'] = [
        bool(pval <= ((rank + 1) / num_tests) * bc_fdr_threshold)
        for rank, pval in enumerate(bc_df['pval'])
    ]

    other_distribution['FDR accept'] = False
    bc_df = pd.concat([bc_df, other_distribution], ignore_index=True)
    bc_df.to_csv(out_dir + '%s_bc_dec.csv' % (treat_nm))

    print('Filtering treatment mutations explained by control mutations...')
    to_remove = bc_df[bc_df['FDR accept'] == False]
    adj_d = filter_binom_control_muts(to_remove, adj_d, control_data,
                                      nm_to_seq)

    # Write
    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)

    return
def adjust_treatment_control(treat_nm, control_nm):
    '''
    Adjust 'g5_combin_be' treatment data against its control.

    g5 format: data is a dict, keys = target site names
      values = dfs, with columns as '%s%s' % (nt, position), and 'Count' column

    Steps:
      1. Filter positions with abnormally high control mutation frequency
         (filter_high_control_muts). Sites without control data are kept
         unadjusted.
      2. Flag and filter treatment mutations that can be explained by the
         control frequency (gather_stats_binom_control_muts,
         filter_binom_control_muts).
      3. Subtract control from treatment (subtract_treatment_control).
      4. Flag and filter treatment mutations explained by Illumina
         sequencing errors (gather_stats_illumina_errors,
         filter_illumina_error_muts).

    Writes to out_dir: '<treat_nm>_stats.csv', '<treat_nm>_hc_dec.csv',
    '<treat_nm>_bc_dec.csv', '<treat_nm>_ie_dec.csv' and the adjusted data
    as '<treat_nm>.pkl'.

    Uses pd.concat for stacking; DataFrame.append was removed in
    pandas 2.0.
    '''

    def _flag_fdr(decisions, fdr_threshold):
        # Build the decision table and add the boolean 'FDR accept' column.
        dec_df = pd.DataFrame(decisions)
        # .copy() so the column assignment does not write into a slice.
        other_distribution = dec_df[dec_df['pval'] > 0.995].copy()
        dec_df = dec_df[dec_df['pval'] <= 0.995]
        dec_df = dec_df.sort_values(by='pval')
        dec_df = dec_df.reset_index(drop=True)

        # NOTE(review): each sorted p-value is compared independently with
        # its own rank-scaled critical value. The previous loop carried a
        # 'hit_reject' flag that could never become True, so no cut-off
        # was ever applied after the first failing rank. That executed
        # behaviour is kept; confirm whether a step-up / step-down rule
        # was intended.
        num_tests = len(dec_df)
        dec_df['FDR accept'] = [
            bool(pval <= ((rank + 1) / num_tests) * fdr_threshold)
            for rank, pval in enumerate(dec_df['pval'])
        ]
        other_distribution['FDR accept'] = False
        return pd.concat([dec_df, other_distribution], ignore_index=True)

    treat_data = _data.load_data(treat_nm, 'g5_combin_be')
    control_data = _data.load_data(control_nm, 'g5_combin_be')
    treat_minq = _data.load_minq(treat_nm, 'g5_combin_be')
    control_minq = _data.load_minq(control_nm, 'g5_combin_be')

    lib_design, seq_col = _data.get_lib_design(treat_nm)

    adj_d = dict()
    stats_dd = defaultdict(list)
    hc_decisions = defaultdict(list)
    nm_to_seq = dict()

    # Filter positions with abnormally high control mut freq.
    print('Filtering positions with high frequency control mutations...')
    timer = util.Timer(total=len(lib_design))
    for _, row in lib_design.iterrows():
        nm = row['Name (unique)']
        seq = row[seq_col]
        nm_to_seq[nm] = seq
        timer.update()

        stats_dd['Name'].append(nm)

        if nm not in treat_data:
            stats_dd['Status'].append('No treatment')
            continue

        t = treat_data[nm]
        if nm not in control_data:
            stats_dd['Status'].append('No control')
            adj_d[nm] = t
            continue

        stats_dd['Status'].append('Adjusted')
        c = control_data[nm]

        # Adjust
        t = filter_high_control_muts(t, c, seq, treat_nm, nm, hc_decisions)
        adj_d[nm] = t

    stats_df = pd.DataFrame(stats_dd)
    stats_df.to_csv(out_dir + '%s_stats.csv' % (treat_nm))

    hc_df = pd.DataFrame(hc_decisions)
    hc_df = hc_df.sort_values(by='c_fq', ascending=False)
    hc_df = hc_df.reset_index(drop=True)
    hc_df.to_csv(out_dir + '%s_hc_dec.csv' % (treat_nm))

    # Filter treatment mutations that can be explained by control freq.
    # In practice, this step is most effective for control mutations with
    # relatively high frequency => relatively high variance
    print('Gathering statistics on treatment mutations explained by control mutations...')
    bc_decisions = defaultdict(list)
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        t = adj_d[nm]
        if nm not in control_data:
            continue
        c = control_data[nm]
        seq = nm_to_seq[nm]

        gather_stats_binom_control_muts(t, c, seq, treat_nm, nm, bc_decisions)
        timer.update()

    # Using global statistics, filter mutations while controlling false
    # discovery rate
    bc_fdr_threshold = 0.05
    bc_df = _flag_fdr(bc_decisions, bc_fdr_threshold)
    bc_df.to_csv(out_dir + '%s_bc_dec.csv' % (treat_nm))

    print('Filtering treatment mutations explained by control mutations...')
    to_remove = bc_df[bc_df['FDR accept'] == False]
    adj_d = filter_binom_control_muts(to_remove, adj_d, control_data,
                                      nm_to_seq)

    print('Subtracting control from treatment data...')
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        t = adj_d[nm]
        if nm not in control_data:
            continue
        c = control_data[nm]
        seq = nm_to_seq[nm]

        # Re-assigning an existing key does not change the dict's size, so
        # this is safe while iterating over it.
        adj_d[nm] = subtract_treatment_control(t, c, seq)
        timer.update()

    # Filter treatment mutations that are best explained by spontaneous
    # random mutations.
    # Tend to be very low frequency with no counterpart in control
    print('Gathering statistics on treatment mutations explained by Illumina sequencing errors...')
    ie_decisions = defaultdict(list)
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        t = adj_d[nm]
        if nm not in control_data:
            continue
        c = control_data[nm]
        seq = nm_to_seq[nm]
        c_minq = control_minq[nm]
        t_minq = treat_minq[nm]

        gather_stats_illumina_errors(t, c, t_minq, c_minq, seq, treat_nm, nm,
                                     ie_decisions)
        timer.update()

    ie_fdr_threshold = 0.05
    ie_df = _flag_fdr(ie_decisions, ie_fdr_threshold)
    ie_df.to_csv(out_dir + '%s_ie_dec.csv' % (treat_nm))

    print('Filtering treatment mutations explained by Illumina sequencing errors...')
    to_remove = ie_df[ie_df['FDR accept'] == False]
    adj_d = filter_illumina_error_muts(to_remove, adj_d, control_data,
                                       nm_to_seq)

    # Write
    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)

    return
""" import pandas as pd import numpy as np from collections import OrderedDict import xgboost as xgb import lightgbm as lgb from sklearn.model_selection import KFold # GridSearchCV, cross_val_score from sklearn.metrics import log_loss, roc_auc_score import _data import catboost import warnings warnings.filterwarnings("ignore") # pre-process and extraction X, public, private = _data.load_data() # before fitting in models id_public = public.pop("ID") id_private = private.pop("ID") X.pop("ID") status = X.pop("Status") y = np.array([1 if i == "No-Show" else 0 for i in status]) # add features from DNN hidden layer X = pd.concat([X, pd.DataFrame(A1)], axis=1) # Grid Search # params = {"n_estimators": [10, 20, 30], "max_depth":[3, 5, 7, 9, 11],\ # "learning_rate": [0.01, 0.05, 0.1, 0.15, 0.2],\ # "min_child_weight": [0.6, 0.7, 0.8], "scale_pos_weight": [0.8, 0.9, 1]}
def form_data(exp_nm, start_idx, end_idx):
    '''
    Compute per-site correction statistics for disease target sites in one
    slice of the library design.

    Loads 'ag5a4_profile_subset' data for exp_nm, restricts the library
    design to positional rows start_idx..end_idx (end inclusive) and then
    to the names returned by _data.get_disease_sites. For every remaining
    site present in the data it records:
      * total, edited and precise genotype-correction read counts and the
        fractions derived from them (NaN when the denominator is zero);
      * when the design row has a string 'AA sequence - reference', the
        amino-acid level correction precision; otherwise NaN.

    The statistics are outer-merged onto the library-design slice and
    written to out_dir as '<exp_nm>_<start_idx>_<end_idx>_stats.csv'.

    Changes from the previous version: a leftover interactive debugging
    console (triggered by comparing two whole result lists) is replaced by
    a per-site warning; the amino-acid translations that do not depend on
    the edit row are computed once per site; and a slice with no shared
    sites yields a CSV with empty statistic columns instead of a KeyError
    in the merge.
    '''
    import warnings

    data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
    lib_design, seq_col = _data.get_lib_design(exp_nm)

    lib_nm = _data.get_lib_nm(exp_nm)
    disease_nms = _data.get_disease_sites(lib_design, lib_nm)

    # Subset for dumb parallelization, ensure only disease target sites used
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    lib_design = lib_design[lib_design['Name (unique)'].isin(disease_nms)]

    nms = lib_design['Name (unique)']

    # Fixed column list so the output schema does not depend on whether
    # any site was processed.
    stat_cols = [
        'Name (unique)',
        'Obs. correction count',
        'Obs. total count',
        'Obs. edited count',
        'Obs. gt correct fraction in all reads',
        'Obs. gt correct precision in edited reads',
        'Obs. editing frequency',
        'Obs. aa correct precision among edited gts',
        'Obs. aa correct precision among edited aas',
        'Obs. aa correct precision among all reads',
    ]
    stats_dd = defaultdict(list)

    nms_shared = [nm for nm in nms if nm in data]
    timer = util.Timer(total=len(nms_shared))
    for nm in nms_shared:
        df = data[nm]
        design_row = lib_design[lib_design['Name (unique)'] == nm].iloc[0]
        snp_pos = int(design_row['Position of SNP in gRNA'])
        correct_nt = design_row['Corrected nucleotide (gRNA orientation)']
        path_nt = design_row['Pathogenic nucleotide (gRNA orientation)']

        nt_cols = [
            col for col in df.columns if col != 'Count' and col != 'Frequency'
        ]

        # Impute . as wildtype
        df = impute_dot_as_wildtype(df, nt_cols)
        total_ct = sum(df['Count'])

        # Ensure each row is unique
        df = df.groupby(nt_cols)['Count'].agg('sum').reset_index()

        # Filter unedited columns
        df = subset_edited_rows(df, nt_cols)
        edited_ct = sum(df['Count'])

        df = remove_noisy_edits(df, nt_cols, exp_nm)

        gt_correct_ct = get_precise_gt_correction_count(
            df, nt_cols, snp_pos, correct_nt, path_nt)

        gt_precision = gt_correct_ct / edited_ct if edited_ct > 0 else np.nan

        ## Overall statistics
        stats_dd['Name (unique)'].append(nm)
        stats_dd['Obs. correction count'].append(gt_correct_ct)
        stats_dd['Obs. total count'].append(total_ct)
        stats_dd['Obs. edited count'].append(edited_ct)
        stats_dd['Obs. gt correct fraction in all reads'].append(
            gt_correct_ct / total_ct if total_ct > 0 else np.nan)
        stats_dd['Obs. gt correct precision in edited reads'].append(
            gt_precision)
        stats_dd['Obs. editing frequency'].append(
            edited_ct / total_ct if total_ct > 0 else np.nan)

        # Amino acid correction for CtoGA
        has_aa = ('AA sequence - reference' in design_row.index and
                  isinstance(design_row['AA sequence - reference'], str))
        if has_aa:
            d1 = bool(design_row['Designed orientation w.r.t. genome'] == '+')
            d2 = bool(design_row['AA frame strand'] == '+')
            # '+' when both strands agree, '-' otherwise.
            aa_strand_relative_to_seq = '+' if d1 == d2 else '-'
            aa_frame_pos = design_row['AA frame position']

            aa_stats = {
                'Unedited AA': 0,
                'Edited AA': 0,
                'Goal AA': 0,
            }
            if design_row['AA sequence - pathogenic'] != design_row[
                    'AA sequence - reference']:
                # Pathogenic and corrected translations are the same for
                # every edit row of this site, so compute them once.
                pp0idx = design_row['Protospacer position zero index']
                seq_30nt_path = design_row[seq_col][pp0idx - 9:pp0idx + 21]
                aa_path_with_bc = nts_to_aas(seq_30nt_path, aa_frame_pos,
                                             snp_pos,
                                             aa_strand_relative_to_seq)

                seq_30nt_wt = (seq_30nt_path[:9 + snp_pos] + correct_nt +
                               seq_30nt_path[9 + snp_pos + 1:])
                aa_wt_with_bc = nts_to_aas(seq_30nt_wt, aa_frame_pos, snp_pos,
                                           aa_strand_relative_to_seq)

                for _, edit_row in df.iterrows():
                    seq_30nt = edit_row_to_seq_30nt(design_row, edit_row,
                                                    seq_col)
                    obs_aas = nts_to_aas(seq_30nt, aa_frame_pos, snp_pos,
                                         aa_strand_relative_to_seq)

                    if obs_aas == aa_path_with_bc:
                        aa_stats['Unedited AA'] += edit_row['Count']
                    else:
                        aa_stats['Edited AA'] += edit_row['Count']
                        if obs_aas == aa_wt_with_bc:
                            aa_stats['Goal AA'] += edit_row['Count']

            aa_precision = (aa_stats['Goal AA'] / edited_ct
                            if edited_ct > 0 else np.nan)
            stats_dd['Obs. aa correct precision among edited gts'].append(
                aa_precision)
            stats_dd['Obs. aa correct precision among edited aas'].append(
                aa_stats['Goal AA'] / aa_stats['Edited AA']
                if aa_stats['Edited AA'] > 0 else np.nan)
            stats_dd['Obs. aa correct precision among all reads'].append(
                aa_stats['Goal AA'] / total_ct if total_ct > 0 else np.nan)

            # Sanity check for this site. Comparisons with NaN are False,
            # so sites without edited reads never warn.
            if aa_precision < gt_precision:
                warnings.warn(
                    '%s: aa correct precision among edited gts (%s) is '
                    'below gt correct precision in edited reads (%s)' %
                    (nm, aa_precision, gt_precision))
        else:
            stats_dd['Obs. aa correct precision among edited gts'].append(
                np.nan)
            stats_dd['Obs. aa correct precision among edited aas'].append(
                np.nan)
            stats_dd['Obs. aa correct precision among all reads'].append(
                np.nan)

        timer.update()

    # Save
    stats_df_collected = pd.DataFrame(stats_dd, columns=stat_cols)

    stats_df = lib_design.merge(
        stats_df_collected,
        on='Name (unique)',
        how='outer',
    )

    stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' %
                    (exp_nm, start_idx, end_idx))
    return
def calculate_statistics(treat_nm):
    '''
    Summarise, per (position, reference nt, observed nt), how often a
    non-reference nucleotide is seen at zero frequency across target sites.

    g4 format: data is a dict, keys = target site names
      values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations

    First builds a long table with one row per (target site, position,
    non-reference nucleotide), keeping only rows with 'Total count' >= 50.
    Then, for each position index / reference nt / observed nt combination
    that has at least one row, records the number and fraction of target
    sites with zero frequency and the mean frequency. The summary is
    written to out_dir as '<treat_nm>.csv'.
    '''
    poswise = _data.load_data(treat_nm, 'ag4_poswise_be_adjust')

    design, seq_col = _data.get_g4_lib_design(treat_nm)
    seq_by_name = dict(zip(design['Name (unique)'], design[seq_col]))

    # Long-format table of every non-reference observation.
    print('Gathering statistics...')
    observations = defaultdict(list)
    progress = util.Timer(total=len(poswise))
    for name in poswise:
        progress.update()
        site_counts = poswise[name]
        target_seq = seq_by_name[name]

        for pos_idx in range(len(site_counts)):
            pos_counts = site_counts[pos_idx]
            depth = sum(pos_counts)
            ref_nt = target_seq[pos_idx]
            ref_idx = nt_to_idx[ref_nt]

            for nt_idx in range(len(pos_counts)):
                if nt_idx == ref_idx:
                    continue

                count = pos_counts[nt_idx]
                frequency = np.nan if depth == 0 else count / depth

                observations['Count'].append(count)
                observations['Total count'].append(depth)
                observations['Obs nt'].append(nts[nt_idx])
                observations['Ref nt'].append(ref_nt)
                observations['Frequency'].append(frequency)
                observations['Position index'].append(pos_idx)
                observations['Position'].append(
                    _data.idx_to_pos(pos_idx, treat_nm))
                observations['Name'].append(name)

    obs_df = pd.DataFrame(observations)
    obs_df = obs_df[obs_df['Total count'] >= 50]

    # Collapse to one row per (position, ref nt, obs nt).
    summary = defaultdict(list)
    positions = sorted(set(obs_df['Position index']))
    progress = util.Timer(total=len(positions))
    for pos_idx in positions:
        progress.update()
        at_pos = obs_df[obs_df['Position index'] == pos_idx]
        for ref_nt in nts:
            with_ref = at_pos[at_pos['Ref nt'] == ref_nt]
            for obs_nt in nts:
                if obs_nt == ref_nt:
                    continue

                freqs = with_ref[with_ref['Obs nt'] == obs_nt]['Frequency']
                num_zeros = sum(freqs == 0)
                total = len(freqs)
                if total == 0:
                    continue

                summary['Num target sites with zero'].append(num_zeros)
                summary['Total num target sites'].append(total)
                summary['Frequency of zero in target sites'].append(
                    num_zeros / total)
                summary['Mean activity'].append(np.mean(freqs))
                summary['Position index'].append(pos_idx)
                summary['Position'].append(
                    _data.idx_to_pos(pos_idx, treat_nm))
                summary['Obs nt'].append(obs_nt)
                summary['Ref nt'].append(ref_nt)

    stats_df = pd.DataFrame(summary)
    stats_df.to_csv(out_dir + '%s.csv' % (treat_nm))
    return
def form_data(exp_nm):
    '''
    Tabulate editing at positions 6 and 7 for every target site, annotated
    with the site's design category.

    Loads 'ag4_poswise_be_adjust' data for exp_nm. The target nucleotide
    is 'C' for a 'CtoTeditor' and 'A' for an 'AtoGeditor'. For each target
    site and each position (as mapped by _data.idx_to_pos) equal to 6 or 7
    whose reference nucleotide is the target nucleotide, records the
    non-reference count, total count and edited fraction (NaN when the
    total is zero), together with the gRNA/target match count, category
    and subcategory. Output goes to out_dir/<exp_nm>.csv.

    Raises:
      ValueError: if the editor type is neither 'CtoTeditor' nor
        'AtoGeditor'.
      AssertionError: if a site outside the named design categories does
        not have a match count of 20.

    'chipseq' sites now get an explicit subcategory ('Chip series').
    Previously the variable was left unset in that branch, which raised
    NameError on a first site or silently reused the subcategory of the
    preceding site.
    '''
    data = _data.load_data(exp_nm, 'ag4_poswise_be_adjust')
    lib_design, seq_col = _data.get_lib_design(exp_nm)

    # Get target nt
    target_nt_by_editor = {'CtoTeditor': 'C', 'AtoGeditor': 'A'}
    editor_type = _data.get_editor_type(exp_nm)
    if editor_type not in target_nt_by_editor:
        raise ValueError('Unsupported editor type %r for %s' %
                         (editor_type, exp_nm))
    target_nt = target_nt_by_editor[editor_type]

    nms = lib_design['Name (unique)']
    nm_to_seq = dict(zip(nms, lib_design[seq_col]))
    nm_to_grna = dict(zip(nms, lib_design['gRNA (20nt)']))
    nm_to_design_cat = dict(zip(nms, lib_design['Design category']))

    dd = defaultdict(list)
    timer = util.Timer(total=len(data))
    for nm in data:
        pw = data[nm]
        seq = nm_to_seq[nm]
        grna = nm_to_grna[nm]
        design_cat = nm_to_design_cat[nm]

        # Get category, subcategory, and match count
        match_count = get_match_count(grna, seq)
        if design_cat == 'guideseq':
            category = 'Off-target series'
            subcategory = nm.split('_')[2]  # gene name
        elif design_cat == 'mismatch':
            category = 'Mismatch series'
            subcategory = nm.split('_')[1]  # series number
        elif design_cat == 'chipseq':
            category = 'Chip series'
            # Mirrors the 'vivo' and 'On-target' branches, where the
            # subcategory equals the category.
            subcategory = 'Chip series'
        elif design_cat == 'vivo':
            category = 'vivo'
            subcategory = 'vivo'
        else:
            assert match_count == 20, 'fail'
            category = 'On-target'
            subcategory = 'On-target'

        for jdx in range(len(pw)):
            pos = _data.idx_to_pos(jdx, exp_nm)
            if pos not in [6, 7]:
                continue

            ref_nt = seq[jdx]
            if ref_nt != target_nt:
                continue

            ref_idx = nt_to_idx[ref_nt]
            total = sum(pw[jdx])
            # Everything that is not the reference nucleotide is an edit.
            edit_ct = sum(
                ct for kdx, ct in enumerate(pw[jdx]) if kdx != ref_idx)

            if total > 0:
                dd['Edited fraction'].append(edit_ct / total)
            else:
                dd['Edited fraction'].append(np.nan)
            dd['Edit count'].append(edit_ct)
            dd['Total count'].append(total)
            dd['Position'].append(pos)
            dd['Ref nt'].append(ref_nt)
            dd['Name'].append(nm)
            dd['Match count'].append(int(match_count))
            dd['Category'].append(category)
            dd['Subcategory'].append(subcategory)

        timer.update()

    df = pd.DataFrame(dd)
    df.to_csv(out_dir + '%s.csv' % (exp_nm))
    return
def indel_anyindel(exp_nm):
    '''
    Summarise indel counts per target site and annotate them with gRNA
    5'-G properties.

    Loads the 'ah6a1b_subtract' data for exp_nm (a mapping from target-site
    name to a dataframe with 'Category' and 'Count' columns), computes
    'Total count' and a '<X> count' / '<X> freq' pair for X in Indel
    (Category != 'wildtype'), Del, Ins, Wildtype (frequencies are NaN when
    the total is zero), and outer-merges the result onto the library
    design by 'Name (unique)'.

    Added annotation columns:
      * 'Designed PAM (5nt)': characters 33..37 of
        'Sequence context (61nt)';
      * 'True gRNA length': 20 if 'gRNA (20nt)' starts with 'G', else 21;
      * 'gRNA 5primeG matches target': whether the sequence context has a
        'G' at index 13 (length 20) or index 12 (length 21);
      * 'gRNA properties': text label combining the two columns above.

    Writes out_dir/<exp_nm>.csv (all sites) and out_dir/5primeG_<exp_nm>.csv
    (pivot of 'Indel freq' by 'gRNA properties' for sites with
    'Total count' >= 100).

    If the data cannot be loaded, prints an error and exits with status 1.
    '''
    try:
        data = _data.load_data(exp_nm, 'ah6a1b_subtract')
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit pass through.
        print('Error : could not load data')
        sys.exit(1)

    lib_design, seq_col = _data.get_lib_design(exp_nm)

    dd = defaultdict(list)
    timer = util.Timer(total=len(data))
    for target_nm in data:
        df = data[target_nm]

        tot_count = sum(df['Count'])
        dd['Total count'].append(tot_count)
        dd['Name (unique)'].append(target_nm)

        category = df['Category']
        # Order matters: it fixes the column order of the output CSV.
        selections = (
            ('Indel', category != 'wildtype'),
            ('Del', category == 'del'),
            ('Ins', category == 'ins'),
            ('Wildtype', category == 'wildtype'),
        )
        for label, crit in selections:
            count = sum(df[crit]['Count'])
            dd['%s count' % label].append(count)
            if tot_count != 0:
                dd['%s freq' % label].append(count / tot_count)
            else:
                dd['%s freq' % label].append(np.nan)

        timer.update()

    df = pd.DataFrame(dd)
    annotated = lib_design.merge(
        df,
        on='Name (unique)',
        how='outer',
    )

    # Annotate csvs
    pam_start_idx = 33
    pam_len = 5
    grna_pos1_idx = 13
    grna_pos0_idx = grna_pos1_idx - 1

    def get_pam(row):
        context = row['Sequence context (61nt)']
        return context[pam_start_idx:pam_start_idx + pam_len]

    def get_true_grna_len(row):
        return 20 if row['gRNA (20nt)'][0] == 'G' else 21

    def grna_5primeG_matches_target(row):
        # A 20-nt gRNA starts at protospacer position 1, a 21-nt gRNA one
        # base earlier.
        if bool(row['True gRNA length'] == 20):
            first_idx = grna_pos1_idx
        else:
            first_idx = grna_pos0_idx
        return bool(row['Sequence context (61nt)'][first_idx] == 'G')

    def grna_5primeg_and_len(row):
        grna_len = row['True gRNA length']
        if row['gRNA 5primeG matches target']:
            return f'{grna_len}-nt gRNA, 5primeG matches'
        return f'{grna_len}-nt gRNA, 5primeG does not match'

    annotated['Designed PAM (5nt)'] = annotated.apply(get_pam, axis='columns')
    annotated['True gRNA length'] = annotated.apply(get_true_grna_len,
                                                    axis='columns')
    annotated['gRNA 5primeG matches target'] = annotated.apply(
        grna_5primeG_matches_target, axis='columns')
    annotated['gRNA properties'] = annotated.apply(grna_5primeg_and_len,
                                                   axis='columns')

    annotated.to_csv(out_dir + f'{exp_nm}.csv')

    # Gather statistics on sufficiently covered sites only.
    annotated = annotated[annotated['Total count'] >= 100]
    pv_df = annotated.pivot(index='Name (unique)',
                            columns='gRNA properties',
                            values='Indel freq')
    pv_df.to_csv(out_dir + f'5primeG_{exp_nm}.csv')

    return
def adjust_treatment_control(treat_nm, control_nm):
    """Filter treatment mutations that are explained by sequencing errors.

    g5 format: data is a dict, keys = target site names, values = dfs with
    columns as '%s%s' % (nt, position), and a 'Count' column.

    Treatment mutations best explained by spontaneous random mutations tend
    to be very low frequency with no counterpart in control. Per-mutation
    statistics are gathered with ``gather_stats_illumina_errors``, a
    rank-based FDR decision is attached to each p-value, the decisions are
    written to ``{treat_nm}_ie_dec.csv`` and the mutations that are not
    accepted are removed with ``filter_illumina_error_muts``. The filtered
    data is pickled to ``{treat_nm}.pkl`` in ``out_dir``.

    Args:
        treat_nm: treatment experiment name.
        control_nm: matching control experiment name.

    Returns:
        None.
    """
    adj_d = _data.load_data(treat_nm, 'ag5a1b_subtract')
    control_data = _data.load_data(control_nm, 'g5_combin_be')
    treat_minq = _data.load_minq(treat_nm, 'g5_combin_be')
    control_minq = _data.load_minq(control_nm, 'g5_combin_be')

    lib_design, seq_col = _data.get_lib_design(treat_nm)
    nm_to_seq = dict(zip(lib_design['Name (unique)'], lib_design[seq_col]))

    print(
        'Gathering statistics on treatment mutations explained by Illumina sequencing errors...'
    )
    ie_decisions = defaultdict(list)
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        # Target sites without a control counterpart cannot be assessed.
        if nm not in control_data:
            continue
        gather_stats_illumina_errors(
            adj_d[nm],
            control_data[nm],
            treat_minq[nm],
            control_minq[nm],
            nm_to_seq[nm],
            treat_nm,
            nm,
            ie_decisions,
        )
        timer.update()

    ie_fdr_threshold = 0.05
    ie_df = pd.DataFrame(ie_decisions)

    # p-values above 0.995 are set aside and never accepted. ``.copy()``
    # makes the later column assignment act on an independent frame.
    other_distribution = ie_df[ie_df['pval'] > 0.995].copy()
    ie_df = ie_df[ie_df['pval'] <= 0.995]
    ie_df = ie_df.sort_values(by='pval')
    ie_df = ie_df.reset_index(drop=True)

    # Rank-based critical values: the i-th smallest p-value (1-based) is
    # accepted when p <= (i / m) * threshold. Each rank is evaluated
    # independently of the others.
    # NOTE(review): a step-wise procedure would stop accepting after the
    # first failing rank; confirm whether that stricter behaviour is wanted
    # before changing this, since it alters which mutations get filtered.
    num_tests = len(ie_df)
    ie_df['FDR accept'] = [
        bool(pval <= ((rank + 1) / num_tests) * ie_fdr_threshold)
        for rank, pval in enumerate(ie_df['pval'])
    ]

    other_distribution['FDR accept'] = False
    # pd.concat replaces DataFrame.append, which no longer exists in
    # pandas >= 2.0.
    ie_df = pd.concat([ie_df, other_distribution], ignore_index=True)
    ie_df.to_csv(out_dir + '%s_ie_dec.csv' % (treat_nm))

    print(
        'Filtering treatment mutations explained by Illumina sequencing errors...'
    )
    to_remove = ie_df[ie_df['FDR accept'] == False]
    adj_d = filter_illumina_error_muts(to_remove, adj_d, control_data,
                                       nm_to_seq)

    ##
    # Write
    ##
    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)

    return
def form_data(exp_nm, start_idx, end_idx):
    """Annotate on-target library sites with simulated bystander precision.

    For rows ``start_idx``..``end_idx`` (inclusive) of the library design
    that are on-target sites and present in the 'ag5a4_profile_subset' data,
    simulate the bystander precision task in 12kChar by using the substrate
    nucleotide closest to the editor-specific center nt (added 8/21/19).

    Per-site statistics are outer-merged onto the library design and written
    to ``{exp_nm}_{start_idx}_{end_idx}_stats.csv`` in ``out_dir``.

    Args:
        exp_nm: experiment name understood by ``_data``.
        start_idx: first library-design row (positional) to process.
        end_idx: last library-design row (positional) to process, inclusive.

    Returns:
        None.
    """
    data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
    lib_design, _ = _data.get_lib_design(exp_nm)
    lib_nm = _data.get_lib_nm(exp_nm)

    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    lib_design = lib_design[lib_design['Name (unique)'].isin(ontarget_sites)]
    nms = lib_design['Name (unique)']

    # The editor depends only on the experiment, so resolve it once rather
    # than once per target site.
    editor = _data.get_editor_nm(exp_nm)
    editor_to_central_pos = {
        'ABE': 6,
        'ABE-CP': 6,
        'AID': 6,
        'BE4': 6,
        'BE4-CP': 8,
        'CDA': 5,
        'eA3A': 6,
        'evoAPOBEC': 5,
    }
    # Editors missing from the table fall back to position 6.
    central_pos = editor_to_central_pos.get(editor, 6)
    substrate = 'A' if 'ABE' in editor else 'C'
    candidate_cols = [f'{substrate}{pos}' for pos in range(-3, 15)]

    stats_dd = defaultdict(list)
    nms_shared = [nm for nm in nms if nm in data]
    timer = util.Timer(total=len(nms_shared))
    for nm in nms_shared:
        df = data[nm]
        if 'index' in df.columns:
            df = df[[col for col in df.columns if col != 'index']]

        if len(df) == 0:
            continue

        nt_cols = [col for col in candidate_cols if col in df.columns]
        central_col = find_central_col(central_pos, nt_cols, substrate)
        if central_col is None:
            continue

        # Every column except 'Count' is a nucleotide column whose name
        # starts with the reference nucleotide.
        mut_cols = [col for col in df.columns if col != 'Count']
        col_to_ref_nt = {col: col[0] for col in mut_cols}

        df_dd = defaultdict(list)
        for _, row in df.iterrows():
            df_dd['Num. edits'].append(get_num_edits(row, col_to_ref_nt))
            df_dd['Simulated precise'].append(
                is_simulated_precise(row, central_col, col_to_ref_nt))
        for col in df_dd:
            df[col] = df_dd[col]

        precise_ct = sum(df[df['Simulated precise'] == True]['Count'])
        edited_ct = sum(df[df['Num. edits'] > 0]['Count'])
        sim_precision = precise_ct / edited_ct if edited_ct > 0 else np.nan

        # Column names look like '<nt><position>'; drop the nucleotide.
        central_col_pos = int(central_col[1:])
        stats_dd[
            'Simulated bystander precision at editor-specific central nt'
        ].append(sim_precision)
        stats_dd['Simulated bystander position'].append(central_col_pos)
        stats_dd['Simulated bystander position, distance to center'].append(
            central_col_pos - central_pos)
        stats_dd['Edited count'].append(edited_ct)
        stats_dd['Name (unique)'].append(nm)

        timer.update()

    stats_df_collected = pd.DataFrame(stats_dd)
    stats_df = lib_design.merge(
        stats_df_collected,
        on='Name (unique)',
        how='outer',
    )
    stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' %
                    (exp_nm, start_idx, end_idx))
    return
def calc_indels_global(exp_nm):
    """Tabulate indel read counts by indel length and adjusted start position.

    Across all target sites, measure frequency of indels by length and
    starting position. Allow low readcount cutoff, but also do not normalize
    data within each target site by total readcount.

    Treatments are read from 'ah6a1b_subtract' and controls from
    'h6_anyindel'. Target sites with fewer than 100 total reads are skipped.

    Written to ``out_dir``:
      * ``{exp_nm}_pos_{len}nt.csv`` and ``{exp_nm}_pos_melt_{len}nt.csv``
        for every indel length in -40..15 (negative = deletion)
      * ``{exp_nm}.csv`` -- all non-wildtype rows with adjusted positions

    Args:
        exp_nm: experiment name; must be listed as a treatment or a control
            in ``treat_control_df``.

    Returns:
        None. Exits the process with status 1 if no data could be loaded.
    """
    data = None
    if exp_nm in set(treat_control_df['Treatment']):
        data = _data.load_data(exp_nm, 'ah6a1b_subtract')
    elif exp_nm in set(treat_control_df['Control']):
        data = _data.load_data(exp_nm, 'h6_anyindel')

    if data is None:
        print('Error : could not load data')
        sys.exit(1)

    # NOTE(review): the library design is not used below; the call is kept
    # in case loading it validates exp_nm -- confirm and drop if not needed.
    _data.get_lib_design(exp_nm)

    indel_lens = range(-40, 15 + 1)
    positions = range(-25, 50)

    # Init: one column-dict per indel length, with one list per position
    # followed by a 'Name' list (this fixes the CSV column order).
    pos_len_indel = dict()
    for indel_len in indel_lens:
        pos_nt_indel = {pos_idx: [] for pos_idx in positions}
        pos_nt_indel['Name'] = []
        pos_len_indel[indel_len] = pos_nt_indel

    # Per-target frames are collected and concatenated once at the end.
    per_target_frames = []

    timer = util.Timer(total=len(data))
    for target_nm in data:
        df = data[target_nm]
        if 'Frequency' not in df.columns:
            df['Frequency'] = df['Count'] / np.sum(df['Count'])

        tot_count = sum(df['Count'])
        if tot_count < 100:
            continue

        # Copy so the annotation columns added below do not write into a
        # slice of the loaded data.
        dfs = df[df['Category'] != 'wildtype'].copy()

        target_pos_len_vectors = {
            indel_len: defaultdict(int) for indel_len in indel_lens
        }

        df_annot = defaultdict(list)
        for _, row in dfs.iterrows():
            indel_start = int(row['Indel start'])
            indel_end = int(row['Indel end'])
            mh_len = row['MH length']
            count = row['Count']
            indel_len = int(row['Indel length'])
            cat = row['Category']

            if cat == 'del':
                # Deletions are keyed by negative length.
                indel_len = -indel_len
            elif cat == 'ins':
                # h6_anyindel in each dir describes indel_start and indel_end
                # for indexing (should have difference of 1 for insertions
                # but they do not), so we fix this here.
                indel_end = indel_start + 1

            # Adjust start and end
            adj_indel_start, adj_indel_end = adjust_indel_pos(
                indel_start, indel_end, mh_len)

            # Add count to adj. start pos at the right length; lengths
            # outside -40..15 are not tabulated.
            if indel_len in target_pos_len_vectors:
                target_pos_len_vectors[indel_len][adj_indel_start] += count

            df_annot['Indel start adj'].append(adj_indel_start)
            df_annot['Indel end adj'].append(adj_indel_end)

        for col in df_annot:
            dfs[col] = df_annot[col]
        dfs['Name'] = target_nm
        per_target_frames.append(dfs)

        # Gather total count by position and length of indels
        for indel_len, pos_nt_indel in pos_len_indel.items():
            tpvn = target_pos_len_vectors[indel_len]
            for col in pos_nt_indel:
                if col != 'Name':
                    pos_nt_indel[col].append(tpvn[col])
                else:
                    pos_nt_indel[col].append(target_nm)

        timer.update()

    # Save
    for indel_len, pos_nt_indel in pos_len_indel.items():
        pos_nt_df = pd.DataFrame(pos_nt_indel)
        pos_nt_df.to_csv(out_dir + '%s_pos_%snt.csv' % (exp_nm, indel_len))

        pos_nt_df_melt = pd.melt(pos_nt_df,
                                 id_vars='Name',
                                 var_name='Position',
                                 value_name='Count')
        pos_nt_df_melt.to_csv(out_dir + '%s_pos_melt_%snt.csv' %
                              (exp_nm, indel_len))

    # merged
    if per_target_frames:
        mdf = pd.concat(per_target_frames, ignore_index=True)
    else:
        mdf = pd.DataFrame()
    mdf.to_csv(out_dir + '%s.csv' % (exp_nm))

    return
def adjust_treatment_control(treat_nm):
    """Filter treatment mutations that match the unedited background profile.

    g4 format: data is a dict, keys = target site names, values = np.array
    with shape = (target site len, 4), entries = int for num. Q30
    observations.

    The statistic used is the fraction of target sites with non-zero event
    frequency. For every (position, reference nt, observed nt) combination
    the number of target sites with zero frequency is compared, via a
    binomial CDF, against a background zero-rate estimated from the last
    five position indices. Decisions from ``ben_hoch_fdr`` are written to
    ``{treat_nm}_fraczero_dec.csv``; mutations that are not accepted are
    removed with ``filter_freqzero_background_mutations`` and the result is
    pickled to ``{treat_nm}.pkl`` in ``out_dir``.

    Args:
        treat_nm: treatment experiment name.

    Returns:
        None.

    Raises:
        ValueError: if no (position, mutation) combination qualifies as
            background, so the background zero-rate cannot be estimated.
    """
    adj_d = _data.load_data(treat_nm, 'ag4_poswise_be_adjust')

    lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
    nm_to_seq = dict(zip(lib_design['Name (unique)'], lib_design[seq_col]))

    print(
        'Gathering statistics on treatment mutations matching background profile by frequency of zeros...'
    )
    dd = defaultdict(list)
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        timer.update()
        pw = adj_d[nm]
        seq = nm_to_seq[nm]
        for jdx, pos_counts in enumerate(pw):
            tot = sum(pos_counts)
            ref_nt = seq[jdx]
            ref_idx = nt_to_idx[ref_nt]
            position = None
            for kdx, count in enumerate(pos_counts):
                # Only non-reference observations are mutations.
                if kdx == ref_idx:
                    continue
                if position is None:
                    # Same for every observed nt at this index; look it up
                    # once, and only when a row is actually emitted.
                    position = _data.idx_to_pos(jdx, treat_nm)

                dd['Count'].append(count)
                dd['Total count'].append(tot)
                dd['Obs nt'].append(nts[kdx])
                dd['Ref nt'].append(ref_nt)
                dd['Frequency'].append(np.nan if tot == 0 else count / tot)
                dd['Position index'].append(jdx)
                dd['Position'].append(position)
                dd['Name'].append(nm)

    df = pd.DataFrame(dd)
    df = df[df['Total count'] >= 50]

    # Form stats_df and find p for binomial, which is typically ~0.99
    dd = defaultdict(list)
    pos_range = sorted(set(df['Position index']))
    timer = util.Timer(total=len(pos_range))
    for pos_idx in pos_range:
        timer.update()
        df_s1 = df[df['Position index'] == pos_idx]
        for ref_nt in nts:
            df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
            for obs_nt in nts:
                if obs_nt == ref_nt:
                    continue

                dfs_freq = df_s2[df_s2['Obs nt'] == obs_nt]['Frequency']
                total = len(dfs_freq)
                if total == 0:
                    continue
                num_zeros = sum(dfs_freq == 0)

                dd['Num target sites with zero'].append(num_zeros)
                dd['Total num target sites'].append(total)
                dd['Frequency of zero in target sites'].append(num_zeros /
                                                               total)
                dd['Mean activity'].append(np.mean(dfs_freq))
                dd['Position index'].append(pos_idx)
                dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
                dd['Obs nt'].append(obs_nt)
                dd['Ref nt'].append(ref_nt)
    fz_df = pd.DataFrame(dd)

    # Background zero-rate: low-activity, well-covered combinations at the
    # last five position indices.
    baseline_pos_range = pos_range[-5:]
    max_mean_activity = 0.025
    min_num_targets = 50
    crit = (fz_df['Position index'].isin(baseline_pos_range)) & \
           (fz_df['Mean activity'] <= max_mean_activity) & \
           (fz_df['Total num target sites'] >= min_num_targets)
    bg_bin_p = np.mean(fz_df[crit]['Frequency of zero in target sites'])
    if np.isnan(bg_bin_p):
        raise ValueError(
            'Could not estimate the background zero frequency for %s: no '
            'baseline position/mutation combination passed the activity and '
            'coverage filters' % (treat_nm))

    pvals = []
    timer = util.Timer(total=len(fz_df))
    for total, numzero in zip(fz_df['Total num target sites'],
                              fz_df['Num target sites with zero']):
        pvals.append(binom.cdf(numzero, total, bg_bin_p))
        timer.update()
    fz_df['pval'] = pvals

    fz_fdr_threshold = 0.001
    fz_df = ben_hoch_fdr(fz_df, fz_fdr_threshold)
    fz_df.to_csv(out_dir + '%s_fraczero_dec.csv' % (treat_nm))

    print(
        'Filtering treatment mutations matching background profile by frequency of zeros...'
    )
    to_remove = fz_df[fz_df['FDR accept'] == False]
    adj_d = filter_freqzero_background_mutations(to_remove, adj_d, nm_to_seq)

    ##
    # Write
    ##
    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)

    return
def indel_anyindel_pos(exp_nm):
    """Tabulate indel frequencies by position, by length, and by both.

    Treatments are read from 'ah6a3_remove_batch' and controls from
    'h6_anyindel'. Target sites with fewer than 100 total reads are skipped.
    For treatments, indel start/end are passed through ``adjust_indel_pos``;
    for controls they are used as given. Each indel's frequency is added to
    every position in ``range(adjusted start, adjusted end)``.

    Written to ``out_dir``:
      * ``{exp_nm}_pos.csv`` / ``{exp_nm}_pos_melt.csv``
      * ``{exp_nm}_pos_{len}nt.csv`` / ``{exp_nm}_pos_melt_{len}nt.csv`` for
        every indel length in -40..15 (negative = deletion)
      * ``{exp_nm}_len.csv`` / ``{exp_nm}_len_melt.csv``
      * ``{exp_nm}.csv`` -- all non-wildtype rows with adjusted positions

    Args:
        exp_nm: experiment name; must be listed as a treatment or a control
            in ``treat_control_df``.

    Returns:
        None. Exits the process with status 1 if no data could be loaded.
    """
    data = None
    if exp_nm in set(treat_control_df['Treatment']):
        data = _data.load_data(exp_nm, 'ah6a3_remove_batch')
        is_control = False
    elif exp_nm in set(treat_control_df['Control']):
        data = _data.load_data(exp_nm, 'h6_anyindel')
        is_control = True

    if data is None:
        print('Error : could not load data')
        sys.exit(1)

    # NOTE(review): the library design is not used below; the call is kept
    # in case loading it validates exp_nm -- confirm and drop if not needed.
    _data.get_lib_design(exp_nm)

    indel_lens = range(-40, 15 + 1)
    positions = range(-25, 50)

    # Init: column-dicts whose key order fixes the CSV column order
    # (numeric columns first, then 'Name').
    pos_dd = {pos_idx: [] for pos_idx in positions}
    pos_dd['Name'] = []

    len_dd = {len_val: [] for len_val in indel_lens}
    len_dd['Name'] = []

    pos_len_indel = dict()
    for indel_len in indel_lens:
        pos_nt_indel = {pos_idx: [] for pos_idx in positions}
        pos_nt_indel['Name'] = []
        pos_len_indel[indel_len] = pos_nt_indel

    # Per-target frames are collected and concatenated once at the end.
    per_target_frames = []

    timer = util.Timer(total=len(data))
    for target_nm in data:
        df = data[target_nm]
        if 'Frequency' not in df.columns:
            df['Frequency'] = df['Count'] / np.sum(df['Count'])

        tot_count = sum(df['Count'])
        if tot_count < 100:
            continue

        # Copy so the annotation columns added below do not write into a
        # slice of the loaded data.
        dfs = df[df['Category'] != 'wildtype'].copy()

        target_pos_vector = defaultdict(int)
        target_pos_len_vectors = {
            indel_len: defaultdict(int) for indel_len in indel_lens
        }
        indel_len_vector = defaultdict(int)

        df_annot = defaultdict(list)
        for _, row in dfs.iterrows():
            indel_start = int(row['Indel start'])
            indel_end = int(row['Indel end'])
            mh_len = row['MH length']
            freq = row['Frequency']
            indel_len = int(row['Indel length'])
            cat = row['Category']

            if cat == 'del':
                # Deletions are keyed by negative length.
                indel_len = -indel_len
            elif cat == 'ins':
                # h6_anyindel in each dir describes indel_start and indel_end
                # for indexing (should have difference of 1 for insertions
                # but they do not), so we fix this here.
                indel_end = indel_start + 1

            # Gather indel length frequencies
            indel_len_vector[indel_len] += freq

            # Adjust start and end
            if not is_control:
                adj_indel_start, adj_indel_end = adjust_indel_pos(
                    indel_start, indel_end, mh_len)
            else:
                adj_indel_start, adj_indel_end = indel_start, indel_end

            # Gather total frequency by position, and by position within
            # this indel length when the length is one of the tabulated ones.
            per_len_vector = target_pos_len_vectors.get(indel_len)
            for jdx in range(adj_indel_start, adj_indel_end):
                target_pos_vector[jdx] += freq
                if per_len_vector is not None:
                    per_len_vector[jdx] += freq

            df_annot['Indel start adj'].append(adj_indel_start)
            df_annot['Indel end adj'].append(adj_indel_end)

        for col in df_annot:
            dfs[col] = df_annot[col]
        dfs['Name'] = target_nm
        per_target_frames.append(dfs)

        # Gather indel length frequencies
        for col in len_dd:
            if col != 'Name':
                len_dd[col].append(indel_len_vector[col])
            else:
                len_dd[col].append(target_nm)

        # Gather total frequency by position
        for col in pos_dd:
            if col != 'Name':
                pos_dd[col].append(target_pos_vector[col])
            else:
                pos_dd[col].append(target_nm)

        # Gather total frequency by position for each indel length
        for indel_len, pos_nt_indel in pos_len_indel.items():
            tpvn = target_pos_len_vectors[indel_len]
            for col in pos_nt_indel:
                if col != 'Name':
                    pos_nt_indel[col].append(tpvn[col])
                else:
                    pos_nt_indel[col].append(target_nm)

        timer.update()

    # Save
    pos_df = pd.DataFrame(pos_dd)
    pos_df.to_csv(out_dir + '%s_pos.csv' % (exp_nm))

    pos_df_melt = pd.melt(pos_df,
                          id_vars='Name',
                          var_name='Position',
                          value_name='Frequency')
    pos_df_melt.to_csv(out_dir + '%s_pos_melt.csv' % (exp_nm))

    # Save
    for indel_len, pos_nt_indel in pos_len_indel.items():
        pos_nt_df = pd.DataFrame(pos_nt_indel)
        pos_nt_df.to_csv(out_dir + '%s_pos_%snt.csv' % (exp_nm, indel_len))

        pos_nt_df_melt = pd.melt(pos_nt_df,
                                 id_vars='Name',
                                 var_name='Position',
                                 value_name='Frequency')
        pos_nt_df_melt.to_csv(out_dir + '%s_pos_melt_%snt.csv' %
                              (exp_nm, indel_len))

    # Save
    len_df = pd.DataFrame(len_dd)
    len_df.to_csv(out_dir + '%s_len.csv' % (exp_nm))

    len_df_melt = pd.melt(len_df,
                          id_vars='Name',
                          var_name='Indel length',
                          value_name='Frequency')
    len_df_melt.to_csv(out_dir + '%s_len_melt.csv' % (exp_nm))

    # merged
    if per_target_frames:
        mdf = pd.concat(per_target_frames, ignore_index=True)
    else:
        mdf = pd.DataFrame()
    mdf.to_csv(out_dir + '%s.csv' % (exp_nm))

    return
def adjust_batch_effects():
    """Detect batch-specific mutations across treatments and remove them.

    Steps:
      1. Load ``{treat_nm}.csv`` from ``inp_dir`` for every treatment whose
         name does not contain 'Cas9', tagging rows with treatment, batch
         and editor.
      2. For every (position, ref nt, obs nt) combination, excluding
         positions 22 and 23, run a one-way ANOVA of 'Mean activity' across
         batches. Results go to ``mutation_dec.csv``.
      3. Among combinations passing a Bonferroni-corrected threshold, keep
         those with a between-batch spread of at least 0.002 whose median
         per-batch mean stays at or below 0.0005 within both the CBE and ABE
         editor groups, and mark the batches whose mean is >= 0.002. The
         list goes to ``removed_batch_effects.csv``.
      4. Apply ``filter_mutations`` to each treatment's
         'ag4_poswise_be_adjust' data and pickle it to ``{treat_nm}.pkl``.

    All outputs are written to ``out_dir``.

    Returns:
        None.
    """
    # Gather statistics
    be_treatments = [
        s for s in treat_control_df['Treatment'] if 'Cas9' not in s
    ]

    per_treatment_frames = []
    timer = util.Timer(total=len(be_treatments))
    print('Loading stats from each condition...')
    for treat_nm in be_treatments:
        df = pd.read_csv(inp_dir + '%s.csv' % (treat_nm), index_col=0)
        df['Treatment'] = treat_nm
        df['Batch'] = exp_nm_to_batch[treat_nm]
        df['Editor'] = exp_nm_to_editor[treat_nm]
        per_treatment_frames.append(df)
        timer.update()
    # One concatenation at the end instead of growing a frame per iteration.
    if per_treatment_frames:
        mdf = pd.concat(per_treatment_frames, ignore_index=True)
    else:
        mdf = pd.DataFrame()
    mdf['Log mean activity'] = np.log10(mdf['Mean activity'])

    cbe_editors = set([e for e in mdf['Editor'] if 'ABE' not in e])
    abe_editors = set([e for e in mdf['Editor'] if 'ABE' in e])

    # ANOVA calculations
    from scipy.stats import f_oneway

    print(
        'Calculating ANOVA on each position+mutation combination to identify batch effects...'
    )
    dd = defaultdict(list)
    set_pos = set(mdf['Position'])
    set_ref_nt = set(mdf['Ref nt'])
    set_obs_nt = set(mdf['Obs nt'])
    timer = util.Timer(total=len(set_pos))
    for pos in set_pos:
        timer.update()
        # Positions 22 and 23 are never tested.
        if pos in [22, 23]:
            continue

        at_pos = (mdf['Position'] == pos)
        for ref_nt in set_ref_nt:
            for obs_nt in set_obs_nt:
                subset = mdf[at_pos & (mdf['Ref nt'] == ref_nt) &
                             (mdf['Obs nt'] == obs_nt)]
                if len(subset) == 0:
                    continue

                # One sample of 'Mean activity' values per batch.
                args = tuple([
                    subset[subset['Batch'] == batch_nm]['Mean activity']
                    for batch_nm in set(subset['Batch'])
                ])
                fstat, pval = f_oneway(*args)

                dd['pval'].append(pval)
                dd['Statistic'].append(fstat)
                dd['Position'].append(pos)
                dd['Ref nt'].append(ref_nt)
                dd['Obs nt'].append(obs_nt)

    stats_df = pd.DataFrame(dd)
    stats_df['-log10p'] = -np.log10(stats_df['pval'])

    # Apply Bonferroni p-value cutoff
    print(
        'Finding significant batch effects with a Bonferroni corrected p-value threshold...'
    )
    pval = 0.005
    bonf_threshold = pval / len(stats_df)
    stats_df['bonferroni accept'] = (stats_df['pval'] <= bonf_threshold)
    stats_df.to_csv(out_dir + 'mutation_dec.csv')

    # Identify mutations for removal: at mutations passing the Bonferroni
    # corrected ANOVA test, identify batches where mutations are frequent.
    print('Identifying batches to remove mutations from...')
    to_remove = stats_df[stats_df['bonferroni accept'] == True]

    def batch_means(frame):
        # Mean of 'Mean activity' per batch present in ``frame``.
        return {
            batch_nm: np.mean(
                frame[frame['Batch'] == batch_nm]['Mean activity'])
            for batch_nm in set(frame['Batch'])
        }

    min_effect_size = 0.002
    bg_threshold = 0.0005

    dd = defaultdict(list)
    timer = util.Timer(total=len(to_remove))
    for _, row in to_remove.iterrows():
        timer.update()
        pos = row['Position']
        ref_nt = row['Ref nt']
        obs_nt = row['Obs nt']
        subset = mdf[(mdf['Position'] == pos) & (mdf['Ref nt'] == ref_nt) &
                     (mdf['Obs nt'] == obs_nt)]

        means = batch_means(subset)
        mean_vals = list(means.values())

        # Ignore batch effects with small effect size
        if max(mean_vals) - min(mean_vals) < min_effect_size:
            continue

        # Batch effect should be enriched over a rare background when
        # controlling for editor type. The median of an empty group is NaN,
        # which never exceeds the threshold.
        cbe_means = batch_means(subset[subset['Editor'].isin(cbe_editors)])
        abe_means = batch_means(subset[subset['Editor'].isin(abe_editors)])
        cbe_median_means = np.median(list(cbe_means.values()))
        abe_median_means = np.median(list(abe_means.values()))
        if cbe_median_means > bg_threshold or abe_median_means > bg_threshold:
            continue

        for batch_nm in means:
            if means[batch_nm] >= 0.002:
                dd['Position'].append(pos)
                dd['Ref nt'].append(ref_nt)
                dd['Obs nt'].append(obs_nt)
                dd['Batch'].append(batch_nm)
    batch_muts_to_remove = pd.DataFrame(dd)
    batch_muts_to_remove.to_csv(out_dir + 'removed_batch_effects.csv')

    # Remove mutations
    print('Removing batch effects in each condition...')
    timer = util.Timer(total=len(be_treatments))
    for treat_nm in be_treatments:
        adj_d = _data.load_data(treat_nm, 'ag4_poswise_be_adjust')
        lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
        nm_to_seq = dict(zip(lib_design['Name (unique)'],
                             lib_design[seq_col]))

        batch = exp_nm_to_batch[treat_nm]
        to_remove = batch_muts_to_remove[batch_muts_to_remove['Batch'] ==
                                         batch]
        adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)

        with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
            pickle.dump(adj_d, f)
        timer.update()

    return
def form_data(exp_nm, start_idx, end_idx):
    """Annotate library design with total count, edited count, fraction
    edited, etc.

    Processes rows ``start_idx``..``end_idx`` (inclusive) of the library
    design that are present in the 'ag5a4_profile_subset' data. Each site's
    DataFrame gains 'Num. edits' and 'Has aberrant CBE edit' columns.

    Written to ``out_dir``:
      * ``{exp_nm}_{start_idx}_{end_idx}.pkl`` -- dict of the annotated
        per-site DataFrames
      * ``{exp_nm}_{start_idx}_{end_idx}_stats.csv`` -- per-site statistics
        outer-merged onto the library design

    Args:
        exp_nm: experiment name understood by ``_data``.
        start_idx: first library-design row (positional) to process.
        end_idx: last library-design row (positional) to process, inclusive.

    Returns:
        None.
    """
    data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
    lib_design, _ = _data.get_lib_design(exp_nm)

    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    nms = lib_design['Name (unique)']

    stats_dd = defaultdict(list)
    new_data = dict()

    nms_shared = [nm for nm in nms if nm in data]
    timer = util.Timer(total=len(nms_shared))
    for nm in nms_shared:
        df = data[nm]
        if 'index' in df.columns:
            df = df[[col for col in df.columns if col != 'index']]

        if len(df) == 0:
            continue

        ## Row-wise statistics
        # Every column except 'Count' is a nucleotide column whose name
        # starts with the reference nucleotide.
        mut_cols = [col for col in df.columns if col != 'Count']
        col_to_ref_nt = {col: col[0] for col in mut_cols}

        df_dd = defaultdict(list)
        for _, row in df.iterrows():
            df_dd['Num. edits'].append(get_num_edits(row, col_to_ref_nt))
            df_dd['Has aberrant CBE edit'].append(
                has_aberrant_cbe_edits(row, col_to_ref_nt))
        for col in df_dd:
            df[col] = df_dd[col]

        new_data[nm] = df

        ## Overall statistics
        edited_ct = sum(df[df['Num. edits'] > 0]['Count'])
        cbe_aberrant_ct = sum(df[df['Has aberrant CBE edit'] == True]['Count'])
        total_ct = sum(df['Count'])

        # Explicit zero check: the sums are numpy scalars, for which division
        # by zero yields inf/nan with a warning rather than raising
        # ZeroDivisionError.
        has_reads = total_ct != 0
        frac_edited = edited_ct / total_ct if has_reads else np.nan
        frac_aberrant = cbe_aberrant_ct / total_ct if has_reads else np.nan

        stats_dd['Name (unique)'].append(nm)
        stats_dd['Edited count'].append(edited_ct)
        stats_dd['CBE aberrant count'].append(cbe_aberrant_ct)
        stats_dd['Total count'].append(total_ct)
        stats_dd['Fraction edited'].append(frac_edited)
        stats_dd['Fraction CBE aberrant edit'].append(frac_aberrant)

        timer.update()

    # Save
    with open(out_dir + '%s_%s_%s.pkl' % (exp_nm, start_idx, end_idx),
              'wb') as f:
        pickle.dump(new_data, f)

    stats_df_collected = pd.DataFrame(stats_dd)

    stats_df = lib_design.merge(
        stats_df_collected,
        on='Name (unique)',
        how='outer',
    )
    stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' %
                    (exp_nm, start_idx, end_idx))
    return
def check(): _ = _data.load_data() # 動作確認用に呼ぶだけ呼んでおく create_model().check()