def main():
    '''
    Merge the per-split stats CSVs of each 12kChar treatment into one CSV.

    Iterates over the rows of the module-level treat_control_df, skips
    treatments whose name contains 'Cas9' or 'U2OS' and any treatment whose
    library is not '12kChar', then reads the six split files
    <inp_dir><treat_nm>_<start>_<end>_stats.csv and writes their row-wise
    concatenation to <inp_dir><treat_nm>.csv.
    '''
    print(NAME)

    num_targets = 12000
    num_targets_per_split = 2000

    for _, row in treat_control_df.iterrows():
        treat_nm = row['Treatment']
        if 'Cas9' in treat_nm:
            continue
        lib_nm = _data.get_lib_nm(treat_nm)
        if lib_nm != '12kChar':
            continue
        if 'U2OS' in treat_nm:
            continue

        print(treat_nm)

        # Collect the splits and concatenate once: DataFrame.append copied the
        # accumulated frame on every call and no longer exists in pandas >= 2.0.
        split_dfs = []
        for start_idx in range(0, num_targets, num_targets_per_split):
            end_idx = start_idx + num_targets_per_split - 1
            stats_fn = inp_dir + '%s_%s_%s_stats.csv' % (
                treat_nm, start_idx, end_idx)
            split_dfs.append(pd.read_csv(stats_fn, index_col=0))

        stats_df = pd.concat(split_dfs, ignore_index=True)
        stats_df.to_csv(inp_dir + '%s.csv' % (treat_nm))

    print('Done')
    return
def remove_batch_effects(treat_nm, start_idx, end_idx):
    '''
    Write the batch-effect-filtered data for one split of a treatment.

    Reads <inp_dir>removed_batch_effects_<lib_nm>.csv. If that table is
    empty, the upstream pickle for this split is copied unchanged into
    out_dir. Otherwise the rows for this treatment's batch and for target
    sites inside the split [start_idx, end_idx] are passed to
    filter_mutations(), and the result is pickled to
    <out_dir><treat_nm>_<start_idx>_<end_idx>.pkl.

    treat_nm: treatment / experiment name, a key of exp_nm_to_batch.
    start_idx, end_idx: inclusive row range of the library design.
    '''
    # Function-scope import: the file's import block is not visible from here.
    import shutil

    batch_nm = exp_nm_to_batch[treat_nm]
    lib_design, seq_col = _data.get_lib_design(treat_nm)
    lib_nm = _data.get_lib_nm(treat_nm)

    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = dict(zip(nms, seqs))

    out_pkl = out_dir + f'{treat_nm}_{start_idx}_{end_idx}.pkl'

    batch_muts_to_remove = pd.read_csv(
        inp_dir + 'removed_batch_effects_%s.csv' % (lib_nm), index_col=0)

    if len(batch_muts_to_remove) == 0:
        # Nothing to remove: pass the upstream split through unchanged.
        # shutil.copy replaces the former `cp` shell string, which broke on
        # paths containing spaces or shell metacharacters.
        inp_pkl = _config.OUT_PLACE + f'ah6a1b_subtract/{treat_nm}_{start_idx}_{end_idx}.pkl'
        shutil.copy(inp_pkl, out_pkl)
        return

    # Loaded only now, so the pass-through path above skips the load entirely.
    adj_d = _data.load_data(treat_nm, 'ah6a1b_subtract')

    # Remove mutations
    to_remove = batch_muts_to_remove[batch_muts_to_remove['Batch'] == batch_nm]
    to_remove = to_remove[to_remove['Name'].isin(nms)]
    adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)

    with open(out_pkl, 'wb') as f:
        pickle.dump(adj_d, f)
    return
def main():
    '''
    Merge the per-split pickles of every non-Cas9 treatment into one pickle.

    For each treatment the split files <inp_dir><treat_nm>_<start>_<end>.pkl
    are loaded in order and their keys merged into a single mapping, which is
    written to <inp_dir><treat_nm>.pkl. LibA uses 2000 sites in splits of
    200; every other library uses 12000 sites in splits of 2000.
    '''
    print(NAME)

    for _, design_row in treat_control_df.iterrows():
        treat_nm = design_row['Treatment']
        if 'Cas9' in treat_nm:
            continue

        is_lib_a = _data.get_lib_nm(treat_nm) == 'LibA'
        num_target_sites = 2000 if is_lib_a else 12000
        num_sites_per_split = 200 if is_lib_a else 2000

        print(treat_nm)

        merged = None
        for first in range(0, num_target_sites, num_sites_per_split):
            last = first + num_sites_per_split - 1
            split_fn = inp_dir + '%s_%s_%s.pkl' % (treat_nm, first, last)
            with open(split_fn, 'rb') as f:
                split_d = pickle.load(f)

            if merged is None:
                # The first split becomes the accumulator itself.
                merged = split_d
            else:
                # Keys from later splits overwrite earlier ones on collision.
                merged.update(split_d)

        # Data
        with open(inp_dir + '%s.pkl' % (treat_nm), 'wb') as f:
            pickle.dump(merged, f)

    print('Done')
    return
def gather_statistics(exp_nm, params):
    '''
    Train and save models of mutation frequency for one experiment.

    exp_nm: experiment name; <inp_dir><exp_nm>.csv is read.
    params: tuple (muts, allowed_pos, feature_radius) where
        muts maps a mutation-group name to a list of 'Ref_Obs' mutation
            strings that are summed together,
        allowed_pos is the collection of positions to keep,
        feature_radius is the number of nucleotides taken on each side of the
            edited position as local sequence context.

    For every mutation group and every ml_task in
    ['classify_zero', 'regress_nonzero'] the function calls train_models()
    and save_results(). Nothing is returned.
    '''
    (muts, allowed_pos, feature_radius) = params

    # Load data
    data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col=0)

    # Set up library info
    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nm_to_seq = dict(zip(lib_design['Name (unique)'], lib_design[seq_col]))

    # Prepare data. The explicit .copy() calls make each filtered frame
    # independent, so the column assignments below are unambiguous (no
    # SettingWithCopyWarning).
    data = data[data['Total count'] >= 100].copy()
    data['Frequency'] = data['Count'] / data['Total count']

    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    data = data[data['Name'].isin(ontarget_sites)]
    data = data[data['Position'].isin(allowed_pos)].copy()
    data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

    # Annotate with local sequence context: feature_radius nucleotides on
    # each side of the position, excluding the position itself.
    lib_zero_idx = _data.pos_to_idx(0, exp_nm)
    print('Annotating data with local sequence contexts...')
    timer = util.Timer(total=len(data))
    local_contexts = []
    # Only two columns are needed, so iterate them directly instead of
    # building a Series per row with iterrows().
    for nm, pos in zip(data['Name'], data['Position']):
        seq = nm_to_seq[nm]
        pidx = pos + lib_zero_idx
        local_contexts.append(
            seq[pidx - feature_radius:pidx] + seq[pidx + 1:pidx + feature_radius + 1])
        timer.update()
    # Assigned unconditionally so the column exists even when no rows remain.
    data['Local context'] = local_contexts

    # Gather statistics
    summed_away_cols = ['Frequency', 'Obs nt', 'Ref nt', 'Count']
    for mut_nm, mut in muts.items():
        print(mut_nm)
        if len(mut) == 1:
            d_temp = data[data['Mutation'] == mut[0]].copy()
        else:
            d_temp = data[data['Mutation'].isin(mut)].copy()

        # Relabel the group's mutations with the group name, then sum their
        # frequencies per (site, position, ...) combination.
        d_temp['Mutation'] = mut_nm
        d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']

        group_cols = [s for s in d_temp.columns if s not in summed_away_cols]
        d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

        for ml_task in ['classify_zero', 'regress_nonzero']:
            print(ml_task)
            results = train_models(exp_nm, d_temp, mut_nm, ml_task)
            save_results(exp_nm, mut_nm, ml_task, results)

    return
def load_human_data(dataset_id):
    '''
    Load 50-nt target sequences (X), labels (Y) and target names (NAMES).

    Y is read from the gzipped pickle
    <OUT_PLACE>combin_data_Y_imputewt/<dataset_id>.pkl.gz; its keys become
    NAMES and its values become Y, in the mapping's iteration order. For each
    name the library sequence is cut to the window
    [zero_idx - 19, zero_idx + 30] around the protospacer zero index.

    Dataset ids containing 'CSNVL' combine the 12kChar, AtoG and CtoT library
    designs. Sequences whose zero index is below 19 are first extended on the
    left with a fixed prefix so that the window fits.

    Returns (X, Y, NAMES) as three lists of equal length.
    '''
    is_combined = 'CSNVL' in dataset_id

    if is_combined:
        # Use any conds to load 12kChar, CtoT, and AtoG libs
        nms, seqs = [], []
        for did in ['190418_mES_12kChar_AID', '190329_HEK293T_AtoG_ABE', '190307_HEK_CtoT_BE4']:
            lib_design, seq_col = _data.get_lib_design(did)
            nms.extend(lib_design['Name (unique)'])
            seqs.extend(lib_design[seq_col])
    else:
        # lib_nm is not used below; the lookup is retained from the original.
        lib_nm = _data.get_lib_nm(dataset_id)
        lib_design, seq_col = _data.get_lib_design(dataset_id)
        nms = lib_design['Name (unique)']
        seqs = lib_design[seq_col]
    nm_to_seq = dict(zip(nms, seqs))

    Y_dir = _config.OUT_PLACE + 'combin_data_Y_imputewt/'
    with gzip.open(Y_dir + '%s.pkl.gz' % (dataset_id), 'rb') as f:
        Y = pickle.load(f)

    NAMES = list(Y.keys())
    Y = list(Y.values())

    # Load X
    if is_combined:
        # 'satmut' names use the 12kChar zero position; all others use the
        # CtoT zero position (the original comment notes CtoT = AtoG).
        zero_idxs = [
            _data.zero_pos['12kChar'] if 'satmut' in nm else _data.zero_pos['CtoT']
            for nm in NAMES
        ]
    else:
        zero_idxs = [_data.pos_to_idx(0, dataset_id)] * len(NAMES)

    left_flank = 9 + 10
    right_flank = 20 + 10
    prefix = 'GATGGGTGCGACGCGTCAT'

    X = []
    timer = _util.Timer(total=len(NAMES))
    for nm, zero_idx in zip(NAMES, zero_idxs):
        seq = nm_to_seq[nm]
        if zero_idx < left_flank:
            # CtoT, AtoG libs: too little sequence left of the protospacer,
            # so pad with the fixed prefix and shift the zero index with it.
            seq = prefix + seq
            zero_idx += len(prefix)
        seq_50nt = seq[zero_idx - left_flank:zero_idx + right_flank + 1]
        assert len(seq_50nt) == 50
        X.append(seq_50nt)

    return X, Y, NAMES
def generate_train_test(X, Y, NAMES, dataset_id, train_test_id, valid_frac = 0.10):
    '''
    Split X, Y, NAMES into train / validation / test datasets.

    The split is read from
    <OUT_PLACE>gen_traintest_idxs/<lib_nm>_<train_test_id>.csv, whose
    'Category' column labels each 'Name' as 'Train' or 'Test'. Dataset ids
    containing 'CSNVL' use the 12kChar split. Names that are not in NAMES are
    ignored.

    The validation set is the last valid_frac of the training names; the
    remaining training names are then truncated to
    hyperparameters['training_fraction'] of their number.

    Returns (datasets, dataset_sizes, x_dim, y_mask_dim) where datasets and
    dataset_sizes are dicts keyed by 'train', 'valid' and 'test'.
    '''
    if 'CSNVL' not in dataset_id:
        lib_nm = _data.get_lib_nm(dataset_id)
    else:
        # Use traintest for 12kChar
        lib_nm = '12kChar'

    tt_df = pd.read_csv(_config.OUT_PLACE + 'gen_traintest_idxs/%s_%s.csv' % (lib_nm, train_test_id), index_col = 0)

    # One pass over NAMES instead of NAMES.index(nm) per name (quadratic).
    # setdefault keeps the first occurrence, matching list.index.
    name_to_idx = {}
    for idx, nm in enumerate(NAMES):
        name_to_idx.setdefault(nm, idx)

    def category_idxs(category):
        # dict.fromkeys de-duplicates while keeping file order, so the split
        # is reproducible between runs (set iteration order is not).
        cat_nms = tt_df[tt_df['Category'] == category]['Name']
        return [name_to_idx[nm] for nm in dict.fromkeys(cat_nms) if nm in name_to_idx]

    train_idxs = category_idxs('Train')
    test_idxs = category_idxs('Test')

    # Validation set is last % of training set.
    # Split at an explicit index: the former [-num_valid:] / [:-num_valid]
    # slices put EVERYTHING into validation and left training empty whenever
    # num_valid was 0.
    num_valid = int(len(train_idxs) * valid_frac)
    num_valid = max(0, min(num_valid, len(train_idxs)))
    split_at = len(train_idxs) - num_valid
    valid_idxs = train_idxs[split_at:]
    train_idxs = train_idxs[:split_at]

    # Optional: subset training set
    train_idxs = train_idxs[:int(hyperparameters['training_fraction'] * len(train_idxs))]

    print(f'Training set size: {len(train_idxs)}')
    print(f'Validation set size: {len(valid_idxs)}')
    print(f'Test set size: {len(test_idxs)}')
    print(f'Total size: {len(train_idxs) + len(valid_idxs) + len(test_idxs)}')

    def take(values, idxs):
        return [values[idx] for idx in idxs]

    split_idxs = {'train': train_idxs, 'valid': valid_idxs, 'test': test_idxs}
    datasets = {
        split: BaseEditing_Dataset(
            x = take(X, idxs),
            y = take(Y, idxs),
            nms = take(NAMES, idxs),
        )
        for split, idxs in split_idxs.items()
    }

    x_dim = datasets['train'].x_dim
    y_mask_dim = datasets['train'].y_mask_dim

    dataset_sizes = {split: len(idxs) for split, idxs in split_idxs.items()}

    return datasets, dataset_sizes, x_dim, y_mask_dim
def main():
    '''
    Merge the per-split pickles and stats CSVs of every non-Cas9 treatment.

    For each treatment, the split files <inp_dir><treat_nm>_<start>_<end>.pkl
    are merged key-by-key into <inp_dir><treat_nm>.pkl, and the split files
    <inp_dir><treat_nm>_<start>_<end>_stats.csv are concatenated row-wise
    into <inp_dir><treat_nm>.csv. Split layout per library: LibA 2000 sites
    in splits of 200, CtoGA 4000 in splits of 500, all others 12000 in
    splits of 2000.
    '''
    print(NAME)

    for _, row in treat_control_df.iterrows():
        treat_nm = row['Treatment']
        if 'Cas9' in treat_nm:
            continue

        lib_nm = _data.get_lib_nm(treat_nm)
        if lib_nm == 'LibA':
            num_targets, num_targets_per_split = 2000, 200
        elif lib_nm == 'CtoGA':
            num_targets, num_targets_per_split = 4000, 500
        else:
            num_targets, num_targets_per_split = 12000, 2000

        print(treat_nm)

        data = None
        split_stats = []
        for start_idx in range(0, num_targets, num_targets_per_split):
            end_idx = start_idx + num_targets_per_split - 1

            data_fn = inp_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, end_idx)
            with open(data_fn, 'rb') as f:
                temp_d = pickle.load(f)
            if data is None:
                data = temp_d
            else:
                # Later splits overwrite earlier ones on key collision.
                data.update(temp_d)

            stats_fn = inp_dir + '%s_%s_%s_stats.csv' % (treat_nm, start_idx, end_idx)
            split_stats.append(pd.read_csv(stats_fn, index_col=0))

        # Data
        with open(inp_dir + '%s.pkl' % (treat_nm), 'wb') as f:
            pickle.dump(data, f)

        # Concatenate once: DataFrame.append copied the accumulated frame on
        # every call and no longer exists in pandas >= 2.0.
        stats_df = pd.concat(split_stats, ignore_index=True)
        stats_df.to_csv(inp_dir + '%s.csv' % (treat_nm))

    print('Done')
    return
def gather_statistics(exp_nm):
    '''
    Train and save a model of the base-edit-to-indel ratio for one condition.

    Reads <inp_dir>_batch_adjusted_all_ratios-ps0_1bpcorrect.csv, keeps the
    rows of condition exp_nm that belong to on-target sites, annotates each
    row with the 30-nt window [zero_idx - 9, zero_idx + 20] of its library
    sequence, then calls train_models() on the column
    'Log10 batch-adjusted base edit to indel ratio' and save_results().
    '''
    # Load data
    all_ratios = pd.read_csv(inp_dir + '_batch_adjusted_all_ratios-ps0_1bpcorrect.csv', index_col=0)
    data = all_ratios[all_ratios['Condition'] == exp_nm]

    # Set up library info
    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nm_to_seq = dict(zip(lib_design['Name (unique)'], lib_design[seq_col]))

    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    data = data[data['Name (unique)'].isin(ontarget_sites)]

    # Annotate with local sequence context
    print('Annotating data with local sequence contexts...')
    timer = util.Timer(total=len(data))
    contexts = []
    for site_nm in data['Name (unique)']:
        seq = nm_to_seq[site_nm]
        # The zero index is looked up per site.
        zero_idx = _data.pos_to_idx_safe(0, exp_nm, site_nm)
        contexts.append(seq[zero_idx - 9:zero_idx + 20 + 1])
        timer.update()
    if contexts:
        # With no rows left the column is not created, as before.
        data['Local context'] = contexts

    print(data.shape)

    results = train_models(exp_nm, data, 'Log10 batch-adjusted base edit to indel ratio')
    save_results(exp_nm, results)

    return
def main():
    '''
    Merge the per-split stats CSVs of each CtoGA treatment into one CSV.

    Reads <inp_dir><treat_nm>_<start>_<end>_stats.csv for every split and
    writes the row-wise concatenation to <inp_dir><treat_nm>.csv.

    NOTE: split layouts are listed for CtoT, AtoG and CtoGA, but only CtoGA
    treatments are processed, exactly as before. Remove the CtoGA-only
    filter below to process the other two libraries as well.
    '''
    print(NAME)

    # lib_nm -> (number of target sites, sites per split)
    split_params = {
        'CtoT': (12000, 2000),
        'AtoG': (12000, 2000),
        'CtoGA': (4000, 500),
    }

    for _, row in treat_control_df.iterrows():
        treat_nm = row['Treatment']
        if 'Cas9' in treat_nm:
            continue

        lib_nm = _data.get_lib_nm(treat_nm)
        if lib_nm not in split_params:
            continue
        # CtoGA-only filter; it made the CtoT/AtoG branch unreachable in the
        # original and is preserved here.
        if lib_nm != 'CtoGA':
            continue

        end_idx, jump = split_params[lib_nm]

        print(treat_nm)

        # Collect the splits and concatenate once: DataFrame.append copied the
        # accumulated frame on every call and no longer exists in pandas >= 2.0.
        split_dfs = []
        for start_idx in range(0, end_idx, jump):
            stats_fn = inp_dir + '%s_%s_%s_stats.csv' % (treat_nm, start_idx, start_idx + jump - 1)
            split_dfs.append(pd.read_csv(stats_fn, index_col=0))

        # Data
        stats_df = pd.concat(split_dfs, ignore_index=True)
        stats_df.to_csv(inp_dir + '%s.csv' % (treat_nm))

    print('Done')
    return
def main():
    '''
    Merge per-split CSVs of every non-Cas9 treatment and pivot C->T editing.

    For each treatment the six split files
    <inp_dir><treat_nm>_<start>_<end>.csv are concatenated row-wise and
    written to <inp_dir><treat_nm>.csv. The rows with 'Ref nt' == 'C' and
    'Obs nt' == 'T' are then pivoted to a 'Target site' x 'Position' table of
    'Frequency' and written to <inp_dir>poswise_editing_<treat_nm>.csv.
    '''
    print(NAME)

    num_targets = 12000
    num_targets_per_split = 2000

    for _, row in treat_control_df.iterrows():
        treat_nm = row['Treatment']
        if 'Cas9' in treat_nm:
            continue

        print(treat_nm)

        # Collect the splits and concatenate once: DataFrame.append copied the
        # accumulated frame on every call and no longer exists in pandas >= 2.0.
        split_dfs = []
        for start_idx in range(0, num_targets, num_targets_per_split):
            stats_fn = inp_dir + '%s_%s_%s.csv' % (treat_nm, start_idx, start_idx + num_targets_per_split - 1)
            split_dfs.append(pd.read_csv(stats_fn, index_col = 0))

        # Data
        stats_df = pd.concat(split_dfs, ignore_index = True)
        stats_df.to_csv(inp_dir + '%s.csv' % (treat_nm))

        # Pivot C->T
        crit = (stats_df['Ref nt'] == 'C') & (stats_df['Obs nt'] == 'T')
        ct_df = stats_df[crit]
        pv_df = ct_df.pivot(
            index = 'Target site',
            columns = 'Position',
            values = 'Frequency',
        )
        pv_df.to_csv(inp_dir + 'poswise_editing_%s.csv' % (treat_nm))

    print('Done')
    return
def gather_statistics(celltype, lib_nm, editor_nm):
    '''
    Compare 'Fraction edited' between the two replicates of one condition.

    The two replicate tables <inp_dir><rep>.csv are restricted to on-target
    sites and inner-merged on the site identifier columns. Three columns are
    added: 'absdiff' (absolute difference of the replicate fractions),
    'abslfc' (absolute log2 fold change) and 'Total n' (sum of the replicate
    'Total count' values). The result is written to
    <out_dir><celltype>_<lib_nm>_<editor_nm>.csv.

    Note that lib_nm is re-derived from the first replicate before it is
    used for the on-target lookup and the output file name.
    '''
    print(celltype, lib_nm, editor_nm)

    rep1, rep2 = _data.get_replicates(celltype, lib_nm, editor_nm)
    rep_dfs = [
        pd.read_csv(inp_dir + '%s.csv' % (rep), index_col=0)
        for rep in (rep1, rep2)
    ]

    lib_nm = _data.get_lib_nm(rep1)
    lib_design, seq_col = _data.get_lib_design(rep1)
    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)

    # Prepare data
    df1, df2 = [df[df['Name (unique)'].isin(ontarget_sites)] for df in rep_dfs]

    mdf = df1.merge(
        df2,
        on=['Name (unique)', 'gRNA (20nt)', seq_col],
        suffixes=['_r1', '_r2'],
    )

    stat_col = 'Fraction edited'
    frac_r1 = mdf['%s_r1' % (stat_col)]
    frac_r2 = mdf['%s_r2' % (stat_col)]
    mdf['absdiff'] = np.abs(frac_r1 - frac_r2)
    mdf['abslfc'] = np.abs(np.log2(frac_r1) - np.log2(frac_r2))

    n_col = 'Total count'
    mdf['Total n'] = mdf['%s_r1' % (n_col)] + mdf['%s_r2' % (n_col)]

    mdf.to_csv(out_dir + '%s_%s_%s.csv' % (celltype, lib_nm, editor_nm))
    return
def train_models(exp_nm, data, ml_task, seq_col):
    '''
    Grid-search a gradient-boosted regressor and evaluate it on train / test.

    exp_nm: experiment name; used for the library lookup and output names.
    data: DataFrame with a 'Y' column (rows with NaN 'Y' are dropped) and a
        'Name (unique)' column used to merge predictions back in.
    ml_task: only 'regress_nonzero' is supported.
    seq_col: passed through to featurize().

    Side effects: writes <out_dir><exp_nm>_hyperparamresults.csv (the grid
    search results) and <out_dir><exp_nm>_bestmodel.pkl (the best estimator).

    Returns (ms_df, data): a one-row DataFrame of evaluation metrics with
    columns sorted by name, and data merged with the train / test
    predictions.

    Raises ValueError for any other ml_task. Previously such a call ran the
    whole grid search and then failed with a NameError because no evaluation
    metrics had been defined.
    '''
    from sklearn.model_selection import GridSearchCV

    # Prepare models and data
    if ml_task != 'regress_nonzero':
        raise ValueError('Unsupported ml_task: %s' % (ml_task))

    # Each metric takes (truth, prediction, sample weights).
    evals = {
        'spearmanr': lambda t, p, w: spearmanr(t, p)[0],
        'pearsonr': lambda t, p, w: pearsonr(t, p)[0],
        'pearsonr weighted': lambda t, p, w: weighted_pearsonr(t, p, w),
        'r2_score weighted': lambda t, p, w: sklearn.metrics.r2_score(t, p, sample_weight = w),
        'r2_score unweighted': lambda t, p, w: sklearn.metrics.r2_score(t, p),
    }
    data = data[~np.isnan(data['Y'])]
    data = data.reset_index(drop = True)

    # Prepare additional features
    # (A leftover interactive code.interact() debugging shell used to sit
    # here; it stalled every non-interactive run and has been removed.)
    (X_all, param_nms) = featurize(data, exp_nm, seq_col)

    # Train test split
    lib_nm = _data.get_lib_nm(exp_nm)
    package = get_traintest_package(X_all, data, lib_nm)
    (x_train, x_test, y_train, y_test, w_train, w_test, nms_train, nms_test) = package

    # Train models
    ms_dd = defaultdict(list)
    ms_dd['Name'].append(exp_nm)
    model_nm = 'GBTR'

    # Hyperparameter optimization: 3 * 2 * 4 = 24 settings x 5 folds = 120
    # fits. The original author's note estimated approx. 20 seconds per fit.
    hyperparameters = {
        'n_estimators': [100, 250, 500],
        'min_samples_leaf': [2, 5],
        'max_depth': [2, 3, 4, 5],
    }

    model = GridSearchCV(
        GradientBoostingRegressor(),
        hyperparameters,
        cv = 5,
        verbose = True,
    )
    model.fit(x_train, y_train, sample_weight = w_train)

    gscv_df = pd.DataFrame(model.cv_results_)
    gscv_df.to_csv(out_dir + '%s_hyperparamresults.csv' % (exp_nm))

    with open(out_dir + '%s_bestmodel.pkl' % (exp_nm), 'wb') as f:
        pickle.dump(model.best_estimator_, f)

    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)

    # Store model performance stats in modelstats_dd.
    # A metric that raises ValueError is recorded as NaN instead of aborting.
    for ml_eval_nm, eval_f in evals.items():
        try:
            ev = eval_f(y_train, pred_train, w_train)
        except ValueError:
            ev = np.nan
        ms_dd['%s %s train' % (model_nm, ml_eval_nm)].append(ev)

        try:
            ev = eval_f(y_test, pred_test, w_test)
        except ValueError:
            ev = np.nan
        ms_dd['%s %s test' % (model_nm, ml_eval_nm)].append(ev)

    # Record predictions in data
    pred_df = pd.DataFrame({
        'Name (unique)': nms_train + nms_test,
        'y_pred_%s' % (model_nm): list(pred_train) + list(pred_test),
        'TrainTest_%s' % (model_nm): ['train'] * len(nms_train) + ['test'] * len(nms_test)
    })
    data = data.merge(pred_df, on = 'Name (unique)')

    ms_df = pd.DataFrame(ms_dd)
    ms_df = ms_df.reindex(sorted(ms_df.columns), axis = 1)
    return (ms_df, data)
def fig_editing_profiles(treat_nm):
    '''
    Summarise per-position mutation activity for one treatment.

    g4 format: data is a dict, keys = target site names
        values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations

    The pickled dict <inp_dir><treat_nm>.pkl is expanded into one long table
    with a row per (on-target site, position, non-reference nucleotide),
    keeping rows with 'Total count' >= 100. For every
    (position, ref nt, obs nt) combination the function records how many
    rows have zero frequency, the total number of rows and the mean
    frequency.

    Writes two files:
        <out_dir><treat_nm>.csv                the raw summary
        <out_dir><treat_nm>_median_bg_adj.csv  the same summary after, per
            mutation type, subtracting the median 'Mean activity' over
            positions 25..34 and flooring at 0.
    '''
    # Context manager closes the handle; the bare open() leaked it.
    with open(inp_dir + '%s.pkl' % (treat_nm), 'rb') as f:
        adj_d = pickle.load(f)

    lib_design, seq_col = _data.get_lib_design(treat_nm)
    nm_to_seq = dict(zip(lib_design['Name (unique)'], lib_design[seq_col]))

    lib_nm = _data.get_lib_nm(treat_nm)
    ontarget_nms = set(_data.get_ontarget_sites(lib_design, lib_nm))

    # Form the long table: one row per site x position x non-reference nt.
    print('Forming long df...')
    dd = defaultdict(list)
    timer = util.Timer(total = len(adj_d))
    for nm in adj_d:
        timer.update()

        if nm not in ontarget_nms:
            continue

        pw = adj_d[nm]
        seq = nm_to_seq[nm]
        for jdx, nt_counts in enumerate(pw):
            tot = np.nansum(nt_counts)
            ref_nt = seq[jdx]
            ref_idx = nt_to_idx[ref_nt]
            # Depends only on jdx, so look it up once per position.
            pos = _data.idx_to_pos(jdx, treat_nm)
            for kdx, count in enumerate(nt_counts):
                if kdx == ref_idx:
                    continue

                dd['Count'].append(count)
                dd['Total count'].append(tot)
                dd['Obs nt'].append(nts[kdx])
                dd['Ref nt'].append(ref_nt)
                if tot == 0:
                    dd['Frequency'].append(np.nan)
                else:
                    dd['Frequency'].append(count / tot)
                dd['Position index'].append(jdx)
                dd['Position'].append(pos)
                dd['Name'].append(nm)

    df = pd.DataFrame(dd)
    df = df[df['Total count'] >= 100]
    # NOTE(review): this is the number of rows of the long table, not the
    # number of distinct target sites; confirm which one is intended.
    n_targetsites_in_condition = len(df)

    # Form stats_df
    dd = defaultdict(list)
    pos_range = sorted(set(df['Position index']))
    timer = util.Timer(total = len(pos_range))
    for pos_idx in pos_range:
        timer.update()
        df_s1 = df[df['Position index'] == pos_idx]
        pos = _data.idx_to_pos(pos_idx, treat_nm)
        for ref_nt in nts:
            df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
            for obs_nt in nts:
                if obs_nt == ref_nt:
                    continue

                dfs_freq = df_s2[df_s2['Obs nt'] == obs_nt]['Frequency']
                total = len(dfs_freq)
                if total == 0:
                    continue
                num_zeros = (dfs_freq == 0).sum()

                dd['Num target sites with zero for mutation'].append(num_zeros)
                dd['Total num target sites for mutation'].append(total)
                dd['Frequency of zero in target sites for mutation'].append(num_zeros / total)
                dd['Num target sites in condition'].append(n_targetsites_in_condition)
                dd['Mean activity'].append(np.mean(dfs_freq))
                dd['Position index'].append(pos_idx)
                dd['Position'].append(pos)
                dd['Obs nt'].append(obs_nt)
                dd['Ref nt'].append(ref_nt)

    hm_df = pd.DataFrame(dd)
    hm_df.to_csv(out_dir + '%s.csv' % (treat_nm))

    # Median normalize
    background_range = range(25, 34 + 1)
    for ref_nt in nts:
        for obs_nt in nts:
            if obs_nt == ref_nt:
                continue

            crit = (hm_df['Ref nt'] == ref_nt) & (hm_df['Obs nt'] == obs_nt) & (~np.isnan(hm_df['Mean activity']))
            medi = np.nanmedian(hm_df[crit & (hm_df['Position'].isin(background_range))]['Mean activity'])
            # Subtract the background median, never going below zero.
            hm_df.loc[crit, 'Mean activity'] = hm_df.loc[crit, 'Mean activity'].apply(lambda x: max(0, x - medi))

    hm_df.to_csv(out_dir + '%s_median_bg_adj.csv' % (treat_nm))

    return
def gather_statistics(exp_nm):
    '''
    Train and save models of cytosine edit-outcome ratios for one experiment.

    Reads <inp_dir><exp_nm>.csv, keeps on-target rows with
    'Total count' >= 100 at positions 3..8, annotates each with 10 nt of
    sequence context on each side, and pivots the C_T / C_G / C_A frequencies
    into one row per (Name, Position, Local context), filling missing
    outcomes with 0.

    Five ratios are then modelled in turn, each with the ml_tasks
    'regress_nonzero' and 'classify_zero':
        C_GA_over_C_D   (C_G + C_A) / (C_T + C_G + C_A)
        C_T_over_C_D    C_T / (C_T + C_G + C_A)
        C_G_over_C_D    C_G / (C_T + C_G + C_A)
        C_A_over_C_D    C_A / (C_T + C_G + C_A)
        C_G_over_C_GA   C_G / (C_A + C_G)

    Rows whose ratio is undefined (zero denominator) are dropped, and the
    drop is cumulative: a row removed for one ratio stays removed for all
    later ones.
    '''
    feature_radius = 10
    allowed_pos = range(3, 8 + 1)

    # Load data
    data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col=0)

    # Set up library info
    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nm_to_seq = dict(zip(lib_design['Name (unique)'], lib_design[seq_col]))

    # Prepare data. The explicit .copy() calls make each filtered frame
    # independent, so the column assignments below are unambiguous (no
    # SettingWithCopyWarning).
    data = data[data['Total count'] >= 100].copy()
    data['Frequency'] = data['Count'] / data['Total count']

    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    data = data[data['Name'].isin(ontarget_sites)]
    data = data[data['Position'].isin(allowed_pos)].copy()
    data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']

    # Annotate with local sequence context: feature_radius nucleotides on
    # each side of the position, excluding the position itself.
    lib_zero_idx = _data.pos_to_idx(0, exp_nm)
    print('Annotating data with local sequence contexts...')
    timer = util.Timer(total=len(data))
    local_contexts = []
    for nm, pos in zip(data['Name'], data['Position']):
        seq = nm_to_seq[nm]
        pidx = pos + lib_zero_idx
        local_contexts.append(
            seq[pidx - feature_radius:pidx] + seq[pidx + 1:pidx + feature_radius + 1])
        timer.update()
    data['Local context'] = local_contexts

    print(data.columns)
    print(set(data['Mutation']))

    acc_muts = [
        'C_T',
        'C_G',
        'C_A',
    ]
    data = data[data['Mutation'].isin(acc_muts)]
    data = data.drop(columns=['Count', 'Total count', 'Ref nt', 'Obs nt'])
    data = data.pivot_table(
        index=['Name', 'Position', 'Local context'],
        columns='Mutation',
        values='Frequency',
    ).reset_index()
    data = data.fillna(value=0)

    # (mut_name, numerator columns, denominator columns). Columns are summed
    # in the listed order, which is the order the original expressions used.
    all_c_edits = ['C_T', 'C_G', 'C_A']
    ratio_specs = [
        ('C_GA_over_C_D', ['C_G', 'C_A'], all_c_edits),
        ('C_T_over_C_D', ['C_T'], all_c_edits),
        ('C_G_over_C_D', ['C_G'], all_c_edits),
        ('C_A_over_C_D', ['C_A'], all_c_edits),
        ('C_G_over_C_GA', ['C_G'], ['C_A', 'C_G']),
    ]

    for mut_name, numer_cols, denom_cols in ratio_specs:
        numerator = sum(data[col] for col in numer_cols)
        denominator = sum(data[col] for col in denom_cols)
        data['Frequency'] = numerator / denominator
        # 0 / 0 gives NaN; those rows are dropped here and stay dropped for
        # every later ratio.
        data = data.dropna()
        data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + mut_name

        print(data.shape)
        for ml_task in ['regress_nonzero', 'classify_zero']:
            print(ml_task)
            results = train_models(exp_nm, data, mut_name, ml_task)
            save_results(exp_nm, mut_name, ml_task, results)

    return
def gen_qsubs():
    '''
    Generate qsub shell scripts and commands for easy parallelization.

    One script q_<script_id>_<exp_nm>_<start_idx>.sh is written into
    <QSUBS_DIR><NAME>/ for every split of every non-Cas9 treatment, and the
    matching qsub command lines are collected into _commands.sh in the same
    directory, which is then made executable. The memory request grows with
    the size of <inp_dir><exp_nm>.pkl; a missing file counts as size 0.

    A script is written for every split: no check for already-finished
    outputs is performed.
    '''
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)

    script_id = NAME.split('_')[0]
    qsub_commands = []
    num_scripts = 0

    for _, design_row in treat_control_df.iterrows():
        exp_nm = design_row['Treatment']
        lib_nm = _data.get_lib_nm(exp_nm)
        if 'Cas9' in exp_nm:
            continue

        if lib_nm == 'LibA':
            num_target_sites, num_sites_per_split = 2000, 200
        else:
            num_target_sites, num_sites_per_split = 12000, 2000

        try:
            mb_file_size = os.path.getsize(inp_dir + '%s.pkl' % (exp_nm)) / 1e6
        except FileNotFoundError:
            mb_file_size = 0

        # Memory request in GB, by input pickle size in MB.
        if mb_file_size > 1000:
            ram_gb = 16
        elif mb_file_size > 400:
            ram_gb = 8
        elif mb_file_size > 200:
            ram_gb = 4
        else:
            ram_gb = 2

        for start_idx in range(0, num_target_sites, num_sites_per_split):
            end_idx = start_idx + num_sites_per_split - 1
            command = 'python %s.py %s %s %s' % (NAME, exp_nm, start_idx, end_idx)

            # Write shell scripts
            sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, exp_nm, start_idx)
            with open(sh_fn, 'w') as f:
                f.write('#!/bin/bash\n%s\n' % (command))
            num_scripts += 1

            # Write qsub commands
            qsub_commands.append(
                'qsub -V -l h_rt=16:00:00,h_vmem=%sG -wd %s %s &' %
                (ram_gb, _config.SRC_DIR, sh_fn))

    # Save commands
    commands_fn = qsubs_dir + '_commands.sh'
    with open(commands_fn, 'w') as f:
        f.write('\n'.join(qsub_commands))

    subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def form_data(exp_nm, start_idx, end_idx):
    '''
    Compute genotype / amino-acid correction statistics for disease sites.

    exp_nm: experiment name.
    start_idx, end_idx: inclusive row range of the library design handled by
        this call (simple parallelisation by splitting the design).

    For every disease target site in the split that is present in the
    'ag5a4_profile_subset' data, the function counts total reads, edited
    reads and reads with the precise genotype correction. When the design row
    carries a string in 'AA sequence - reference', it also counts edited
    reads whose translated amino acids match the corrected (wild-type)
    sequence; otherwise the three amino-acid columns are NaN.

    The statistics are outer-merged onto the split's library design and
    written to <out_dir><exp_nm>_<start_idx>_<end_idx>_stats.csv.
    '''
    data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
    lib_design, seq_col = _data.get_lib_design(exp_nm)

    lib_nm = _data.get_lib_nm(exp_nm)
    disease_nms = _data.get_disease_sites(lib_design, lib_nm)

    # Subset for dumb parallelization, ensure only disease target sites used
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    lib_design = lib_design[lib_design['Name (unique)'].isin(disease_nms)]
    nms = lib_design['Name (unique)']

    stats_dd = defaultdict(list)

    nms_shared = [nm for nm in nms if nm in data]
    timer = util.Timer(total=len(nms_shared))
    for nm in nms_shared:
        df = data[nm]

        design_row = lib_design[lib_design['Name (unique)'] == nm].iloc[0]
        snp_pos = int(design_row['Position of SNP in gRNA'])
        correct_nt = design_row['Corrected nucleotide (gRNA orientation)']
        path_nt = design_row['Pathogenic nucleotide (gRNA orientation)']

        nt_cols = [
            col for col in df.columns if col != 'Count' and col != 'Frequency'
        ]

        # Impute . as wildtype
        df = impute_dot_as_wildtype(df, nt_cols)
        total_ct = sum(df['Count'])

        # Ensure each row is unique
        df = df.groupby(nt_cols)['Count'].agg('sum').reset_index()

        # Filter unedited columns
        df = subset_edited_rows(df, nt_cols)
        edited_ct = sum(df['Count'])

        df = remove_noisy_edits(df, nt_cols, exp_nm)

        gt_correct_ct = get_precise_gt_correction_count(
            df, nt_cols, snp_pos, correct_nt, path_nt)

        ## Overall statistics
        stats_dd['Name (unique)'].append(nm)

        stats_dd['Obs. correction count'].append(gt_correct_ct)
        stats_dd['Obs. total count'].append(total_ct)
        stats_dd['Obs. edited count'].append(edited_ct)

        stats_dd['Obs. gt correct fraction in all reads'].append(
            gt_correct_ct / total_ct if total_ct > 0 else np.nan)
        stats_dd['Obs. gt correct precision in edited reads'].append(
            gt_correct_ct / edited_ct if edited_ct > 0 else np.nan)
        stats_dd['Obs. editing frequency'].append(
            edited_ct / total_ct if total_ct > 0 else np.nan)

        # Amino acid correction for CtoGA
        has_aa_info = (
            'AA sequence - reference' in design_row.index
            and isinstance(design_row['AA sequence - reference'], str)
        )
        if has_aa_info:
            # '+' when the design orientation and the AA frame strand agree,
            # '-' otherwise.
            d1 = bool(design_row['Designed orientation w.r.t. genome'] == '+')
            d2 = bool(design_row['AA frame strand'] == '+')
            aa_strand_relative_to_seq = '+' if d1 == d2 else '-'

            aa_stats = {
                'Unedited AA': 0,
                'Edited AA': 0,
                'Goal AA': 0,
            }

            is_aa_changing = (design_row['AA sequence - pathogenic'] !=
                              design_row['AA sequence - reference'])
            if is_aa_changing and len(df) > 0:
                # The pathogenic and corrected reference translations do not
                # depend on the edited read, so compute them once per site
                # rather than once per row.
                aa_frame_pos = design_row['AA frame position']
                pp0idx = design_row['Protospacer position zero index']
                seq_30nt_path = design_row[seq_col][pp0idx - 9:pp0idx + 21]
                aa_path_with_bc = nts_to_aas(
                    seq_30nt_path, aa_frame_pos, snp_pos,
                    aa_strand_relative_to_seq)

                seq_30nt_wt = (seq_30nt_path[:9 + snp_pos]
                               + design_row['Corrected nucleotide (gRNA orientation)']
                               + seq_30nt_path[9 + snp_pos + 1:])
                aa_wt_with_bc = nts_to_aas(
                    seq_30nt_wt, aa_frame_pos, snp_pos,
                    aa_strand_relative_to_seq)

                for _, edit_row in df.iterrows():
                    seq_30nt = edit_row_to_seq_30nt(design_row, edit_row, seq_col)
                    obs_aas = nts_to_aas(seq_30nt, aa_frame_pos, snp_pos,
                                         aa_strand_relative_to_seq)

                    if obs_aas == aa_path_with_bc:
                        aa_stats['Unedited AA'] += edit_row['Count']
                    else:
                        aa_stats['Edited AA'] += edit_row['Count']

                    if obs_aas == aa_wt_with_bc:
                        aa_stats['Goal AA'] += edit_row['Count']

            stats_dd['Obs. aa correct precision among edited gts'].append(
                aa_stats['Goal AA'] / edited_ct if edited_ct > 0 else np.nan)
            stats_dd['Obs. aa correct precision among edited aas'].append(
                aa_stats['Goal AA'] / aa_stats['Edited AA']
                if aa_stats['Edited AA'] > 0 else np.nan)
            stats_dd['Obs. aa correct precision among all reads'].append(
                aa_stats['Goal AA'] / total_ct if total_ct > 0 else np.nan)
            # A leftover code.interact() debugging shell used to follow here,
            # guarded by a comparison of the two accumulated stats lists as
            # whole lists; it has been removed.
        else:
            stats_dd['Obs. aa correct precision among edited gts'].append(
                np.nan)
            stats_dd['Obs. aa correct precision among edited aas'].append(
                np.nan)
            stats_dd['Obs. aa correct precision among all reads'].append(
                np.nan)

        timer.update()

    # Save
    stats_df_collected = pd.DataFrame(stats_dd)

    stats_df = lib_design.merge(
        stats_df_collected,
        on='Name (unique)',
        how='outer',
    )

    stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' %
                    (exp_nm, start_idx, end_idx))
    return
def form_data(exp_nm, start_idx, end_idx):
    '''
    Annotate library design with simulated bystander precision statistics.

    exp_nm: experiment name.
    start_idx, end_idx: inclusive row range of the library design handled by
        this call.

    8/21/19: Simulate bystander precision task in 12kChar by using the
    substrate nucleotide closest to the editor-specific center nt.

    For every on-target site in the split that has data, the substrate
    column chosen by find_central_col() is treated as the intended edit.
    Precision is the read count of rows flagged by is_simulated_precise()
    divided by the read count of rows with at least one edit. Sites with an
    empty table or without a usable substrate column contribute no
    statistics.

    The statistics are outer-merged onto the split's library design and
    written to <out_dir><exp_nm>_<start_idx>_<end_idx>_stats.csv.
    '''
    data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
    lib_design, seq_col = _data.get_lib_design(exp_nm)

    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design = lib_design.iloc[start_idx : end_idx + 1]
    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    lib_design = lib_design[lib_design['Name (unique)'].isin(ontarget_sites)]

    nms = lib_design['Name (unique)']

    # Editor-dependent settings are the same for every site, so they are
    # resolved once here rather than inside the loop.
    editor = _data.get_editor_nm(exp_nm)
    editor_to_central_pos = {
        'ABE': 6,
        'ABE-CP': 6,
        'AID': 6,
        'BE4': 6,
        'BE4-CP': 8,
        'CDA': 5,
        'eA3A': 6,
        'evoAPOBEC': 5,
    }
    central_pos = editor_to_central_pos.get(editor, 6)
    substrate = 'A' if 'ABE' in editor else 'C'
    candidate_cols = [f'{substrate}{pos}' for pos in range(-3, 15)]

    stats_dd = defaultdict(list)

    nms_shared = [nm for nm in nms if nm in data]
    timer = util.Timer(total = len(nms_shared))
    for nm in nms_shared:
        df = data[nm]

        if 'index' in df.columns:
            df = df[[col for col in df.columns if col != 'index']]

        if len(df) == 0:
            continue

        nt_cols = [col for col in candidate_cols if col in df.columns]
        central_col = find_central_col(central_pos, nt_cols, substrate)
        if central_col is None:
            continue

        # Column names start with their reference nucleotide, e.g. 'C6'.
        mut_cols = [col for col in df.columns if col != 'Count']
        col_to_ref_nt = {col: col[0] for col in mut_cols}

        num_edits = []
        simulated_precise = []
        for _, row in df.iterrows():
            num_edits.append(get_num_edits(row, col_to_ref_nt))
            simulated_precise.append(is_simulated_precise(row, central_col, col_to_ref_nt))
        df['Num. edits'] = num_edits
        df['Simulated precise'] = simulated_precise

        numer = sum(df[df['Simulated precise'] == True]['Count'])
        edited_ct = sum(df[df['Num. edits'] > 0]['Count'])
        sim_precision = numer / edited_ct if edited_ct > 0 else np.nan

        central_col_pos = int(central_col[1:])
        stats_dd['Simulated bystander precision at editor-specific central nt'].append(sim_precision)
        stats_dd['Simulated bystander position'].append(central_col_pos)
        stats_dd['Simulated bystander position, distance to center'].append(central_col_pos - central_pos)
        stats_dd['Edited count'].append(edited_ct)
        stats_dd['Name (unique)'].append(nm)

        timer.update()

    stats_df_collected = pd.DataFrame(stats_dd)

    stats_df = lib_design.merge(
        stats_df_collected,
        on = 'Name (unique)',
        how = 'outer',
    )

    stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' % (exp_nm, start_idx, end_idx))
    return
def gen_qsubs():
    # Write one shell script per unfinished (treatment, control, split) job,
    # plus a master file of qsub submission commands.
    print('Generating qsub scripts...')
    script_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(script_dir)
    submit_lines = []

    # (total targets, targets per split) for libraries that differ from the default
    split_plan = {
        'LibA': (2000, 200),
        'CtoGA': (4000, 500),
    }
    script_id = NAME.split('_')[0]

    design_df = pd.read_csv(_config.DATA_DIR + 'treatment_control_design.csv', index_col=0)

    written = 0
    for _, design_row in design_df.iterrows():
        treat_nm = design_row['Treatment']
        control_nm = design_row['Control']
        lib_nm = _data.get_lib_nm(treat_nm)
        total_targets, split_size = split_plan.get(lib_nm, (12000, 2000))

        # Empirically determined
        #   pickle > 37 mb: needs 4 gb ram
        #   pickle > 335 mb: needs 8 gb ram
        print(treat_nm)
        size_mb = _data.check_file_size(treat_nm, 'ah6a1a_hf_bc')
        if size_mb > 1000:
            ram_gb = 16
        elif size_mb > 300:
            ram_gb = 8
        elif size_mb > 30:
            ram_gb = 4
        else:
            ram_gb = 2

        # Can be very slow - up to 8h+ for some conditions.
        # Could help to split 3 steps into 3 scripts.
        # Statistical tests should be performed globally (for accurate FDR
        # thresholds), and luckily these are the fast parts of the pipeline.
        # Subtracting control from treatment involves a lot of dataframe
        # manipulations and is the bottleneck step. Fortunately, this can be
        # parallelized.
        for start_idx in range(0, total_targets, split_size):
            end_idx = start_idx + split_size - 1

            # Only generate scripts for splits without a non-empty output pickle
            out_pkl_fn = out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, end_idx)
            finished = os.path.exists(out_pkl_fn) and os.path.getsize(out_pkl_fn) > 0
            if finished:
                continue

            command = 'python %s.py %s %s %s %s' % (NAME, treat_nm, control_nm, start_idx, end_idx)

            # Shell script for this split
            sh_fn = script_dir + 'q_%s_%s_%s_%s.sh' % (script_id, treat_nm, control_nm, start_idx)
            with open(sh_fn, 'w') as sh_file:
                sh_file.write('#!/bin/bash\n%s\n' % (command))
            written += 1

            # Matching qsub submission line
            submit_lines.append(
                'qsub -V -P regevlab -l h_rt=16:00:00,h_vmem=%sG -wd %s %s &' % (ram_gb, _config.SRC_DIR, sh_fn))

    # Persist every submission line into one executable file
    commands_fn = script_dir + '_commands.sh'
    with open(commands_fn, 'w') as commands_file:
        commands_file.write('\n'.join(submit_lines))

    subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)

    print('Wrote %s shell scripts to %s' % (written, script_dir))
def gen_qsubs():
    # Produce a qsub shell script for each unfinished (treatment, split) job
    # and collect the corresponding submission commands in '_commands.sh'.
    print('Generating qsub scripts...')
    target_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    def pick_ram_gb(size_mb):
        # Walk the ascending size cutoffs; the last one exceeded wins
        gb = 2
        for cutoff, tier in ((140, 4), (400, 8), (1000, 16)):
            if size_mb > cutoff:
                gb = tier
        return gb

    design = pd.read_csv(_config.DATA_DIR + 'treatment_control_design.csv', index_col=0)

    commands = []
    n_written = 0
    for _, entry in design.iterrows():
        treat_nm = entry['Treatment']
        if 'Cas9' in treat_nm:
            continue

        lib_nm = _data.get_lib_nm(treat_nm)
        if lib_nm == 'LibA':
            n_targets, per_split = 2000, 200
        elif lib_nm == 'CtoGA':
            n_targets, per_split = 4000, 500
        else:
            n_targets, per_split = 12000, 2000

        for start_idx in range(0, n_targets, per_split):
            end_idx = start_idx + per_split - 1

            # Skip splits whose output pickle already exists and is non-empty
            out_pkl_fn = out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, end_idx)
            if os.path.isfile(out_pkl_fn) and os.path.getsize(out_pkl_fn) > 0:
                continue

            command = 'python %s.py %s %s %s' % (NAME, treat_nm, start_idx, end_idx)
            script_id = NAME.split('_')[0]

            # A missing input file is treated as size 0 (smallest RAM tier)
            try:
                input_mb = _data.check_file_size(treat_nm, 'ag5a4_profile_subset')
            except FileNotFoundError:
                input_mb = 0
            ram_gb = pick_ram_gb(input_mb)

            # Shell script for this split
            sh_fn = target_dir + 'q_%s_%s_%s.sh' % (script_id, treat_nm, start_idx)
            with open(sh_fn, 'w') as script:
                script.write('#!/bin/bash\n%s\n' % (command))
            n_written += 1

            # Submission command for this split
            commands.append(
                'qsub -V -P regevlab -l h_rt=4:00:00,h_vmem=%sG -wd %s %s &' % (ram_gb, _config.SRC_DIR, sh_fn))

    # Save all submission commands into one executable file
    commands_fn = target_dir + '_commands.sh'
    with open(commands_fn, 'w') as out:
        out.write('\n'.join(commands))

    subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)

    print('Wrote %s shell scripts to %s' % (n_written, target_dir))
def adjust_batch_effects(lib_nm):
    '''
        Identify batch-specific mutations in one library and record which
        (batch, target, mutation) combinations should be removed.

        lib_nm: library name; only non-Cas9 treatments in treat_control_df
            whose _data.get_lib_nm() equals lib_nm are considered.

        Steps:
          1. Load the per-treatment pickle (inp_dir + '<treat_nm>.pkl') for
             every selected treatment.
          2. For each target and each mutation returned by form_dd, group
             the observed values by batch and run a one-way ANOVA
             (scipy.stats.f_oneway) across batches.
          3. Flag significant mutations with a rank-based FDR rule
             (threshold 0.01), excluding p-values above 0.995.
          4. For each flagged mutation, list every batch whose mean is at or
             above the across-batch mean, or at or above 0.005.

        Writes three CSVs to out_dir: 'mutation_dec_<lib_nm>.csv',
        'removed_batch_effects_<lib_nm>.csv' and 'removed_stats_<lib_nm>.csv'.
        Returns None.
    '''
    from scipy.stats import f_oneway

    print(lib_nm)

    # Gather statistics
    be_treatments = []
    batches = []    # unique batches in first-seen order, for stable CSV columns
    for treat_nm in treat_control_df['Treatment']:
        if 'Cas9' in treat_nm:
            continue
        if _data.get_lib_nm(treat_nm) != lib_nm:
            continue
        batch_nm = exp_nm_to_batch[treat_nm]
        be_treatments.append(treat_nm)
        if batch_nm not in batches:
            batches.append(batch_nm)

    # All selected treatments share the library, so the first one's design is used
    lib_design, seq_col = _data.get_lib_design(be_treatments[0])
    nms = lib_design['Name (unique)']

    md = dict()
    timer = util.Timer(total=len(be_treatments))
    print('Loading stats from each condition...')
    for treat_nm in be_treatments:
        # NOTE(review): pickle.load is only safe on pipeline-produced files
        with open(inp_dir + '%s.pkl' % (treat_nm), 'rb') as f:
            md[treat_nm] = pickle.load(f)
        timer.update()

    # ANOVA calculations
    print(
        'Calculating ANOVA on all unique indels in all target sites to identify batch effects...'
    )
    dd = defaultdict(list)
    means_dd = defaultdict(lambda: defaultdict(dict))
    timer = util.Timer(total=len(nms))
    for target_nm in nms:
        mut_dd, all_mut_nms = form_dd(md, target_nm)

        for mut_nm in all_mut_nms:
            # Note: Ensure we do not implicitly treat a lack of data as an
            # observation of zero -- only batches with data get an entry.
            anova_args = defaultdict(list)
            for treat_nm in mut_dd:
                anova_args[exp_nm_to_batch[treat_nm]].append(
                    mut_dd[treat_nm][mut_nm])

            # Ensure non-degenerate ANOVA testing.
            # If every batch has 0 std, we have identical values. It's likely
            # that these identical values are 0 because of the sparsity of
            # the data when considering unique indels (highly heterogeneous)
            # at 12,000 target sites.
            # If every batch with a non-zero value has only one observation,
            # skip.
            num_non_zero_stds = 0
            mean_d, std_d = dict(), dict()
            for batch in batches:
                if batch in anova_args:
                    mean_val = np.mean(anova_args[batch])
                    std_val = np.std(anova_args[batch])
                    if std_val > 0:
                        num_non_zero_stds += 1
                else:
                    mean_val = np.nan
                    std_val = np.nan
                mean_d[batch] = mean_val
                std_d[batch] = std_val

            # Only perform ANOVA test on indels where at least one batch has
            # non-zero std (otherwise it was seen only once in any batch, so
            # it's not a batch effect)
            degenerate_flag = False
            if num_non_zero_stds == 0:
                for batch in batches:
                    batch_data = anova_args.get(batch, [])
                    if len(batch_data) == 0:
                        continue
                    has_non_zero = bool(batch_data.count(0) != len(batch_data))
                    if has_non_zero and len(batch_data) == 1:
                        degenerate_flag = True
            if degenerate_flag:
                continue

            aa = tuple(obs for obs in anova_args.values() if len(obs) != 0)
            if len(aa) < 2:
                continue
            fstat, pval = f_oneway(*aa)
            if np.isnan(pval):
                continue

            dd['Statistic'].append(fstat)
            dd['pval'].append(pval)
            dd['MutName'].append(mut_nm)
            dd['Name'].append(target_nm)
            for batch in batches:
                dd['Mean %s' % (batch)].append(mean_d[batch])
                dd['Std %s' % (batch)].append(std_d[batch])
                # Store this batch's own mean. The stale loop variable
                # mean_val used before gave every batch the same value.
                means_dd[target_nm][mut_nm][batch] = mean_d[batch]
        timer.update()

    stats_df = pd.DataFrame(dd)
    if len(stats_df) == 0:
        # Nothing testable: still emit the three (empty) output files
        empty_df = pd.DataFrame()
        empty_df.to_csv(out_dir + 'mutation_dec_%s.csv' % (lib_nm))
        empty_df.to_csv(out_dir + 'removed_batch_effects_%s.csv' % (lib_nm))
        empty_df.to_csv(out_dir + 'removed_stats_%s.csv' % (lib_nm))
        return

    stats_df['-log10p'] = -np.log10(stats_df['pval'])

    # Apply FDR
    print(
        'Finding significant batch effects while controlling false discovery...'
    )
    fdr_threshold = 0.01
    # p-values above 0.995 are set aside and never accepted; copy so the
    # column assignment below does not write into a slice of stats_df
    other_distribution = stats_df[stats_df['pval'] > 0.995].copy()
    stats_df = stats_df[stats_df['pval'] <= 0.995]
    stats_df = stats_df.sort_values(by='pval')
    stats_df = stats_df.reset_index(drop=True)

    num_tests = len(stats_df)
    fdr_decs, hit_reject = [], False
    for idx, pval in enumerate(stats_df['pval']):
        if hit_reject:
            dec = False
        else:
            fdr_critical = ((idx + 1) / num_tests) * fdr_threshold
            dec = bool(pval <= fdr_critical)
            if not dec:
                # Latch on the first failing rank so all larger p-values are
                # rejected. The flag was never set to True before.
                hit_reject = True
        fdr_decs.append(dec)
    stats_df['FDR accept'] = fdr_decs

    other_distribution['FDR accept'] = False
    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    stats_df = pd.concat([stats_df, other_distribution], ignore_index=True)
    stats_df.to_csv(out_dir + 'mutation_dec_%s.csv' % (lib_nm))

    # Identify mutations for removal.
    # At mutations passing the FDR-controlled ANOVA test, identify batches
    # where mutations are frequent.
    print('Identifying batches to remove mutations from...')
    to_remove = stats_df[stats_df['FDR accept'] == True]

    dd = defaultdict(list)
    dd_stats = defaultdict(list)
    timer = util.Timer(total=len(to_remove))
    for idx, row in to_remove.iterrows():
        timer.update()
        target_nm = row['Name']
        mut_nm = row['MutName']
        means = means_dd[target_nm][mut_nm]

        # Batches without observations carry NaN; ignore them in the average
        mean_means = np.nanmean(list(means.values()))
        for batch_nm in means:
            # NaN compares False on both sides, so batches without data are
            # never selected
            if means[batch_nm] >= mean_means or means[batch_nm] >= 0.005:
                dd['Batch'].append(batch_nm)
                dd['Name'].append(target_nm)
                dd['MutName'].append(mut_nm)
        for batch_nm in means:
            dd_stats['%s' % (batch_nm)].append(means[batch_nm])
        dd_stats['MutName'].append(mut_nm)
        dd_stats['Name'].append(target_nm)

    batch_muts_to_remove = pd.DataFrame(dd)
    batch_muts_to_remove.to_csv(out_dir + 'removed_batch_effects_%s.csv' % (lib_nm))

    batch_muts_stats = pd.DataFrame(dd_stats)
    batch_muts_stats.to_csv(out_dir + 'removed_stats_%s.csv' % (lib_nm))

    # Mutations are removed in ah6a3
    return