def gather_statistics(exp_nm, params):
  '''
  Train and save models on per-mutation editing frequencies for one experiment.

  params: (muts, allowed_pos, feature_radius)
    muts: dict mapping a mutation-set name to a list of 'Ref_Obs' strings
    allowed_pos: iterable of protospacer positions to keep
    feature_radius: nt of sequence context to gather on each side
  '''
  (muts, allowed_pos, feature_radius) = params

  # Load data
  data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  # Prepare data: require read depth, keep on-target sites at allowed positions.
  # .copy() so the column assignments below write to a real frame, not a view.
  data = data[data['Total count'] >= 100].copy()
  data['Frequency'] = data['Count'] / data['Total count']

  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name'].isin(ontarget_sites)]
  data = data[data['Position'].isin(allowed_pos)].copy()

  data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
  data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

  # Annotate with local sequence context around each edited position
  lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total = len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name']]
    pidx = row['Position'] + lib_zero_idx
    # Context excludes the edited nt itself
    local_context = seq[pidx - feature_radius : pidx] + seq[pidx + 1 : pidx + feature_radius + 1]
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]

  # Gather statistics: one model pass per named mutation set
  for mut_nm in muts:
    print(mut_nm)
    mut = muts[mut_nm]
    # .copy() — the original chained assignment on a slice could silently
    # fail to write (pandas SettingWithCopy)
    if len(mut) == 1:
      d_temp = data[data['Mutation'] == mut[0]].copy()
    else:
      d_temp = data[data['Mutation'].isin(mut)].copy()
    d_temp['Mutation'] = mut_nm
    d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']
    # Sum frequencies of the grouped mutations at each (site, position, context)
    group_cols = [s for s in d_temp.columns if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
    d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

    for ml_task in ['classify_zero', 'regress_nonzero']:
      print(ml_task)
      results = train_models(exp_nm, d_temp, mut_nm, ml_task)
      save_results(exp_nm, mut_nm, ml_task, results)
  return
def indel_anyindel_seq(exp_nm):
  '''
  Annotate indels with related sequence context (e.g., bases in deletions).

  For each 'del' row whose adjusted start/end map inside the designed
  sequence, records the deleted nucleotides and their first/last base;
  all other rows get np.nan. Writes the annotated table to out_dir.
  '''
  df = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col=0)

  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  indel_dd = defaultdict(list)
  timer = util.Timer(total=len(df))
  for idx, row in df.iterrows():
    nm = row['Name']
    seq = nm_to_seq[nm]

    left_del_nt = np.nan
    right_del_nt = np.nan
    del_nts = np.nan
    if row['Category'] == 'del':
      start_pos = int(row['Indel start adj'])
      start_idx = _data.pos_to_idx(start_pos, exp_nm)
      end_pos = int(row['Indel end adj'])
      end_idx = _data.pos_to_idx(end_pos, exp_nm)
      # start_idx < end_idx also guards against a zero-length slice,
      # which would make del_nts[0] raise IndexError
      if start_idx >= 0 and end_idx <= len(seq) and start_idx < end_idx:
        del_nts = seq[start_idx:end_idx]
        left_del_nt = del_nts[0]
        right_del_nt = del_nts[-1]

    indel_dd['Left del nt'].append(left_del_nt)
    indel_dd['Right del nt'].append(right_del_nt)
    indel_dd['Del nts'].append(del_nts)
    timer.update()

  for col in indel_dd:
    df[col] = indel_dd[col]

  df.to_csv(out_dir + '%s.csv' % (exp_nm))
  return
def load_human_data(dataset_id):
  '''
  Load training data for one human dataset.

  Returns (X, Y, NAMES): X is a list of 50-nt sequence windows centered on
  the editing window, Y the pickled outcome data, NAMES the target names.
  '''
  if 'CSNVL' not in dataset_id:
    lib_nm = _data.get_lib_nm(dataset_id)
    design_df, seq_col = _data.get_lib_design(dataset_id)
    nms = design_df['Name (unique)']
    seqs = design_df[seq_col]
  else:
    # Use any conds to load 12kChar, CtoT, and AtoG libs
    dids = ['190418_mES_12kChar_AID', '190329_HEK293T_AtoG_ABE', '190307_HEK_CtoT_BE4']
    nms, seqs = [], []
    for did in dids:
      design_df, seq_col = _data.get_lib_design(did)
      nms += list(design_df['Name (unique)'])
      seqs += list(design_df[seq_col])
  nm_to_seq = dict(zip(nms, seqs))

  Y_dir = _config.OUT_PLACE + 'combin_data_Y_imputewt/'
  with gzip.open(Y_dir + '%s.pkl.gz' % (dataset_id), 'rb') as f:
    Y = pickle.load(f)
  NAMES = list(Y.keys())
  Y = list(Y.values())

  # Load X: per-target index of protospacer position 0 in the designed seq
  if 'CSNVL' not in dataset_id:
    zero_idxs = [_data.pos_to_idx(0, dataset_id)] * len(NAMES)
  else:
    # satmut targets use the 12kChar offset (21); others CtoT/AtoG (10)
    zero_idxs = [
      _data.zero_pos['12kChar'] if 'satmut' in nm else _data.zero_pos['CtoT']
      for nm in NAMES
    ]

  X = []
  timer = _util.Timer(total = len(NAMES))
  for nm, y, zero_idx in zip(NAMES, Y, zero_idxs):
    seq = nm_to_seq[nm]
    if zero_idx < 9 + 10:
      # CtoT/AtoG libs lack 19 nt of left flank; prepend fixed upstream
      # sequence (presumably shared vector context — confirm) so the
      # 50-nt window below is always in range
      prefix = 'GATGGGTGCGACGCGTCAT'
      seq = prefix + seq
      zero_idx += len(prefix)
    seq_50nt = seq[zero_idx - 9 - 10 : zero_idx + 20 + 10 + 1]
    assert len(seq_50nt) == 50
    X.append(seq_50nt)

  return X, Y, NAMES
def indel_anyindel_seq(exp_nm):
  '''
  Investigate if 1 nt deletions at abasic site are related to microhomology.
  Control for position by focusing only on pos 5.

  For each accepted target (C at pos 5 not followed by a C), record the
  length of the C run ending at pos 5 and the total frequency of 1-nt
  deletions with adjusted end position 5. Writes a summary CSV to out_dir.
  '''
  df = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  dd = defaultdict(list)
  all_nms = set(df['Name'])
  # Index of protospacer position 5 (pos_to_idx takes the 0-based position 4)
  five_idx = _data.pos_to_idx(4, exp_nm)

  # NOTE: removed leftover debugging line
  # `import code; code.interact(local=dict(globals(), **locals()))`,
  # which halted the script waiting for interactive input.

  timer = util.Timer(total = len(all_nms))
  for nm in all_nms:
    dfs = df[df['Name'] == nm]
    seq = nm_to_seq[nm]

    # Only keep targets with a C at pos 5 that is not followed by another C
    accept = (seq[five_idx] == 'C') and (seq[five_idx + 1] != 'C')
    if not accept:
      continue

    # Scan leftwards to measure the run of consecutive C's ending at pos 5
    for jdx in range(five_idx - 1, -1, -1):
      if seq[jdx] != 'C':
        break
    num_c = abs(five_idx - jdx)

    # Total frequency of 1-nt deletions ending at adjusted position 5
    crit = (dfs['Category'] == 'del') & (dfs['Indel length'] == 1) & (dfs['Indel end adj'] == 5.0)
    row = dfs[crit]
    if len(row) == 0:
      dd['Frequency'].append(0)
    else:
      dd['Frequency'].append(sum(row['Frequency']))
    dd['Num C'].append(num_c)
    dd['Name'].append(nm)
    timer.update()

  df = pd.DataFrame(dd)
  df.to_csv(out_dir + '%s.csv' % (exp_nm))
  return
def filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm):
  '''
  Zero out filtered mutations in the per-target adjustment matrices.

  For each (Position, Obs nt, Ref nt) row in to_remove, sets the matching
  matrix entry to 0 in every target of adj_d whose designed sequence has
  the reference nt at that position. Mutates and returns adj_d.
  '''
  # timer = util.Timer(total = len(to_remove))
  for _, rec in to_remove.iterrows():
    col = _data.pos_to_idx(rec['Position'], treat_nm)
    obs_idx = nt_to_idx[rec['Obs nt']]
    wt_nt = rec['Ref nt']

    for target in adj_d:
      target_seq = nm_to_seq[target]
      # Only targets carrying the expected reference nt are affected
      if target_seq[col] == wt_nt:
        mat = adj_d[target]
        mat[col][obs_idx] = 0
        adj_d[target] = mat
    # timer.update()
  return adj_d
def filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm):
  '''
  Mask filtered (batch-effect) mutations in the per-target matrices.

  For each (Position, Obs nt, Ref nt) row in to_remove, sets the matching
  matrix entry to np.nan in every target of adj_d whose designed sequence
  has the reference nt at that position. Mutates and returns adj_d.
  '''
  # timer = util.Timer(total = len(to_remove))
  for _, rec in to_remove.iterrows():
    col = _data.pos_to_idx(rec['Position'], treat_nm)
    obs_idx = nt_to_idx[rec['Obs nt']]
    wt_nt = rec['Ref nt']

    for target in adj_d:
      target_seq = nm_to_seq[target]
      try:
        if target_seq[col] == wt_nt:
          mat = adj_d[target]
          mat[col][obs_idx] = np.nan
          adj_d[target] = mat
      except IndexError:
        # 8/14/19: Not sure why this would happen -- if indexerror, that
        # pos_idx shouldn't be able to be considered for this treat_nm in
        # the first place to identify a batch effect. Hacky fix :/
        print(treat_nm, target, col, len(target_seq))
        pass
    # timer.update()
  return adj_d
def gather_statistics(exp_nm):
  '''
  Train models on relative C-to-D editing outcome purities for one experiment.

  Pivots per-mutation frequencies into one row per (Name, Position, Local
  context), then trains/saves models for five ratio statistics, e.g.
  C_GA_over_C_D = (C_G + C_A) / (C_T + C_G + C_A).
  '''
  feature_radius = 10
  allowed_pos = range(3, 8 + 1)

  # Load data
  data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col=0)

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  # Prepare data
  data = data[data['Total count'] >= 100]
  data['Frequency'] = data['Count'] / data['Total count']
  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name'].isin(ontarget_sites)]
  data = data[data['Position'].isin(allowed_pos)]
  data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']

  # Annotate with local sequence context
  lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total=len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name']]
    pidx = row['Position'] + lib_zero_idx
    # Context excludes the edited nt itself
    local_context = seq[pidx - feature_radius:pidx] + seq[pidx + 1:pidx + feature_radius + 1]
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]

  print(data.columns)
  print(set(data['Mutation']))

  # Keep only C->D mutations and pivot to one row per site/position/context
  acc_muts = [
    'C_T',
    'C_G',
    'C_A',
  ]
  data = data[data['Mutation'].isin(acc_muts)]
  data = data.drop(columns=['Count', 'Total count', 'Ref nt', 'Obs nt'])
  data = data.pivot_table(
    index=['Name', 'Position', 'Local context'],
    columns='Mutation',
    values='Frequency',
  ).reset_index()
  data = data.fillna(value=0)

  # (numerator columns, denominator columns, statistic name).
  # Run order matters: each pass drops rows whose denominator is zero
  # (0/0 -> NaN -> dropna), and later passes see the shrunken frame —
  # matching the original sequential behavior.
  ratio_specs = [
    (['C_G', 'C_A'], ['C_T', 'C_G', 'C_A'], 'C_GA_over_C_D'),
    (['C_T'], ['C_T', 'C_G', 'C_A'], 'C_T_over_C_D'),
    (['C_G'], ['C_T', 'C_G', 'C_A'], 'C_G_over_C_D'),
    (['C_A'], ['C_T', 'C_G', 'C_A'], 'C_A_over_C_D'),
    (['C_G'], ['C_A', 'C_G'], 'C_G_over_C_GA'),
  ]
  for num_cols, denom_cols, mut_name in ratio_specs:
    data = _train_on_ratio_stat(exp_nm, data, num_cols, denom_cols, mut_name)
  return


def _train_on_ratio_stat(exp_nm, data, num_cols, denom_cols, mut_name):
  '''
  Compute Frequency = sum(num_cols) / sum(denom_cols), drop NaN rows
  (zero denominators), then train and save models for both ML tasks.
  Returns the (possibly shrunken) dataframe.
  '''
  numerator = data[num_cols].sum(axis=1)
  denominator = data[denom_cols].sum(axis=1)
  data['Frequency'] = numerator / denominator
  data = data.dropna()
  data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
    str) + '_' + mut_name
  print(data.shape)
  for ml_task in ['regress_nonzero', 'classify_zero']:
    print(ml_task)
    results = train_models(exp_nm, data, mut_name, ml_task)
    save_results(exp_nm, mut_name, ml_task, results)
  return data