def main():
  print(NAME)
  import glob

  mdf = pd.DataFrame()
  fns = glob.glob(inp_dir + '*bootstrap*')
  timer = util.Timer(total=len(fns))
  for fn in fns:
    cond = fn.split('/')[-1].replace('_bootstrap.csv', '')
    df = pd.read_csv(fn, index_col=0)
    df['Condition'] = cond
    mdf = mdf.append(df, ignore_index=True)
    timer.update()
  mdf.to_csv(out_dir + '_combined_gmean_bootstrap.csv')

  # Not bootstrap
  mdf = pd.DataFrame()
  fns = [fn for fn in os.listdir(inp_dir) if 'bootstrap' not in fn]
  timer = util.Timer(total=len(fns))
  for fn in fns:
    df = pd.read_csv(inp_dir + fn)
    cond = fn.replace('.csv', '')
    df['Condition'] = cond
    n = len(df)
    df['Regression weight'] = 1 / n
    mdf = mdf.append(df, ignore_index=True)
    timer.update()
  mdf.to_csv(out_dir + '_all_ratios.csv')
  return
def combine(modelexp_nm):
  # aggstats
  mdf = pd.DataFrame()
  timer = util.Timer(total=params['num_splits'])
  for split in range(params['num_splits']):
    inp_fn = inp_dir + f'{modelexp_nm}/aggstats_{split}.csv'
    if os.path.isfile(inp_fn):
      df = pd.read_csv(inp_fn, index_col=0)
      mdf = mdf.append(df, ignore_index=True, sort=False)
    timer.update()
  mdf.to_csv(out_dir + f'{modelexp_nm}-aggstats.csv')

  # evals
  me_df = pd.read_csv(_config.DATA_DIR + f'{modelexp_nm}.csv')
  mdf = pd.DataFrame()
  timer = util.Timer(total=len(me_df))
  for idx in range(len(me_df)):
    inp_fn = inp_dir + f'{modelexp_nm}/evals_{idx}.csv'
    if os.path.isfile(inp_fn):
      df = pd.read_csv(inp_fn, index_col=0)
      mdf = mdf.append(df, ignore_index=True, sort=False)
    timer.update()
  mdf.to_csv(out_dir + f'{modelexp_nm}-evals.csv')
  return
def gather(editor):
  print(editor)
  conds = exp_design_df[exp_design_df['Editor'] == editor]['Name']

  # mdf
  mdf = pd.DataFrame()
  timer = util.Timer(total=len(conds))
  for cond in conds:
    try:
      df = pd.read_csv(inp_dir + f'{cond}.csv', index_col=0)
      mdf = mdf.append(df, ignore_index=True, sort=False)
    except:
      continue
    timer.update()
  mdf.to_csv(out_dir + f'{editor}.csv')

  # len melt
  mdf = pd.DataFrame()
  timer = util.Timer(total=len(conds))
  for cond in conds:
    try:
      df = pd.read_csv(inp_dir + f'{cond}_len_melt.csv', index_col=0)
      df['Condition'] = cond
      mdf = mdf.append(df, ignore_index=True, sort=False)
    except:
      continue
    timer.update()
  mdf.to_csv(out_dir + f'{editor}_len_melt.csv')

  # pos_melt by nt
  # Reduce to mean
  indel_lens = range(-20, 15 + 1)
  timer = util.Timer(total=len(indel_lens))
  for indel_len in indel_lens:
    mdf = pd.DataFrame()
    for cond in conds:
      try:
        df = pd.read_csv(inp_dir + f'{cond}_pos_{indel_len}nt.csv', index_col=0)
        df['Condition'] = cond
        df['Indel length'] = indel_len
        mdf = mdf.append(df, ignore_index=True, sort=False)
      except:
        continue
    mdf.to_csv(out_dir + f'{editor}_pos_{indel_len}nt.csv')
    timer.update()
  return
def get_library_stats(inp_dir_lib, data_type):
  import glob
  fns = glob.glob(inp_dir_lib + '*len_melt.csv')
  dd = defaultdict(list)
  timer = util.Timer(total=len(fns))
  for fn in fns:
    cond_nm = fn.split('/')[-1].replace('_len_melt.csv', '')
    df = pd.read_csv(fn, index_col=0)

    dd['Name'].append(cond_nm)
    dd['Data type'].append(data_type)
    dd['Editor'].append(
        exp_design_df[exp_design_df['Name'] == cond_nm]['Editor'].iloc[0])

    fq_1nt_indels = sum(df[df['Indel length'].isin(
        [-1, 1])]['Frequency']) / sum(df['Frequency'])
    dd['Fraction of 1-bp indels in indels'].append(fq_1nt_indels)

    # 1-bp deletions have indel length -1
    fq_1nt_dels = sum(df[df['Indel length'].isin(
        [-1])]['Frequency']) / sum(df['Frequency'])
    dd['Fraction of 1-bp dels in indels'].append(fq_1nt_dels)

    # 1-bp insertions have indel length +1
    fq_1nt_ins = sum(df[df['Indel length'].isin(
        [1])]['Frequency']) / sum(df['Frequency'])
    dd['Fraction of 1-bp ins in indels'].append(fq_1nt_ins)

    timer.update()
  return pd.DataFrame(dd)
def get_statistics(cond):
  df1 = pd.read_csv(inp_dir + f'{cond}_pos_1nt.csv', index_col=0)
  df2 = pd.read_csv(inp_dir + f'{cond}_pos_-1nt.csv', index_col=0)
  mdf = df1.append(df2, ignore_index=True)

  bs_dd = defaultdict(list)
  positions = [col for col in mdf.columns if col != 'Name']
  timer = util.Timer(total=len(positions))
  for pos in positions:
    dfs = mdf[pos]

    means = []
    for bs_idx in range(1000):
      bs_data = np.random.choice(dfs, size=len(dfs), replace=True)
      means.append(np.mean(bs_data))

    bs_dd['Position'].append(pos)
    bs_dd['Mean'].append(np.mean(dfs))
    bs_dd['Mean - stderr'].append(np.percentile(means, 50 - 34))
    bs_dd['Mean + stderr'].append(np.percentile(means, 50 + 34))
    bs_dd['2.5th percentile'].append(np.percentile(means, 2.5))
    bs_dd['97.5th percentile'].append(np.percentile(means, 97.5))
    timer.update()

  bs_df = pd.DataFrame(bs_dd)
  bs_df = bs_df.sort_values(by='Position').reset_index()
  bs_df.to_csv(out_dir + f'{cond}.csv')
  return
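# The 'Mean - stderr' / 'Mean + stderr' columns above are the 16th and 84th
# percentiles of the bootstrap means, used as a percentile stand-in for
# mean -/+ 1 standard error. A small sanity-check sketch of that equivalence
# (illustration only, not part of the pipeline):
def _check_percentile_stderr_equivalence(n=1000):
  bs_means = np.random.normal(loc=0.1, scale=0.02, size=n)
  lo, hi = np.percentile(bs_means, [50 - 34, 50 + 34])
  # For roughly normal bootstrap distributions, (lo, hi) tracks mean -/+ std
  return (lo, hi), (np.mean(bs_means) - np.std(bs_means),
                    np.mean(bs_means) + np.std(bs_means))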
def individualize(inp_dir, out_dir):
  # a_gather produces large dataframes of 2000 experiments concatenated together.
  # Extracting dataframes for each individual experiment from those is slow,
  # while it's faster to just read in individual csv's for each experiment.
  # (This function produces those individual csv's.)
  for inp_fn in os.listdir(inp_dir):
    if not fnmatch.fnmatch(inp_fn, '*csv'):
      continue
    # if inp_fn not in ['PRL-Lib1-mES.csv', 'PRL-DisLib-mES.csv', 'Lib1-mES.csv']:
    #   continue
    inp_nm = inp_fn.replace('.csv', '')
    out_fold = out_dir + inp_nm + '/'
    util.ensure_dir_exists(out_fold)

    df = pd.read_csv(inp_dir + inp_fn)
    exps = set(df['Experiment'])
    print inp_nm
    timer = util.Timer(total=len(exps))
    for exp in exps:
      out_fn = out_fold + '%s.csv' % (exp)
      d = df[df['Experiment'] == exp]
      d.to_csv(out_fn)
      timer.update()
  return
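# A minimal downstream-usage sketch for the individualized csv's produced
# above (the experiment name is a hypothetical placeholder):
d = pd.read_csv(out_dir + 'Lib1-mES/' + 'some_experiment.csv')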
def main(nm='', start='', end=''):
  print(NAME)
  print(nm)
  start, end = int(start), int(end)

  out_dir = out_place + nm + '/'
  util.ensure_dir_exists(out_dir)

  print('Preparing alignment output directories...')
  nms = all_names[start:end + 1]
  prepare_align_outdirs(out_dir, nms)
  print('Done')

  global expected_cutsite
  expected_cutsite = len('GATGGGTGCGACGCGTCAT') + 28

  inp_dir = inp_place + nm + '/'
  timer = util.Timer(total=len(nms))
  for target_nm in nms:
    data = defaultdict(list)
    for split in os.listdir(inp_dir):
      if split == 'aligns':
        continue
      inp_fn = inp_dir + '%s/%s.txt' % (split, target_nm)
      remaster_aligns(inp_fn, data)
    save_alignments(data, out_dir, target_nm)
    timer.update()
  return
def main(inp_dir, out_dir, nm='none', start='none', end='none'):
  print NAME
  util.ensure_dir_exists(out_dir)

  if nm == 'none' and start == 'none' and end == 'none':
    gen_qsubs()
    return

  if nm != 'none' and start == 'none' and end == 'none':
    # Run single
    print nm
    res, context = set_master_expected_cutsite(nm)
    if res is False:
      return
    genotype_data(inp_dir, out_dir, nm, context)
    return

  # Run many
  start, end = int(start), int(end)
  timer = util.Timer(total=end - start + 1)
  for idnum in range(start, end + 1):
    srr_id = 'SRR%s' % (idnum)
    # print srr_id
    res, context = set_master_expected_cutsite(srr_id)
    if res is False:
      continue
    genotype_data(inp_dir, out_dir, srr_id, context)
    timer.update()

  return out_dir
def add_negative_controls(aret):
  # ensure gfp is not included
  gfpgrnas = []
  with open(_config.DATA_DIR + 'egfp_NGG_NNG_seq.patman_format.txt') as f:
    for i, line in enumerate(f):
      if line[0] != '>':
        gfpgrnas.append(line.strip())

  headers, sqs = aret
  sqs += gfpgrnas
  sqs = [s[-15:-3] for s in sqs]   # use 12-mer seed region
  sqs = set(sqs)

  new_h, reads = [], []
  timer = util.Timer(total=125)
  for i in range(125):
    ok = False
    while True:
      skip = False
      cand = ''.join(np.random.choice(['A', 'C', 'G', 'T'], 12))
      for s in sqs:
        if mismatch(cand, s) < 3:
          skip = True
          break
      if skip:
        continue
      # if ok... proceed
      reads.append('ATATATCTTGTGGAAAGGACGAAACACC' +
                   ''.join(np.random.choice(['A', 'C', 'G', 'T'], 8)) +
                   cand +
                   'GTTTAAGAGCTATGCTGGAAACAGCATAGC')
      new_h.append('neg_control_' + str(i))
      break
    timer.update()
  return new_h, reads
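# add_negative_controls above calls a mismatch() helper that is defined
# elsewhere in the repo; a minimal sketch of what it is assumed to compute
# (positionwise Hamming distance between equal-length sequences):
def mismatch(seq1, seq2):
  # Number of positions at which the two sequences differ
  return sum(1 for nt1, nt2 in zip(seq1, seq2) if nt1 != nt2)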
def build_vo_data(out_dir, exp):
  print exp
  inp_dir = '/cluster/mshen/prj/vanoverbeek/out/e10_control_adjustment/'
  srrids = get_srr_ids(exp.replace('VO_', ''))
  data = defaultdict(list)

  # Build data
  timer = util.Timer(total=len(srrids))
  for srr_id in srrids:
    csv_fn = inp_dir + '%s.csv' % (srr_id)
    if os.path.isfile(csv_fn):
      d = pd.read_csv(csv_fn)
      if len(d) > 0:
        individual_piechart(d, data)
    timer.update()

  # Pickle, convert defaultdict to regular dict
  picklable_data = dict()
  for key in data:
    picklable_data[key] = data[key]
  with open(out_dir + '%s.pkl' % (exp), 'w') as f:
    pickle.dump(picklable_data, f)
  return data
def load_transversion_data(nm_to_conds):
  nms = list(nm_to_conds.keys())
  df = pd.read_csv(inp_dir_mutant + 'mmdf_12kChar.csv')

  combined_conds = [f'C_GA_{cond}' for cond in sorted(nms)]
  id_cols = ['Name', 'Position']
  dfs = df[id_cols + combined_conds]
  dfs.to_csv(out_dir + 'transversion_purity.csv')

  # Bootstrap
  bs_dd = dict()
  timer = util.Timer(total=len(nms))
  for nm in nms:
    col = f'C_GA_{nm}'
    data = dfs[col].dropna()

    bs_means = []
    for bs_idx in range(5000):
      bs_data = np.random.choice(data, size=len(data))
      bs_means.append(np.mean(bs_data))
    bs_dd[nm] = bs_means
    timer.update()

  bs_df = pd.DataFrame(bs_dd)
  bs_df.to_csv(out_dir + 'transversion_purity-bootstrap.csv')
  return
def get_poswise_df(data, nm_to_seq, treat_nm):
  dd = defaultdict(list)
  timer = util.Timer(total=len(data))
  for nm in data:
    pw = data[nm]
    seq = nm_to_seq[nm]
    for jdx in range(len(pw)):
      pos = _data.idx_to_pos(jdx, treat_nm)
      ref_nt = seq[jdx]
      ref_idx = nt_to_idx[ref_nt]
      total = sum(pw[jdx])
      for kdx in range(len(pw[jdx])):
        if kdx == ref_idx:
          continue
        count = pw[jdx][kdx]
        dd['Count'].append(count)
        dd['Total count'].append(total)
        dd['Obs nt'].append(nts[kdx])
        dd['Ref nt'].append(ref_nt)
        dd['Position'].append(pos)
        dd['Name'].append(nm)
    timer.update()

  df = pd.DataFrame(dd)
  return df
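# get_poswise_df above relies on module-level nts / nt_to_idx lookups that are
# defined elsewhere in the repo; a plausible minimal version is sketched here
# as an assumption (the nucleotide ordering is not guaranteed to match the
# original):
nts = ['A', 'C', 'G', 'T']
nt_to_idx = {nt: idx for idx, nt in enumerate(nts)}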
def filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm):
  timer = util.Timer(total=len(to_remove))
  for idx, row in to_remove.iterrows():
    exp_nm = row['Name']
    mut_nm = row['MutName']
    if exp_nm not in adj_d:
      continue

    t = adj_d[exp_nm]
    cat, ids, ide, idl, mhl, ib = mut_nm.split('_')

    t_cat_set = set(t['Category'])
    if len(t_cat_set) == 1 and 'wildtype' in t_cat_set:
      continue

    crit = (t['Category'] == cat) & (t['Indel start'] == float(ids)) & (
        t['Indel end'] == float(ide)) & (t['Indel length'] == float(idl)) & (
        t['MH length'] == float(mhl)) & (t['Inserted bases'] == ib)
    t.loc[crit, 'Count'] = 0
    t = t[t['Count'] > 0]
    t['Frequency'] = t['Count'] / sum(t['Count'])
    adj_d[exp_nm] = t
    timer.update()
  return adj_d
def build_nm_to_idxs(df):
  '''
    Exploits ordered structure
  '''
  print(f'Building index ...')
  d = dict()
  curr_nm = ''
  start_idx = -1
  timer = util.Timer(total=len(df))
  for idx, row in df.iterrows():
    nm = row['Read name']
    if nm != curr_nm:
      if curr_nm != '':
        d[curr_nm] = {
          'start_idx': start_idx,
          'end_idx': idx,
        }
      start_idx = idx
      curr_nm = nm
    timer.update()

  # Last load
  d[curr_nm] = {
    'start_idx': start_idx,
    'end_idx': idx,
  }
  return d
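# A usage sketch for the read-name index above (an assumption about downstream
# use; the read name is a hypothetical placeholder). Note that 'end_idx' is the
# row index at which the next read name begins, except for the final name,
# where it is the last row's index:
nm_to_idxs = build_nm_to_idxs(df)
bounds = nm_to_idxs['read_0001']
rows = df.iloc[bounds['start_idx'] : bounds['end_idx']]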
def get_statistics(editor):
  df = pd.read_csv(inp_dir + f'{editor}_len_melt.csv', index_col=0)

  bs_dd = defaultdict(list)
  indel_lens = set(df['Indel length'])
  timer = util.Timer(total=len(indel_lens))
  for indel_len in indel_lens:
    dfs = df[df['Indel length'] == indel_len]

    means = []
    for bs_idx in range(1000):
      bs_data = np.random.choice(dfs['Frequency'], size=len(dfs), replace=True)
      means.append(np.mean(bs_data))

    bs_dd['Indel length'].append(indel_len)
    bs_dd['Mean'].append(np.mean(dfs['Frequency']))
    bs_dd['Mean - stderr'].append(np.percentile(means, 50 - 34))
    bs_dd['Mean + stderr'].append(np.percentile(means, 50 + 34))
    bs_dd['2.5th percentile'].append(np.percentile(means, 2.5))
    bs_dd['97.5th percentile'].append(np.percentile(means, 97.5))
    timer.update()

  bs_df = pd.DataFrame(bs_dd)
  bs_df = bs_df.sort_values(by='Indel length').reset_index()
  bs_df.to_csv(out_dir + f'{editor}.csv')
  return
def main(inp_dir, out_dir, srr_id='', start='none', end='none'):
  print NAME
  util.ensure_dir_exists(out_dir)

  # Function calls
  if srr_id == '' and start == 'none' and end == 'none':
    gen_qsubs()
    return

  if srr_id != '' and start == 'none' and end == 'none':
    if is_control(srr_id):
      print 'is control'
      return
    control_adjustment(inp_dir, out_dir, srr_id)
    return

  start, end = int(start), int(end)
  timer = util.Timer(total=end - start + 1)
  for idnum in range(start, end + 1):
    srr_id = 'SRR%s' % (idnum)
    ans = is_control(srr_id)
    if ans is False:
      control_adjustment(inp_dir, out_dir, srr_id)
    timer.update()

  return out_dir
def filter_inprofile_batch_effects():
  df = pd.read_csv(_config.DATA_DIR + 'batch_effects.csv')
  inprofile_batches = set(df['Batch'])

  be_treatments = [
    s for s in treat_control_df['Treatment'] if 'Cas9' not in s
  ]
  timer = util.Timer(total=len(be_treatments))
  for treat_nm in be_treatments:
    batch = exp_nm_to_batch[treat_nm]

    if batch in inprofile_batches:
      print(treat_nm, batch)
      adj_d = _data.load_data(treat_nm, 'ag4a2_adjust_batch_effects')
      to_remove = df[df['Batch'] == batch]

      lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
      nms = lib_design['Name (unique)']
      seqs = lib_design[seq_col]
      nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

      adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)
      with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)
    else:
      inp_fn = inp_dir + '%s.pkl' % (treat_nm)
      subprocess.check_output('cp %s %s' % (inp_fn, out_dir), shell=True)
    timer.update()
  return
def get_cigars(inp_dir, out_dir, spacers):
  print '\tGetting cigars...'
  timer = util.Timer(total = len(spacers))
  for spc in spacers.values():
    for i in range(len(spc['runs'])):
      cigars = {}
      run = spc['runs'][i]
      foldnm = _lib.exp_fold_name(run)
      fn = _config.DATA_DIR + foldnm + '/' + run + '.sam'

      num_aligns, num_kept = 0.0, 0.0
      with open(fn) as f:
        for _, line in enumerate(f):
          if not line.startswith('@'):
            num_aligns += 1
            chro, start = line.split()[2], int(line.split()[3])
            cigar = line.split()[5]
            if chro == 'chr' + spc['chr']:
              if spc['start'] - 300 <= start <= spc['start'] + 300:
                if cigar not in cigars:
                  cigars[cigar] = 0
                cigars[cigar] += 1
                num_kept += 1

      # Guard against SAM files with no alignment records
      if num_aligns > 0:
        frac_kept = num_kept / num_aligns
        if frac_kept < 0.80 and num_aligns > 1000:
          print '\tWARNING: Kept:', frac_kept, 'of', num_aligns, 'for spacer', spc['num'], ':', run, spc['libnms'][i]

      out_fn = out_dir + spc['libnms'][i] + '.txt'
      with open(out_fn, 'w') as f:
        for cigar in cigars:
          f.write('>' + str(cigars[cigar]) + '\n' + cigar + '\n')
    timer.update()
  return
def build_library_data(out_dir, exp):
  print exp
  # if exp in ['2k-mES-Cas9-Tol2']:
  #   inp_dir = '/cluster/mshen/prj/mmej_manda2/out/2017-10-27/e_newgenotype/'
  if exp in [
      'Lib1-mES', 'Lib1-HCT116', 'Lib1-HEK293T', 'DisLib-U2OS', 'DisLib-mES',
      'DisLib-HEK293T', 'DisLib-U2OS-HEK-Mixture', 'PRL-Lib1-mES',
      'PRL-DisLib-mES'
  ]:
    inp_dir = '/cluster/mshen/prj/mmej_figures/out/b_individualize/'
  exp_dir = inp_dir + exp + '/'
  data = defaultdict(list)

  timer = util.Timer(total=len(os.listdir(exp_dir)))
  for fn in os.listdir(exp_dir):
    if not fnmatch.fnmatch(fn, '*csv'):
      continue
    csv_fn = exp_dir + fn
    d = pd.read_csv(csv_fn)
    individual_piechart(d, data)
    timer.update()

  # Pickle, convert defaultdict to regular dict
  picklable_data = dict()
  for key in data:
    picklable_data[key] = data[key]
  with open(out_dir + '%s.pkl' % (exp), 'w') as f:
    pickle.dump(picklable_data, f)
  return data
def build_vo_data(out_dir, exp, wildtype = False):
  print exp
  inp_dir = '/cluster/mshen/prj/vanoverbeek/out/b_polish/'
  if wildtype:
    castype = 'WT'
  else:
    castype = '48h'
  srrids = get_srr_ids(exp.replace('VO_', ''), castype)
  data = defaultdict(list)

  # Build data
  timer = util.Timer(total = len(srrids))
  for srr_id in srrids:
    get_mismatches(inp_dir + srr_id + '/', data, srr_id = srr_id)
    timer.update()

  # Pickle, convert defaultdict to regular dict
  picklable_data = dict()
  for key in data:
    if key not in picklable_data:
      picklable_data[key] = data[key]
  with open(out_dir + '%s.pkl' % (exp), 'w') as f:
    pickle.dump(picklable_data, f)
  return data
def merge_lowpances(df):
  num_samples = len(aligned_lowpance['1'])

  new_df = pd.DataFrame()
  timer = util.Timer(total=num_samples)
  for idx in range(num_samples):
    new_samplenm = f'Fq {1 + idx}'
    samples = [
      f'Fq {aligned_lowpance[pance][idx]}' for pance in aligned_lowpance
    ]
    dfs = df[df['Sample name'].isin(samples)]

    pv_df = dfs.pivot(index='Full genotype', columns='Sample name',
                      values='Frequency')
    pv_df = pv_df.fillna(value=0)
    pv_df['Mean fq'] = pv_df.apply(np.mean, axis='columns')
    pv_df['Mean fq'] /= sum(pv_df['Mean fq'])

    pv_df['Full genotype'] = pv_df.index
    pv_df = pv_df[['Full genotype', 'Mean fq']]
    dfm = pv_df.melt(id_vars=['Full genotype'], value_name='Frequency')
    dfm['Sample name'] = new_samplenm
    dfm['Sample'] = 1 + idx
    new_df = new_df.append(dfm, ignore_index=True)
    timer.update()

  return new_df
def count_reads(exp, inp_dir, lib_design):
  dd = defaultdict(list)
  timer = util.Timer(total=len(lib_design['Name (unique)']))
  for nm in lib_design['Name (unique)']:
    ctd = get_counts_subfold(inp_dir + nm + '/')

    dd['Name'].append(nm)
    try:
      dd['Total count'].append(ctd['Total count'])
    except:
      import code
      code.interact(local=dict(globals(), **locals()))
    dd['Total ULMI count'].append(ctd['Total ULMI count'])
    dd['WT count'].append(ctd['WT count'])
    dd['WT ULMI count'].append(ctd['WT ULMI count'])
    dd['Indel count'].append(ctd['Indel count'])
    dd['Indel ULMI count'].append(ctd['Indel ULMI count'])
    timer.update()

  df = pd.DataFrame(dd)
  df.to_csv(out_dir + '%s.csv' % (exp))
  return
def split(inp_fn, out_nm):
  inp_fn_numlines = util.line_count(inp_fn)

  num_splits = 60
  split_size = int(inp_fn_numlines / num_splits)
  if num_splits * split_size < inp_fn_numlines:
    split_size += 1
  # FASTQ records are 4 lines each, so keep each split on a record boundary
  while split_size % 4 != 0:
    split_size += 1
  # print 'Using split size %s' % (split_size)

  split_num = 0
  timer = util.Timer(total=num_splits)
  for idx in range(1, inp_fn_numlines, split_size):
    start = idx
    end = start + split_size
    out_fn = out_dir + out_nm + '_%s.fq' % (split_num)

    skip = False
    if os.path.isfile(out_fn):
      size_mb = os.path.getsize(out_fn) / 1e6
      if size_mb > 0:
        skip = True

    if not skip:
      command = 'tail -n +%s %s | head -n %s > %s' % (
          start, inp_fn, end - start, out_fn)
      subprocess.check_output(command, shell=True)

    split_num += 1
    # print(command)
    timer.update()
  return
def get_trajectory(major_threshold):
  mdf = pd.DataFrame()
  timer = util.Timer(total=len(design_df))
  for nm in design_df['Short name']:
    df = pd.read_csv(inp_dir + f'{nm}_t{major_threshold}.csv', index_col=0)

    # Filter
    df = df[df['Count'] >= 5]

    fq_col = f'Fq {nm}'
    df[fq_col] = df['Count'] / sum(df['Count'])
    df = df[['Full genotype', fq_col]]

    if len(mdf) == 0:
      mdf = df
    else:
      mdf = mdf.merge(df, on='Full genotype', how='outer')
    timer.update()

  mdf = mdf.fillna(value=0)
  mdf.to_csv(out_dir + f'pv_trajectory_t{major_threshold}.csv')

  dfm = mdf.melt(id_vars='Full genotype', var_name='Sample name',
                 value_name='Frequency')
  dfm['Sample'] = [int(s.split()[-1]) for s in dfm['Sample name']]
  dfm.to_csv(out_dir + f'mel_trajectory_t{major_threshold}.csv')
  return
def main(argv):
  print(NAME)
  modelexp_nm = argv[0]
  print(modelexp_nm)

  exp_design = pd.read_csv(_config.DATA_DIR + f'{modelexp_nm}.csv')
  hyperparam_cols = [col for col in exp_design.columns if col != 'Name']

  new_out_dir = out_dir + f'{modelexp_nm}/'
  util.ensure_dir_exists(new_out_dir)

  print(f'Collating experiments...')
  model_out_dir = _config.OUT_PLACE + f'_fitness_from_reads_pt_multi/{modelexp_nm}/'
  num_fails = 0
  timer = util.Timer(total = len(exp_design))
  for idx, row in exp_design.iterrows():
    int_nm = row['Name']
    real_nm = row['dataset']

    try:
      command = f'cp {model_out_dir}/model_{int_nm}/_final_fitness.csv {new_out_dir}/fitness_{int_nm}.csv'
      subprocess.check_output(command, shell = True)

      command = f'cp {model_out_dir}/model_{int_nm}/_final_genotype_matrix.csv {new_out_dir}/genotype_matrix_{int_nm}.csv'
      subprocess.check_output(command, shell = True)
    except:
      num_fails += 1
    timer.update()

  print(f'Collated {len(exp_design)} experiments with {num_fails} failures')
  return
def prepare_statistics(data_nm):
  # Input: Dataset
  # Output: Uniformly processed dataset, requiring minimal processing for
  #   plotting but ideally enabling multiple plots
  # Calculate statistics associated with each experiment by name
  alldf_dict = defaultdict(list)

  if 'Lib1' in data_nm or 'VO' in data_nm:
    dataset = _data.load_dataset(data_nm,
                                 exp_subset = 'vo_spacers',
                                 exp_subset_col = 'Designed Name')
  if 'DisLib' in data_nm:
    dataset = _data.load_dataset(data_nm,
                                 exp_subset = 'clin',
                                 exp_subset_col = 'Designed Name')
    # Remove data with iterated editing
    dlwt = _config.d.DISLIB_WT
    for idx, row in dlwt.iterrows():
      if row['wt_repairable'] == 'iterwt':
        del dataset[row['name']]

  if dataset is None:
    return

  timer = util.Timer(total = len(dataset))
  # for exp in dataset.keys()[:100]:
  for exp in dataset.keys():
    df = dataset[exp]
    calc_statistics(df, exp, alldf_dict)
    timer.update()

  # Return a dataframe where columns are positions and rows are experiment
  # names, values are frequencies
  alldf = pd.DataFrame(alldf_dict)
  return alldf
def prepare_statistics(data_nm1, data_nm2):
  # Input: Dataset
  # Output: Uniformly processed dataset, requiring minimal processing for
  #   plotting but ideally enabling multiple plots
  # In this case: Distribution of frequencies of indels for each position in a
  #   20-bp window around the cutsite. Can plot mean, median, difference, etc.
  # Calculate statistics associated with each experiment by name
  alldf_dict = defaultdict(list)

  # If Library, subset VO spacers
  dataset1 = _data.load_dataset(data_nm1)
  dataset2 = _data.load_dataset(data_nm2)
  if dataset1 is None or dataset2 is None:
    return

  # Find shared exps and iterate through them, passing both shared exps
  # together to calc_statistics
  shared_exps = set(dataset1.keys()) & set(dataset2.keys())
  if len(shared_exps) == 0:
    print 'ERROR: No shared exps'

  timer = util.Timer(total = len(shared_exps))
  for exp in shared_exps:
    d1 = dataset1[exp]
    d2 = dataset2[exp]
    calc_statistics(d1, d2, exp, alldf_dict)
    timer.update()

  # Return a dataframe where columns are positions and rows are experiment
  # names, values are frequencies
  alldf = pd.DataFrame(alldf_dict)
  return alldf
def prepare_statistics(data_nm):
  # Input: Dataset
  # Output: Uniformly processed dataset, requiring minimal processing for
  #   plotting but ideally enabling multiple plots
  # Calculate statistics associated with each experiment by name
  alldf_dict = defaultdict(list)

  dataset = _data.load_dataset(data_nm,
                               exp_subset = 'vo_spacers',
                               exp_subset_col = 'Designed Name')
  if dataset is None:
    return

  e_dir = '/cluster/mshen/prj/mmej_figures/out/e_ins_modeling/'

  timer = util.Timer(total = 100)
  for rs in range(100):
  # for rs in range(1):
    prefix = e_dir + 'len_%s_%s' % (data_nm, rs)
    test_exps = pickle.load(open(prefix + '_testexps.pkl'))
    rate_model = pickle.load(open(prefix + '_model.pkl'))
    bp_model = pickle.load(open(prefix + '_bp.pkl'))
    for exp in test_exps:
      df = dataset[exp]
      calc_statistics(df, exp, rate_model, bp_model, alldf_dict, rs, data_nm)
    timer.update()

  # Return a dataframe where columns are positions and rows are experiment
  # names, values are frequencies
  alldf = pd.DataFrame(alldf_dict)
  return alldf
def prepare_statistics(data_nm):
  # Input: Dataset
  # Output: Uniformly processed dataset, requiring minimal processing for
  #   plotting but ideally enabling multiple plots
  # In this case: Distribution of frequencies of indels for each position in a
  #   20-bp window around the cutsite. Can plot mean, median, difference, etc.
  # Calculate statistics associated with each experiment by name
  alldf_dict = defaultdict(list)

  dataset = _data.load_dataset(data_nm)
  if dataset is None:
    return

  timer = util.Timer(total=len(dataset))
  for exp in dataset:
    df = dataset[exp]
    calc_statistics(df, exp, alldf_dict)
    timer.update()

  # Return a dataframe where columns are positions and rows are experiment
  # names, values are frequencies
  alldf = pd.DataFrame(alldf_dict)

  col_order = [
    '_Experiment', 'Editing Rate', '0gt Frequency', 'Ngt Frequency',
    '-10', '-9', '-8', '-7', '-6', '-5', '-4', '-3', '-2', '-1',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
  ]
  if len(col_order) != len(alldf.columns):
    print 'ERROR: Will drop columns'
  alldf = alldf[col_order]
  return alldf
def gather_statistics(exp_nm, params):
  (muts, allowed_pos, feature_radius) = params

  # Load data
  data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  # Prepare data
  data = data[data['Total count'] >= 100]
  data['Frequency'] = data['Count'] / data['Total count']

  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name'].isin(ontarget_sites)]
  data = data[data['Position'].isin(allowed_pos)]

  data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
  data['MutName'] = (data['Name'].astype(str) + '_' +
                     data['Position'].astype(str) + '_' + data['Mutation'])

  # Annotate with local sequence context
  lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total = len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name']]
    pidx = row['Position'] + lib_zero_idx
    local_context = (seq[pidx - feature_radius : pidx] +
                     seq[pidx + 1 : pidx + feature_radius + 1])
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]

  #
  # Gather statistics
  for mut_nm in muts:
    print(mut_nm)
    mut = muts[mut_nm]
    if len(mut) == 1:
      d_temp = data[data['Mutation'] == mut[0]]
    else:
      d_temp = data[data['Mutation'].isin(mut)]

    d_temp['Mutation'] = mut_nm
    d_temp['MutName'] = (d_temp['Name'].astype(str) + '_' +
                         d_temp['Position'].astype(str) + '_' +
                         d_temp['Mutation'])
    group_cols = [s for s in d_temp.columns
                  if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
    d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

    for ml_task in ['classify_zero', 'regress_nonzero']:
      print(ml_task)
      results = train_models(exp_nm, d_temp, mut_nm, ml_task)
      save_results(exp_nm, mut_nm, ml_task, results)
  return