# Assumed imports shared by the snippets below (not shown in the originals):
from __future__ import division
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import entropy, pearsonr

# _lib, _predict2, and util are project-internal modules assumed importable.


def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict.
  # Deletion positions: observed vs. predicted frequencies of MH deletions
  # whose genotype position equals their length.
  df = _lib.mh_del_subset(df)
  df = _lib.indels_without_mismatches_subset(df)
  df = df[df['Length'] >= 5]
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)
  pred_df = pred_df[pred_df['Length'] >= 5]
  pred_df['Predicted_Frequency'] = pred_df['Predicted_Frequency'] / sum(pred_df['Predicted_Frequency'])

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  # Keep only deletions where genotype position equals deletion length
  ns_criteria = (mdf['Length'] - mdf['Genotype Position'] == 0)
  s = mdf[ns_criteria]

  alldf_dict['Predicted Ngt'] += list(s['Predicted_Frequency'])
  alldf_dict['Observed Ngt'] += list(s['Frequency'])
  alldf_dict['_Experiment'] += [exp] * len(s['Frequency'])
  return alldf_dict
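# The calc_statistics variants in this file all share one accumulator
# pattern: the caller builds a defaultdict(list), calls calc_statistics once
# per experiment, and converts the result to a DataFrame. A minimal driver
# sketch follows; `experiments` and `load_experiment_data` are hypothetical
# stand-ins, not part of the source.
#
#   alldf_dict = defaultdict(list)
#   for exp in experiments:
#     exp_df = load_experiment_data(exp)  # hypothetical loader
#     calc_statistics(exp_df, exp, alldf_dict)
#   alldf = pd.DataFrame(alldf_dict)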
def calc_statistics(orig_df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict.
  # Deletion positions.
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  df = df[df['Length'] >= 5]
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel_cpf1(seq, cutsite)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']
  r = pearsonr(obs, pred)[0]

  alldf_dict['gt_r'].append(r)
  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def predict(inp_fn):
  # Predict deletion profiles for every sequence in inp_fn, flushing
  # buffered results to disk every 100,000 rows.
  _predict2.init_model(run_iter='aay', param_iter='aae')

  df_buffer = init_df_buffer()
  df_buffer_nm = ''

  timer = util.Timer(total=util.line_count(inp_fn))
  with open(inp_fn) as f:
    # Input is two-line records: a header line followed by a sequence line.
    for i, line in enumerate(f):
      if i % 2 == 0:
        header = line.strip()
        if df_buffer_nm == '':
          df_buffer_nm = header
      if i % 2 == 1:
        sequence = line.strip()
        if len(sequence) < 60:
          continue
        df_buffer = add_del_profiles(header, sequence, df_buffer)
        print(len(df_buffer))  # debug: current buffer size
        if len(df_buffer) > 100000:
          flush_df_buffer(df_buffer, df_buffer_nm)
          df_buffer_nm = ''
          df_buffer = init_df_buffer()
      timer.update()
  return
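# A hypothetical input sketch for predict(), inferred from the parsing loop
# above (not from the source): records are header/sequence line pairs, and
# sequences shorter than 60 nt are skipped.
#
#   exp_0
#   AGCTG...   (target sequence, >= 60 nt)
#   exp_1
#   TTGCA...   (target sequence, >= 60 nt)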
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict.
  # The ratios below include the 1 bp insertion count in the denominator.
  if sum(_lib.crispr_subset(df)['Count']) <= 1000:
    return

  ins_criteria = (df['Category'] == 'ins') & (df['Length'] == 1) & (df['Indel with Mismatches'] != 'yes')
  ins_count = sum(df[ins_criteria]['Count'])

  del_criteria = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes')
  del_count = sum(df[del_criteria]['Count'])
  if del_count == 0:
    return
  alldf_dict['Ins1bp/Del Ratio'].append(ins_count / (del_count + ins_count))

  mhdel_crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Microhomology-Based'] == 'yes')
  mhdel_count = sum(df[mhdel_crit]['Count'])
  try:
    alldf_dict['Ins1bp/MHDel Ratio'].append(ins_count / (mhdel_count + ins_count))
  except ZeroDivisionError:
    alldf_dict['Ins1bp/MHDel Ratio'].append(0)

  ins_ratio = ins_count / sum(_lib.crispr_subset(df)['Count'])
  alldf_dict['Ins1bp Ratio'].append(ins_ratio)

  seq, cutsite = _lib.get_sequence_cutsite(df)
  # Sequence context around the cutsite, masking the central four bases.
  alldf_dict['Sequence Context'].append(seq[-55:-30] + 'NNNN' + seq[-26:])
  alldf_dict['Fourbp'].append(seq[cutsite - 2 : cutsite + 2])
  alldf_dict['Base1'].append(seq[cutsite - 2])
  alldf_dict['Base2'].append(seq[cutsite - 1])
  alldf_dict['Base3'].append(seq[cutsite])
  alldf_dict['Base4'].append(seq[cutsite + 1])

  _predict2.init_model()
  del_score = _predict2.total_deletion_score(seq, cutsite)
  alldf_dict['Del Score'].append(del_score)

  # Entropy normalized by log(N) so values lie in [0, 1].
  dlpred = _predict2.deletion_length_distribution(seq, cutsite)
  norm_entropy = entropy(dlpred) / np.log(len(dlpred))
  alldf_dict['Entropy'].append(norm_entropy)

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(orig_df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict.
  # Normalized entropy of observed vs. predicted genotype and
  # deletion-length distributions.
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']
  # Entropy normalized by log(N) so values lie in [0, 1].
  obs_entropy = entropy(obs) / np.log(len(obs))
  pred_entropy = entropy(pred) / np.log(len(pred))
  alldf_dict['obs gt entropy'].append(obs_entropy)
  alldf_dict['pred gt entropy'].append(pred_entropy)

  # Deletion length distribution, 1-28 bp.
  df = orig_df[orig_df['Category'] == 'del']
  df = df[df['Length'] <= 28]
  df['Frequency'] = _lib.normalize_frequency(df)
  obs_dl = []
  for del_len in range(1, 28 + 1):
    freq = sum(df[df['Length'] == del_len]['Frequency'])
    obs_dl.append(freq)
  pred_dl = _predict2.deletion_length_distribution(seq, cutsite)

  obs_entropy = entropy(obs_dl) / np.log(len(obs_dl))
  pred_entropy = entropy(pred_dl) / np.log(len(pred_dl))
  alldf_dict['obs dl entropy'].append(obs_entropy)
  alldf_dict['pred dl entropy'].append(pred_entropy)

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs, data_nm):
  # Calculate statistics on df, saving to alldf_dict.
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return

  # Grab observed deletions, MH and MH-less
  obs_d = defaultdict(list)
  df = orig_df
  for del_len in range(1, 59 + 1):
    crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Length'] == del_len)
    s = df[crit]
    mh_s = s[s['Microhomology-Based'] == 'yes']
    for idx, row in mh_s.iterrows():
      obs_d['Count'].append(row['Count'])
      obs_d['Genotype Position'].append(row['Genotype Position'])
      obs_d['Length'].append(row['Length'])
      obs_d['Category'].append('del')
    # Pool all MH-less deletions of this length into a single row,
    # marked with the sentinel genotype position 'e'.
    mhless_s = s[s['Microhomology-Based'] != 'yes']
    obs_d['Length'].append(del_len)
    obs_d['Count'].append(sum(mhless_s['Count']))
    obs_d['Genotype Position'].append('e')
    obs_d['Category'].append('del')
  obs_df = pd.DataFrame(obs_d)

  # Grab observed 1 bp insertions, one row per inserted base
  ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1) & (orig_df['Indel with Mismatches'] != 'yes')
  ins_df = orig_df[ins_crit]
  truncated_ins_d = defaultdict(list)
  for ins_base in list('ACGT'):
    crit = (ins_df['Inserted Bases'] == ins_base)
    tot_count = sum(ins_df[crit]['Count'])
    truncated_ins_d['Count'].append(tot_count)
    truncated_ins_d['Inserted Bases'].append(ins_base)
    truncated_ins_d['Category'].append('ins')
    truncated_ins_d['Length'].append(1)
  ins_df = pd.DataFrame(truncated_ins_d)
  obs_df = obs_df.append(ins_df, ignore_index=True)

  obs_df['Frequency'] = _lib.normalize_frequency(obs_df)

  crispr_subset = _lib.crispr_subset(orig_df)
  frac_explained = sum(obs_df['Count']) / sum(crispr_subset['Count'])
  # Save this for aggregate plotting
  alldf_dict['Fraction Explained'].append(frac_explained)

  # Predict MH dels and MH-less dels
  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)
  pred_df = _predict2.predict_indels(seq, cutsite, rate_model, bp_model)

  # Merge observed and predicted, then record their correlation
  mdf = obs_df.merge(pred_df, how='outer', on=['Category', 'Genotype Position', 'Inserted Bases', 'Length'])
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)
  r = pearsonr(mdf['Frequency'], mdf['Predicted_Frequency'])[0]

  # Store the merged table so it can be plotted later; the correlation is
  # embedded in the filename. out_dir is assumed to be a module-level global.
  data_nm_out_dir = out_dir + data_nm + '/'
  util.ensure_dir_exists(data_nm_out_dir)
  exp_out_dir = data_nm_out_dir + exp + '/'
  util.ensure_dir_exists(exp_out_dir)
  out_fn = exp_out_dir + '%.3f.csv' % (r)
  mdf.to_csv(out_fn)

  # Store in alldf_dict
  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)
  return alldf_dict
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
  # Calculate statistics on df, saving to alldf_dict.
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return

  # Collapse observed 1 bp insertions to one row per inserted base.
  ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1)
  ins_df = orig_df[ins_crit]
  truncated_ins_d = defaultdict(list)
  for ins_base in list('ACGT'):
    crit = (ins_df['Inserted Bases'] == ins_base)
    tot_count = sum(ins_df[crit]['Count'])
    truncated_ins_d['Count'].append(tot_count)
    truncated_ins_d['Inserted Bases'].append(ins_base)
    truncated_ins_d['Category'].append('ins')
    truncated_ins_d['Length'].append(1)
  ins_df = pd.DataFrame(truncated_ins_d)
  df = df.append(ins_df, ignore_index=True)
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)

  # Predict rate of 1 bp insertions
  # Featurize first
  del_score = _predict2.total_deletion_score(seq, cutsite)
  dlpred = _predict2.deletion_length_distribution(seq, cutsite)
  norm_entropy = entropy(dlpred) / np.log(len(dlpred))
  ohmapper = {
      'A': [1, 0, 0, 0],
      'C': [0, 1, 0, 0],
      'G': [0, 0, 1, 0],
      'T': [0, 0, 0, 1],
  }
  fivebase = seq[cutsite - 1]
  onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
  onebp_features = np.array(onebp_features).reshape(1, -1)
  rate_1bpins = float(rate_model.predict(onebp_features))

  # Predict 1 bp genotype frequencies. The deletion predictions sum to 1,
  # so weight insertions by the odds rate/(1 - rate) before renormalizing.
  pred_1bpins_d = defaultdict(list)
  for ins_base in bp_model[fivebase]:
    freq = bp_model[fivebase][ins_base]
    freq *= rate_1bpins / (1 - rate_1bpins)
    pred_1bpins_d['Category'].append('ins')
    pred_1bpins_d['Length'].append(1)
    pred_1bpins_d['Inserted Bases'].append(ins_base)
    pred_1bpins_d['Predicted_Frequency'].append(freq)
  pred_1bpins_df = pd.DataFrame(pred_1bpins_d)
  pred_df = pred_df.append(pred_1bpins_df, ignore_index=True)
  pred_df['Predicted_Frequency'] /= sum(pred_df['Predicted_Frequency'])

  join_cols = ['Category', 'Genotype Position', 'Length', 'Inserted Bases']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']
  r = pearsonr(obs, pred)[0]
  alldf_dict['gt_r'].append(r)

  obs_entropy = entropy(obs) / np.log(len(obs))
  pred_entropy = entropy(pred) / np.log(len(pred))
  alldf_dict['obs entropy'].append(obs_entropy)
  alldf_dict['pred entropy'].append(pred_entropy)

  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)
  return alldf_dict
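# Why the odds weighting above works (a worked check, not from the source):
# if predicted deletion frequencies sum to 1 and insertions get total weight
# r / (1 - r), the combined mass is 1 / (1 - r), so after renormalization
# insertions take exactly the fraction r predicted by rate_model.
#
#   r = 0.25
#   del_mass = 1.0
#   ins_mass = r / (1 - r)           # = 1/3
#   total = del_mass + ins_mass      # = 4/3
#   assert abs(ins_mass / total - r) < 1e-9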
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict.
  # The ratios below include the 1 bp insertion count in the denominator.
  if sum(_lib.crispr_subset(df)['Count']) <= 1000:
    return

  editing_rate = sum(_lib.crispr_subset(df)['Count']) / sum(_lib.notnoise_subset(df)['Count'])
  alldf_dict['Editing Rate'].append(editing_rate)

  ins_criteria = (df['Category'] == 'ins') & (df['Length'] == 1) & (df['Indel with Mismatches'] != 'yes')
  ins_count = sum(df[ins_criteria]['Count'])

  del_criteria = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes')
  del_count = sum(df[del_criteria]['Count'])
  if del_count == 0:
    return
  alldf_dict['Ins1bp/Del Ratio'].append(ins_count / (del_count + ins_count))

  mhdel_crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Microhomology-Based'] == 'yes')
  mhdel_count = sum(df[mhdel_crit]['Count'])
  try:
    alldf_dict['Ins1bp/MHDel Ratio'].append(ins_count / (mhdel_count + ins_count))
  except ZeroDivisionError:
    alldf_dict['Ins1bp/MHDel Ratio'].append(0)

  ins_ratio = ins_count / sum(_lib.crispr_subset(df)['Count'])
  alldf_dict['Ins1bp Ratio'].append(ins_ratio)

  seq, cutsite = _lib.get_sequence_cutsite(df)
  fivebase = seq[cutsite - 1]
  alldf_dict['Fivebase'].append(fivebase)

  _predict2.init_model()
  del_score = _predict2.total_deletion_score(seq, cutsite)
  alldf_dict['Del Score'].append(del_score)

  dlpred = _predict2.deletion_length_distribution(seq, cutsite)
  norm_entropy = entropy(dlpred) / np.log(len(dlpred))
  alldf_dict['Entropy'].append(norm_entropy)

  # GC content of the 8 bp window centered on the cutsite
  local_seq = seq[cutsite - 4 : cutsite + 4]
  gc = (local_seq.count('C') + local_seq.count('G')) / len(local_seq)
  alldf_dict['GC'].append(gc)

  # One-hot encode the bases flanking the cutsite
  ohmapper = {
      'A': np.array([1, 0, 0, 0]),
      'C': np.array([0, 1, 0, 0]),
      'G': np.array([0, 0, 1, 0]),
      'T': np.array([0, 0, 0, 1]),
  }
  alldf_dict['Fivebase_OH'].append(ohmapper[fivebase])

  threebase = seq[cutsite]
  alldf_dict['Threebase'].append(threebase)
  alldf_dict['Threebase_OH'].append(ohmapper[threebase])

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
  # Calculate statistics on df, saving to alldf_dict.
  # Compare observed frameshift fractions to inDelphi and Bae predictions.
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return
  df = orig_df

  # Get observed frameshift rates
  obs_fs = {'+0': 0, '+1': 0, '+2': 0}
  all_ins_lens = set(df[df['Category'].isin(['ins', 'ins_notatcut'])]['Length'])
  for ins_len in all_ins_lens:
    crit = (df['Category'].isin(['ins', 'ins_notatcut'])) & (df['Length'] == ins_len)
    fs = ins_len % 3
    count = sum(df[crit]['Count'])
    key = '+%s' % (int(fs))
    obs_fs[key] += count

  all_del_lens = set(df[df['Category'].isin(['del', 'del_notatcut'])]['Length'])
  for del_len in all_del_lens:
    crit = (df['Category'].isin(['del', 'del_notatcut'])) & (df['Length'] == del_len)
    fs = (-1 * del_len) % 3
    count = sum(df[crit]['Count'])
    key = '+%s' % (int(fs))
    obs_fs[key] += count

  tot = sum(obs_fs.values())
  for key in obs_fs:
    obs_fs[key] /= tot

  # Predict
  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)

  # Predict rate of 1 bp insertions
  # Featurize first
  del_score = _predict2.total_deletion_score(seq, cutsite)
  dlpred = _predict2.deletion_length_distribution(seq, cutsite)
  norm_entropy = entropy(dlpred) / np.log(len(dlpred))
  ohmapper = {
      'A': [1, 0, 0, 0],
      'C': [0, 1, 0, 0],
      'G': [0, 0, 1, 0],
      'T': [0, 0, 0, 1],
  }
  fivebase = seq[cutsite - 1]
  onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
  onebp_features = np.array(onebp_features).reshape(1, -1)
  rate_1bpins = float(rate_model.predict(onebp_features))

  # Prepend the 1 bp insertion frequency (odds-weighted), then renormalize
  freq = rate_1bpins / (1 - rate_1bpins)
  pred = list(dlpred)
  pred.insert(0, freq)
  pred = np.array(pred) / sum(pred)

  pred_fs = {'+0': 0, '+1': 0, '+2': 0}
  pred_fs['+1'] += pred[0]  # a 1 bp insertion shifts the frame by +1
  for idx in range(1, len(pred)):
    del_len = idx
    fs = (-1 * del_len) % 3
    key = '+%s' % (int(fs))
    pred_fs[key] += pred[idx]

  # Bae et al. predictor; bae_prediction is assumed defined elsewhere
  # in this module.
  bae_fs = {'+0': 0, '+1': 0, '+2': 0}
  bae_dlpred = bae_prediction(seq, cutsite)
  for idx in range(len(bae_dlpred)):
    del_len = idx + 1
    fs = (-1 * del_len) % 3
    key = '+%s' % (int(fs))
    bae_fs[key] += bae_dlpred[idx]

  # One row per frame so all columns stay the same length
  for fs in ['+0', '+1', '+2']:
    alldf_dict['Frame'].append(fs)
    alldf_dict['Bae'].append(bae_fs[fs])
    alldf_dict['inDelphi'].append(pred_fs[fs])
    alldf_dict['Obs'].append(obs_fs[fs])
    alldf_dict['_Experiment'].append(exp)
    alldf_dict['rs'].append(rs)
  return alldf_dict
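# Sanity check on the frame arithmetic above (illustrative, not from the
# source): a deletion of length d shifts the reading frame by (-d) mod 3,
# and an insertion of length i shifts it by i mod 3.
#
#   assert (-1 * 3) % 3 == 0   # 3 bp deletion is in-frame
#   assert (-1 * 1) % 3 == 2   # 1 bp deletion -> '+2'
#   assert (-1 * 2) % 3 == 1   # 2 bp deletion -> '+1'
#   assert 1 % 3 == 1          # 1 bp insertion -> '+1'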
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
  # Calculate statistics on df, saving to alldf_dict.
  # Correlate the observed indel length distribution (1 bp insertion plus
  # 1-28 bp deletions) with the predicted one.
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return
  df = orig_df

  obs = []
  # 1 bp insertion
  crit = (df['Category'] == 'ins') & (df['Indel with Mismatches'] != 'yes') & (df['Length'] == 1)
  obs.append(sum(df[crit]['Count']))
  # 1 - 28 bp deletions
  for del_len in range(1, 28 + 1):
    crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Length'] == del_len)
    obs.append(sum(df[crit]['Count']))
  obs = np.array(obs) / sum(obs)

  # Predict
  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)

  # Predict rate of 1 bp insertions
  # Featurize first
  del_score = _predict2.total_deletion_score(seq, cutsite)
  dlpred = _predict2.deletion_length_distribution(seq, cutsite)
  norm_entropy = entropy(dlpred) / np.log(len(dlpred))
  ohmapper = {
      'A': [1, 0, 0, 0],
      'C': [0, 1, 0, 0],
      'G': [0, 0, 1, 0],
      'T': [0, 0, 0, 1],
  }
  fivebase = seq[cutsite - 1]
  onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
  onebp_features = np.array(onebp_features).reshape(1, -1)
  rate_1bpins = float(rate_model.predict(onebp_features))

  # Prepend the 1 bp insertion frequency (odds-weighted), then renormalize
  freq = rate_1bpins / (1 - rate_1bpins)
  pred = list(dlpred)
  pred.insert(0, freq)
  pred = np.array(pred) / sum(pred)

  r = pearsonr(obs, pred)[0]
  alldf_dict['dl_r'].append(r)

  obs_entropy = entropy(obs) / np.log(len(obs))
  pred_entropy = entropy(pred) / np.log(len(pred))
  alldf_dict['obs entropy'].append(obs_entropy)
  alldf_dict['pred entropy'].append(pred_entropy)

  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)
  return alldf_dict