def calc_statistics(df, exp, alldf_dict):
  """Record observed vs. predicted frequencies of 'Ngt' microhomology
  deletions (genotype position == deletion length) for one experiment.

  Args:
    df: per-experiment read-count DataFrame (project schema with 'Category',
        'Genotype Position', 'Length', 'Count' columns).
    exp: experiment identifier, used to tag appended rows.
    alldf_dict: dict of lists accumulated across experiments; mutated in place.

  Returns:
    alldf_dict, or None when the experiment has too few reads (<= 1000).
  """
  # Restrict to mismatch-free microhomology deletions of length >= 5.
  df = _lib.mh_del_subset(df)
  df = _lib.indels_without_mismatches_subset(df)
  df = df[df['Length'] >= 5]
  # Skip poorly covered experiments.
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)
  # Keep only length >= 5 predictions and renormalize them to sum to 1.
  pred_df = pred_df[pred_df['Length'] >= 5]
  pred_df['Predicted_Frequency'] /= sum(pred_df['Predicted_Frequency'])

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  # Direct assignment instead of chained-indexer inplace fillna, which is
  # unreliable under pandas copy-on-write semantics.
  mdf['Frequency'] = mdf['Frequency'].fillna(0)
  mdf['Predicted_Frequency'] = mdf['Predicted_Frequency'].fillna(0)

  # 'Ngt' events: deletion genotype position equals deletion length.
  ns_criteria = (mdf['Length'] - mdf['Genotype Position'] == 0)
  s = mdf[ns_criteria]
  alldf_dict['Predicted Ngt'] += list(s['Predicted_Frequency'])
  alldf_dict['Observed Ngt'] += list(s['Frequency'])
  alldf_dict['_Experiment'] += [exp] * len(s)
  return alldf_dict
def add_del_profiles(header, sequence, df_buffer):
  """Scan `sequence` for PAM sites (GG on the forward strand, CC on the
  reverse strand) and append a predicted deletion profile for each site,
  plus a shuffled-context negative control, to df_buffer.

  Args:
    header: FASTA-style record name stored alongside each prediction.
    sequence: nucleotide string to scan.
    df_buffer: DataFrame accumulating predictions across records.

  Returns:
    df_buffer with the new prediction rows appended.
  """
  for idx in range(len(sequence)):
    seq = ''
    # 'CC' dinucleotide: PAM on the reverse strand; take the reverse
    # complement of the 60-mer so the model always sees an NGG context.
    if sequence[idx : idx + 2] == 'CC':
      cutsite = idx + 6
      seq = sequence[cutsite - 30 : cutsite + 30]
      seq = compbio.reverse_complement(seq)
    # 'GG' dinucleotide: PAM on the forward strand.
    if sequence[idx : idx + 2] == 'GG':
      cutsite = idx - 4
      seq = sequence[cutsite - 30 : cutsite + 30]
    if seq != '':
      # Sites too close to the sequence edge yield short slices; skip them.
      if len(seq) != 60:
        continue
      local_cutsite = 30
      pred_df = _predict2.predict_mhdel(seq, local_cutsite)
      pred_df['header'] = header
      pred_df['seq'] = sequence
      pred_df['pam'] = sequence[idx : idx + 2]
      pred_df['cutsite'] = cutsite
      pred_df['shuffled'] = 'no'
      # pd.concat: DataFrame.append was removed in pandas 2.0.
      df_buffer = pd.concat([df_buffer, pred_df], ignore_index=True)

      # Negative control: shuffle the context around the cutsite while
      # keeping a fixed 'GG' at positions 34-35.
      pre, post = list(seq[:34]), list(seq[36:])
      random.shuffle(pre)
      random.shuffle(post)
      shuffled_seq = ''.join(pre) + 'GG' + ''.join(post)
      # BUGFIX: previously predicted on `seq` again, so `shuffled_seq` was
      # never used and the control duplicated the real prediction.
      shuffled_pred_df = _predict2.predict_mhdel(shuffled_seq, local_cutsite)
      shuffled_pred_df['header'] = header
      shuffled_pred_df['seq'] = sequence
      shuffled_pred_df['pam'] = sequence[idx : idx + 2]
      shuffled_pred_df['cutsite'] = cutsite
      shuffled_pred_df['shuffled'] = 'yes'
      df_buffer = pd.concat([df_buffer, shuffled_pred_df],
                            ignore_index=True)
  return df_buffer
def calc_statistics(orig_df, exp, alldf_dict):
  """Record normalized entropies of observed vs. predicted deletion genotype
  and deletion-length distributions for one experiment.

  Args:
    orig_df: per-experiment read-count DataFrame (project schema).
    exp: experiment identifier, used to tag appended rows.
    alldf_dict: dict of lists accumulated across experiments; mutated in place.

  Returns:
    alldf_dict, or None when the experiment has too few reads (<= 1000).
  """
  # --- Genotype-level distribution over MH deletions ---
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  # Skip poorly covered experiments.
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  # Direct assignment instead of chained-indexer inplace fillna, which is
  # unreliable under pandas copy-on-write semantics.
  mdf['Frequency'] = mdf['Frequency'].fillna(0)
  mdf['Predicted_Frequency'] = mdf['Predicted_Frequency'].fillna(0)
  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']

  # Entropy normalized by log(support size) so values fall in [0, 1].
  obs_entropy = entropy(obs) / np.log(len(obs))
  pred_entropy = entropy(pred) / np.log(len(pred))
  alldf_dict['obs gt entropy'].append(obs_entropy)
  alldf_dict['pred gt entropy'].append(pred_entropy)

  # --- Deletion-length distribution (1..28 bp) ---
  df = orig_df[orig_df['Category'] == 'del']
  # .copy(): the slice above is a view of orig_df; assigning a column into it
  # raises SettingWithCopyWarning and is a no-op under copy-on-write.
  df = df[df['Length'] <= 28].copy()
  df['Frequency'] = _lib.normalize_frequency(df)
  obs_dl = [sum(df[df['Length'] == del_len]['Frequency'])
            for del_len in range(1, 28 + 1)]
  pred_dl = _predict2.deletion_length_distribution(seq, cutsite)

  obs_entropy = entropy(obs_dl) / np.log(len(obs_dl))
  pred_entropy = entropy(pred_dl) / np.log(len(pred_dl))
  alldf_dict['obs dl entropy'].append(obs_entropy)
  alldf_dict['pred dl entropy'].append(pred_entropy)

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
  """Compare observed vs. predicted indel genotype distributions (MH deletions
  plus 1-bp insertions) for one experiment; append Pearson r and normalized
  entropies to alldf_dict.

  Args:
    orig_df: per-experiment read-count DataFrame (project schema).
    exp: experiment identifier.
    rate_model: sklearn-style regressor; predicts the 1-bp insertion rate
        from [one-hot fivebase, normalized dl entropy, total deletion score].
    bp_model: nested mapping, bp_model[fivebase][ins_base] -> base frequency.
    alldf_dict: dict of lists accumulated across experiments; mutated in place.
    rs: replicate/seed tag recorded alongside the statistics.

  Returns:
    alldf_dict, or None when the experiment has too few reads (<= 1000).
  """
  # --- Observed MH deletion genotypes ---
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  # Skip poorly covered experiments.
  if sum(df['Count']) <= 1000:
    return

  # Collapse observed 1-bp insertions to one row per inserted base.
  ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1)
  ins_df = orig_df[ins_crit]
  truncated_ins_d = defaultdict(list)
  for ins_base in list('ACGT'):
    crit = (ins_df['Inserted Bases'] == ins_base)
    truncated_ins_d['Count'].append(sum(ins_df[crit]['Count']))
    truncated_ins_d['Inserted Bases'].append(ins_base)
    truncated_ins_d['Category'].append('ins')
    truncated_ins_d['Length'].append(1)
  ins_df = pd.DataFrame(truncated_ins_d)
  # pd.concat: DataFrame.append was removed in pandas 2.0.
  df = pd.concat([df, ins_df], ignore_index=True)
  df['Frequency'] = _lib.normalize_frequency(df)

  # --- Predicted MH deletion genotypes ---
  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)

  # --- Predict the rate of 1-bp insertions: featurize, then regress ---
  del_score = _predict2.total_deletion_score(seq, cutsite)
  dlpred = _predict2.deletion_length_distribution(seq, cutsite)
  # Entropy normalized by log(support size) so the feature falls in [0, 1].
  norm_entropy = entropy(dlpred) / np.log(len(dlpred))
  ohmapper = {'A': [1, 0, 0, 0],
              'C': [0, 1, 0, 0],
              'G': [0, 0, 1, 0],
              'T': [0, 0, 0, 1]}
  # Base immediately 5' of the cutsite drives 1-bp insertion identity.
  fivebase = seq[cutsite - 1]
  onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
  onebp_features = np.array(onebp_features).reshape(1, -1)
  rate_1bpins = float(rate_model.predict(onebp_features))

  # --- Predicted 1-bp insertion genotype frequencies ---
  # Scale base frequencies by odds(ins rate) so insertions and deletions
  # share one normalized distribution after concatenation below.
  pred_1bpins_d = defaultdict(list)
  for ins_base in bp_model[fivebase]:
    freq = bp_model[fivebase][ins_base]
    freq *= rate_1bpins / (1 - rate_1bpins)
    pred_1bpins_d['Category'].append('ins')
    pred_1bpins_d['Length'].append(1)
    pred_1bpins_d['Inserted Bases'].append(ins_base)
    pred_1bpins_d['Predicted_Frequency'].append(freq)
  pred_1bpins_df = pd.DataFrame(pred_1bpins_d)
  pred_df = pd.concat([pred_df, pred_1bpins_df], ignore_index=True)
  pred_df['Predicted_Frequency'] /= sum(pred_df['Predicted_Frequency'])

  # --- Align observed and predicted, then score ---
  join_cols = ['Category', 'Genotype Position', 'Length', 'Inserted Bases']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  # Direct assignment instead of chained-indexer inplace fillna, which is
  # unreliable under pandas copy-on-write semantics.
  mdf['Frequency'] = mdf['Frequency'].fillna(0)
  mdf['Predicted_Frequency'] = mdf['Predicted_Frequency'].fillna(0)
  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']

  alldf_dict['gt_r'].append(pearsonr(obs, pred)[0])
  alldf_dict['obs entropy'].append(entropy(obs) / np.log(len(obs)))
  alldf_dict['pred entropy'].append(entropy(pred) / np.log(len(pred)))
  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)
  return alldf_dict