def prepare_indelphi(seq, cut, celltype):
    """Run inDelphi on one target and return an annotated, ranked table.

    The model is initialised for ``celltype``, a prediction is made for
    ``seq`` at position ``cut``, and the resulting frame is passed in turn
    through ``inDelphi.add_mhless_genotypes``,
    ``inDelphi.add_genotype_column`` and ``inDelphi.add_name_column``.
    The 'Predicted frequency' column is then rescaled so that it sums to 1
    and the rows are sorted by that column, highest first.

    Args:
        seq: Target sequence handed to ``inDelphi.predict``.
        cut: Cut-site position handed to ``inDelphi.predict``.
        celltype: Cell type handed to ``inDelphi.init_model``; it is also
            printed to stdout.

    Returns:
        The annotated prediction DataFrame, sorted by descending
        'Predicted frequency'.
    """
    freq_col = 'Predicted frequency'

    print(celltype)
    inDelphi.init_model(celltype=celltype)

    raw, stats = inDelphi.predict(seq, cut)
    with_mhless = inDelphi.add_mhless_genotypes(raw, stats)
    with_genotype = inDelphi.add_genotype_column(with_mhless, stats)
    named = inDelphi.add_name_column(with_genotype, stats)

    # Rescale so the frequencies of all listed outcomes add up to exactly 1.
    total = named.loc[:, freq_col].sum()
    named.loc[:, freq_col] = named.loc[:, freq_col] / total

    return named.sort_values(by=[freq_col], ascending=False)
def format_predction(seq, pam_idx):
    """Format the output of the inDelphi prediction method.

    The cut site is taken to be 3 positions before ``pam_idx``.  The
    predictions are summed per (Category, Length) pair and each pair is
    given a label made of the first character of the category followed by
    the length, upper-cased.

    Args:
        seq: Target sequence handed to ``inDelphi.predict``.
        pam_idx: Index of the PAM within ``seq``.

    Returns:
        A ``(table, stats)`` tuple.  ``table`` is a DataFrame with the
        columns ``type`` (the label described above) and ``pred`` (the
        summed 'Predicted frequency' divided by 100 -- presumably turning a
        percentage into a fraction; confirm against inDelphi).  ``stats``
        is returned unchanged from ``inDelphi.predict``.
    """
    cutsite = pam_idx - 3

    pred_df, stats = inDelphi.predict(seq, cutsite)
    grouped = pred_df.groupby(['Category', 'Length'], as_index=False).agg(
        {'Predicted frequency': 'sum'})
    grouped['New category'] = (
        grouped['Category'].astype(str).str[0] + grouped['Length'].map(str))

    # Take an explicit copy of the column subset: assigning into a bare
    # selection is what triggers pandas' SettingWithCopyWarning.
    table = grouped[['New category', 'Predicted frequency']].copy()
    table.columns = ['type', 'pred']
    table['type'] = table['type'].apply(lambda label: label.upper())
    table['pred'] = table['pred'].astype(float) / 100
    return table, stats
""" Run each component of inDephi to understand its inputs/outputs """ # zzjfrank, 2020-10-11 from inDelphi import init_model, predict # the example sequence in inDelphi webserver left = 'GCAGTCAGTGCAGTAGAGGATGTGTCGCTCTCCCGTACGGCGTGAAAATGACTAGCAAAG' right = 'TTGGGGCCTTTTTGGAAGACCTAGAGCCTTAGGCCACGGTACACAATGGTGTCCTGCATA' seq = left + right cutsite = len(left) pred_df, stats = predict(seq, cutsite)
# Process every row after the first (presumably a header -- confirm against
# the input file).  `lines`, `my_cut`, `Fr` and `inDelphi` are defined
# outside this fragment.
for row in lines[1:]:
    fields = row.rstrip().split("\t")

    # Columns 2/3 hold the wild-type/mutant gRNA, columns 6/7 the matching
    # reference sequences; everything is upper-cased before use.
    wt_grna, mut_grna = fields[2].upper(), fields[3].upper()
    wt_ref, mut_ref = fields[6].upper(), fields[7].upper()

    # my_cut returns a (reference, cut position, orientation) triple; the
    # returned reference replaces the one read from the row.
    wt_ref, wt_cut, wt_orientation = my_cut(wt_grna, wt_ref)
    mut_ref, mut_cut, mut_orientation = my_cut(mut_grna, mut_ref)

    # Summary line: the first five input columns followed by the
    # references, cut positions and orientations.
    summary = fields[:5] + [
        wt_ref,
        mut_ref,
        str(wt_cut),
        str(wt_orientation),
        str(mut_cut),
        str(mut_orientation),
    ]
    Fr.write("\t".join(summary) + "\n")

    wt_pred_df, wt_stats = inDelphi.predict(wt_ref, wt_cut)
    mut_pred_df, mut_stats = inDelphi.predict(mut_ref, mut_cut)

    wt_df_indel = inDelphi.get_indel_length_fqs(wt_pred_df)
    mut_df_indel = inDelphi.get_indel_length_fqs(mut_pred_df)

    # One tab-separated table per allele, named after column 1 of the row.
    wt_df_indel.to_csv("wt_{}_indel.xls".format(fields[1]),
                       sep="\t", index=False)
    mut_df_indel.to_csv("mut_{}_indel.xls".format(fields[1]),
                        sep="\t", index=False)

Fr.close()
bar = Bar('Simulating sequences:', max=len(target_seqs_data))

# The cell type is the same for every guide, so the model is initialised
# once up front instead of on every loop iteration.
inDelphi.init_model(celltype='mESC')

for gRNA_id, target_seq in target_seqs_data.items():
    # Activity score for this guide.  The scorer is given the 30-mer that
    # runs from 21 bp before the cut site to 9 bp after it (described by
    # the original author as 4 bp flank + 20 bp guide + 3 bp PAM + 3 bp
    # flank).
    seq_for_doench = target_seq[cutsite - 21:cutsite + 9]
    doench_score = calc_doench_score(seq_for_doench)

    # The score is on a 0-100 scale; scale it to a number of edited reads.
    # int() guarantees a true integer: round() alone can hand back a float
    # (Python 2, some NumPy scalar types), which np.random.choice and
    # np.repeat reject.
    n_edit_seqs = int(round(doench_score * sim_reads / 100))

    # Editing outcomes predicted by inDelphi.
    pred_df, stats = inDelphi.predict(target_seq, cutsite)
    pred_df = inDelphi.add_mhless_genotypes(pred_df, stats)
    # add_genotype_column_wgaps puts gaps in the deletions;
    # add_genotype_column would avoid the gaps, but sequences could then
    # be confused with one another.
    pred_df = inDelphi.add_genotype_column_wgaps(pred_df, stats)

    # Normalise the probabilities so they sum to exactly 1, as required by
    # np.random.choice.  dtype=float keeps the in-place division valid
    # even if the column is not stored as floats.
    pred_frequency = np.array(pred_df["Predicted frequency"], dtype=float)
    pred_frequency /= pred_frequency.sum()

    # Simulated data: first the edited reads ...
    edit_seqs = np.random.choice(pred_df["Genotype"],
                                 p=pred_frequency,
                                 size=n_edit_seqs)
    # ... then unedited reads to top up to the sim_reads target.
    wt_seqs = np.repeat(target_seq, sim_reads - n_edit_seqs)
import inDelphi
from scipy.stats import linregress
import numpy as np
import pandas as pd
# The project helpers imported below live under this directory, so the
# path has to be extended before they are imported.
sys.path.append("/cluster/bh0085")
from mybio import util
from _config import REDUCED_LIB, OUT_PLACE
import importlib
try:
    # Legacy module, removed in Python 3.12.  Kept importable where it
    # still exists in case other code in this file references it.
    import imp  # noqa: F401
except ImportError:
    pass

# Fall back to a fixed name when the script is run without __file__ set.
if "__file__" not in vars():
    __file__ = "f_test"
NAME = util.get_fn(__file__)
OUT_DIR = os.path.join(OUT_PLACE, NAME)
util.ensure_dir_exists(OUT_DIR)

# Every designed sequence is cut at the same 0-indexed position.
CUTSITE = 34
SEQ_COLUMN = "Designed sequence (61-bp, cutsite at position 34 by 0-index)"

# Collect the per-sequence tables and concatenate once at the end:
# DataFrame.append copied the whole accumulated frame on every call and
# no longer exists in pandas >= 2.0.
prediction_frames = []
for model in ["mESC", "U2OS"]:
    # Reload the module before initialising each cell type -- presumably
    # init_model will not re-initialise an already-initialised module;
    # confirm against inDelphi.
    importlib.reload(inDelphi)
    inDelphi.init_model(celltype=model)
    for k, row in REDUCED_LIB.iterrows():
        target_seq = row[SEQ_COLUMN]
        pred_df, stats = inDelphi.predict(target_seq, CUTSITE)
        pred_df = inDelphi.add_mhless_genotypes(pred_df, stats)
        # Tag each row with the model and library id it came from.
        pred_df = pred_df.assign(celltype=model, libid=k)
        prediction_frames.append(pred_df)

# pd.concat rejects an empty list; keep the old behaviour of writing an
# empty table when the library has no rows.
if prediction_frames:
    all_predictions = pd.concat(prediction_frames, ignore_index=True)
else:
    all_predictions = pd.DataFrame()

all_predictions.to_csv(os.path.join(OUT_DIR, "indelphi_genotypes.csv"),
                       index=False)