def updatePam(indel, orig_pam_loc, pam_dir):
    """Return the PAM location after accounting for an indel and its mutations.

    The indel string is parsed with ``tokFullIndel``. When the indel is not
    the null indel ('-') and its left edge lies before the PAM, the PAM
    location is moved by the net size change, but never further left than
    that left edge. Mutation tokens are then walked in order: 'D' and 'I'
    tokens record a (signed) size, and each following 'S' token supplies a
    position; if that position maps before the PAM, the PAM is moved by the
    most recently recorded size.

    Args:
        indel: Indel identifier string understood by ``tokFullIndel``.
        orig_pam_loc: PAM location before the indel is applied.
        pam_dir: 'REVERSE' selects the reverse-orientation arithmetic; any
            other value uses the forward arithmetic.

    Returns:
        The adjusted PAM location.
    """
    itype, isize, details, muts = tokFullIndel(indel)
    reverse = pam_dir == 'REVERSE'
    pam_loc = orig_pam_loc

    if itype != '-':
        if reverse:
            left_pos = pam_loc + 2 - (details['R'] - 1) + details['C']
            right_pos = pam_loc + 2 - (details['L'] + 1) + details['C']
        else:
            left_pos = pam_loc - 3 + (details['L'] + 1) + details['C']
            right_pos = pam_loc - 3 + (details['R'] - 1) + details['C']
        # right_pos is computed alongside left_pos but is not used below.

        # Net number of bases removed (negative when bases are gained).
        delsize = isize - details['I'] if itype == 'D' else -isize + details['D']

        if left_pos < pam_loc:
            pam_loc = max(pam_loc - delsize, left_pos)

    for muttype, mutpos, nucl in muts:
        # 'D'/'I' tokens only record a size; the shift itself is applied when
        # a later 'S' token is reached.
        if muttype == 'D':
            msize = mutpos
        elif muttype == 'I':
            msize = -mutpos
        if muttype != 'S':
            continue

        mutidx = (pam_loc + 2 - mutpos) if reverse else (pam_loc - 3 + mutpos)
        if mutidx < pam_loc:
            # NOTE(review): msize is whatever the last 'D'/'I' token set; it is
            # unbound if an 'S' token is reached before any 'D'/'I' token.
            pam_loc = pam_loc - msize
    return pam_loc
def writeMCISummary(fout, id, p1, stats1, oligo_det, more_indels=False):
    """Write one tab-separated summary row per selected indel of a profile.

    Args:
        fout: Writable text stream receiving the rows.
        id: Identifier written in the first column.
        p1: Profile mapping indel string -> read count.
        stats1: Indexable stats; total reads are ``stats1[0] - stats1[2]``.
        oligo_det: 3-tuple unpacked as (pam_loc, pam_dir, seq); also passed
            through to ``getSequence``.
        more_indels: When False only ``getHighestIndel(p1)`` is written;
            when True every non-null indel from ``getProfileCounts(p1)`` is.

    For deletions without inserted bases the row also carries the
    microhomology sequence (only when both flanking copies agree) and the
    altered sequence between the left and right positions.
    """
    if more_indels:
        mcis = [entry[1] for entry in getProfileCounts(p1) if entry[1] != '-']
    else:
        mcis = [getHighestIndel(p1)]

    row_format = u'%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\n'
    for mci in mcis:
        mci_reads = p1[mci]
        total_reads = stats1[0] - stats1[2]
        itype, isize, details, muts = tokFullIndel(mci)
        pam_loc, pam_dir, seq = oligo_det

        mh_seq = ''
        altered_seq = ''
        plain_deletion = itype == 'D' and details.get('I', 0) == 0
        if plain_deletion:
            mh_len = details['C']
            if mh_len > 0:
                left_copy = getSequence(oligo_det, details['L'] + 1,
                                        details['L'] + mh_len)
                right_copy = getSequence(oligo_det, details['R'] - mh_len,
                                         details['R'] - 1)
                if left_copy == right_copy:
                    mh_seq = left_copy
            # Note includes MH seq at both ends
            altered_seq = getSequence(oligo_det, details['L'] + 1,
                                      details['R'] - 1)

        fout.write(row_format % (id, mci, details['L'], details['R'],
                                 details['C'], itype, isize, mci_reads,
                                 total_reads, mh_seq, altered_seq))
def predictMutations(theta_file, target_seq, pam_idx, add_null=True):
    """Predict the indel profile for a target sequence from stored thetas.

    Runs the external indel generator (``INDELGENTARGET_EXE``) on the target,
    computes features for every generated indel, and feeds them to
    ``computePredictedProfile`` with the model read from ``theta_file``.

    Args:
        theta_file: Path passed to ``readTheta``.
        target_seq: Target sequence; also embedded in the temp file names.
        pam_idx: PAM index in ``target_seq`` (``pam_idx - 3`` is handed to
            the feature calculation).
        add_null: When True, a null entry ('-') is added to the returned
            profile (fixed at 1000) and to the representative reads.

    Returns:
        (p_predict, rep_reads, in_frame_perc) where ``in_frame_perc`` is the
        percentage of in-frame counts, or 0.0 when there are no counts.

    Raises:
        Exception: If the model needs feature columns that were not computed.
        subprocess.CalledProcessError: If the indel generator fails.
    """
    theta, train_set, theta_feature_columns = readTheta(theta_file)

    # Generate indels
    left_trim = 0
    tmp_genindels_file = 'tmp_genindels_%s_%d.txt' % (target_seq, random.randint(0, 100000))
    tmp_features_file = None
    cmd = INDELGENTARGET_EXE + ' %s %d %s' % (target_seq, pam_idx, tmp_genindels_file)
    print(cmd)
    try:
        subprocess.check_call(cmd.split())
        rep_reads = fetchRepReads(tmp_genindels_file)
        if len(rep_reads) > 0:
            isize, smallest_indel = min([(tokFullIndel(x)[1], x) for x in rep_reads])
        else:
            isize, smallest_indel = 0, '-'
        if isize > 0:
            # NOTE(review): str.find returns -1 when the prefix is absent,
            # which would make the null read below a single trailing base.
            left_trim = target_seq.find(rep_reads[smallest_indel][:10])

        # Compute features for all generated indels
        tmp_features_file = 'tmp_features_%s_%d.txt' % (target_seq, random.randint(0, 100000))
        calculateFeaturesForGenIndelFile(tmp_genindels_file, target_seq,
                                         pam_idx - 3, tmp_features_file)
        feature_data, feature_columns = readFeaturesData(tmp_features_file)
    finally:
        # Always clean up the temp files, even when a step above raised.
        for tmp_file in (tmp_genindels_file, tmp_features_file):
            if tmp_file is not None and os.path.isfile(tmp_file):
                os.remove(tmp_file)

    if len(set(theta_feature_columns).difference(set(feature_columns))) != 0:
        raise Exception('Stored feature names associated with model thetas are not contained in those computed')

    # Extra computed features: restrict the data to the model's columns.
    if len(set(theta_feature_columns).union(set(feature_columns))) != len(theta_feature_columns):
        feature_data = feature_data[['Indel'] + theta_feature_columns]
        feature_columns = theta_feature_columns

    # Predict the profile
    p_predict, _ = computePredictedProfile(feature_data, theta, theta_feature_columns)
    in_frame, out_frame, _ = fetchIndelSizeCounts(p_predict)
    framed_total = in_frame + out_frame
    # Guard against an empty prediction instead of dividing by zero.
    in_frame_perc = in_frame * 100.0 / framed_total if framed_total else 0.0
    if add_null:
        p_predict['-'] = 1000
        rep_reads['-'] = target_seq[left_trim:]
    return p_predict, rep_reads, in_frame_perc
def compileMappedNull(file_prefix, read_lookup, pam_lookup, exp_oligo_lookup):
    """Compile per-oligo null indel profiles into '<prefix>_nullsummary.txt'.

    Reads '<prefix>_mappings.txt' (tab-separated), counts reads per
    (oligo, indel) and remembers the first read sequence seen for each.
    The summary lists, per oligo (sorted by id), one line per indel in
    decreasing (count, indel) order with the sequence, indel, updated PAM
    location, PAM direction and percentage of the oligo's reads.

    Args:
        file_prefix: Path prefix of the mappings/summary files; its last
            '/'-separated component is the key into ``exp_oligo_lookup``.
        read_lookup: Mapping '<oligo_id>.<read name>' -> read sequence.
        pam_lookup: Mapping oligo id -> (pam_loc, pam_dir).
        exp_oligo_lookup: Mapping sample name -> iterable of
            (oligo_id, pam_loc, pam_dir, seq) tuples.

    If the mappings file is missing a message is printed and an empty
    summary file is written.
    """
    read_profiles, indel_seqs = {}, {}
    mappings_file = file_prefix + '_mappings.txt'
    if not os.path.isfile(mappings_file):
        print('Could not find file', mappings_file)
    else:
        # Add 5 pseudo reads for the NULL indel for all oligos
        # (in case poorly represented in the NULL measure)
        sample_name = file_prefix.split('/')[-1]
        if sample_name in exp_oligo_lookup:
            for (oligo_id, pam_loc, pam_dir, seq) in exp_oligo_lookup[sample_name]:
                read_profiles[oligo_id] = {'-': 5}
                indel_seqs[oligo_id] = {'-': seq}

        # Context manager so the handle is released even if a row is bad.
        with io.open(mappings_file) as f:
            for toks in csv.reader(f, delimiter='\t'):
                oligo_id = toks[1].split('_')[0]
                read_id = oligo_id + '.' + toks[0].split()[0]
                if oligo_id not in read_profiles:
                    read_profiles[oligo_id] = {}
                    indel_seqs[oligo_id] = {}
                seq = read_lookup[read_id]
                indel = toks[2] + '_' + toks[3]  # combine mutations with indels
                # The parsed result is unused; the call is kept so any error
                # it raises on a bad indel string still surfaces here.
                tokFullIndel(indel)
                if indel == '-_-':
                    indel = '-'
                if indel not in read_profiles[oligo_id]:
                    read_profiles[oligo_id][indel] = 0
                    indel_seqs[oligo_id][indel] = seq
                read_profiles[oligo_id][indel] += 1

    with io.open(file_prefix + '_nullsummary.txt', 'w') as fout:
        for oligo_id in sorted(read_profiles):
            orig_pam_loc, pam_dir = pam_lookup[oligo_id]
            fout.write(u'@@@%s\n' % oligo_id)
            profile = read_profiles[oligo_id]
            indel_counts = sorted(((profile[x], x) for x in profile), reverse=True)
            total_counts = sum(count for count, _ in indel_counts)
            for (count, indel) in indel_counts:
                seq = indel_seqs[oligo_id][indel]
                perc = count * 100.0 / total_counts
                pam_loc = updatePam(indel, orig_pam_loc, pam_dir)
                fout.write(u'%s\t%s\t%d\t%s\t%.3f\n' % (seq, indel, pam_loc, pam_dir, perc))
def fetchIndelSizeCounts(p1):
    """Tally in-frame/out-of-frame reads and per-size counts for a profile.

    Args:
        p1: Mapping indel string -> count. The null indel '-' is skipped.

    Returns:
        (inframe, outframe, size_counts). ``size_counts`` maps the indel type
        letter to {net size: count}; sizes 1..20 are always present for 'I'
        and 'D'. Net size is the indel size minus the parsed 'I' and 'D'
        detail values; a net size divisible by 3 counts as in-frame.
    """
    size_counts = {
        'I': dict.fromkeys(range(1, 21), 0),
        'D': dict.fromkeys(range(1, 21), 0),
    }
    inframe = 0
    outframe = 0
    for indel, reads in p1.items():
        if indel == '-':
            continue
        itype, isize, details, muts = tokFullIndel(indel)
        net_isize = isize - details['I'] - details['D']
        if net_isize % 3:
            outframe += reads
        else:
            inframe += reads
        per_type = size_counts[itype]
        per_type[net_isize] = per_type.get(net_isize, 0) + reads
    return inframe, outframe, size_counts
def isAllowableOligoIndel(oligo_indel):
    """Return True when an oligo-level indel/mutation string is acceptable.

    The string is rejected when any of the following holds:
      * a mutation token other than 'N', 'I' or 'D' has a value strictly
        between -20 and 6 (mutation in the guide or PAM sequence);
      * there are more than 5 such tokens;
      * an 'I' or 'D' mutation token has a value greater than 2;
      * the string does not start with '-' and the indel is larger than 2
        or its details satisfy L < 6 and R > -20.
    """
    itype, isize, details, muts = tokFullIndel(oligo_indel)

    # Tokens whose value is compared against the guide/PAM window.
    positional = [mut for mut in muts if mut[0] not in ('N', 'I', 'D')]
    # Insertion/deletion tokens, whose value is compared against a size limit.
    ins_del = [mut for mut in muts if mut[0] in ('I', 'D')]

    in_guide_or_pam = any([-20 < mut[1] < 6 for mut in positional])
    too_many = len(positional) > 5
    oversized_mut = any([mut[1] > 2 for mut in ins_del])

    # Only allow oligo indels if they're size 1 or 2 and lie outside the
    # guide or PAM sequence.
    bad_indel = False
    if oligo_indel[0] != '-':
        bad_indel = isize > 2 or (details['L'] < 6 and details['R'] > -20)

    return not (in_guide_or_pam or too_many or oversized_mut or bad_indel)
def padReadForIndel(read_seq, indel, pam_idx):
    """Pad a read for display and report which positions to highlight.

    Args:
        read_seq: Read sequence string.
        indel: Indel string understood by ``tokFullIndel``.
        pam_idx: PAM index used as the coordinate origin (offset by -3).

    Returns:
        (read_seq, red_idxs, green_idxs). For deletions, ``isize`` spaces are
        inserted after the 'C'-long stretch that follows the left position,
        and that stretch is returned as ``green_idxs``. For insertions the
        read is unchanged; the same stretch is green and the following
        ``isize`` positions are red. Other types return empty sets.
    """
    itype, isize, details, muts = tokFullIndel(indel)
    red_idxs = set()
    green_idxs = set()

    if itype in ('D', 'I'):
        stretch_start = pam_idx - 3 + details['L'] + 1
        stretch_end = stretch_start + details['C']
        green_idxs = set(range(stretch_start, stretch_end))
        if itype == 'D':
            gap_at = pam_idx - 3 + details['L'] + details['C'] + 1
            read_seq = read_seq[:gap_at] + ' ' * isize + read_seq[gap_at:]
        else:
            red_idxs = set(range(stretch_end, stretch_end + isize))

    return read_seq, red_idxs, green_idxs
def readSummaryToProfile(filename, profile, oligoid=None, noexclude=False,
                         remove_long_indels=False, remove_wt=True, wt_thresh=3.0):
    """Accumulate indel read counts from a summary file into ``profile``.

    Args:
        filename: Tab-separated summary file. Lines starting with '@@@'
            switch the current oligo id; other rows are
            (indel, oligo_indel, num_reads, ...).
        profile: Dict indel -> count, updated in place ('-' is always added).
        oligoid: Only rows under the matching '@@@' header are read.
        noexclude: When True, no filtering is applied.
        remove_long_indels: When True, indels with size > 30 are dropped.
        remove_wt: When True, indels also seen in the matching wild-type
            file are only kept if their read fraction exceeds ``wt_thresh``
            times their wild-type fraction.
        wt_thresh: Fold threshold used for the wild-type comparison.

    Returns:
        (accepted, perc_accepted, null_added): accepted read count, accepted
        percentage of all reads for the oligo, and the increase of
        ``profile['-']``. Returns (0, 0, 0) without touching ``profile`` when
        the file is missing or the wild-type sample has under 10% accepted
        reads.
    """
    if not os.path.isfile(filename):
        return 0, 0, 0

    path_parts = filename.split('/')
    dirname = '/'.join(path_parts[:-3])
    filename_suffix = '/'.join(path_parts[-3:])
    wt_p, wt_p_wfilter = {}, {}
    if 'WT' not in dirname and dirname != '' and not noexclude and remove_wt:
        wt_filename = getWTDir(dirname) + '/' + filename_suffix
        if not os.path.isfile(wt_filename):
            print('Warning: Could not find', wt_filename)
        else:
            # Unfiltered WT profile: used for the per-indel WT comparison.
            readSummaryToProfile(wt_filename, wt_p, oligoid=oligoid,
                                 noexclude=True, remove_wt=False)
            # Filtered WT profile: only its acceptance percentage is used.
            _, wt_acc, _ = readSummaryToProfile(wt_filename, wt_p_wfilter,
                                                oligoid=oligoid, noexclude=False,
                                                remove_wt=False)
            # Need at least 10% acceptable reads in the wild type
            # (to remove oligos that are really messed up)
            if wt_acc < 10.0:
                return 0, 0, 0

    total, accepted = 0, 0
    wt_indels = []
    curr_oligo_id = None
    # Context manager so the handle is closed even if a row fails to parse.
    with io.open(filename) as f:
        reader = csv.reader(f, delimiter='\t')
        if '-' not in profile:
            profile['-'] = 0
        orig_null = profile['-']
        for toks in reader:
            if toks[0][:3] == '@@@':
                curr_oligo_id = toks[0][3:]
                continue
            if oligoid != curr_oligo_id:
                continue
            indel = toks[0]
            oligo_indel = toks[1]
            # SECURITY NOTE(review): eval() on file content; kept because the
            # numeric format of this column (int vs float) is not visible
            # here. Replace with int()/float() once the format is confirmed.
            num_reads = eval(toks[2])
            total += num_reads
            if not noexclude:
                if oligo_indel != '-':
                    if not isAllowableOligoIndel(oligo_indel):
                        continue
                # Only allow indels that span the cut site and which are
                # not present in the corresponding WT sample
                if indel != '-':
                    itype, isize, details, muts = tokFullIndel(indel)
                    if itype != '-' and (details['L'] > 5 or details['R'] < -5):
                        continue
                    if remove_long_indels and isize > 30:
                        continue
                    if indel in wt_p and remove_wt:
                        # Check the levels of the indel in the WT sample,
                        # only include it if present at at least wt_thresh x
                        # that level (including NULLS) - will need to wait
                        # til we know total reads to do this
                        wt_indels.append((indel, num_reads))
                        continue
            if indel not in profile:
                profile[indel] = 0
            profile[indel] += num_reads
            accepted += num_reads

    if wt_indels:
        # Hoisted: the WT total does not change between deferred indels.
        wt_total = sum(wt_p.values())
        for indel, num_reads in wt_indels:
            if num_reads * 1.0 / total > wt_p[indel] * wt_thresh / wt_total:
                if indel not in profile:
                    profile[indel] = 0
                profile[indel] += num_reads
                accepted += num_reads

    if total == 0:
        perc_accepted = 0.0
    else:
        perc_accepted = accepted * 100.0 / total
    return accepted, perc_accepted, profile['-'] - orig_null
def plotMicrohomologyMismatches(all_result_outputs, label=''):
    """Scatter-plot read ratios of mismatched vs. unmismatched microhomology deletions.

    Merges the per-sample results, keeps rows with a non-zero microhomology
    size whose guide is flagged as matched, derives read-ratio columns, and
    draws a 2x2 figure (saved as 'mm_mismatch_all') plus a single-panel
    figure for the third comparison (saved as 'mm_mismatch_one').

    Args:
        all_result_outputs: Passed straight to ``mergeSamples``.
        label: Not used in this function.
    """
    mut_hdrs = ['Left Mut', 'Right Mut','Merged Mut1', 'Merged Mut2']
    cols_to_sum = [x + ' Indel Reads in Mut' for x in mut_hdrs] + ['Orig Indel Reads in Orig', 'Mut Non-Null Reads', 'Orig Non-Null Reads']
    common_cols = ['Oligo ID','Mapped Oligo Id','Num Mismatches','Orig MH','Left Mut-MH','Right Mut-MH','Merged Mut 1 MH','Merged Mut 2 MH','Orig Indel','Left Mut-MH Indel','Right Mut-MH Indel','Merge Mut 1 Indel','Merge Mut 2 Indel']
    # The columns read below carry a ' Sum' suffix; presumably mergeSamples
    # adds it to each summed column -- TODO confirm.
    data = mergeSamples(all_result_outputs, cols_to_sum, merge_on=common_cols)

    # Accessors for the parsed details ('L', 'R', 'C') of an indel string.
    getLeft = lambda indel: tokFullIndel(indel)[2]['L']
    getRight = lambda indel: tokFullIndel(indel)[2]['R']
    getMHSize = lambda indel: tokFullIndel(indel)[2]['C']

    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    oligo_data['Guide is matched'] = oligo_data.apply(isMatched, axis=1)

    # Oligo ID -> True when its PAM direction is 'REVERSE'.
    reverse_lookup = {x: y == 'REVERSE' for (x,y) in zip(oligo_data['ID'],oligo_data['PAM Direction'])}
    is_reverse = lambda x: reverse_lookup[x]

    data = pd.merge(data, oligo_data[['ID','Guide is matched']], left_on='Oligo ID', right_on='ID', how='inner')

    data['MH Size'] = data['Orig Indel'].apply(getMHSize)
    data = data.loc[(data['MH Size'] != 0) & (data['Guide is matched'])]
    data['MH Left Loc'] = data['Orig Indel'].apply(getLeft) + data['MH Size']
    data['MH Right Loc'] = data['Orig Indel'].apply(getRight) - data['MH Size']
    data['Is Reverse'] = data['Oligo ID'].apply(is_reverse)

    # For reverse-direction oligos the two headers of each pair swap roles,
    # so each '<hdr> Reads' column takes the other header's summed reads.
    for hdrL,hdrR in [mut_hdrs[:2], mut_hdrs[2:]]:
        data[hdrL + ' Reads'] = data['Is Reverse']*data[hdrR + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrL + ' Indel Reads in Mut Sum']
        data[hdrR + ' Reads'] = data['Is Reverse']*data[hdrL + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrR + ' Indel Reads in Mut Sum']
        data[hdrL + ' Reads Ratio'] = data[hdrL + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
        data[hdrR + ' Reads Ratio'] = data[hdrR + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
    data['Orig Indel Reads Ratio'] = data['Orig Indel Reads in Orig Sum']*100.0/data['Orig Non-Null Reads Sum']
    data['All Mut Reads Ratio'] = (data[[x + ' Reads' for x in mut_hdrs]].sum(axis=1))*100.0/data['Mut Non-Null Reads Sum']
    data['MH Dist'] = data['MH Right Loc'] - data['MH Left Loc']
    data['1st Mismatch'] = data.apply(getMismatch, axis=1)
    data['Last Mismatch'] = data.apply(getLastMismatch, axis=1)
    data['MH GC Content'] = data.apply(getMhGC, axis=1)

    # (x column prefix, y column prefix) for each of the four panels.
    mh_indel_types = [('Orig Indel','Left Mut'), ('Orig Indel','Right Mut'), ('Orig Indel','All Mut'),('Left Mut','Right Mut') ]

    label_lookup = {'Orig Indel': 'Perc. mutated reads of corresponding microhomology-\nmediated deletion with no sequence mismatches',
                    'Left Mut': 'Perc. mutated reads of mismatched microhomology-\nmediated deletion with retained left sequence',
                    'Right Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion with retained right sequence',
                    'All Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion (All)' }

    fig1 = PL.figure(figsize=(4,4))
    fig_all = PL.figure(figsize=(10,10))
    for i, (mh_typex, mh_typey) in enumerate(mh_indel_types):
        # The third comparison (i == 2) is drawn on both figures.
        figs = [(fig_all, True), (fig1,False)] if i==2 else [(fig_all, True)]
        for fig, is_all in figs:
            PL.figure(fig.number)
            if is_all: PL.subplot(2,2,i+1)
            for nm,clr in zip([1,2],['royalblue','orange']):
                nm_data = data.loc[data['Num Mismatches'] == nm]

                sty, lsty = 'o', '-'
                # Only microhomologies of size 6..15 are plotted.
                sel_data = nm_data.loc[(nm_data['MH Size'] >= 6) & (nm_data['MH Size'] <= 15)]

                PL.plot(sel_data[mh_typex + ' Reads Ratio'], sel_data[mh_typey + ' Reads Ratio'], sty, color=clr, markersize=4, label='No. MH Mismatches=%d' % (nm))
                rx, ry, grad = getRegrLine(sel_data[[mh_typex + ' Reads Ratio']], sel_data[[mh_typey + ' Reads Ratio']])
                if not is_all: print(grad, nm, mh_typex, mh_typey)
                # No regression line on the last (Left vs Right) panel.
                if i != 3: PL.plot(rx, ry, lsty, color=clr, linewidth=2)
            PL.xlabel(label_lookup[mh_typex])
            PL.ylabel(label_lookup[mh_typey])
            PL.xlim((0,80))
            PL.ylim((0,80))
            PL.plot([0,80],[0,80],'k--')
            PL.legend()
            PL.show(block=False)
    # The last figure selected in the loop is fig_all, so this call follows
    # it; presumably saveFig saves the current figure -- TODO confirm.
    saveFig('mm_mismatch_all')
    PL.figure(fig1.number)
    saveFig('mm_mismatch_one')
def plotD2(all_result_outputs, label=''):
    """Plot category breakdowns of size-2 deletions that are the most common indel.

    Keeps oligos whose most common indel is identical in all three
    replicates and has 'MCI Type' == 'D2', assigns them to categories based
    on guide nucleotides and the indel's parsed 'L'/'R' details, and saves
    three figures: 'pie_chart_D2_indel_cats', 'D2_nts_per_cat' and
    'D2_nts_per_cat_bars'.

    Args:
        all_result_outputs: Passed straight to ``mergeSamples``.
        label: Not used in this function.

    Raises:
        AssertionError: If any D2 oligo falls into none of the categories.
    """
    #Merge replicates
    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='perOligoMCI')
    mci_merged_data['Equal MCI'] = (
        mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 2']) & (
        mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 3'])

    oligo_data = pd.read_csv(
        getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    remove_under = lambda x: x.replace('_', '')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(remove_under)
    mci_merged_data_guides = pd.merge(mci_merged_data, oligo_data[['Oligo Id', 'Guide']],
                                      how='inner', on='Oligo Id')
    # NOTE(review): the mask comes from the pre-merge frame but indexes the
    # merged frame; this relies on both sharing the same index -- confirm.
    mci_common = mci_merged_data_guides.loc[mci_merged_data['Equal MCI']]
    dmci_data = mci_common.loc[(mci_common['MCI Type'] == 'D2')]  #Note: type check discards equally most common indels

    pie_vals, pie_labels = [], []

    # Category predicates: a guide-nucleotide equality plus bounds on the
    # indel's parsed 'R' and 'L' details.
    is_left_rpt = lambda row: row['Guide'][-5] == row['Guide'][-3] and tokFullIndel(row['Most Common Indel'])[2]['R'] >= 1 and tokFullIndel(row['Most Common Indel'])[2]['L'] <= -3
    is_right_rpt = lambda row: row['Guide'][-4] == row['Guide'][-2] and tokFullIndel(row['Most Common Indel'])[2]['R'] >= 2 and tokFullIndel(row['Most Common Indel'])[2]['L'] <= -2
    is_left_only_rpt = lambda row: is_left_rpt(row) and not is_right_rpt(row)
    is_right_only_rpt = lambda row: is_right_rpt(row) and not is_left_rpt(row)
    is_both_rpt = lambda row: is_right_rpt(row) and is_left_rpt(row)

    lrpt_data = dmci_data.loc[dmci_data.apply(is_left_only_rpt, axis=1)]
    pie_labels.append('Y|XY->Y')
    pie_vals.append(len(lrpt_data))
    rrpt_data = dmci_data.loc[dmci_data.apply(is_right_only_rpt, axis=1)]
    pie_labels.append('XY|X->X')
    pie_vals.append(len(rrpt_data))
    rpt_data = dmci_data.loc[dmci_data.apply(is_both_rpt, axis=1)]
    pie_labels.append('XY|XY->XY')
    pie_vals.append(len(rpt_data))

    is_r0 = lambda row: tokFullIndel(row['Most Common Indel'])[2]['R'] == 0
    ro_data = dmci_data.loc[dmci_data.apply(is_r0, axis=1)]
    pie_labels.append('Z|XY->Z')
    pie_vals.append(len(ro_data))
    is_l1 = lambda row: tokFullIndel(row['Most Common Indel'])[2]['L'] == -1
    l1_data = dmci_data.loc[dmci_data.apply(is_l1, axis=1)]
    pie_labels.append('XY|Z->Z')
    pie_vals.append(len(l1_data))

    # Every D2 oligo must fall into at least one category above.
    seen_ids = set(rpt_data['Oligo Id']).union(set(ro_data['Oligo Id'])).union(
        set(l1_data['Oligo Id'])).union(set(lrpt_data['Oligo Id'])).union(
        set(rrpt_data['Oligo Id']))
    is_unseen = lambda id: id not in seen_ids
    unseen_data = dmci_data.loc[dmci_data['Oligo Id'].apply(is_unseen)]
    print(unseen_data)
    assert (len(unseen_data) == 0)
    #pie_labels.append('Other')
    #pie_vals.append(len(unseen_data))

    #pie_labels = [x for x in dmci_data['Most Common Indel'].unique()]
    #pie_vals = [len(dmci_data.loc[dmci_data['Most Common Indel']==indel]) for indel in pie_labels]
    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1,
           counterclock=False, colors=COLORS)
    PL.title(
        'Size 2 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (len(dmci_data), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('pie_chart_D2_indel_cats')

    # Second figure: per-category pies split by guide nucleotides.
    PL.figure(figsize=(12, 8))

    #XY|XY->XY
    PL.subplot(2, 3, 1)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        # The lambda reads the loop variable, but is applied immediately.
        is_mh_str = lambda guide: guide[-5:-3] == mh_str
        pie_vals.append(len(rpt_data.loc[rpt_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        # Raises ZeroDivisionError when the category is empty.
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1,
           counterclock=False, colors=COLORS)
    PL.title('XY|XY->XY\n(%d gRNAs)' % len(rpt_data))
    PL.show(block=False)

    #__|
    PL.subplot(2, 3, 2)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        is_mh_str = lambda guide: guide[-5:-3] == mh_str
        pie_vals.append(len(ro_data.loc[ro_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1,
           counterclock=False, colors=COLORS)
    PL.title('XY| -> __|\n(%d gRNAs)' % len(ro_data))
    PL.show(block=False)

    #|__
    PL.subplot(2, 3, 3)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        is_mh_str = lambda guide: guide[-3:-1] == mh_str
        pie_vals.append(len(l1_data.loc[l1_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1,
           counterclock=False, colors=COLORS)
    PL.title('|XY -> |__\n(%d gRNAs)' % len(l1_data))
    PL.show(block=False)

    #XY|X->X
    # NOTE(review): this panel uses lrpt_data, which the first pie chart
    # labels 'Y|XY->Y' -- confirm which label is intended.
    PL.subplot(2, 3, 4)
    pie_vals, pie_labels = [], []
    for nt in 'ATGC':
        pie_labels.append('%sN|%s -> %s' % (nt, nt, nt))
        is_mh_str = lambda guide: guide[-5] == nt
        pie_vals.append(len(
            lrpt_data.loc[lrpt_data['Guide'].apply(is_mh_str)]))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1,
           counterclock=False, colors=COLORS)
    PL.title('XY|X->X\n(%d gRNAs)' % len(lrpt_data))
    PL.show(block=False)

    #X|YX->X
    PL.subplot(2, 3, 5)
    pie_vals, pie_labels = [], []
    for nt in 'ATGC':
        pie_labels.append('%s|N%s -> %s' % (nt, nt, nt))
        is_mh_str = lambda guide: guide[-4] == nt
        pie_vals.append(len(
            rrpt_data.loc[rrpt_data['Guide'].apply(is_mh_str)]))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1,
           counterclock=False, colors=COLORS)
    PL.title('X|YX->X\n(%d gRNAs)' % len(rrpt_data))
    PL.show(block=False)
    PL.subplots_adjust(left=0.05, right=0.95, top=0.9, bottom=0.1,
                       hspace=0.3, wspace=0.3)
    saveFig('D2_nts_per_cat')

    # Third figure: for each nucleotide pattern, the percentage of all
    # merged guides with that pattern that fall into the category.
    PL.figure(figsize=(12, 8))

    #XY|XY->XY
    PL.subplot(2, 3, 1)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-5:-3] == dnt and guide[-3:-1] == dnt
        dnt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(
            len(
                set(rpt_data['Oligo Id']).intersection(
                    set(dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 / d2_dnt_counts[-1]
                           if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(
            dnt, dnt_counts[-1] * 100.0 /
            d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt, cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3, hgt + 15, '%d/%d' % (cnt, d2cnt), rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 90))
    PL.xlabel('XY')
    PL.title('XY|XY->XY')
    PL.ylabel(
        'Percent gRNAs with XY|XY->XY deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #__|
    PL.subplot(2, 3, 2)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-5:-3] == dnt
        dnt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(
            len(
                set(ro_data['Oligo Id']).intersection(set(
                    dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 / d2_dnt_counts[-1]
                           if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(
            dnt, dnt_counts[-1] * 100.0 /
            d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt, cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3, hgt + 1.5, '%d/%d' % (cnt, d2cnt), rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 8))
    PL.xlabel('XY')
    PL.title('XY| -> __|')
    PL.ylabel(
        'Percent gRNAs with XY| -> __| deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #|__
    PL.subplot(2, 3, 3)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-3:-1] == dnt
        dnt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(
            len(
                set(l1_data['Oligo Id']).intersection(set(
                    dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 / d2_dnt_counts[-1]
                           if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(
            dnt, dnt_counts[-1] * 100.0 /
            d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt, cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3, hgt + 1.5, '%d/%d' % (cnt, d2cnt), rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 8))
    PL.xlabel('XY')
    PL.title('|XY -> |__')
    PL.ylabel(
        'Percent gRNAs with |XY -> |__ deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #XY|X->X
    PL.subplot(2, 3, 4)
    bar_heights, bar_labels, d2_nt_counts, nt_counts = [], [], [], []
    for nt in 'ATGC':
        has_nt = lambda guide: guide[-3] == nt and guide[-5] == nt
        nt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_nt)]
        nt_counts.append(
            len(
                set(lrpt_data['Oligo Id']).intersection(
                    set(nt_data['Oligo Id']))))
        d2_nt_counts.append(len(nt_data))
        bar_heights.append(nt_counts[-1] * 100.0 /
                           d2_nt_counts[-1] if d2_nt_counts[-1] > 0 else 0)
        bar_labels.append(nt)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt, cnt) in enumerate(zip(bar_heights, d2_nt_counts, nt_counts)):
        PL.text(i - 0.3, hgt + 0.05, '%d/%d' % (cnt, d2cnt))
    PL.xticks(range(len(bar_labels)), bar_labels)
    PL.ylim((0, 5))
    PL.xlabel('X')
    PL.title('XY|X->X')
    PL.ylabel(
        'Percent gRNAs with XY|X->X deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #X|YX->X
    PL.subplot(2, 3, 5)
    bar_heights, bar_labels, d2_nt_counts, nt_counts = [], [], [], []
    for nt in 'ATGC':
        has_nt = lambda guide: guide[-4] == nt and guide[-2] == nt
        nt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_nt)]
        nt_counts.append(
            len(
                set(rrpt_data['Oligo Id']).intersection(
                    set(nt_data['Oligo Id']))))
        d2_nt_counts.append(len(nt_data))
        bar_heights.append(nt_counts[-1] * 100.0 /
                           d2_nt_counts[-1] if d2_nt_counts[-1] > 0 else 0)
        bar_labels.append(nt)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt, cnt) in enumerate(zip(bar_heights, d2_nt_counts, nt_counts)):
        PL.text(i - 0.3, hgt + 0.05, '%d/%d' % (cnt, d2cnt))
    PL.xticks(range(len(bar_labels)), bar_labels)
    PL.ylim((0, 5))
    PL.xlabel('X')
    PL.title('X|YX->X')
    PL.ylabel(
        'Percent gRNAs with X|YX->X deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)
    PL.subplots_adjust(left=0.05, right=0.95, top=0.9, bottom=0.1,
                       hspace=0.3, wspace=0.3)
    saveFig('D2_nts_per_cat_bars')
def plotD1(all_result_outputs, label=''):
    """Plot summaries of size-1 deletions that are the most common indel.

    Produces a pie chart ('pie_chart_D1') splitting consistently most-common
    D1 deletions into removals from a repeated nucleotide spanning the cut
    site versus all others, and a bar chart ('D1_bar_3_rep') giving, per
    repeated-nucleotide pair in the guide (plus 'Other'), the percentage of
    guides whose most common indel is a D1 in all three replicates.

    Args:
        all_result_outputs: Passed straight to ``mergeSamples``.
        label: Not used in this function.
    """
    merged = mergeSamples(all_result_outputs, [], data_label='perOligoMCI')
    same_as_rep2 = merged['Most Common Indel'] == merged['Most Common Indel 2']
    same_as_rep3 = merged['Most Common Indel'] == merged['Most Common Indel 3']
    merged['Equal MCI'] = same_as_rep2 & same_as_rep3
    agreed = merged.loc[merged['Equal MCI']]

    # Note: type check discards equally most common indels
    d1_rows = agreed.loc[(agreed['MCI Type'] == 'D1')]

    def spans_cutsite(indel):
        details = tokFullIndel(indel)[2]
        return details['L'] < -1 and details['R'] > 0

    pie_vals, pie_labels = [], []
    for nt in 'ATGC':
        def is_mh(alt_seq, nt=nt):
            # Altered sequence is at least two copies of this nucleotide.
            return len(alt_seq) >= 2 and alt_seq == (len(alt_seq) * nt)

        repeat_mask = (d1_rows['Altered Sequence'].apply(is_mh)
                       & d1_rows['Most Common Indel'].apply(spans_cutsite))
        num_repeat_nt = len(d1_rows.loc[repeat_mask])
        pie_vals.append(num_repeat_nt * 100.0 / len(d1_rows))
        print(num_repeat_nt)
        pie_labels.append('Removal of %s\nfrom %s|%s' % (nt, nt, nt))

    def is_non_repeat(seq):
        return len(seq) < 2 or seq != (seq[0] * len(seq))

    other_mask = (d1_rows['Altered Sequence'].apply(is_non_repeat)
                  | ~d1_rows['Most Common Indel'].apply(spans_cutsite))
    num_non_repeat = len(d1_rows.loc[other_mask])
    pie_vals.append(num_non_repeat * 100.0 / len(d1_rows))
    print(num_non_repeat)
    pie_labels.append('Removal from non-repeat')

    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1,
           counterclock=False, colors=OLD_COLORS)
    PL.title(
        'Size 1 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (len(d1_rows), len(merged)))
    PL.show(block=False)
    saveFig('pie_chart_D1')

    guide_table = pd.read_csv(
        getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    guide_table['Oligo Id'] = guide_table['ID'].apply(
        lambda oligo: oligo.replace('_', ''))
    with_guides = pd.merge(merged, guide_table[['Oligo Id', 'Guide']],
                           how='inner', on='Oligo Id')
    print(len(with_guides))

    def is_d1(indel):
        return indel.split('_')[0] == 'D1'

    nts = 'ATGC'
    bar_percs, bar_texts = [], []
    for nt in nts:
        def double_nt(row, pair=nt + nt):
            return row['Guide'][-4:-2] == pair

        dbl_rows = with_guides.loc[with_guides.apply(double_nt, axis=1)]
        # Oligo28137: Corner case where a guide has CT|T and loses the C
        num_dbl_d1 = sum(dbl_rows['Most Common Indel'].apply(is_d1)
                         & dbl_rows['Equal MCI']
                         & (dbl_rows['Oligo Id'] != 'Oligo28137'))
        bar_percs.append(num_dbl_d1 * 100.0 / len(dbl_rows))
        bar_texts.append('%d/%d' % (num_dbl_d1, len(dbl_rows)))
        print(len(dbl_rows))

    def non_dbl_nt(row):
        return row['Guide'][-4] != row['Guide'][-3]

    other_rows = with_guides.loc[with_guides.apply(non_dbl_nt, axis=1)]
    print(len(other_rows))
    num_other_d1 = sum(other_rows['Most Common Indel'].apply(is_d1)
                       & other_rows['Equal MCI'])
    bar_percs.append(num_other_d1 * 100.0 / len(other_rows))
    bar_texts.append('%d/%d' % (num_other_d1, len(other_rows)))

    PL.figure()
    PL.bar(range(5), bar_percs, width=0.8)
    for pos, (perc, text) in enumerate(zip(bar_percs, bar_texts)):
        PL.text(pos - 0.3, perc + 5.0, text)
    PL.xticks(range(5), [nt * 2 for nt in nts] + ['Other'])
    PL.ylim((0, 40))
    PL.xlabel('Nucleotides on either side of cut site')
    PL.ylabel(
        'Percent gRNAs with single nucleotide deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)
    saveFig('D1_bar_3_rep')
def plotProfiles(profiles, rep_reads, pam_idxs, reverses, labels, title='', max_lines=10):
    """Draw aligned representative reads with per-profile percentage bars.

    Shows the union of each profile's top 20 indels, ordered by decreasing
    mean percentage across profiles, one line per indel. Each line carries
    the representative read (where one exists) and a horizontal bar per
    profile.

    Args:
        profiles: List of indel profiles, each passed to ``getProfileCounts``.
        rep_reads: Per profile, a mapping indel -> representative read.
        pam_idxs: Per profile, the PAM index within its reads.
        reverses: Per profile, whether reads are reverse-complemented first.
        labels: Per profile, the legend label (read count is appended).
        title: Title text; also used to build the saved figure name.
        max_lines: Divided by the number of profiles to limit indels shown.

    Returns:
        The matplotlib figure.

    Raises:
        Exception: If ``profiles`` is empty.
    """
    if len(profiles) == 0: raise Exception('Empty list of profiles')

    colors = [
        FORECAST_GREEN, 'C0', 'C2', 'C2', 'C1', 'C1', 'C3', 'C3', 'C4', 'C4',
        'C5', 'C5', 'C6'
    ]

    PL.rcParams['svg.fonttype'] = 'none'
    ocounts = [getProfileCounts(p1) for p1 in profiles]
    # Same entries keyed by indel; each entry is (cnt, indel, perc1a, perc1b).
    counts = [{
        indel: (cnt, indel, perc1a, perc1b)
        for (cnt, indel, perc1a, perc1b) in x
    } for x in ocounts]

    #Count total non-null reads for each sample (to report in labels)
    nonnull_reads = [
        sum([x[indel][0] for indel in x if indel != '-']) for x in counts
    ]
    labels = [
        '%s(%d Reads)' % (tit, nn) for (tit, nn) in zip(labels, nonnull_reads)
    ]

    #Fetch the indels to display as union of top N indels across profiles
    num_top = 20
    top_indels = [[y[1] for y in x[:num_top]] for x in ocounts]
    union_top_indels = set()
    for x in top_indels: union_top_indels = union_top_indels.union(set(x))

    # Give every profile an entry (zero counts) for every displayed indel.
    for indel in union_top_indels:
        for count in counts:
            if indel not in count:
                count[indel] = (0, indel, 0.0, 0.0)
    union_top_indels = [x for x in union_top_indels]
    indel_toks = [tokFullIndel(indel) for indel in union_top_indels]
    # max_insert is computed but not used below.
    max_insert = max([0] + [toks[1] for toks in indel_toks if toks[0] == 'I'])

    #Order indels by decreasing average percentage across profiles
    top_av_percs = [(np.mean([x[indel][-1] for x in counts]), indel)
                    for indel in union_top_indels]
    top_av_percs.sort(reverse=True)
    # True division: a float under Python 3.
    max_indels = max_lines / len(profiles)

    #Figure out Trims
    # Null read per profile: the '-' entry if present, otherwise the read of
    # the first indel in count order that has a representative read.
    null_reads = [
        x['-'] if '-' in x else [x[y[1]] for y in ocnt if y[1] in x][0]
        for x, ocnt in zip(rep_reads, ocounts)
    ]
    null_reads = [
        Bio.Seq.reverse_complement(x) if rev else x
        for x, rev in zip(null_reads, reverses)
    ]
    pam_idxs = [
        len(x) - pam if rev else pam
        for x, pam, rev in zip(null_reads, pam_idxs, reverses)
    ]
    # Shortest null read (ties broken by PAM index) sets the common window.
    min_null, pam_idx = min([(len(null), pidx)
                             for (null, pidx) in zip(null_reads, pam_idxs)])
    Ls = [x - pam_idx for x in pam_idxs]
    Rs = [L + min_null - len(null) for (L, null) in zip(Ls, null_reads)]

    #Plot
    # NOTE(review): reads index [1][3] of every profile's counts, i.e. the
    # second entry's last field; fails for single-entry profiles -- confirm.
    scale_factor = 10.0 / max([x[1][3] for x in ocounts])
    fig = PL.figure(figsize=(9, 5 * len(labels)))
    fig.patch.set_visible(False)
    ax = PL.gca()
    ax.axis('off')
    N = min(len(union_top_indels), max_indels)
    line_height = 0.8
    min_xloc, max_xloc = MIN_X, MAX_X
    PL.ylim((0, (N + 1.0) * line_height))
    bar_ypos, bar_len = [[] for x in profiles], [[] for x in profiles]
    for i, (av_perc, indel) in enumerate(top_av_percs):
        # NOTE(review): '>' lets index i == max_indels through, one more line
        # than max_indels -- confirm this is intended.
        if i > max_indels: break
        # NOTE(review): 'repr' shadows the builtin inside this loop.
        for repr, cnts, rev, L1, R1, j in zip(rep_reads, counts, reverses, Ls,
                                              Rs, range(len(Rs))):
            (cnt1, indel1, perc1a, perc1b) = cnts[indel]
            if indel in repr:
                # A right trim of 0 means "no trim": slice to the read end.
                if R1 == 0: R1 = len(repr[indel])
                seq = Bio.Seq.reverse_complement(
                    repr[indel])[L1:R1] if rev else repr[indel][L1:R1]
                padded_seq, red_idxs, green_idxs = padReadForIndel(
                    seq, indel, pam_idx)
                min_xloc, max_xloc = plotSeqLetterwise(
                    padded_seq,
                    (N - i + (j + 0.3) * 1.0 / len(profiles)) * line_height,
                    pam_idx,
                    red_idxs=red_idxs,
                    green_idxs=green_idxs)
            # No bar is drawn for the null ('-') line.
            if indel != '-':
                bar_ypos[j].append(
                    (N - i + (j + 0.4) * 1.0 / len(profiles)) * line_height)
                bar_len[j].append(perc1b * scale_factor)
    hist_loc = max_xloc + 10
    for bar1_ypos, bar1_len, label1, clr in zip(bar_ypos, bar_len, labels,
                                                colors):
        PL.barh(bar1_ypos,
                bar1_len,
                height=0.8 * line_height / len(profiles),
                left=hist_loc,
                label=label1,
                color=clr)
        for (ypos, blen) in zip(bar1_ypos, bar1_len):
            # Undo the scaling so the text shows the original percentage.
            PL.text(hist_loc + blen + 1,
                    ypos - 0.5 / len(profiles) * line_height,
                    '%.1f%%' % (blen / scale_factor))
    xlims = (min_xloc - 10, MAX_X + 20 + (min_xloc - MIN_X))
    PL.xlim(xlims)
    for i, (av_perc, indel) in enumerate(top_av_percs):
        if i > max_indels: break
        if indel == '-':
            PL.text(xlims[0], (N - i + 0.4) * line_height,
                    'Target:',
                    fontweight='bold')
        else:
            PL.text(xlims[0], (N - i + 0.4) * line_height,
                    indel.split('_')[0],
                    fontweight='bold')
        PL.plot([min_xloc - 10, max_xloc + 10],
                [(N - i) * line_height, (N - i) * line_height], 'lightgrey')
    PL.plot([0, 0], [0, (N + 1) * line_height], 'k--')
    PL.plot([min_xloc - 10, hist_loc], [N * line_height, N * line_height], 'k')
    PL.plot([hist_loc, hist_loc], [0, N * line_height], 'k')
    PL.xticks([])
    PL.yticks([])
    if len(labels) > 1: PL.legend(loc='upper right')
    PL.text(hist_loc, (N + 0.5) * line_height, title, fontweight='bold')
    PL.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
    PL.show(block=False)
    PL.axis('off')
    saveFig('%s_%d' % (title.replace(' ', '_'), len(labels)), bbox=False)
    return fig