def computePieDataWithAmbig(data, label='', norm='I1 Total'):
    """Pie chart of single-nucleotide insertion (I1) read categories for one sample.

    Splits I1 reads into repeated-left / ambiguous / repeated-right / non-repeated
    wedges, each shown as the mean (over oligos) percentage of the `norm` column.
    Returns (per-category mean percentages, category column names, median 'Total reads').
    """
    indel_data = mergeWithIndelData(data)
    category_cols = [
        'I1_Rpt Left Reads - NonAmb',
        'Ambiguous Rpt Reads',
        'I1_Rpt Right Reads - NonAmb',
        'I1_NonRpt Reads',
    ]
    wedge_names = [
        'Repeated\nleft nucleotide',
        'Ambiguous\n(Left = Right)',
        'Repeated\nright nucleotide',
        'Non-repeated\nnucleotide',
    ]
    pie_data = {}
    for col in category_cols:
        # Mean across oligos of the per-oligo percentage of `norm` reads.
        pie_data[col] = (indel_data[col] * 100.0 / indel_data[norm]).mean(axis=0)
    PL.figure(figsize=(3, 3))
    PL.pie([pie_data[col] for col in category_cols], labels=wedge_names,
           autopct='%.1f', labeldistance=1.05, startangle=120.0,
           counterclock=False)
    PL.title('Single nucleotide insertions (I1)')
    PL.show(block=False)
    saveFig('ambig_pie_%s' % label)
    return pie_data, category_cols, data['Total reads'].median()
def plotSumPie(all_result_outputs, label=''):
    """Pie chart of the average per-gRNA indel category distribution across replicates.

    Also dumps the per-oligo percentages to 'data_dump_indel_pie.txt'.
    """
    display_names = {
        'Large D, No MH': 'D>=4,\nno MH',
        'Large D, MH': 'D>=4,\nMH',
        'Small D, No MH': 'D<4, no MH',
        'Small D, MH': 'D<4, MH',
    }
    merged = mergeSamples(all_result_outputs, ['Total reads'] + ALL_LABELS,
                          data_label='perOligoCounts')
    perc_cols = []
    for cat in ALL_LABELS:
        # Per-oligo percentage of replicate-summed reads in this category.
        merged[cat + ' Perc'] = merged[cat + ' Sum'] * 100.0 / merged['Total reads Sum']
        perc_cols.append(cat + ' Perc')
    merged.to_csv('data_dump_indel_pie.txt', sep='\t',
                  columns=['Oligo Id'] + perc_cols)
    wedge_sizes = [merged[col].mean() for col in perc_cols]
    wedge_names = [display_names.get(cat, cat) for cat in ALL_LABELS]
    PL.figure(figsize=(4, 4))
    PL.pie(wedge_sizes, labels=wedge_names, autopct='%.1f', labeldistance=1.05,
           startangle=90.0, counterclock=False, colors=COLORS)
    PL.title('Average distribution\n of mutations\n per gRNA')
    PL.show(block=False)
    saveFig('pie_chart_cats')
def plotDominantBars(all_result_outputs, label=''):
    """Bar chart: percent of gRNAs whose most common indel is an I1 insertion in
    all three replicates, grouped by the nucleotide immediately left of the cut site.

    Fixes: removed an unused local list of I1 category column names, and hoisted
    the per-nucleotide dominant count (previously computed twice per iteration).
    """
    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='i1IndelData')
    # A gRNA qualifies only if all three replicates agree on the most common indel.
    mci_merged_data['Equal MCI'] = (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 2']) & \
                                   (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 3'])
    mci_merged_data['Is Dominant I1'] = mci_merged_data['Equal MCI'] & (mci_merged_data['MCI Type'] == 'I1')
    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    remove_under = lambda x: x.replace('_', '')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(remove_under)
    merged_mci_data = pd.merge(mci_merged_data, oligo_data[['Oligo Id', 'Guide']], how='inner', on='Oligo Id')
    nt_perc_i1, cnt_labels = [], []
    nts = 'ATGC'
    for nt in nts:
        # Guide position -4 is the nucleotide immediately left of the cut site
        # (bind nt as a default to avoid the late-binding closure pitfall).
        is_nt = lambda guide, nt=nt: guide[-4] == nt
        nt_data = merged_mci_data.loc[merged_mci_data['Guide'].apply(is_nt)]
        num_dominant = sum(nt_data['Is Dominant I1'])  # hoisted: was computed twice
        nt_perc_i1.append(num_dominant * 100.0 / len(nt_data))
        cnt_labels.append('%d/%d' % (num_dominant, len(nt_data)))
    PL.figure()
    PL.bar(range(4), nt_perc_i1, width=0.8)
    for i, cnt in enumerate(cnt_labels):
        PL.text(i - 0.3, nt_perc_i1[i] + 5.0, cnt)
    PL.xticks(range(4), [x for x in nts])
    PL.xlabel('Nucleotide on Left of cut-site')
    PL.ylabel('Percent gRNAs with single nucleotide insertion\nas most common indel in all 3 replicates')
    PL.show(block=False)
    saveFig('I1_bar_3_rep')
def plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids, pred_results_dir):
    """Scatter of percent in-frame mutations: synthetic between-library replicates
    vs synthetic-vs-endogenous measurements, labelling points that disagree by >25."""
    PL.figure(figsize=(4.2, 4.2))
    summaries = pd.read_csv(pred_results_dir + '/old_new_kl_predicted_summaries.txt', sep='\t').fillna(-1.0)
    col_x, col_y = 'New 2x800x In Frame Perc', 'New 1600x In Frame Perc'
    lib_x, lib_y = summaries[col_x], summaries[col_y]
    PL.plot(lib_x, lib_y, '.',
            label='Synthetic between library (R=%.2f)' % pearsonr(lib_x, lib_y)[0],
            color='C0', alpha=0.15)
    PL.plot(overbeek_inframes, ours_inframes, '^',
            label='Synthetic vs Endogenous (R=%.2f)' % pearsonr(overbeek_inframes, ours_inframes)[0],
            color='C1')
    # Label only the endogenous points with a large measured/synthetic discrepancy.
    for ob_val, our_val, site_id in zip(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids):
        if abs(ob_val - our_val) <= 25.0:
            continue
        PL.text(ob_val, our_val, site_id)
    PL.plot([0, 100], [0, 100], 'k--')
    PL.ylabel('Percent In-Frame Mutations')
    PL.xlabel('Percent In-Frame Mutations')
    PL.legend()
    PL.xticks([], [])
    PL.yticks([], [])
    PL.show(block=False)
    saveFig('in_frame_full_scatter')
def plotInFrameCorr(data):
    """Measured vs predicted percent in-frame mutations, overlaying the Shi et al.
    deep-seq frame-shift points and labelling those over-measured by >10.

    NOTE(review): this module defines plotInFrameCorr twice; only the later
    definition is bound after import.
    """
    shi_data = pd.read_csv(getHighDataDir() + '/shi_deepseq_frame_shifts.txt', sep='\t')
    col_meas, col_pred = 'New In Frame Perc', 'Predicted In Frame Per'
    PL.figure(figsize=(4, 4))
    measured, predicted = data[col_meas], data[col_pred]
    PL.plot(measured, predicted, '.', alpha=0.15)
    PL.plot(shi_data['Measured Frame Shift'], shi_data['Predicted Frame Shift'],
            '^', color='orange')
    shi_points = zip(shi_data['Measured Frame Shift'],
                     shi_data['Predicted Frame Shift'], shi_data['ID'])
    for meas, pred, shi_id in shi_points:
        # Annotate Shi points where measurement exceeds prediction by >10 points.
        if meas - pred > 10:
            PL.text(meas, pred, shi_id.split('/')[1][:-21])
    PL.plot([0, 100], [0, 100], 'k--')
    PL.title('R=%.3f' % (pearsonr(measured, predicted)[0]))
    PL.xlabel('percent in frame mutations (measured)')
    PL.ylabel('percent in frame mutations (predicted)')
    PL.ylim((0, 80))
    PL.xlim((0, 80))
    PL.show(block=False)
    saveFig('in_frame_corr_%s_%s' % (col_meas.replace(' ', '_'), col_pred.replace(' ', '_')))
def plotMCIPie(all_result_outputs, label=''):
    """Pie of the most-frequent-mutation category per gRNA, counting a gRNA only
    when all three replicates agree; disagreeing gRNAs get their own wedge."""
    merged = mergeSamples(all_result_outputs, ['MCI Type', 'Most Common Indel'],
                          data_label='perOligoMCI')
    consistent = merged.loc[(merged['Most Common Indel'] == merged['Most Common Indel 2'])
                            & (merged['Most Common Indel'] == merged['Most Common Indel 3'])]
    wedge_sizes, wedge_names = [], []
    for mci_type in ALL_LABELS:
        wedge_sizes.append(len(consistent.loc[consistent['MCI Type'] == mci_type]))
        wedge_names.append(mci_type)
    # Everything not in `consistent` had replicates disagreeing on the top indel.
    wedge_sizes.append(len(merged) - len(consistent))
    wedge_names.append('Inconsistent\nbetween\nreplicates')
    PL.figure(figsize=(4, 4))
    PL.pie(wedge_sizes, labels=wedge_names, autopct='%.1f', labeldistance=1.05,
           startangle=90.0, counterclock=False, colors=COLORS)
    PL.title('Most frequent\nmutation per gRNA')
    PL.show(block=False)
    saveFig('pie_chart_cats_dominant')
def plotKLBoxes(data):
    """Box plots of the KL-divergence columns (excluding Class/Old/Conventional/
    Combined comparisons), each annotated with its median value."""
    # Keep only plain sample-vs-sample KL columns.
    cols = [x for x in data.columns
            if 'KL' in x and 'Class KL' not in x and 'Old' not in x
            and 'Conventional' not in x and 'Combined' not in x]
    cols.reverse()
    cols_label, max_kl = 'KL', 9  # NOTE(review): max_kl is unused in this function
    PL.figure(figsize=(4, 5))
    # NOTE(review): debug leftover — prints oligo ids falling in a narrow KL band
    # on both comparison axes; has no effect on the plot.
    pt = data.loc[(data['Combined v Predicted KL'] > 0.75)
                  & (data['Combined v Predicted KL'] < 0.8)
                  & (data['Old v New KL'] > 0.75)
                  & (data['Old v New KL'] < 0.8)]
    print(pt['Old Oligo Id'])
    PL.boxplot([data[col] for col in cols], positions=range(len(cols)),
               patch_artist=True, boxprops=dict(facecolor='C2'),
               medianprops=dict(linewidth=2.5, color='C1'), showfliers=False)
    PL.xticks(range(len(cols)), [renameCol(x) for x in cols], rotation='vertical')
    for i, col in enumerate(cols):
        # Write each column's median just above its box.
        PL.text(i - 0.15, np.median(data[col]) + 0.02, '%.2f' % np.median(data[col]))
    PL.ylabel(cols_label)
    PL.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.5)
    PL.show(block=False)
    saveFig('kl_compare_old_new_predicted_%s' % cols_label.replace(' ', ''))
def plotMergedPieDataWithAmbig(all_result_outputs, label='', norm='I1 Total'):
    """Replicate-merged variant of the I1 ambiguity pie: wedge percentages come
    from read counts summed across samples (the '<col> Sum' columns)."""
    category_cols = ['I1_Rpt Left Reads - NonAmb', 'Ambiguous Rpt Reads',
                     'I1_Rpt Right Reads - NonAmb', 'I1_NonRpt Reads']
    merged = mergeSamples(all_result_outputs, category_cols + [norm],
                          data_label='i1IndelData', merge_on=['Oligo Id'])
    wedge_names = ['Repeated\nleft nucleotide', 'Ambiguous\n(Left = Right)',
                   'Repeated\nright nucleotide', 'Non-repeated\nnucleotide']
    wedge_percs = []
    for col in category_cols:
        # Mean across oligos of the replicate-summed percentage of `norm` reads.
        wedge_percs.append((merged[col + ' Sum'] * 100.0 / merged[norm + ' Sum']).mean(axis=0))
    PL.figure(figsize=(3, 3))
    PL.pie(wedge_percs, labels=wedge_names, autopct='%.1f', labeldistance=1.05,
           startangle=120.0, counterclock=False)
    PL.title('Single nucleotide insertions (I1)')
    PL.show(block=False)
    saveFig('ambig_pie')
def plotInFrameCorr(data):
    """Scatter of 'Combined in Frame Perc' vs 'Predicted In Frame Per' with the
    Pearson R in the title (redefines the earlier plotInFrameCorr in this module)."""
    for col_x, col_y in [('Combined in Frame Perc', 'Predicted In Frame Per')]:
        PL.figure(figsize=(4, 4))
        measured, predicted = data[col_x], data[col_y]
        PL.plot(measured, predicted, '.')
        PL.plot([0, 100], [0, 100], 'k--')  # identity line for reference
        PL.title('R=%.3f' % (pearsonr(measured, predicted)[0]))
        PL.xlabel(renameCol(col_x))
        PL.ylabel(renameCol(col_y))
        PL.show(block=False)
        saveFig('in_frame_corr_%s_%s' % (col_x.replace(' ', '_'), col_y.replace(' ', '_')))
def compareMHlines(all_result_outputs, label='', y_axis='Percent Non-Null Reads', data_label='RegrLines'):
    """Overlay per-sample MH-deletion regression lines (MH length 9) for all
    samples, coloured per cell line and dash-styled per sample number."""
    # Colour per cell line; line style keyed by the numeric part of the sample name
    # (as returned by parseSampleName).
    color_map = {'K562': 'b', 'K562_1600x': 'lightblue', 'BOB': 'g', 'RPE1': 'purple',
                 'TREX2': 'k', 'TREX2_2A': 'gray', 'HAP1': 'r', 'E14TG2A': 'orange',
                 'eCAS9': 'c', 'WT': 'pink', 'CHO': 'salmon'}
    lysty_map = {2: '--', 3: '--', 5: '--', 7: '-', 10: '-.', 16: ':', 20: ':'}
    dirnames = [x[1] for x in all_result_outputs]
    lystys = [lysty_map[parseSampleName(x)[1]] for x in dirnames]
    clrs = [color_map[parseSampleName(x)[0]] for x in dirnames]
    for mh_len in [9]:
        PL.figure()
        # Each entry is this sample's precomputed (x_pts, y_pts, R) for mh_len.
        regr_lines = [x[0][data_label][mh_len] for x in all_result_outputs]
        for dirname, regr_line, lysty, clr in zip(dirnames, regr_lines, lystys, clrs):
            PL.plot(regr_line[0], regr_line[1],
                    label='%s (R=%.1f)' % (getSimpleName(dirname), regr_line[2]),
                    linewidth=2, color=clr, linestyle=lysty, alpha=0.5)
        PL.xlabel('Distance between nearest ends of microhomologous sequences', fontsize=14)
        PL.ylabel('Correspondng microhomology-mediated deletion\n as percent of total mutated reads', fontsize=14)
        PL.tick_params(labelsize=16)
        PL.legend(loc='upper right')
        PL.ylim((0, 70))
        PL.xlim((0, 20))
        PL.xticks(range(0, 21, 5))
        PL.title('Microhomology Length %d' % mh_len, fontsize=18)
        PL.show(block=False)
        saveFig('mh_regr_lines_all_samples__%d' % mh_len)
def plotDominantPieDataWithAmbig(all_result_outputs, label=''):
    """Pie of repeat-ambiguity categories among gRNAs whose most common indel is
    an I1 insertion in all three replicates.

    Fix: the category loop used `label` as its loop variable, silently shadowing
    the `label` parameter; renamed to `cat_col` (also bound into the lambda as a
    default to avoid late-binding surprises).
    """
    pie_label_cols = ['I1_Rpt Left Reads - NonAmb', 'Ambiguous Rpt Reads',
                      'I1_Rpt Right Reads - NonAmb', 'I1_NonRpt Reads']
    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='i1IndelData')
    # Require all three replicates to agree on the most common indel.
    mci_merged_data['Equal MCI'] = (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 2']) & \
                                   (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 3'])
    mci_common_i1 = mci_merged_data.loc[mci_merged_data['Equal MCI'] & (mci_merged_data['MCI Type'] == 'I1')]
    labels = ['Repeated\nleft nucleotide', 'Ambiguous\n(Left = Right)',
              'Repeated\nright nucleotide', 'Non-repeated\nnucleotide']
    pie_data = []
    for cat_col in pie_label_cols:
        # A gRNA falls in this category when its MCI read count equals the
        # category's read count for that oligo.
        is_rpt = lambda row, cat_col=cat_col: row['MCI Reads'] == row[cat_col]
        pie_data.append(sum(mci_common_i1.apply(is_rpt, axis=1)) * 100.0 / len(mci_common_i1))
    PL.figure(figsize=(3, 3))
    PL.pie(pie_data, labels=labels, autopct='%.1f', labeldistance=1.05,
           startangle=180.0, counterclock=False)
    PL.title('Dominant single nucleotide insertions (I1)\n%d from %d gRNAs' % (len(mci_common_i1), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('I1_dom_pie_3_rep')
def plotPercScatterAnalysis(data, label='test', y_axis = 'Percent Non-Null Reads', plot_scatters=False, plot_regr_lines=False, scatter_mh_lens=[], mh_lens=[9]):
    """Fit (and optionally plot) a linear regression of MH-deletion read
    percentage against MH distance, one fit per requested microhomology length.

    Returns {mh_len: (x_pts, y_pts, pearson_r)} describing each two-point
    regression line.
    NOTE(review): the mutable default arguments (scatter_mh_lens=[], mh_lens=[9])
    are never mutated here, so they are harmless, but None-defaults would be safer.
    """
    plot_dir = getPlotDir()
    regr_lines = {}
    for mh_len in mh_lens:
        mh_data = data.loc[data['MH Len'] == mh_len]
        # Restrict the fit to distances in [0, 30 - mh_len).
        mh_rdata = mh_data.loc[(mh_data['MH Dist'] >= 0) & (mh_data['MH Dist'] < (30-mh_len))]
        regr = linear_model.LinearRegression()
        rx, ry = mh_rdata[['MH Dist']], mh_rdata[[y_axis]] #np.log(mh_rdata[[y_axis]])
        regr.fit(rx, ry)
        corr = scipy.stats.pearsonr(rx, ry)
        min_x, max_x = rx.min()[0], rx.max()[0]
        x_pts = [min_x, max_x]
        # Two-point regression line plus the correlation coefficient.
        # NOTE(review): regr.predict(x) with a scalar relies on old scikit-learn
        # behaviour; newer versions require a 2-D array — confirm pinned version.
        regr_lines[mh_len] = (x_pts, [regr.predict(x)[0] for x in x_pts], corr[0])
        if plot_scatters and mh_len in scatter_mh_lens:
            fig = PL.figure(figsize=(5,5))
            PL.plot(mh_data['MH Dist'], mh_data[y_axis], '.', alpha=0.4)
            PL.plot(regr_lines[mh_len][0], regr_lines[mh_len][1], 'dodgerblue', linewidth=3)
            PL.xlabel('Distance between nearest ends of\nmicrohomologous sequences', fontsize=14)
            PL.ylabel('Percent of mutated reads of corresponding\nMH-mediated deletion', fontsize=14)
            PL.tick_params(labelsize=14)
            PL.xlim((0,20))
            PL.title('Microhomology of length %d (r=%.2f)' % (mh_len, corr[0]), fontsize=14)
            PL.show(block=False)
            saveFig('mh_scatter_len%d_%s' % (mh_len, label.split('/')[-1]))
    if plot_regr_lines:
        fig = PL.figure()
        output_data = {}  # NOTE(review): unused local
        for mh_len in mh_lens:
            fit_data = regr_lines[mh_len]
            if mh_len > 15: continue
            # Dashed lines for short MH (<9), solid for longer.
            lsty = '--' if mh_len < 9 else '-'
            PL.plot(fit_data[0], fit_data[1], linewidth=2, linestyle=lsty, label='MH length %d (R=%.1f)' % (mh_len, fit_data[2]))
        PL.title(label, fontsize=18)
        PL.xlabel('Distance between nearest ends of\nmicrohomologous sequences', fontsize=14)
        PL.ylabel('Percent of mutated reads of corresponding\nMH-mediated deletion', fontsize=14)
        PL.tick_params(labelsize=18)
        PL.legend()
        PL.ylim((0,100))
        PL.show(block=False)
        saveFig(plot_dir + '/mh_scatter_all_len_%s' % label.split('/')[-1])
    return regr_lines
def plotMergedI1Repeats(all_result_outputs, label=''):
    """Replicate-merged bar chart of I1 repeated-left-nucleotide reads as a
    percent of total mutated reads, split by the PAM-distal cut-site nucleotide."""
    merged = mergeSamples(all_result_outputs,
                          ['I1_Rpt Left Reads - NonAmb', 'Total reads'],
                          data_label='i1IndelData',
                          merge_on=['Oligo Id', 'Repeat Nucleotide Left'])
    nts = ['A', 'T', 'G', 'C']
    mean_percs = []
    for nt in nts:
        subset = merged.loc[merged['Repeat Nucleotide Left'] == nt]
        # Mean per-oligo percentage using replicate-summed read counts.
        mean_percs.append((subset['I1_Rpt Left Reads - NonAmb Sum'] * 100.0 / subset['Total reads Sum']).mean())
    PL.figure(figsize=(3, 3))
    PL.bar(range(4), mean_percs)
    for idx, perc in enumerate(mean_percs):
        PL.text(idx - 0.25, perc + 0.8, '%.1f' % perc)
    PL.xticks(range(4), nts)
    PL.ylim((0, 26))
    PL.xlabel('PAM distal nucleotide\nadjacent to the cut site')
    PL.ylabel('I1 repeated left nucleotide\nas percent of total mutated reads')
    PL.show(block=False)
    saveFig('i1_rtp_nt')
def i1RepeatNucleotides(data, label=''):
    """Single-sample bar chart of I1 repeated-left-nucleotide reads as a percent
    of total mutated reads, split by the PAM-distal nucleotide at the cut site."""
    merged = mergeWithIndelData(data)
    nts = ['A', 'T', 'G', 'C']
    # Per-nucleotide mean of the per-oligo repeat-read percentage.
    per_nt = {nt: merged.loc[merged['Repeat Nucleotide Left'] == nt] for nt in nts}
    nt_mean_percs = [
        (per_nt[nt]['I1_Rpt Left Reads - NonAmb'] * 100.0 / per_nt[nt]['Total reads']).mean()
        for nt in nts
    ]
    PL.figure(figsize=(3, 3))
    PL.bar(range(4), nt_mean_percs)
    for i in range(4):
        PL.text(i - 0.25, nt_mean_percs[i] + 0.8, '%.1f' % nt_mean_percs[i])
    PL.xticks(range(4), nts)
    PL.ylim((0, 26))
    PL.xlabel('PAM distal nucleotide\nadjacent to the cut site')
    PL.ylabel('I1 repeated left nucleotide\nas percent of total mutated reads')
    PL.show(block=False)
    saveFig('i1_rtp_nt_%s' % label)
def plotHeatMap(data, col='KL without null', label=''):
    """Heatmap of median pairwise KL divergence between samples, ordered by a
    fixed cell-line order.

    Fix: removed a dead assignment — `samples` was first set via
    sortSampleNames(...) and then immediately overwritten by the
    cell-line-ordered list, so the first computation had no effect.
    """
    # Compute and collate medians over all columns matching `col`.
    sel_cols = [x for x in data.columns if col in x]
    cmp_meds = data[sel_cols].median(axis=0)
    cell_lines = ['CHO', 'E14TG2A', 'BOB', 'RPE1', 'HAP1', 'K562', 'eCAS9', 'TREX2']
    # Order samples by the fixed cell-line order above.
    sample_idxs = [(cell_lines.index(parseSampleName(x)[0]), x) for x in getUniqueSamples(sel_cols)]
    sample_idxs.sort()
    samples = [x[1] for x in sample_idxs]
    N = len(samples)
    meds = np.zeros((N, N))
    for colname in sel_cols:
        dir1, dir2 = getDirsFromFilename(colname.split('$')[-1])
        idx1, idx2 = samples.index(dir1), samples.index(dir2)
        # Comparisons are symmetric, so fill both triangles of the matrix.
        meds[idx1, idx2] = cmp_meds[colname]
        meds[idx2, idx1] = cmp_meds[colname]
    for i in range(N):
        print(' '.join(['%.2f' % x for x in meds[i, :]]))
    print(np.median(meds[:, :-4], axis=0))
    # Display in Heatmap
    PL.figure(figsize=(5, 5))
    PL.imshow(meds, cmap='hot_r', vmin=0.0, vmax=3.0, interpolation='nearest')
    PL.colorbar()
    PL.xticks(range(N))
    PL.yticks(range(N))
    PL.title("Median KL")  # between %d mutational profiles (for %s with >%d mutated reads)" % (col, len(data), label, MIN_READS)
    ax1 = PL.gca()
    ax1.set_yticklabels([getSimpleName(x) for x in samples], rotation='horizontal')
    ax1.set_xticklabels([getSimpleName(x) for x in samples], rotation='vertical')
    PL.subplots_adjust(left=0.25, right=0.95, top=0.95, bottom=0.25)
    PL.show(block=False)
    saveFig('median_kl_heatmap_cell_lines')
def compareMHK562lines(all_result_outputs, label='', y_axis = 'Percent Non-Null Reads', data_label='RegrLines'):
    """Plot the across-sample mean MH-deletion regression line for each
    microhomology length 3-15 on a single axis (K562 comparison figure)."""
    dirnames = [x[1] for x in all_result_outputs]
    # One colour per MH length, short to long.
    clrs = ['silver','grey','darkgreen','green','lightgreen','royalblue','dodgerblue','skyblue','mediumpurple','orchid','red','orange','salmon']
    fig = PL.figure(figsize=(6,6))
    leg_handles = []
    mh_lens = [3,4,5,6,7,8,9,10,11,12,13,14,15]
    for mh_len, clr in zip(mh_lens, clrs):
        # Each entry is a sample's precomputed (x_pts, y_pts, R) for this MH length;
        # the plotted line is the mean of the (x, y) pairs, R is the mean correlation.
        regr_lines = [x[0][data_label][mh_len] for x in all_result_outputs]
        mean_line = np.mean([x[:2] for x in regr_lines], axis=0)
        leg_handles.append(PL.plot(mean_line[0], mean_line[1], label='MH Len=%d (R=%.1f)' % (mh_len, np.mean([x[2] for x in regr_lines])), linewidth=2, color=clr)[0])
    PL.xlabel('Distance between nearest ends of\nmicrohomologous sequences', fontsize=16)
    PL.ylabel('Correspondng microhomology-mediated deletion\n as percent of total mutated reads', fontsize=16)
    PL.tick_params(labelsize=16)
    # Reverse the legend so the longest MH appears first.
    PL.legend(handles=[x for x in reversed(leg_handles)], loc='upper right')
    PL.ylim((0,80))
    PL.xlim((0,20))
    PL.xticks(range(0,21,5))
    PL.show(block=False)
    saveFig('mh_regr_lines_K562')
def plotGCContent(all_result_outputs, label=''):
    """Box plots of MH-deletion read percentage by GC content of the
    microhomologous sequence (MH length 9, MH distance 0-10), merged across samples.

    Fix: take an explicit .copy() of the row selection before adding derived
    columns, avoiding pandas' SettingWithCopy ambiguity when assigning to a slice.
    """
    # Merge data across samples
    unique_cols = ['Oligo ID', 'Indel', 'GC Content', 'MH Len', 'MH Dist']
    datas = [x[0]['Data'][unique_cols + ['Indel Reads', 'Non-Null Reads']] for x in all_result_outputs]
    merged_data = datas[0]
    for i, data in enumerate(datas[1:]):
        merged_data = pd.merge(merged_data, data, on=unique_cols, suffixes=('', '%d' % (i+2)), how='outer')
    # First sample's columns carry no suffix; subsequent ones are numbered from 2.
    suffix = lambda i: '%d' % (i+1) if i > 0 else ''
    merged_data['Indel Reads Sum'] = merged_data[['Indel Reads' + suffix(i) for i in range(len(datas))]].sum(axis=1)
    merged_data['Non-Null Reads Sum'] = merged_data[['Non-Null Reads' + suffix(i) for i in range(len(datas))]].sum(axis=1)
    # Compute mean regression lines across samples for each MH length
    mean_lines = {}
    for mh_len in range(2, 16):
        if mh_len not in all_result_outputs[0][0]['RegrLines']:
            continue
        regr_lines = [x[0]['RegrLines'][mh_len][:2] for x in all_result_outputs]
        mean_lines[mh_len] = np.mean(regr_lines, axis=0)
    # Restrict to only MH dist in (0,10) and adjust for mh len-dist relationship
    for mh_len in [9]:
        # Residual adjustment is currently disabled (kept as the raw percentage).
        compute_resid = lambda row: row['Perc Reads']  # - getRegrValue(row['MH Len'],row['MH Dist'],mean_lines)
        sel_data = merged_data.loc[(merged_data['MH Len'] == mh_len)
                                   & (merged_data['MH Dist'] >= 0)
                                   & (merged_data['MH Dist'] <= 10)].copy()
        sel_data['Perc Reads'] = sel_data['Indel Reads Sum'] * 100.0 / sel_data['Non-Null Reads Sum']
        sel_data['Perc Reads Residual'] = sel_data.apply(compute_resid, axis=1)
        PL.figure(figsize=(4, 4))
        gcs = sel_data['GC Content'].unique()
        gcs.sort()
        boxdata_lk = {gc: sel_data.loc[sel_data['GC Content'] == gc]['Perc Reads Residual'] for gc in gcs}
        gcs = [gc for gc in gcs if len(boxdata_lk[gc]) > 20]  # Limit to GC with at least 20 data points
        boxdata = [boxdata_lk[gc] for gc in gcs]
        print([len(x) for x in boxdata])
        PL.boxplot(boxdata)
        PL.ylabel('Percent total mutated reads of MH-mediated deletion')
        PL.xlabel('GC content of microhomologous sequence')
        PL.title('Microhomology of length %d\n(at max 10 distance)' % mh_len)
        PL.xticks(range(1, len(gcs)+1), gcs)
        PL.show(block=False)
        saveFig('gc_content_mh%d' % mh_len)
def runAnalysis():
    """Box plots comparing within-library vs between-library KL divergence for
    same-gRNA / other-gRNA / repeated-other-gRNA profile comparisons."""
    data = pd.read_csv(getHighDataDir() + '/old_new_kl_summaries.txt', sep='\t').fillna(-1.0)
    # KL columns, excluding per-class KL and old-vs-old self comparisons.
    kl_cols = [x for x in data.columns if 'KL' in x and 'Class KL' not in x and 'Old v Old' not in x]
    max_kl = 9
    PL.figure(figsize=(2.5, 4))
    bps = []
    box_types = [('C2', 'Within Library'), ('C0', 'Between Library')]
    for i, (clr, box_type) in enumerate(box_types):
        # Columns that rename to the same display label form one box series;
        # positions interleave the two series (1,3,5 vs 2,4,6).
        col_box_data = [data[col] for col in kl_cols if renameCol(col) == box_type]
        pos = [2 * x + i + 1 for x in range(len(col_box_data))]
        print('KL', box_type, np.median(col_box_data, axis=1))
        bps.append(PL.boxplot(col_box_data, positions=pos, patch_artist=True,
                              boxprops=dict(facecolor=clr), showfliers=False))
    PL.xticks([1.5, 3.5, 5.5], ['Same\ngRNA', 'Other\ngRNA', 'Other\ngRNA\n(Rpt)'])
    # Vertical separators between the three comparison groups.
    PL.plot([2.5, 2.5], [0, max_kl], '-', color='silver')
    PL.plot([4.5, 4.5], [0, max_kl], '-', color='silver')
    PL.xlim((0.5, 6.5))
    PL.ylim((0, max_kl))
    PL.ylabel('KL')
    PL.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.25)
    PL.legend([bp["boxes"][0] for bp in bps], [x[1] for x in box_types], loc='upper left')
    PL.show(block=False)
    saveFig('kl_compare_old_new_KL')
def plotMicrohomologyMismatches(all_result_outputs, label=''):
    """Scatter plots comparing MH-mediated deletion read ratios between the
    original (perfect-MH) guides and their mismatched-MH variants.

    NOTE(review): column naming, the reverse-PAM left/right swap, and the indel
    token fields ('L', 'R', 'C') are taken as given from upstream data — confirm
    against tokFullIndel and the oligo design file before changing.
    """
    mut_hdrs = ['Left Mut', 'Right Mut', 'Merged Mut1', 'Merged Mut2']
    cols_to_sum = [x + ' Indel Reads in Mut' for x in mut_hdrs] + ['Orig Indel Reads in Orig', 'Mut Non-Null Reads', 'Orig Non-Null Reads']
    common_cols = ['Oligo ID', 'Mapped Oligo Id', 'Num Mismatches', 'Orig MH', 'Left Mut-MH', 'Right Mut-MH', 'Merged Mut 1 MH', 'Merged Mut 2 MH', 'Orig Indel', 'Left Mut-MH Indel', 'Right Mut-MH Indel', 'Merge Mut 1 Indel', 'Merge Mut 2 Indel']
    data = mergeSamples(all_result_outputs, cols_to_sum, merge_on=common_cols)
    # Indel tokenisation helpers: L/R deletion ends and C = microhomology size.
    getLeft = lambda indel: tokFullIndel(indel)[2]['L']
    getRight = lambda indel: tokFullIndel(indel)[2]['R']
    getMHSize = lambda indel: tokFullIndel(indel)[2]['C']
    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    oligo_data['Guide is matched'] = oligo_data.apply(isMatched, axis=1)
    reverse_lookup = {x: y == 'REVERSE' for (x, y) in zip(oligo_data['ID'], oligo_data['PAM Direction'])}
    is_reverse = lambda x: reverse_lookup[x]
    data = pd.merge(data, oligo_data[['ID', 'Guide is matched']], left_on='Oligo ID', right_on='ID', how='inner')
    data['MH Size'] = data['Orig Indel'].apply(getMHSize)
    # Keep only MH-mediated deletions on matched guides.
    data = data.loc[(data['MH Size'] != 0) & (data['Guide is matched'])]
    data['MH Left Loc'] = data['Orig Indel'].apply(getLeft) + data['MH Size']
    data['MH Right Loc'] = data['Orig Indel'].apply(getRight) - data['MH Size']
    data['Is Reverse'] = data['Oligo ID'].apply(is_reverse)
    # For reverse-PAM guides, left and right mutated-MH read counts are swapped
    # (boolean 'Is Reverse' acts as a 0/1 selector in the arithmetic below).
    for hdrL, hdrR in [mut_hdrs[:2], mut_hdrs[2:]]:
        data[hdrL + ' Reads'] = data['Is Reverse']*data[hdrR + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrL + ' Indel Reads in Mut Sum']
        data[hdrR + ' Reads'] = data['Is Reverse']*data[hdrL + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrR + ' Indel Reads in Mut Sum']
        data[hdrL + ' Reads Ratio'] = data[hdrL + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
        data[hdrR + ' Reads Ratio'] = data[hdrR + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
    data['Orig Indel Reads Ratio'] = data['Orig Indel Reads in Orig Sum']*100.0/data['Orig Non-Null Reads Sum']
    data['All Mut Reads Ratio'] = (data[[x + ' Reads' for x in mut_hdrs]].sum(axis=1))*100.0/data['Mut Non-Null Reads Sum']
    data['MH Dist'] = data['MH Right Loc'] - data['MH Left Loc']
    data['1st Mismatch'] = data.apply(getMismatch, axis=1)
    data['Last Mismatch'] = data.apply(getLastMismatch, axis=1)
    data['MH GC Content'] = data.apply(getMhGC, axis=1)
    # Axis pairs to compare; the third (Orig vs All Mut) is also drawn standalone.
    mh_indel_types = [('Orig Indel', 'Left Mut'), ('Orig Indel', 'Right Mut'), ('Orig Indel', 'All Mut'), ('Left Mut', 'Right Mut')]
    label_lookup = {'Orig Indel': 'Perc. mutated reads of corresponding microhomology-\nmediated deletion with no sequence mismatches',
                    'Left Mut': 'Perc. mutated reads of mismatched microhomology-\nmediated deletion with retained left sequence',
                    'Right Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion with retained right sequence',
                    'All Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion (All)'}
    fig1 = PL.figure(figsize=(4, 4))
    fig_all = PL.figure(figsize=(10, 10))
    for i, (mh_typex, mh_typey) in enumerate(mh_indel_types):
        # Panel i==2 is drawn both into the 2x2 grid and alone into fig1.
        figs = [(fig_all, True), (fig1, False)] if i == 2 else [(fig_all, True)]
        for fig, is_all in figs:
            PL.figure(fig.number)
            if is_all:
                PL.subplot(2, 2, i+1)
            for nm, clr in zip([1, 2], ['royalblue', 'orange']):
                nm_data = data.loc[data['Num Mismatches'] == nm]
                sty, lsty = 'o', '-'
                # Restrict to mid-sized microhomologies (6-15 nt).
                sel_data = nm_data.loc[(nm_data['MH Size'] >= 6) & (nm_data['MH Size'] <= 15)]
                PL.plot(sel_data[mh_typex + ' Reads Ratio'], sel_data[mh_typey + ' Reads Ratio'], sty, color=clr, markersize=4, label='No. MH Mismatches=%d' % (nm))
                rx, ry, grad = getRegrLine(sel_data[[mh_typex + ' Reads Ratio']], sel_data[[mh_typey + ' Reads Ratio']])
                if not is_all:
                    print(grad, nm, mh_typex, mh_typey)
                if i != 3:
                    PL.plot(rx, ry, lsty, color=clr, linewidth=2)
            PL.xlabel(label_lookup[mh_typex])
            PL.ylabel(label_lookup[mh_typey])
            PL.xlim((0, 80))
            PL.ylim((0, 80))
            PL.plot([0, 80], [0, 80], 'k--')
            PL.legend()
    PL.show(block=False)
    saveFig('mm_mismatch_all')
    PL.figure(fig1.number)
    saveFig('mm_mismatch_one')
def plotProfiles(profiles, rep_reads, pam_idxs, reverses, labels, title='', max_lines=10):
    """Render aligned mutational profiles: one row per displayed indel, with the
    (possibly reverse-complemented) read sequence per sample plus a horizontal
    bar histogram of each indel's read percentage. Returns the figure.

    Fixes: `max_lines / len(profiles)` was float division under Python 3
    (Python 2 legacy) — now integer division so the row budget is a whole
    number; renamed the loop variable `repr` (shadowed the builtin); removed
    unused locals (`indel_toks`/`max_insert`).
    """
    if len(profiles) == 0:
        raise Exception('Empty list of profiles')
    colors = [FORECAST_GREEN, 'C0', 'C2', 'C2', 'C1', 'C1', 'C3', 'C3', 'C4', 'C4', 'C5', 'C5', 'C6']
    PL.rcParams['svg.fonttype'] = 'none'
    ocounts = [getProfileCounts(p1) for p1 in profiles]
    counts = [{indel: (cnt, indel, perc1a, perc1b) for (cnt, indel, perc1a, perc1b) in x} for x in ocounts]
    # Count total non-null reads for each sample (to report in labels)
    nonnull_reads = [sum([x[indel][0] for indel in x if indel != '-']) for x in counts]
    labels = ['%s(%d Reads)' % (tit, nn) for (tit, nn) in zip(labels, nonnull_reads)]
    # Fetch the indels to display as union of top N indels across profiles
    num_top = 20
    top_indels = [[y[1] for y in x[:num_top]] for x in ocounts]
    union_top_indels = set()
    for x in top_indels:
        union_top_indels = union_top_indels.union(set(x))
    # Ensure every displayed indel has an entry in every sample's count dict.
    for indel in union_top_indels:
        for count in counts:
            if indel not in count:
                count[indel] = (0, indel, 0.0, 0.0)
    union_top_indels = [x for x in union_top_indels]
    # Order indels by decreasing average percentage across profiles
    top_av_percs = [(np.mean([x[indel][-1] for x in counts]), indel) for indel in union_top_indels]
    top_av_percs.sort(reverse=True)
    # Integer row budget per profile (fix: was true division, giving a float).
    max_indels = max_lines // len(profiles)
    # Figure out Trims
    null_reads = [x['-'] if '-' in x else [x[y[1]] for y in ocnt if y[1] in x][0]
                  for x, ocnt in zip(rep_reads, ocounts)]
    null_reads = [Bio.Seq.reverse_complement(x) if rev else x for x, rev in zip(null_reads, reverses)]
    pam_idxs = [len(x) - pam if rev else pam for x, pam, rev in zip(null_reads, pam_idxs, reverses)]
    min_null, pam_idx = min([(len(null), pidx) for (null, pidx) in zip(null_reads, pam_idxs)])
    Ls = [x - pam_idx for x in pam_idxs]
    Rs = [L + min_null - len(null) for (L, null) in zip(Ls, null_reads)]
    # Plot
    scale_factor = 10.0 / max([x[1][3] for x in ocounts])
    fig = PL.figure(figsize=(9, 5 * len(labels)))
    fig.patch.set_visible(False)
    ax = PL.gca()
    ax.axis('off')
    N = min(len(union_top_indels), max_indels)
    line_height = 0.8
    min_xloc, max_xloc = MIN_X, MAX_X
    PL.ylim((0, (N + 1.0) * line_height))
    bar_ypos, bar_len = [[] for x in profiles], [[] for x in profiles]
    for i, (av_perc, indel) in enumerate(top_av_percs):
        if i > max_indels:
            break
        for reads, cnts, rev, L1, R1, j in zip(rep_reads, counts, reverses, Ls, Rs, range(len(Rs))):
            (cnt1, indel1, perc1a, perc1b) = cnts[indel]
            if indel in reads:
                if R1 == 0:
                    R1 = len(reads[indel])
                seq = Bio.Seq.reverse_complement(reads[indel])[L1:R1] if rev else reads[indel][L1:R1]
                padded_seq, red_idxs, green_idxs = padReadForIndel(seq, indel, pam_idx)
                min_xloc, max_xloc = plotSeqLetterwise(
                    padded_seq, (N - i + (j + 0.3) * 1.0 / len(profiles)) * line_height,
                    pam_idx, red_idxs=red_idxs, green_idxs=green_idxs)
            if indel != '-':
                bar_ypos[j].append((N - i + (j + 0.4) * 1.0 / len(profiles)) * line_height)
                bar_len[j].append(perc1b * scale_factor)
    hist_loc = max_xloc + 10
    for bar1_ypos, bar1_len, label1, clr in zip(bar_ypos, bar_len, labels, colors):
        PL.barh(bar1_ypos, bar1_len, height=0.8 * line_height / len(profiles),
                left=hist_loc, label=label1, color=clr)
        for (ypos, blen) in zip(bar1_ypos, bar1_len):
            PL.text(hist_loc + blen + 1, ypos - 0.5 / len(profiles) * line_height,
                    '%.1f%%' % (blen / scale_factor))
    xlims = (min_xloc - 10, MAX_X + 20 + (min_xloc - MIN_X))
    PL.xlim(xlims)
    for i, (av_perc, indel) in enumerate(top_av_percs):
        if i > max_indels:
            break
        if indel == '-':
            PL.text(xlims[0], (N - i + 0.4) * line_height, 'Target:', fontweight='bold')
        else:
            PL.text(xlims[0], (N - i + 0.4) * line_height, indel.split('_')[0], fontweight='bold')
        PL.plot([min_xloc - 10, max_xloc + 10],
                [(N - i) * line_height, (N - i) * line_height], 'lightgrey')
    PL.plot([0, 0], [0, (N + 1) * line_height], 'k--')
    PL.plot([min_xloc - 10, hist_loc], [N * line_height, N * line_height], 'k')
    PL.plot([hist_loc, hist_loc], [0, N * line_height], 'k')
    PL.xticks([])
    PL.yticks([])
    if len(labels) > 1:
        PL.legend(loc='upper right')
    PL.text(hist_loc, (N + 0.5) * line_height, title, fontweight='bold')
    PL.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
    PL.show(block=False)
    PL.axis('off')
    saveFig('%s_%d' % (title.replace(' ', '_'), len(labels)), bbox=False)
    return fig
def plotD1(all_result_outputs, label=''):
    """Analyse size-1 deletions (D1) that are the most common indel in all three
    replicates: a pie of repeat vs non-repeat removals, then a bar chart of D1
    dominance grouped by the nucleotide pair flanking the cut site."""
    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='perOligoMCI')
    # Require all three replicates to agree on the most common indel.
    mci_merged_data['Equal MCI'] = (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 2']) & (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 3'])
    mci_common = mci_merged_data.loc[mci_merged_data['Equal MCI']]
    pie_vals, pie_labels = [], []
    dmci_data = mci_common.loc[(mci_common['MCI Type'] == 'D1')]  #Note: type check discards equally most common indels
    # Deletion spans the cut site when it removes bases on both sides of it.
    spans_cutsite = lambda indel: tokFullIndel(indel)[2]['L'] < -1 and tokFullIndel(indel)[2]['R'] > 0
    for nt in 'ATGC':
        # Altered sequence is a homopolymer run of this nucleotide (len >= 2).
        is_mh = lambda alt_seq: len(alt_seq) >= 2 and alt_seq == (len(alt_seq) * nt)
        num_repeat_nt = len(dmci_data.loc[dmci_data['Altered Sequence'].apply(is_mh) & dmci_data['Most Common Indel'].apply(spans_cutsite)])
        pie_vals.append(num_repeat_nt * 100.0 / len(dmci_data))
        print(num_repeat_nt)
        pie_labels.append('Removal of %s\nfrom %s|%s' % (nt, nt, nt))
    # Complement: non-homopolymer removals or deletions not spanning the cut site.
    is_non_repeat = lambda seq: len(seq) < 2 or seq != (seq[0] * len(seq))
    num_non_repeat = len(dmci_data.loc[dmci_data['Altered Sequence'].apply(is_non_repeat) | ~dmci_data['Most Common Indel'].apply(spans_cutsite)])
    pie_vals.append(num_non_repeat * 100.0 / len(dmci_data))
    print(num_non_repeat)
    pie_labels.append('Removal from non-repeat')
    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1, counterclock=False, colors=OLD_COLORS)
    PL.title('Size 1 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)' % (len(dmci_data), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('pie_chart_D1')
    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    remove_under = lambda x: x.replace('_', '')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(remove_under)
    merged_mci_data = pd.merge(mci_merged_data, oligo_data[['Oligo Id', 'Guide']], how='inner', on='Oligo Id')
    print(len(merged_mci_data))
    nt_dbl_perc_d1, cnt_labels = [], []
    is_d1 = lambda indel: (indel.split('_')[0] == 'D1')
    non_dbl_nt = lambda row: row['Guide'][-4] != row['Guide'][-3]
    nts = 'ATGC'
    for nt in nts:
        # Guides with the same nucleotide on both sides of the cut (positions -4,-3).
        double_nt = lambda row: row['Guide'][-4:-2] == (nt + nt)
        dbl_data = merged_mci_data.loc[merged_mci_data.apply(double_nt, axis=1)]
        num_dbl_d1 = sum(dbl_data['Most Common Indel'].apply(is_d1) & dbl_data['Equal MCI'] & (dbl_data['Oligo Id'] != 'Oligo28137'))  #Oligo28137: Corner case where a guide has CT|T and loses the C
        nt_dbl_perc_d1.append(num_dbl_d1 * 100.0 / len(dbl_data))
        cnt_labels.append('%d/%d' % (num_dbl_d1, len(dbl_data)))
        print(len(dbl_data))
    non_dbl_data = merged_mci_data.loc[merged_mci_data.apply(non_dbl_nt, axis=1)]
    print(len(non_dbl_data))
    num_non_dbl_d1 = sum(non_dbl_data['Most Common Indel'].apply(is_d1) & non_dbl_data['Equal MCI'])
    nt_dbl_perc_d1.append(num_non_dbl_d1 * 100.0 / len(non_dbl_data))
    cnt_labels.append('%d/%d' % (num_non_dbl_d1, len(non_dbl_data)))
    PL.figure()
    PL.bar(range(5), nt_dbl_perc_d1, width=0.8)
    for i, cnt in enumerate(cnt_labels):
        PL.text(i - 0.3, nt_dbl_perc_d1[i] + 5.0, cnt)
    # '%s' % x * 2 doubles each nucleotide label ('AA', 'TT', ...).
    PL.xticks(range(5), ['%s' % x * 2 for x in nts] + ['Other'])
    PL.ylim((0, 40))
    PL.xlabel('Nucleotides on either side of cut site')
    PL.ylabel('Percent gRNAs with single nucleotide deletion\nas most common indel in all 3 replicates')
    PL.show(block=False)
    saveFig('D1_bar_3_rep')
def plotD2(all_result_outputs, label=''):
    """Summarize size-2 (D2) deletions across the 3 replicates.

    Produces three figures:
      1. A pie chart categorizing D2 most-common-indels into repeat-based
         classes (Y|XY->Y, XY|X->X, XY|XY->XY, Z|XY->Z, XY|Z->Z).
      2. Per-category pie charts of the dinucleotide/nucleotide identities.
      3. Per-category bar charts of how often each sequence context yields
         that D2 deletion as the most common indel in all 3 replicates.

    Args:
        all_result_outputs: per-sample result structures for mergeSamples.
        label: unused in this function.
    """
    #Merge replicates
    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='perOligoMCI')
    # 'Equal MCI': most common indel identical in all three replicates.
    mci_merged_data['Equal MCI'] = (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 2']) & (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 3'])
    # Attach guide sequences (details file IDs carry underscores, strip them).
    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    remove_under = lambda x: x.replace('_', '')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(remove_under)
    mci_merged_data_guides = pd.merge(mci_merged_data, oligo_data[['Oligo Id', 'Guide']], how='inner', on='Oligo Id')
    # NOTE(review): boolean mask is built on the pre-merge frame but applied
    # to the merged frame — relies on pandas index alignment; confirm the
    # inner merge preserves the original index.
    mci_common = mci_merged_data_guides.loc[mci_merged_data['Equal MCI']]
    dmci_data = mci_common.loc[(mci_common['MCI Type'] == 'D2')]  #Note: type check discards equally most common indels

    pie_vals, pie_labels = [], []
    # Repeat classifications based on guide sequence around the cut site and
    # the deletion extents from tokFullIndel(...)[2] ('L'/'R' keys).
    is_left_rpt = lambda row: row['Guide'][-5] == row['Guide'][-3] and tokFullIndel(row['Most Common Indel'])[2]['R'] >= 1 and tokFullIndel(row['Most Common Indel'])[2]['L'] <= -3
    is_right_rpt = lambda row: row['Guide'][-4] == row['Guide'][-2] and tokFullIndel(row['Most Common Indel'])[2]['R'] >= 2 and tokFullIndel(row['Most Common Indel'])[2]['L'] <= -2
    is_left_only_rpt = lambda row: is_left_rpt(row) and not is_right_rpt(row)
    is_right_only_rpt = lambda row: is_right_rpt(row) and not is_left_rpt(row)
    is_both_rpt = lambda row: is_right_rpt(row) and is_left_rpt(row)
    lrpt_data = dmci_data.loc[dmci_data.apply(is_left_only_rpt, axis=1)]
    pie_labels.append('Y|XY->Y')
    pie_vals.append(len(lrpt_data))
    rrpt_data = dmci_data.loc[dmci_data.apply(is_right_only_rpt, axis=1)]
    pie_labels.append('XY|X->X')
    pie_vals.append(len(rrpt_data))
    rpt_data = dmci_data.loc[dmci_data.apply(is_both_rpt, axis=1)]
    pie_labels.append('XY|XY->XY')
    pie_vals.append(len(rpt_data))
    # Deletion entirely left of the cut (right extent == 0).
    is_r0 = lambda row: tokFullIndel(row['Most Common Indel'])[2]['R'] == 0
    ro_data = dmci_data.loc[dmci_data.apply(is_r0, axis=1)]
    pie_labels.append('Z|XY->Z')
    pie_vals.append(len(ro_data))
    # Deletion entirely right of the cut (left extent == -1).
    is_l1 = lambda row: tokFullIndel(row['Most Common Indel'])[2]['L'] == -1
    l1_data = dmci_data.loc[dmci_data.apply(is_l1, axis=1)]
    pie_labels.append('XY|Z->Z')
    pie_vals.append(len(l1_data))
    # Sanity check: the five categories above must cover every D2 oligo.
    seen_ids = set(rpt_data['Oligo Id']).union(set(ro_data['Oligo Id'])).union(set(l1_data['Oligo Id'])).union(set(lrpt_data['Oligo Id'])).union(set(rrpt_data['Oligo Id']))
    is_unseen = lambda id: id not in seen_ids
    unseen_data = dmci_data.loc[dmci_data['Oligo Id'].apply(is_unseen)]
    print(unseen_data)
    assert (len(unseen_data) == 0)
    #pie_labels.append('Other')
    #pie_vals.append(len(unseen_data))
    #pie_labels = [x for x in dmci_data['Most Common Indel'].unique()]
    #pie_vals = [len(dmci_data.loc[dmci_data['Most Common Indel']==indel]) for indel in pie_labels]
    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1, counterclock=False, colors=COLORS)
    PL.title('Size 2 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)' % (len(dmci_data), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('pie_chart_D2_indel_cats')

    # Second figure: per-category pies of sequence identity.
    PL.figure(figsize=(12, 8))
    #XY|XY->XY
    PL.subplot(2, 3, 1)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        # Lambda captures loop variable but is used within this iteration only.
        is_mh_str = lambda guide: guide[-5:-3] == mh_str
        pie_vals.append(len(rpt_data.loc[rpt_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1, counterclock=False, colors=COLORS)
    PL.title('XY|XY->XY\n(%d gRNAs)' % len(rpt_data))
    PL.show(block=False)
    #__|
    PL.subplot(2, 3, 2)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        is_mh_str = lambda guide: guide[-5:-3] == mh_str
        pie_vals.append(len(ro_data.loc[ro_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1, counterclock=False, colors=COLORS)
    PL.title('XY| -> __|\n(%d gRNAs)' % len(ro_data))
    PL.show(block=False)
    #|__
    PL.subplot(2, 3, 3)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        is_mh_str = lambda guide: guide[-3:-1] == mh_str
        pie_vals.append(len(l1_data.loc[l1_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1, counterclock=False, colors=COLORS)
    PL.title('|XY -> |__\n(%d gRNAs)' % len(l1_data))
    PL.show(block=False)
    #XY|X->X
    # NOTE(review): this subplot uses lrpt_data, which the first pie labeled
    # 'Y|XY->Y'; confirm the title/category pairing is intended.
    PL.subplot(2, 3, 4)
    pie_vals, pie_labels = [], []
    for nt in 'ATGC':
        pie_labels.append('%sN|%s -> %s' % (nt, nt, nt))
        is_mh_str = lambda guide: guide[-5] == nt
        pie_vals.append(len(lrpt_data.loc[lrpt_data['Guide'].apply(is_mh_str)]))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1, counterclock=False, colors=COLORS)
    PL.title('XY|X->X\n(%d gRNAs)' % len(lrpt_data))
    PL.show(block=False)
    #X|YX->X
    PL.subplot(2, 3, 5)
    pie_vals, pie_labels = [], []
    for nt in 'ATGC':
        pie_labels.append('%s|N%s -> %s' % (nt, nt, nt))
        is_mh_str = lambda guide: guide[-4] == nt
        pie_vals.append(len(rrpt_data.loc[rrpt_data['Guide'].apply(is_mh_str)]))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1, counterclock=False, colors=COLORS)
    PL.title('X|YX->X\n(%d gRNAs)' % len(rrpt_data))
    PL.show(block=False)
    PL.subplots_adjust(left=0.05, right=0.95, top=0.9, bottom=0.1, hspace=0.3, wspace=0.3)
    saveFig('D2_nts_per_cat')

    # Third figure: per-category bars — of all guides with a given sequence
    # context, the percentage whose MCI falls into that D2 category.
    PL.figure(figsize=(12, 8))
    #XY|XY->XY
    PL.subplot(2, 3, 1)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-5:-3] == dnt and guide[-3:-1] == dnt
        dnt_data = mci_merged_data_guides.loc[mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(len(set(rpt_data['Oligo Id']).intersection(set(dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 / d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(dnt, dnt_counts[-1] * 100.0 / d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt, cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3, hgt + 15, '%d/%d' % (cnt, d2cnt), rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 90))
    PL.xlabel('XY')
    PL.title('XY|XY->XY')
    PL.ylabel('Percent gRNAs with XY|XY->XY deletion\nas most common indel in all 3 replicates')
    PL.show(block=False)
    #__|
    PL.subplot(2, 3, 2)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-5:-3] == dnt
        dnt_data = mci_merged_data_guides.loc[mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(len(set(ro_data['Oligo Id']).intersection(set(dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 / d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(dnt, dnt_counts[-1] * 100.0 / d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt, cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3, hgt + 1.5, '%d/%d' % (cnt, d2cnt), rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 8))
    PL.xlabel('XY')
    PL.title('XY| -> __|')
    PL.ylabel('Percent gRNAs with XY| -> __| deletion\nas most common indel in all 3 replicates')
    PL.show(block=False)
    #|__
    PL.subplot(2, 3, 3)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-3:-1] == dnt
        dnt_data = mci_merged_data_guides.loc[mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(len(set(l1_data['Oligo Id']).intersection(set(dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 / d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(dnt, dnt_counts[-1] * 100.0 / d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt, cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3, hgt + 1.5, '%d/%d' % (cnt, d2cnt), rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 8))
    PL.xlabel('XY')
    PL.title('|XY -> |__')
    PL.ylabel('Percent gRNAs with |XY -> |__ deletion\nas most common indel in all 3 replicates')
    PL.show(block=False)
    #XY|X->X
    PL.subplot(2, 3, 4)
    bar_heights, bar_labels, d2_nt_counts, nt_counts = [], [], [], []
    for nt in 'ATGC':
        has_nt = lambda guide: guide[-3] == nt and guide[-5] == nt
        nt_data = mci_merged_data_guides.loc[mci_merged_data_guides['Guide'].apply(has_nt)]
        nt_counts.append(len(set(lrpt_data['Oligo Id']).intersection(set(nt_data['Oligo Id']))))
        d2_nt_counts.append(len(nt_data))
        bar_heights.append(nt_counts[-1] * 100.0 / d2_nt_counts[-1] if d2_nt_counts[-1] > 0 else 0)
        bar_labels.append(nt)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt, cnt) in enumerate(zip(bar_heights, d2_nt_counts, nt_counts)):
        PL.text(i - 0.3, hgt + 0.05, '%d/%d' % (cnt, d2cnt))
    PL.xticks(range(len(bar_labels)), bar_labels)
    PL.ylim((0, 5))
    PL.xlabel('X')
    PL.title('XY|X->X')
    PL.ylabel('Percent gRNAs with XY|X->X deletion\nas most common indel in all 3 replicates')
    PL.show(block=False)
    #X|YX->X
    PL.subplot(2, 3, 5)
    bar_heights, bar_labels, d2_nt_counts, nt_counts = [], [], [], []
    for nt in 'ATGC':
        has_nt = lambda guide: guide[-4] == nt and guide[-2] == nt
        nt_data = mci_merged_data_guides.loc[mci_merged_data_guides['Guide'].apply(has_nt)]
        nt_counts.append(len(set(rrpt_data['Oligo Id']).intersection(set(nt_data['Oligo Id']))))
        d2_nt_counts.append(len(nt_data))
        bar_heights.append(nt_counts[-1] * 100.0 / d2_nt_counts[-1] if d2_nt_counts[-1] > 0 else 0)
        bar_labels.append(nt)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt, cnt) in enumerate(zip(bar_heights, d2_nt_counts, nt_counts)):
        PL.text(i - 0.3, hgt + 0.05, '%d/%d' % (cnt, d2cnt))
    PL.xticks(range(len(bar_labels)), bar_labels)
    PL.ylim((0, 5))
    PL.xlabel('X')
    PL.title('X|YX->X')
    PL.ylabel('Percent gRNAs with X|YX->X deletion\nas most common indel in all 3 replicates')
    PL.show(block=False)
    PL.subplots_adjust(left=0.05, right=0.95, top=0.9, bottom=0.1, hspace=0.3, wspace=0.3)
    saveFig('D2_nts_per_cat_bars')
def compareOverbeekProfiles(selected_overbeek_id=None, pred_results_dir='../indel_prediction/model_testing'):
    """Compare our synthetic-construct indel profiles with endogenous
    Overbeek profiles.

    For each Overbeek site, reads the endogenous profile plus all mapped
    replicates of our K562 measurements (conventional/'old' and
    improved/'new' scaffold), computes symmetric KL divergences, and plots:

      - if ``selected_overbeek_id`` is given: the per-replicate profiles for
        that single site (via plotProfiles);
      - otherwise: in-frame percentage comparison, a KL-vs-log-reads
        scatter, and a pairwise KL heatmap over well-read sites.

    Args:
        selected_overbeek_id: e.g. 'Overbeek12'; None processes all 96 sites.
        pred_results_dir: passed through to plotInFrame.

    Fixes vs previous revision:
      - plotProfiles was passed ``p1_new_reps[0]`` twice; the fourth profile
        is now ``p1_new_reps[1]`` to match the 'Improved scaffold Rep B'
        label and ``rr_new_reps[1]`` reads.
      - ``eval(oligo_id[5:])`` replaced with ``int(oligo_id[5:])``.
    """
    new_dirs = ['ST_June_2017/data/K562_800x_LV7A_DPI7/mapped_reads/Oligos_71',
                'ST_June_2017/data/K562_800x_LV7A_DPI10/mapped_reads/Oligos_71',
                'ST_June_2017/data/K562_800x_LV7B_DPI7/mapped_reads/Oligos_71',
                'ST_June_2017/data/K562_800x_LV7B_DPI10/mapped_reads/Oligos_71',
                'ST_June_2017/data/K562_1600x_LV7B_DPI5/mapped_reads/Oligos_71',
                'ST_Feb_2018/data/CAS9_12NA_1600X_DPI7/mapped_reads/Oligos_71']
    #Old Samples
    old_dirs = ['ST_June_2017/data/K562_1600x_6OA_DPI5/mapped_reads/Oligos_71',
                'ST_June_2017/data/K562_1600x_6OA_DPI7/mapped_reads/Oligos_71',
                'ST_April_2017/data/K562_800x_6OA_DPI3_Old7/mapped_reads/Oligos_71',
                'ST_April_2017/data/K562_800x_6OA_DPI7_Old8/mapped_reads/Oligos_71',
                'ST_April_2017/data/K562_800x_6OA_DPI10_Old9/mapped_reads/Oligos_71',
                'ST_April_2017/data/K562_800x_6OB_DPI3_Old10/mapped_reads/Oligos_71',
                'ST_April_2017/data/K562_800x_6OB_DPI7_Old11/mapped_reads/Oligos_71',
                'ST_April_2017/data/K562_800x_6OB_DPI10_Old12/mapped_reads/Oligos_71']
    remove_long_indels = False
    remove_wt, wt_thresh = True, 3.0
    mappings = loadMappings()

    all_overbeek_profiles, all_new_profiles, all_old_profiles, all_our_profiles, sel_overbeek_ids, oldnew_overbeek_ids, old_ids, new_ids = [], [], [], [], [], [], [], []
    overbeek_inframes, ours_inframes, oof_sel_overbeek_ids = [], [], []
    kls, kls_old, kls_new, log_reads, overbeek_ids, above30_percentages, log_reads_new, log_reads_old, min_log_reads = [], [], [], [], [], [], [], [], []
    for idx in range(1, 97):
        overbeek_id = 'Overbeek%d' % idx
        if selected_overbeek_id is not None and selected_overbeek_id != overbeek_id:
            continue
        if overbeek_id not in mappings:
            continue
        overbeek_filename = getHighDataDir() + '/overbeek_fastq_files/' + overbeek_id + '_mappedindelsummary.txt'

        p1, p1_new, p1_old, o1, rep_reads1, rep_reads2 = {}, {}, {}, {}, {}, {}
        nreads2, nreads1, nreads_old, nreads_new, nnull_old, nnull_new, nnull1, nnull2 = 0, 0, 0, 0, 0, 0, 0, 0

        #Read the overbeek profile
        numread2, perc_accept2, num_null2 = readSummaryToProfile(overbeek_filename, o1, oligoid=overbeek_id, remove_long_indels=remove_long_indels, remove_wt=False)
        if selected_overbeek_id is not None:
            fetchRepresentativeCleanReads(getHighDataDir() + '/overbeek_fastq_files/' + overbeek_id + '_mappedindelprofiles.txt', rep_reads2, oligoid=overbeek_id)
            pam_loc2, pam_dir2 = getNullTargetPamDetails(getHighDataDir() + '/overbeek_control_fastq_files/' + overbeek_id + '_exptargets.txt', oligoid=overbeek_id)
        nreads2 += numread2
        nnull2 += num_null2

        if numread2 == 0:
            continue  # no endogenous reads at this site

        # Per-scaffold DPI7 replicate profiles: index 0 = 800x, 1 = 1600x.
        p1_new_reps, p1_old_reps = [{}, {}], [{}, {}]
        rr_new_reps, rr_old_reps = [{}, {}], [{}, {}]

        #Read all the new and old profiles
        pam_loc1, pam_dir1 = None, None
        for oligo_id, is_old in mappings[overbeek_id]:
            #Read all reads for all our K562 profiles
            oligo_idx = int(oligo_id[5:])  # 'OligoNNN' -> NNN (int, not eval)
            _, oligo_fileprefix = getFileForOligoIdx(oligo_idx, ext='')
            oligo_filename = oligo_fileprefix + '_mappedindelsummary.txt'
            read_filename = oligo_fileprefix + '_mappedindelprofiles.txt'
            exptarget_filename = oligo_fileprefix + '_exptargets.txt'
            if is_old:
                oligo_dirs, p1_old_new, null_oligo_dir = old_dirs, p1_old, 'ST_April_2017/data/NULL_Old/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_old_reps, rr_old_reps
            else:
                oligo_dirs, p1_old_new, null_oligo_dir = new_dirs, p1_new, 'ST_April_2017/data/NULL_New/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_new_reps, rr_new_reps

            for oligo_dir in [getHighDataDir() + '/' + x for x in oligo_dirs]:
                # Accumulate into the scaffold-specific profile and the
                # combined profile p1.
                nr1, pa1, nn1 = readSummaryToProfile(oligo_dir + '/' + oligo_filename, p1_old_new, oligoid=oligo_id, remove_long_indels=remove_long_indels, remove_wt=remove_wt, wt_thresh=wt_thresh)
                numread1, perc_accept1, num_null1 = readSummaryToProfile(oligo_dir + '/' + oligo_filename, p1, oligoid=oligo_id, remove_long_indels=remove_long_indels, remove_wt=remove_wt, wt_thresh=wt_thresh)
                if 'DPI7' in oligo_dir:
                    rep_idx = 0 if '800x' in oligo_dir else 1
                    nr_rep, pa_rep, nn_rep = readSummaryToProfile(oligo_dir + '/' + oligo_filename, p1_reps[rep_idx], oligoid=oligo_id, remove_long_indels=remove_long_indels, remove_wt=remove_wt, wt_thresh=wt_thresh)
                if selected_overbeek_id is not None:
                    fetchRepresentativeCleanReads(oligo_dir + '/' + read_filename, rep_reads1, oligoid=oligo_id)
                    if 'DPI7' in oligo_dir:
                        fetchRepresentativeCleanReads(oligo_dir + '/' + read_filename, rr_reps[rep_idx], oligoid=oligo_id)
                    if pam_loc1 is None:
                        pam_loc1, pam_dir1 = getNullTargetPamDetails(getHighDataDir() + '/' + null_oligo_dir + '/' + exptarget_filename, oligoid=oligo_id)
                if is_old:
                    nreads_old += numread1
                    nnull_old += num_null1
                else:
                    nreads_new += numread1
                    nnull_new += num_null1
                nreads1 += numread1
                nnull1 += num_null1

        kls.append(symmetricKL(p1, o1, True))
        kls_old.append(symmetricKL(p1_old, o1, True))
        kls_new.append(symmetricKL(p1_new, o1, True))
        # +0.5 avoids log10(0) when all reads were null.
        log_reads.append(np.log10(nreads1 - nnull1 + 0.5))
        log_reads_old.append(np.log10(nreads_old - nnull_old + 0.5))
        log_reads_new.append(np.log10(nreads_new - nnull_new + 0.5))
        min_log_reads.append(min(log_reads_old[-1], log_reads_new[-1]))
        above30_percentages.append(computePercAbove30(o1))
        overbeek_ids.append(overbeek_id)

        if log_reads[-1] > 2.0:  # require >~100 mutated reads overall
            all_overbeek_profiles.append(o1)
            all_our_profiles.append(p1)
            sel_overbeek_ids.append(overbeek_id[8:])
            if above30_percentages[-1] < 50.0:
                oif, oof, _ = fetchIndelSizeCounts(o1)
                pif, pof, _ = fetchIndelSizeCounts(p1)
                overbeek_inframes.append(oif * 100.0 / (oif + oof))
                ours_inframes.append(pif * 100.0 / (pif + pof))
                oof_sel_overbeek_ids.append(overbeek_id)

        if min_log_reads[-1] > 2.0:  # both scaffolds well-read
            all_new_profiles.append(p1_new)
            all_old_profiles.append(p1_old)
            oldnew_overbeek_ids.append(overbeek_id)
            old_ids.append([id for id, is_old in mappings[overbeek_id] if is_old][0])
            new_ids.append([id for id, is_old in mappings[overbeek_id] if not is_old][0])

        try:
            print(overbeek_id, [x for (x, y) in mappings[overbeek_id]], kls[-1], nreads2, nreads1)
        except KeyError:
            print('Could not find', overbeek_id)
            print(mappings)

        if selected_overbeek_id is not None:
            title = '%s (KL=%.1f)' % (overbeek_id, kls[-1])
            labels = ['Conventional scaffold Rep A', 'Conventional scaffold Rep B', 'Improved scaffold Rep A', 'Improved scaffold Rep B', 'Endogenous Profile']
            # Bug fix: fourth profile was previously p1_new_reps[0] (Rep A
            # duplicated); it must be Rep B to match labels and rr_new_reps.
            plotProfiles([p1_old_reps[0], p1_old_reps[1], p1_new_reps[0], p1_new_reps[1], o1],
                         [rr_old_reps[0], rr_old_reps[1], rr_new_reps[0], rr_new_reps[1], rep_reads2],
                         [pam_loc1, pam_loc1, pam_loc1, pam_loc1, pam_loc2],
                         [x == 'REVERSE' for x in [pam_dir1, pam_dir1, pam_dir1, pam_dir1, pam_dir2]],
                         labels, title=title)

    if selected_overbeek_id is None:
        plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids, pred_results_dir)

        # Scatter of KL divergence vs log10 mutated reads, binned by the
        # percentage of endogenous deletions larger than 30nt.
        PL.figure(figsize=(5.5, 5))
        for thr_l, thr_h in [(0.0, 10.0), (10.0, 20.0), (20.0, 50.0), (50.0, 90.0), (90.0, 100.0)]:
            ydata = [kl for (kl, a30, id, reads) in zip(kls, above30_percentages, overbeek_ids, log_reads) if a30 > thr_l and a30 <= thr_h]
            xdata = [reads for (kl, a30, id, reads) in zip(kls, above30_percentages, overbeek_ids, log_reads) if a30 > thr_l and a30 <= thr_h]
            sel_ids = [id for (kl, a30, id, reads) in zip(kls, above30_percentages, overbeek_ids, log_reads) if a30 > thr_l and a30 <= thr_h]
            PL.plot(xdata, ydata, 'o', label='%d-%d%% Deletions > 30' % (thr_l, thr_h))
            for x, y, id in zip(xdata, ydata, sel_ids):
                if y > 3 and x > 2:
                    PL.text(x, y, id)  # annotate high-divergence outliers
        PL.legend()
        PL.plot([0, 6], [0.77, 0.77], '--', color='grey')
        PL.text(0.1, 0.5, 'Median between our replicates', color='grey')
        PL.ylabel('Symmetric KL Divergence', fontsize=12)
        PL.xlabel('Log10 Mutated Reads', fontsize=12)
        PL.xlim((0, 5.5))
        PL.ylim((0, 8))
        PL.show(block=False)
        saveFig('scatter_KL')
        print('Median=', np.median(kls), 'Mean KL=', np.mean(kls))
        print(len(overbeek_ids))

        #Compute pairwise KL between overbeek and ours
        N = len(sel_overbeek_ids)
        kl_mat = np.zeros((N, N))
        for i, o1 in enumerate(all_overbeek_profiles):
            for j, p1 in enumerate(all_our_profiles):
                kl_mat[i, j] = symmetricKL(o1, p1)
        PL.figure(figsize=(8, 6))
        PL.imshow(kl_mat, cmap='hot_r', vmin=0.0, vmax=3.0, interpolation='nearest')
        PL.xticks(range(N), sel_overbeek_ids, rotation='vertical', fontsize=6)
        PL.yticks(range(N), sel_overbeek_ids, rotation='horizontal', fontsize=6)
        PL.xlabel('Synthetic Measurement', fontsize=12)
        PL.ylabel('Endogenous Measurement', fontsize=12)
        PL.title('KL', fontsize=12)
        PL.colorbar()
        PL.show(block=False)
        saveFig('heatmap_KL')