def createOverbeekTemplates(selected_id=None):
    """Write one template FASTA file per Overbeek guide.

    Reads the tab-delimited ``overbeek_self_targets.csv`` in the high data
    directory. For every row the guide index is taken from the last
    whitespace-separated token of the last column; the template sequence and
    PAM location are then extracted from the matching control SAM file and
    written to ``overbeek_template_files/<id>_template.fasta``.

    Args:
        selected_id: Optional guide id of the form ``'Overbeek<N>'``. When not
            None, all other guides are skipped.
    """
    ctrl_samdir = getHighDataDir() + '/overbeek_control_sam_files'
    output_template_dir = getHighDataDir() + '/overbeek_template_files'
    if not os.path.isdir(output_template_dir):
        os.mkdir(output_template_dir)
    lookup = loadLocationSpacerLookup()

    # Context managers guarantee both handles are closed, even when the
    # extraction below raises part-way through the file.
    with io.open(getHighDataDir() + '/overbeek_self_targets.csv') as f:
        for toks in csv.reader(f, delimiter='\t'):
            # NOTE(review): eval() on file content is unsafe if the CSV is not
            # trusted; kept because the accepted token format is not visible
            # here - consider int() once that is confirmed.
            idx = eval(toks[-1].split()[-1])
            guide_id, samid = 'Overbeek%d' % idx, 'Overbeek_%d' % idx
            if selected_id is not None and selected_id != guide_id:
                continue
            loc, spacer_seq, primer = lookup[guide_id]
            ctrl_sam_file = ctrl_samdir + '/%s.sam' % samid
            template_path = output_template_dir + '/%s_template.fasta' % guide_id
            with io.open(template_path, 'w') as fout:
                template_seq, pam_loc, pam_dir = extractTemplateSequenceAndPamLoc(
                    ctrl_sam_file, loc, guide_id, spacer_seq, primer)
                fout.write(u'>%s_%s %d %s\n%s\n' %
                           (guide_id, spacer_seq, pam_loc, pam_dir, template_seq))
def runAnalysis():
    """Run the microhomology-by-length analyses for 'K562 New' and 'DPI7'.

    Two specs are handed, in that order, to ``analyseResultsPerPartition``.
    They share the results directory, loader, column names and the
    'Non-Targeting' partition, and differ only in the per-result functions,
    the all-results functions and the sample name.
    """

    def buildSpec(per_result_funcs, all_results_funcs, sample):
        # Everything except the three arguments is common to both runs.
        return {
            'results_dir': getHighDataDir() + '/microhomology/mh_freqs_by_len',
            'dirname_to_result_fn': lambda x: x,
            'result_to_dirname_fn': lambda x: x,
            'py_func_load': loadAllMHLenData,
            'py_funcs_per_result': per_result_funcs,
            'py_funcs_all_results': all_results_funcs,
            'reads_colname': 'Non-Null Reads',
            'check_output_fn': lambda x: True,
            'id_colname': 'Oligo ID',
            'min_reads': MIN_READS,
            'partitions': ['Non-Targeting'],
            'samples': [sample],
        }

    k562_spec = buildSpec(
        [(plotK562PercScatterAnalysis, 'RegrLines'), (passData, 'Data')],
        [compareMHK562lines, plotGCContent],
        'K562 New')
    analyseResultsPerPartition(k562_spec)

    dpi7_spec = buildSpec(
        [(plotPercScatterAnalysis, 'RegrLines')],
        [compareMHlines],
        'DPI7')
    analyseResultsPerPartition(dpi7_spec)
def loadAllData(guideset,
                sample_selector=lambda x: True,
                label='',
                cols=None,
                allow_pickle=False):
    """Merge the selected columns of all sample-comparison files on guide ID.

    Every file in the comparison results directory whose two sample
    directories both pass ``sample_selector`` is loaded and restricted to rows
    with more than ``MIN_READS`` mutated reads in both samples and an ID in
    ``guideset``. The per-file tables are inner-joined on 'ID'. The value
    columns of each file end up suffixed with ``'$' + filename``.

    Args:
        guideset: Collection of guide IDs to keep.
        sample_selector: Predicate applied to both directory names derived
            from each comparison filename.
        label: Label used (spaces replaced by underscores) in the pickle name.
        cols: Value columns to keep; defaults to ``['KL without null']``.
        allow_pickle: When True, read the cached pickle if it exists and write
            it after a fresh load.

    Returns:
        The merged pandas DataFrame.

    Raises:
        ValueError: If no comparison file passes the selector. Previously this
            surfaced as an AttributeError on ``None``.
    """
    # Avoid a shared mutable default; copy so the caller's list is never aliased.
    cols = ['KL without null'] if cols is None else list(cols)

    pickle_file = '%s/kl_analysis_%s.pickle' % (getPickleDir(),
                                                label.replace(' ', '_'))
    if allow_pickle and os.path.exists(pickle_file):
        return pandas.read_pickle(pickle_file)

    results_dir = getHighDataDir() + '/' + ST_COMPARISON_RESULTS_DIR
    merged_data, first_suffix = None, None
    for filename in os.listdir(results_dir):
        dir1, dir2 = getDirsFromFilename(filename)
        if not sample_selector(dir1) or not sample_selector(dir2):
            continue

        # Load data from file and keep only sufficiently covered guides
        data = pandas.read_csv(results_dir + '/' + filename, sep='\t')
        data['Mutated Reads 1'] = data['Num Reads 1'] - data['Num null reads 1']
        data['Mutated Reads 2'] = data['Num Reads 2'] - data['Num null reads 2']
        data = data.loc[data['Mutated Reads 1'] > MIN_READS]
        data = data.loc[data['Mutated Reads 2'] > MIN_READS]
        data = data.loc[data['ID'].isin(guideset)][['ID'] + cols]

        # An inner join with a sparse file would discard most guides; skip it.
        if merged_data is not None and len(data) < 0.75 * len(merged_data):
            print('Skipping %s, data for insufficient guides (%d vs %d)' %
                  (filename, len(data), len(merged_data)))
            continue

        # Merge with the other data (keep only common Oligos)
        suffix = '$' + filename
        if merged_data is None:
            # The first file keeps bare column names until the final rename.
            merged_data, first_suffix = data, suffix
        else:
            merged_data = merged_data.merge(data,
                                            how='inner',
                                            on='ID',
                                            suffixes=('', suffix))
        print(len(merged_data), filename)

    if merged_data is None:
        raise ValueError('No comparison results in %s matched the sample selector'
                         % results_dir)

    merged_data = merged_data.rename(
        columns={x: (x + first_suffix) for x in cols})
    if allow_pickle:
        merged_data.to_pickle(pickle_file)
    return merged_data
def loadIndelData():
    """Load the new- and old-library I1 indel tables as one DataFrame.

    Returns:
        A DataFrame with the columns 'Oligo Id', 'Repeat Nucleotide Left' and
        'Repeat Nucleotide Right' from both files (new first, then old), plus
        'Short Oligo Id' obtained by applying ``getShortOligoId`` to 'Oligo Id'.
    """
    kept_columns = [
        'Oligo Id', 'Repeat Nucleotide Left', 'Repeat Nucleotide Right'
    ]
    relative_paths = ('/i1/exp_target_pam_new_gen_i1_indels.txt',
                      '/i1/exp_target_pam_old_gen_i1_indels.txt')
    tables = [
        pd.read_csv(getHighDataDir() + rel_path, sep='\t', header=1)
        for rel_path in relative_paths
    ]
    indel_data = pd.concat(tables)[kept_columns]
    short_ids = indel_data['Oligo Id'].apply(getShortOligoId)
    indel_data['Short Oligo Id'] = short_ids
    return indel_data
def fetchMhMismatchFrequencies(dirname, outdir='mh_mismatch_indel_frequencies'):
    """Tabulate indel read counts for original and mutated microhomology oligos.

    For each row of ``mh_mismatch_indels.txt`` (in the high data directory) the
    mapped indel summaries of the mutated oligo ('Oligo ID') and the original
    oligo ('Mapped Oligo Id') are loaded from ``dirname``. The read counts of
    the five indels listed in the row are looked up in both profiles and
    written, together with the input columns and the non-null read totals, to
    ``<outdir>/<dir label>.txt``.

    Args:
        dirname: Sample directory containing ``mapped_reads``.
        outdir: Output directory; created if missing.

    Raises:
        Exception: If ``dirname`` is an old-library sample.
    """
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    if isOldLib(dirname):
        raise Exception('Old Lib not supported')

    mh_exp_indels_file = getHighDataDir() + '/mh_mismatch_indels.txt'
    hdr_str = '\t'.join([
        '\t'.join([
            x + ' Indel Reads in ' + y for x in
            ['Orig', 'Left Mut', 'Right Mut', 'Merged Mut1', 'Merged Mut2']
        ]) for y in ['Mut', 'Orig']
    ])

    def reads(indel, profile):
        # An empty indel string means "no such indel" and counts as zero.
        return profile[indel] if (indel in profile and indel != '') else 0

    # Context managers close both files even if a row fails to process.
    with io.open(outdir + '/' + getDirLabel(dirname) + '.txt', 'w') as fout:
        with io.open(mh_exp_indels_file) as f:
            rdr = csv.DictReader(f, delimiter='\t')
            fout.write(u'%s\t%s\tMut Non-Null Reads\tOrig Non-Null Reads\n' %
                       ('\t'.join(rdr.fieldnames), hdr_str))
            for row in rdr:
                # Load indel profiles for both the original and mutated
                # microhomology forms
                mut_oligo_id = row['Oligo ID'].replace('_', '')
                orig_oligo_id = row['Mapped Oligo Id'].replace('_', '')
                mut_filepath, mut_filename = getFileForOligoIdx(
                    getOligoIdxFromId(mut_oligo_id),
                    ext='_mappedindelsummary.txt')
                orig_filepath, orig_filename = getFileForOligoIdx(
                    getOligoIdxFromId(orig_oligo_id),
                    ext='_mappedindelsummary.txt')
                p_mut, p_orig = {}, {}
                stats_mut = readSummaryToProfile(
                    dirname + '/mapped_reads/' + mut_filepath + '/' + mut_filename,
                    p_mut,
                    oligoid=mut_oligo_id)
                stats_orig = readSummaryToProfile(
                    dirname + '/mapped_reads/' + orig_filepath + '/' + orig_filename,
                    p_orig,
                    oligoid=orig_oligo_id)

                indels = [
                    row['Orig Indel'], row['Left Mut-MH Indel'],
                    row['Right Mut-MH Indel'], row['Merge Mut 1 Indel'],
                    row['Merge Mut 2 Indel']
                ]
                mut_read_str = '\t'.join(
                    ['%d' % reads(indel, p_mut) for indel in indels])
                orig_read_str = '\t'.join(
                    ['%d' % reads(indel, p_orig) for indel in indels])
                # Written totals are stats[0] - stats[2] of each summary.
                str_args = ('\t'.join([row[col] for col in rdr.fieldnames]),
                            mut_read_str, orig_read_str,
                            stats_mut[0] - stats_mut[2],
                            stats_orig[0] - stats_orig[2])
                fout.write(u'%s\t%s\t%s\t%d\t%d\n' % str_args)
def plotInFrameCorr(data):
    """Scatter 'New In Frame Perc' against 'Predicted In Frame Per' and save it.

    The points from ``data`` are overlaid with the measured/predicted frame
    shift values read from ``shi_deepseq_frame_shifts.txt``. Overlay points
    whose measured value exceeds the predicted one by more than 10 are
    annotated with a fragment of their ID. The Pearson R of the ``data``
    columns is shown in the title.

    Args:
        data: DataFrame containing both in-frame percentage columns.
    """
    shi_table = pd.read_csv(getHighDataDir() + '/shi_deepseq_frame_shifts.txt',
                            sep='\t')
    label1, label2 = 'New In Frame Perc', 'Predicted In Frame Per'

    PL.figure(figsize=(4, 4))
    measured, predicted = data[label1], data[label2]
    PL.plot(measured, predicted, '.', alpha=0.15)

    shi_measured = shi_table['Measured Frame Shift']
    shi_predicted = shi_table['Predicted Frame Shift']
    PL.plot(shi_measured, shi_predicted, '^', color='orange')
    for x_val, y_val, shi_id in zip(shi_measured, shi_predicted,
                                    shi_table['ID']):
        if x_val - y_val <= 10:
            continue
        PL.text(x_val, y_val, shi_id.split('/')[1][:-21])

    PL.plot([0, 100], [0, 100], 'k--')
    correlation = pearsonr(measured, predicted)[0]
    PL.title('R=%.3f' % (correlation))
    PL.xlabel('percent in frame mutations (measured)')
    PL.ylabel('percent in frame mutations (predicted)')
    PL.ylim((0, 80))
    PL.xlim((0, 80))
    PL.show(block=False)

    fig_name = 'in_frame_corr_%s_%s' % (label1.replace(' ', '_'),
                                        label2.replace(' ', '_'))
    saveFig(fig_name)
def plotDominantBars(all_result_outputs, label=''):
    """Bar-plot, per nucleotide, the share of guides with a dominant I1 indel.

    A guide counts as "dominant I1" when its three 'Most Common Indel' columns
    agree and 'MCI Type' is 'I1'. Guides are grouped by the fourth-from-last
    character of their 'Guide' sequence; each bar is annotated with
    ``dominant/total`` counts. The figure is saved as 'I1_bar_3_rep'.

    Args:
        all_result_outputs: Per-sample results passed on to ``mergeSamples``.
        label: Unused here; accepted so the signature matches the other
            all-results plotting functions.
    """
    mci_merged_data = mergeSamples(all_result_outputs, [],
                                   data_label='i1IndelData')
    mci_merged_data['Equal MCI'] = (
        (mci_merged_data['Most Common Indel'] ==
         mci_merged_data['Most Common Indel 2']) &
        (mci_merged_data['Most Common Indel'] ==
         mci_merged_data['Most Common Indel 3']))
    mci_merged_data['Is Dominant I1'] = (
        mci_merged_data['Equal MCI'] & (mci_merged_data['MCI Type'] == 'I1'))

    oligo_data = pd.read_csv(
        getHighDataDir() +
        '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    # The merged data uses oligo ids without underscores.
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(
        lambda x: x.replace('_', ''))
    merged_mci_data = pd.merge(mci_merged_data,
                               oligo_data[['Oligo Id', 'Guide']],
                               how='inner',
                               on='Oligo Id')

    nt_perc_i1, cnt_labels = [], []
    nts = 'ATGC'
    for nt in nts:
        has_nt = merged_mci_data['Guide'].apply(
            lambda guide, nt=nt: guide[-4] == nt)
        nt_data = merged_mci_data.loc[has_nt]
        num_dominant, num_guides = sum(nt_data['Is Dominant I1']), len(nt_data)
        # Guard against a nucleotide with no guides, which previously raised
        # ZeroDivisionError; such a bar is drawn at 0 with a '0/0' label.
        nt_perc_i1.append(num_dominant * 100.0 /
                          num_guides if num_guides > 0 else 0.0)
        cnt_labels.append('%d/%d' % (num_dominant, num_guides))

    PL.figure()
    PL.bar(range(4), nt_perc_i1, width=0.8)
    for i, cnt in enumerate(cnt_labels):
        PL.text(i - 0.3, nt_perc_i1[i] + 5.0, cnt)
    PL.xticks(range(4), [x for x in nts])
    PL.xlabel('Nucleotide on Left of cut-site')
    PL.ylabel(
        'Percent gRNAs with single nucleotide insertion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)
    saveFig('I1_bar_3_rep')
def loadProfilePair(old_id, new_id):
    """Accumulate the old- and new-library profiles of one guide pair.

    The summaries from every directory in ``new_dirs`` are read into a single
    profile dict for ``new_id``; the same is then done with ``old_dirs`` for
    ``old_id``. For each summary, ``acc - null`` (first minus third returned
    value) is added to that library's mutated-read total.

    Returns:
        Tuple ``(p_old, p_new, mut_reads_old, mut_reads_new)``.
    """
    p_old, p_new = {}, {}
    old_file = getSummaryFileSuffix(old_id)
    new_file = getSummaryFileSuffix(new_id)

    mut_reads_new = 0
    for sample in new_dirs:
        summary_path = getHighDataDir() + '/' + sample + '/mapped_reads/' + new_file
        acc, _, null = readSummaryToProfile(summary_path, p_new, oligoid=new_id)
        mut_reads_new += (acc - null)

    mut_reads_old = 0
    for sample in old_dirs:
        summary_path = getHighDataDir() + '/' + sample + '/mapped_reads/' + old_file
        acc, _, null = readSummaryToProfile(summary_path, p_old, oligoid=old_id)
        mut_reads_old += (acc - null)

    return p_old, p_new, mut_reads_old, mut_reads_new
def loadRepReads(new_id):
    """Fetch the representative clean reads of an oligo from the first new sample.

    Args:
        new_id: Oligo id; used to locate the ``_mappedindelprofiles.txt`` file
            under ``new_dirs[0]``.

    Returns:
        The dict populated by ``fetchRepresentativeCleanReads``.
    """
    subdir, profilefilename = getFileForOligoIdx(
        getOligoIdxFromId(new_id), ext='_mappedindelprofiles.txt')
    path_parts = [
        getHighDataDir(), new_dirs[0], 'mapped_reads', subdir, profilefilename
    ]
    profile_file = '/'.join(path_parts)

    rep_reads = {}
    fetchRepresentativeCleanReads(profile_file, rep_reads, oligoid=new_id)
    return rep_reads
def runAnalysis():
    """Run the per-oligo indel pie-summary analyses for 'K562 New' and 'DPI7'.

    Both runs read ``indel_details/indel_pie_summaries_per_oligo`` for the
    'Real Guides' partition via ``analyseResultsPerPartition``; they differ in
    the per-result functions, the all-results functions and the sample.
    """

    def buildSpec(per_result_funcs, all_results_funcs, sample):
        # Result files are '<dirname>.txt', so the reverse mapping strips the
        # leading path and the 4-character extension.
        return {
            'results_dir':
            getHighDataDir() + '/indel_details/indel_pie_summaries_per_oligo',
            'dirname_to_result_fn': lambda x: '%s.txt' % x,
            'result_to_dirname_fn': lambda x: x.split('/')[-1][:-4],
            'py_func_load': loadData,
            'py_funcs_per_result': per_result_funcs,
            'py_funcs_all_results': all_results_funcs,
            'check_output_fn': lambda x: True,
            'reads_colname': 'Total reads',
            'min_reads': MIN_READS,
            'id_colname': 'Oligo Id',
            'partitions': ['Real Guides'],
            'samples': [sample],
        }

    k562_spec = buildSpec(
        [(perOligoCounts, 'perOligoCounts'), (perOligoMCI, 'perOligoMCI'),
         (computePercentages, 'PercData')],
        [plotSumPie, plotMCIPie, plotPercCorrelations],
        'K562 New')
    analyseResultsPerPartition(k562_spec)

    dpi7_spec = buildSpec([(computePieData, 'PieData')],
                          [plotBarSummaryPieIndels],
                          'DPI7')
    analyseResultsPerPartition(dpi7_spec)
def runAnalysis():
    """Run the combined I1 / most-common-indel analysis for 'K562 New'.

    The spec passed to ``analyseResultsPerPartition`` lists two result
    directories (``i1/i1_summaries`` and
    ``indel_details/indel_pie_summaries_per_oligo``) that share the same
    dirname <-> result-file mapping, and restricts the run to the
    'Non-Targeting' partition.
    """
    results_specs = []
    for rel_dir in ('/i1/i1_summaries',
                    '/indel_details/indel_pie_summaries_per_oligo'):
        results_specs.append({
            'results_dir': getHighDataDir() + rel_dir,
            'dirname_to_result_fn': lambda x: '%s.txt' % x,
            'result_to_dirname_fn': lambda x: x.split('/')[-1][:-4],
        })

    all_results_funcs = [
        plotDominantBars, plotDominantPieDataWithAmbig,
        plotMergedPieDataWithAmbig, plotMergedI1Repeats
    ]
    spec = {
        'results_specs': results_specs,
        'py_func_load': loadI1andMCIData,
        'py_funcs_per_result': [(mergeWithIndelData, 'i1IndelData')],
        'py_funcs_all_results': all_results_funcs,
        'check_output_fn': lambda x: True,
        'reads_colname': 'Total reads',
        'min_reads': MIN_READS,
        'id_colname': 'Oligo Id',
        'partitions': ['Non-Targeting'],
        'samples': ['K562 New'],
    }
    analyseResultsPerPartition(spec)
def loadProfilesSeparately(old_id, new_id):
    """Load old/new profiles of a guide pair, split into two sample groups.

    Each library gets two profile dicts and two mutated-read counters. A
    sample is accumulated into slot 0 when its full directory path contains
    '800', and into slot 1 otherwise. New-library samples are read first.

    Returns:
        Tuple ``(p_olds, p_news, old_sep_mr, new_sep_mr)`` of two-element lists.
    """
    old_file = getSummaryFileSuffix(old_id)
    new_file = getSummaryFileSuffix(new_id)
    p_olds, p_news = [{}, {}], [{}, {}]
    old_sep_mr, new_sep_mr = [0, 0], [0, 0]

    def accumulate(sample_dirs, summary_file, oligo_id, profiles, mut_reads):
        # Adds every sample's summary to the slot chosen from its full path.
        for sample in sample_dirs:
            full_dir = getHighDataDir() + '/' + sample
            slot = 0 if '800' in full_dir else 1
            acc, _, null = readSummaryToProfile(
                full_dir + '/mapped_reads/' + summary_file,
                profiles[slot],
                oligoid=oligo_id)
            mut_reads[slot] += acc - null

    accumulate(new_dirs, new_file, new_id, p_news, new_sep_mr)
    accumulate(old_dirs, old_file, old_id, p_olds, old_sep_mr)
    return p_olds, p_news, old_sep_mr, new_sep_mr
def loadLocationSpacerLookup():
    """Map each Overbeek guide id to its location, spacer and primer.

    Reads the tab-delimited ``overbeek_2016_guides_s1.txt`` from the high data
    directory.

    Returns:
        Dict keyed by ``'Overbeek<N>'`` (N taken from the 'Spacer ' column,
        whose header includes a trailing space) with values
        ``(genomic location, spacer sequence, sgRNA primer)``.
    """
    # The context manager closes the file even if a row is malformed.
    with io.open(getHighDataDir() + '/overbeek_2016_guides_s1.txt') as f:
        reader = csv.DictReader(f, delimiter='\t')
        # NOTE(review): eval() on file content is unsafe for untrusted files;
        # kept because the accepted cell format is not visible here.
        return {
            'Overbeek%d' % eval(row['Spacer ']):
            (row['Genomic location of spacer (hg19)'], row['Spacer sequence'],
             row['sgRNA primer'])
            for row in reader
        }
def prepareExample(example_dir):
    """Generate indels, per-indel read counts and features for an example dir.

    Sets the high data directory to ``example_dir`` and then, for the new and
    the old target FASTA files in turn: runs the indel generator executable,
    compiles read counts per generated indel and computes features per
    generated indel. The reads and features directories are registered with
    ``setReadsDir`` / ``setFeaturesDir`` once both libraries are processed.

    Args:
        example_dir: Directory holding the example data.
    """
    setHighDataDir(example_dir)

    #Generate all possible indels
    new_gen_dir = getHighDataDir() + '/generated_indels_new'
    old_gen_dir = getHighDataDir() + '/generated_indels_old'
    for gen_dir in (new_gen_dir, old_gen_dir):
        if not os.path.isdir(gen_dir):
            os.makedirs(gen_dir)
    fasta_and_dir = (('/exp_target_pam_new.fasta ', new_gen_dir),
                     ('/exp_target_pam_old.fasta ', old_gen_dir))
    for fasta_suffix, gen_dir in fasta_and_dir:
        cmd = getIndelGenExe() + ' ' + getHighDataDir() + fasta_suffix + gen_dir + '/'
        print(cmd)
        os.system(cmd)

    #Compile number of reads per sample for each indel
    reads_dir = getHighDataDir() + '/reads_for_gen_indels'
    for gen_dir, samples in ((new_gen_dir, new_dirs), (old_gen_dir, old_dirs)):
        compileGenIndelReads(gen_indel_dir=gen_dir,
                             out_dir=reads_dir,
                             sample_dirs=samples)
    setReadsDir(reads_dir)

    #Compute features for each indel
    features_dir = getHighDataDir() + '/features_for_gen_indels'
    for gen_dir in (new_gen_dir, old_gen_dir):
        computeFeaturesForGenIndels(gen_indel_dir=gen_dir,
                                    out_dir=features_dir)
    setFeaturesDir(features_dir)
def loadMappings():
    """Load the Overbeek-guide to oligo mapping.

    Reads the tab-delimited ``overbeek_to_oligo_mapping.txt`` from the high
    data directory. Column 0 supplies the guide number (its last
    whitespace-separated token), column 1 the oligo id (text before the first
    underscore) and column 2 marks old-library oligos with the value 'Old'.

    Returns:
        Dict mapping ``'Overbeek<N>'`` to a list of ``(oligo_id, is_old)``
        tuples in file order.
    """
    mappings = {}
    # The context manager closes the file even if a row is malformed.
    with io.open(getHighDataDir() + '/overbeek_to_oligo_mapping.txt') as f:
        for toks in csv.reader(f, delimiter='\t'):
            overbeek_id = 'Overbeek' + toks[0].split()[-1]
            oligo_id = toks[1].split('_')[0]
            old = (toks[2] == 'Old')
            mappings.setdefault(overbeek_id, []).append((oligo_id, old))
    return mappings
def runAnalysis():
    """Plot the heat map of comparison data for 'Real Guides' in DPI7 samples.

    The guide partitions are loaded, then for the 'Real Guides' partition the
    comparison data restricted to the 'DPI7' sample selector is loaded with
    ``loadAllData`` and passed to ``plotHeatMap`` under the label
    'Real Guides DPI7'.
    """
    oligo_dir = getHighDataDir() + '/ST_June_2017/data'
    partitions = partitionGuides(oligo_detail_dir=oligo_dir)
    sample_name = 'DPI7'
    for part_desc in ['Real Guides']:
        selector = getSampleSelectors()[sample_name]
        guideset = partitions[part_desc]
        desc = part_desc + ' DPI7'
        heat_data = loadAllData(guideset, sample_selector=selector, label=desc)
        plotHeatMap(heat_data, label=desc)
def _writeMhIndelLines(fout, indel_lines, oligo_stats):
    """Write each collected indel line followed by its oligo summary columns."""
    oligo_line = u'%d\t%d\t%s\t%d\t%d\t%d\t%d' % oligo_stats
    for indel_line in indel_lines:
        fout.write(u'%s\t%s\n' % (indel_line, oligo_line))


def collectMhOfLen(filename, mh_len, fout):
    """Write all microhomology indels of length ``mh_len`` found in ``filename``.

    The tab-delimited input has a header line per oligo starting with '@@@'
    (``@@@<oligo id>:<accepted reads>:<accepted non-null reads>``) followed by
    one line per microhomology: left, right, MH length, indel, ..., reads.
    Each MH of the requested length yields one output line, extended with
    oligo-level columns describing the MH indel with the most reads for that
    oligo, whatever its length.

    Args:
        filename: Path of the per-sample microhomology file to read.
        mh_len: Microhomology length to keep.
        fout: Open text handle to write to. Not closed here.
    """
    det = loadAllOligoDetails(oligo_detail_dir=getHighDataDir() +
                              '/ST_June_2017/data')
    oligo_details = {'Oligo' + x.split('_')[-1]: val for x, val in det.items()}

    indels_to_write = []
    max_reads, len_mh_max_reads, left_max_reads, right_max_reads, max_indel = 0, -1, -1, -1, ''

    # NOTE(review): eval() below parses numeric fields from the input file; it
    # is unsafe on untrusted files and is kept only because the accepted field
    # formats are not visible here.
    # The context manager closes the input, which was previously never closed.
    with io.open(filename) as f:
        #Collect indels of the right length, write out with details of MH
        #indel with max reads for that oligo
        for toks in csv.reader(f, delimiter='\t'):
            #Next Oligo (write out last)
            if toks[0][:3] == '@@@':
                if indels_to_write:
                    _writeMhIndelLines(
                        fout, indels_to_write,
                        (accpt_reads, accpt_nonnull_reads, max_indel,
                         max_reads, len_mh_max_reads, left_max_reads,
                         right_max_reads))
                ctoks = toks[0][3:].split(':')
                oligo_id = ctoks[0]
                target = oligo_details[oligo_id]['Target']
                accpt_reads, accpt_nonnull_reads = eval(ctoks[1]), eval(ctoks[2])
                max_reads, len_mh_max_reads, left_max_reads, right_max_reads, max_indel = 0, -1, -1, -1, ''
                indels_to_write = []
                continue

            #MH details, collect MH's of correct length, and also track
            #details of MH indel with max reads
            left, right, c_mh_len = eval(toks[0]), eval(toks[1]), eval(toks[2])
            indel, reads = toks[3], eval(toks[-1])
            l_mh_seq = target[left:left + c_mh_len]
            r_mh_seq = target[right:right + c_mh_len]
            assert (l_mh_seq == r_mh_seq)
            gc_content = sum([x in ['G', 'C']
                              for x in l_mh_seq]) * 100.0 / len(l_mh_seq)
            # The max-reads MH is tracked across all lengths, so this check
            # must come before the length filter.
            if reads > max_reads:
                max_reads, len_mh_max_reads, left_max_reads, right_max_reads, max_indel = reads, c_mh_len, left, right, indel
            if c_mh_len != mh_len:
                continue
            indels_to_write.append(
                u'%s\t%s\t%d\t%d\t%d\t%.1f' %
                (oligo_id, indel, reads, left, right, gc_content))

    #Write last Oligo (if needed)
    if indels_to_write:
        _writeMhIndelLines(fout, indels_to_write,
                           (accpt_reads, accpt_nonnull_reads, max_indel,
                            max_reads, len_mh_max_reads, left_max_reads,
                            right_max_reads))
def runAnalysis():
    """Run the microhomology-mismatch analysis for 'K562 New'.

    Builds the spec for the ``mh_mismatch_indel_frequencies`` results
    ('Non-Targeting' partition, read filter on 'Orig Non-Null Reads') and
    passes it to ``analyseResultsPerPartition``.
    """
    results_dir = (getHighDataDir() +
                   '/microhomology_mismatch/mh_mismatch_indel_frequencies')

    def toResultName(dirname):
        return '%s.txt' % dirname

    def toDirname(result_path):
        # Drop the leading path and the 4-character extension.
        return result_path.split('/')[-1][:-4]

    def alwaysValid(_):
        return True

    spec = {
        'results_dir': results_dir,
        'dirname_to_result_fn': toResultName,
        'result_to_dirname_fn': toDirname,
        'py_func_load': loadData,
        'py_funcs_per_result': [(passData, 'Data')],
        'py_funcs_all_results': [plotMicrohomologyMismatches],
        'check_output_fn': alwaysValid,
        'reads_colname': 'Orig Non-Null Reads',
        'min_reads': MIN_READS,
        'id_colname': 'Oligo ID',
        'partitions': ['Non-Targeting'],
        'samples': ['K562 New'],
    }
    analyseResultsPerPartition(spec)
def compileGenIndelReads(gen_indel_dir='generated_indels',
                         out_dir='reads_for_gen_indels_all_samples',
                         sample_dirs=None):
    """Write, per oligo, the read count of every generated indel in each sample.

    For each file in ``gen_indel_dir`` the oligo's mapped indel summary is
    loaded from every sample directory, and a
    ``<oligo id>_gen_indel_reads.txt`` file is written under ``out_dir``. It
    holds the first line of the generated-indel file, a header row, an
    'All Mutated' row of per-sample mutated-read totals, and one row per
    generated indel with its read count in each sample (0 when absent).

    Args:
        gen_indel_dir: Directory of generated-indel files, named
            ``<oligo id>_...``.
        out_dir: Output directory; created if missing (parent must exist).
        sample_dirs: Sample directories relative to the high data directory;
            defaults to no samples.
    """
    # None instead of a shared mutable default list.
    sample_dirs = [] if sample_dirs is None else sample_dirs

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    for gen_file in os.listdir(gen_indel_dir):
        oligo_id = gen_file.split('_')[0]
        oligo_idx = getOligoIdxFromId(oligo_id)
        oligo_subdir, sum_filename = getFileForOligoIdx(
            oligo_idx, ext='_mappedindelsummary.txt')
        out_subdir = out_dir + '/' + oligo_subdir
        if not os.path.isdir(out_subdir):
            os.mkdir(out_subdir)

        #Read all profiles for this oligo
        profiles, mut_read_totals = [], []
        for dirname in sample_dirs:
            profiles.append({})
            filename = getHighDataDir() + '/' + dirname + '/mapped_reads/' + oligo_subdir + '/' + sum_filename
            stats = readSummaryToProfile(filename,
                                         profiles[-1],
                                         oligoid=oligo_id)
            mut_read_totals.append('%d' % (stats[0] - stats[2]))

        #Compile reads for each indel across all samples
        # Context managers close both files even if a row fails to parse.
        with io.open(gen_indel_dir + '/' + gen_file) as f:
            with io.open(out_subdir + '/%s_gen_indel_reads.txt' % oligo_id,
                         'w') as fout:
                fout.write(f.readline())  #Git commit
                fout.write(u'Indel\tDetails\t%s\n' %
                           '\t'.join([getDirLabel(x) for x in sample_dirs]))
                fout.write(u'All Mutated\t[]\t%s\n' %
                           '\t'.join(mut_read_totals))
                for toks in csv.reader(f, delimiter='\t'):
                    indel, indel_details = toks[0], toks[2]
                    read_str = '\t'.join(
                        ['%d' % p1.get(indel, 0) for p1 in profiles])
                    fout.write(u'%s\t%s\t%s\n' %
                               (indel, indel_details, read_str))
def computeFeaturesForGenIndels(gen_indel_dir='generated_indels',
                                out_dir='features_for_gen_indels'):
    """Compute a feature file for every generated-indel file in a directory.

    For each file in ``gen_indel_dir`` the oligo's details are looked up and
    ``calculateFeaturesForGenIndelFile`` is called with the target sequence
    and cut site. For oligos whose 'PAM Direction' is 'REVERSE' the target is
    reverse-complemented and the cut site is ``79 - PAM location - 3``;
    otherwise the cut site is ``PAM location - 3``.

    Args:
        gen_indel_dir: Directory of generated-indel files, named
            ``<oligo id>_...``.
        out_dir: Output directory; created if missing (parent must exist).
    """
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    #Load Oligo details
    raw_details = loadAllOligoDetails(oligo_detail_dir=getHighDataDir() +
                                      '/ST_June_2017/data')
    # Generated-indel file names use oligo ids without underscores.
    oligo_details = {}
    for full_id, detail_row in raw_details.items():
        oligo_details[full_id.replace('_', '')] = detail_row

    for gen_file in os.listdir(gen_indel_dir):
        print(gen_file)
        oligo_id = gen_file.split('_')[0]
        oligo_subdir, _ = getFileForOligoIdx(getOligoIdxFromId(oligo_id),
                                             ext='')
        out_subdir = out_dir + '/' + oligo_subdir
        if not os.path.isdir(out_subdir):
            os.mkdir(out_subdir)

        row = oligo_details[oligo_id]
        is_reverse = (row['PAM Direction'] == 'REVERSE')
        if is_reverse:
            uncut_seq = Bio.Seq.reverse_complement(row['Target'])
            cut_site = 79 - eval(row['PAM Location']) - 3
        else:
            uncut_seq = row['Target']
            cut_site = eval(row['PAM Location']) - 3

        generated_indel_file = gen_indel_dir + '/' + gen_file
        out_file = out_subdir + '/%s_gen_indel_features.txt' % oligo_id
        calculateFeaturesForGenIndelFile(generated_indel_file,
                                         uncut_seq,
                                         cut_site,
                                         out_file,
                                         is_reverse=is_reverse)
def runAnalysis():
    """Box-plot the KL columns of ``old_new_kl_summaries.txt`` by library type.

    Missing values are filled with -1.0. Columns whose name contains 'KL'
    (excluding 'Class KL' and 'Old v Old') are grouped by ``renameCol`` into
    'Within Library' and 'Between Library' and drawn as interleaved box
    plots. The figure is saved as 'kl_compare_old_new_KL'.
    """
    summaries = pd.read_csv(getHighDataDir() + '/old_new_kl_summaries.txt',
                            sep='\t')
    data = summaries.fillna(-1.0)

    kl_cols = []
    for col_name in data.columns:
        if 'KL' not in col_name:
            continue
        if 'Class KL' in col_name or 'Old v Old' in col_name:
            continue
        kl_cols.append(col_name)

    max_kl = 9
    PL.figure(figsize=(2.5, 4))
    box_types = [('C2', 'Within Library'), ('C0', 'Between Library')]
    bps = []
    for offset, (clr, box_type) in enumerate(box_types):
        col_box_data = [
            data[col] for col in kl_cols if renameCol(col) == box_type
        ]
        # Interleave the two box types: each gets every second x position.
        pos = [2 * k + offset + 1 for k in range(len(col_box_data))]
        print('KL', box_type, np.median(col_box_data, axis=1))
        boxes = PL.boxplot(col_box_data,
                           positions=pos,
                           patch_artist=True,
                           boxprops=dict(facecolor=clr),
                           showfliers=False)
        bps.append(boxes)

    PL.xticks([1.5, 3.5, 5.5],
              ['Same\ngRNA', 'Other\ngRNA', 'Other\ngRNA\n(Rpt)'])
    for divider_x in (2.5, 4.5):
        PL.plot([divider_x, divider_x], [0, max_kl], '-', color='silver')
    PL.xlim((0.5, 6.5))
    PL.ylim((0, max_kl))
    PL.ylabel('KL')
    PL.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.25)
    legend_handles = [bp["boxes"][0] for bp in bps]
    legend_labels = [x[1] for x in box_types]
    PL.legend(legend_handles, legend_labels, loc='upper left')
    PL.show(block=False)
    saveFig('kl_compare_old_new_KL')
# Configure environment #---------------------------------------------------------------------- setRunLocal(True) setHighDataDir('/results/endogenous_processing_example/') setPythonCmd('python') setIndelMapExe('/usr/local/bin/indelmap') #---------------------------------------------------------------- # Processing of raw Van-Overbeek et al reads to produce descriptions of indels #---------------------------------------------------------------- #Note: This provides a demonstration on just 1 oligo, going from raw overbeek reads to indel descriptions. # Sam files are assumed to be already collected (for further details of this part see # collect_overbeek_sams.py in same dir) printStatus('Create fastq files from Van Overbeek sam files') sam_dir, fastq_dir = getHighDataDir() + '/overbeek_sam_files', getHighDataDir( ) + '/overbeek_fastq_files' if not os.path.isdir(fastq_dir): os.makedirs(fastq_dir) extractReads(sam_dir + '/Overbeek_6.sam', fastq_dir + '/Overbeek6.fastq', 'chrX:66765045-66765067', 'Overbeek6') sam_dir, fastq_dir = getHighDataDir( ) + '/overbeek_control_sam_files', getHighDataDir( ) + '/overbeek_control_fastq_files' if not os.path.isdir(fastq_dir): os.makedirs(fastq_dir) extractReads(sam_dir + '/Overbeek_6.sam', fastq_dir + '/Overbeek6.fastq', 'chrX:66765045-66765067', 'Overbeek6') printStatus('Compute mutational profile from Van Overbeek data') createOverbeekTemplates(selected_id='Overbeek6') computeOverbeekIndelProfiles(highdir=getHighDataDir(), selected_id='Overbeek6')
def loadValidationPairs():
    """Load the old/new oligo id pairs used for validation.

    Reads the tab-delimited ``old_new_validation_guides.txt`` from the high
    data directory.

    Returns:
        List of ``[old_oligo_id, new_oligo_id]`` lists, in file order.
    """
    # The context manager closes the file even if a row lacks a column.
    with io.open(getHighDataDir() + '/old_new_validation_guides.txt') as f:
        return [[row['Old Oligo Id'], row['New Oligo Id']]
                for row in csv.DictReader(f, delimiter='\t')]
def plotMicrohomologyMismatches(all_result_outputs, label=''):
    """Plot read ratios of mismatched against perfect microhomology deletions.

    Merges the per-sample outputs, keeps oligos with a non-zero MH size whose
    guide is matched, converts indel read counts into percentages of non-null
    reads, and draws four scatter comparisons in a 2x2 figure (saved as
    'mm_mismatch_all'). The third comparison ('Orig Indel' vs 'All Mut') is
    also drawn alone in a second figure (saved as 'mm_mismatch_one').

    Args:
        all_result_outputs: Per-sample results passed on to ``mergeSamples``.
        label: Not used in this function.
    """
    mut_hdrs = ['Left Mut', 'Right Mut','Merged Mut1', 'Merged Mut2']
    cols_to_sum = [x + ' Indel Reads in Mut' for x in mut_hdrs] + ['Orig Indel Reads in Orig', 'Mut Non-Null Reads', 'Orig Non-Null Reads']
    common_cols = ['Oligo ID','Mapped Oligo Id','Num Mismatches','Orig MH','Left Mut-MH','Right Mut-MH','Merged Mut 1 MH','Merged Mut 2 MH','Orig Indel','Left Mut-MH Indel','Right Mut-MH Indel','Merge Mut 1 Indel','Merge Mut 2 Indel']
    # The merged frame is read below through columns named '<col> Sum'.
    data = mergeSamples(all_result_outputs, cols_to_sum, merge_on=common_cols)

    # Accessors for the 'L', 'R' and 'C' entries of a tokenised indel's third element.
    getLeft = lambda indel: tokFullIndel(indel)[2]['L']
    getRight = lambda indel: tokFullIndel(indel)[2]['R']
    getMHSize = lambda indel: tokFullIndel(indel)[2]['C']

    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    oligo_data['Guide is matched'] = oligo_data.apply(isMatched, axis=1)

    # True for oligos whose 'PAM Direction' is 'REVERSE'.
    reverse_lookup = {x: y == 'REVERSE' for (x,y) in zip(oligo_data['ID'],oligo_data['PAM Direction'])}
    is_reverse = lambda x: reverse_lookup[x]

    data = pd.merge(data, oligo_data[['ID','Guide is matched']], left_on='Oligo ID', right_on='ID', how='inner')

    data['MH Size'] = data['Orig Indel'].apply(getMHSize)
    # Keep only real microhomologies (size != 0) on matched guides.
    data = data.loc[(data['MH Size'] != 0) & (data['Guide is matched'])]
    data['MH Left Loc'] = data['Orig Indel'].apply(getLeft) + data['MH Size']
    data['MH Right Loc'] = data['Orig Indel'].apply(getRight) - data['MH Size']
    data['Is Reverse'] = data['Oligo ID'].apply(is_reverse)

    # For reverse oligos the left and right read counts of each header pair are swapped.
    for hdrL,hdrR in [mut_hdrs[:2], mut_hdrs[2:]]:
        data[hdrL + ' Reads'] = data['Is Reverse']*data[hdrR + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrL + ' Indel Reads in Mut Sum']
        data[hdrR + ' Reads'] = data['Is Reverse']*data[hdrL + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrR + ' Indel Reads in Mut Sum']
        data[hdrL + ' Reads Ratio'] = data[hdrL + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
        data[hdrR + ' Reads Ratio'] = data[hdrR + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
    data['Orig Indel Reads Ratio'] = data['Orig Indel Reads in Orig Sum']*100.0/data['Orig Non-Null Reads Sum']
    data['All Mut Reads Ratio'] = (data[[x + ' Reads' for x in mut_hdrs]].sum(axis=1))*100.0/data['Mut Non-Null Reads Sum']
    data['MH Dist'] = data['MH Right Loc'] - data['MH Left Loc']
    data['1st Mismatch'] = data.apply(getMismatch, axis=1)
    data['Last Mismatch'] = data.apply(getLastMismatch, axis=1)
    data['MH GC Content'] = data.apply(getMhGC, axis=1)

    # (x, y) column prefixes of the four scatter panels, in subplot order.
    mh_indel_types = [('Orig Indel','Left Mut'), ('Orig Indel','Right Mut'), ('Orig Indel','All Mut'),('Left Mut','Right Mut') ]

    label_lookup = {'Orig Indel': 'Perc. mutated reads of corresponding microhomology-\nmediated deletion with no sequence mismatches',
                    'Left Mut': 'Perc. mutated reads of mismatched microhomology-\nmediated deletion with retained left sequence',
                    'Right Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion with retained right sequence',
                    'All Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion (All)' }

    fig1 = PL.figure(figsize=(4,4))
    fig_all = PL.figure(figsize=(10,10))
    for i, (mh_typex, mh_typey) in enumerate(mh_indel_types):
        # Only the third comparison (i == 2) is drawn a second time into fig1.
        figs = [(fig_all, True), (fig1,False)] if i==2 else [(fig_all, True)]
        for fig, is_all in figs:
            PL.figure(fig.number)
            if is_all: PL.subplot(2,2,i+1)
            # One series per mismatch count (1 and 2).
            for nm,clr in zip([1,2],['royalblue','orange']):
                nm_data = data.loc[data['Num Mismatches'] == nm]

                sty, lsty = 'o', '-'
                # Restrict to microhomology sizes 6..15 inclusive.
                sel_data = nm_data.loc[(nm_data['MH Size'] >= 6) & (nm_data['MH Size'] <= 15)]

                PL.plot(sel_data[mh_typex + ' Reads Ratio'], sel_data[mh_typey + ' Reads Ratio'], sty, color=clr, markersize=4, label='No. MH Mismatches=%d' % (nm))
                rx, ry, grad = getRegrLine(sel_data[[mh_typex + ' Reads Ratio']], sel_data[[mh_typey + ' Reads Ratio']])
                # The gradient is printed only for the stand-alone figure.
                if not is_all: print(grad, nm, mh_typex, mh_typey)
                # No regression line on the fourth panel ('Left Mut' vs 'Right Mut').
                if i != 3: PL.plot(rx, ry, lsty, color=clr, linewidth=2)
            PL.xlabel(label_lookup[mh_typex])
            PL.ylabel(label_lookup[mh_typey])
            PL.xlim((0,80))
            PL.ylim((0,80))
            PL.plot([0,80],[0,80],'k--')
            PL.legend()
            PL.show(block=False)
    # fig_all is still the current figure here: the last iteration only used it.
    saveFig('mm_mismatch_all')
    PL.figure(fig1.number)
    saveFig('mm_mismatch_one')
def runAnalysis():
    """Plot the KL boxes and in-frame correlation of the predicted summaries.

    Loads ``old_new_kl_predicted_summaries.txt`` from the high data directory,
    fills missing values with -1.0, and passes the table to ``plotKLBoxes``
    and then ``plotInFrameCorr``.
    """
    summary_path = getHighDataDir() + '/old_new_kl_predicted_summaries.txt'
    summaries = pd.read_csv(summary_path, sep='\t')
    summaries = summaries.fillna(-1.0)
    for plot_fn in (plotKLBoxes, plotInFrameCorr):
        plot_fn(summaries)
def computeAndComparePredicted(theta_file,
                               selected_id=None,
                               out_dir='.',
                               start_count=0,
                               end_count=10000):
    """Compare measured old/new profiles with the model-predicted profile.

    For every validation guide pair the old, new and combined measured
    profiles, the separately loaded sample-group profiles and the predicted
    profile are computed. One row per pair, holding mutated read counts,
    in-frame percentages and symmetric KL values, is written to
    ``<out_dir>/old_new_kl_predicted_summaries.txt``.

    Note: "old" refers to the conventional scaffold library, "new" to the
    improved scaffold library.

    Args:
        theta_file: Model parameter file passed to ``readTheta``.
        selected_id: Optional old oligo id. When given, only that pair is
            processed and its profiles are also plotted.
        out_dir: Directory in which the summary file is written.
        start_count: Unused; kept for backward compatibility.
        end_count: Unused; kept for backward compatibility.

    Raises:
        Exception: If a validation oligo is also in the training set.
    """
    theta, train_set, feature_columns = readTheta(theta_file)
    new_sep_labels = 'New 2x800x', 'New 1600x'
    old_sep_labels = 'Old 2x800x', 'Old 1600x'
    sep_labels = new_sep_labels + old_sep_labels

    # The file name has no format placeholders, so the former
    # "% (start_count, end_count)" always raised TypeError. The plain name is
    # also what the downstream analysis reads.
    # The context manager closes the file even when a guide pair fails.
    with io.open(out_dir + '/old_new_kl_predicted_summaries.txt', 'w') as fout:
        fout.write(
            u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\t'
        )
        fout.write(u'\t'.join('%s Mut Reads' % x.split('/')[-1]
                              for x in sep_labels))
        fout.write(
            u'\tOld In Frame Perc\tNew In Frame Perc\tCombined in Frame Perc\tPredicted In Frame Per\t'
        )
        fout.write(u'\t'.join('%s In Frame Perc' % x.split('/')[-1]
                              for x in sep_labels))
        fout.write(
            u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\t'
        )
        fout.write(u'\t'.join('%s vs Predicted KL' % x.split('/')[-1]
                              for x in sep_labels) + '\t')
        fout.write(u'\t'.join([
            '%s vs %s KL' % (x.split('/')[-1], y.split('/')[-1])
            for x, y in (getCombs(new_sep_labels) + getCombs(old_sep_labels))
        ]) + '\n')

        id_pairs = loadValidationPairs()
        for (old_id, new_id) in id_pairs:
            if old_id in train_set or new_id in train_set:
                raise Exception('Bad!!! Testing on Training data: %s %s' %
                                (old_id, new_id))

            if selected_id is not None and selected_id != old_id:
                continue  #Guide pair selected for plotting

            #Load Old and new profiles, and produce combined profile from the two
            p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(
                old_id, new_id)
            p_comb, mut_reads_comb = combineProfiles(p_old, p_new,
                                                     mut_reads_old,
                                                     mut_reads_new)

            #Predict the profile (old and new will be the same so just do one)
            feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
            p_predict, _ = computePredictedProfile(feature_data, theta,
                                                   feature_columns)

            #Load separate profiles too
            p_old_sep, p_new_sep, old_sep_mr, new_sep_mr = loadProfilesSeparately(
                old_id, new_id)

            #Compute in frame percentages
            old_if_perc = getInFramePerc(p_old)
            new_if_perc = getInFramePerc(p_new)
            comb_if_perc = getInFramePerc(p_comb)
            pred_if_perc = getInFramePerc(p_predict)
            # Separate profiles with at most one entry are reported as -1.
            new_sep_if_percs = [
                getInFramePerc(profile) if len(profile) > 1 else -1
                for profile in p_new_sep
            ]
            old_sep_if_percs = [
                getInFramePerc(profile) if len(profile) > 1 else -1
                for profile in p_old_sep
            ]

            #Plot the comparison
            if selected_id is not None:
                rrds = loadRepReads(new_id)
                plotProfiles([p_new_sep[0], p_new_sep[1], p_predict],
                             [rrds, rrds, rrds], [56, 56, 56],
                             [False, False, False],
                             ['Replicate 1', 'Replicate 2', 'Predicted'],
                             title='%s (KL=%.2f, KL=%.2f)' %
                             (new_id, symmetricKL(p_new_sep[0], p_new_sep[1]),
                              symmetricKL(p_new, p_predict)))

            str_args = (symmetricKL(p_old, p_new),
                        symmetricKL(p_old, p_predict),
                        symmetricKL(p_new, p_predict),
                        symmetricKL(p_comb, p_predict))
            kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f\t' % str_args
            kl_str += u'\t'.join([
                '%.5f' % symmetricKL(p_predict, x)
                for x in p_new_sep + p_old_sep
            ])
            kl_str += u'\t' + u'\t'.join([
                '%.5f' % symmetricKL(x, y)
                for (x, y) in (getCombs(p_new_sep) + getCombs(p_old_sep))
            ])
            if_str = u'\t'.join(
                ['%.3f' % x for x in new_sep_if_percs + old_sep_if_percs])
            mut_str = u'\t'.join(['%d' % x for x in new_sep_mr + old_sep_mr])
            fout.write(
                u'%s\t%s\t%d\t%d\t%d\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%s%s\n' %
                (old_id, new_id, mut_reads_old, mut_reads_new, mut_reads_comb,
                 mut_str, old_if_perc, new_if_perc, comb_if_perc, pred_if_perc,
                 if_str, kl_str))
            # Flush per pair so partial results survive a later failure.
            fout.flush()
def _plotOverbeekKLScatter(kls, above30_percentages, overbeek_ids, log_reads):
    """Scatter symmetric KL against log10 mutated reads, one series per
    band of 'percentage of deletions > 30', and save it as 'scatter_KL'."""
    PL.figure(figsize=(5.5, 5))
    for thr_l, thr_h in [(0.0, 10.0), (10.0, 20.0), (20.0, 50.0),
                         (50.0, 90.0), (90.0, 100.0)]:
        #Bands are (thr_l, thr_h], so a value of exactly 0.0 falls in no band
        in_band = [(reads, kl, overbeek_id)
                   for (kl, a30, overbeek_id, reads) in zip(
                       kls, above30_percentages, overbeek_ids, log_reads)
                   if thr_l < a30 <= thr_h]
        xdata = [reads for (reads, _, _) in in_band]
        ydata = [kl for (_, kl, _) in in_band]
        PL.plot(xdata,
                ydata,
                'o',
                label='%d-%d%% Deletions > 30' % (thr_l, thr_h))
        #Only label the outliers (high KL despite a reasonable read count)
        for x, y, overbeek_id in in_band:
            if y > 3 and x > 2:
                PL.text(x, y, overbeek_id)
    PL.legend()
    PL.plot([0, 6], [0.77, 0.77], '--', color='grey')
    PL.text(0.1, 0.5, 'Median between our replicates', color='grey')
    PL.ylabel('Symmetric KL Divergence', fontsize=12)
    PL.xlabel('Log10 Mutated Reads', fontsize=12)
    PL.xlim((0, 5.5))
    PL.ylim((0, 8))
    PL.show(block=False)
    saveFig('scatter_KL')


def _plotOverbeekKLHeatmap(sel_overbeek_ids, all_overbeek_profiles,
                           all_our_profiles):
    """Plot the pairwise symmetric KL between every endogenous profile and
    every synthetic profile as a heatmap and save it as 'heatmap_KL'."""
    N = len(sel_overbeek_ids)
    kl_mat = np.zeros((N, N))
    for i, endogenous_profile in enumerate(all_overbeek_profiles):
        for j, synthetic_profile in enumerate(all_our_profiles):
            kl_mat[i, j] = symmetricKL(endogenous_profile, synthetic_profile)
    PL.figure(figsize=(8, 6))
    PL.imshow(kl_mat,
              cmap='hot_r',
              vmin=0.0,
              vmax=3.0,
              interpolation='nearest')
    PL.xticks(range(N), sel_overbeek_ids, rotation='vertical', fontsize=6)
    PL.yticks(range(N), sel_overbeek_ids, rotation='horizontal', fontsize=6)
    PL.xlabel('Synthetic Measurement', fontsize=12)
    PL.ylabel('Endogenous Measurement', fontsize=12)
    PL.title('KL', fontsize=12)
    PL.colorbar()
    PL.show(block=False)
    saveFig('heatmap_KL')


def compareOverbeekProfiles(
        selected_overbeek_id=None,
        pred_results_dir='../indel_prediction/model_testing'):
    """Compare the Overbeek indel profiles against our mapped oligo profiles.

    For each id 'Overbeek1' .. 'Overbeek96' present in loadMappings(), the
    Overbeek profile is read, the profiles of the mapped oligos are
    accumulated over all directories in new_dirs / old_dirs, and the
    symmetric KL divergence between the two is recorded.

    Args:
        selected_overbeek_id: if given (e.g. 'Overbeek12'), only that id is
            processed and its per-replicate profiles are plotted with
            plotProfiles. If None, all ids are processed and the summary
            plots (in-frame comparison, KL scatter, KL heatmap) are produced.
        pred_results_dir: passed through to plotInFrame.
    """
    new_dirs = [
        'ST_June_2017/data/K562_800x_LV7A_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7A_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_LV7B_DPI5/mapped_reads/Oligos_71',
        'ST_Feb_2018/data/CAS9_12NA_1600X_DPI7/mapped_reads/Oligos_71'
    ]

    #Old Samples
    old_dirs = [
        'ST_June_2017/data/K562_1600x_6OA_DPI5/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_6OA_DPI7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI3_Old7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI7_Old8/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI10_Old9/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI3_Old10/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI7_Old11/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI10_Old12/mapped_reads/Oligos_71'
    ]

    remove_long_indels = False
    remove_wt, wt_thresh = True, 3.0
    mappings = loadMappings()

    #Several of these accumulators (the old/new ones) are filled but not
    #consumed by the plots below; they are kept as in the original code.
    all_overbeek_profiles, all_new_profiles, all_old_profiles, all_our_profiles = [], [], [], []
    sel_overbeek_ids, oldnew_overbeek_ids, old_ids, new_ids = [], [], [], []
    overbeek_inframes, ours_inframes, oof_sel_overbeek_ids = [], [], []
    kls, kls_old, kls_new, log_reads, overbeek_ids = [], [], [], [], []
    above30_percentages, log_reads_new, log_reads_old, min_log_reads = [], [], [], []

    for idx in range(1, 97):
        overbeek_id = 'Overbeek%d' % idx
        if selected_overbeek_id is not None and selected_overbeek_id != overbeek_id:
            continue
        if overbeek_id not in mappings:
            continue

        overbeek_filename = getHighDataDir(
        ) + '/overbeek_fastq_files/' + overbeek_id + '_mappedindelsummary.txt'

        p1, p1_new, p1_old, o1, rep_reads1, rep_reads2 = {}, {}, {}, {}, {}, {}
        nreads2, nreads1, nreads_old, nreads_new = 0, 0, 0, 0
        nnull_old, nnull_new, nnull1, nnull2 = 0, 0, 0, 0

        #Read the overbeek profile
        numread2, perc_accept2, num_null2 = readSummaryToProfile(
            overbeek_filename,
            o1,
            oligoid=overbeek_id,
            remove_long_indels=remove_long_indels,
            remove_wt=False)
        if selected_overbeek_id is not None:
            #Representative reads and PAM details are only used for plotting
            fetchRepresentativeCleanReads(
                getHighDataDir() + '/overbeek_fastq_files/' + overbeek_id +
                '_mappedindelprofiles.txt',
                rep_reads2,
                oligoid=overbeek_id)
            pam_loc2, pam_dir2 = getNullTargetPamDetails(
                getHighDataDir() + '/overbeek_control_fastq_files/' +
                overbeek_id + '_exptargets.txt',
                oligoid=overbeek_id)
        nreads2 += numread2
        nnull2 += num_null2

        if numread2 == 0:
            continue

        p1_new_reps, p1_old_reps = [{}, {}], [{}, {}]
        rr_new_reps, rr_old_reps = [{}, {}], [{}, {}]

        #Read all the new and old profiles
        pam_loc1, pam_dir1 = None, None
        for oligo_id, is_old in mappings[overbeek_id]:

            #Read all reads for all our K562 profiles
            #(int() rather than eval(): the suffix is only ever a number)
            oligo_idx = int(oligo_id[5:])
            _, oligo_fileprefix = getFileForOligoIdx(oligo_idx, ext='')
            oligo_filename = oligo_fileprefix + '_mappedindelsummary.txt'
            read_filename = oligo_fileprefix + '_mappedindelprofiles.txt'
            exptarget_filename = oligo_fileprefix + '_exptargets.txt'
            if is_old:
                oligo_dirs, p1_old_new, null_oligo_dir = old_dirs, p1_old, 'ST_April_2017/data/NULL_Old/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_old_reps, rr_old_reps
            else:
                oligo_dirs, p1_old_new, null_oligo_dir = new_dirs, p1_new, 'ST_April_2017/data/NULL_New/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_new_reps, rr_new_reps

            for oligo_dir in [getHighDataDir() + '/' + x for x in oligo_dirs]:
                summary_file = oligo_dir + '/' + oligo_filename
                #Accumulate into the old-only / new-only profile ...
                readSummaryToProfile(summary_file,
                                     p1_old_new,
                                     oligoid=oligo_id,
                                     remove_long_indels=remove_long_indels,
                                     remove_wt=remove_wt,
                                     wt_thresh=wt_thresh)
                #... and into the combined profile (counts taken from here)
                numread1, perc_accept1, num_null1 = readSummaryToProfile(
                    summary_file,
                    p1,
                    oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt,
                    wt_thresh=wt_thresh)
                is_dpi7 = 'DPI7' in oligo_dir
                if is_dpi7:
                    #DPI7 samples are split into two replicate slots:
                    #slot 0 for '800x' directories, slot 1 otherwise
                    rep_idx = 0 if '800x' in oligo_dir else 1
                    readSummaryToProfile(summary_file,
                                         p1_reps[rep_idx],
                                         oligoid=oligo_id,
                                         remove_long_indels=remove_long_indels,
                                         remove_wt=remove_wt,
                                         wt_thresh=wt_thresh)
                if selected_overbeek_id is not None:
                    fetchRepresentativeCleanReads(oligo_dir + '/' +
                                                  read_filename,
                                                  rep_reads1,
                                                  oligoid=oligo_id)
                    if is_dpi7:
                        fetchRepresentativeCleanReads(oligo_dir + '/' +
                                                      read_filename,
                                                      rr_reps[rep_idx],
                                                      oligoid=oligo_id)
                    if pam_loc1 is None:
                        pam_loc1, pam_dir1 = getNullTargetPamDetails(
                            getHighDataDir() + '/' + null_oligo_dir + '/' +
                            exptarget_filename,
                            oligoid=oligo_id)
                if is_old:
                    nreads_old += numread1
                    nnull_old += num_null1
                else:
                    nreads_new += numread1
                    nnull_new += num_null1
                nreads1 += numread1
                nnull1 += num_null1

        kls.append(symmetricKL(p1, o1, True))
        kls_old.append(symmetricKL(p1_old, o1, True))
        kls_new.append(symmetricKL(p1_new, o1, True))

        #+0.5 keeps log10 finite when there are no mutated reads
        log_reads.append(np.log10(nreads1 - nnull1 + 0.5))
        log_reads_old.append(np.log10(nreads_old - nnull_old + 0.5))
        log_reads_new.append(np.log10(nreads_new - nnull_new + 0.5))
        min_log_reads.append(min(log_reads_old[-1], log_reads_new[-1]))
        above30_percentages.append(computePercAbove30(o1))
        overbeek_ids.append(overbeek_id)

        if log_reads[-1] > 2.0:
            all_overbeek_profiles.append(o1)
            all_our_profiles.append(p1)
            sel_overbeek_ids.append(overbeek_id[8:])
            if above30_percentages[-1] < 50.0:
                oif, oof, _ = fetchIndelSizeCounts(o1)
                pif, pof, _ = fetchIndelSizeCounts(p1)
                overbeek_inframes.append(oif * 100.0 / (oif + oof))
                ours_inframes.append(pif * 100.0 / (pif + pof))
                oof_sel_overbeek_ids.append(overbeek_id)

        if min_log_reads[-1] > 2.0:
            all_new_profiles.append(p1_new)
            all_old_profiles.append(p1_old)
            oldnew_overbeek_ids.append(overbeek_id)
            old_ids.append([
                oligo_id for oligo_id, is_old in mappings[overbeek_id]
                if is_old
            ][0])
            new_ids.append([
                oligo_id for oligo_id, is_old in mappings[overbeek_id]
                if not is_old
            ][0])

        #overbeek_id is known to be in mappings here (checked above)
        print(overbeek_id, [x for (x, y) in mappings[overbeek_id]], kls[-1],
              nreads2, nreads1)

        if selected_overbeek_id is not None:
            title = '%s (KL=%.1f)' % (overbeek_id, kls[-1])
            labels = [
                'Conventional scaffold Rep A', 'Conventional scaffold Rep B',
                'Improved scaffold Rep A', 'Improved scaffold Rep B',
                'Endogenous Profile'
            ]
            #Fourth panel is replicate B of the improved scaffold: use
            #p1_new_reps[1] so the profile matches its reads (rr_new_reps[1])
            plotProfiles([
                p1_old_reps[0], p1_old_reps[1], p1_new_reps[0],
                p1_new_reps[1], o1
            ], [
                rr_old_reps[0], rr_old_reps[1], rr_new_reps[0],
                rr_new_reps[1], rep_reads2
            ], [pam_loc1, pam_loc1, pam_loc1, pam_loc1, pam_loc2], [
                x == 'REVERSE'
                for x in [pam_dir1, pam_dir1, pam_dir1, pam_dir1, pam_dir2]
            ],
                         labels,
                         title=title)

    if selected_overbeek_id is None:
        plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids,
                    pred_results_dir)

        _plotOverbeekKLScatter(kls, above30_percentages, overbeek_ids,
                               log_reads)

        print('Median=', np.median(kls), 'Mean KL=', np.mean(kls))
        print(len(overbeek_ids))

        #Compute pairwise KL between overbeek and ours
        _plotOverbeekKLHeatmap(sel_overbeek_ids, all_overbeek_profiles,
                               all_our_profiles)
def plotD1(all_result_outputs, label=''):
    """Plot summaries of single-nucleotide deletions (D1) that are the most
    common indel for their gRNA in all three replicates.

    Produces two figures:
      * 'pie_chart_D1': share of such D1 deletions that remove one
        nucleotide from a repeat of A, T, G or C spanning the cut site,
        versus all others.
      * 'D1_bar_3_rep': for guides with a doubled nucleotide either side of
        the cut site (and for all other guides), the percentage whose most
        common indel in all 3 replicates is a D1.

    Args:
        all_result_outputs: passed to mergeSamples (data_label='perOligoMCI').
        label: not used by this function.
    """
    merged = mergeSamples(all_result_outputs, [], data_label='perOligoMCI')
    merged['Equal MCI'] = (
        merged['Most Common Indel'] == merged['Most Common Indel 2']) & (
            merged['Most Common Indel'] == merged['Most Common Indel 3'])
    consistent = merged.loc[merged['Equal MCI']]

    #Note: type check discards equally most common indels
    d1_data = consistent.loc[consistent['MCI Type'] == 'D1']

    def spans_cutsite(indel):
        bounds = tokFullIndel(indel)[2]
        return bounds['L'] < -1 and bounds['R'] > 0

    def is_non_repeat(seq):
        return len(seq) < 2 or seq != (seq[0] * len(seq))

    #The same mask is needed for every nucleotide, so build it once
    spans_mask = d1_data['Most Common Indel'].apply(spans_cutsite)

    pie_vals, pie_labels = [], []
    for nt in 'ATGC':

        def is_nt_repeat(alt_seq, nt=nt):
            return len(alt_seq) >= 2 and alt_seq == (len(alt_seq) * nt)

        repeat_rows = d1_data.loc[
            d1_data['Altered Sequence'].apply(is_nt_repeat) & spans_mask]
        num_repeat_nt = len(repeat_rows)
        pie_vals.append(num_repeat_nt * 100.0 / len(d1_data))
        print(num_repeat_nt)
        pie_labels.append('Removal of %s\nfrom %s|%s' % (nt, nt, nt))

    other_rows = d1_data.loc[
        d1_data['Altered Sequence'].apply(is_non_repeat) | ~spans_mask]
    num_non_repeat = len(other_rows)
    pie_vals.append(num_non_repeat * 100.0 / len(d1_data))
    print(num_non_repeat)
    pie_labels.append('Removal from non-repeat')

    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=OLD_COLORS)
    PL.title(
        'Size 1 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (len(d1_data), len(merged)))
    PL.show(block=False)
    saveFig('pie_chart_D1')

    #Attach the guide sequence to every oligo (ids differ only by '_')
    oligo_details_file = getHighDataDir(
    ) + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv'
    oligo_data = pd.read_csv(oligo_details_file, sep='\t')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(
        lambda oligo_id: oligo_id.replace('_', ''))
    with_guides = pd.merge(merged,
                           oligo_data[['Oligo Id', 'Guide']],
                           how='inner',
                           on='Oligo Id')
    print(len(with_guides))

    def is_d1(indel):
        return indel.split('_')[0] == 'D1'

    def flanks_differ(row):
        return row['Guide'][-4] != row['Guide'][-3]

    nts = 'ATGC'
    percs, count_labels = [], []
    for nt in nts:

        def flanks_are_nt(row, nt=nt):
            return row['Guide'][-4:-2] == (nt + nt)

        dbl_data = with_guides.loc[with_guides.apply(flanks_are_nt, axis=1)]
        #Oligo28137: Corner case where a guide has CT|T and loses the C
        num_dbl_d1 = sum(dbl_data['Most Common Indel'].apply(is_d1)
                         & dbl_data['Equal MCI']
                         & (dbl_data['Oligo Id'] != 'Oligo28137'))
        percs.append(num_dbl_d1 * 100.0 / len(dbl_data))
        count_labels.append('%d/%d' % (num_dbl_d1, len(dbl_data)))
        print(len(dbl_data))

    non_dbl_data = with_guides.loc[with_guides.apply(flanks_differ, axis=1)]
    print(len(non_dbl_data))
    num_non_dbl_d1 = sum(non_dbl_data['Most Common Indel'].apply(is_d1)
                         & non_dbl_data['Equal MCI'])
    percs.append(num_non_dbl_d1 * 100.0 / len(non_dbl_data))
    count_labels.append('%d/%d' % (num_non_dbl_d1, len(non_dbl_data)))

    PL.figure()
    PL.bar(range(5), percs, width=0.8)
    for bar_idx, (perc, count_label) in enumerate(zip(percs, count_labels)):
        PL.text(bar_idx - 0.3, perc + 5.0, count_label)
    PL.xticks(range(5), [nt * 2 for nt in nts] + ['Other'])
    PL.ylim((0, 40))
    PL.xlabel('Nucleotides on either side of cut site')
    PL.ylabel(
        'Percent gRNAs with single nucleotide deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)
    saveFig('D1_bar_3_rep')
# NOTE(review): the statements down to setFeaturesDir(...) appear to be the
# tail of a function whose header lies outside this view (old_gen_dir,
# new_gen_dir, reads_dir and old_dirs are not defined here) - confirm the
# enclosing definition before relying on the indentation below.
    #Compile reads for the old generated indels into reads_dir and register it
    compileGenIndelReads(gen_indel_dir=old_gen_dir,
                         out_dir=reads_dir,
                         sample_dirs=old_dirs)
    setReadsDir(reads_dir)

    #Compute features for each indel
    features_dir = getHighDataDir() + '/features_for_gen_indels'
    computeFeaturesForGenIndels(gen_indel_dir=new_gen_dir,
                                out_dir=features_dir)
    computeFeaturesForGenIndels(gen_indel_dir=old_gen_dir,
                                out_dir=features_dir)
    setFeaturesDir(features_dir)


#Script entry point: runs the predicted-vs-measured example end to end
if __name__ == '__main__':

    #Configure the indelgen executable, plot output directory and figure type
    setIndelGenExe('/usr/local/bin/indelgen')
    setPlotDir('/results/plots')
    setFigType('png')

    #Work on a copy of the example data under /results
    #(shutil.copytree raises if the destination already exists)
    shutil.copytree('/data/predicted_vs_measured_example',
                    '/results/predicted_vs_measured_example')
    prepareExample('/results/predicted_vs_measured_example')

    #Predict mutations using pre-trained model and compare to actual (for one oligo only)
    theta_file = getHighDataDir(
    ) + '/model_output_10000_0.01000000_0.01000000_-0.607_theta.txt_cf0.txt'
    computeAndComparePredicted(theta_file,
                               selected_id='Oligo35785',
                               out_dir='.')
def _plotD2MotifPie(subplot_idx,
                    cat_data,
                    motifs,
                    motif_of_guide,
                    title,
                    label_fmt=None,
                    print_percs=False):
    """Draw one pie (in a 2x3 grid) of how the guides in cat_data split
    across motifs, where motif_of_guide(guide) extracts the guide's motif."""
    PL.subplot(2, 3, subplot_idx)
    pie_vals, pie_labels = [], []
    for motif in motifs:
        pie_labels.append(motif if label_fmt is None else label_fmt %
                          (motif, motif, motif))
        has_motif = cat_data['Guide'].apply(
            lambda guide, motif=motif: motif_of_guide(guide) == motif)
        pie_vals.append(len(cat_data.loc[has_motif]))
    if print_percs:
        total = sum(pie_vals)
        for pie_label, cnt in zip(pie_labels, pie_vals):
            #Guard against an empty category (no guides at all)
            print(pie_label, cnt * 100 / total if total > 0 else 0)
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('%s\n(%d gRNAs)' % (title, len(cat_data)))
    PL.show(block=False)


def _plotD2MotifBars(subplot_idx,
                     all_guides_data,
                     cat_data,
                     motifs,
                     guide_has_motif,
                     title,
                     xlabel,
                     ylim_max,
                     text_offset,
                     vertical_text=False,
                     print_percs=False):
    """Draw one bar chart (in a 2x3 grid): for each motif, the percentage of
    all guides carrying that motif whose Oligo Id is also in cat_data."""
    PL.subplot(2, 3, subplot_idx)
    cat_ids = set(cat_data['Oligo Id'])
    #Original code only passed rotation= for the dinucleotide charts
    text_kwargs = {'rotation': 'vertical'} if vertical_text else {}
    bar_heights, bar_labels, motif_totals, cat_counts = [], [], [], []
    for motif in motifs:
        motif_data = all_guides_data.loc[all_guides_data['Guide'].apply(
            lambda guide, motif=motif: guide_has_motif(guide, motif))]
        cat_counts.append(
            len(cat_ids.intersection(set(motif_data['Oligo Id']))))
        motif_totals.append(len(motif_data))
        bar_heights.append(cat_counts[-1] * 100.0 /
                           motif_totals[-1] if motif_totals[-1] > 0 else 0)
        bar_labels.append(motif)
        if print_percs:
            print(motif, bar_heights[-1])
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, total, cnt) in enumerate(
            zip(bar_heights, motif_totals, cat_counts)):
        PL.text(i - 0.3, hgt + text_offset, '%d/%d' % (cnt, total),
                **text_kwargs)
    PL.xticks(range(len(bar_labels)), bar_labels, **text_kwargs)
    PL.ylim((0, ylim_max))
    PL.xlabel(xlabel)
    PL.title(title)
    PL.ylabel(
        'Percent gRNAs with %s deletion\nas most common indel in all 3 replicates'
        % title)
    PL.show(block=False)


def plotD2(all_result_outputs, label=''):
    """Plot summaries of size-2 deletions (D2) that are the most common
    indel for their gRNA in all three replicates.

    Produces three figures:
      * 'pie_chart_D2_indel_cats': split of those D2 deletions into five
        categories (left-only repeat, right-only repeat, both, R == 0,
        L == -1), based on the guide sequence around the cut site and the
        L/R positions returned by tokFullIndel.
      * 'D2_nts_per_cat': per category, which (di)nucleotides are involved.
      * 'D2_nts_per_cat_bars': per (di)nucleotide motif, the percentage of
        all guides with that motif that fall in the category.

    Args:
        all_result_outputs: passed to mergeSamples (data_label='perOligoMCI').
        label: not used by this function.

    Raises:
        AssertionError: if a D2 row falls in none of the five categories.
    """
    #Merge replicates
    mci_merged_data = mergeSamples(all_result_outputs, [],
                                   data_label='perOligoMCI')
    mci_merged_data['Equal MCI'] = (
        mci_merged_data['Most Common Indel']
        == mci_merged_data['Most Common Indel 2']) & (
            mci_merged_data['Most Common Indel']
            == mci_merged_data['Most Common Indel 3'])

    oligo_data = pd.read_csv(
        getHighDataDir() +
        '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(
        lambda oligo_id: oligo_id.replace('_', ''))
    mci_merged_data_guides = pd.merge(mci_merged_data,
                                      oligo_data[['Oligo Id', 'Guide']],
                                      how='inner',
                                      on='Oligo Id')

    #Filter with the merged frame's own 'Equal MCI' column: the inner merge
    #may drop or renumber rows, so a mask taken from the pre-merge frame is
    #not guaranteed to line up with the merged rows.
    mci_common = mci_merged_data_guides.loc[
        mci_merged_data_guides['Equal MCI']]
    #Note: type check discards equally most common indels
    dmci_data = mci_common.loc[mci_common['MCI Type'] == 'D2']

    def indel_bounds(row):
        return tokFullIndel(row['Most Common Indel'])[2]

    def is_left_rpt(row):
        guide = row['Guide']
        if guide[-5] != guide[-3]:
            return False
        bounds = indel_bounds(row)
        return bounds['R'] >= 1 and bounds['L'] <= -3

    def is_right_rpt(row):
        guide = row['Guide']
        if guide[-4] != guide[-2]:
            return False
        bounds = indel_bounds(row)
        return bounds['R'] >= 2 and bounds['L'] <= -2

    def select(row_pred):
        return dmci_data.loc[dmci_data.apply(row_pred, axis=1)]

    lrpt_data = select(lambda row: is_left_rpt(row) and not is_right_rpt(row))
    rrpt_data = select(lambda row: is_right_rpt(row) and not is_left_rpt(row))
    rpt_data = select(lambda row: is_right_rpt(row) and is_left_rpt(row))
    ro_data = select(lambda row: indel_bounds(row)['R'] == 0)
    l1_data = select(lambda row: indel_bounds(row)['L'] == -1)

    # NOTE(review): the labels of this first pie look mirrored relative to
    # the titles used for the same data in the later figures (e.g. lrpt_data
    # is 'Y|XY->Y' here but 'XY|X->X' below) - confirm which is intended.
    categories = [('Y|XY->Y', lrpt_data), ('XY|X->X', rrpt_data),
                  ('XY|XY->XY', rpt_data), ('Z|XY->Z', ro_data),
                  ('XY|Z->Z', l1_data)]
    pie_labels = [cat_label for cat_label, _ in categories]
    pie_vals = [len(cat_data) for _, cat_data in categories]

    #Every D2 row must fall in at least one category
    seen_ids = set()
    for _, cat_data in categories:
        seen_ids.update(cat_data['Oligo Id'])
    unseen_data = dmci_data.loc[dmci_data['Oligo Id'].apply(
        lambda oligo_id: oligo_id not in seen_ids)]
    print(unseen_data)
    assert (len(unseen_data) == 0)

    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title(
        'Size 2 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (len(dmci_data), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('pie_chart_D2_indel_cats')

    #Order: AA, TA, GA, CA, AT, TT, ...
    dinucleotides = [y + x for x in 'ATGC' for y in 'ATGC']
    nucleotides = 'ATGC'

    #Per category: which (di)nucleotides are involved
    PL.figure(figsize=(12, 8))
    #XY|XY->XY
    _plotD2MotifPie(1,
                    rpt_data,
                    dinucleotides,
                    lambda guide: guide[-5:-3],
                    'XY|XY->XY',
                    print_percs=True)
    #__|
    _plotD2MotifPie(2,
                    ro_data,
                    dinucleotides,
                    lambda guide: guide[-5:-3],
                    'XY| -> __|',
                    print_percs=True)
    #|__
    _plotD2MotifPie(3,
                    l1_data,
                    dinucleotides,
                    lambda guide: guide[-3:-1],
                    '|XY -> |__',
                    print_percs=True)
    #XY|X->X
    _plotD2MotifPie(4,
                    lrpt_data,
                    nucleotides,
                    lambda guide: guide[-5],
                    'XY|X->X',
                    label_fmt='%sN|%s -> %s')
    #X|YX->X
    _plotD2MotifPie(5,
                    rrpt_data,
                    nucleotides,
                    lambda guide: guide[-4],
                    'X|YX->X',
                    label_fmt='%s|N%s -> %s')
    PL.subplots_adjust(left=0.05,
                       right=0.95,
                       top=0.9,
                       bottom=0.1,
                       hspace=0.3,
                       wspace=0.3)
    saveFig('D2_nts_per_cat')

    #Per motif: percentage of all guides with the motif in each category
    PL.figure(figsize=(12, 8))
    #XY|XY->XY
    _plotD2MotifBars(
        1,
        mci_merged_data_guides,
        rpt_data,
        dinucleotides,
        lambda guide, dnt: guide[-5:-3] == dnt and guide[-3:-1] == dnt,
        'XY|XY->XY',
        'XY',
        90,
        15,
        vertical_text=True,
        print_percs=True)
    #__|
    _plotD2MotifBars(2,
                     mci_merged_data_guides,
                     ro_data,
                     dinucleotides,
                     lambda guide, dnt: guide[-5:-3] == dnt,
                     'XY| -> __|',
                     'XY',
                     8,
                     1.5,
                     vertical_text=True,
                     print_percs=True)
    #|__
    _plotD2MotifBars(3,
                     mci_merged_data_guides,
                     l1_data,
                     dinucleotides,
                     lambda guide, dnt: guide[-3:-1] == dnt,
                     '|XY -> |__',
                     'XY',
                     8,
                     1.5,
                     vertical_text=True,
                     print_percs=True)
    #XY|X->X
    _plotD2MotifBars(4, mci_merged_data_guides, lrpt_data, nucleotides,
                     lambda guide, nt: guide[-3] == nt and guide[-5] == nt,
                     'XY|X->X', 'X', 5, 0.05)
    #X|YX->X
    _plotD2MotifBars(5, mci_merged_data_guides, rrpt_data, nucleotides,
                     lambda guide, nt: guide[-4] == nt and guide[-2] == nt,
                     'X|YX->X', 'X', 5, 0.05)
    PL.subplots_adjust(left=0.05,
                       right=0.95,
                       top=0.9,
                       bottom=0.1,
                       hspace=0.3,
                       wspace=0.3)
    saveFig('D2_nts_per_cat_bars')