def fetchMhMismatchFrequencies(dirname, outdir='mh_mismatch_indel_frequencies'):
    """Write per-oligo read counts for microhomology-mismatch indels.

    For every row of ``<high data dir>/mh_mismatch_indels.txt`` the indel
    summaries of the mutated oligo (``Oligo ID``) and of the oligo it maps to
    (``Mapped Oligo Id``) are loaded from ``dirname/mapped_reads``. The number
    of reads supporting each of the five indels named in the row is then
    written, first as seen in the mutated profile and then in the original
    profile, followed by ``stats[0] - stats[2]`` for each profile.

    Args:
        dirname: Sample directory containing a ``mapped_reads`` sub-directory.
        outdir: Directory for the output table; created when missing.

    Raises:
        Exception: If ``isOldLib(dirname)`` is true.
    """
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    if isOldLib(dirname):
        raise Exception('Old Lib not supported')

    mh_exp_indels_file = getHighDataDir() + '/mh_mismatch_indels.txt'
    out_filename = outdir + '/' + getDirLabel(dirname) + '.txt'

    # One header column per (indel kind, profile) pair; the 'Mut' group comes
    # first to match the order in which the counts are written below.
    hdr_str = '\t'.join([
        '\t'.join([
            x + ' Indel Reads in ' + y
            for x in ['Orig', 'Left Mut', 'Right Mut', 'Merged Mut1', 'Merged Mut2']
        ]) for y in ['Mut', 'Orig']
    ])
    indel_columns = [
        'Orig Indel', 'Left Mut-MH Indel', 'Right Mut-MH Indel',
        'Merge Mut 1 Indel', 'Merge Mut 2 Indel'
    ]

    def _reads(indel, profile):
        # An empty indel name means "no such indel for this row" -> 0 reads.
        return profile[indel] if (indel in profile and indel != '') else 0

    # Context managers guarantee both handles are closed even if a row fails.
    with io.open(out_filename, 'w') as fout, io.open(mh_exp_indels_file) as f:
        rdr = csv.DictReader(f, delimiter='\t')
        fout.write(u'%s\t%s\tMut Non-Null Reads\tOrig Non-Null Reads\n' %
                   ('\t'.join(rdr.fieldnames), hdr_str))
        for row in rdr:
            #Load Indel Profiles for both the original and mutated microhomology forms
            mut_oligo_id = row['Oligo ID'].replace('_', '')
            orig_oligo_id = row['Mapped Oligo Id'].replace('_', '')
            mut_filepath, mut_filename = getFileForOligoIdx(
                getOligoIdxFromId(mut_oligo_id), ext='_mappedindelsummary.txt')
            orig_filepath, orig_filename = getFileForOligoIdx(
                getOligoIdxFromId(orig_oligo_id), ext='_mappedindelsummary.txt')

            p_mut, p_orig = {}, {}
            stats_mut = readSummaryToProfile(
                dirname + '/mapped_reads/' + mut_filepath + '/' + mut_filename,
                p_mut, oligoid=mut_oligo_id)
            stats_orig = readSummaryToProfile(
                dirname + '/mapped_reads/' + orig_filepath + '/' + orig_filename,
                p_orig, oligoid=orig_oligo_id)

            indels = [row[col] for col in indel_columns]
            mut_read_str = '\t'.join(['%d' % _reads(indel, p_mut) for indel in indels])
            orig_read_str = '\t'.join(['%d' % _reads(indel, p_orig) for indel in indels])

            str_args = ('\t'.join([row[col] for col in rdr.fieldnames]),
                        mut_read_str, orig_read_str,
                        stats_mut[0] - stats_mut[2],
                        stats_orig[0] - stats_orig[2])
            fout.write(u'%s\t%s\t%s\t%d\t%d\n' % str_args)
def get_data(self) -> WGEData:
    """Download the reads and indel-summary files and load the indel profile.

    The representative-reads file is fetched from ``self.filename``; the
    summary URL is derived from it by swapping the ``_predicted_rep_reads.txt``
    suffix for ``_predicted_mapped_indel_summary.txt``. Both payloads are
    stored in temporary files that are NOT deleted here (their paths are
    handed to the returned object).

    Returns:
        ``WGEData(reads_file, profile_file, profile, crispr_line_info)``.
    """
    summary_url = self.filename.replace(
        "_predicted_rep_reads.txt", "_predicted_mapped_indel_summary.txt")

    # NamedTemporaryFile(delete=False) keeps the file on disk like mkstemp()
    # did, but closes its descriptor on exit; `mkstemp()[1]` threw the open
    # descriptor away and leaked two of them per call.
    r = requests.get(self.filename, allow_redirects=True)
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write(r.text)
        reads_file = f.name

    r = requests.get(summary_url, allow_redirects=True)
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write(r.text)
        profile_file = f.name

    crispr_line_info = get_guide_info_from_oligo_id(profile_file, self.oligo_id)
    profile = {}
    readSummaryToProfile(profile_file, profile, oligoid=self.oligo_id, remove_wt=False)
    return WGEData(reads_file, profile_file, profile, crispr_line_info)
def read_profile(obj):
    """Download the reads and indel-summary files for ``obj`` and load its profile.

    Args:
        obj: Mapping with a ``'filename'`` entry (URL of the
            ``*_predicted_rep_reads.txt`` file) and an ``'oligo_id'`` entry.

    Returns:
        Tuple ``(reads_file, profile_file, profile)``: the paths of the two
        temporary files holding the downloaded text (not deleted here) and the
        dict filled in by ``readSummaryToProfile``.
    """
    summary_url = obj['filename'].replace(
        "_predicted_rep_reads.txt", "_predicted_mapped_indel_summary.txt")

    # NamedTemporaryFile(delete=False) keeps the file on disk like mkstemp()
    # did, but closes its descriptor on exit; `mkstemp()[1]` threw the open
    # descriptor away and leaked two of them per call.
    r = requests.get(obj['filename'], allow_redirects=True)
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write(r.text)
        reads_file = f.name

    r = requests.get(summary_url, allow_redirects=True)
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write(r.text)
        profile_file = f.name

    profile = {}
    readSummaryToProfile(profile_file, profile, oligoid=obj['oligo_id'], remove_wt=False)
    return reads_file, profile_file, profile
def loadProfilePair(old_id, new_id):
    """Pool the indel profiles of an old/new oligo pair over all sample dirs.

    Every directory in the module-level ``new_dirs`` is read into one shared
    profile for ``new_id``; likewise ``old_dirs`` for ``old_id``.

    Returns:
        ``(p_old, p_new, mut_reads_old, mut_reads_new)`` where each
        ``mut_reads_*`` is the sum over directories of the first minus the
        third value returned by ``readSummaryToProfile``.
    """
    old_suffix = getFileSuffix(old_id)
    new_suffix = getFileSuffix(new_id)
    pooled_old = {}
    pooled_new = {}

    # New samples are read first, then the old ones.
    new_total = 0
    for sample_dir in new_dirs:
        stats = readSummaryToProfile(sample_dir + 'mapped_reads/' + new_suffix,
                                     pooled_new, oligoid=new_id)
        n_reads, _, n_null = stats
        new_total += (n_reads - n_null)

    old_total = 0
    for sample_dir in old_dirs:
        stats = readSummaryToProfile(sample_dir + 'mapped_reads/' + old_suffix,
                                     pooled_old, oligoid=old_id)
        n_reads, _, n_null = stats
        old_total += (n_reads - n_null)

    return pooled_old, pooled_new, old_total, new_total
def loadProfilesSeparately(old_id, new_id):
    """Load old/new oligo profiles into two buckets each, split on '800'.

    A sample directory whose full path contains the substring ``'800'`` feeds
    bucket 0; every other directory feeds bucket 1.

    Returns:
        ``(p_olds, p_news, old_sep_mr, new_sep_mr)``: two-element lists of
        profile dicts and the matching two-element lists of accumulated
        (first minus third) values returned by ``readSummaryToProfile``.
    """
    old_profiles = [{}, {}]
    new_profiles = [{}, {}]
    old_counts = [0, 0]
    new_counts = [0, 0]
    old_suffix = getSummaryFileSuffix(old_id)
    new_suffix = getSummaryFileSuffix(new_id)

    new_paths = [getHighDataDir() + '/' + x for x in new_dirs]
    for path in new_paths:
        bucket = 1
        if '800' in path:
            bucket = 0
        n_reads, _, n_null = readSummaryToProfile(
            path + '/mapped_reads/' + new_suffix, new_profiles[bucket], oligoid=new_id)
        new_counts[bucket] += n_reads - n_null

    old_paths = [getHighDataDir() + '/' + x for x in old_dirs]
    for path in old_paths:
        bucket = 1
        if '800' in path:
            bucket = 0
        n_reads, _, n_null = readSummaryToProfile(
            path + '/mapped_reads/' + old_suffix, old_profiles[bucket], oligoid=old_id)
        old_counts[bucket] += n_reads - n_null

    return old_profiles, new_profiles, old_counts, new_counts
def loadSeparateProfilePairs(old_id, new_id):
    """Load one profile per sample directory for an old/new oligo pair.

    Unlike pooling all directories into one profile, each directory in the
    module-level ``new_dirs`` / ``old_dirs`` gets its own fresh dict.

    Returns:
        ``(old_ps, new_ps)``: lists of profile dicts, ordered like
        ``old_dirs`` and ``new_dirs`` respectively.
    """
    old_ps, new_ps = [], []
    old_file, new_file = getFileSuffix(old_id), getFileSuffix(new_id)

    # The mutated-read totals used to be accumulated here but were never
    # returned or read, so only the profiles themselves are collected.
    for new_dir in new_dirs:
        p_new = {}
        readSummaryToProfile(new_dir + 'mapped_reads/' + new_file, p_new, oligoid=new_id)
        new_ps.append(p_new)

    for old_dir in old_dirs:
        p_old = {}
        readSummaryToProfile(old_dir + 'mapped_reads/' + old_file, p_old, oligoid=old_id)
        old_ps.append(p_old)

    return old_ps, new_ps
def compileGenIndelReads(gen_indel_dir='generated_indels',
                         out_dir='reads_for_gen_indels_all_samples',
                         sample_dirs=None):
    """Tabulate, per generated indel, the read count observed in each sample.

    For every file in ``gen_indel_dir`` (its name up to the first ``_`` is the
    oligo id) the oligo's indel summary is loaded from each sample directory
    and a ``<oligo_id>_gen_indel_reads.txt`` table is written under
    ``out_dir/<oligo_subdir>``. The first line of the generated-indel file is
    copied through unchanged, followed by a header row, an ``All Mutated``
    row, and one row per indel.

    Args:
        gen_indel_dir: Directory of generated-indel files.
        out_dir: Output directory; created when missing.
        sample_dirs: Sample directory names, relative to ``getHighDataDir()``.
            ``None`` (the default) is treated as an empty list.
    """
    # Avoid a shared mutable default argument.
    sample_dirs = [] if sample_dirs is None else sample_dirs

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # The header row is identical for every oligo, so build it once.
    sample_hdr = '\t'.join([getDirLabel(x) for x in sample_dirs])

    for gen_file in os.listdir(gen_indel_dir):
        oligo_id = gen_file.split('_')[0]
        oligo_idx = getOligoIdxFromId(oligo_id)
        oligo_subdir, sum_filename = getFileForOligoIdx(
            oligo_idx, ext='_mappedindelsummary.txt')
        out_subdir = out_dir + '/' + oligo_subdir
        if not os.path.isdir(out_subdir):
            os.makedirs(out_subdir)

        #Read all profiles for this oligo
        profiles, mut_read_totals = [], []
        for dirname in sample_dirs:
            profile = {}
            filename = (getHighDataDir() + '/' + dirname + '/mapped_reads/' +
                        oligo_subdir + '/' + sum_filename)
            stats = readSummaryToProfile(filename, profile, oligoid=oligo_id)
            profiles.append(profile)
            mut_read_totals.append('%d' % (stats[0] - stats[2]))

        #Compile reads for each indel across all samples
        out_filename = out_subdir + '/%s_gen_indel_reads.txt' % oligo_id
        with io.open(gen_indel_dir + '/' + gen_file) as f, \
                io.open(out_filename, 'w') as fout:
            fout.write(f.readline())  #Git commit
            fout.write(u'Indel\tDetails\t%s\n' % sample_hdr)
            fout.write(u'All Mutated\t[]\t%s\n' % '\t'.join(mut_read_totals))
            for toks in csv.reader(f, delimiter='\t'):
                indel, indel_details = toks[0], toks[2]
                read_str = '\t'.join(['%d' % p1.get(indel, 0) for p1 in profiles])
                fout.write(u'%s\t%s\t%s\n' % (indel, indel_details, read_str))
# Summarise the most common indel (MCI) of every oligo found under `subdir`.
if len(sys.argv) > 3:
    # NOTE(review): eval() on a command-line argument executes arbitrary code.
    # If only True/False (or 0/1) is ever passed, consider parsing it
    # explicitly instead; left unchanged to preserve accepted inputs.
    more_indels = eval(sys.argv[3])

if more_indels:
    out_dir = createResultDirectory(high_dir + '/more_indel_summaries', subdir)
else:
    out_dir = createResultDirectory(high_dir + '/most_common_indel_summaries', subdir)

oligo_lookup = None
with io.open(out_dir + '/' + subdir.split('/')[-1] + '.txt', 'w') as fout:
    oligo_lookup = loadExpOligoLookup(subdir)

    #For each Oligo, summarise details of its most common indel
    fout.write(u'Oligo Id\tMost Common Indel\tLeft\tRight\tCentral\tType\tSize\tMCI Reads\tTotal reads\tMicrohomology Sequence\n')
    sum_files = getIndelSummaryFiles(subdir)
    for filename in sum_files:
        # Drop the 23-character '_mappedindelsummary.txt' suffix.
        file_prefix = filename.split('/')[-1][:-23]
        oligo_details = {x[0]: x[1:] for x in oligo_lookup[file_prefix]}
        oligo_ids = getOligoIdsFromFile(filename)
        for oligo_id in oligo_ids:
            #Read in the profile (if it exists)
            p1 = {}
            stats1 = readSummaryToProfile(filename, p1, oligoid=getShortOligoId(oligo_id))

            # Skip empty profiles and profiles whose only entry is '-'.
            # list(...) is required: on Python 3 a dict keys view never
            # compares equal to a list, so the old test was always False.
            if len(p1) == 0 or list(p1) == ['-']:
                continue

            #Compute and summarise its MCI details
            writeMCISummary(fout, oligo_id, p1, stats1, oligo_details[oligo_id], more_indels)
# Compare the indel profiles of every oligo present in both sample
# directories (dirname1 and dirname2) for the given subdir.
# NOTE(review): this is an excerpt; dirname1, dirname2, subdir and
# remove_largeI are defined outside the visible code, and the values computed
# below are presumably consumed by code that follows — confirm in the full file.
dir1_files = getIndelSummaryFiles(dirname1 + '/mapped_reads/' + subdir, withpath=False)
dir2_files = getIndelSummaryFiles(dirname2 + '/mapped_reads/' + subdir, withpath=False)
# Only summary files that exist in both directories can be compared.
common_files = set(dir1_files).intersection(set(dir2_files))
for filename in common_files:
    filename1 = dirname1 + '/mapped_reads/' + subdir + '/' + filename
    filename2 = dirname2 + '/mapped_reads/' + subdir + '/' + filename
    oligo_ids1 = getOligoIdsFromFile( filename1 )
    oligo_ids2 = getOligoIdsFromFile( filename2 )
    # Likewise, only oligos reported in both files are compared.
    common_oligos = set(oligo_ids1).intersection(set(oligo_ids2))
    for oligo_id in common_oligos:
        profile1, profile2 = {}, {}
        num_reads1, perc_acc1, nonull1 = readSummaryToProfile(filename1, profile1, oligoid=oligo_id)
        num_reads2, perc_acc2, nonull2 = readSummaryToProfile(filename2, profile2, oligoid=oligo_id)
        # Profile sizes are recorded before any filtering or comparison.
        ns1, ns2 = len(profile1), len(profile2)
        if remove_largeI:
            profile1 = filterLargeI(profile1)
            profile2 = filterLargeI(profile2)
        # Entropies taken before the comparisons below ...
        ent1a, ent2a = entropy(profile1,True), entropy(profile2,True)
        poverlap = percentOverlap( profile1, profile2, True )
        score1 = symmetricKL( profile1, profile2, False )
        score2 = symmetricKL( profile1, profile2, True )
        # ... and again afterwards.
        ent1b, ent2b = entropy(profile1,True), entropy(profile2,True) #Since comparing the profiles appends missing states to both profiles
def compareOverbeekProfiles(
        selected_overbeek_id=None,
        pred_results_dir='../indel_prediction/model_testing'):
    """Compare Overbeek indel profiles with our K562 measurements of the same targets.

    For each id ``Overbeek1`` .. ``Overbeek96`` present in ``loadMappings()``
    the Overbeek summary is loaded, the mapped oligo summaries are pooled over
    all new/old sample directories, and the symmetric KL divergence between
    the two is recorded.

    Args:
        selected_overbeek_id: When given, only that id is processed and its
            four DPI7 replicate profiles are plotted next to the Overbeek
            profile. When ``None``, the summary plots (in-frame comparison,
            KL-vs-reads scatter, pairwise KL heat map) are produced instead.
        pred_results_dir: Forwarded to ``plotInFrame``.

    NOTE(review): the block nesting was reconstructed from data flow because
    the original indentation is not visible. In particular the selected-id
    plot is placed inside the per-id loop (so it uses the selected id's own
    variables) and all summary plots sit under ``selected_overbeek_id is
    None`` — confirm against the original file.
    """
    #New Samples
    new_dirs = [
        'ST_June_2017/data/K562_800x_LV7A_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7A_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_LV7B_DPI5/mapped_reads/Oligos_71',
        'ST_Feb_2018/data/CAS9_12NA_1600X_DPI7/mapped_reads/Oligos_71'
    ]
    #Old Samples
    old_dirs = [
        'ST_June_2017/data/K562_1600x_6OA_DPI5/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_6OA_DPI7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI3_Old7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI7_Old8/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI10_Old9/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI3_Old10/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI7_Old11/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI10_Old12/mapped_reads/Oligos_71'
    ]
    remove_long_indels = False
    remove_wt, wt_thresh = True, 3.0
    mappings = loadMappings()

    all_overbeek_profiles, all_new_profiles, all_old_profiles, all_our_profiles = [], [], [], []
    sel_overbeek_ids, oldnew_overbeek_ids, old_ids, new_ids = [], [], [], []
    overbeek_inframes, ours_inframes, oof_sel_overbeek_ids = [], [], []
    kls, kls_old, kls_new, log_reads, overbeek_ids = [], [], [], [], []
    above30_percentages, log_reads_new, log_reads_old, min_log_reads = [], [], [], []

    for idx in range(1, 97):
        overbeek_id = 'Overbeek%d' % idx
        if selected_overbeek_id is not None and selected_overbeek_id != overbeek_id:
            continue
        if overbeek_id not in mappings:
            continue

        overbeek_filename = (getHighDataDir() + '/overbeek_fastq_files/' +
                             overbeek_id + '_mappedindelsummary.txt')

        p1, p1_new, p1_old, o1, rep_reads1, rep_reads2 = {}, {}, {}, {}, {}, {}
        nreads2, nreads1, nreads_old, nreads_new = 0, 0, 0, 0
        nnull_old, nnull_new, nnull1, nnull2 = 0, 0, 0, 0

        #Read the overbeek profile
        numread2, perc_accept2, num_null2 = readSummaryToProfile(
            overbeek_filename, o1, oligoid=overbeek_id,
            remove_long_indels=remove_long_indels, remove_wt=False)
        if selected_overbeek_id is not None:
            # Representative reads and PAM details are only needed for plotting.
            fetchRepresentativeCleanReads(
                getHighDataDir() + '/overbeek_fastq_files/' + overbeek_id +
                '_mappedindelprofiles.txt', rep_reads2, oligoid=overbeek_id)
            pam_loc2, pam_dir2 = getNullTargetPamDetails(
                getHighDataDir() + '/overbeek_control_fastq_files/' + overbeek_id +
                '_exptargets.txt', oligoid=overbeek_id)
        nreads2 += numread2
        nnull2 += num_null2

        if numread2 == 0:
            continue

        # Index 0 / 1 of each pair is chosen below by whether the sample
        # directory name contains '800x'.
        p1_new_reps, p1_old_reps = [{}, {}], [{}, {}]
        rr_new_reps, rr_old_reps = [{}, {}], [{}, {}]

        #Read all the new and old profiles
        pam_loc1, pam_dir1 = None, None
        for oligo_id, is_old in mappings[overbeek_id]:

            #Read all reads for all our K562 profiles
            # The id is expected to be 'Oligo<number>'; int() parses the
            # number without evaluating the string as code (was eval()).
            oligo_idx = int(oligo_id[5:])
            _, oligo_fileprefix = getFileForOligoIdx(oligo_idx, ext='')
            oligo_filename = oligo_fileprefix + '_mappedindelsummary.txt'
            read_filename = oligo_fileprefix + '_mappedindelprofiles.txt'
            exptarget_filename = oligo_fileprefix + '_exptargets.txt'
            if is_old:
                oligo_dirs, p1_old_new = old_dirs, p1_old
                null_oligo_dir = 'ST_April_2017/data/NULL_Old/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_old_reps, rr_old_reps
            else:
                oligo_dirs, p1_old_new = new_dirs, p1_new
                null_oligo_dir = 'ST_April_2017/data/NULL_New/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_new_reps, rr_new_reps

            for oligo_dir in [getHighDataDir() + '/' + x for x in oligo_dirs]:
                summary_path = oligo_dir + '/' + oligo_filename
                # The same summary is pooled into the old-or-new profile ...
                nr1, pa1, nn1 = readSummaryToProfile(
                    summary_path, p1_old_new, oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt, wt_thresh=wt_thresh)
                # ... and into the overall profile for this target.
                numread1, perc_accept1, num_null1 = readSummaryToProfile(
                    summary_path, p1, oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt, wt_thresh=wt_thresh)
                if 'DPI7' in oligo_dir:
                    rep_idx = 0 if '800x' in oligo_dir else 1
                    nr_rep, pa_rep, nn_rep = readSummaryToProfile(
                        summary_path, p1_reps[rep_idx], oligoid=oligo_id,
                        remove_long_indels=remove_long_indels,
                        remove_wt=remove_wt, wt_thresh=wt_thresh)

                if selected_overbeek_id is not None:
                    fetchRepresentativeCleanReads(oligo_dir + '/' + read_filename,
                                                  rep_reads1, oligoid=oligo_id)
                    if 'DPI7' in oligo_dir:
                        fetchRepresentativeCleanReads(oligo_dir + '/' + read_filename,
                                                      rr_reps[rep_idx], oligoid=oligo_id)
                    if pam_loc1 is None:
                        pam_loc1, pam_dir1 = getNullTargetPamDetails(
                            getHighDataDir() + '/' + null_oligo_dir + '/' +
                            exptarget_filename, oligoid=oligo_id)

                if is_old:
                    nreads_old += numread1
                    nnull_old += num_null1
                else:
                    nreads_new += numread1
                    nnull_new += num_null1
                nreads1 += numread1
                nnull1 += num_null1

        kls.append(symmetricKL(p1, o1, True))
        kls_old.append(symmetricKL(p1_old, o1, True))
        kls_new.append(symmetricKL(p1_new, o1, True))

        # +0.5 keeps log10 finite when there are no mutated reads.
        log_reads.append(np.log10(nreads1 - nnull1 + 0.5))
        log_reads_old.append(np.log10(nreads_old - nnull_old + 0.5))
        log_reads_new.append(np.log10(nreads_new - nnull_new + 0.5))
        min_log_reads.append(min(log_reads_old[-1], log_reads_new[-1]))
        above30_percentages.append(computePercAbove30(o1))
        overbeek_ids.append(overbeek_id)

        if log_reads[-1] > 2.0:
            all_overbeek_profiles.append(o1)
            all_our_profiles.append(p1)
            sel_overbeek_ids.append(overbeek_id[8:])  # strip the 'Overbeek' prefix
            if above30_percentages[-1] < 50.0:
                oif, oof, _ = fetchIndelSizeCounts(o1)
                pif, pof, _ = fetchIndelSizeCounts(p1)
                overbeek_inframes.append(oif * 100.0 / (oif + oof))
                ours_inframes.append(pif * 100.0 / (pif + pof))
                oof_sel_overbeek_ids.append(overbeek_id)

        if min_log_reads[-1] > 2.0:
            all_new_profiles.append(p1_new)
            all_old_profiles.append(p1_old)
            oldnew_overbeek_ids.append(overbeek_id)
            old_ids.append(
                [oid for oid, is_old in mappings[overbeek_id] if is_old][0])
            new_ids.append(
                [oid for oid, is_old in mappings[overbeek_id] if not is_old][0])

        try:
            print(overbeek_id, [x for (x, y) in mappings[overbeek_id]], kls[-1],
                  nreads2, nreads1)
        except KeyError:
            print('Could not find', overbeek_id)
            print(mappings)

        if selected_overbeek_id is not None:
            title = '%s (KL=%.1f)' % (overbeek_id, kls[-1])
            labels = [
                'Conventional scaffold Rep A', 'Conventional scaffold Rep B',
                'Improved scaffold Rep A', 'Improved scaffold Rep B',
                'Endogenous Profile'
            ]
            # Fourth profile is Rep B (index 1); it was p1_new_reps[0], which
            # drew Rep A twice while pairing it with Rep B's reads and label.
            plotProfiles(
                [p1_old_reps[0], p1_old_reps[1], p1_new_reps[0], p1_new_reps[1], o1],
                [rr_old_reps[0], rr_old_reps[1], rr_new_reps[0], rr_new_reps[1], rep_reads2],
                [pam_loc1, pam_loc1, pam_loc1, pam_loc1, pam_loc2],
                [x == 'REVERSE'
                 for x in [pam_dir1, pam_dir1, pam_dir1, pam_dir1, pam_dir2]],
                labels, title=title)

    if selected_overbeek_id is None:

        plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids,
                    pred_results_dir)

        PL.figure(figsize=(5.5, 5))
        # NOTE(review): the first bin is (0, 10], so a profile with exactly
        # 0% of deletions > 30 is not drawn — confirm this is intended.
        for thr_l, thr_h in [(0.0, 10.0), (10.0, 20.0), (20.0, 50.0),
                             (50.0, 90.0), (90.0, 100.0)]:
            in_bin = [
                (reads, kl, oid)
                for (kl, a30, oid, reads) in zip(kls, above30_percentages,
                                                 overbeek_ids, log_reads)
                if a30 > thr_l and a30 <= thr_h
            ]
            xdata = [reads for (reads, kl, oid) in in_bin]
            ydata = [kl for (reads, kl, oid) in in_bin]
            sel_ids = [oid for (reads, kl, oid) in in_bin]
            PL.plot(xdata, ydata, 'o',
                    label='%d-%d%% Deletions > 30' % (thr_l, thr_h))
            # Label only the well-covered outliers.
            for x, y, oid in zip(xdata, ydata, sel_ids):
                if y > 3 and x > 2:
                    PL.text(x, y, oid)
        PL.legend()
        PL.plot([0, 6], [0.77, 0.77], '--', color='grey')
        PL.text(0.1, 0.5, 'Median between our replicates', color='grey')
        PL.ylabel('Symmetric KL Divergence', fontsize=12)
        PL.xlabel('Log10 Mutated Reads', fontsize=12)
        PL.xlim((0, 5.5))
        PL.ylim((0, 8))
        PL.show(block=False)
        saveFig('scatter_KL')

        print('Median=', np.median(kls), 'Mean KL=', np.mean(kls))
        print(len(overbeek_ids))

        #Compute pairwise KL between overbeek and ours
        N = len(sel_overbeek_ids)
        kl_mat = np.zeros((N, N))
        for i, overbeek_profile in enumerate(all_overbeek_profiles):
            for j, our_profile in enumerate(all_our_profiles):
                kl_mat[i, j] = symmetricKL(overbeek_profile, our_profile)
        PL.figure(figsize=(8, 6))
        PL.imshow(kl_mat, cmap='hot_r', vmin=0.0, vmax=3.0, interpolation='nearest')
        PL.xticks(range(N), sel_overbeek_ids, rotation='vertical', fontsize=6)
        PL.yticks(range(N), sel_overbeek_ids, rotation='horizontal', fontsize=6)
        PL.xlabel('Synthetic Measurement', fontsize=12)
        PL.ylabel('Endogenous Measurement', fontsize=12)
        PL.title('KL', fontsize=12)
        PL.colorbar()
        PL.show(block=False)
        saveFig('heatmap_KL')
# For every indel summary file in `subdir`, write a '<prefix>_mhindels.txt'
# file listing, per oligo, the reads observed for each expected
# microhomology-mediated indel.
indel_files = getIndelSummaryFiles(subdir, withpath=False)
# Location of the expected-MH-indel file; the same for every summary file.
mh_loc = '.' if highdir == '.' else highdir + '/ST_June_2017/data'
for indel_file in indel_files:
    oligo_ids = getOligoIdsFromFile(subdir + '/' + indel_file)
    mh_indels = loadMhExpIndels(mh_loc + '/' + mh_exp_indels_file, set(oligo_ids))

    # indel_file[:-23] drops the 23-character '_mappedindelsummary.txt' suffix.
    # The context manager closes the output even if an oligo fails part-way.
    with io.open(outdir + '/' + indel_file[:-23] + '_mhindels.txt', 'w') as fout:
        for oligo_id in oligo_ids:
            profile = {}
            acc, pacc, nullr = readSummaryToProfile(subdir + '/' + indel_file,
                                                    profile, oligoid=oligo_id)
            # Per-oligo header: id, first returned value, and that value
            # minus the third returned value.
            fout.write(u'@@@%s:%d:%d\n' % (oligo_id, acc, acc - nullr))
            mhs, indels = mh_indels[oligo_id]
            for (mh, indel) in zip(mhs, indels):
                left, right, mh_len = mh.split(':')
                if indel == 'Unmappable':
                    continue
                nreads = profile.get(indel, 0)
                fout.write(u'%s\t%s\t%s\t%s\t%d\n' % (left, right, mh_len, indel, nreads))