def prepareExample(example_dir):
    """Prepare generated-indel data for the example rooted at ``example_dir``.

    Sets the high data directory to ``example_dir`` and then, for both the
    'new' and 'old' variants:

    1. runs the command returned by ``getIndelGenExe()`` on the matching
       ``exp_target_pam_*.fasta`` file, writing into ``generated_indels_*``;
    2. compiles read counts into ``reads_for_gen_indels`` and registers that
       directory with ``setReadsDir``;
    3. computes features into ``features_for_gen_indels`` and registers that
       directory with ``setFeaturesDir``.

    NOTE(review): ``new_dirs`` and ``old_dirs`` are not defined inside this
    function; presumably they are module-level names - confirm.
    """
    setHighDataDir(example_dir)
    high_dir = getHighDataDir()

    # Generate all possible indels
    new_gen_dir = high_dir + '/generated_indels_new'
    old_gen_dir = high_dir + '/generated_indels_old'
    for gen_dir in (new_gen_dir, old_gen_dir):
        if not os.path.isdir(gen_dir):
            os.makedirs(gen_dir)
    fasta_and_target = (
        ('/exp_target_pam_new.fasta ', new_gen_dir),
        ('/exp_target_pam_old.fasta ', old_gen_dir),
    )
    for fasta_suffix, gen_dir in fasta_and_target:
        cmd = getIndelGenExe() + ' ' + high_dir + fasta_suffix + gen_dir + '/'
        print(cmd)
        os.system(cmd)

    # Compile number of reads per sample for each indel
    reads_dir = high_dir + '/reads_for_gen_indels'
    compileGenIndelReads(gen_indel_dir=new_gen_dir,
                         out_dir=reads_dir,
                         sample_dirs=new_dirs)
    compileGenIndelReads(gen_indel_dir=old_gen_dir,
                         out_dir=reads_dir,
                         sample_dirs=old_dirs)
    setReadsDir(reads_dir)

    # Compute features for each indel
    features_dir = high_dir + '/features_for_gen_indels'
    for gen_dir in (new_gen_dir, old_gen_dir):
        computeFeaturesForGenIndels(gen_indel_dir=gen_dir,
                                    out_dir=features_dir)
    setFeaturesDir(features_dir)
label=label, plot_label='pie_rpt_fracs', data_label='FracWithI1Rpt', stacked=True) def runAnalysis(): spec = { 'results_dir': getHighDataDir() + '/i1/i1_summaries', 'dirname_to_result_fn': lambda x: '%s.txt' % x, 'result_to_dirname_fn': lambda x: x.split('/')[-1][:-4], 'py_func_load': defaultLoadData, 'py_funcs_per_result': [(mergeWithIndelData, 'i1IndelData')], 'py_funcs_all_results': [plotMergedPieDataWithAmbig, plotMergedI1Repeats], 'check_output_fn': lambda x: True, 'reads_colname': 'Total reads', 'min_reads': MIN_READS, 'id_colname': 'Oligo Id', 'partitions': ['Non-Targeting'], 'samples': ['K562 New'] } analyseResultsPerPartition(spec) if __name__ == '__main__': setHighDataDir('..') runAnalysis() import pdb pdb.set_trace()
if left_c_seq == right_c_seq: mh_seq = left_c_seq altered_seq = getSequence(oligo_det, details['L'] +1, details['R']-1) #Note includes MH seq at both ends str_args = (id, mci, details['L'], details['R'],details['C'],itype,isize,mci_reads,total_reads,mh_seq) fout.write(u'%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%s\n' % str_args)


if __name__ == '__main__':
    # Accepts two or three arguments: <high_dir> <subdir> and an optional <more_indels>.
    if len(sys.argv) != 3 and len(sys.argv) != 4:
        print('Usage: compile_most_common_indels.py <high_dir> <subdir> <more_indels>')
    else:
        #Set up output file
        high_dir = sys.argv[1]
        # setHighDataDir is skipped when high_dir is '.'.
        if high_dir != '.':
            setHighDataDir(high_dir)
        subdir = sys.argv[2]
        # more_indels defaults to True when the third argument is omitted.
        more_indels = True
        # NOTE(review): eval() executes the raw command-line argument as Python code.
        # Presumably only 'True'/'False' is expected here - confirm, and consider
        # parsing the flag explicitly instead.
        if len(sys.argv) > 3:
            more_indels = eval(sys.argv[3])
        # The flag only selects which summary directory the results are written under.
        if more_indels:
            out_dir = createResultDirectory(high_dir + '/more_indel_summaries',subdir)
        else:
            out_dir = createResultDirectory(high_dir + '/most_common_indel_summaries', subdir)
        # The output file is named after the last path component of subdir.
        fout = io.open(out_dir + '/' + subdir.split('/')[-1] + '.txt', 'w')
        oligo_lookup = loadExpOligoLookup(subdir)

        #For each Oligo, summarise details of its most common indel
        fout.write(u'Oligo Id\tMost Common Indel\tLeft\tRight\tCentral\tType\tSize\tMCI Reads\tTotal reads\tMicrohomology Sequence\n')
        sum_files = getIndelSummaryFiles(subdir)
        for filename in sum_files:
            # Lookup key: file name without its directory and without its last 23 characters.
            file_prefix = filename.split('/')[-1][:-23]
            # Map the first element of each lookup entry to the remaining elements.
            oligo_details = {x[0]: x[1:] for x in oligo_lookup[file_prefix]}
from create_overbeek_templates import createOverbeekTemplates
from compute_overbeek_indel_profiles import computeOverbeekIndelProfiles


def printStatus(status):
    """Print ``status`` between '###' markers."""
    print('\n### ', status, ' ###\n ')


# Copy the example data from /data to /results before any processing starts.
shutil.copytree('/data/endogenous_processing_example', '/results/endogenous_processing_example')

#----------------------------------------------------------------------
# Configure environment
#----------------------------------------------------------------------
setRunLocal(True)
setHighDataDir('/results/endogenous_processing_example/')
setPythonCmd('python')
setIndelMapExe('/usr/local/bin/indelmap')

#----------------------------------------------------------------
# Processing of raw Van-Overbeek et al reads to produce descriptions of indels
#----------------------------------------------------------------
#Note: This provides a demonstration on just 1 oligo, going from raw overbeek reads to indel descriptions.
#      Sam files are assumed to be already collected (for further details of this part see
#      collect_overbeek_sams.py in same dir)
printStatus('Create fastq files from Van Overbeek sam files')
# Input sam files and output fastq files both live under the high data directory.
sam_dir, fastq_dir = getHighDataDir() + '/overbeek_sam_files', getHighDataDir( ) + '/overbeek_fastq_files'
# Only the fastq output directory is created here; sam_dir is not checked.
if not os.path.isdir(fastq_dir):
    os.makedirs(fastq_dir)
extractReads(sam_dir + '/Overbeek_6.sam', fastq_dir + '/Overbeek6.fastq',
) files = [results_subdir + '/' + x for x in os.listdir(results_subdir)] for filename in files: collectMhOfLen(filename, mh_len, fout) fout.close() if __name__ == '__main__': if len(sys.argv) != 4: print( 'Usage: collect_mh_frequencies_by_len.py results_mh_len highdir subdir' ) else: mh_len = eval(sys.argv[1]) highdir = sys.argv[2] results_subdir = sys.argv[3] subdir = '/'.join(results_subdir.split('/')[-2:]) setHighDataDir(highdir) if not os.path.isdir(results_subdir): raise Exception('No such directory:' + results_subdir) out_dir = highdir + '/mh_freqs_by_len' + '/' + subdir if not os.path.isdir(out_dir): os.makedirs(out_dir) collectMhFrequenciesOfLen( results_subdir, mh_len, out_dir + '/mh_indels_of_len_%d.txt' % mh_len)
for j, p1 in enumerate(all_our_profiles): kl_mat[i, j] = symmetricKL(o1, p1) PL.figure(figsize=(8, 6)) PL.imshow(kl_mat, cmap='hot_r', vmin=0.0, vmax=3.0, interpolation='nearest') PL.xticks(range(N), sel_overbeek_ids, rotation='vertical', fontsize=6) PL.yticks(range(N), sel_overbeek_ids, rotation='horizontal', fontsize=6) PL.xlabel('Synthetic Measurement', fontsize=12) PL.ylabel('Endogenous Measurement', fontsize=12) PL.title('KL', fontsize=12) PL.colorbar() PL.show(block=False) saveFig('heatmap_KL') if __name__ == '__main__': setHighDataDir('/data/endogenous_comparisons') setPlotDir('/results/plots') setFigType('png') compareOverbeekProfiles(selected_overbeek_id='Overbeek16', pred_results_dir='/data/endogenous_comparisons') compareOverbeekProfiles(selected_overbeek_id=None, pred_results_dir='/data/endogenous_comparisons')
def printStatus(status): print('\n### ', status, ' ###\n ') #---------------------------------------------------------------------- # Copy all example data to results directory since script runs in place #---------------------------------------------------------------------- shutil.copytree('/data/indel_processing_example', '/results/indel_processing_example') setRunLocal(True) if not os.path.isdir('/results/indel_processing_example'): os.mkdir('/results/indel_processing_example') setHighDataDir('/results/indel_processing_example/') setPythonCmd('python') setPearExe('/usr/local/bin/pear') setIndelMapExe('/usr/local/bin/indelmap') setIndelGenI1Exe('/usr/local/bin/indelgen_i1') setIndelMhExe('/usr/local/bin/indelmh') #---------------------------------------------------------------- # Processing of raw reads to produce descriptions of indels #---------------------------------------------------------------- #Note: This provides a demonstration on a cut-down data set of just 4 oligos. # In practice, the dataset was much too be large to be run in one script like this. # Instead individual steps were performed by calling each script in turn from the # command line to set off parallel jobs on a compute cluster.
from plot_i1_summaries import runAnalysis as plotI1
# Each directory below is appended to sys.path immediately before the import
# that follows it, so the order of these statements matters. The paths are
# relative, so they resolve against the current working directory.
sys.path.append('kl_comparisons')
from plot_kl_analysis import runAnalysis as plotKLCmp
sys.path.append('microhomology')
from plot_mh_analysis import runAnalysis as plotMH
sys.path.append('microhomology_mismatch')
from plot_mh_mismatch_frequencies import runAnalysis as plotMhMismatch
sys.path.append('scaffold_compare')
from plot_old_new import runAnalysis as plotKLOldNew
sys.path.append('indel_details')
from plot_pie_indel_summaries import runAnalysis as plotIndelDetails
sys.path.append('../indel_prediction/model_testing')
from plot_old_new_predictions import runAnalysis as plotPred

# Shared configuration applied once before any analysis runs.
setFigType('png')
setPlotDir('/results/plots')
setHighDataDir('/data/summary_data')

# Run every imported runAnalysis entry point in turn.
plotI1()
plotKLCmp()
plotMH()
plotMhMismatch()
plotKLOldNew()
plotIndelDetails()
plotPred()