# Standard-library and third-party imports used by the functions below. The
# ion pipeline helpers (printtime, aggregate_alignment, mergeAlignStatsResults,
# makeAlignGraphs, ionstats2alignstats, parse_bcfile) and modules
# (blockprocessing, ionstats, ionstats_plots, base_error_plot) are assumed to
# be provided by the surrounding package.
import json
import math
import os
import shutil
import subprocess
import traceback

import dateutil.parser


def alignment_post_processing(BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows, mark_duplicates, force_alignstats):

    datasets_basecaller = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except:
        graph_max_x = 400

    input_prefix_list = []
    for dataset in datasets_basecaller["datasets"]:
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue

        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
            # terrible hack to make aggregate_alignment happy
            X_name = 'nomatch'
            read_group = dataset['read_groups'][0]
            if 'barcode_name' in datasets_basecaller['read_groups'][read_group]:
                X_name = datasets_basecaller['read_groups'][read_group]['barcode_name']
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Special legacy post-processing.
    # Generate merged rawlib.bam on barcoded runs
    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):
        bam_file_list = []
        for dataset in datasets_basecaller["datasets"]:
            bam_name = os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['file_prefix']) + '.bam')
            if os.path.exists(bam_name):
                bam_file_list.append(bam_name)
        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename,
                                        composite_bam_filename + '.bai', mark_duplicates)
        force_alignstats = True

    if force_alignstats:
        ## Generate data for error plot for barcoded run from composite bam
        printtime("Call alignStats to generate raw accuracy")
        try:
            cmd = "alignStats"
            cmd += " -n 12"
            cmd += " --alignSummaryFile alignStats_err.txt"
            cmd += " --alignSummaryJsonFile alignStats_err.json"
            cmd += " --alignSummaryMinLen 1"
            #cmd += " --alignSummaryMaxLen %s" % str(int(graph_max_x))
            cmd += " --alignSummaryMaxLen %s" % str(int(400))
            cmd += " --alignSummaryLenStep 1"
            cmd += " --alignSummaryMaxErr 10"
            cmd += " --infile %s" % composite_bam_filename
            cmd = cmd + " --outputDir %s" % ALIGNMENT_RESULTS
            printtime("DEBUG: Calling '%s'" % cmd)
            os.system(cmd)
        except:
            printtime("alignStats failed")

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))
        # Create aligned histogram plot
        # Create AQ20 plot
        printtime("Base error plot has been created successfully")
    except:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv
    barcodelist_path = 'barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../../barcodeList.txt'
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)

    # These graphs are likely obsolete
    makeAlignGraphs()
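
# Illustration (not used by the pipeline): the read-length plot range above is
# int(50 * math.ceil(0.014 * flows)), i.e. roughly 0.7 bases per flow rounded
# up to the next multiple of 50. The flow counts below are hypothetical values
# chosen only to show the rounding:
#
#   >>> import math
#   >>> int(50 * math.ceil(0.014 * 260))   # 260 flows -> 200 bases
#   200
#   >>> int(50 * math.ceil(0.014 * 520))   # 520 flows -> 400 bases
#   400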
def merge_alignment_stats(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows):

    datasets_json = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_json = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    for dataset in datasets_json['datasets']:
        # What needs merging:
        # - alignment.summary
        # - alignTable.txt
        # Some time in the future:
        # - alignStats_err.json

        # Merge alignStats metrics
        try:
            input_prefix_list = [os.path.join(dir, ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
                                 for dir in dirs]
            input_prefix_list = [prefix for prefix in input_prefix_list
                                 if os.path.exists(prefix + 'alignment.summary')]
            composite_prefix = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
            if input_prefix_list:
                mergeAlignStatsResults(input_prefix_list, composite_prefix)
            else:
                printtime("Nothing to merge: " + dataset['file_prefix'])
        except:
            printtime("ERROR: merging %s stats unsuccessful" % (dataset['file_prefix'] + '.bam'))

    datasets_basecaller = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except:
        graph_max_x = 400

    input_prefix_list = []
    for dataset in datasets_basecaller["datasets"]:
        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
            # terrible hack to make aggregate_alignment happy
            X_name = 'nomatch'
            read_group = dataset['read_groups'][0]
            if 'barcode_name' in datasets_basecaller['read_groups'][read_group]:
                X_name = datasets_basecaller['read_groups'][read_group]['barcode_name']
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Merge alignStats_err.json right here!
    merged_align_stats = {}
    align_stats_num_bases = 400
    for dir in dirs:
        current_align_stats = {}
        try:
            f = open(os.path.join(dir, ALIGNMENT_RESULTS, 'alignStats_err.json'), 'r')
            current_align_stats = json.load(f)
            f.close()
        except:
            printtime("Merge alignStats_err.json: skipping %s"
                      % os.path.join(dir, ALIGNMENT_RESULTS, 'alignStats_err.json'))
            continue

        if not merged_align_stats:
            merged_align_stats = current_align_stats
            align_stats_num_bases = len(merged_align_stats.get("read_length", []))
            continue

        for idx in range(align_stats_num_bases):
            merged_align_stats['nread'][idx] += current_align_stats['nread'][idx]
            merged_align_stats['unaligned'][idx] += current_align_stats['unaligned'][idx]
            merged_align_stats['filtered'][idx] += current_align_stats['filtered'][idx]
            merged_align_stats['clipped'][idx] += current_align_stats['clipped'][idx]
            merged_align_stats['aligned'][idx] += current_align_stats['aligned'][idx]
            merged_align_stats['n_err_at_position'][idx] += current_align_stats['n_err_at_position'][idx]
            merged_align_stats['cum_aligned'][idx] += current_align_stats['cum_aligned'][idx]
            merged_align_stats['cum_err_at_position'][idx] += current_align_stats['cum_err_at_position'][idx]

        merged_align_stats['accuracy_total_bases'] += current_align_stats['accuracy_total_bases']
        merged_align_stats['accuracy_total_errors'] += current_align_stats['accuracy_total_errors']
        merged_align_stats['total_mapped_target_bases'] += current_align_stats['total_mapped_target_bases']
        merged_align_stats['total_mapped_reads'] += current_align_stats['total_mapped_reads']

    try:
        f = open(os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'), "w")
        json.dump(merged_align_stats, f, indent=4)
        f.close()
    except:
        printtime("ERROR: Failed to write merged alignStats_err.json")
        traceback.print_exc()

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))
        printtime("Base error plot has been created successfully")
    except:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv
    barcodelist_path = 'barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../../barcodeList.txt'
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)
def alignment_post_processing(
        libraryName,
        BASECALLER_RESULTS,
        ALIGNMENT_RESULTS,
        flows,
        mark_duplicates):

    datasets_basecaller = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except:
        graph_max_x = 800

    alignment_file_list = []

    for dataset in datasets_basecaller["datasets"]:
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue

        ionstats.generate_ionstats_alignment(
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam'),
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'),
            graph_max_x)
        ionstats2alignstats(
            libraryName,
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'),
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary'))

        alignment_file_list.append(
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'))

    # In Progress: merge ionstats alignment results
    ionstats.reduce_stats(alignment_file_list, os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'))
    ionstats2alignstats(
        libraryName,
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        os.path.join(ALIGNMENT_RESULTS, 'alignment.summary'))

    # Special legacy post-processing.
    # Generate merged rawlib.bam on barcoded runs
    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):
        bam_file_list = []
        for dataset in datasets_basecaller["datasets"]:
            bam_name = os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['file_prefix']) + '.bam')
            if os.path.exists(bam_name):
                bam_file_list.append(bam_name)
        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename,
                                        composite_bam_filename + '.bai', mark_duplicates)

    # Generate alignment_barcode_summary.csv
    # TODO: use datasets_basecaller.json + *.ionstats_alignment.json instead of
    # barcodeList.txt and alignment_*.summary
    barcodelist_path = 'barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../../barcodeList.txt'
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)

    # These graphs are likely obsolete
    #makeAlignGraphs()

    # In Progress: Use ionstats alignment results to generate plots
    ionstats_plots.alignment_rate_plot2(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'alignment_rate_plot.png',
        graph_max_x)
    ionstats_plots.base_error_plot(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'base_error_plot.png',
        graph_max_x)
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q10.png', 'AQ10', 'red')
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q17.png', 'AQ17', 'yellow')
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q20.png', 'AQ20', 'green')
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q47.png', 'AQ47', 'purple')
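
# Example invocation (hypothetical paths and values, for illustration only; the
# real callers live elsewhere in the pipeline):
#
#   alignment_post_processing(
#       'hg19',                  # libraryName
#       'basecaller_results',    # BASECALLER_RESULTS
#       '.',                     # ALIGNMENT_RESULTS
#       520,                     # flows
#       False)                   # mark_duplicates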
def merge_alignment_stats(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows):

    datasets_basecaller = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except:
        graph_max_x = 800

    ########################################################
    # Merge ionstats_alignment.json:
    # first across blocks, then across barcodes
    ########################################################

    try:
        composite_filename_list = []
        for dataset in datasets_basecaller["datasets"]:
            composite_filename = os.path.join(ALIGNMENT_RESULTS,
                                              dataset['file_prefix'] + '.ionstats_alignment.json')
            barcode_filename_list = [os.path.join(dir, ALIGNMENT_RESULTS,
                                                  dataset['file_prefix'] + '.ionstats_alignment.json')
                                     for dir in dirs]
            barcode_filename_list = [filename for filename in barcode_filename_list
                                     if os.path.exists(filename)]
            ionstats.reduce_stats(barcode_filename_list, composite_filename)
            if os.path.exists(composite_filename):
                composite_filename_list.append(composite_filename)

        ionstats.reduce_stats(composite_filename_list,
                              os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'))
    except:
        printtime("ERROR: Failed to merge ionstats_alignment.json")
        traceback.print_exc()

    # Use ionstats alignment results to generate plots
    ionstats_plots.alignment_rate_plot2(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'alignment_rate_plot.png',
        graph_max_x)
    ionstats_plots.base_error_plot(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'base_error_plot.png',
        graph_max_x)
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q10.png', 'AQ10', 'red')
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q17.png', 'AQ17', 'yellow')
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q20.png', 'AQ20', 'green')
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q47.png', 'AQ47', 'purple')

    # Generate alignment_barcode_summary.csv
    barcodelist_path = 'barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../../barcodeList.txt'
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)
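
# Merge-order illustration (hypothetical block directories 'block_X0_Y0' and
# 'block_X0_Y1', and barcode prefix 'IonXpress_001_rawlib'): the loop above
# first reduces the per-block files
#   block_X0_Y0/<ALIGNMENT_RESULTS>/IonXpress_001_rawlib.ionstats_alignment.json
#   block_X0_Y1/<ALIGNMENT_RESULTS>/IonXpress_001_rawlib.ionstats_alignment.json
# into <ALIGNMENT_RESULTS>/IonXpress_001_rawlib.ionstats_alignment.json, and the
# final reduce_stats call then combines the per-barcode composites into
# <ALIGNMENT_RESULTS>/ionstats_alignment.json.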
def merge_alignment_stats(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows):

    datasets_json = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_json = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    for dataset in datasets_json['datasets']:
        # What needs merging:
        # - alignment.summary
        # - alignTable.txt
        # Some time in the future:
        # - alignStats_err.json

        # Merge alignStats metrics
        try:
            input_prefix_list = [os.path.join(dir, ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
                                 for dir in dirs]
            input_prefix_list = [prefix for prefix in input_prefix_list
                                 if os.path.exists(prefix + 'alignment.summary')]
            composite_prefix = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
            if input_prefix_list:
                mergeAlignStatsResults(input_prefix_list, composite_prefix)
            else:
                printtime("Nothing to merge: " + dataset['file_prefix'])
        except:
            printtime("ERROR: merging %s stats unsuccessful" % (dataset['file_prefix'] + '.bam'))

    datasets_basecaller = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except:
        graph_max_x = 400

    input_prefix_list = []
    for dataset in datasets_basecaller["datasets"]:
        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
            # terrible hack to make aggregate_alignment happy
            X_name = 'nomatch'
            read_group = dataset['read_groups'][0]
            if 'barcode_name' in datasets_basecaller['read_groups'][read_group]:
                X_name = datasets_basecaller['read_groups'][read_group]['barcode_name']
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Merge alignStats_err.json right here!
    merged_align_stats = {}
    align_stats_num_bases = 400
    for dir in dirs:
        current_align_stats = {}
        try:
            f = open(os.path.join(dir, ALIGNMENT_RESULTS, 'alignStats_err.json'), 'r')
            current_align_stats = json.load(f)
            f.close()
        except:
            printtime("Merge alignStats_err.json: skipping %s"
                      % os.path.join(dir, ALIGNMENT_RESULTS, 'alignStats_err.json'))
            continue

        if not merged_align_stats:
            merged_align_stats = current_align_stats
            align_stats_num_bases = len(merged_align_stats.get("read_length", []))
            continue

        for idx in range(align_stats_num_bases):
            merged_align_stats['nread'][idx] += current_align_stats['nread'][idx]
            merged_align_stats['unaligned'][idx] += current_align_stats['unaligned'][idx]
            merged_align_stats['filtered'][idx] += current_align_stats['filtered'][idx]
            merged_align_stats['clipped'][idx] += current_align_stats['clipped'][idx]
            merged_align_stats['aligned'][idx] += current_align_stats['aligned'][idx]
            merged_align_stats['n_err_at_position'][idx] += current_align_stats['n_err_at_position'][idx]
            merged_align_stats['cum_aligned'][idx] += current_align_stats['cum_aligned'][idx]
            merged_align_stats['cum_err_at_position'][idx] += current_align_stats['cum_err_at_position'][idx]

        merged_align_stats['accuracy_total_bases'] += current_align_stats['accuracy_total_bases']
        merged_align_stats['accuracy_total_errors'] += current_align_stats['accuracy_total_errors']
        merged_align_stats['total_mapped_target_bases'] += current_align_stats['total_mapped_target_bases']
        merged_align_stats['total_mapped_reads'] += current_align_stats['total_mapped_reads']

    try:
        f = open(os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'), "w")
        json.dump(merged_align_stats, f, indent=4)
        f.close()
    except:
        printtime("ERROR: Failed to write merged alignStats_err.json")
        traceback.print_exc()

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        base_error_plot.generate_alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'readLen.txt'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))
        printtime("Base error plot has been created successfully")
    except:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv
    barcodelist_path = 'barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../../barcodeList.txt'
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)
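
# A compact sketch of the same element-wise merge performed above (illustration
# only; the pipeline keeps the explicit key-by-key version, and note that zip()
# truncates to the shorter list rather than indexing a fixed range):
def _sum_align_stats(merged, current,
                     list_keys=('nread', 'unaligned', 'filtered', 'clipped',
                                'aligned', 'n_err_at_position', 'cum_aligned',
                                'cum_err_at_position'),
                     scalar_keys=('accuracy_total_bases', 'accuracy_total_errors',
                                  'total_mapped_target_bases', 'total_mapped_reads')):
    # List-valued metrics are summed element-wise, scalar metrics directly.
    for key in list_keys:
        merged[key] = [a + b for a, b in zip(merged[key], current[key])]
    for key in scalar_keys:
        merged[key] += current[key]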
def alignment_post_processing(
        BASECALLER_RESULTS,
        ALIGNMENT_RESULTS,
        flows,
        mark_duplicates,
        force_alignstats):

    datasets_basecaller = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except:
        graph_max_x = 400

    input_prefix_list = []
    for dataset in datasets_basecaller["datasets"]:
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue

        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
            # terrible hack to make aggregate_alignment happy
            X_name = 'nomatch'
            read_group = dataset['read_groups'][0]
            if 'barcode_name' in datasets_basecaller['read_groups'][read_group]:
                X_name = datasets_basecaller['read_groups'][read_group]['barcode_name']
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

        printtime("Creating legacy name links")
        if 'legacy_prefix' in dataset:
            link_src = [
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam'),
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam.bai')]
            link_dst = [
                os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['legacy_prefix']) + '.bam'),
                os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['legacy_prefix']) + '.bam.bai')]
            for (src, dst) in zip(link_src, link_dst):
                try:
                    os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
                except:
                    printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Special legacy post-processing.
    # Generate merged rawlib.basecaller.bam and rawlib.sff on barcoded runs
    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):
        bam_file_list = []
        for dataset in datasets_basecaller["datasets"]:
            bam_name = os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['file_prefix']) + '.bam')
            if os.path.exists(bam_name):
                bam_file_list.append(bam_name)
        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename,
                                        composite_bam_filename + '.bai', mark_duplicates)
        force_alignstats = True

    if force_alignstats:
        ## Generate data for error plot for barcoded run from composite bam
        printtime("Call alignStats to generate raw accuracy")
        try:
            cmd = "alignStats"
            cmd += " -n 12"
            cmd += " --alignSummaryFile alignStats_err.txt"
            cmd += " --alignSummaryJsonFile alignStats_err.json"
            cmd += " --alignSummaryMinLen 1"
            #cmd += " --alignSummaryMaxLen %s" % str(int(graph_max_x))
            cmd += " --alignSummaryMaxLen %s" % str(int(400))
            cmd += " --alignSummaryLenStep 1"
            cmd += " --alignSummaryMaxErr 10"
            cmd += " --infile %s" % composite_bam_filename
            cmd = cmd + " --outputDir %s" % ALIGNMENT_RESULTS
            printtime("DEBUG: Calling '%s'" % cmd)
            os.system(cmd)
        except:
            printtime("alignStats failed")

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        base_error_plot.generate_alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'readLen.txt'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))
        # Create aligned histogram plot
        # Create AQ20 plot
        printtime("Base error plot has been created successfully")
    except:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv
    barcodelist_path = 'barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../../barcodeList.txt'
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)

    # These graphs are likely obsolete
    makeAlignGraphs()
def align_barcodes(
        sammeta,
        libsff_path,
        align_full,
        sam_parsed,
        bidirectional,
        libraryName,
        DIR_BC_FILES,
        flows,
        aligner_opts_extra,
        mark_duplicates,
        ALIGNMENT_RESULTS,
        outBaseName=''):

    printtime("Renaming non-barcoded alignment results to 'comprehensive'")
    files = [
        'alignment.summary',
        'alignmentQC_out.txt',
        'alignTable.txt',
    ]
    for fname in files:
        if os.path.exists(fname):
            try:
                #if os.path.exists(fname):
                #    os.rename(fname, fname + ".comprehensive")
                shutil.copyfile(fname, fname + ".comprehensive")
            except:
                printtime('ERROR copying %s' % fname)
                traceback.print_exc()

    printtime("STARTING BARCODE ALIGNMENTS")

    barcodelist_path = 'barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        printtime('ERROR: barcodeList.txt not found')
    barcodeList = parse_bcfile(barcodelist_path)

    align_full = True

    (head, tail) = os.path.split(libsff_path)
    for bcid in (x['id_str'] for x in barcodeList):
        sffName = os.path.join(DIR_BC_FILES, "%s_%s" % (bcid, tail))
        print "sffName: " + sffName
        if os.path.exists(sffName):
            printtime("Barcode processing for '%s': %s" % (bcid, sffName))
        else:
            printtime("No barcode SFF file found for '%s': %s" % (bcid, sffName))
            continue

        align_full_chip(
            sammeta, sffName, align_full, 1, True, sam_parsed, bidirectional,
            libraryName, flows, aligner_opts_extra, mark_duplicates,
            ALIGNMENT_RESULTS, outBaseName)

        # Rename each output file based on the barcode found in the fastq filename,
        # but ignore the comprehensive fastq output files
        printtime("Barcode processing, rename")
        if os.path.exists('alignment.summary'):
            try:
                fname = 'alignment_%s.summary' % bcid
                os.rename('alignment.summary', fname)
                # os.rename(fname, os.path.join(DIR_BC_FILES, fname))
                fname = 'alignmentQC_out_%s.txt' % bcid
                os.rename('alignmentQC_out.txt', fname)
                # os.rename(fname, os.path.join(DIR_BC_FILES, fname))
                fname = 'alignTable_%s.txt' % bcid
                os.rename('alignTable.txt', fname)
                # os.rename(fname, os.path.join(DIR_BC_FILES, fname))
            except:
                printtime('error renaming')
                traceback.print_exc()

    # Rename comprehensive results back to default names
    for fname in files:
        if os.path.exists(fname + '.comprehensive'):
            # os.rename(fname + '.comprehensive', fname)
            try:
                shutil.copyfile(fname + '.comprehensive', fname)
            except:
                printtime('ERROR copying %s' % fname + '.comprehensive')
                traceback.print_exc()

    printtime("Barcode processing, aggregate")
    aggregate_alignment("./", barcodelist_path)
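
# Naming illustration (hypothetical values): with libsff_path = 'R_run/rawlib.sff'
# and bcid = 'IonXpress_001', the loop above looks for
#   os.path.join(DIR_BC_FILES, 'IonXpress_001_rawlib.sff')
# and, after alignment, renames alignment.summary to
# 'alignment_IonXpress_001.summary'.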
def align_full_chip_core(libsff, libKey, tfKey, floworder, fastqName, align_full,
                         graph_max_x, do_barcode, make_align_graphs, sam_parsed,
                         DIR_BC_FILES, env, outputdir):

    # collect all the meta data for the SAM file
    SAM_META = {}

    # id - this hash comes from the fastq file
    try:
        # read the first line of the fastq file
        fastqFile = open(fastqName, 'r')
        id_hash = fastqFile.readline()
        fastqFile.close()

        # now just pull out the hash
        id_hash = id_hash[1:6]

        SAM_META['ID'] = id_hash
    except IOError:
        printtime("Could not read fastq file. The ID for the SAM file could not be found.")

    # sm - name for reads - project name
    SAM_META['SM'] = env['project']

    # lb - library name
    SAM_META['LB'] = env['libraryName']

    # pu - the platform unit
    SAM_META['PU'] = "PGM/" + env['chipType'].replace('"', "")

    SAM_META['PL'] = "IONTORRENT"

    # TODO: do not assume localhost. Find the name of the masternode
    try:
        # this will get the exp data from the database
        exp_json = json.loads(env['exp_json'])

        # ds - the "notes", only the alphanumeric and space characters.
        SAM_META['DS'] = ''.join(ch for ch in exp_json['notes'] if ch.isalnum() or ch == " ")

        # dt - the run date
        exp_log_json = json.loads(exp_json['log'])
        iso_exp_time = exp_log_json['start_time']

        # convert to ISO time
        iso_exp_time = dateutil.parser.parse(iso_exp_time)
        SAM_META['DT'] = iso_exp_time.isoformat()

        # the site name should be here, also remove spaces
        site_name = env['site_name']
        site_name = ''.join(ch for ch in site_name if ch.isalnum())
        SAM_META['CN'] = site_name

        env['flows'] = exp_json['flows']
    except:
        printtime("There was an error getting the site name, because the Torrent Browser could not be contacted")
        traceback.print_exc()

    # Now build the SAM meta data arg string
    aligner_opts_rg = '--aligner-opts-rg "'
    aligner_opts_extra = ''
    if sam_parsed:
        aligner_opts_extra += ' -p 1'
    if env['aligner_opts_extra']:
        print ' found extra alignment options: "%s"' % env['aligner_opts_extra']
        aligner_opts_extra = ' --aligner-opts-extra "'
        aligner_opts_extra += env['aligner_opts_extra'] + '"'

    first = True
    for key, value in SAM_META.items():
        if value:
            sam_arg = r'-R \"'
            end = r'\"'
            sam_arg = sam_arg + key + ":" + value + end
            if first:
                aligner_opts_rg = aligner_opts_rg + sam_arg
                first = False
            else:
                aligner_opts_rg = aligner_opts_rg + " " + sam_arg

    # add the trailing quote
    aligner_opts_rg = aligner_opts_rg + '"'

    if 0 < graph_max_x:
        # establish the read-length histogram range by using the simple rule: 0.6 * num-flows
        flowsUsed = 0
        try:
            flowsUsed = int(env['flows'])
        except:
            flowsUsed = 400
        graph_max_x = 100 * math.trunc((0.6 * flowsUsed + 99) / 100.0)
        if graph_max_x < 400:
            graph_max_x = 400

    #-----------------------------------
    # DEFAULT SINGLE SFF/FASTQ BEHAVIOR - (Runs for barcoded runs too)
    #-----------------------------------
    if (align_full):
        # If a full align is forced add a '--align-all-reads' flag
        com = "alignmentQC.pl"
        com += " --logfile %s" % os.path.join(outputdir, "alignmentQC_out.txt")
        com += " --output-dir %s" % outputdir
        com += " --input %s" % libsff
        com += " --genome %s" % env["libraryName"]
        com += " --max-plot-read-len %s" % graph_max_x
        com += " --align-all-reads"
        com += " %s %s" % (aligner_opts_rg, aligner_opts_extra)
        com += " >> ReportLog.html 2>&1"
    else:
        # Add -p 1 to enable default.sam file generation
        com = "alignmentQC.pl"
        com += " --logfile %s" % os.path.join(outputdir, "alignmentQC_out.txt")
        com += " --output-dir %s" % outputdir
        com += " --input %s" % libsff
        com += " --genome %s" % env["libraryName"]
        com += " --max-plot-read-len %s" % graph_max_x
        com += " %s %s" % (aligner_opts_rg, aligner_opts_extra)
        com += " >> ReportLog.html 2>&1"

    try:
        printtime("Alignment QC command line:\n%s" % com)
        retcode = subprocess.call(com, shell=True)
        if retcode != 0:
            printtime("alignmentQC failed, return code: %d" % retcode)
            alignError = open("alignment.error", "w")
            alignError.write('alignmentQC returned with error code: ')
            alignError.write(str(retcode))
            alignError.close()
    except OSError:
        printtime('Alignment Failed to start')
        alignError = open("alignment.error", "w")
        alignError.write(str(traceback.format_exc()))
        alignError.close()
        traceback.print_exc()

    if make_align_graphs:
        makeAlignGraphs()

    #--------------------------------------------
    # BARCODE HANDLING BEHAVIOR (Multiple FASTQ)
    #--------------------------------------------
    if env['barcodeId'] and True == do_barcode:
        printtime("Renaming non-barcoded alignment results to 'comprehensive'")
        files = [
            'alignment.summary',
            'alignmentQC_out.txt',
            'alignTable.txt',
        ]
        for fname in files:
            try:
                if os.path.exists(fname):
                    os.rename(fname, fname + ".comprehensive")
            except:
                printtime('error renaming')
                traceback.print_exc()

        # Only make the graphs from the alignment of comprehensive fastq file
        if make_align_graphs:
            makeAlignGraphs()

        printtime("STARTING BARCODE ALIGNMENTS")

        if not os.path.exists(DIR_BC_FILES):
            os.mkdir(DIR_BC_FILES)

        barcodeList = parse_bcfile('barcodeList.txt')

        align_full = True
        for bcid in (x['id_str'] for x in barcodeList):
            sffName = "%s_%s_%s.sff" % (bcid, env['expName'], env['resultsName'])
            if not os.path.exists(sffName):
                printtime("No barcode SFF file found for '%s'" % bcid)
                continue

            if (align_full):
                printtime("Align All Reads")
                # If a full align is forced add a '--align-all-reads' flag
                com = "alignmentQC.pl"
                com += " --logfile %s" % os.path.join(outputdir, "alignmentQC_out.txt")
                com += " --output-dir %s" % outputdir
                com += " --input %s" % sffName
                com += " --genome %s" % env["libraryName"]
                com += " --max-plot-read-len %s" % graph_max_x
                com += " --align-all-reads"
                com += " %s %s" % (aligner_opts_rg, aligner_opts_extra)
                com += " >> ReportLog.html 2>&1"
            else:
                printtime("Align Subset of Reads")
                # Add -p 1 to enable default.sam file generation
                com = "alignmentQC.pl"
                com += " --logfile %s" % os.path.join(outputdir, "alignmentQC_out.txt")
                com += " --output-dir %s" % outputdir
                com += " --input %s" % sffName
                com += " --genome %s" % env["libraryName"]
                com += " --max-plot-read-len %s" % graph_max_x
                com += " %s %s" % (aligner_opts_rg, aligner_opts_extra)
                com += " >> ReportLog.html 2>&1"

            try:
                printtime("Alignment QC command line:\n%s" % com)
                retcode = subprocess.call(com, shell=True)
                if retcode != 0:
                    printtime("alignmentQC failed, return code: %d" % retcode)
                    alignError = open("alignment.error", "a")
                    alignError.write(com)
                    alignError.write(': \nalignmentQC returned with error code: ')
                    alignError.write(str(retcode))
                    alignError.close()
            except OSError:
                printtime('Alignment Failed to start')
                alignError = open("alignment.error", "a")
                alignError.write(str(traceback.format_exc()))
                alignError.close()
                traceback.print_exc()

            # Rename each output file based on the barcode found in the fastq filename,
            # but ignore the comprehensive fastq output files
            if os.path.exists('alignment.summary'):
                try:
                    fname = 'alignment_%s.summary' % bcid
                    os.rename('alignment.summary', fname)
                    os.rename(fname, os.path.join(DIR_BC_FILES, fname))
                    fname = 'alignmentQC_out_%s.txt' % bcid
                    os.rename('alignmentQC_out.txt', fname)
                    os.rename(fname, os.path.join(DIR_BC_FILES, fname))
                    fname = 'alignTable_%s.txt' % bcid
                    os.rename('alignTable.txt', fname)
                    os.rename(fname, os.path.join(DIR_BC_FILES, fname))

                    # move fastq, sff, bam, bai files
                    extlist = ['fastq', 'sff', 'bam', 'bam.bai']
                    for ext in extlist:
                        bcfile = "%s_%s_%s.%s" % (bcid, env['expName'], env['resultsName'], ext)
                        if os.path.isfile(bcfile):
                            os.rename(bcfile, os.path.join(DIR_BC_FILES, bcfile))
                except:
                    printtime('error renaming')
                    traceback.print_exc()

        # rename comprehensive results back to default names
        files = [
            'alignment.summary',
            'alignmentQC_out.txt',
            'alignTable.txt',
        ]
        for fname in files:
            if os.path.exists(fname + '.comprehensive'):
                os.rename(fname + '.comprehensive', fname)

        aggregate_alignment(DIR_BC_FILES, 'barcodeList.txt')
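
# Illustration of the read-length histogram rule above (hypothetical flow counts):
#
#   >>> import math
#   >>> 100 * math.trunc((0.6 * 520 + 99) / 100.0)    # 312 -> 400
#   400
#   >>> 100 * math.trunc((0.6 * 1000 + 99) / 100.0)   # 600 stays 600
#   600
#
# i.e. roughly 0.6 * flows rounded up to a multiple of 100, never below 400
# thanks to the subsequent "if graph_max_x < 400" check.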
def align_full_chip(
        SAM_META,
        libsff_path,
        align_full,
        graph_max_x,
        do_barcode,
        make_align_graphs,
        sam_parsed,
        bidirectional,
        DIR_BC_FILES,
        libraryName,
        flows,
        barcodeId,
        opts_extra,
        outputdir):

    printtime("sam_parsed is %s" % sam_parsed)

    # Now build the SAM meta data arg string
    aligner_opts_rg = '--aligner-opts-rg "'
    aligner_opts_extra = ''
    additional_aligner_opts = ''
    if sam_parsed:
        additional_aligner_opts += ' -p 1'
    if bidirectional:
        additional_aligner_opts += ' --bidirectional'
    if opts_extra:
        print ' found extra alignment options: "%s"' % opts_extra
        aligner_opts_extra = ' --aligner-opts-extra "'
        aligner_opts_extra += opts_extra + '"'

    first = True
    for key, value in SAM_META.items():
        if value:
            sam_arg = r'-R \"'
            end = r'\"'
            sam_arg = sam_arg + key + ":" + value + end
            if first:
                aligner_opts_rg = aligner_opts_rg + sam_arg
                first = False
            else:
                aligner_opts_rg = aligner_opts_rg + " " + sam_arg

    # add the trailing quote
    aligner_opts_rg = aligner_opts_rg + '"'

    if 0 < graph_max_x:
        # establish the read-length histogram range by using the simple rule: 0.6 * num-flows
        flowsUsed = 0
        try:
            flowsUsed = int(flows)
        except:
            flowsUsed = 400
        graph_max_x = 100 * math.trunc((0.6 * flowsUsed + 99) / 100.0)
        if graph_max_x < 400:
            graph_max_x = 400

    #-----------------------------------
    # DEFAULT SINGLE SFF/FASTQ BEHAVIOR - (Runs for barcoded runs too)
    #-----------------------------------
    if (align_full):
        # If a full align is forced add a '--align-all-reads' flag
        com = "alignmentQC.pl"
        com += " --logfile %s" % os.path.join(outputdir, "alignmentQC_out.txt")
        com += " --output-dir %s" % outputdir
        com += " --input %s" % libsff_path
        com += " --genome %s" % libraryName
        com += " --max-plot-read-len %s" % graph_max_x
        com += " --align-all-reads"
        com += " %s" % (additional_aligner_opts)
        com += " %s %s" % (aligner_opts_rg, aligner_opts_extra)
        com += " >> ReportLog.html 2>&1"
    else:
        com = "alignmentQC.pl"
        com += " --logfile %s" % os.path.join(outputdir, "alignmentQC_out.txt")
        com += " --output-dir %s" % outputdir
        com += " --input %s" % libsff_path
        com += " --genome %s" % libraryName
        com += " --max-plot-read-len %s" % graph_max_x
        com += " %s" % (additional_aligner_opts)
        com += " %s %s" % (aligner_opts_rg, aligner_opts_extra)
        com += " >> ReportLog.html 2>&1"

    try:
        printtime("Alignment QC command line:\n%s" % com)
        retcode = subprocess.call(com, shell=True)
        blockprocessing.add_status("alignmentQC.pl", retcode)
        if retcode != 0:
            printtime("alignmentQC failed, return code: %d" % retcode)
            alignError = open("alignment.error", "w")
            alignError.write('alignmentQC returned with error code: ')
            alignError.write(str(retcode))
            alignError.close()
    except OSError:
        printtime('Alignment Failed to start')
        alignError = open("alignment.error", "w")
        alignError.write(str(traceback.format_exc()))
        alignError.close()
        traceback.print_exc()

    if make_align_graphs:
        makeAlignGraphs()

    #--------------------------------------------
    # BARCODE HANDLING BEHAVIOR (Multiple FASTQ)
    #--------------------------------------------
    if barcodeId and do_barcode:
        printtime("Renaming non-barcoded alignment results to 'comprehensive'")
        files = [
            'alignment.summary',
            'alignmentQC_out.txt',
            'alignTable.txt',
        ]
        for fname in files:
            try:
                #if os.path.exists(fname):
                #    os.rename(fname, fname + ".comprehensive")
                shutil.copyfile(fname, fname + ".comprehensive")
            except:
                printtime('ERROR copying %s' % fname)
                traceback.print_exc()

        printtime("STARTING BARCODE ALIGNMENTS")

        barcodelist_path = 'barcodeList.txt'
        if not os.path.exists(barcodelist_path):
            barcodelist_path = '../barcodeList.txt'
        if not os.path.exists(barcodelist_path):
            barcodelist_path = '../../barcodeList.txt'
        if not os.path.exists(barcodelist_path):
            printtime('ERROR: barcodeList.txt not found')
        barcodeList = parse_bcfile(barcodelist_path)

        align_full = True

        top_dir = os.getcwd()
        try:
            os.chdir(DIR_BC_FILES)
            printtime('DEBUG changing to %s for barcodes alignment' % DIR_BC_FILES)
        except:
            printtime('ERROR missing %s folder' % DIR_BC_FILES)

        for bcid in (x['id_str'] for x in barcodeList):
            (head, tail) = os.path.split(libsff_path)
            sffName = os.path.join(head, "%s_%s" % (bcid, tail))
            if os.path.exists(sffName):
                printtime("Barcode processing for '%s': %s" % (bcid, sffName))
            else:
                printtime("No barcode SFF file found for '%s': %s" % (bcid, sffName))
                continue

            if (align_full):
                printtime("Align All Reads")
                # If a full align is forced add a '--align-all-reads' flag
                com = "alignmentQC.pl"
                com += " --logfile %s" % os.path.join(outputdir, "alignmentQC_out.txt")
                com += " --output-dir %s" % outputdir
                com += " --input %s" % sffName
                com += " --genome %s" % libraryName
                com += " --max-plot-read-len %s" % graph_max_x
                com += " --align-all-reads"
                com += " %s" % (additional_aligner_opts)
                com += " %s %s" % (aligner_opts_rg, aligner_opts_extra)
                com += " >> ReportLog.html 2>&1"
            else:
                printtime("Align Subset of Reads")
                com = "alignmentQC.pl"
                com += " --logfile %s" % os.path.join(outputdir, "alignmentQC_out.txt")
                com += " --output-dir %s" % outputdir
                com += " --input %s" % sffName
                com += " --genome %s" % libraryName
                com += " --max-plot-read-len %s" % graph_max_x
                com += " %s" % (additional_aligner_opts)
                com += " %s %s" % (aligner_opts_rg, aligner_opts_extra)
                com += " >> ReportLog.html 2>&1"

            try:
                printtime("Alignment QC command line:\n%s" % com)
                retcode = subprocess.call(com, shell=True)
                blockprocessing.add_status("alignmentQC.pl", retcode)
                if retcode != 0:
                    printtime("alignmentQC failed, return code: %d" % retcode)
                    alignError = open("alignment.error", "a")
                    alignError.write(com)
                    alignError.write(': \nalignmentQC returned with error code: ')
                    alignError.write(str(retcode))
                    alignError.close()
            except:
                printtime('ERROR: Alignment Failed to start')
                alignError = open("alignment.error", "a")
                alignError.write(str(traceback.format_exc()))
                alignError.close()
                traceback.print_exc()

            # Rename each output file based on the barcode found in the fastq filename,
            # but ignore the comprehensive fastq output files
            if os.path.exists('alignment.summary'):
                try:
                    fname = 'alignment_%s.summary' % bcid
                    os.rename('alignment.summary', fname)
                    # os.rename(fname, os.path.join(DIR_BC_FILES, fname))
                    fname = 'alignmentQC_out_%s.txt' % bcid
                    os.rename('alignmentQC_out.txt', fname)
                    # os.rename(fname, os.path.join(DIR_BC_FILES, fname))
                    fname = 'alignTable_%s.txt' % bcid
                    os.rename('alignTable.txt', fname)
                    # os.rename(fname, os.path.join(DIR_BC_FILES, fname))
                except:
                    printtime('error renaming')
                    traceback.print_exc()

        os.chdir(top_dir)

        # rename comprehensive results back to default names
        for fname in files:
            #if os.path.exists(fname + '.comprehensive'):
            #    os.rename(fname + '.comprehensive', fname)
            try:
                shutil.copyfile(fname + '.comprehensive', fname)
            except:
                printtime('ERROR copying %s' % fname + '.comprehensive')
                traceback.print_exc()

        aggregate_alignment(DIR_BC_FILES, barcodelist_path)
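
# Read-group string illustration (hypothetical SAM_META values): for
# SAM_META = {'ID': 'ABCDE', 'SM': 'myproject'}, the loop above produces
# (dict iteration order may vary in Python 2):
#
#   --aligner-opts-rg "-R \"ID:ABCDE\" -R \"SM:myproject\""
#
# which is appended to the alignmentQC.pl command line built further down.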