def merge_alignment_stats(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows):
    """Merge per-block alignment statistics into composite run results.

    For every dataset listed in datasets_basecaller.json this merges the
    per-block alignment.summary/alignTable.txt files, sums the per-block
    alignStats_err.json counters into a composite file, regenerates the
    base-error and alignment-rate plots, and (when barcodeList.txt can be
    found) aggregates the per-barcode alignment summaries.

    Args:
        dirs: block result directories whose per-dataset stats are merged.
        BASECALLER_RESULTS: directory containing datasets_basecaller.json.
        ALIGNMENT_RESULTS: directory receiving the merged composite output.
        flows: flow count; scales the error-plot x axis (falls back to 400).
    """
    datasets_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")
    try:
        # 'with' guarantees the handle is closed even when json.load raises
        # (the original leaked the handle on a parse error). The file is
        # parsed once and reused below; the original parsed it twice.
        with open(datasets_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_path)
        traceback.print_exc()
        return

    for dataset in datasets_basecaller['datasets']:
        # What needs merging:
        # - alignment.summary
        # - alignTable.txt
        # Some time in the future:
        # - alignStats_err.json
        try:
            input_prefix_list = [
                os.path.join(block_dir, ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
                for block_dir in dirs
            ]
            input_prefix_list = [
                prefix for prefix in input_prefix_list
                if os.path.exists(prefix + 'alignment.summary')
            ]
            composite_prefix = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
            if input_prefix_list:
                mergeAlignStatsResults(input_prefix_list, composite_prefix)
            else:
                printtime("Nothing to merge: " + dataset['file_prefix'])
        except Exception:
            printtime("ERROR: merging %s stats unsuccessful" % (dataset['file_prefix'] + '.bam'))

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except (TypeError, ValueError):
        # Missing or non-numeric flow count: use a sane default axis limit.
        graph_max_x = 400

    input_prefix_list = []
    for dataset in datasets_basecaller["datasets"]:
        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
            # terrible hack to make aggregate_alignment happy
            X_name = 'nomatch'
            read_group = dataset['read_groups'][0]
            if 'barcode_name' in datasets_basecaller['read_groups'][read_group]:
                X_name = datasets_basecaller['read_groups'][read_group]['barcode_name']
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except OSError:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Merge alignStats_err.json right here!
    merged_align_stats = {}
    align_stats_num_bases = 400
    for block_dir in dirs:
        stats_path = os.path.join(block_dir, ALIGNMENT_RESULTS, 'alignStats_err.json')
        try:
            with open(stats_path, 'r') as f:
                current_align_stats = json.load(f)
        except Exception:
            printtime("Merge alignStats_err.json: skipping %s" % stats_path)
            continue
        if not merged_align_stats:
            # First readable file seeds the totals; its read_length vector
            # fixes how many per-position entries later files contribute.
            merged_align_stats = current_align_stats
            align_stats_num_bases = len(merged_align_stats.get("read_length", []))
            continue
        # Per-position counters are summed element-wise.
        for key in ('nread', 'unaligned', 'filtered', 'clipped', 'aligned',
                    'n_err_at_position', 'cum_aligned', 'cum_err_at_position'):
            for idx in range(align_stats_num_bases):
                merged_align_stats[key][idx] += current_align_stats[key][idx]
        # Scalar totals are summed directly.
        for key in ('accuracy_total_bases', 'accuracy_total_errors',
                    'total_mapped_target_bases', 'total_mapped_reads'):
            merged_align_stats[key] += current_align_stats[key]

    merged_stats_path = os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json')
    try:
        with open(merged_stats_path, "w") as f:
            json.dump(merged_align_stats, f, indent=4)
    except Exception:
        # Fixed log-message typo: was "ERROR;".
        printtime("ERROR: Failed to write merged alignStats_err.json")
        traceback.print_exc()

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))
        printtime("Base error plot has been created successfully")
    except Exception:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv: look for barcodeList.txt here
    # and up to four directory levels above (composite runs keep it above
    # the block directory). Replaces the original 5-deep if-chain.
    barcodelist_path = 'barcodeList.txt'
    for _ in range(4):
        if os.path.exists(barcodelist_path):
            break
        barcodelist_path = os.path.join('..', barcodelist_path)
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)
def alignment_post_processing(BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows,
                              mark_duplicates, force_alignstats):
    """Post-process alignment results for a single (possibly barcoded) run.

    Symlinks per-barcode alignment summaries to their aggregate names,
    builds a merged rawlib.bam when one does not exist, reruns alignStats
    on the composite BAM when required, merges per-dataset summaries, and
    renders the base-error and alignment-rate plots.

    Args:
        BASECALLER_RESULTS: directory containing datasets_basecaller.json
            and the per-dataset basecaller BAMs.
        ALIGNMENT_RESULTS: directory with per-dataset alignment output;
            also receives the composite files and plots.
        flows: flow count; scales the error-plot x axis (falls back to 400).
        mark_duplicates: forwarded to blockprocessing.merge_bam_files.
        force_alignstats: when True (or when rawlib.bam had to be created
            here), rerun alignStats on the composite BAM.
    """
    datasets_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")
    try:
        # 'with' guarantees the handle is closed even when json.load raises
        # (the original leaked the handle on a parse error).
        with open(datasets_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_path)
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except (TypeError, ValueError):
        # Missing or non-numeric flow count: use a sane default axis limit.
        graph_max_x = 400

    input_prefix_list = []
    for dataset in datasets_basecaller["datasets"]:
        # Skip datasets whose basecaller BAM was never produced.
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue
        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
            # terrible hack to make aggregate_alignment happy
            X_name = 'nomatch'
            read_group = dataset['read_groups'][0]
            if 'barcode_name' in datasets_basecaller['read_groups'][read_group]:
                X_name = datasets_basecaller['read_groups'][read_group]['barcode_name']
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except OSError:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Special legacy post-processing:
    # generate a merged rawlib.bam on barcoded runs.
    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):
        bam_file_list = []
        for dataset in datasets_basecaller["datasets"]:
            bam_name = os.path.join(
                ALIGNMENT_RESULTS,
                os.path.basename(dataset['file_prefix']) + '.bam')
            if os.path.exists(bam_name):
                bam_file_list.append(bam_name)
        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename,
                                        composite_bam_filename + '.bai',
                                        mark_duplicates)
        # A freshly merged composite BAM has no alignStats output yet.
        force_alignstats = True

    if force_alignstats:
        # Generate data for error plot for barcoded run from composite bam.
        printtime("Call alignStats to generate raw accuracy")
        try:
            # NOTE(review): command string is passed through the shell via
            # os.system; paths come from the pipeline, but a subprocess.run
            # argument list would be safer — confirm before changing.
            cmd = "alignStats"
            cmd += " -n 12"
            cmd += " --alignSummaryFile alignStats_err.txt"
            cmd += " --alignSummaryJsonFile alignStats_err.json"
            cmd += " --alignSummaryMinLen 1"
            # cmd += " --alignSummaryMaxLen %s" % str(int(graph_max_x))
            cmd += " --alignSummaryMaxLen 400"
            cmd += " --alignSummaryLenStep 1"
            cmd += " --alignSummaryMaxErr 10"
            cmd += " --infile %s" % composite_bam_filename
            cmd += " --outputDir %s" % ALIGNMENT_RESULTS
            printtime("DEBUG: Calling '%s'" % cmd)
            os.system(cmd)
        except Exception:
            printtime("alignStats failed")

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))
        # Create aligned histogram plot
        # Create AQ20 plot
        printtime("Base error plot has been created successfully")
    except Exception:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv: look for barcodeList.txt here
    # and up to four directory levels above. Replaces the original 5-deep
    # if-chain.
    barcodelist_path = 'barcodeList.txt'
    for _ in range(4):
        if os.path.exists(barcodelist_path):
            break
        barcodelist_path = os.path.join('..', barcodelist_path)
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)

    # These graphs are likely obsolete
    makeAlignGraphs()
if os.path.exists(ionstats_path): file_list.append(ionstats_path) elif os.path.exists(ionstats_path_CA): file_list.append(ionstats_path_CA) else: raise Exception('') ionstats.reduce_stats(file_list, ionstats_file) # Make alignment_rate_plot.png stats = json.load(open(ionstats_file)) l = stats['full']['max_read_length'] graph_max_x = int(round(l + 49, -2)) ionstats_plots.alignment_rate_plot( 'alignStats_err.json', 'ionstats_basecaller.json', 'alignment_rate_plot.png', int(graph_max_x)) print("Ionstats plot created successfully") except: print("ERROR: Failed to generate alignment rate plot") try: # Make base_error_plot.png base_error_plot.generate_base_error_plot( 'alignStats_err.json', 'base_error_plot.png',int(graph_max_x)) except: print("ERROR: Failed to generate base error plot") traceback.print_exc()
if os.path.exists(ionstats_path): file_list.append(ionstats_path) elif os.path.exists(ionstats_path_CA): file_list.append(ionstats_path_CA) else: raise Exception('') ionstats.reduce_stats(file_list, ionstats_file) # Make alignment_rate_plot.png stats = json.load(open(ionstats_file)) l = stats['full']['max_read_length'] graph_max_x = int(round(l + 49, -2)) ionstats_plots.alignment_rate_plot('alignStats_err.json', 'ionstats_basecaller.json', 'alignment_rate_plot.png', int(graph_max_x)) print("Ionstats plot created successfully") except: print("ERROR: Failed to generate alignment rate plot") try: # Make base_error_plot.png base_error_plot.generate_base_error_plot('alignStats_err.json', 'base_error_plot.png', int(graph_max_x)) except: print("ERROR: Failed to generate base error plot") traceback.print_exc() if args.zip and len(args.files) > 1:
def merge_alignment_stats(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows):
    """Merge per-block alignment statistics into composite run results.

    For every dataset listed in datasets_basecaller.json this merges the
    per-block alignment.summary/alignTable.txt files, sums the per-block
    alignStats_err.json counters into a composite file, regenerates the
    base-error and alignment-rate plots, and (when barcodeList.txt can be
    found) aggregates the per-barcode alignment summaries.

    Args:
        dirs: block result directories whose per-dataset stats are merged.
        BASECALLER_RESULTS: directory containing datasets_basecaller.json.
        ALIGNMENT_RESULTS: directory receiving the merged composite output.
        flows: flow count; scales the error-plot x axis (falls back to 400).
    """
    datasets_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")
    try:
        # 'with' guarantees the handle is closed even when json.load raises
        # (the original leaked the handle on a parse error). The file is
        # parsed once and reused below; the original parsed it twice.
        with open(datasets_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_path)
        traceback.print_exc()
        return

    for dataset in datasets_basecaller['datasets']:
        # What needs merging:
        # - alignment.summary
        # - alignTable.txt
        # Some time in the future:
        # - alignStats_err.json
        try:
            input_prefix_list = [
                os.path.join(block_dir, ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
                for block_dir in dirs
            ]
            input_prefix_list = [
                prefix for prefix in input_prefix_list
                if os.path.exists(prefix + 'alignment.summary')
            ]
            composite_prefix = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
            if input_prefix_list:
                mergeAlignStatsResults(input_prefix_list, composite_prefix)
            else:
                printtime("Nothing to merge: " + dataset['file_prefix'])
        except Exception:
            printtime("ERROR: merging %s stats unsuccessful" % (dataset['file_prefix'] + '.bam'))

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except (TypeError, ValueError):
        # Missing or non-numeric flow count: use a sane default axis limit.
        graph_max_x = 400

    input_prefix_list = []
    for dataset in datasets_basecaller["datasets"]:
        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
            # terrible hack to make aggregate_alignment happy
            X_name = 'nomatch'
            read_group = dataset['read_groups'][0]
            if 'barcode_name' in datasets_basecaller['read_groups'][read_group]:
                X_name = datasets_basecaller['read_groups'][read_group]['barcode_name']
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except OSError:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Merge alignStats_err.json right here!
    merged_align_stats = {}
    align_stats_num_bases = 400
    for block_dir in dirs:
        stats_path = os.path.join(block_dir, ALIGNMENT_RESULTS, 'alignStats_err.json')
        try:
            with open(stats_path, 'r') as f:
                current_align_stats = json.load(f)
        except Exception:
            printtime("Merge alignStats_err.json: skipping %s" % stats_path)
            continue
        if not merged_align_stats:
            # First readable file seeds the totals; its read_length vector
            # fixes how many per-position entries later files contribute.
            merged_align_stats = current_align_stats
            align_stats_num_bases = len(merged_align_stats.get("read_length", []))
            continue
        # Per-position counters are summed element-wise.
        for key in ('nread', 'unaligned', 'filtered', 'clipped', 'aligned',
                    'n_err_at_position', 'cum_aligned', 'cum_err_at_position'):
            for idx in range(align_stats_num_bases):
                merged_align_stats[key][idx] += current_align_stats[key][idx]
        # Scalar totals are summed directly.
        for key in ('accuracy_total_bases', 'accuracy_total_errors',
                    'total_mapped_target_bases', 'total_mapped_reads'):
            merged_align_stats[key] += current_align_stats[key]

    merged_stats_path = os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json')
    try:
        with open(merged_stats_path, "w") as f:
            json.dump(merged_align_stats, f, indent=4)
    except Exception:
        # Fixed log-message typo: was "ERROR;".
        printtime("ERROR: Failed to write merged alignStats_err.json")
        traceback.print_exc()

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))
        printtime("Base error plot has been created successfully")
    except Exception:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv: look for barcodeList.txt here
    # and up to four directory levels above (composite runs keep it above
    # the block directory). Replaces the original 5-deep if-chain.
    barcodelist_path = 'barcodeList.txt'
    for _ in range(4):
        if os.path.exists(barcodelist_path):
            break
        barcodelist_path = os.path.join('..', barcodelist_path)
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)
def alignment_post_processing(BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows,
                              mark_duplicates, force_alignstats):
    """Post-process alignment results for a single (possibly barcoded) run.

    Symlinks per-barcode alignment summaries to their aggregate names,
    builds a merged rawlib.bam when one does not exist, reruns alignStats
    on the composite BAM when required, merges per-dataset summaries, and
    renders the base-error and alignment-rate plots.

    Args:
        BASECALLER_RESULTS: directory containing datasets_basecaller.json
            and the per-dataset basecaller BAMs.
        ALIGNMENT_RESULTS: directory with per-dataset alignment output;
            also receives the composite files and plots.
        flows: flow count; scales the error-plot x axis (falls back to 400).
        mark_duplicates: forwarded to blockprocessing.merge_bam_files.
        force_alignstats: when True (or when rawlib.bam had to be created
            here), rerun alignStats on the composite BAM.
    """
    datasets_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")
    try:
        # 'with' guarantees the handle is closed even when json.load raises
        # (the original leaked the handle on a parse error).
        with open(datasets_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_path)
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except (TypeError, ValueError):
        # Missing or non-numeric flow count: use a sane default axis limit.
        graph_max_x = 400

    input_prefix_list = []
    for dataset in datasets_basecaller["datasets"]:
        # Skip datasets whose basecaller BAM was never produced.
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue
        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
            # terrible hack to make aggregate_alignment happy
            X_name = 'nomatch'
            read_group = dataset['read_groups'][0]
            if 'barcode_name' in datasets_basecaller['read_groups'][read_group]:
                X_name = datasets_basecaller['read_groups'][read_group]['barcode_name']
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except OSError:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Special legacy post-processing:
    # generate a merged rawlib.bam on barcoded runs.
    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):
        bam_file_list = []
        for dataset in datasets_basecaller["datasets"]:
            bam_name = os.path.join(
                ALIGNMENT_RESULTS,
                os.path.basename(dataset['file_prefix']) + '.bam')
            if os.path.exists(bam_name):
                bam_file_list.append(bam_name)
        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename,
                                        composite_bam_filename + '.bai',
                                        mark_duplicates)
        # A freshly merged composite BAM has no alignStats output yet.
        force_alignstats = True

    if force_alignstats:
        # Generate data for error plot for barcoded run from composite bam.
        printtime("Call alignStats to generate raw accuracy")
        try:
            # NOTE(review): command string is passed through the shell via
            # os.system; paths come from the pipeline, but a subprocess.run
            # argument list would be safer — confirm before changing.
            cmd = "alignStats"
            cmd += " -n 12"
            cmd += " --alignSummaryFile alignStats_err.txt"
            cmd += " --alignSummaryJsonFile alignStats_err.json"
            cmd += " --alignSummaryMinLen 1"
            # cmd += " --alignSummaryMaxLen %s" % str(int(graph_max_x))
            cmd += " --alignSummaryMaxLen 400"
            cmd += " --alignSummaryLenStep 1"
            cmd += " --alignSummaryMaxErr 10"
            cmd += " --infile %s" % composite_bam_filename
            cmd += " --outputDir %s" % ALIGNMENT_RESULTS
            printtime("DEBUG: Calling '%s'" % cmd)
            os.system(cmd)
        except Exception:
            printtime("alignStats failed")

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))
        # Create aligned histogram plot
        # Create AQ20 plot
        printtime("Base error plot has been created successfully")
    except Exception:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv: look for barcodeList.txt here
    # and up to four directory levels above. Replaces the original 5-deep
    # if-chain.
    barcodelist_path = 'barcodeList.txt'
    for _ in range(4):
        if os.path.exists(barcodelist_path):
            break
        barcodelist_path = os.path.join('..', barcodelist_path)
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)

    # These graphs are likely obsolete
    makeAlignGraphs()