def process_datasets(
    blocks,
    alignmentArgs,
    ionstatsArgs,
    BASECALLER_RESULTS,
    basecaller_meta_information,
    library_key,
    graph_max_x,
    basecaller_datasets,
    ALIGNMENT_RESULTS,
    do_realign,
    do_ionstats,
    do_mark_duplicates,
    do_indexing,
    barcodeInfo,
):
    """Align all non-empty datasets in a worker pool, then merge their ionstats.

    Every entry of ``basecaller_datasets["datasets"]`` whose ``read_count`` is
    non-zero is handed to ``align_dataset_parallel_wrap`` through a
    ``multiprocessing.Pool``.  When ``do_ionstats`` is true, the per-dataset
    ionstats JSON files of the datasets that are not flagged as filtered are
    reduced into run-level files below ``ALIGNMENT_RESULTS`` and
    ``BASECALLER_RESULTS``.

    The pool size depends on the value returned by ``_get_total_memory_gb()``
    (presumably total RAM in GB -- confirm against that helper): more than 140
    gives 4 workers, at least 70 gives 2, otherwise 1.

    Args:
        blocks: Forwarded unchanged to every alignment worker.
        alignmentArgs: Forwarded unchanged to every alignment worker.
        ionstatsArgs: Forwarded to every worker and to
            ``ionstats.generate_ionstats_alignment``.
        BASECALLER_RESULTS: Directory part of the basecaller ionstats paths.
        basecaller_meta_information: Forwarded to the workers and to ionstats.
        library_key: Forwarded to the workers and to ionstats.
        graph_max_x: Forwarded to the workers and to ionstats.
        basecaller_datasets: Mapping with a ``"datasets"`` list and a
            ``"read_groups"`` mapping keyed by read group name.
        ALIGNMENT_RESULTS: Directory part of the alignment ionstats paths.
        do_realign: Forwarded unchanged to every alignment worker.
        do_ionstats: Forwarded to the workers; also enables the final merge.
        do_mark_duplicates: Forwarded unchanged to every alignment worker.
        do_indexing: Forwarded unchanged to every alignment worker.
        barcodeInfo: Forwarded unchanged to every alignment worker.

    Raises:
        RuntimeError: If the placeholder ``empty_dummy.bam`` cannot be created.
    """
    parallel_datasets = 1
    memTotalGb = _get_total_memory_gb()
    try:
        if memTotalGb > 140:
            parallel_datasets = 4
        elif memTotalGb >= 70:
            parallel_datasets = 2
    except Exception:
        # Best effort: if the memory value cannot be compared, stay serial.
        pass

    # Floor division keeps the thread count an integer on Python 3 as well.
    align_threads = multiprocessing.cpu_count() // parallel_datasets
    if memTotalGb <= 40:
        # reduce number of CPU (1 vCPU = 2 cores)
        align_threads = align_threads - 2
    # Small machines would otherwise end up with zero or negative threads.
    align_threads = max(1, align_threads)

    printtime("Attempt to align")
    printtime(
        "DEBUG: PROCESS DATASETS blocks: '%s', parallel datasets: %d"
        % (blocks, parallel_datasets)
    )

    # TODO: compare with pipeline/python/ion/utils/ionstats.py
    # Only datasets that are not flagged as filtered take part in the merge.
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []
    align_dataset_args = []

    for dataset in basecaller_datasets["datasets"]:
        read_group = dataset["read_groups"][0]
        reference = basecaller_datasets["read_groups"][read_group]["reference"]

        # A dataset counts as filtered only if all of its read groups are.
        filtered = True
        for rg_name in dataset["read_groups"]:
            if not basecaller_datasets["read_groups"][rg_name].get("filtered", False):
                filtered = False

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        align_dataset_args.append(
            (
                dataset,
                blocks,
                reference,
                alignmentArgs,
                ionstatsArgs,
                BASECALLER_RESULTS,
                basecaller_meta_information,
                library_key,
                graph_max_x,
                ALIGNMENT_RESULTS,
                do_realign,
                do_ionstats,
                do_mark_duplicates,
                do_indexing,
                align_threads,
                barcodeInfo,
            )
        )

        if reference:
            stats_path = os.path.join(
                ALIGNMENT_RESULTS, dataset["file_prefix"] + ".ionstats_alignment.json"
            )
            if not filtered:
                ionstats_alignment_file_list.append(stats_path)
        else:
            stats_path = os.path.join(
                BASECALLER_RESULTS, dataset["file_prefix"] + ".ionstats_basecaller.json"
            )
            if not filtered:
                ionstats_basecaller_file_list.append(stats_path)

    # do alignment in multiprocessing pool; always release the workers
    pool = multiprocessing.Pool(processes=parallel_datasets)
    try:
        pool.map(align_dataset_parallel_wrap, align_dataset_args)
    finally:
        pool.close()
        pool.join()

    if do_ionstats:
        alignment_json = os.path.join(ALIGNMENT_RESULTS, "ionstats_alignment.json")
        tmp_basecaller_json = os.path.join(
            BASECALLER_RESULTS, "ionstats_tmp_basecaller.json"
        )

        # Merge ionstats files from individual (barcoded) datasets
        if ionstats_alignment_file_list:
            ionstats.reduce_stats(ionstats_alignment_file_list, alignment_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            _create_empty_dummy_bam()
            ionstats.generate_ionstats_alignment(
                ionstatsArgs,
                ["empty_dummy.bam"],
                alignment_json,
                os.path.join(ALIGNMENT_RESULTS, "ionstats_error_summary.h5"),
                basecaller_meta_information,
                library_key,
                graph_max_x,
            )

        if ionstats_basecaller_file_list:
            ionstats.reduce_stats(ionstats_basecaller_file_list, tmp_basecaller_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            _create_empty_dummy_bam()
            ionstats.generate_ionstats_basecaller(
                ["empty_dummy.bam"],
                tmp_basecaller_json,
                library_key,
                graph_max_x,
            )

        ionstatslist = [
            path
            for path in (alignment_json, tmp_basecaller_json)
            if os.path.exists(path)
        ]
        if ionstatslist:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(
                    BASECALLER_RESULTS, "ionstats_basecaller_with_aligninfos.json"
                ),
            )
            # Same inputs in the opposite order for the plain basecaller file.
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, "ionstats_basecaller.json"),
            )
        # TODO: merging the per-dataset ionstats_error_summary.h5 files
        # (ionstats.reduce_stats_h5) is not enabled.

    printtime("**** Alignment completed ****")


def _create_empty_dummy_bam():
    """Write a header-only ``empty_dummy.bam`` into the current directory.

    Raises:
        RuntimeError: If the shell pipeline exits with a non-zero status.
    """
    # The \t and \n escapes are expanded by Python, so the shell receives
    # literal tabs and newlines inside the single-quoted header text.
    cmd = "echo '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
    printtime("DEBUG: Calling '%s':" % cmd)
    ret = subprocess.call(cmd, shell=True)
    if ret != 0:
        printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
        raise RuntimeError("exit code: %d" % ret)
def process_datasets(
        blocks,
        alignmentArgs,
        ionstatsArgs,
        BASECALLER_RESULTS,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        basecaller_datasets,
        ALIGNMENT_RESULTS,
        do_realign,
        do_ionstats,
        do_mark_duplicates,
        do_indexing,
        barcodeInfo):
    """Align all non-empty datasets in a worker pool, then merge their ionstats.

    Every entry of ``basecaller_datasets["datasets"]`` whose ``read_count`` is
    non-zero is handed to ``align_dataset_parallel_wrap`` through a
    ``multiprocessing.Pool``.  Two workers are used when
    ``SC_PAGE_SIZE * SC_PHYS_PAGES`` exceeds 70 GiB, otherwise one.  When
    ``do_ionstats`` is true, the per-dataset ionstats JSON files of the
    datasets that are not flagged as filtered are reduced into run-level files
    below ``ALIGNMENT_RESULTS`` and ``BASECALLER_RESULTS``.

    Args:
        blocks: Forwarded unchanged to every alignment worker.
        alignmentArgs: Forwarded unchanged to every alignment worker.
        ionstatsArgs: Forwarded to every worker and to
            ``ionstats.generate_ionstats_alignment``.
        BASECALLER_RESULTS: Directory part of the basecaller ionstats paths.
        basecaller_meta_information: Forwarded to the workers and to ionstats.
        library_key: Forwarded to the workers and to ionstats.
        graph_max_x: Forwarded to the workers and to ionstats.
        basecaller_datasets: Mapping with a ``"datasets"`` list and a
            ``"read_groups"`` mapping keyed by read group name.
        ALIGNMENT_RESULTS: Directory part of the alignment ionstats paths.
        do_realign: Forwarded unchanged to every alignment worker.
        do_ionstats: Forwarded to the workers; also enables the final merge.
        do_mark_duplicates: Forwarded unchanged to every alignment worker.
        do_indexing: Forwarded unchanged to every alignment worker.
        barcodeInfo: Accepted for interface compatibility; not used here.

    Raises:
        RuntimeError: If the placeholder ``empty_dummy.bam`` cannot be created.
    """
    parallel_datasets = 1
    try:
        memTotalGb = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / (1024 * 1024 * 1024)
        if memTotalGb > 70:
            parallel_datasets = 2
    except (AttributeError, ValueError, OSError):
        # Best effort: sysconf is missing or does not know these names,
        # so stay with a single worker.
        pass

    # Floor division keeps the thread count an integer on Python 3 as well;
    # never hand out fewer than one thread.
    align_threads = max(1, multiprocessing.cpu_count() // parallel_datasets)

    printtime("Attempt to align")
    printtime("DEBUG: PROCESS DATASETS blocks: '%s', parallel datasets: %d" % (blocks, parallel_datasets))

    # TODO: compare with pipeline/python/ion/utils/ionstats.py
    # Only datasets that are not flagged as filtered take part in the merge.
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []
    align_dataset_args = []

    for dataset in basecaller_datasets["datasets"]:
        read_group = dataset['read_groups'][0]
        reference = basecaller_datasets['read_groups'][read_group]['reference']

        # A dataset counts as filtered only if all of its read groups are.
        filtered = True
        for rg_name in dataset["read_groups"]:
            if not basecaller_datasets["read_groups"][rg_name].get('filtered', False):
                filtered = False

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        align_dataset_args.append((
            dataset,
            blocks,
            reference,
            alignmentArgs,
            ionstatsArgs,
            BASECALLER_RESULTS,
            basecaller_meta_information,
            library_key,
            graph_max_x,
            ALIGNMENT_RESULTS,
            do_realign,
            do_ionstats,
            do_mark_duplicates,
            do_indexing,
            align_threads,
        ))

        if reference:
            stats_path = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json')
            if not filtered:
                ionstats_alignment_file_list.append(stats_path)
        else:
            stats_path = os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json')
            if not filtered:
                ionstats_basecaller_file_list.append(stats_path)

    # do alignment in multiprocessing pool; always release the workers
    pool = multiprocessing.Pool(processes=parallel_datasets)
    try:
        pool.map(align_dataset_parallel_wrap, align_dataset_args)
    finally:
        pool.close()
        pool.join()

    if do_ionstats:
        alignment_json = os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json')
        tmp_basecaller_json = os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json')

        # Merge ionstats files from individual (barcoded) datasets
        if ionstats_alignment_file_list:
            ionstats.reduce_stats(ionstats_alignment_file_list, alignment_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            _create_empty_dummy_bam()
            ionstats.generate_ionstats_alignment(
                ionstatsArgs,
                ['empty_dummy.bam'],
                alignment_json,
                os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'),
                basecaller_meta_information,
                library_key,
                graph_max_x)

        if ionstats_basecaller_file_list:
            ionstats.reduce_stats(ionstats_basecaller_file_list, tmp_basecaller_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            _create_empty_dummy_bam()
            ionstats.generate_ionstats_basecaller(
                ['empty_dummy.bam'],
                tmp_basecaller_json,
                library_key,
                graph_max_x)

        ionstatslist = [path for path in (alignment_json, tmp_basecaller_json) if os.path.exists(path)]
        if ionstatslist:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller_with_aligninfos.json'))
            # Same inputs in the opposite order for the plain basecaller file.
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))
        # TODO: merging the per-dataset ionstats_error_summary.h5 files
        # (ionstats.reduce_stats_h5) is not enabled.

    printtime("**** Alignment completed ****")


def _create_empty_dummy_bam():
    """Write a header-only ``empty_dummy.bam`` into the current directory.

    Raises:
        RuntimeError: If the shell pipeline exits with a non-zero status.
    """
    # The \t and \n escapes are expanded by Python, so the shell receives
    # literal tabs and newlines inside the single-quoted header text.
    cmd = "echo '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
    printtime("DEBUG: Calling '%s':" % cmd)
    ret = subprocess.call(cmd, shell=True)
    if ret != 0:
        printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
        raise RuntimeError('exit code: %d' % ret)
def process_datasets(blocks, alignmentArgs, ionstatsArgs, BASECALLER_RESULTS,
                     basecaller_meta_information, library_key, graph_max_x,
                     basecaller_datasets, ALIGNMENT_RESULTS, do_realign,
                     do_ionstats, do_mark_duplicates, do_indexing,
                     barcodeInfo):
    """Align every non-empty dataset one after another and merge the ionstats.

    A dataset that has a reference, spans more than one block and holds more
    than 20,000,000 reads is aligned block by block and the per-block BAM
    files are merged afterwards; every other dataset goes through a single
    ``align`` call over all blocks.  A failure while processing one dataset is
    printed with its traceback and does not stop the remaining datasets.
    When ``do_ionstats`` is true, the per-dataset ionstats JSON files of the
    datasets that are not flagged as filtered are reduced into run-level files
    below ``ALIGNMENT_RESULTS`` and ``BASECALLER_RESULTS``.

    Args:
        blocks: Block directories; passed to ``align``.
        alignmentArgs: Passed through to ``align``.
        ionstatsArgs: Passed to ``align`` and to the ionstats helpers.
        BASECALLER_RESULTS: Directory part of the unaligned BAM and basecaller
            ionstats paths.
        basecaller_meta_information: Passed to ``align`` and to ionstats.
        library_key: Passed to ``align`` and to ionstats.
        graph_max_x: Passed to ``align`` and to ionstats.
        basecaller_datasets: Mapping with a ``"datasets"`` list and a
            ``"read_groups"`` mapping keyed by read group name.
        ALIGNMENT_RESULTS: Directory part of the alignment output paths.
        do_realign: Passed through to ``align``.
        do_ionstats: Enables ionstats generation and the final merge.
        do_mark_duplicates: Enables duplicate marking.
        do_indexing: Enables BAM index creation.
        barcodeInfo: Accepted for interface compatibility; not used here.

    Raises:
        RuntimeError: If the placeholder ``empty_dummy.bam`` cannot be created.
    """
    printtime("Attempt to align")
    printtime("DEBUG: PROCESS DATASETS blocks: '%s'" % blocks)

    do_sorting = True

    # TODO: compare with pipeline/python/ion/utils/ionstats.py
    # Only datasets that are not flagged as filtered take part in the merge.
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []

    for dataset in basecaller_datasets["datasets"]:
        read_group = dataset['read_groups'][0]
        reference = basecaller_datasets['read_groups'][read_group]['reference']

        # A dataset counts as filtered only if all of its read groups are.
        filtered = True
        for rg_name in dataset["read_groups"]:
            if not basecaller_datasets["read_groups"][rg_name].get('filtered', False):
                filtered = False

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        try:
            # process block by block
            if reference and len(blocks) > 1 and int(dataset["read_count"]) > 20000000:
                _align_blockwise_and_merge(
                    dataset, blocks, reference, alignmentArgs, ionstatsArgs,
                    BASECALLER_RESULTS, basecaller_meta_information,
                    library_key, graph_max_x, ALIGNMENT_RESULTS, do_realign,
                    do_ionstats, do_sorting, do_mark_duplicates, do_indexing)
            else:
                printtime(
                    "DEBUG: MERGED BLOCK PROCESSING ----------- prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                    % (dataset['file_prefix'], reference, dataset["read_count"]))
                # TODO: try a python multiprocessing pool
                align(blocks,
                      os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']),
                      alignmentArgs,
                      ionstatsArgs,
                      reference,
                      basecaller_meta_information,
                      library_key,
                      graph_max_x,
                      do_realign,
                      do_ionstats,
                      do_sorting,
                      do_mark_duplicates,
                      do_indexing,
                      output_dir=ALIGNMENT_RESULTS if reference else BASECALLER_RESULTS,
                      output_basename=dataset['file_prefix'])
        except Exception:
            # Best effort per dataset; Ctrl-C and SystemExit still propagate.
            traceback.print_exc()

        if reference:
            stats_path = os.path.join(
                ALIGNMENT_RESULTS,
                dataset['file_prefix'] + '.ionstats_alignment.json')
            if not filtered:
                ionstats_alignment_file_list.append(stats_path)
        else:
            stats_path = os.path.join(
                BASECALLER_RESULTS,
                dataset['file_prefix'] + '.ionstats_basecaller.json')
            if not filtered:
                ionstats_basecaller_file_list.append(stats_path)

    if do_ionstats:
        alignment_json = os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json')
        tmp_basecaller_json = os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json')

        # Merge ionstats files from individual (barcoded) datasets
        if ionstats_alignment_file_list:
            ionstats.reduce_stats(ionstats_alignment_file_list, alignment_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            _create_empty_dummy_bam()
            ionstats.generate_ionstats_alignment(
                ionstatsArgs, ['empty_dummy.bam'],
                alignment_json,
                os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'),
                basecaller_meta_information, library_key, graph_max_x)

        if ionstats_basecaller_file_list:
            ionstats.reduce_stats(ionstats_basecaller_file_list, tmp_basecaller_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            _create_empty_dummy_bam()
            ionstats.generate_ionstats_basecaller(
                ['empty_dummy.bam'],
                tmp_basecaller_json,
                library_key, graph_max_x)

        ionstatslist = [path for path in (alignment_json, tmp_basecaller_json)
                        if os.path.exists(path)]
        if ionstatslist:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(BASECALLER_RESULTS,
                             'ionstats_basecaller_with_aligninfos.json'))
            # Same inputs in the opposite order for the plain basecaller file.
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))
        # TODO: merging the per-dataset ionstats_error_summary.h5 files
        # (ionstats.reduce_stats_h5) is not enabled.

    printtime("**** Alignment completed ****")


def _align_blockwise_and_merge(dataset, blocks, reference, alignmentArgs,
                               ionstatsArgs, BASECALLER_RESULTS,
                               basecaller_meta_information, library_key,
                               graph_max_x, ALIGNMENT_RESULTS, do_realign,
                               do_ionstats, do_sorting, do_mark_duplicates,
                               do_indexing):
    """Align one dataset block by block and merge the per-block BAM files.

    Raises:
        RuntimeError: If the merge pipeline exits with a non-zero status.
    """
    printtime(
        "DEBUG: TRADITIONAL BLOCK PROCESSING ------ prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
        % (dataset['file_prefix'], reference, dataset["read_count"]))

    # start alignment for each block and current barcode with reads
    # TODO: in how many blocks are reads with this barcode
    for block in blocks:
        printtime("DEBUG: ALIGN ONLY ONE BLOCK: %s" % block)
        # ionstats, duplicate marking and indexing are deferred to the
        # merged file, so they are switched off for the per-block runs.
        align([block],
              os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']),
              alignmentArgs,
              ionstatsArgs,
              reference,
              basecaller_meta_information,
              library_key,
              graph_max_x,
              do_realign,
              do_ionstats=False,
              do_sorting=do_sorting,
              do_mark_duplicates=False,
              do_indexing=False,
              output_dir=os.path.join(block, ALIGNMENT_RESULTS),
              output_basename=dataset['file_prefix'])

    bamdir = '.'  # TODO , do we need this ?
    bamBase = dataset['file_prefix']
    composite_bam_filepath = bamBase + ".bam"

    # Only blocks that actually produced a BAM file take part in the merge.
    block_bam_list = [
        os.path.join(blockdir, bamdir, composite_bam_filepath)
        for blockdir in blocks
    ]
    block_bam_list = [
        block_bam_filename for block_bam_filename in block_bam_list
        if os.path.exists(block_bam_filename)
    ]
    printtime("blocks with reads: %s" % len(block_bam_list))

    blockprocessing.extract_and_merge_bam_header(block_bam_list, composite_bam_filepath)

    # Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
    # Always stream the merged BAM to stdout ('-'): the shell redirect or
    # BamDuplicates below is what writes the composite file.  Naming the
    # composite file as the merge output as well would make two writers
    # target the same path.
    merge_parts = ['samtools merge -l1 -@8 -']
    merge_parts.extend(block_bam_list)
    merge_parts.append('-h %s.header.sam' % composite_bam_filepath)
    cmd = ' '.join(merge_parts)

    if do_ionstats:
        # ionstats reads a copy of the merged stream through tee.
        ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
            ionstatsArgs,
            ["/dev/stdin"],
            "%s.ionstats_alignment.json" % bamBase,
            "%s.ionstats_error_summary.h5" % bamBase,
            basecaller_meta_information,
            library_key,
            graph_max_x)
        cmd += " | tee >(%s)" % ionstats_cmd

    if do_mark_duplicates:
        json_name = 'BamDuplicates.%s.json' % bamBase if bamBase != 'rawlib' else 'BamDuplicates.json'
        cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, composite_bam_filepath, json_name)
    else:
        cmd += " > %s" % composite_bam_filepath

    printtime("DEBUG: Calling '%s':" % cmd)
    # Process substitution (<(...), >(...)) needs bash rather than /bin/sh.
    ret = subprocess.call(['/bin/bash', '-c', cmd])
    if ret != 0:
        printtime("ERROR: merging failed, return code: %d" % ret)
        raise RuntimeError('exit code: %d' % ret)

    # TODO: piping into samtools index or create index in sort process ?
    if do_indexing and do_sorting:
        cmd = "samtools index " + composite_bam_filepath
        printtime("DEBUG: Calling '%s':" % cmd)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            # Indexing stays best effort, but the failure is now visible.
            printtime("ERROR: indexing failed, return code: %d" % ret)


def _create_empty_dummy_bam():
    """Write a header-only ``empty_dummy.bam`` into the current directory.

    Raises:
        RuntimeError: If the shell pipeline exits with a non-zero status.
    """
    # The \t and \n escapes are expanded by Python, so the shell receives
    # literal tabs and newlines inside the single-quoted header text.
    cmd = "echo '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
    printtime("DEBUG: Calling '%s':" % cmd)
    ret = subprocess.call(cmd, shell=True)
    if ret != 0:
        printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
        raise RuntimeError('exit code: %d' % ret)
def process_datasets(
        blocks,
        alignmentArgs,
        ionstatsArgs,
        BASECALLER_RESULTS,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        basecaller_datasets,
        ALIGNMENT_RESULTS,
        do_realign,
        do_ionstats,
        do_mark_duplicates,
        do_indexing,
        barcodeInfo):
    """Merge, align and collect ionstats for every non-empty dataset.

    For each entry of ``basecaller_datasets["datasets"]`` with a non-zero
    ``read_count`` the per-block unmapped BAM files are merged first.  A
    dataset with a reference is then passed to ``align``; for a dataset
    without one, basecaller ionstats are generated when ``do_ionstats`` is
    true.  Merge and alignment failures are printed with their traceback and
    do not stop the remaining datasets.  When ``do_ionstats`` is true, the
    per-dataset ionstats JSON files of the datasets that are not flagged as
    filtered are finally reduced into run-level files below
    ``ALIGNMENT_RESULTS`` and ``BASECALLER_RESULTS``.

    Args:
        blocks: Block directories; searched for unmapped BAM files and
            passed to ``align``.
        alignmentArgs: Passed through to ``align``.
        ionstatsArgs: Passed to ``align`` and to the ionstats helpers.
        BASECALLER_RESULTS: Directory part of the unmapped BAM and basecaller
            ionstats paths.
        basecaller_meta_information: Passed to ``align`` and to ionstats.
        library_key: Passed to ``align`` and to ionstats.
        graph_max_x: Passed to ``align`` and to ionstats.
        basecaller_datasets: Mapping with a ``"datasets"`` list and a
            ``"read_groups"`` mapping keyed by read group name.
        ALIGNMENT_RESULTS: Directory part of the alignment output paths.
        do_realign: Passed through to ``align``.
        do_ionstats: Passed to ``align``; also enables basecaller ionstats
            and the final merge.
        do_mark_duplicates: Passed through to ``align``.
        do_indexing: Passed through to ``align``.
        barcodeInfo: Accepted for interface compatibility; not used here.

    Raises:
        RuntimeError: If the placeholder ``empty_dummy.bam`` cannot be created.
    """
    printtime("Attempt to align")

    do_sorting = True

    # compare with pipeline/python/ion/utils/ionstats.py
    # Only datasets that are not flagged as filtered take part in the merge.
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []

    for dataset in basecaller_datasets["datasets"]:
        read_group = dataset['read_groups'][0]
        reference = basecaller_datasets['read_groups'][read_group]['reference']

        # A dataset counts as filtered only if all of its read groups are.
        filtered = True
        for rg_name in dataset["read_groups"]:
            if not basecaller_datasets["read_groups"][rg_name].get('filtered', False):
                filtered = False

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        # merge unmapped bam files (with or without reference)
        # TODO move into align
        _merge_unmapped_block_bams(blocks, BASECALLER_RESULTS, dataset)

        if reference:
            try:
                align(
                    blocks,
                    alignmentArgs,
                    ionstatsArgs,
                    reference,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x,
                    os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']),
                    do_realign,
                    do_ionstats,
                    do_sorting,
                    do_mark_duplicates,
                    do_indexing,
                    logfile=os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignmentQC_out.txt'),
                    output_dir=ALIGNMENT_RESULTS,
                    output_basename=dataset['file_prefix'])
            except Exception:
                # Best effort per dataset; Ctrl-C and SystemExit still propagate.
                traceback.print_exc()

            stats_path = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json')
            if not filtered:
                ionstats_alignment_file_list.append(stats_path)
            # Index creation is requested from align() through do_indexing.
        else:
            stats_path = os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json')
            if do_ionstats:
                # TODO: move ionstats basecaller into basecaller
                ionstats.generate_ionstats_basecaller(
                    [os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])],
                    stats_path,
                    library_key,
                    graph_max_x)
            if not filtered:
                ionstats_basecaller_file_list.append(stats_path)

    if do_ionstats:
        alignment_json = os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json')
        tmp_basecaller_json = os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json')

        # Merge ionstats files from individual (barcoded) datasets
        if ionstats_alignment_file_list:
            ionstats.reduce_stats(ionstats_alignment_file_list, alignment_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            _create_empty_dummy_bam()
            ionstats.generate_ionstats_alignment(
                ionstatsArgs,
                ['empty_dummy.bam'],
                alignment_json,
                os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'),
                basecaller_meta_information,
                library_key,
                graph_max_x)

        if ionstats_basecaller_file_list:
            ionstats.reduce_stats(ionstats_basecaller_file_list, tmp_basecaller_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            _create_empty_dummy_bam()
            ionstats.generate_ionstats_basecaller(
                ['empty_dummy.bam'],
                tmp_basecaller_json,
                library_key,
                graph_max_x)

        ionstatslist = [path for path in (alignment_json, tmp_basecaller_json) if os.path.exists(path)]
        if ionstatslist:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller_with_aligninfos.json'))
            # Same inputs in the opposite order for the plain basecaller file.
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))
        # TODO: merging the per-dataset ionstats_error_summary.h5 files
        # (ionstats.reduce_stats_h5) is not enabled.

    printtime("**** Alignment completed ****")


def _merge_unmapped_block_bams(blocks, bamdir, dataset):
    """Merge the per-block unmapped BAM files of one dataset; log failures."""
    # Bound before the try block so the error message below can always be
    # formatted, even if the lookup of the file name itself fails.
    bamfile = None
    try:
        bamfile = dataset['basecaller_bam']
        block_bam_list = [os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks]
        block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
        composite_bam_filepath = os.path.join(bamdir, bamfile)
        if block_bam_list:
            composite_bai_filepath = ""
            mark_duplicates = False
            method = 'samtools'
            blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath, composite_bai_filepath, mark_duplicates, method)
    except Exception:
        traceback.print_exc()
        printtime("ERROR: merging %s unsuccessful" % bamfile)


def _create_empty_dummy_bam():
    """Write a header-only ``empty_dummy.bam`` into the current directory.

    Raises:
        RuntimeError: If the shell pipeline exits with a non-zero status.
    """
    # The \t and \n escapes are expanded by Python, so the shell receives
    # literal tabs and newlines inside the single-quoted header text.
    cmd = "echo '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
    printtime("DEBUG: Calling '%s':" % cmd)
    ret = subprocess.call(cmd, shell=True)
    if ret != 0:
        printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
        raise RuntimeError('exit code: %d' % ret)
def process_datasets(
        blocks,
        alignmentArgs,
        ionstatsArgs,
        BASECALLER_RESULTS,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        basecaller_datasets,
        ALIGNMENT_RESULTS,
        do_realign,
        do_ionstats,
        do_mark_duplicates,
        do_indexing,
        barcodeInfo):
    """Align every non-empty dataset one after another and merge the ionstats.

    A dataset that has a reference, spans more than one block and holds more
    than 20,000,000 reads is aligned block by block and the per-block BAM
    files are merged afterwards; every other dataset goes through a single
    ``align`` call over all blocks.  A failure while processing one dataset is
    printed with its traceback and does not stop the remaining datasets.
    When ``do_ionstats`` is true, the per-dataset ionstats JSON files of the
    datasets that are not flagged as filtered are reduced into run-level files
    below ``ALIGNMENT_RESULTS`` and ``BASECALLER_RESULTS``.

    Args:
        blocks: Block directories; passed to ``align``.
        alignmentArgs: Passed through to ``align``.
        ionstatsArgs: Passed to ``align`` and to the ionstats helpers.
        BASECALLER_RESULTS: Directory part of the unaligned BAM and basecaller
            ionstats paths.
        basecaller_meta_information: Passed to ``align`` and to ionstats.
        library_key: Passed to ``align`` and to ionstats.
        graph_max_x: Passed to ``align`` and to ionstats.
        basecaller_datasets: Mapping with a ``"datasets"`` list and a
            ``"read_groups"`` mapping keyed by read group name.
        ALIGNMENT_RESULTS: Directory part of the alignment output paths.
        do_realign: Passed through to ``align``.
        do_ionstats: Enables ionstats generation and the final merge.
        do_mark_duplicates: Enables duplicate marking.
        do_indexing: Enables BAM index creation.
        barcodeInfo: Accepted for interface compatibility; not used here.

    Raises:
        RuntimeError: If the placeholder ``empty_dummy.bam`` cannot be created.
    """
    printtime("Attempt to align")
    printtime("DEBUG: PROCESS DATASETS blocks: '%s'" % blocks)

    do_sorting = True

    # TODO: compare with pipeline/python/ion/utils/ionstats.py
    # Only datasets that are not flagged as filtered take part in the merge.
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []

    for dataset in basecaller_datasets["datasets"]:
        read_group = dataset['read_groups'][0]
        reference = basecaller_datasets['read_groups'][read_group]['reference']

        # A dataset counts as filtered only if all of its read groups are.
        filtered = True
        for rg_name in dataset["read_groups"]:
            if not basecaller_datasets["read_groups"][rg_name].get('filtered', False):
                filtered = False

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        try:
            # process block by block
            if reference and len(blocks) > 1 and int(dataset["read_count"]) > 20000000:
                _align_blockwise_and_merge(
                    dataset, blocks, reference, alignmentArgs, ionstatsArgs,
                    BASECALLER_RESULTS, basecaller_meta_information,
                    library_key, graph_max_x, ALIGNMENT_RESULTS, do_realign,
                    do_ionstats, do_sorting, do_mark_duplicates, do_indexing)
            else:
                printtime("DEBUG: MERGED BLOCK PROCESSING ----------- prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------" % (dataset['file_prefix'], reference, dataset["read_count"]))
                # TODO: try a python multiprocessing pool
                align(
                    blocks,
                    os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']),
                    alignmentArgs,
                    ionstatsArgs,
                    reference,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x,
                    do_realign,
                    do_ionstats,
                    do_sorting,
                    do_mark_duplicates,
                    do_indexing,
                    output_dir=ALIGNMENT_RESULTS if reference else BASECALLER_RESULTS,
                    output_basename=dataset['file_prefix'])
        except Exception:
            # Best effort per dataset; Ctrl-C and SystemExit still propagate.
            traceback.print_exc()

        if reference:
            stats_path = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json')
            if not filtered:
                ionstats_alignment_file_list.append(stats_path)
        else:
            stats_path = os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json')
            if not filtered:
                ionstats_basecaller_file_list.append(stats_path)

    if do_ionstats:
        alignment_json = os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json')
        tmp_basecaller_json = os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json')

        # Merge ionstats files from individual (barcoded) datasets
        if ionstats_alignment_file_list:
            ionstats.reduce_stats(ionstats_alignment_file_list, alignment_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            _create_empty_dummy_bam()
            ionstats.generate_ionstats_alignment(
                ionstatsArgs,
                ['empty_dummy.bam'],
                alignment_json,
                os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'),
                basecaller_meta_information,
                library_key,
                graph_max_x)

        if ionstats_basecaller_file_list:
            ionstats.reduce_stats(ionstats_basecaller_file_list, tmp_basecaller_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            _create_empty_dummy_bam()
            ionstats.generate_ionstats_basecaller(
                ['empty_dummy.bam'],
                tmp_basecaller_json,
                library_key,
                graph_max_x)

        ionstatslist = [path for path in (alignment_json, tmp_basecaller_json) if os.path.exists(path)]
        if ionstatslist:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller_with_aligninfos.json'))
            # Same inputs in the opposite order for the plain basecaller file.
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))
        # TODO: merging the per-dataset ionstats_error_summary.h5 files
        # (ionstats.reduce_stats_h5) is not enabled.

    printtime("**** Alignment completed ****")


def _align_blockwise_and_merge(
        dataset,
        blocks,
        reference,
        alignmentArgs,
        ionstatsArgs,
        BASECALLER_RESULTS,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        ALIGNMENT_RESULTS,
        do_realign,
        do_ionstats,
        do_sorting,
        do_mark_duplicates,
        do_indexing):
    """Align one dataset block by block and merge the per-block BAM files.

    Raises:
        RuntimeError: If the merge pipeline exits with a non-zero status.
    """
    printtime("DEBUG: TRADITIONAL BLOCK PROCESSING ------ prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------" % (dataset['file_prefix'], reference, dataset["read_count"]))

    # start alignment for each block and current barcode with reads
    # TODO: in how many blocks are reads with this barcode
    for block in blocks:
        printtime("DEBUG: ALIGN ONLY ONE BLOCK: %s" % block)
        # ionstats, duplicate marking and indexing are deferred to the
        # merged file, so they are switched off for the per-block runs.
        align(
            [block],
            os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']),
            alignmentArgs,
            ionstatsArgs,
            reference,
            basecaller_meta_information,
            library_key,
            graph_max_x,
            do_realign,
            do_ionstats=False,
            do_sorting=do_sorting,
            do_mark_duplicates=False,
            do_indexing=False,
            output_dir=os.path.join(block, ALIGNMENT_RESULTS),
            output_basename=dataset['file_prefix'])

    bamdir = '.'  # TODO , do we need this ?
    bamBase = dataset['file_prefix']
    composite_bam_filepath = bamBase + ".bam"

    # Only blocks that actually produced a BAM file take part in the merge.
    block_bam_list = [os.path.join(blockdir, bamdir, composite_bam_filepath) for blockdir in blocks]
    block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
    printtime("blocks with reads: %s" % len(block_bam_list))

    blockprocessing.extract_and_merge_bam_header(block_bam_list, composite_bam_filepath)

    # Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
    # Always stream the merged BAM to stdout ('-'): the shell redirect or
    # BamDuplicates below is what writes the composite file.  Naming the
    # composite file as the merge output as well would make two writers
    # target the same path.
    merge_parts = ['samtools merge -l1 -@8 -']
    merge_parts.extend(block_bam_list)
    merge_parts.append('-h %s.header.sam' % composite_bam_filepath)
    cmd = ' '.join(merge_parts)

    if do_ionstats:
        # ionstats reads a copy of the merged stream through tee.
        ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
            ionstatsArgs,
            ["/dev/stdin"],
            "%s.ionstats_alignment.json" % bamBase,
            "%s.ionstats_error_summary.h5" % bamBase,
            basecaller_meta_information,
            library_key,
            graph_max_x)
        cmd += " | tee >(%s)" % ionstats_cmd

    if do_mark_duplicates:
        json_name = 'BamDuplicates.%s.json' % bamBase if bamBase != 'rawlib' else 'BamDuplicates.json'
        cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, composite_bam_filepath, json_name)
    else:
        cmd += " > %s" % composite_bam_filepath

    printtime("DEBUG: Calling '%s':" % cmd)
    # Process substitution (<(...), >(...)) needs bash rather than /bin/sh.
    ret = subprocess.call(['/bin/bash', '-c', cmd])
    if ret != 0:
        printtime("ERROR: merging failed, return code: %d" % ret)
        raise RuntimeError('exit code: %d' % ret)

    # TODO: piping into samtools index or create index in sort process ?
    if do_indexing and do_sorting:
        cmd = "samtools index " + composite_bam_filepath
        printtime("DEBUG: Calling '%s':" % cmd)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            # Indexing stays best effort, but the failure is now visible.
            printtime("ERROR: indexing failed, return code: %d" % ret)


def _create_empty_dummy_bam():
    """Write a header-only ``empty_dummy.bam`` into the current directory.

    Raises:
        RuntimeError: If the shell pipeline exits with a non-zero status.
    """
    # The \t and \n escapes are expanded by Python, so the shell receives
    # literal tabs and newlines inside the single-quoted header text.
    cmd = "echo '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
    printtime("DEBUG: Calling '%s':" % cmd)
    ret = subprocess.call(cmd, shell=True)
    if ret != 0:
        printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
        raise RuntimeError('exit code: %d' % ret)