def mount_gatk_cram_input(input_param="input"):
    """Mount the task input collection and return the path of its single CRAM file.

    Args:
        input_param: name of the task parameter holding the input collection.

    Returns:
        Path to the one CRAM file in the mounted collection.

    Raises:
        errors.InvalidArgumentError: if the collection does not contain
            exactly one ``.cram`` file.
        errors.FileAccessError: if the CRAM file or its CRAI index is not
            readable.
    """
    print("Mounting task input collection")
    # BUG FIX: honour the input_param argument instead of the hard-coded
    # 'input' key, so callers can mount a differently-named parameter.
    input_dir = arvados.get_task_param_mount(input_param)
    input_cram_files = [os.path.join(input_dir, f)
                        for f in arvados.util.listdir_recursive(input_dir)
                        if re.search(r'\.cram$', f)]
    if len(input_cram_files) != 1:
        raise errors.InvalidArgumentError(
            "Expected exactly one cram file per task.")
    # There is only one CRAM file
    cram_file = input_cram_files[0]
    # Ensure we can read the CRAM file
    if not os.access(cram_file, os.R_OK):
        raise errors.FileAccessError("CRAM file not readable: %s" % cram_file)
    # Ensure we have a corresponding readable CRAI index: accept either
    # foo.crai or foo.cram.crai.
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert (cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not os.access(crai_file, os.R_OK):
        crai_file = cram_file_base + ".cram.crai"
        if not os.access(crai_file, os.R_OK):
            raise errors.FileAccessError(
                "No readable CRAM index file for CRAM file: %s" % cram_file)
    return cram_file
def mount_gatk_gvcf_inputs(inputs_param="inputs"):
    """Mount the task input collection and return the paths of its gVCF files.

    Files ending in .tbi or .interval_list are expected companions and are
    ignored; any other file triggers a warning. Every gVCF must be readable
    and have a readable .tbi index (either foo.vcf.gz.tbi or foo.vcf.tbi).
    """
    print("Mounting task input collection")
    inputs_dir = arvados.get_task_param_mount(inputs_param)
    # Collect the gVCFs, warning about anything unexpected.
    gvcf_paths = []
    for rel_path in arvados.util.listdir_recursive(inputs_dir):
        if re.search(r'\.vcf\.gz$', rel_path):
            gvcf_paths.append(os.path.join(inputs_dir, rel_path))
        elif re.search(r'\.tbi$', rel_path) or re.search(r'\.interval_list$', rel_path):
            # Expected companion files; nothing to do.
            continue
        else:
            print("WARNING: collection contains unexpected file %s" % rel_path)
    if not gvcf_paths:
        raise errors.InvalidArgumentError("Expected one or more .vcf.gz files in collection (found 0 while recursively searching %s)" % inputs_dir)
    # Verify each gVCF and its index are readable.
    for gvcf_path in gvcf_paths:
        if not os.access(gvcf_path, os.R_OK):
            raise errors.FileAccessError("gVCF file not readable: %s" % gvcf_path)
        base, ext = os.path.splitext(gvcf_path)
        assert(ext == ".gz")
        index_path = base + ".gz.tbi"
        if not os.access(index_path, os.R_OK):
            index_path = base + ".tbi"
            if not os.access(index_path, os.R_OK):
                raise errors.FileAccessError("No readable gVCF index file for gVCF file: %s" % gvcf_path)
    return gvcf_paths
def mount_gatk_cram_input(input_param="input"):
    """Mount the task input collection and return the path of its single CRAM file.

    Args:
        input_param: name of the task parameter holding the input collection.

    Returns:
        Path to the one CRAM file in the mounted collection.

    Raises:
        errors.InvalidArgumentError: if the collection does not contain
            exactly one ``.cram`` file.
        errors.FileAccessError: if the CRAM file or its CRAI index is not
            readable.
    """
    print("Mounting task input collection")
    # BUG FIX: use input_param rather than the hard-coded 'input' key so the
    # parameter actually has an effect.
    input_dir = arvados.get_task_param_mount(input_param)
    input_cram_files = [os.path.join(input_dir, f)
                        for f in arvados.util.listdir_recursive(input_dir)
                        if re.search(r'\.cram$', f)]
    if len(input_cram_files) != 1:
        raise errors.InvalidArgumentError("Expected exactly one cram file per task.")
    # There is only one CRAM file
    cram_file = input_cram_files[0]
    # Ensure we can read the CRAM file
    if not os.access(cram_file, os.R_OK):
        raise errors.FileAccessError("CRAM file not readable: %s" % cram_file)
    # Ensure we have a corresponding readable CRAI index: accept either
    # foo.crai or foo.cram.crai.
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert(cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not os.access(crai_file, os.R_OK):
        crai_file = cram_file_base + ".cram.crai"
        if not os.access(crai_file, os.R_OK):
            raise errors.FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file)
    return cram_file
def mount_gatk_reference(ref_param="ref"):
    """Mount the reference collection and return the path of its FASTA file.

    Args:
        ref_param: name of the task parameter holding the reference collection.

    Returns:
        Path to the last ``.fa`` file found while recursively searching the
        mounted collection.

    Raises:
        errors.InvalidArgumentError: if the collection contains no ``.fa`` file.
        errors.FileAccessError: if the FASTA file is not readable.
    """
    print("Mounting reference FASTA collection")
    ref_dir = arvados.get_task_param_mount(ref_param)
    # BUG FIX: initialise ref_file so that a collection with no .fa file
    # raises the intended InvalidArgumentError instead of a NameError.
    ref_file = None
    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise errors.InvalidArgumentError("No reference fasta found in reference collection.")
    # Ensure we can read the reference file
    if not os.access(ref_file, os.R_OK):
        raise errors.FileAccessError("reference FASTA file not readable: %s" % ref_file)
    # TODO: could check readability of .fai and .dict as well?
    return ref_file
def mount_gatk_reference(ref_param="ref"):
    """Mount the reference collection and return the path of its FASTA file.

    Args:
        ref_param: name of the task parameter holding the reference collection.

    Returns:
        Path to the last ``.fa`` file found while recursively searching the
        mounted collection.

    Raises:
        errors.InvalidArgumentError: if the collection contains no ``.fa`` file.
        errors.FileAccessError: if the FASTA file is not readable.
    """
    print("Mounting reference FASTA collection")
    ref_dir = arvados.get_task_param_mount(ref_param)
    # BUG FIX: initialise ref_file so that an empty search result raises the
    # intended InvalidArgumentError rather than a NameError on the check below.
    ref_file = None
    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise errors.InvalidArgumentError(
            "No reference fasta found in reference collection.")
    # Ensure we can read the reference file
    if not os.access(ref_file, os.R_OK):
        raise errors.FileAccessError("reference FASTA file not readable: %s" % ref_file)
    # TODO: could check readability of .fai and .dict as well?
    return ref_file
def mount_gatk_gvcf_inputs(inputs_param="inputs"):
    """Mount the gVCF input collection and return the paths of its gVCF files.

    The collection is mounted from the current task's parameters when
    inputs_param is present there, otherwise from the job's parameters.
    Files ending in .tbi or .interval_list are expected companions and are
    ignored; any other file triggers a warning. Every gVCF must be readable
    and have a readable .tbi index.
    """
    print("Mounting task input collection")
    # Prefer a task-level parameter; fall back to the job-level one.
    task_params = arvados.current_task()['parameters']
    if inputs_param in task_params:
        inputs_dir = arvados.get_task_param_mount(inputs_param)
    else:
        inputs_dir = arvados.get_job_param_mount(inputs_param)
    # Collect the gVCFs, warning about anything unexpected.
    gvcf_paths = []
    for rel_path in arvados.util.listdir_recursive(inputs_dir):
        if re.search(r'\.vcf\.gz$', rel_path):
            gvcf_paths.append(os.path.join(inputs_dir, rel_path))
        elif re.search(r'\.tbi$', rel_path) or re.search(r'\.interval_list$', rel_path):
            # Expected companion files; nothing to do.
            continue
        else:
            print("WARNING: collection contains unexpected file %s" % rel_path)
    if not gvcf_paths:
        raise errors.InvalidArgumentError(
            "Expected one or more .vcf.gz files in collection (found 0 while recursively searching %s)" % inputs_dir)
    # Verify each gVCF and its index are readable (accept foo.vcf.gz.tbi or
    # foo.vcf.tbi).
    for gvcf_path in gvcf_paths:
        if not os.access(gvcf_path, os.R_OK):
            raise errors.FileAccessError("gVCF file not readable: %s" % gvcf_path)
        base, ext = os.path.splitext(gvcf_path)
        assert (ext == ".gz")
        index_path = base + ".gz.tbi"
        if not os.access(index_path, os.R_OK):
            index_path = base + ".tbi"
            if not os.access(index_path, os.R_OK):
                raise errors.FileAccessError(
                    "No readable gVCF index file for gVCF file: %s" % gvcf_path)
    return gvcf_paths
def mount_single_gatk_interval_list_input(interval_list_param="interval_list"):
    """Mount the interval_list collection and return its single interval_list path.

    Raises errors.InvalidArgumentError unless exactly one .interval_list file
    is found, and errors.FileAccessError if that file is not readable.
    """
    print("Mounting task input collection to get interval_list")
    interval_list_dir = arvados.get_task_param_mount(interval_list_param)
    print("Interval_List collection mounted at %s" % (interval_list_dir))
    # There can be only one interval_list in the collection.
    found = [os.path.join(interval_list_dir, name)
             for name in arvados.util.listdir_recursive(interval_list_dir)
             if re.search(r'\.interval_list$', name)]
    if len(found) != 1:
        raise errors.InvalidArgumentError("Expected exactly one interval_list in input collection (found %s)" % len(found))
    assert(len(found) == 1)
    interval_list_file = found[0]
    if not os.access(interval_list_file, os.R_OK):
        raise errors.FileAccessError("interval_list file not readable: %s" % interval_list_file)
    return interval_list_file
def mount_single_gatk_interval_list_input(interval_list_param="interval_list"):
    """Mount the interval_list collection and return its single interval_list path.

    Raises errors.InvalidArgumentError unless exactly one .interval_list file
    is found, and errors.FileAccessError if that file is not readable.
    """
    print("Mounting task input collection to get interval_list")
    interval_list_dir = arvados.get_task_param_mount(interval_list_param)
    print("Interval_List collection mounted at %s" % (interval_list_dir))
    # Gather every .interval_list in the collection; exactly one is allowed.
    matches = []
    for name in arvados.util.listdir_recursive(interval_list_dir):
        if re.search(r'\.interval_list$', name):
            matches.append(os.path.join(interval_list_dir, name))
    if len(matches) != 1:
        raise errors.InvalidArgumentError(
            "Expected exactly one interval_list in input collection (found %s)"
            % len(matches))
    assert (len(matches) == 1)
    interval_list_file = matches[0]
    if not os.access(interval_list_file, os.R_OK):
        raise errors.FileAccessError("interval_list file not readable: %s" % interval_list_file)
    return interval_list_file
def main():
    """Crunch-script entry point: run a per-region 'bcftools mpileup | norm'
    gVCF-calling pipeline over one CRAM file and one genome chunk, then
    assemble, reheader, index and publish the final .g.vcf.gz to Keep.

    Task 0 only fans out sub-tasks (one per CRAM file x genome chunk) and
    exits inside one_task_per_cram_file(); tasks with sequence != 0 do the
    actual work below.

    NOTE(review): relies on module-level helpers/state not visible in this
    chunk — run_child_cmd, close_process_if_finished,
    watch_fds_and_print_output, test_and_prime_input_file, child_pids,
    fifos_to_delete, sigint/sigterm handlers — confirm their contracts there.
    """
    # Install signal handlers so child processes get cleaned up on interrupt.
    signal(SIGINT, sigint_handler)
    signal(SIGTERM, sigterm_handler)

    # Read job-level parameters.
    this_job = arvados.current_job()
    skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex']
    genome_chunks = int(this_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True,
                           skip_sq_sn_regex=skip_sq_sn_regex,
                           genome_chunks=genome_chunks)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    # Get reference FASTA (last .fa found wins if several are present).
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')
    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError(
            "No reference fasta found in reference collection.")
    # Ensure we can read the reference fasta
    test_and_prime_input_file(
        ref_file,
        error_exception=FileAccessError("reference fasta not readable: %s" %
                                        ref_file))
    # Ensure we have corresponding .fai, and that it is also readable
    ref_fai_file = ref_file + ".fai"
    test_and_prime_input_file(
        ref_fai_file,
        error_exception=FileAccessError(
            "reference fai index not readable: %s" % ref_fai_file))

    # Get genome chunk intervals file (one region per line: chrom start end).
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')
    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError(
            "No chunk intervals file found in chunk collection.")
    # Ensure we can read the chunk file
    test_and_prime_input_file(
        chunk_file,
        error_exception=FileAccessError(
            "Chunk intervals file not readable: %s" % chunk_file))

    # Get single CRAM file for this task.
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')
    input_cram_files = []
    # stream_name is reused at the very end as the Keep output stream name.
    stream_name = ""
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    test_and_prime_input_file(cram_file,
                              error_exception=FileAccessError(
                                  "CRAM file not readable: %s" % cram_file))

    # Ensure we have corresponding CRAI index and can read it as well
    # (accept either foo.crai or foo.cram.crai).
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert (cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not test_and_prime_input_file(crai_file, error_exception=None):
        crai_file = cram_file_base + ".cram.crai"
        if not test_and_prime_input_file(crai_file, error_exception=None):
            raise FileAccessError(
                "No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    tmp_dir = arvados.current_task().tmpdir
    out_dir = os.path.join(tmp_dir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir,
                                                            old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    # Output names are derived from <cram basename>.<chunk basename>.
    output_basename = os.path.basename(
        cram_file_base) + "." + os.path.basename(chunk_file)
    out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp")
    penultimate_out_file = os.path.join(
        tmp_dir, output_basename + ".provheader.g.vcf.gz")
    final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz")

    # bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)

    # Parse the chunk file into "chrom:start-end" region strings.
    regions = []
    print "Preparing region list from chunk file [%s]" % chunk_file
    with open(chunk_file, 'r') as f:
        for line in f.readlines():
            (chr, start, end) = line.rstrip().split()
            region = "%s:%s-%s" % (chr, start, end)
            regions.append(region)
    total_region_count = len(regions)

    # One named FIFO per region carries that region's headerless VCF into a
    # single "cat" that concatenates all regions (in order) into out_file_tmp;
    # one gzipped temp file per region captures that region's header only.
    print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count
    concat_noheader_fifos = dict()
    concat_headeronly_tmps = dict()
    current_region_num = 0
    for region in regions:
        current_region_num += 1
        concat_noheader_fifo = os.path.join(
            tmp_dir,
            output_basename + (".part_%s_of_%s.g.vcf" %
                               (current_region_num, total_region_count)))
        try:
            os.mkfifo(concat_noheader_fifo, 0600)
        except:
            print "ERROR: could not mkfifo %s" % concat_noheader_fifo
            raise
        fifos_to_delete.append(concat_noheader_fifo)
        concat_noheader_fifos[region] = concat_noheader_fifo
        concat_headeronly_tmp = os.path.join(
            tmp_dir,
            output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" %
                               (current_region_num, total_region_count)))
        concat_headeronly_tmps[region] = concat_headeronly_tmp

    region_concat_cmd = ["cat"]
    region_concat_cmd.extend(
        [concat_noheader_fifos[region] for region in regions])

    # open file for output file
    out_file_tmp_f = open(out_file_tmp, 'wb')

    region_concat_p = run_child_cmd(region_concat_cmd,
                                    stdout=out_file_tmp_f,
                                    tag="bcftools concat (stderr)")

    # Process one region at a time: each region runs the chain
    #   bcftools mpileup | bcftools norm | teepot -+-> bcftools view -h (header tmp)
    #                                            \-> bcftools view -H (noheader fifo)
    # and the next region only starts once all five processes have exited.
    current_region_num = 0
    current_concat_noheader_fifo_f = None
    regions_to_process = list(regions)
    bcftools_mpileup_p = None
    bcftools_norm_p = None
    part_tee_p = None
    bcftools_view_headeronly_p = None
    bcftools_view_noheader_p = None
    while True:
        # at least one of the regional aggregation processes is still running
        watch_fds_and_print_output()
        if ((bcftools_mpileup_p is None) and (bcftools_norm_p is None)
                and (part_tee_p is None)
                and (bcftools_view_headeronly_p is None)
                and (bcftools_view_noheader_p is None)):
            # no per-region processes are running (they have finished or
            # have not yet started)
            if len(regions_to_process) > 0:
                # have more regions to run
                region = regions_to_process.pop(0)
                current_region_num += 1
                region_label = "%s/%s [%s]" % (current_region_num,
                                               total_region_count, region)
                concat_noheader_fifo = concat_noheader_fifos[region]
                bcftools_view_noheader_input_fifo = os.path.join(
                    tmp_dir,
                    output_basename +
                    (".part_%s_of_%s.noheader.g.bcf" %
                     (current_region_num, total_region_count)))
                part_tee_cmd = [
                    "teepot", bcftools_view_noheader_input_fifo, "-"
                ]
                bcftools_view_noheader_cmd = [
                    "bcftools", "view", "-H", "-Ov",
                    bcftools_view_noheader_input_fifo
                ]
                concat_headeronly_tmp = concat_headeronly_tmps[region]
                bcftools_view_headeronly_cmd = [
                    "bcftools", "view", "-h", "-Oz", "-o",
                    concat_headeronly_tmp
                ]
                bcftools_norm_cmd = ["bcftools", "norm", "-f", ref_file,
                                     "-Ou"]
                bcftools_mpileup_cmd = [
                    "bcftools-gvcf", "mpileup", "-t", "AD,INFO/AD", "-C50",
                    "-pm2", "-F0.1", "-d10000", "--gvcf",
                    "1,2,3,4,5,10,15", "-f", ref_file, "-Ou", "-r", region,
                    cram_file
                ]

                print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % (
                    region_label)
                bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe(
                )

                print "Creating 'bcftools norm | tee' pipe for region %s" % (
                    region_label)
                part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe()

                print "Creating 'tee | bcftools view -h' pipe for region %s" % (
                    region_label)
                bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe(
                )

                print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % (
                    bcftools_view_noheader_input_fifo, region_label)
                try:
                    os.mkfifo(bcftools_view_noheader_input_fifo, 0600)
                except:
                    print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo
                    raise
                fifos_to_delete.append(bcftools_view_noheader_input_fifo)

                print "Opening concat fifo %s for writing" % concat_noheader_fifo
                if current_concat_noheader_fifo_f is not None:
                    #print "ERROR: current_concat_noheader_fifo_f was not closed properly"
                    #raise Exception("current_concat_noheader_fifo_f was not closed properly")
                    current_concat_noheader_fifo_f.close()
                current_concat_noheader_fifo_f = open(concat_noheader_fifo,
                                                      'wb')

                # Launch the five per-region processes, wired through the
                # pipes created above.
                bcftools_mpileup_p = run_child_cmd(
                    bcftools_mpileup_cmd,
                    stdout=bcftools_norm_stdin_pipe_write,
                    tag="bcftools mpileup %s" % (region_label))
                bcftools_norm_p = run_child_cmd(
                    bcftools_norm_cmd,
                    stdin=bcftools_norm_stdin_pipe_read,
                    stdout=part_tee_stdin_pipe_write,
                    tag="bcftools norm %s" % (region_label))
                part_tee_p = run_child_cmd(
                    part_tee_cmd,
                    stdin=part_tee_stdin_pipe_read,
                    stdout=bcftools_view_headeronly_stdin_pipe_write,
                    tag="tee %s" % (region_label))
                bcftools_view_headeronly_p = run_child_cmd(
                    bcftools_view_headeronly_cmd,
                    stdin=bcftools_view_headeronly_stdin_pipe_read,
                    tag="bcftools view -h %s" % (region_label))
                bcftools_view_noheader_p = run_child_cmd(
                    bcftools_view_noheader_cmd,
                    stdout=current_concat_noheader_fifo_f,
                    tag="bcftools view %s" % (region_label))

        # Poll each process; close_process_if_finished returns None (and
        # closes the listed fds/files) once its process has exited.
        bcftools_mpileup_p = close_process_if_finished(
            bcftools_mpileup_p,
            "bcftools mpileup %s" % (region_label),
            close_fds=[bcftools_norm_stdin_pipe_write])

        bcftools_norm_p = close_process_if_finished(
            bcftools_norm_p,
            "bcftools norm %s" % (region_label),
            close_fds=[
                bcftools_norm_stdin_pipe_read, part_tee_stdin_pipe_write
            ])

        part_tee_p = close_process_if_finished(
            part_tee_p,
            "tee %s" % (region_label),
            close_fds=[
                part_tee_stdin_pipe_read,
                bcftools_view_headeronly_stdin_pipe_write
            ],
            ignore_error=True)

        bcftools_view_headeronly_p = close_process_if_finished(
            bcftools_view_headeronly_p,
            "bcftools view -h %s" % (region_label),
            close_fds=[bcftools_view_headeronly_stdin_pipe_read])

        bcftools_view_noheader_p = close_process_if_finished(
            bcftools_view_noheader_p,
            "bcftools view %s" % (region_label),
            close_files=[current_concat_noheader_fifo_f])

        region_concat_p = close_process_if_finished(
            region_concat_p, "bcftools concat", close_files=[out_file_tmp_f])

        # end loop once all processes have finished
        if ((region_concat_p is None) and (bcftools_view_noheader_p is None)
                and (bcftools_view_headeronly_p is None)
                and (part_tee_p is None) and (bcftools_norm_p is None)
                and (bcftools_mpileup_p is None)):
            print "All region work has completed"
            break
        else:
            sleep(0.01)  # continue to next loop iteration

    # Force-kill any stragglers and remove the named pipes.
    if len(child_pids) > 0:
        print "WARNING: some children are still alive: [%s]" % (child_pids)
        for pid in child_pids:
            print "Attempting to terminate %s forcefully" % (pid)
            try:
                os.kill(pid, SIGTERM)
            except Exception as e:
                print "Could not kill pid %s: %s" % (pid, e)

    for fifo in fifos_to_delete:
        try:
            os.remove(fifo)
        except:
            raise

    # Build a file-of-filenames of the per-region header tmps that actually
    # exist (regions with no data produce no file) for bcftools concat.
    concat_headeronly_tmp_fofn = os.path.join(tmp_dir,
                                              output_basename + ".fifos_fofn")
    tmp_files_to_delete = []
    print "Preparing fofn for bcftools concat (headeronly): %s" % (
        concat_headeronly_tmp_fofn)
    with open(concat_headeronly_tmp_fofn, 'w') as f:
        print "Checking files for regions: %s" % regions
        for concat_headeronly_tmp in [
                concat_headeronly_tmps[region] for region in regions
        ]:
            if os.path.exists(concat_headeronly_tmp):
                print "Adding %s to fofn" % concat_headeronly_tmp
                f.write("%s\n" % concat_headeronly_tmp)
                tmp_files_to_delete.append(concat_headeronly_tmp)
            else:
                print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp

    # Merge the per-region headers into one, stripping provenance lines
    # (##bcftools/##mpileup/##reference) that differ per region.
    final_headeronly_tmp = os.path.join(tmp_dir,
                                        output_basename + ".headeronly.g.vcf")
    final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb')

    print "Creating 'bcftools concat | grep' pipe"
    grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe(
    )

    grep_headeronly_cmd = [
        "egrep", "-v", "^[#][#](bcftools|mpileup|reference)"
    ]
    grep_headeronly_p = run_child_cmd(grep_headeronly_cmd,
                                      stdin=grep_headeronly_stdin_pipe_read,
                                      stdout=final_headeronly_tmp_f,
                                      tag="grep (headeronly)")
    bcftools_concat_headeronly_cmd = [
        "bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn
    ]
    bcftools_concat_headeronly_p = run_child_cmd(
        bcftools_concat_headeronly_cmd,
        stdout=grep_headeronly_stdin_pipe_write,
        tag="bcftools concat (headeronly)")
    while True:
        watch_fds_and_print_output()
        bcftools_concat_headeronly_p = close_process_if_finished(
            bcftools_concat_headeronly_p,
            "bcftools concat (headeronly)",
            close_fds=[grep_headeronly_stdin_pipe_write])
        grep_headeronly_p = close_process_if_finished(
            grep_headeronly_p,
            "grep (headeronly)",
            close_fds=[grep_headeronly_stdin_pipe_read],
            close_files=[final_headeronly_tmp_f])
        if ((bcftools_concat_headeronly_p is None)
                and (grep_headeronly_p is None)):
            # none of the processes are still running, we're done!
            break
        else:
            sleep(0.01)  # continue to next loop iteration

    if bcftools_concat_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate bcftools concat (headeronly)"

    if grep_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate grep (headeronly)"

    # check if there was any data output
    if os.stat(out_file_tmp)[6] == 0:
        # 0-byte data file, there is no point in concatenating and
        # reheader will reject the file, so we need to bgzip it ourselves
        print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % (
            final_out_file)
        final_out_file_f = open(final_out_file, 'wb')
        final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp]
        final_bgzip_p = run_child_cmd(final_bgzip_cmd,
                                      tag="final bgzip",
                                      stdout=final_out_file_f)
        while True:
            watch_fds_and_print_output()
            final_bgzip_p = close_process_if_finished(
                final_bgzip_p, "final bgzip", close_files=[final_out_file_f])
            if (final_bgzip_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)  # continue to next loop iteration
        if final_bgzip_p is not None:
            print "ERROR: failed to cleanly terminate final bgzip (header with no data)"
    else:
        # there is some data in the data file: header+data are cat'd
        # together, bgzipped by bcftools view -Oz, then reheadered with the
        # cleaned merged header.
        print "Creating final 'cat | bcftools view -Oz' pipe"
        final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe(
        )
        print "Preparing penultimate output file [%s]" % (penultimate_out_file)
        final_bcftools_view_cmd = [
            "bcftools", "view", "-Oz", "-o", penultimate_out_file
        ]
        final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp]
        final_bcftools_view_p = run_child_cmd(
            final_bcftools_view_cmd,
            tag="final bcftools view -Oz",
            stdin=final_bcftools_view_stdin_pipe_read)
        final_concat_p = run_child_cmd(
            final_concat_cmd,
            tag="final cat (header+data)",
            stdout=final_bcftools_view_stdin_pipe_write)
        while True:
            watch_fds_and_print_output()
            final_bcftools_view_p = close_process_if_finished(
                final_bcftools_view_p,
                "final bcftools view -Oz",
                close_fds=[final_bcftools_view_stdin_pipe_read])
            final_concat_p = close_process_if_finished(
                final_concat_p,
                "final cat (header+data)",
                close_fds=[final_bcftools_view_stdin_pipe_write])
            if ((final_concat_p is None)
                    and (final_bcftools_view_p is None)):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)  # continue to next loop iteration

        if final_bcftools_view_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools view -Oz"

        if final_concat_p is not None:
            print "ERROR: failed to cleanly terminate final cat (header+data)"

        print "Reheadering penultimate output file into final out file [%s]" % (
            final_out_file)
        final_bcftools_reheader_cmd = [
            "bcftools", "reheader", "-h", final_headeronly_tmp, "-o",
            final_out_file, penultimate_out_file
        ]
        final_bcftools_reheader_p = run_child_cmd(
            final_bcftools_reheader_cmd, tag="final bcftools reheader")
        while True:
            watch_fds_and_print_output()
            final_bcftools_reheader_p = close_process_if_finished(
                final_bcftools_reheader_p, "final bcftools reheader")
            if (final_bcftools_reheader_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)  # continue to next loop iteration
        if final_bcftools_reheader_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools reheader"
        os.remove(penultimate_out_file)

    # Index the final bgzipped gVCF so it ships with its .csi/.tbi.
    print "Indexing final output file [%s]" % (final_out_file)
    bcftools_index_cmd = ["bcftools", "index", final_out_file]
    bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index")
    while True:
        watch_fds_and_print_output()
        bcftools_index_p = close_process_if_finished(bcftools_index_p,
                                                     "bcftools index")
        if (bcftools_index_p is None):
            break
        else:
            sleep(0.01)  # continue to next loop iteration

    if bcftools_index_p is not None:
        print "ERROR: failed to cleanly terminate bcftools index"

    print "Complete, removing temporary files"
    os.remove(concat_headeronly_tmp_fofn)
    os.remove(out_file_tmp)
    os.remove(final_headeronly_tmp)
    for tmp_file in tmp_files_to_delete:
        os.remove(tmp_file)

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()
    print "Task output locator [%s]" % output_locator

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)

    # Done!
    print "Task complete!"
def main():
    """Crunch script entry point: per-region gVCF calling for one CRAM.

    Task 0 fans out one subtask per input CRAM and exits. Each worker task
    then, for every genome region in the chunk file, runs the process tree

        bcftools-gvcf mpileup | bcftools norm | teepot --+-> bcftools view -h (header -> per-region .gz tmp)
                                                         +-> bcftools view -H (records -> per-region fifo)

    concatenates the per-region record fifos into one data file, merges the
    per-region headers into one cleaned header, combines header+data into a
    bgzipped gVCF, indexes it, and stores the result in Keep as this task's
    output.

    NOTE(review): relies on module-level helpers/globals not visible in this
    part of the file (sigint_handler, sigterm_handler, run_child_cmd,
    watch_fds_and_print_output, close_process_if_finished, fifos_to_delete,
    child_pids, test_and_prime_input_file, one_task_per_cram_file); their
    behavior is inferred from their names -- confirm against their
    definitions.
    """
    # Install the module's signal handlers so children get cleaned up on
    # SIGINT/SIGTERM.
    signal(SIGINT, sigint_handler)
    signal(SIGTERM, sigterm_handler)

    this_job = arvados.current_job()

    # Job-level parameters: regex of @SQ SN names to skip, and how many
    # chunks the genome was split into.
    skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex']
    genome_chunks = int(this_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True,
                           skip_sq_sn_regex=skip_sq_sn_regex,
                           genome_chunks=genome_chunks)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert(this_task['sequence'] != 0)

    # Get reference FASTA (last *.fa found in the 'ref' mount wins)
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')
    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError("No reference fasta found in reference collection.")

    # Ensure we can read the reference fasta
    test_and_prime_input_file(ref_file, error_exception=FileAccessError("reference fasta not readable: %s" % ref_file))

    # Ensure we have corresponding .fai, and that it is also readable
    ref_fai_file = ref_file + ".fai"
    test_and_prime_input_file(ref_fai_file, error_exception=FileAccessError("reference fai index not readable: %s" % ref_fai_file))

    # Get genome chunk intervals file (*.region_list in the 'chunk' mount)
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')
    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError("No chunk intervals file found in chunk collection.")

    # Ensure we can read the chunk file
    test_and_prime_input_file(chunk_file, error_exception=FileAccessError("Chunk intervals file not readable: %s" % chunk_file))

    # Get single CRAM file for this task
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')
    input_cram_files = []
    stream_name = ""
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    test_and_prime_input_file(cram_file, error_exception=FileAccessError("CRAM file not readable: %s" % cram_file))

    # Ensure we have corresponding CRAI index and can read it as well
    # (accept either foo.crai or foo.cram.crai)
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert(cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not test_and_prime_input_file(crai_file, error_exception=None):
        crai_file = cram_file_base + ".cram.crai"
        if not test_and_prime_input_file(crai_file, error_exception=None):
            raise FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    tmp_dir = arvados.current_task().tmpdir
    out_dir = os.path.join(tmp_dir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise  # NOTE(review): try/except that only re-raises is a no-op wrapper
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise

    # Output names are derived from "<cram basename>.<chunk basename>"
    output_basename = os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file)
    out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp")
    penultimate_out_file = os.path.join(tmp_dir, output_basename + ".provheader.g.vcf.gz")
    final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz")

    # bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)

    # Parse regions ("chr:start-end") from the whitespace-delimited chunk file
    regions = []
    print "Preparing region list from chunk file [%s]" % chunk_file
    with open(chunk_file, 'r') as f:
        for line in f.readlines():
            (chr, start, end) = line.rstrip().split()  # NOTE(review): 'chr' shadows the builtin
            region = "%s:%s-%s" % (chr, start, end)
            regions.append(region)
    total_region_count = len(regions)

    print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count

    # One record-only fifo and one header-only tmp file per region
    concat_noheader_fifos = dict()
    concat_headeronly_tmps = dict()
    current_region_num = 0
    for region in regions:
        current_region_num += 1
        concat_noheader_fifo = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.g.vcf" % (current_region_num, total_region_count)))
        try:
            os.mkfifo(concat_noheader_fifo, 0600)
        except:
            print "ERROR: could not mkfifo %s" % concat_noheader_fifo
            raise
        fifos_to_delete.append(concat_noheader_fifo)
        concat_noheader_fifos[region] = concat_noheader_fifo
        concat_headeronly_tmp = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" % (current_region_num, total_region_count)))
        concat_headeronly_tmps[region] = concat_headeronly_tmp

    # 'cat' all region fifos, in region order, into the single data-only tmp file
    region_concat_cmd = ["cat"]
    region_concat_cmd.extend([concat_noheader_fifos[region] for region in regions])

    # open file for output file
    out_file_tmp_f = open(out_file_tmp, 'wb')

    # NOTE(review): tag says 'bcftools concat' but the command is plain 'cat'
    region_concat_p = run_child_cmd(region_concat_cmd,
                                    stdout=out_file_tmp_f,
                                    tag="bcftools concat (stderr)")

    # Drive one region at a time: spawn the five-process tree for the next
    # region whenever the previous region's tree has fully exited.
    current_region_num = 0
    current_concat_noheader_fifo_f = None
    regions_to_process = list(regions)
    bcftools_mpileup_p = None
    bcftools_norm_p = None
    part_tee_p = None
    bcftools_view_headeronly_p = None
    bcftools_view_noheader_p = None
    while True:
        # at least one of the regional aggregation processes is still running
        watch_fds_and_print_output()
        if ((bcftools_mpileup_p is None)
                and (bcftools_norm_p is None)
                and (part_tee_p is None)
                and (bcftools_view_headeronly_p is None)
                and (bcftools_view_noheader_p is None)):
            # no per-region processes are running (they have finished or
            # have not yet started)
            if len(regions_to_process) > 0:
                # have more regions to run
                region = regions_to_process.pop(0)
                current_region_num += 1
                region_label = "%s/%s [%s]" % (current_region_num, total_region_count, region)
                concat_noheader_fifo = concat_noheader_fifos[region]
                bcftools_view_noheader_input_fifo = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.noheader.g.bcf" % (current_region_num, total_region_count)))
                # teepot duplicates norm's BCF stream: one copy into the
                # named fifo (read back record-only below) and one to stdout
                # (header-only view).
                part_tee_cmd = ["teepot", bcftools_view_noheader_input_fifo, "-"]
                bcftools_view_noheader_cmd = ["bcftools", "view", "-H", "-Ov", bcftools_view_noheader_input_fifo]
                concat_headeronly_tmp = concat_headeronly_tmps[region]
                bcftools_view_headeronly_cmd = ["bcftools", "view", "-h", "-Oz", "-o", concat_headeronly_tmp]
                bcftools_norm_cmd = ["bcftools", "norm", "-f", ref_file, "-Ou"]
                bcftools_mpileup_cmd = ["bcftools-gvcf", "mpileup", "-t", "AD,INFO/AD", "-C50", "-pm2", "-F0.1", "-d10000", "--gvcf", "1,2,3,4,5,10,15", "-f", ref_file, "-Ou", "-r", region, cram_file]

                print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % (region_label)
                bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe()
                print "Creating 'bcftools norm | tee' pipe for region %s" % (region_label)
                part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe()
                print "Creating 'tee | bcftools view -h' pipe for region %s" % (region_label)
                bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe()
                print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % (bcftools_view_noheader_input_fifo, region_label)
                try:
                    os.mkfifo(bcftools_view_noheader_input_fifo, 0600)
                except:
                    print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo
                    raise
                fifos_to_delete.append(bcftools_view_noheader_input_fifo)

                print "Opening concat fifo %s for writing" % concat_noheader_fifo
                if current_concat_noheader_fifo_f is not None:
                    #print "ERROR: current_concat_noheader_fifo_f was not closed properly"
                    #raise Exception("current_concat_noheader_fifo_f was not closed properly")
                    current_concat_noheader_fifo_f.close()
                current_concat_noheader_fifo_f = open(concat_noheader_fifo, 'wb')

                bcftools_mpileup_p = run_child_cmd(bcftools_mpileup_cmd,
                                                   stdout=bcftools_norm_stdin_pipe_write,
                                                   tag="bcftools mpileup %s" % (region_label))
                bcftools_norm_p = run_child_cmd(bcftools_norm_cmd,
                                                stdin=bcftools_norm_stdin_pipe_read,
                                                stdout=part_tee_stdin_pipe_write,
                                                tag="bcftools norm %s" % (region_label))
                part_tee_p = run_child_cmd(part_tee_cmd,
                                           stdin=part_tee_stdin_pipe_read,
                                           stdout=bcftools_view_headeronly_stdin_pipe_write,
                                           tag="tee %s" % (region_label))
                bcftools_view_headeronly_p = run_child_cmd(bcftools_view_headeronly_cmd,
                                                           stdin=bcftools_view_headeronly_stdin_pipe_read,
                                                           tag="bcftools view -h %s" % (region_label))
                bcftools_view_noheader_p = run_child_cmd(bcftools_view_noheader_cmd,
                                                         stdout=current_concat_noheader_fifo_f,
                                                         tag="bcftools view %s" % (region_label))

        # Reap any finished children, closing the pipe fds/files they used.
        # NOTE(review): if 'regions' is empty, region_label (and the pipe fd
        # variables) are unbound here -- NameError; presumably chunk files
        # are never empty in practice, but worth confirming.
        bcftools_mpileup_p = close_process_if_finished(bcftools_mpileup_p,
                                                       "bcftools mpileup %s" % (region_label),
                                                       close_fds=[bcftools_norm_stdin_pipe_write])
        bcftools_norm_p = close_process_if_finished(bcftools_norm_p,
                                                    "bcftools norm %s" % (region_label),
                                                    close_fds=[bcftools_norm_stdin_pipe_read, part_tee_stdin_pipe_write])
        part_tee_p = close_process_if_finished(part_tee_p,
                                               "tee %s" % (region_label),
                                               close_fds=[part_tee_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write],
                                               ignore_error=True)
        bcftools_view_headeronly_p = close_process_if_finished(bcftools_view_headeronly_p,
                                                               "bcftools view -h %s" % (region_label),
                                                               close_fds=[bcftools_view_headeronly_stdin_pipe_read])
        bcftools_view_noheader_p = close_process_if_finished(bcftools_view_noheader_p,
                                                             "bcftools view %s" % (region_label),
                                                             close_files=[current_concat_noheader_fifo_f])
        region_concat_p = close_process_if_finished(region_concat_p,
                                                    "bcftools concat",
                                                    close_files=[out_file_tmp_f])
        # end loop once all processes have finished
        if ((region_concat_p is None)
                and (bcftools_view_noheader_p is None)
                and (bcftools_view_headeronly_p is None)
                and (part_tee_p is None)
                and (bcftools_norm_p is None)
                and (bcftools_mpileup_p is None)):
            print "All region work has completed"
            break
        else:
            sleep(0.01)  # continue to next loop iteration

    # Forcefully terminate any stragglers still tracked in the module-level
    # child_pids list.
    if len(child_pids) > 0:
        print "WARNING: some children are still alive: [%s]" % (child_pids)
        for pid in child_pids:
            print "Attempting to terminate %s forcefully" % (pid)
            try:
                os.kill(pid, SIGTERM)
            except Exception as e:
                print "Could not kill pid %s: %s" % (pid, e)

    # All data is on disk now; the fifos are no longer needed.
    for fifo in fifos_to_delete:
        try:
            os.remove(fifo)
        except:
            raise

    # Build a file-of-filenames of the per-region header-only .gz files that
    # actually exist (a region with no data produces no file).
    concat_headeronly_tmp_fofn = os.path.join(tmp_dir, output_basename + ".fifos_fofn")
    tmp_files_to_delete = []
    print "Preparing fofn for bcftools concat (headeronly): %s" % (concat_headeronly_tmp_fofn)
    with open(concat_headeronly_tmp_fofn, 'w') as f:
        print "Checking files for regions: %s" % regions
        for concat_headeronly_tmp in [concat_headeronly_tmps[region] for region in regions]:
            if os.path.exists(concat_headeronly_tmp):
                print "Adding %s to fofn" % concat_headeronly_tmp
                f.write("%s\n" % concat_headeronly_tmp)
                tmp_files_to_delete.append(concat_headeronly_tmp)
            else:
                print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp

    # Merge the per-region headers into one, filtering out per-run noise
    # lines (##bcftools*, ##mpileup*, ##reference*) so they dedupe cleanly.
    final_headeronly_tmp = os.path.join(tmp_dir, output_basename + ".headeronly.g.vcf")
    final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb')
    print "Creating 'bcftools concat | grep' pipe"
    grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe()
    grep_headeronly_cmd = ["egrep", "-v", "^[#][#](bcftools|mpileup|reference)"]
    grep_headeronly_p = run_child_cmd(grep_headeronly_cmd,
                                      stdin=grep_headeronly_stdin_pipe_read,
                                      stdout=final_headeronly_tmp_f,
                                      tag="grep (headeronly)")
    bcftools_concat_headeronly_cmd = ["bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn]
    bcftools_concat_headeronly_p = run_child_cmd(bcftools_concat_headeronly_cmd,
                                                 stdout=grep_headeronly_stdin_pipe_write,
                                                 tag="bcftools concat (headeronly)")
    while True:
        watch_fds_and_print_output()
        bcftools_concat_headeronly_p = close_process_if_finished(bcftools_concat_headeronly_p,
                                                                 "bcftools concat (headeronly)",
                                                                 close_fds=[grep_headeronly_stdin_pipe_write])
        grep_headeronly_p = close_process_if_finished(grep_headeronly_p,
                                                      "grep (headeronly)",
                                                      close_fds=[grep_headeronly_stdin_pipe_read],
                                                      close_files=[final_headeronly_tmp_f])
        if ((bcftools_concat_headeronly_p is None)
                and (grep_headeronly_p is None)):
            # none of the processes are still running, we're done!
            break
        else:
            sleep(0.01)  # continue to next loop iteration
    if bcftools_concat_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate bcftools concat (headeronly)"
    if grep_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate grep (headeronly)"

    # check if there was any data output (stat tuple index 6 == st_size)
    if os.stat(out_file_tmp)[6] == 0:
        # 0-byte data file, there is no point in concatenating and
        # reheader will reject the file, so we need to bgzip it ourselves
        print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % (final_out_file)
        final_out_file_f = open(final_out_file, 'wb')
        final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp]
        final_bgzip_p = run_child_cmd(final_bgzip_cmd, tag="final bgzip", stdout=final_out_file_f)
        while True:
            watch_fds_and_print_output()
            final_bgzip_p = close_process_if_finished(final_bgzip_p,
                                                      "final bgzip",
                                                      close_files=[final_out_file_f])
            if (final_bgzip_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)  # continue to next loop iteration
        if final_bgzip_p is not None:
            print "ERROR: failed to cleanly terminate final bgzip (header with no data)"
    else:
        # there is some data in the data file: first
        # 'cat header data | bcftools view -Oz' to get a provisionally-headed
        # compressed gVCF, then 'bcftools reheader' it with the clean header.
        print "Creating final 'cat | bcftools view -Oz' pipe"
        final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe()
        print "Preparing penultimate output file [%s]" % (penultimate_out_file)
        final_bcftools_view_cmd = ["bcftools", "view", "-Oz", "-o", penultimate_out_file]
        final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp]
        final_bcftools_view_p = run_child_cmd(final_bcftools_view_cmd,
                                              tag="final bcftools view -Oz",
                                              stdin=final_bcftools_view_stdin_pipe_read)
        final_concat_p = run_child_cmd(final_concat_cmd,
                                       tag="final cat (header+data)",
                                       stdout=final_bcftools_view_stdin_pipe_write)
        while True:
            watch_fds_and_print_output()
            final_bcftools_view_p = close_process_if_finished(final_bcftools_view_p,
                                                              "final bcftools view -Oz",
                                                              close_fds=[final_bcftools_view_stdin_pipe_read])
            final_concat_p = close_process_if_finished(final_concat_p,
                                                       "final cat (header+data)",
                                                       close_fds=[final_bcftools_view_stdin_pipe_write])
            if ((final_concat_p is None)
                    and (final_bcftools_view_p is None)):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)  # continue to next loop iteration
        if final_bcftools_view_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools view -Oz"
        if final_concat_p is not None:
            print "ERROR: failed to cleanly terminate final cat (header+data)"

        print "Reheadering penultimate output file into final out file [%s]" % (final_out_file)
        final_bcftools_reheader_cmd = ["bcftools", "reheader", "-h", final_headeronly_tmp, "-o", final_out_file, penultimate_out_file]
        final_bcftools_reheader_p = run_child_cmd(final_bcftools_reheader_cmd,
                                                  tag="final bcftools reheader")
        while True:
            watch_fds_and_print_output()
            final_bcftools_reheader_p = close_process_if_finished(final_bcftools_reheader_p,
                                                                  "final bcftools reheader")
            if (final_bcftools_reheader_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)  # continue to next loop iteration
        if final_bcftools_reheader_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools reheader"
        os.remove(penultimate_out_file)

    # Index the final gVCF
    print "Indexing final output file [%s]" % (final_out_file)
    bcftools_index_cmd = ["bcftools", "index", final_out_file]
    bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index")
    while True:
        watch_fds_and_print_output()
        bcftools_index_p = close_process_if_finished(bcftools_index_p, "bcftools index")
        if (bcftools_index_p is None):
            break
        else:
            sleep(0.01)  # continue to next loop iteration
    if bcftools_index_p is not None:
        print "ERROR: failed to cleanly terminate bcftools index"

    print "Complete, removing temporary files"
    os.remove(concat_headeronly_tmp_fofn)
    os.remove(out_file_tmp)
    os.remove(final_headeronly_tmp)
    for tmp_file in tmp_files_to_delete:
        os.remove(tmp_file)

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()
    print "Task output locator [%s]" % output_locator

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)

    # Done!
    print "Task complete!"
# NOTE(review): orphaned fragment -- the enclosing function header (and the
# definitions of 'task', 'input_file' and 'infile_parts') are not visible in
# this part of the file; presumably 'infile_parts' is a regex match over a
# Keep file path ("locator/stream/filename") -- confirm against the missing
# header before relying on the group numbers documented below.
# Prepare a scratch output directory under the task's tmpdir and work there.
outdir = os.path.join(task.tmpdir, "output")
os.makedirs(outdir)
os.chdir(outdir)
if infile_parts is None:
    print >>sys.stderr, "Failed to parse input filename '%s' as a Keep file\n" % input_file
    sys.exit(1)
# Open the input collection; groups 3/4 appear to carry "/stream" and
# "/filename", so the leading character is stripped off each.
cr = arvados.CollectionReader(infile_parts.group(1))
streamname = infile_parts.group(3)[1:]
filename = infile_parts.group(4)[1:]
if streamname is not None:
    subprocess.call(["mkdir", "-p", streamname])
    os.chdir(streamname)
else:
    streamname = '.'
# If the mounted input looks like an archive, extract it with dtrx and
# upload the extracted tree; otherwise pass the single file through by
# manifest without copying its data.
m = re.match(r'.*\.(gz|Z|bz2|tgz|tbz|zip|rar|7z|cab|deb|rpm|cpio|gem)$', arvados.get_task_param_mount('input'), re.IGNORECASE)
if m is not None:
    rc = subprocess.call(["dtrx", "-r", "-n", "-q", arvados.get_task_param_mount('input')])
    if rc == 0:
        task.set_output(robust_put.upload(outdir))
    else:
        sys.exit(rc)
else:
    # Not an archive: emit the single file's manifest as this task's output.
    streamreader = filter(lambda s: s.name() == streamname, cr.all_streams())[0]
    filereader = streamreader.files()[filename]
    task.set_output(streamname + filereader.as_manifest()[1:])
def main():
    """Crunch script entry point: drive 'run-bt-mpileup' over one CRAM chunk.

    Task 0 fans out one subtask per input CRAM and exits. Each worker task
    mounts the reference FASTA, a genome-chunk interval file and its single
    CRAM, renders a jinja2 config from RUNNER_CONFIG_TEMPLATE, runs the
    external 'run-bt-mpileup' pipeline into a scratch directory, tidies and
    renames its outputs, and stores the directory in Keep as the task output.

    NOTE(review): one_task_per_cram_file, RUNNER_CONFIG_TEMPLATE,
    InvalidArgumentError and FileAccessError are defined elsewhere in this
    file -- descriptions here are inferred from their names.
    """
    this_job = arvados.current_job()

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    # Get reference FASTA (last *.fa found in the 'ref' mount wins)
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')
    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError(
            "No reference fasta found in reference collection.")

    # Ensure we can read the reference file
    if not os.access(ref_file, os.R_OK):
        raise FileAccessError("reference FASTA file not readable: %s" % ref_file)
    # TODO: could check readability of .fai and .dict as well?

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')
    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list.txt$', f):  # NOTE(review): unescaped '.' before 'txt'
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError(
            "No chunk intervals file found in chunk collection.")

    # Ensure we can read the chunk file
    if not os.access(chunk_file, os.R_OK):
        raise FileAccessError("Chunk intervals file not readable: %s" % chunk_file)

    # Get single CRAM file for this task; stream_name/input_file_name are
    # bound inside the loop (safe because exactly one CRAM is required below)
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')
    input_cram_files = []
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    if not os.access(cram_file, os.R_OK):
        raise FileAccessError("CRAM file not readable: %s" % cram_file)

    # Ensure we have corresponding CRAI index and can read it as well
    # (accept either foo.crai or foo.cram.crai)
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert (cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not os.access(crai_file, os.R_OK):
        crai_file = cram_file_base + ".cram.crai"
        if not os.access(crai_file, os.R_OK):
            raise FileAccessError(
                "No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise  # NOTE(review): try/except that only re-raises is a no-op wrapper
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    # out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) + ".g.vcf.gz")

    config_file = os.path.join(arvados.current_task().tmpdir, "mpileup.conf")
    lock_file = os.path.join(arvados.current_task().tmpdir, "run-bt-mpileup.lock")
    if not os.path.exists(RUNNER_CONFIG_TEMPLATE):
        raise FileAccessError("No runner configuration template at %s" % RUNNER_CONFIG_TEMPLATE)

    # generate config from the jinja2 template (template path is absolute,
    # hence the filesystem loader rooted at "/")
    runner_config_text = jinja2.Environment(loader=jinja2.FileSystemLoader(
        "/")).get_template(RUNNER_CONFIG_TEMPLATE).render(
            fasta_reference=ref_file, input_cram=cram_file, regions=chunk_file)
    with open(config_file, "wb") as fh:
        fh.write(runner_config_text)
    # report configuration
    print "Generated runner config to %s:\n%s" % (config_file, runner_config_text)

    # Call run-bt-mpileup, merging its stderr into stdout
    runner_p = subprocess.Popen([
        "run-bt-mpileup", "+config", config_file, "+js", "mpm", "+loop", "5",
        "+lock", lock_file, "-o", out_dir
    ],
                                stdin=None,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                close_fds=True,
                                shell=False)
    while runner_p.poll() is None:
        line = runner_p.stdout.readline()
        # only print '#### unfinished' lines or things that are errors or warnings
        if re.search(r'\d+\s+unfinished', line) or re.search(
                r'(FATAL|ERROR|WARNING)', line, flags=re.IGNORECASE):
            print "RUNNER: %s" % line.rstrip()
    runner_exit = runner_p.wait()
    if runner_exit != 0:
        print "WARNING: runner exited with exit code %s" % runner_exit

    # clean up out_dir (best effort: the first missing file aborts the rest)
    try:
        os.remove(os.path.join(out_dir, "run-bt-mpileup.lock"))
        os.remove(os.path.join(out_dir, "mpileup.conf"))
        os.remove(os.path.join(out_dir, "cleaned-job-outputs.tgz"))
    except:
        print "WARNING: could not remove some output files!"
        pass

    # rename the fixed 'all.bcf*' outputs to <cram>.<chunk>.bcf* names
    out_bcf = os.path.join(
        out_dir,
        os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) + ".bcf")
    try:
        os.rename(os.path.join(out_dir, "all.bcf"), out_bcf)
        os.rename(os.path.join(out_dir, "all.bcf.csi"), out_bcf + ".csi")
        os.rename(os.path.join(out_dir, "all.bcf.filt.vchk"), out_bcf + ".filt.vchk")
        os.rename(os.path.join(out_dir, "all.bcf.vchk"), out_bcf + ".vchk")
    except:
        print "WARNING: could not rename some output files!"
        pass

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep (stream_name came from the CRAM discovery loop)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)
def main():
    """Crunch script entry point: drive 'run-bt-mpileup' over one CRAM chunk.

    Task 0 fans out one subtask per input CRAM and exits. Each worker task
    mounts the reference FASTA, a genome-chunk interval file and its single
    CRAM, renders a jinja2 config from RUNNER_CONFIG_TEMPLATE, runs the
    external 'run-bt-mpileup' pipeline into a scratch directory, tidies and
    renames its outputs, and stores the directory in Keep as the task output.

    NOTE(review): one_task_per_cram_file, RUNNER_CONFIG_TEMPLATE,
    InvalidArgumentError and FileAccessError are defined elsewhere in this
    file -- descriptions here are inferred from their names.
    """
    this_job = arvados.current_job()

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert this_task["sequence"] != 0

    # Get reference FASTA (last *.fa found in the "ref" mount wins)
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount("ref")
    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r"\.fa$", f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError("No reference fasta found in reference collection.")

    # Ensure we can read the reference file
    if not os.access(ref_file, os.R_OK):
        raise FileAccessError("reference FASTA file not readable: %s" % ref_file)
    # TODO: could check readability of .fai and .dict as well?

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount("chunk")
    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r"\.region_list.txt$", f):  # NOTE(review): unescaped '.' before 'txt'
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError("No chunk intervals file found in chunk collection.")

    # Ensure we can read the chunk file
    if not os.access(chunk_file, os.R_OK):
        raise FileAccessError("Chunk intervals file not readable: %s" % chunk_file)

    # Get single CRAM file for this task; stream_name/input_file_name are
    # bound inside the loop (safe because exactly one CRAM is required below)
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount("input")
    input_cram_files = []
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r"\.cram$", f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    if not os.access(cram_file, os.R_OK):
        raise FileAccessError("CRAM file not readable: %s" % cram_file)

    # Ensure we have corresponding CRAI index and can read it as well
    # (accept either foo.crai or foo.cram.crai)
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert cram_file_ext == ".cram"
    crai_file = cram_file_base + ".crai"
    if not os.access(crai_file, os.R_OK):
        crai_file = cram_file_base + ".cram.crai"
        if not os.access(crai_file, os.R_OK):
            raise FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    out_dir = os.path.join(arvados.current_task().tmpdir, "out")
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise  # NOTE(review): try/except that only re-raises is a no-op wrapper
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    # out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) + ".g.vcf.gz")

    config_file = os.path.join(arvados.current_task().tmpdir, "mpileup.conf")
    lock_file = os.path.join(arvados.current_task().tmpdir, "run-bt-mpileup.lock")
    if not os.path.exists(RUNNER_CONFIG_TEMPLATE):
        raise FileAccessError("No runner configuration template at %s" % RUNNER_CONFIG_TEMPLATE)

    # generate config from the jinja2 template (template path is absolute,
    # hence the filesystem loader rooted at "/")
    runner_config_text = (
        jinja2.Environment(loader=jinja2.FileSystemLoader("/"))
        .get_template(RUNNER_CONFIG_TEMPLATE)
        .render(fasta_reference=ref_file, input_cram=cram_file, regions=chunk_file)
    )
    with open(config_file, "wb") as fh:
        fh.write(runner_config_text)
    # report configuration
    print "Generated runner config to %s:\n%s" % (config_file, runner_config_text)

    # Call run-bt-mpileup, merging its stderr into stdout
    runner_p = subprocess.Popen(
        ["run-bt-mpileup", "+config", config_file, "+js", "mpm", "+loop", "5",
         "+lock", lock_file, "-o", out_dir],
        stdin=None,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        close_fds=True,
        shell=False,
    )
    while runner_p.poll() is None:
        line = runner_p.stdout.readline()
        # only print '#### unfinished' lines or things that are errors or warnings
        if re.search(r"\d+\s+unfinished", line) or re.search(r"(FATAL|ERROR|WARNING)", line, flags=re.IGNORECASE):
            print "RUNNER: %s" % line.rstrip()
    runner_exit = runner_p.wait()
    if runner_exit != 0:
        print "WARNING: runner exited with exit code %s" % runner_exit

    # clean up out_dir (best effort: the first missing file aborts the rest)
    try:
        os.remove(os.path.join(out_dir, "run-bt-mpileup.lock"))
        os.remove(os.path.join(out_dir, "mpileup.conf"))
        os.remove(os.path.join(out_dir, "cleaned-job-outputs.tgz"))
    except:
        print "WARNING: could not remove some output files!"
        pass

    # rename the fixed "all.bcf*" outputs to <cram>.<chunk>.bcf* names
    out_bcf = os.path.join(out_dir, os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) + ".bcf")
    try:
        os.rename(os.path.join(out_dir, "all.bcf"), out_bcf)
        os.rename(os.path.join(out_dir, "all.bcf.csi"), out_bcf + ".csi")
        os.rename(os.path.join(out_dir, "all.bcf.filt.vchk"), out_bcf + ".filt.vchk")
        os.rename(os.path.join(out_dir, "all.bcf.vchk"), out_bcf + ".vchk")
    except:
        print "WARNING: could not rename some output files!"
        pass

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep (stream_name came from the CRAM discovery loop)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)
# NOTE(review): orphaned fragment -- the enclosing function header (and the
# definitions of 'task', 'outdir', 'input_file' and 'infile_parts') are not
# visible in this part of the file; presumably 'infile_parts' is a regex
# match over a Keep file path ("locator/stream/filename") -- confirm against
# the missing header before relying on the group numbers documented below.
if infile_parts is None:
    print >> sys.stderr, "Failed to parse input filename '%s' as a Keep file\n" % input_file
    sys.exit(1)
# Open the input collection; groups 3/4 appear to carry "/stream" and
# "/filename", so the leading character is stripped off each.
cr = arvados.CollectionReader(infile_parts.group(1))
streamname = infile_parts.group(3)[1:]
filename = infile_parts.group(4)[1:]
if streamname is not None:
    subprocess.call(["mkdir", "-p", streamname])
    os.chdir(streamname)
else:
    streamname = '.'
# If the mounted input looks like an archive, extract it with dtrx and
# upload the extracted tree; otherwise pass the single file through by
# manifest without copying its data.
m = re.match(r'.*\.(gz|Z|bz2|tgz|tbz|zip|rar|7z|cab|deb|rpm|cpio|gem)$',
             arvados.get_task_param_mount('input'), re.IGNORECASE)
if m is not None:
    rc = subprocess.call(
        ["dtrx", "-r", "-n", "-q", arvados.get_task_param_mount('input')])
    if rc == 0:
        task.set_output(robust_put.upload(outdir))
    else:
        sys.exit(rc)
else:
    streamreader = filter(lambda s: s.name() == streamname,
                          cr.all_streams())[0]
    filereader = streamreader.files()[filename]
    task.set_output(streamname + filereader.as_manifest()[1:])
def main(): this_job = arvados.current_job() # Setup sub tasks 1-N (and terminate if this is task 0) one_task_per_cram_file(if_sequence=0, and_end_task=True) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task assert(this_task['sequence'] != 0) # Get reference FASTA ref_file = None print "Mounting reference FASTA collection" ref_dir = arvados.get_task_param_mount('ref') for f in arvados.util.listdir_recursive(ref_dir): if re.search(r'\.fa$', f): ref_file = os.path.join(ref_dir, f) if ref_file is None: raise InvalidArgumentError("No reference fasta found in reference collection.") # Ensure we can read the reference file if not os.access(ref_file, os.R_OK): raise FileAccessError("reference FASTA file not readable: %s" % ref_file) # TODO: could check readability of .fai and .dict as well? # Get genome chunk intervals file # chunk_file = None # print "Mounting chunk collection" # chunk_dir = arvados.get_task_param_mount('chunk') # for f in arvados.util.listdir_recursive(chunk_dir): # if re.search(r'\.region_list.txt$', f): # chunk_file = os.path.join(chunk_dir, f) # if chunk_file is None: # raise InvalidArgumentError("No chunk intervals file found in chunk collection.") # # Ensure we can read the chunk file # if not os.access(chunk_file, os.R_OK): # raise FileAccessError("Chunk intervals file not readable: %s" % chunk_file) # Get single CRAM file for this task input_dir = None print "Mounting task input collection" input_dir = arvados.get_task_param_mount('input') input_cram_files = [] for f in arvados.util.listdir_recursive(input_dir): if re.search(r'\.cram$', f): stream_name, input_file_name = os.path.split(f) input_cram_files += [os.path.join(input_dir, f)] if len(input_cram_files) != 1: raise InvalidArgumentError("Expected exactly one cram file per task.") # There is only one CRAM file cram_file = input_cram_files[0] # Ensure we can read the CRAM file if not os.access(cram_file, 
os.R_OK): raise FileAccessError("CRAM file not readable: %s" % cram_file) # Ensure we have corresponding CRAI index and can read it as well cram_file_base, cram_file_ext = os.path.splitext(cram_file) assert(cram_file_ext == ".cram") crai_file = cram_file_base + ".crai" if not os.access(crai_file, os.R_OK): crai_file = cram_file_base + ".cram.crai" if not os.access(crai_file, os.R_OK): raise FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file) # Will write to out_dir, make sure it is empty out_dir = os.path.join(arvados.current_task().tmpdir, 'out') if os.path.exists(out_dir): old_out_dir = out_dir + ".old" print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir) try: os.rename(out_dir, old_out_dir) except: raise try: os.mkdir(out_dir) os.chdir(out_dir) except: raise # out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) + ".g.bcf") out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + ".g.bcf") bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file) # Call bcftools runner_p = subprocess.Popen(bash_cmd_pipe, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True, shell=True) while runner_p.poll() is None: line = runner_p.stdout.readline() print "BCFTOOLS: %s" % line.rstrip() runner_exit = runner_p.wait() if runner_exit != 0: print "WARNING: runner exited with exit code %s" % runner_exit # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep out.write_directory_tree(out_dir, stream_name) # Commit the output to Keep. output_locator = out.finish() # Use the resulting locator as the output for this task. this_task.set_output(output_locator)