def main(): ################################################################################ # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on # applying the capturing group named "group_by" in group_by_regex. # (and terminate if this is task 0) ################################################################################ ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection']) job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection'] interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection'] interval_count = 1 if "interval_count" in arvados.current_job()['script_parameters']: interval_count = arvados.current_job()['script_parameters']['interval_count'] if arvados.current_task()['sequence'] == 0: # get candidates for task reuse task_key_params=['inputs', 'ref', 'name'] # N.B. inputs collection includes input vcfs and corresponding interval_list script="gatk-genotypegvcfs.py" oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2' job_filters = [ ['script', '=', script], ['repository', '=', arvados.current_job()['repository']], ['script_version', 'in git', oldest_git_commit_to_reuse], ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']], ] # retrieve a full set of all possible reusable tasks at sequence 1 print "Retrieving all potentially reusable tasks" reusable_tasks = hgi_arvados.get_reusable_tasks(1, task_key_params, job_filters) print "Have %s tasks for potential reuse" % (len(reusable_tasks)) def create_task_with_validated_reuse(sequence, params): return hgi_arvados.create_or_reuse_task(sequence, params, reusable_tasks, task_key_params, validate_task_output) # Setup sub tasks (and terminate if this is task 0) hgi_arvados.one_task_per_group_combined_inputs(ref_input_pdh, job_input_pdh, interval_lists_pdh, group_by_regex, if_sequence=0, 
and_end_task=True, create_task_func=create_task_with_validated_reuse) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task sequence assert(this_task['sequence'] > 0) ################################################################################ # Phase IIa: If we are a "reuse" task, just set our output and be done with it ################################################################################ if 'reuse_job_task' in this_task['parameters']: print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task'] exit(0) ################################################################################ # Phase IIb: Genotype gVCFs! ################################################################################ ref_file = gatk_helper.mount_gatk_reference(ref_param="ref") gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs") out_dir = hgi_arvados.prepare_out_dir() interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="inputs") name = this_task['parameters'].get('name') if not name: name = "unknown" out_file = name + ".vcf.gz" # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output out_file = out_file.replace(".bcf", "._cf") # GenotypeGVCFs! gatk_exit = gatk.genotype_gvcfs(ref_file, interval_list_file, gvcf_files, os.path.join(out_dir, out_file), cores="32", java_mem="200g") if gatk_exit != 0: print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit arvados.api().job_tasks().update(uuid=this_task['uuid'], body={'success':False} ).execute() else: print "GATK exited successfully, writing output to keep" # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep out.write_directory_tree(out_dir) # Commit the output to Keep. 
output_locator = out.finish() if validate_task_output(output_locator): print "Task output validated, setting output to %s" % (output_locator) # Use the resulting locator as the output for this task. this_task.set_output(output_locator) else: print "ERROR: Failed to validate task output (%s)" % (output_locator) arvados.api().job_tasks().update(uuid=this_task['uuid'], body={'success':False} ).execute()
def main(): ################################################################################ # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on # applying the capturing group named "group_by" in group_by_regex. # (and terminate if this is task 0) ################################################################################ ref_input_pdh = gatk_helper.prepare_gatk_reference_collection( reference_coll=arvados.current_job()["script_parameters"]["reference_collection"] ) job_input_pdh = arvados.current_job()["script_parameters"]["inputs_collection"] interval_lists_pdh = arvados.current_job()["script_parameters"]["interval_lists_collection"] interval_count = 1 if "interval_count" in arvados.current_job()["script_parameters"]: interval_count = arvados.current_job()["script_parameters"]["interval_count"] # Setup sub tasks 1-N (and terminate if this is task 0) hgi_arvados.chunked_tasks_per_cram_file( ref_input_pdh, job_input_pdh, interval_lists_pdh, validate_task_output, if_sequence=0, and_end_task=True, reuse_tasks=False, oldest_git_commit_to_reuse="6ca726fc265f9e55765bf1fdf71b86285b8a0ff2", script="gatk-haplotypecaller-cram.py", ) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task assert this_task["sequence"] != 0 ################################################################################ # Phase IIa: If we are a "reuse" task, just set our output and be done with it ################################################################################ if "reuse_job_task" in this_task["parameters"]: print "This task's work was already done by JobTask %s" % this_task["parameters"]["reuse_job_task"] exit(0) ################################################################################ # Phase IIb: Call Haplotypes! 
################################################################################ ref_file = gatk_helper.mount_gatk_reference(ref_param="ref") interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="chunk") cram_file = gatk_helper.mount_gatk_cram_input(input_param="input") cram_file_base, cram_file_ext = os.path.splitext(cram_file) out_dir = hgi_arvados.prepare_out_dir() out_filename = os.path.basename(cram_file_base) + "." + os.path.basename(interval_list_file) + ".vcf.gz" # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output out_filename = out_filename.replace(".bcf", "._cf") # HaplotypeCaller! gatk_exit = gatk.haplotype_caller(ref_file, cram_file, interval_list_file, os.path.join(out_dir, out_filename)) if gatk_exit != 0: print "ERROR: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"], body={"success": False}).execute() else: print "GATK exited successfully, writing output to keep" # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep out.write_directory_tree(out_dir) # Commit the output to Keep. output_locator = out.finish() print "Task output written to keep, validating it" if validate_task_output(output_locator): print "Task output validated, setting output to %s" % (output_locator) # Use the resulting locator as the output for this task. this_task.set_output(output_locator) else: print "ERROR: Failed to validate task output (%s)" % (output_locator) arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"], body={"success": False}).execute()
def main(): ################################################################################ # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on # applying the capturing group named "group_by" in group_by_regex. # (and terminate if this is task 0) ################################################################################ ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection']) job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection'] interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection'] interval_count = 1 if "interval_count" in arvados.current_job()['script_parameters']: interval_count = arvados.current_job()['script_parameters']['interval_count'] # Setup sub tasks 1-N (and terminate if this is task 0) hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh, job_input_pdh, interval_lists_pdh, group_by_regex, max_gvcfs_to_combine, if_sequence=0, and_end_task=True) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task sequence assert(this_task['sequence'] > 0) ################################################################################ # Phase II: Read interval_list and split into additional intervals ################################################################################ hgi_arvados.one_task_per_interval(interval_count, validate_task_output, reuse_tasks=True, oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1", if_sequence=1, and_end_task=True) # We will never reach this point if we are in the 1st task sequence assert(this_task['sequence'] > 1) ################################################################################ # Phase IIIa: If we are a "reuse" task, just set our output and be done with it 
################################################################################ if 'reuse_job_task' in this_task['parameters']: print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task'] exit(0) ################################################################################ # Phase IIIb: Combine gVCFs! ################################################################################ ref_file = gatk_helper.mount_gatk_reference(ref_param="ref") gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs") out_dir = hgi_arvados.prepare_out_dir() name = this_task['parameters'].get('name') if not name: name = "unknown" interval_str = this_task['parameters'].get('interval') if not interval_str: interval_str = "" interval_strs = interval_str.split() intervals = [] for interval in interval_strs: intervals.extend(["--intervals", interval]) out_file = name + ".vcf.gz" if interval_count > 1: out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz" if len(out_file) > 255: out_file = name + "." + '_'.join([interval_strs[0], interval_strs[-1]]) + ".vcf.gz" print "Output file name was too long with full interval list, shortened it to: %s" % out_file if len(out_file) > 255: raise errors.InvalidArgumentError("Output file name is too long, cannot continue: %s" % out_file) # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output out_file = out_file.replace(".bcf", "._cf") # CombineGVCFs! 
extra_args = intervals extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"]) gatk_exit = gatk.combine_gvcfs(ref_file, gvcf_files, os.path.join(out_dir, out_file), extra_gatk_args=extra_args) if gatk_exit != 0: print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit arvados.api().job_tasks().update(uuid=this_task['uuid'], body={'success':False} ).execute() else: print "GATK exited successfully, writing output to keep" # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep out.write_directory_tree(out_dir) # Commit the output to Keep. output_locator = out.finish() if validate_task_output(output_locator): print "Task output validated, setting output to %s" % (output_locator) # Use the resulting locator as the output for this task. this_task.set_output(output_locator) else: print "ERROR: Failed to validate task output (%s)" % (output_locator) arvados.api().job_tasks().update(uuid=this_task['uuid'], body={'success':False} ).execute()
def main():
    """Combine gVCFs with GATK CombineGVCFs, one JobTask per group/interval.

    Two fan-out phases (per the helper names): the sequence-0 task queues one
    sub-task per group of up to ``max_gvcfs_to_combine`` gVCFs and ends; each
    sequence-1 task further splits its group by interval, reusing validated
    results from earlier jobs.  Tasks at sequence > 1 run CombineGVCFs and
    write the combined gVCF to Keep as the task output.
    """
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    # applying the capturing group named "group_by" in group_by_regex.
    # (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    # interval_count defaults to 1 when the job does not supply one; it drives
    # the per-interval fan-out and output naming below.
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh, job_input_pdh,
                                                   interval_lists_pdh, group_by_regex,
                                                   max_gvcfs_to_combine,
                                                   if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert (this_task['sequence'] > 0)

    ################################################################################
    # Phase II: Read interval_list and split into additional intervals
    ################################################################################
    hgi_arvados.one_task_per_interval(interval_count, validate_task_output,
                                      reuse_tasks=True,
                                      oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1",
                                      if_sequence=1, and_end_task=True)

    # We will never reach this point if we are in the 1st task sequence
    assert (this_task['sequence'] > 1)

    ################################################################################
    # Phase IIIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIIb: Combine gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    interval_str = this_task['parameters'].get('interval')
    if not interval_str:
        interval_str = ""
    interval_strs = interval_str.split()
    # Build one "--intervals <token>" pair per whitespace-separated interval.
    intervals = []
    for interval in interval_strs:
        intervals.extend(["--intervals", interval])
    out_file = name + ".vcf.gz"
    if interval_count > 1:
        out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz"
        if len(out_file) > 255:
            # Keep filenames within the usual 255-byte filesystem limit by
            # naming the file after just the first and last intervals.
            out_file = name + "." + '_'.join([interval_strs[0], interval_strs[-1]]) + ".vcf.gz"
            print "Output file name was too long with full interval list, shortened it to: %s" % out_file
        if len(out_file) > 255:
            raise errors.InvalidArgumentError("Output file name is too long, cannot continue: %s" % out_file)

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # CombineGVCFs!
    # NOTE(review): extra_args aliases `intervals`, so the extend below also
    # mutates intervals -- harmless here since intervals is not used again,
    # but consider copying if that ever changes.
    extra_args = intervals
    extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"])
    gatk_exit = gatk.combine_gvcfs(ref_file, gvcf_files, os.path.join(out_dir, out_file),
                                   extra_gatk_args=extra_args)
    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        # Mark this task as failed so the job does not use partial output.
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success': False}).execute()
    else:
        print "GATK exited successfully, writing output to keep"
        # Write a new collection as output
        out = arvados.CollectionWriter()
        # Write out_dir to keep
        out.write_directory_tree(out_dir)
        # Commit the output to Keep.
        output_locator = out.finish()
        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)
            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success': False}).execute()
def main():
    """Genotype gVCFs with GATK GenotypeGVCFs, one JobTask per input group.

    The sequence-0 task looks up reusable tasks from earlier jobs, queues one
    sub-task per input group and then ends.  Every later task either reports
    the reused result it was created from, or runs GenotypeGVCFs on its group
    and writes the resulting VCF to Keep as the task output.
    """
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    # applying the capturing group named "group_by" in group_by_regex.
    # (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    # NOTE(review): interval_count is read here but never used later in this
    # function -- presumably kept for parity with the sibling combine-gvcfs
    # script; confirm before removing.
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']
    if arvados.current_task()['sequence'] == 0:
        # get candidates for task reuse
        task_key_params = ['inputs', 'ref', 'name']  # N.B. inputs collection includes input vcfs and corresponding interval_list
        script = "gatk-genotypegvcfs.py"
        oldest_git_commit_to_reuse = '6ca726fc265f9e55765bf1fdf71b86285b8a0ff2'
        job_filters = [
            ['script', '=', script],
            ['repository', '=', arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
        ]
        # retrieve a full set of all possible reusable tasks at sequence 1
        print "Retrieving all potentially reusable tasks"
        reusable_tasks = hgi_arvados.get_reusable_tasks(1, task_key_params, job_filters)
        print "Have %s tasks for potential reuse" % (len(reusable_tasks))

        def create_task_with_validated_reuse(sequence, params):
            # Only reuse a prior task when its recorded output still validates.
            return hgi_arvados.create_or_reuse_task(sequence, params, reusable_tasks,
                                                    task_key_params, validate_task_output)

        # Setup sub tasks (and terminate if this is task 0)
        hgi_arvados.one_task_per_group_combined_inputs(ref_input_pdh, job_input_pdh,
                                                       interval_lists_pdh, group_by_regex,
                                                       if_sequence=0, and_end_task=True,
                                                       create_task_func=create_task_with_validated_reuse)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert (this_task['sequence'] > 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Genotype gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    # NOTE(review): the interval list is mounted from the same "inputs" task
    # parameter as the gVCFs (cf. "chunk" in the haplotype-caller script).
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="inputs")
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    out_file = name + ".vcf.gz"
    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")
    # GenotypeGVCFs!
    gatk_exit = gatk.genotype_gvcfs(ref_file, interval_list_file, gvcf_files,
                                    os.path.join(out_dir, out_file),
                                    cores="4", java_mem="19g")
    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        # Mark this task as failed so the job does not use partial output.
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success': False}).execute()
    else:
        print "GATK exited successfully, writing output to keep"
        # Write a new collection as output
        out = arvados.CollectionWriter()
        # Write out_dir to keep
        out.write_directory_tree(out_dir)
        # Commit the output to Keep.
        output_locator = out.finish()
        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)
            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success': False}).execute()
def main():
    """Call variants on one CRAM chunk per task with GATK HaplotypeCaller.

    The sequence-0 task fans out one sub-task per CRAM file chunk (no task
    reuse) and then ends inside the fan-out call.  Every later task mounts
    its reference, interval-list chunk and CRAM file, runs HaplotypeCaller,
    and writes the validated output to Keep as the task output.
    """
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    # applying the capturing group named "group_by" in group_by_regex.
    # (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    # NOTE(review): interval_count is read here but not used later in this
    # function; confirm before removing.
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.chunked_tasks_per_cram_file(
        ref_input_pdh, job_input_pdh, interval_lists_pdh, validate_task_output,
        if_sequence=0, and_end_task=True, reuse_tasks=False,
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
        script="gatk-haplotypecaller-cram.py")

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Call Haplotypes!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="chunk")
    cram_file = gatk_helper.mount_gatk_cram_input(input_param="input")
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    out_dir = hgi_arvados.prepare_out_dir()
    # Output name: <cram basename>.<interval-list basename>.vcf.gz
    out_filename = os.path.basename(cram_file_base) + "." + os.path.basename(interval_list_file) + ".vcf.gz"
    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_filename = out_filename.replace(".bcf", "._cf")
    # HaplotypeCaller!
    gatk_exit = gatk.haplotype_caller(ref_file, cram_file, interval_list_file,
                                      os.path.join(out_dir, out_filename))
    if gatk_exit != 0:
        print "ERROR: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        # Mark this task as failed so the job does not use partial output.
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': False}).execute()
    else:
        print "GATK exited successfully, writing output to keep"
        # Write a new collection as output
        out = arvados.CollectionWriter()
        # Write out_dir to keep
        out.write_directory_tree(out_dir)
        # Commit the output to Keep.
        output_locator = out.finish()
        print "Task output written to keep, validating it"
        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)
            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                             body={'success': False}).execute()