def spawn_new_task_per_file(script_parameter, regex, if_sequence=0, and_end_task=True):
    """
    Generalized form of one_task_per_pair_input_file from
    https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py

    Creates a new task for each file in the collection that matches the regex.
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters'][script_parameter]
    input_collection = Collection(job_input)
    for name in input_collection:
        if not re.search(regex, name):
            continue
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input_1': name,
            }
        }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
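# Usage sketch (an illustration, not part of the original crunch scripts): a
# driver script would typically call spawn_new_task_per_file once from task 0
# and then read the matched file name in the spawned tasks. The script
# parameter name "vcf_collection" and the regex are assumptions for this
# example only.
def example_spawn_driver():
    # Task 0: queue one child task per matching file, then mark task 0 done.
    spawn_new_task_per_file('vcf_collection', r'\.vcf\.gz$',
                            if_sequence=0, and_end_task=True)
    if arvados.current_task()['sequence'] == 0:
        return
    # Tasks at sequence 1: each sees its matched file name in 'input_1'.
    file_name = arvados.current_task()['parameters']['input_1']
    print "Processing %s" % file_name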
def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each pair of fastq files in this job's input collection.

    Each new task will have two parameters, named "input_1" and "input_2", each
    being a manifest containing a single fastq file.

    A matching pair of files in the input collection is assumed to have names
    "x_1.y" and "x_2.y".

    Files in the input collection that are not part of a matched pair are
    silently ignored.

    if_sequence and and_end_task arguments have the same significance as in
    arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    all_files = []
    for s in cr.all_streams():
        all_files += list(s.all_files())
    for s in cr.all_streams():
        for left_file in s.all_files():
            left_name = left_file.name()
            right_file = None
            right_name = re.sub(r'(.*_)1\.', r'\g<1>2.', left_name)
            if right_name == left_name:
                continue
            for f2 in s.all_files():
                if right_name == f2.name():
                    right_file = f2
            if right_file is not None:
                new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1': left_file.as_manifest(),
                        'input_2': right_file.as_manifest()
                    }
                }
                arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
def tarball_extract(tarball, path):
    """Retrieve a tarball from Keep and extract it to a local
    directory.  Return the absolute path where the tarball was
    extracted. If the top level of the tarball contained just one
    file or directory, return the absolute path of that single item.

    tarball -- collection locator
    path -- where to extract the tarball: absolute, or relative to job tmp
    """
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)
    already_have_it = False
    try:
        if os.readlink(os.path.join(path, '.locator')) == tarball:
            already_have_it = True
    except OSError:
        pass
    if not already_have_it:
        # emulate "rm -f" (i.e., if the file does not exist, we win)
        try:
            os.unlink(os.path.join(path, '.locator'))
        except OSError:
            if os.path.exists(os.path.join(path, '.locator')):
                os.unlink(os.path.join(path, '.locator'))
        for f in CollectionReader(tarball).all_files():
            if re.search(r'\.(tbz|tar.bz2)$', f.name()):
                p = tar_extractor(path, 'j')
            elif re.search(r'\.(tgz|tar.gz)$', f.name()):
                p = tar_extractor(path, 'z')
            elif re.search(r'\.tar$', f.name()):
                p = tar_extractor(path, '')
            else:
                raise arvados.errors.AssertionError(
                    "tarball_extract cannot handle filename %s" % f.name())
            while True:
                buf = f.read(2**20)
                if len(buf) == 0:
                    break
                p.stdin.write(buf)
            p.stdin.close()
            p.wait()
            if p.returncode != 0:
                lockfile.close()
                raise arvados.errors.CommandFailedError(
                    "tar exited %d" % p.returncode)
        os.symlink(tarball, os.path.join(path, '.locator'))
    tld_extracts = [f for f in os.listdir(path) if f != '.locator']
    lockfile.close()
    if len(tld_extracts) == 1:
        return os.path.join(path, tld_extracts[0])
    return path
def create_chunk_tasks(f_name, chunk_input_pdh_names,
                       if_sequence, task_input_pdh, ref_input_pdh, chunk_input_pdh,
                       pool=None):
    async_results = []
    for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
        # Create task for each CRAM / chunk
        job_uuid = arvados.current_job()['uuid']
        current_task_uuid = arvados.current_task()['uuid']
        new_task_attrs = {
            'job_uuid': job_uuid,
            'created_by_job_task_uuid': current_task_uuid,
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input_pdh,
                'ref': ref_input_pdh,
                'chunk': chunk_input_pdh
            }
        }
        async_result = pool.apply_async(arv_create_task, (
            new_task_attrs,
            "Created new task to process %s with chunk interval %s (job_uuid %s)"
            % (f_name, chunk_input_name, job_uuid)))
        async_results.append(async_result)
    for async_result in async_results:
        async_result.wait()
        try:
            (res, report) = async_result.get()
            if (not res) or ('qsequence' not in res):
                raise InternalError("Could not create job task: %s" % res)
            else:
                print report + " qsequence %s" % res['qsequence']
        except Exception as e:
            raise InternalError("Exception creating job task: %s" % e)
def git_checkout(url, version, path):
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    if not os.path.exists(path):
        run_command(["git", "clone", url, path],
                    cwd=os.path.dirname(path))
    run_command(["git", "checkout", version],
                cwd=path)
    return path
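# Usage sketch (illustrative only; the repository URL, tag, and directory name
# are placeholders, not values from the original scripts): check out a pinned
# tool version into the job's temporary directory and use the returned path.
def example_checkout():
    tool_src = git_checkout("https://github.com/example/tool.git", "v1.0.0", "tool-src")
    print "Tool source checked out to %s" % tool_src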
def create_or_reuse_task(sequence, parameters, reusable_tasks, task_key_params, validate_task_output):
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': parameters
    }
    # See if there is a task in reusable_tasks that can be reused
    ct_index = tuple([parameters[index_param] for index_param in task_key_params])
    if len(reusable_tasks) == 0:
        print "No reusable tasks were available"
    elif ct_index in reusable_tasks:
        # have a task from which to reuse the output, prepare to create a new,
        # but already finished, task with that output
        reuse_task = reusable_tasks[ct_index]
        if validate_task_output(reuse_task['output']):
            print "Found existing JobTask %s from Job %s. Will use output %s from that JobTask instead of re-running it." % (
                reuse_task['uuid'], reuse_task['job_uuid'], reuse_task['output'])
            # remove task from reusable_tasks as it won't be used more than once
            del reusable_tasks[ct_index]
            # copy relevant attrs from reuse_task so that the new task starts already finished
            for attr in ['success', 'output', 'progress', 'started_at', 'finished_at', 'parameters']:
                new_task_attrs[attr] = reuse_task[attr]
            # crunch seems to ignore the fact that the task says it is done and queues it anyway,
            # so signal ourselves to just immediately exit successfully when we are run
            new_task_attrs['parameters']['reuse_job_task'] = reuse_task['uuid']
        else:
            print "Output %s for potential task reuse did not validate" % (reuse_task['output'])
    else:
        print "No reusable JobTask matched key parameters %s" % (list(ct_index))
    # Create the "new" task (may be new work or may be already finished work)
    new_task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if not new_task:
        raise errors.APIError("Attempt to create new job_task failed: [%s]" % new_task_attrs)
    return new_task
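# Usage sketch (an assumption for illustration, not code from the original
# scripts): given a dict of reusable tasks keyed by tuples of task_key_params
# values (e.g. as built by a helper such as get_reusable_tasks elsewhere in
# these scripts), either clone a finished task's output or queue fresh work
# for one chunk. The parameter names and the trivial validator are placeholders.
def example_queue_chunk(task_input_pdh, ref_pdh, chunk_pdh, reusable_tasks):
    params = {'input': task_input_pdh, 'ref': ref_pdh, 'chunk': chunk_pdh}
    return create_or_reuse_task(sequence=1,
                                parameters=params,
                                reusable_tasks=reusable_tasks,
                                task_key_params=['input', 'ref', 'chunk'],
                                validate_task_output=lambda locator: locator is not None)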
def one_task_per_bam_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each bam file in this job's input collection.

    Each new task will have an "input" parameter: a manifest containing one
    .bam file and (if available) the corresponding .bai index file.

    Files in the input collection that are not named *.bam or *.bai (as well
    as *.bai files that do not match any .bam file present) are silently
    ignored.

    if_sequence and and_end_task arguments have the same significance as in
    arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    bam = {}
    bai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.bam$', f.name()):
                bam[s.name(), f.name()] = f
            elif re.search(r'\.bai$', f.name()):
                bai[s.name(), f.name()] = f
    for ((s_name, f_name), bam_f) in bam.items():
        bai_f = bai.get((s_name, re.sub(r'bam$', 'bai', f_name)), None)
        task_input = bam_f.as_manifest()
        if bai_f:
            task_input += bai_f.as_manifest()
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input
            }
        }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
def install_path():
    global gatk2_install_path
    if gatk2_install_path:
        return gatk2_install_path
    gatk2_install_path = arvados.util.tarball_extract(
        tarball=arvados.current_job()['script_parameters']['gatk_tbz'],
        path='gatk2')
    return gatk2_install_path
def create_task(sequence, params):
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': params
    }
    task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    return task
def main():
    # Get object representing the current task
    this_task = arvados.current_task()

    sort_by_r = re.compile(sort_by_regex)

    ################################################################################
    # Concatenate VCFs in numerically sorted order of sort_by_regex
    ################################################################################
    vcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    output_prefix = arvados.current_job()['script_parameters']['output_prefix']
    out_file = output_prefix + ".vcf.gz"

    # Concatenate VCFs
    bcftools_concat_exit = bcftools.concat(
        sorted(vcf_files, key=lambda fn: int(re.search(sort_by_r, fn).group('sort_by'))),
        os.path.join(out_dir, out_file))

    if bcftools_concat_exit != 0:
        print "WARNING: bcftools concat exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_concat_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success': False}
                                         ).execute()
    else:
        print "bcftools concat exited successfully, indexing"

        bcftools_index_exit = bcftools.index(os.path.join(out_dir, out_file))

        if bcftools_index_exit != 0:
            print "WARNING: bcftools index exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_index_exit
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success': False}
                                             ).execute()
        else:
            print "bcftools index exited successfully, writing output to keep"

            # Write a new collection as output
            out = arvados.CollectionWriter()

            # Write out_dir to keep
            out.write_directory_tree(out_dir)

            # Commit the output to Keep.
            output_locator = out.finish()

            if validate_task_output(output_locator):
                print "Task output validated, setting output to %s" % (output_locator)

                # Use the resulting locator as the output for this task.
                this_task.set_output(output_locator)
            else:
                print "ERROR: Failed to validate task output (%s)" % (output_locator)
                arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                                 body={'success': False}
                                                 ).execute()
def setup():
    global rtg_install_path
    if rtg_install_path:
        return rtg_install_path
    rtg_path = arvados.util.zipball_extract(
        zipball=arvados.current_job()['script_parameters']['rtg_binary_zip'],
        path='rtg')
    rtg_license_path = arvados.util.collection_extract(
        collection=arvados.current_job()['script_parameters']['rtg_license'],
        path='license',
        decompress=False)

    # symlink to rtg-license.txt
    license_txt_path = os.path.join(rtg_license_path, 'rtg-license.txt')
    try:
        os.symlink(license_txt_path, os.path.join(rtg_path, 'rtg-license.txt'))
    except OSError:
        if not os.path.exists(os.path.join(rtg_path, 'rtg-license.txt')):
            os.symlink(license_txt_path, os.path.join(rtg_path, 'rtg-license.txt'))

    rtg_install_path = rtg_path
    return rtg_path
def spawn_new_task_per_bed_line(script_parameter, regex, if_sequence=0, and_end_task=True):
    """
    Generalized form of one_task_per_pair_input_file from
    https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py

    Creates a new task for each line of each BED file in the collection that
    matches the regex.
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters'][script_parameter]
    input_collection = Collection(job_input)
    for name in input_collection:
        if not re.search(regex, name):
            continue
        name_path = os.path.join(arvados.get_job_param_mount(script_parameter), name)
        bed_lines = (line.split() for line in open(name_path, 'r'))
        # Start the biggest regions first
        def cmp_desc_region_size(a, b):
            return ((int(b[2]) - int(b[1])) -
                    (int(a[2]) - int(a[1])))
        for bed_line in sorted(bed_lines, cmp=cmp_desc_region_size):
            print bed_line
            new_task_attrs = {
                'job_uuid': arvados.current_job()['uuid'],
                'created_by_job_task_uuid': arvados.current_task()['uuid'],
                'sequence': if_sequence + 1,
                'parameters': {
                    'chrom': bed_line[0],
                    'start': bed_line[1],
                    'end': bed_line[2]
                }
            }
            arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit()
def main():
    current_job = arvados.current_job()

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the interval_list collection to only those files relevant to gatk
    il_input_pdh = prepare_gatk_interval_list_collection(
        interval_list_coll=current_job['script_parameters']['interval_list_collection'])

    # Create an interval_list file for each chunk based on the .interval_list in the interval_list collection
    output_locator = create_interval_lists(genome_chunks, il_input_pdh)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
def install_path():
    global picard_install_path
    if picard_install_path:
        return picard_install_path
    zipball = arvados.current_job()['script_parameters']['picard_zip']
    extracted = arvados.util.zipball_extract(zipball=zipball, path='picard')
    for f in os.listdir(extracted):
        if (re.search(r'^picard-tools-[\d\.]+$', f) and
                os.path.exists(os.path.join(extracted, f, '.'))):
            picard_install_path = os.path.join(extracted, f)
            break
    if not picard_install_path:
        raise Exception("picard-tools-{version} directory not found in %s" % zipball)
    return picard_install_path
def main():
    current_job = arvados.current_job()

    skip_sq_sn_regex = '_decoy$'
    if 'skip_sq_sn_regex' in current_job['script_parameters']:
        skip_sq_sn_regex = current_job['script_parameters']['skip_sq_sn_regex']
    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the reference collection to only those files relevant to gatk
    ref_input_pdh = prepare_gatk_reference_collection(
        reference_coll=current_job['script_parameters']['reference_collection'])

    # Create an interval_list file for each chunk based on the .dict in the reference collection
    output_locator = create_interval_lists(genome_chunks, ref_input_pdh, skip_sq_sn_r)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
def one_task_per_classifier(num_classifiers_to_parameterize, if_sequence=0, and_end_task=True):
    if if_sequence != arvados.current_task()['sequence']:
        return
    api_client = arvados.api('v1')
    for i in range(num_classifiers_to_parameterize):
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'classifier_index': i,
                'time_to_wait': i * 560
            }
        }
        api_client.job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        api_client.job_tasks().update(uuid=arvados.current_task()['uuid'],
                                      body={'success': True}
                                      ).execute()
        exit(0)
def samtools_install_path():
    """
    Extract the samtools source tree, build the samtools binary, and
    return the path to the source tree.
    """
    global samtools_path
    if samtools_path:
        return samtools_path
    samtools_path = arvados.util.tarball_extract(
        tarball=arvados.current_job()['script_parameters']['samtools_tgz'],
        path='samtools')

    # build "samtools" binary
    lockfile = open(os.path.split(samtools_path)[0] + '.samtools-make.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    arvados.util.run_command(['make', '-j16'], cwd=samtools_path)
    lockfile.close()

    return samtools_path
def install_path():
    """
    Extract the bwa source tree, build the bwa binary, and return the
    path to the source tree.
    """
    global bwa_install_path
    if bwa_install_path:
        return bwa_install_path
    bwa_install_path = arvados.util.tarball_extract(
        tarball=arvados.current_job()['script_parameters']['bwa_tbz'],
        path='bwa')

    # build "bwa" binary
    lockfile = open(os.path.split(bwa_install_path)[0] + '.bwa-make.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    arvados.util.run_command(['make', '-j16'], cwd=bwa_install_path)
    lockfile.close()

    return bwa_install_path
def get_file_path(parameter, regex):
    """
    Return the mounted path of a file, matching regex, from the collection
    named by script parameter `parameter`.

    Basically to avoid:

        ref_collection_id = this_job['script_parameters']['reference_index']
        ref_collection = coll(ref_collection_id)
        for file in ref_collection:
            if not re.search('.*f(ast)?a(.gz)?$', file):
                continue
            ref_file = file
        ref_path = os.path.join(arvados.get_job_param_mount("reference_index"), ref_file)
    """
    collection_id = arvados.current_job()['script_parameters'][parameter]
    collection_handle = Collection(collection_id)
    for file in collection_handle:
        if not re.search(regex, file):
            continue
        out_file = file
    out_path = os.path.join(arvados.get_job_param_mount(parameter), out_file)
    return out_path
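# Usage sketch (illustrative; the parameter name and regex mirror the docstring
# example but are assumptions here): resolve the mounted path of a reference
# FASTA named by the "reference_index" script parameter.
def example_reference_path():
    return get_file_path('reference_index', r'.*f(ast)?a(\.gz)?$')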
def stream_extract(stream, path, files=[], decompress=True):
    """Retrieve a stream from Keep and extract it to a local
    directory.  Return the absolute path where the stream was
    extracted.

    stream -- StreamReader object
    path -- where to extract: absolute, or relative to job tmp
    """
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)

    files_got = []
    for f in stream.all_files():
        if (files == [] or
            ((f.name() not in files_got) and
             (f.name() in files or
              (decompress and f.decompressed_name() in files)))):
            outname = f.decompressed_name() if decompress else f.name()
            files_got += [outname]
            if os.path.exists(os.path.join(path, outname)):
                os.unlink(os.path.join(path, outname))
            mkdir_dash_p(os.path.dirname(os.path.join(path, outname)))
            outfile = open(os.path.join(path, outname), 'wb')
            for buf in (f.readall_decompressed() if decompress
                        else f.readall()):
                outfile.write(buf)
            outfile.close()
    if len(files_got) < len(files):
        raise arvados.errors.AssertionError(
            "Wanted files %s but only got %s from %s" %
            (files, files_got, [z.name() for z in stream.all_files()]))
    lockfile.close()
    return path
of.write("pwd: " + pwdinfo) lsinfo = sp.check_output(["ls", "-lahR"]) of.write("directory structure:\n" + lsinfo) dfinfo = sp.check_output(["df", "-h"]) of.write("df:\n" + dfinfo) meminfo = sp.check_output(["free", "-hm"]) of.write("mem:\n" + meminfo) hostinfo = sp.check_output(["hostname"]) of.write("host: " + hostinfo) job = arv.current_job() task = arv.current_task() of = arv.CollectionWriter() of.set_current_file_name("info.log") whoinfo = sp.check_output(["whoami"]) of.write("user: "******"\n") pwdinfo = sp.check_output(["pwd"]) of.write("pwd: " + pwdinfo + "\n") lsinfo = sp.check_output(["ls", "-lahR"]) of.write("directory structure:\n" + lsinfo) dfinfo = sp.check_output(["df", "-h"])
def chunked_tasks_per_cram_file(ref_input, job_input, interval_lists, validate_task_output,
                                if_sequence=0, and_end_task=True,
                                reuse_tasks=True, reuse_tasks_retrieve_all=True,
                                interval_list_param="interval_list",
                                oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
                                script=arvados.current_job()['script']):
    """
    Queue one task for each cram file in this job's input collection.

    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.

    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present) are
    silently ignored.

    if_sequence and and_end_task arguments have the same significance as in
    arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # prepare interval lists
    cr = arvados.CollectionReader(interval_lists)
    chunk_interval_list = {}
    chunk_input_pdh_names = []
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.interval_list$', f.name()):
                chunk_interval_list[s.name(), f.name()] = f
    for ((s_name, f_name), chunk_interval_list_f) in sorted(chunk_interval_list.items()):
        chunk_input = chunk_interval_list_f.as_manifest()
        try:
            r = arvados.api().collections().create(body={"manifest_text": chunk_input}).execute()
            chunk_input_pdh = r["portable_data_hash"]
            chunk_input_name = os.path.join(s_name, f_name)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        except:
            raise
    if len(chunk_input_pdh_names) == 0:
        raise errors.InvalidArgumentError(
            "No interval_list files found in %s" % (interval_lists))

    # prepare CRAM input collections
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        crai_f = crai.get((s_name, re.sub(r'cram$', 'crai', f_name)),
                          crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)),
                                   None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise errors.InvalidArgumentError(
                "No corresponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={"manifest_text": task_input}).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        if reuse_tasks:
            task_key_params = ['input', 'ref', 'chunk']
            # get candidates for task reuse
            job_filters = [
                ['script', '=', script],
                ['repository', '=', arvados.current_job()['repository']],
                ['script_version', 'in git', oldest_git_commit_to_reuse],
                ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
            ]
            if reuse_tasks_retrieve_all:
                # retrieve a full set of all possible reusable tasks
                reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params, job_filters)
                print "Have %s tasks for potential reuse" % (len(reusable_tasks))
            else:
                reusable_task_jobs = get_jobs_for_task_reuse(job_filters)
                print "Have %s jobs for potential task reuse" % (len(reusable_task_jobs))
                reusable_task_job_uuids = [job['uuid'] for job in reusable_task_jobs['items']]

        for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
            # Create task for each CRAM / chunk
            new_task_params = {
                'input': task_input_pdh,
                'ref': ref_input,
                'chunk': chunk_input_pdh
            }
            print "Creating new task to process %s with chunk interval %s " % (f_name, chunk_input_name)
            if reuse_tasks:
                if reuse_tasks_retrieve_all:
                    task = create_or_reuse_task(if_sequence + 1,
                                                new_task_params, reusable_tasks,
                                                task_key_params, validate_task_output)
                else:
                    task = create_or_reuse_task_from_jobs(if_sequence + 1,
                                                          new_task_params, reusable_task_job_uuids,
                                                          task_key_params, validate_task_output)
            else:
                task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
def one_task_per_interval(interval_count, validate_task_output,
                          if_sequence=0, and_end_task=True,
                          reuse_tasks=True,
                          interval_list_param="interval_list",
                          oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
                          task_key_params=['name', 'inputs', 'interval', 'ref'],
                          script=arvados.current_job()['script']):
    """
    Queue one task for each of interval_count intervals, splitting
    the genome chunk (described by the .interval_list file) evenly.

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance as in
    arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param=interval_list_param)

    interval_reader = open(interval_list_file, mode="r")

    lines = interval_reader.readlines()
    sn_intervals = dict()
    sns = []
    total_len = 0
    for line in lines:
        if line[0] == '@':
            # skip all lines starting with '@'
            continue
        fields = line.split("\t")
        if len(fields) != 5:
            raise errors.InvalidArgumentError(
                "interval_list %s has invalid line [%s] - expected 5 fields but got %s"
                % (interval_list_file, line, len(fields)))
        sn = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        length = int(end) - int(start) + 1
        total_len += int(length)
        sn_intervals[sn] = (start, end)
        sns.append(sn)

    print "Total chunk length is %s" % total_len
    interval_len = int(total_len / interval_count)
    intervals = []
    print "Splitting chunk into %s intervals of size ~%s" % (interval_count, interval_len)
    for interval_i in range(0, interval_count):
        interval_num = interval_i + 1
        intervals_count = 0
        remaining_len = interval_len
        interval = []
        while len(sns) > 0:
            sn = sns.pop(0)
            if sn not in sn_intervals:
                raise errors.ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_len:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_len + start - 1
                assert (end - start + 1) <= remaining_len
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            interval.append("%s:%s-%s" % (sn, start, end))
            remaining_len -= (end - start + 1)
            intervals_count += 1
            if remaining_len <= 0:
                break
        if intervals_count > 0:
            intervals.append(interval)
        else:
            print "WARNING: skipping empty interval %s" % interval_num

    print "Have %s intervals" % (len(intervals))

    if reuse_tasks:
        # get candidates for task reuse
        job_filters = [
            ['script', '=', script],
            ['repository', '=', arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
        ]
        reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params, job_filters)
        print "Have %s potentially reusable tasks" % (len(reusable_tasks))

    for interval in intervals:
        interval_str = ' '.join(interval)
        print "Creating task to process interval: [%s]" % interval_str
        new_task_params = arvados.current_task()['parameters']
        new_task_params['interval'] = interval_str
        if reuse_tasks:
            task = create_or_reuse_task(if_sequence + 1,
                                        new_task_params, reusable_tasks,
                                        task_key_params, validate_task_output)
        else:
            task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
def zipball_extract(zipball, path):
    """Retrieve a zip archive from Keep and extract it to a local
    directory.  Return the absolute path where the archive was
    extracted. If the top level of the archive contained just one
    file or directory, return the absolute path of that single item.

    zipball -- collection locator
    path -- where to extract the archive: absolute, or relative to job tmp
    """
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)
    already_have_it = False
    try:
        if os.readlink(os.path.join(path, '.locator')) == zipball:
            already_have_it = True
    except OSError:
        pass
    if not already_have_it:
        # emulate "rm -f" (i.e., if the file does not exist, we win)
        try:
            os.unlink(os.path.join(path, '.locator'))
        except OSError:
            if os.path.exists(os.path.join(path, '.locator')):
                os.unlink(os.path.join(path, '.locator'))
        for f in CollectionReader(zipball).all_files():
            if not re.search(r'\.zip$', f.name()):
                raise arvados.errors.NotImplementedError(
                    "zipball_extract cannot handle filename %s" % f.name())
            zip_filename = os.path.join(path, os.path.basename(f.name()))
            zip_file = open(zip_filename, 'wb')
            while True:
                buf = f.read(2**20)
                if len(buf) == 0:
                    break
                zip_file.write(buf)
            zip_file.close()
            p = subprocess.Popen(["unzip", "-q", "-o", "-d", path, zip_filename],
                                 stdout=None,
                                 stdin=None,
                                 stderr=sys.stderr,
                                 shell=False,
                                 close_fds=True)
            p.wait()
            if p.returncode != 0:
                lockfile.close()
                raise arvados.errors.CommandFailedError("unzip exited %d" % p.returncode)
            os.unlink(zip_filename)
        os.symlink(zipball, os.path.join(path, '.locator'))
    tld_extracts = [f for f in os.listdir(path) if f != '.locator']
    lockfile.close()
    if len(tld_extracts) == 1:
        return os.path.join(path, tld_extracts[0])
    return path
def one_task_per_cram_file(if_sequence=0, and_end_task=True,
                           skip_sq_sn_regex='_decoy$',
                           genome_chunks=200):
    """
    Queue one task for each cram file in this job's input collection.

    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.

    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present) are
    silently ignored.

    if_sequence and and_end_task arguments have the same significance as in
    arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # setup multiprocessing pool
    pool_processes = cpu_count() - 1
    print 'Using %d processes to submit tasks\n' % pool_processes
    pool = Pool(processes=pool_processes)

    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    # Ensure we have a .fa reference file with corresponding .fai index and .dict
    reference_coll = arvados.current_job()['script_parameters']['reference_collection']
    rcr = arvados.CollectionReader(reference_coll)
    ref_fasta = {}
    ref_fai = {}
    ref_dict = {}
    ref_input = None
    dict_reader = None
    for rs in rcr.all_streams():
        for rf in rs.all_files():
            if re.search(r'\.fa$', rf.name()):
                ref_fasta[rs.name(), rf.name()] = rf
            elif re.search(r'\.fai$', rf.name()):
                ref_fai[rs.name(), rf.name()] = rf
            elif re.search(r'\.dict$', rf.name()):
                ref_dict[rs.name(), rf.name()] = rf
    for ((s_name, f_name), fasta_f) in ref_fasta.items():
        fai_f = ref_fai.get((s_name, re.sub(r'fa$', 'fai', f_name)),
                            ref_fai.get((s_name, re.sub(r'fa$', 'fa.fai', f_name)),
                                        None))
        dict_f = ref_dict.get((s_name, re.sub(r'fa$', 'dict', f_name)),
                              ref_dict.get((s_name, re.sub(r'fa$', 'fa.dict', f_name)),
                                           None))
        if fasta_f and fai_f and dict_f:
            # found a set of all three!
            ref_input = fasta_f.as_manifest()
            ref_input += fai_f.as_manifest()
            ref_input += dict_f.as_manifest()
            dict_reader = dict_f
            break
    if ref_input is None:
        raise InvalidArgumentError(
            "Expected a reference fasta with fai and dict in reference_collection. Found [%s]"
            % ' '.join(rf.name() for rf in rs.all_files()))
    if dict_reader is None:
        raise InvalidArgumentError(
            "Could not find .dict file in reference_collection. Found [%s]"
            % ' '.join(rf.name() for rf in rs.all_files()))

    # Create a portable data hash for the ref_input manifest
    try:
        r = arvados.api().collections().create(body={"manifest_text": ref_input}).execute()
        ref_input_pdh = r["portable_data_hash"]
    except:
        raise

    # Load the dict data
    interval_header = ""
    dict_lines = dict_reader.readlines()
    dict_header = dict_lines.pop(0)
    if re.search(r'^@HD', dict_header) is None:
        raise InvalidArgumentError(
            "Dict file in reference collection does not have correct header: [%s]" % dict_header)
    interval_header += dict_header
    print "Dict header is %s" % dict_header
    sn_intervals = dict()
    sns = []
    skip_sns = []
    total_len = 0
    for sq in dict_lines:
        if re.search(r'^@SQ', sq) is None:
            raise InvalidArgumentError("Dict file contains malformed SQ line: [%s]" % sq)
        interval_header += sq
        sn = None
        ln = None
        for tagval in sq.split("\t"):
            tv = tagval.split(":", 1)
            if tv[0] == "SN":
                sn = tv[1]
            if tv[0] == "LN":
                ln = tv[1]
            if sn and ln:
                break
        if not (sn and ln):
            raise InvalidArgumentError(
                "Dict file SQ entry missing required SN and/or LN parameters: [%s]" % sq)
        assert sn and ln
        if sn in sn_intervals:
            raise InvalidArgumentError(
                "Dict file has duplicate SQ entry for SN %s: [%s]" % (sn, sq))
        if skip_sq_sn_r.search(sn):
            skip_sns.append(sn)
            continue
        sn_intervals[sn] = (1, int(ln))
        sns.append(sn)
        total_len += int(ln)
    total_sequences = len(sns)
    print "Skipped %s SQs with SNs matching regex [%s]" % (len(skip_sns), skip_sq_sn_regex)

    # Chunk the genome into genome_chunks pieces
    # weighted by both number of base pairs and number of seqs
    print "Total sequences included: %s" % (total_sequences)
    print "Total genome length: %s" % (total_len)
    total_points = total_len + (total_sequences * weight_seq)
    chunk_points = int(total_points / genome_chunks)
    chunk_input_pdh_names = []
    print "Chunking genome into %s chunks of ~%s points" % (genome_chunks, chunk_points)
    for chunk_i in range(0, genome_chunks):
        chunk_num = chunk_i + 1
        chunk_intervals_count = 0
        chunk_input_name = dict_reader.name() + (".%s_of_%s.region_list" % (chunk_num, genome_chunks))
        print "Creating interval file for chunk %s" % chunk_num
        chunk_c = arvados.collection.CollectionWriter(num_retries=3)
        chunk_c.start_new_file(newfilename=chunk_input_name)
        # chunk_c.write(interval_header)
        remaining_points = chunk_points
        while len(sns) > 0:
            sn = sns.pop(0)
            remaining_points -= weight_seq
            if remaining_points <= 0:
                sns.insert(0, sn)
                break
            if sn not in sn_intervals:
                raise ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_points:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_points + start - 1
                assert (end - start + 1) <= remaining_points
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            #interval = "%s\t%s\t%s\t+\t%s\n" % (sn, start, end, "interval_%s_of_%s_%s" % (chunk_num, genome_chunks, sn))
            interval = "%s\t%s\t%s\n" % (sn, start, end)
            remaining_points -= (end - start + 1)
            chunk_c.write(interval)
            chunk_intervals_count += 1
            if remaining_points <= 0:
                break
        if chunk_intervals_count > 0:
            chunk_input_pdh = chunk_c.finish()
            print "Chunk intervals file %s saved as %s" % (chunk_input_name, chunk_input_pdh)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        else:
            print "WARNING: skipping empty intervals for %s" % chunk_input_name
    print "Have %s chunk collections: [%s]" % (
        len(chunk_input_pdh_names), ' '.join([x[0] for x in chunk_input_pdh_names]))

    # prepare CRAM input collections
    job_input = arvados.current_job()['script_parameters']['inputs_collection']
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        # Handle this CRAM file
        crai_f = crai.get((s_name, re.sub(r'cram$', 'crai', f_name)),
                          crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)),
                                   None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise InvalidArgumentError(
                "No corresponding CRAI file found for CRAM file %s" % f_name)
        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={"manifest_text": task_input}).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise
        create_chunk_tasks(f_name, chunk_input_pdh_names,
                           if_sequence, task_input_pdh, ref_input_pdh, chunk_input_pdh,
                           pool=pool)

    print "Waiting for asynchronous requests to complete"
    pool.close()
    pool.join()

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()["script_parameters"]["reference_collection"])
    job_input_pdh = arvados.current_job()["script_parameters"]["inputs_collection"]
    interval_lists_pdh = arvados.current_job()["script_parameters"]["interval_lists_collection"]
    interval_count = 1
    if "interval_count" in arvados.current_job()["script_parameters"]:
        interval_count = arvados.current_job()["script_parameters"]["interval_count"]

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.chunked_tasks_per_cram_file(
        ref_input_pdh, job_input_pdh, interval_lists_pdh, validate_task_output,
        if_sequence=0, and_end_task=True,
        reuse_tasks=False,
        oldest_git_commit_to_reuse="6ca726fc265f9e55765bf1fdf71b86285b8a0ff2",
        script="gatk-haplotypecaller-cram.py",
    )

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert this_task["sequence"] != 0

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if "reuse_job_task" in this_task["parameters"]:
        print "This task's work was already done by JobTask %s" % this_task["parameters"]["reuse_job_task"]
        exit(0)

    ################################################################################
    # Phase IIb: Call Haplotypes!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="chunk")
    cram_file = gatk_helper.mount_gatk_cram_input(input_param="input")
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    out_dir = hgi_arvados.prepare_out_dir()
    out_filename = os.path.basename(cram_file_base) + "." + os.path.basename(interval_list_file) + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within
    # it or we will get BCF output
    out_filename = out_filename.replace(".bcf", "._cf")

    # HaplotypeCaller!
    gatk_exit = gatk.haplotype_caller(ref_file, cram_file, interval_list_file,
                                      os.path.join(out_dir, out_filename))

    if gatk_exit != 0:
        print "ERROR: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"],
                                         body={"success": False}).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        print "Task output written to keep, validating it"
        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"],
                                             body={"success": False}).execute()
#!/usr/bin/python
#

import arvados
import re

arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)

this_job = arvados.current_job()
this_task = arvados.current_task()
this_task_input = this_task['parameters']['input']

input_file = list(arvados.CollectionReader(this_task_input).all_files())[0]

out = arvados.CollectionWriter()
out.set_current_file_name(input_file.decompressed_name())
out.set_current_stream_name(input_file.stream_name())
for line in input_file.readlines():
    out.write("!!!" + line.upper())

this_task.set_output(out.finish())
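# The example above runs as a crunch script inside an Arvados job. As a rough
# illustration (not part of the original example), such a job might be
# submitted through the legacy jobs API roughly as sketched below; the script
# name, repository, and collection locator are hypothetical placeholders.
import arvados

job_spec = {
    "script": "upper-case-input.py",          # hypothetical crunch script name
    "repository": "example/crunch-scripts",   # hypothetical repository
    "script_version": "master",
    "script_parameters": {
        # placeholder portable data hash of the input collection
        "input": "c1bad4b39ca5a924e481008009d94e32+210",
    },
}
job = arvados.api("v1").jobs().create(body=job_spec).execute()
print "Submitted job %s" % job["uuid"]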
of.write("pwd: " + pwdinfo) lsinfo = sp.check_output(["ls", "-lahR"]) of.write("directory structure:\n" + lsinfo) dfinfo = sp.check_output(["df", "-h"]) of.write("df:\n" + dfinfo) meminfo = sp.check_output(["free", "-hm"]) of.write("mem:\n" + meminfo) hostinfo = sp.check_output(["hostname"]) of.write("host: " + hostinfo) job = arv.current_job() task = arv.current_task() of = arv.CollectionWriter() of.set_current_file_name("info.log") whoinfo = sp.check_output(["whoami"]) of.write("user: "******"\n" ) pwdinfo = sp.check_output(["pwd"]) of.write("pwd: " + pwdinfo + "\n" ) lsinfo = sp.check_output(["ls", "-lahR"]) of.write("directory structure:\n" + lsinfo) dfinfo = sp.check_output(["df", "-h"])
def main():
    signal(SIGINT, sigint_handler)
    signal(SIGTERM, sigterm_handler)

    this_job = arvados.current_job()

    skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex']

    genome_chunks = int(this_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True,
                           skip_sq_sn_regex=skip_sq_sn_regex,
                           genome_chunks=genome_chunks)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert(this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError("No reference fasta found in reference collection.")

    # Ensure we can read the reference fasta
    test_and_prime_input_file(ref_file, error_exception=FileAccessError("reference fasta not readable: %s" % ref_file))

    # Ensure we have corresponding .fai, and that it is also readable
    ref_fai_file = ref_file + ".fai"
    test_and_prime_input_file(ref_fai_file, error_exception=FileAccessError("reference fai index not readable: %s" % ref_fai_file))

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')

    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError("No chunk intervals file found in chunk collection.")
    # Ensure we can read the chunk file
    test_and_prime_input_file(chunk_file, error_exception=FileAccessError("Chunk intervals file not readable: %s" % chunk_file))

    # Get single CRAM file for this task
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    stream_name = ""
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    test_and_prime_input_file(cram_file, error_exception=FileAccessError("CRAM file not readable: %s" % cram_file))

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert(cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not test_and_prime_input_file(crai_file, error_exception=None):
        crai_file = cram_file_base + ".cram.crai"
        if not test_and_prime_input_file(crai_file, error_exception=None):
            raise FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    tmp_dir = arvados.current_task().tmpdir
    out_dir = os.path.join(tmp_dir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise

    output_basename = os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file)
    out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp")
    penultimate_out_file = os.path.join(tmp_dir, output_basename + ".provheader.g.vcf.gz")
    final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz")

    # bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)

    regions = []
    print "Preparing region list from chunk file [%s]" % chunk_file
    with open(chunk_file, 'r') as f:
        for line in f.readlines():
            (chr, start, end) = line.rstrip().split()
            region = "%s:%s-%s" % (chr, start, end)
            regions.append(region)
    total_region_count = len(regions)

    print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count

    concat_noheader_fifos = dict()
    concat_headeronly_tmps = dict()
    current_region_num = 0
    for region in regions:
        current_region_num += 1
        concat_noheader_fifo = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.g.vcf" % (current_region_num, total_region_count)))
        try:
            os.mkfifo(concat_noheader_fifo, 0600)
        except:
            print "ERROR: could not mkfifo %s" % concat_noheader_fifo
            raise
        fifos_to_delete.append(concat_noheader_fifo)
        concat_noheader_fifos[region] = concat_noheader_fifo
        concat_headeronly_tmp = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" % (current_region_num, total_region_count)))
        concat_headeronly_tmps[region] = concat_headeronly_tmp

    region_concat_cmd = ["cat"]
    region_concat_cmd.extend([concat_noheader_fifos[region] for region in regions])

    # open file for output file
    out_file_tmp_f = open(out_file_tmp, 'wb')

    region_concat_p = run_child_cmd(region_concat_cmd,
                                    stdout=out_file_tmp_f,
                                    tag="bcftools concat (stderr)")

    current_region_num = 0
    current_concat_noheader_fifo_f = None
    regions_to_process = list(regions)
    bcftools_mpileup_p = None
    bcftools_norm_p = None
    part_tee_p = None
    bcftools_view_headeronly_p = None
    bcftools_view_noheader_p = None
    while True:
        # at least one of the regional aggregation processes is still running
        watch_fds_and_print_output()
        if (
                (bcftools_mpileup_p is None)
                and (bcftools_norm_p is None)
                and (part_tee_p is None)
                and (bcftools_view_headeronly_p is None)
                and (bcftools_view_noheader_p is None)
        ):
            # no per-region processes are running (they have finished or
            # have not yet started)
            if len(regions_to_process) > 0:
                # have more regions to run
                region = regions_to_process.pop(0)
                current_region_num += 1
                region_label = "%s/%s [%s]" % (current_region_num, total_region_count, region)
                concat_noheader_fifo = concat_noheader_fifos[region]
                bcftools_view_noheader_input_fifo = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.noheader.g.bcf" % (current_region_num, total_region_count)))
                part_tee_cmd = ["teepot", bcftools_view_noheader_input_fifo, "-"]
                bcftools_view_noheader_cmd = ["bcftools", "view", "-H", "-Ov", bcftools_view_noheader_input_fifo]
                concat_headeronly_tmp = concat_headeronly_tmps[region]
                bcftools_view_headeronly_cmd = ["bcftools", "view", "-h", "-Oz", "-o", concat_headeronly_tmp]
                bcftools_norm_cmd = ["bcftools", "norm", "-f", ref_file, "-Ou"]
                bcftools_mpileup_cmd = ["bcftools-gvcf", "mpileup",
                                        "-t", "AD,INFO/AD",
                                        "-C50", "-pm2", "-F0.1", "-d10000",
                                        "--gvcf", "1,2,3,4,5,10,15",
                                        "-f", ref_file, "-Ou",
                                        "-r", region,
                                        cram_file]

                print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % (region_label)
                bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe()

                print "Creating 'bcftools norm | tee' pipe for region %s" % (region_label)
                part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe()

                print "Creating 'tee | bcftools view -h' pipe for region %s" % (region_label)
                bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe()

                print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % (bcftools_view_noheader_input_fifo, region_label)
                try:
                    os.mkfifo(bcftools_view_noheader_input_fifo, 0600)
                except:
                    print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo
                    raise
                fifos_to_delete.append(bcftools_view_noheader_input_fifo)

                print "Opening concat fifo %s for writing" % concat_noheader_fifo
                if current_concat_noheader_fifo_f is not None:
                    #print "ERROR: current_concat_noheader_fifo_f was not closed properly"
                    #raise Exception("current_concat_noheader_fifo_f was not closed properly")
                    current_concat_noheader_fifo_f.close()
                current_concat_noheader_fifo_f = open(concat_noheader_fifo, 'wb')

                bcftools_mpileup_p = run_child_cmd(bcftools_mpileup_cmd,
                                                   stdout=bcftools_norm_stdin_pipe_write,
                                                   tag="bcftools mpileup %s" % (region_label))

                bcftools_norm_p = run_child_cmd(bcftools_norm_cmd,
                                                stdin=bcftools_norm_stdin_pipe_read,
                                                stdout=part_tee_stdin_pipe_write,
                                                tag="bcftools norm %s" % (region_label))

                part_tee_p = run_child_cmd(part_tee_cmd,
                                           stdin=part_tee_stdin_pipe_read,
                                           stdout=bcftools_view_headeronly_stdin_pipe_write,
                                           tag="tee %s" % (region_label))

                bcftools_view_headeronly_p = run_child_cmd(bcftools_view_headeronly_cmd,
                                                           stdin=bcftools_view_headeronly_stdin_pipe_read,
                                                           tag="bcftools view -h %s" % (region_label))

                bcftools_view_noheader_p = run_child_cmd(bcftools_view_noheader_cmd,
                                                         stdout=current_concat_noheader_fifo_f,
                                                         tag="bcftools view %s" % (region_label))

        bcftools_mpileup_p = close_process_if_finished(bcftools_mpileup_p,
                                                       "bcftools mpileup %s" % (region_label),
                                                       close_fds=[bcftools_norm_stdin_pipe_write])

        bcftools_norm_p = close_process_if_finished(bcftools_norm_p,
                                                    "bcftools norm %s" % (region_label),
                                                    close_fds=[bcftools_norm_stdin_pipe_read,
                                                               part_tee_stdin_pipe_write])

        part_tee_p = close_process_if_finished(part_tee_p,
                                               "tee %s" % (region_label),
                                               close_fds=[part_tee_stdin_pipe_read,
                                                          bcftools_view_headeronly_stdin_pipe_write],
                                               ignore_error=True)

        bcftools_view_headeronly_p = close_process_if_finished(bcftools_view_headeronly_p,
                                                               "bcftools view -h %s" % (region_label),
                                                               close_fds=[bcftools_view_headeronly_stdin_pipe_read])

        bcftools_view_noheader_p = close_process_if_finished(bcftools_view_noheader_p,
                                                             "bcftools view %s" % (region_label),
                                                             close_files=[current_concat_noheader_fifo_f])

        region_concat_p = close_process_if_finished(region_concat_p,
                                                    "bcftools concat",
                                                    close_files=[out_file_tmp_f])

        # end loop once all processes have finished
        if (
                (region_concat_p is None)
                and (bcftools_view_noheader_p is None)
                and (bcftools_view_headeronly_p is None)
                and (part_tee_p is None)
                and (bcftools_norm_p is None)
                and (bcftools_mpileup_p is None)
        ):
            print "All region work has completed"
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if len(child_pids) > 0:
        print "WARNING: some children are still alive: [%s]" % (child_pids)
        for pid in child_pids:
            print "Attempting to terminate %s forcefully" % (pid)
            try:
                os.kill(pid, SIGTERM)
            except Exception as e:
                print "Could not kill pid %s: %s" % (pid, e)

    for fifo in fifos_to_delete:
        try:
            os.remove(fifo)
        except:
            raise

    concat_headeronly_tmp_fofn = os.path.join(tmp_dir, output_basename + ".fifos_fofn")
    tmp_files_to_delete = []
    print "Preparing fofn for bcftools concat (headeronly): %s" % (concat_headeronly_tmp_fofn)
    with open(concat_headeronly_tmp_fofn, 'w') as f:
        print "Checking files for regions: %s" % regions
        for concat_headeronly_tmp in [concat_headeronly_tmps[region] for region in regions]:
            if os.path.exists(concat_headeronly_tmp):
                print "Adding %s to fofn" % concat_headeronly_tmp
                f.write("%s\n" % concat_headeronly_tmp)
                tmp_files_to_delete.append(concat_headeronly_tmp)
            else:
                print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp

    final_headeronly_tmp = os.path.join(tmp_dir, output_basename + ".headeronly.g.vcf")
    final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb')

    print "Creating 'bcftools concat | grep' pipe"
    grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe()

    grep_headeronly_cmd = ["egrep", "-v", "^[#][#](bcftools|mpileup|reference)"]
    grep_headeronly_p = run_child_cmd(grep_headeronly_cmd,
                                      stdin=grep_headeronly_stdin_pipe_read,
                                      stdout=final_headeronly_tmp_f,
                                      tag="grep (headeronly)")

    bcftools_concat_headeronly_cmd = ["bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn]
    bcftools_concat_headeronly_p = run_child_cmd(bcftools_concat_headeronly_cmd,
                                                 stdout=grep_headeronly_stdin_pipe_write,
                                                 tag="bcftools concat (headeronly)")
    while True:
        watch_fds_and_print_output()
        bcftools_concat_headeronly_p = close_process_if_finished(bcftools_concat_headeronly_p,
                                                                 "bcftools concat (headeronly)",
                                                                 close_fds=[grep_headeronly_stdin_pipe_write])
        grep_headeronly_p = close_process_if_finished(grep_headeronly_p,
                                                      "grep (headeronly)",
                                                      close_fds=[grep_headeronly_stdin_pipe_read],
                                                      close_files=[final_headeronly_tmp_f])
        if ((bcftools_concat_headeronly_p is None)
                and (grep_headeronly_p is None)):
            # none of the processes are still running, we're done!
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_concat_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate bcftools concat (headeronly)"

    if grep_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate grep (headeronly)"

    # check if there was any data output
    if os.stat(out_file_tmp)[6] == 0:
        # 0-byte data file, there is no point in concatenating and
        # reheader will reject the file, so we need to bgzip it ourselves
        print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % (final_out_file)
        final_out_file_f = open(final_out_file, 'wb')
        final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp]
        final_bgzip_p = run_child_cmd(final_bgzip_cmd, tag="final bgzip", stdout=final_out_file_f)
        while True:
            watch_fds_and_print_output()
            final_bgzip_p = close_process_if_finished(final_bgzip_p,
                                                      "final bgzip",
                                                      close_files=[final_out_file_f])
            if (final_bgzip_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bgzip_p is not None:
            print "ERROR: failed to cleanly terminate final bgzip (header with no data)"
    else:
        # there is some data in the data file
        print "Creating final 'cat | bcftools view -Oz' pipe"
        final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe()

        print "Preparing penultimate output file [%s]" % (penultimate_out_file)
        final_bcftools_view_cmd = ["bcftools", "view", "-Oz", "-o", penultimate_out_file]
        final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp]
        final_bcftools_view_p = run_child_cmd(final_bcftools_view_cmd,
                                              tag="final bcftools view -Oz",
                                              stdin=final_bcftools_view_stdin_pipe_read)
        final_concat_p = run_child_cmd(final_concat_cmd,
                                       tag="final cat (header+data)",
                                       stdout=final_bcftools_view_stdin_pipe_write)
        while True:
            watch_fds_and_print_output()
            final_bcftools_view_p = close_process_if_finished(final_bcftools_view_p,
                                                              "final bcftools view -Oz",
                                                              close_fds=[final_bcftools_view_stdin_pipe_read])
            final_concat_p = close_process_if_finished(final_concat_p,
                                                       "final cat (header+data)",
                                                       close_fds=[final_bcftools_view_stdin_pipe_write])
            if ((final_concat_p is None)
                    and (final_bcftools_view_p is None)):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration

        if final_bcftools_view_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools view -Oz"

        if final_concat_p is not None:
            print "ERROR: failed to cleanly terminate final cat (header+data)"

        print "Reheadering penultimate output file into final out file [%s]" % (final_out_file)
        final_bcftools_reheader_cmd = ["bcftools", "reheader", "-h", final_headeronly_tmp, "-o", final_out_file, penultimate_out_file]
        final_bcftools_reheader_p = run_child_cmd(final_bcftools_reheader_cmd,
                                                  tag="final bcftools reheader")
        while True:
            watch_fds_and_print_output()
            final_bcftools_reheader_p = close_process_if_finished(final_bcftools_reheader_p,
                                                                  "final bcftools reheader")
            if (final_bcftools_reheader_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bcftools_reheader_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools reheader"
        os.remove(penultimate_out_file)

    print "Indexing final output file [%s]" % (final_out_file)
    bcftools_index_cmd = ["bcftools", "index", final_out_file]
    bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index")
    while True:
        watch_fds_and_print_output()
        bcftools_index_p = close_process_if_finished(bcftools_index_p, "bcftools index")
        if (bcftools_index_p is None):
            break
        else:
            sleep(0.01)
            # continue to next loop iteration
    if bcftools_index_p is not None:
        print "ERROR: failed to cleanly terminate bcftools index"

    print "Complete, removing temporary files"
    os.remove(concat_headeronly_tmp_fofn)
    os.remove(out_file_tmp)
    os.remove(final_headeronly_tmp)
    for tmp_file in tmp_files_to_delete:
        os.remove(tmp_file)

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()
    print "Task output locator [%s]" % output_locator

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)

    # Done!
    print "Task complete!"
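# Helper functions such as run_child_cmd, close_process_if_finished and
# watch_fds_and_print_output are referenced above but not shown in this
# example. A minimal sketch of what the first two might look like is given
# below, assuming subprocess.Popen semantics and a module-level child_pids
# list; the helpers used by the real pipeline may differ.
import os
import subprocess

child_pids = []

def run_child_cmd(cmd, stdin=None, stdout=None, tag="child command"):
    # Start a child process, record its pid, and tag it for log messages.
    print "Starting %s: %s" % (tag, ' '.join(cmd))
    p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout,
                         stderr=subprocess.PIPE)
    child_pids.append(p.pid)
    return p

def close_process_if_finished(p, tag="", close_fds=[], close_files=[],
                              ignore_error=False):
    # If the process has exited, reap it, close the fds/files plumbed to it,
    # and return None; otherwise return the process unchanged.
    if p is None or p.poll() is None:
        return p
    exitval = p.wait()
    if exitval != 0 and not ignore_error:
        print "WARNING: %s exited with code %s" % (tag, exitval)
    if p.pid in child_pids:
        child_pids.remove(p.pid)
    for fd in close_fds:
        os.close(fd)
    for f in close_files:
        f.close()
    return None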
def collection_extract(collection, path, files=[], decompress=True):
    """Retrieve a collection from Keep and extract it to a local
    directory.  Return the absolute path where the collection was
    extracted.

    collection -- collection locator
    path -- where to extract: absolute, or relative to job tmp
    """
    matches = re.search(r'^([0-9a-f]+)(\+[\w@]+)*$', collection)
    if matches:
        collection_hash = matches.group(1)
    else:
        collection_hash = hashlib.md5(collection).hexdigest()
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)
    already_have_it = False
    try:
        if os.readlink(os.path.join(path, '.locator')) == collection_hash:
            already_have_it = True
    except OSError:
        pass

    # emulate "rm -f" (i.e., if the file does not exist, we win)
    try:
        os.unlink(os.path.join(path, '.locator'))
    except OSError:
        if os.path.exists(os.path.join(path, '.locator')):
            os.unlink(os.path.join(path, '.locator'))

    files_got = []
    for s in CollectionReader(collection).all_streams():
        stream_name = s.name()
        for f in s.all_files():
            if (files == [] or
                ((f.name() not in files_got) and
                 (f.name() in files or
                  (decompress and f.decompressed_name() in files)))):
                outname = f.decompressed_name() if decompress else f.name()
                files_got += [outname]
                if os.path.exists(os.path.join(path, stream_name, outname)):
                    continue
                mkdir_dash_p(os.path.dirname(os.path.join(path, stream_name, outname)))
                outfile = open(os.path.join(path, stream_name, outname), 'wb')
                for buf in (f.readall_decompressed() if decompress
                            else f.readall()):
                    outfile.write(buf)
                outfile.close()
    if len(files_got) < len(files):
        raise arvados.errors.AssertionError(
            "Wanted files %s but only got %s from %s" %
            (files, files_got,
             [z.name() for z in CollectionReader(collection).all_files()]))
    os.symlink(collection_hash, os.path.join(path, '.locator'))

    lockfile.close()
    return path
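# Example use of collection_extract() defined above: fetch two reference files
# from a collection into a directory under the job's tmpdir. The locator and
# file names here are placeholders for illustration only.
ref_dir = collection_extract(
    collection="c1bad4b39ca5a924e481008009d94e32+210",   # placeholder locator
    path="ref",                                          # relative to job tmpdir
    files=["human_g1k_v37.fa", "human_g1k_v37.fa.fai"],  # hypothetical file names
    decompress=True)
print "Reference extracted to %s" % ref_dir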
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh, job_input_pdh, interval_lists_pdh,
                                                   group_by_regex, max_gvcfs_to_combine,
                                                   if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert(this_task['sequence'] > 0)

    ################################################################################
    # Phase II: Read interval_list and split into additional intervals
    ################################################################################
    hgi_arvados.one_task_per_interval(interval_count, validate_task_output,
                                      reuse_tasks=True,
                                      oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1",
                                      if_sequence=1, and_end_task=True)

    # We will never reach this point if we are in the 1st task sequence
    assert(this_task['sequence'] > 1)

    ################################################################################
    # Phase IIIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIIb: Combine gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    interval_str = this_task['parameters'].get('interval')
    if not interval_str:
        interval_str = ""
    interval_strs = interval_str.split()
    intervals = []
    for interval in interval_strs:
        intervals.extend(["--intervals", interval])
    out_file = name + ".vcf.gz"
    if interval_count > 1:
        out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz"
        if len(out_file) > 255:
            out_file = name + "." + '_'.join([interval_strs[0], interval_strs[-1]]) + ".vcf.gz"
            print "Output file name was too long with full interval list, shortened it to: %s" % out_file
        if len(out_file) > 255:
            raise errors.InvalidArgumentError("Output file name is too long, cannot continue: %s" % out_file)

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # CombineGVCFs!
    extra_args = intervals
    extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"])
    gatk_exit = gatk.combine_gvcfs(ref_file, gvcf_files, os.path.join(out_dir, out_file), extra_gatk_args=extra_args)

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success': False}
                                         ).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success': False}
                                             ).execute()
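# validate_task_output() is referenced by several of these examples but not
# defined here. A minimal sketch, assuming "valid" simply means the output
# collection contains at least one non-empty .vcf.gz file; the real validation
# used by the pipeline may be stricter.
import re
import arvados

def validate_task_output(output_locator):
    # Walk the output collection and look for a non-empty .vcf.gz file.
    cr = arvados.CollectionReader(output_locator)
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.vcf\.gz$', f.name()) and f.size() > 0:
                return True
    print "ERROR: no non-empty .vcf.gz file found in %s" % output_locator
    return False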
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import arvados
import subprocess
import crunchutil.subst as subst
import shutil
import os
import sys
import time

if len(arvados.current_task()['parameters']) > 0:
    p = arvados.current_task()['parameters']
else:
    p = arvados.current_job()['script_parameters']

t = arvados.current_task().tmpdir

os.unlink("/usr/local/share/bcbio-nextgen/galaxy")
os.mkdir("/usr/local/share/bcbio-nextgen/galaxy")
shutil.copy("/usr/local/share/bcbio-nextgen/config/bcbio_system.yaml", "/usr/local/share/bcbio-nextgen/galaxy")

with open("/usr/local/share/bcbio-nextgen/galaxy/tool_data_table_conf.xml", "w") as f:
    f.write('''<tables>
    <!-- Locations of indexes in the BWA mapper format -->
    <table name="bwa_indexes" comment_char="#">
        <columns>value, dbkey, name, path</columns>
        <file path="tool-data/bwa_index.loc" />
def run_method(self):
    arvados.current_job()
def one_task_per_interval(interval_count, validate_task_output,
                          if_sequence=0, and_end_task=True,
                          reuse_tasks=True,
                          interval_list_param="interval_list",
                          oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
                          task_key_params=['name', 'inputs', 'interval', 'ref'],
                          script=arvados.current_job()['script']):
    """
    Queue one task for each of interval_count intervals, splitting
    the genome chunk (described by the .interval_list file) evenly.

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param=interval_list_param)

    interval_reader = open(interval_list_file, mode="r")

    lines = interval_reader.readlines()
    sn_intervals = dict()
    sns = []
    total_len = 0
    for line in lines:
        if line[0] == '@':
            # skip all lines starting with '@'
            continue
        fields = line.split("\t")
        if len(fields) != 5:
            raise errors.InvalidArgumentError("interval_list %s has invalid line [%s] - expected 5 fields but got %s" % (interval_list_file, line, len(fields)))
        sn = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        length = int(end) - int(start) + 1
        total_len += int(length)
        sn_intervals[sn] = (start, end)
        sns.append(sn)

    print "Total chunk length is %s" % total_len
    interval_len = int(total_len / interval_count)
    intervals = []
    print "Splitting chunk into %s intervals of size ~%s" % (interval_count, interval_len)
    for interval_i in range(0, interval_count):
        interval_num = interval_i + 1
        intervals_count = 0
        remaining_len = interval_len
        interval = []
        while len(sns) > 0:
            sn = sns.pop(0)
            if not sn_intervals.has_key(sn):
                raise errors.ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_len:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_len + start - 1
                assert((end - start + 1) <= remaining_len)
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            interval.append("%s:%s-%s" % (sn, start, end))
            remaining_len -= (end - start + 1)
            intervals_count += 1
            if remaining_len <= 0:
                break
        if intervals_count > 0:
            intervals.append(interval)
        else:
            print "WARNING: skipping empty intervals for %s" % interval_input_name
    print "Have %s intervals" % (len(intervals))

    if reuse_tasks:
        # get candidates for task reuse
        job_filters = [
            ['script', '=', script],
            ['repository', '=', arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
        ]
        reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params, job_filters)
        print "Have %s potentially reusable tasks" % (len(reusable_tasks))

    for interval in intervals:
        interval_str = ' '.join(interval)
        print "Creating task to process interval: [%s]" % interval_str
        new_task_params = arvados.current_task()['parameters']
        new_task_params['interval'] = interval_str
        if reuse_tasks:
            task = create_or_reuse_task(if_sequence + 1, new_task_params, reusable_tasks, task_key_params, validate_task_output)
        else:
            task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
def run():
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    runner = None
    try:
        job_order_object = arvados.current_job()['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], job_order_object.pop("cwl:tool"))

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            if "location" in v:
                v["location"] = keeppath(v["location"])

        for k, v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)

        output_name = None
        output_tags = None
        enable_reuse = True
        on_error = "continue"
        debug = False

        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        if "arv:output_tags" in job_order_object:
            output_tags = job_order_object["arv:output_tags"]
            del job_order_object["arv:output_tags"]

        if "arv:enable_reuse" in job_order_object:
            enable_reuse = job_order_object["arv:enable_reuse"]
            del job_order_object["arv:enable_reuse"]

        if "arv:on_error" in job_order_object:
            on_error = job_order_object["arv:on_error"]
            del job_order_object["arv:on_error"]

        if "arv:debug" in job_order_object:
            debug = job_order_object["arv:debug"]
            del job_order_object["arv:debug"]

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.safeapi.ThreadSafeApiCache(
            api_params={"model": OrderedJsonModel()}, keep_params={"num_retries": 4}),
            output_name=output_name, output_tags=output_tags)

        make_fs_access = functools.partial(CollectionFsAccess,
                                           collection_cache=runner.collection_cache)

        t = load_tool(toolpath, runner.arv_make_tool,
                      fetcher_constructor=functools.partial(CollectionFetcher,
                                                            api_client=runner.api,
                                                            fs_access=make_fs_access(""),
                                                            num_retries=runner.num_retries))

        if debug:
            logger.setLevel(logging.DEBUG)
            logging.getLogger('arvados').setLevel(logging.DEBUG)
            logging.getLogger("cwltool").setLevel(logging.DEBUG)

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = debug
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job = {"uuid": arvados.current_job()["uuid"],
                               "state": arvados.current_job()["state"]}
        args.make_fs_access = make_fs_access
        args.trash_intermediate = False
        args.intermediate_output_ttl = 0
        args.priority = arvados_cwl.DEFAULT_PRIORITY
        args.do_validate = True
        args.disable_js_validation = False

        runner.arv_executor(t, job_order_object, **vars(args))
    except Exception as e:
        if isinstance(e, WorkflowException):
            logging.info("Workflow error %s", e)
        else:
            logging.exception("Unhandled exception")
        if runner and runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': outputCollection,
                                   'success': False,
                                   'progress': 1.0
                               }).execute()
def one_task_per_cram_file(if_sequence=0, and_end_task=True,
                           skip_sq_sn_regex='_decoy$',
                           genome_chunks=200):
    """
    Queue one task for each cram file in this job's input collection.
    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.
    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.
    if_sequence and and_end_task arguments have the same significance as
    in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # setup multiprocessing pool
    pool_processes = cpu_count() - 1
    print 'Using %d processes to submit tasks\n' % pool_processes
    pool = Pool(processes=pool_processes)

    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    # Ensure we have a .fa reference file with corresponding .fai index and .dict
    reference_coll = arvados.current_job()['script_parameters']['reference_collection']
    rcr = arvados.CollectionReader(reference_coll)
    ref_fasta = {}
    ref_fai = {}
    ref_dict = {}
    ref_input = None
    dict_reader = None
    for rs in rcr.all_streams():
        for rf in rs.all_files():
            if re.search(r'\.fa$', rf.name()):
                ref_fasta[rs.name(), rf.name()] = rf
            elif re.search(r'\.fai$', rf.name()):
                ref_fai[rs.name(), rf.name()] = rf
            elif re.search(r'\.dict$', rf.name()):
                ref_dict[rs.name(), rf.name()] = rf
    for ((s_name, f_name), fasta_f) in ref_fasta.items():
        fai_f = ref_fai.get((s_name, re.sub(r'fa$', 'fai', f_name)),
                            ref_fai.get((s_name, re.sub(r'fa$', 'fa.fai', f_name)),
                                        None))
        dict_f = ref_dict.get((s_name, re.sub(r'fa$', 'dict', f_name)),
                              ref_dict.get((s_name, re.sub(r'fa$', 'fa.dict', f_name)),
                                           None))
        if fasta_f and fai_f and dict_f:
            # found a set of all three!
            ref_input = fasta_f.as_manifest()
            ref_input += fai_f.as_manifest()
            ref_input += dict_f.as_manifest()
            dict_reader = dict_f
            break
    if ref_input is None:
        raise InvalidArgumentError("Expected a reference fasta with fai and dict in reference_collection. Found [%s]" % ' '.join(rf.name() for rf in rs.all_files()))
    if dict_reader is None:
        raise InvalidArgumentError("Could not find .dict file in reference_collection. Found [%s]" % ' '.join(rf.name() for rf in rs.all_files()))

    # Create a portable data hash for the ref_input manifest
    try:
        r = arvados.api().collections().create(body={"manifest_text": ref_input}).execute()
        ref_input_pdh = r["portable_data_hash"]
    except:
        raise

    # Load the dict data
    interval_header = ""
    dict_lines = dict_reader.readlines()
    dict_header = dict_lines.pop(0)
    if re.search(r'^@HD', dict_header) is None:
        raise InvalidArgumentError("Dict file in reference collection does not have correct header: [%s]" % dict_header)
    interval_header += dict_header
    print "Dict header is %s" % dict_header
    sn_intervals = dict()
    sns = []
    skip_sns = []
    total_len = 0
    for sq in dict_lines:
        if re.search(r'^@SQ', sq) is None:
            raise InvalidArgumentError("Dict file contains malformed SQ line: [%s]" % sq)
        interval_header += sq
        sn = None
        ln = None
        for tagval in sq.split("\t"):
            tv = tagval.split(":", 1)
            if tv[0] == "SN":
                sn = tv[1]
            if tv[0] == "LN":
                ln = tv[1]
            if sn and ln:
                break
        if not (sn and ln):
            raise InvalidArgumentError("Dict file SQ entry missing required SN and/or LN parameters: [%s]" % sq)
        assert(sn and ln)
        if sn_intervals.has_key(sn):
            raise InvalidArgumentError("Dict file has duplicate SQ entry for SN %s: [%s]" % (sn, sq))
        if skip_sq_sn_r.search(sn):
            skip_sns.append(sn)
            continue
        sn_intervals[sn] = (1, int(ln))
        sns.append(sn)
        total_len += int(ln)
    total_sequences = len(sns)
    print "Skipped %s SQs with SNs matching regex [%s]" % (len(skip_sns), skip_sq_sn_regex)

    # Chunk the genome into genome_chunks pieces
    # weighted by both number of base pairs and number of seqs
    print "Total sequences included: %s" % (total_sequences)
    print "Total genome length: %s" % (total_len)
    total_points = total_len + (total_sequences * weight_seq)
    chunk_points = int(total_points / genome_chunks)
    chunk_input_pdh_names = []
    print "Chunking genome into %s chunks of ~%s points" % (genome_chunks, chunk_points)
    for chunk_i in range(0, genome_chunks):
        chunk_num = chunk_i + 1
        chunk_intervals_count = 0
        chunk_input_name = dict_reader.name() + (".%s_of_%s.region_list" % (chunk_num, genome_chunks))
        print "Creating interval file for chunk %s" % chunk_num
        chunk_c = arvados.collection.CollectionWriter(num_retries=3)
        chunk_c.start_new_file(newfilename=chunk_input_name)
        # chunk_c.write(interval_header)
        remaining_points = chunk_points
        while len(sns) > 0:
            sn = sns.pop(0)
            remaining_points -= weight_seq
            if remaining_points <= 0:
                sns.insert(0, sn)
                break
            if not sn_intervals.has_key(sn):
                raise ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_points:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_points + start - 1
                assert((end - start + 1) <= remaining_points)
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            #interval = "%s\t%s\t%s\t+\t%s\n" % (sn, start, end, "interval_%s_of_%s_%s" % (chunk_num, genome_chunks, sn))
            interval = "%s\t%s\t%s\n" % (sn, start, end)
            remaining_points -= (end - start + 1)
            chunk_c.write(interval)
            chunk_intervals_count += 1
            if remaining_points <= 0:
                break
        if chunk_intervals_count > 0:
            chunk_input_pdh = chunk_c.finish()
            print "Chunk intervals file %s saved as %s" % (chunk_input_name, chunk_input_pdh)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        else:
            print "WARNING: skipping empty intervals for %s" % chunk_input_name
    print "Have %s chunk collections: [%s]" % (len(chunk_input_pdh_names), ' '.join([x[0] for x in chunk_input_pdh_names]))

    # prepare CRAM input collections
    job_input = arvados.current_job()['script_parameters']['inputs_collection']
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        # Handle this CRAM file
        crai_f = crai.get((s_name, re.sub(r'cram$', 'crai', f_name)),
                          crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)),
                                   None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise InvalidArgumentError("No corresponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={"manifest_text": task_input}).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        create_chunk_tasks(f_name, chunk_input_pdh_names,
                           if_sequence, task_input_pdh, ref_input_pdh,
                           chunk_input_pdh, pool=pool)

    print "Waiting for asynchronous requests to complete"
    pool.close()
    pool.join()

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
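# The chunking loop above hands out contiguous regions by "points", where each
# sequence costs weight_seq points plus one point per base pair. The following
# stand-alone sketch (no Arvados required) illustrates the same weighting
# arithmetic in isolation. weight_seq is a module-level constant that is not
# part of this excerpt, so the default used here is only an assumed value for
# illustration.
def sketch_chunk_regions(seq_lengths, genome_chunks, weight_seq=200000):
    # seq_lengths is a list of (SN, LN) pairs, as parsed from the .dict file
    sns = [sn for sn, _ in seq_lengths]
    sn_intervals = dict((sn, (1, ln)) for sn, ln in seq_lengths)
    total_points = sum(ln for _, ln in seq_lengths) + len(seq_lengths) * weight_seq
    chunk_points = int(total_points / genome_chunks)
    chunks = []
    for _ in range(genome_chunks):
        chunk = []
        remaining_points = chunk_points
        while sns:
            sn = sns.pop(0)
            remaining_points -= weight_seq
            if remaining_points <= 0:
                sns.insert(0, sn)
                break
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_points:
                # not enough room for the whole sequence: emit a prefix and
                # push the remainder back onto the queue
                real_end = end
                end = remaining_points + start - 1
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            chunk.append((sn, start, end))
            remaining_points -= (end - start + 1)
            if remaining_points <= 0:
                break
        if chunk:
            chunks.append(chunk)
    return chunks

# e.g. sketch_chunk_regions([("chr1", 2000000), ("chr2", 1500000)], genome_chunks=3)
# splits chr1 across the first two chunks, mirroring the region_list files
# written above.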
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    # applying the capturing group named "group_by" in group_by_regex.
    # (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    if arvados.current_task()['sequence'] == 0:
        # get candidates for task reuse
        task_key_params = ['inputs', 'ref', 'name']  # N.B. inputs collection includes input vcfs and corresponding interval_list
        script = "gatk-genotypegvcfs.py"
        oldest_git_commit_to_reuse = '6ca726fc265f9e55765bf1fdf71b86285b8a0ff2'
        job_filters = [
            ['script', '=', script],
            ['repository', '=', arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
        ]

        # retrieve a full set of all possible reusable tasks at sequence 1
        print "Retrieving all potentially reusable tasks"
        reusable_tasks = hgi_arvados.get_reusable_tasks(1, task_key_params, job_filters)
        print "Have %s tasks for potential reuse" % (len(reusable_tasks))

        def create_task_with_validated_reuse(sequence, params):
            return hgi_arvados.create_or_reuse_task(sequence, params, reusable_tasks, task_key_params, validate_task_output)

        # Setup sub tasks (and terminate if this is task 0)
        hgi_arvados.one_task_per_group_combined_inputs(ref_input_pdh, job_input_pdh, interval_lists_pdh,
                                                       group_by_regex,
                                                       if_sequence=0, and_end_task=True,
                                                       create_task_func=create_task_with_validated_reuse)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert(this_task['sequence'] > 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Genotype gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="inputs")
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    out_file = name + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within
    # it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # GenotypeGVCFs!
    gatk_exit = gatk.genotype_gvcfs(ref_file, interval_list_file, gvcf_files,
                                    os.path.join(out_dir, out_file),
                                    cores="32", java_mem="200g")

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success': False}
                                         ).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success': False}
                                             ).execute()
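# validate_task_output is referenced above (both for task reuse and before
# setting the task output) but defined elsewhere in the repository. Purely as a
# hypothetical sketch of what such a validator could look like -- not the
# repository's actual implementation -- it might accept an output collection
# only if it contains at least one non-empty .vcf.gz file; it relies on the
# arvados and re imports already used by this script.
def sketch_validate_task_output(output_locator):
    cr = arvados.CollectionReader(output_locator)
    for out_file in cr.all_files():
        if re.search(r'\.vcf\.gz$', out_file.name()) and out_file.size() > 0:
            return True
    print "Output %s contains no non-empty .vcf.gz file" % output_locator
    return False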
#!/usr/bin/python
#

import arvados
import re

arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)

this_job = arvados.current_job()
this_task = arvados.current_task()
this_task_input = this_task['parameters']['input']

input_file = list(arvados.CollectionReader(this_task_input).all_files())[0]

out = arvados.CollectionWriter()
out.set_current_file_name(input_file.decompressed_name())
out.set_current_stream_name(input_file.stream_name())

for line in input_file.readlines():
    out.write("!!!" + line.upper())

this_task.set_output(out.finish())
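# For reference, the per-line transformation this crunch script applies is just
# the following (independent of Arvados): each input line is upper-cased and
# prefixed with "!!!", e.g. transform_line("hello world\n") == "!!!HELLO WORLD\n".
def transform_line(line):
    return "!!!" + line.upper()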