def write_tmpdir(dir):
    coll_writer = arvados.CollectionWriter()
    filenames = next(os.walk(dir))[-1]
    for filename in filenames:
        coll_writer.write_file(os.path.join(dir, filename))
    pdh = coll_writer.finish()
    arvados.current_task().set_output(pdh)
def spawn_new_task_per_file(script_parameter, regex, if_sequence=0, and_end_task=True):
    """
    Generalized form of one_task_per_pair_input_file from
    https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py

    Creates a new task if the file in the collection matches the regex
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters'][script_parameter]
    input_collection = Collection(job_input)
    for name in input_collection:
        if not re.search(regex, name):
            continue
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input_1': name,
            }
        }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
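# Usage sketch for spawn_new_task_per_file() above. The script parameter name
# ("fastqs") and the regex are illustrative assumptions, not from the original
# source; the helper above and the usual crunch-script imports (arvados, re)
# are assumed to be in scope.
this_task = arvados.current_task()
if this_task['sequence'] == 0:
    # Queue one child task per .fastq.gz file, then mark task 0 as done.
    spawn_new_task_per_file('fastqs', r'\.fastq\.gz$', if_sequence=0, and_end_task=True)
else:
    # Each child task sees its own file name as the "input_1" parameter.
    print this_task['parameters']['input_1']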
def run(**kwargs):
    kwargs.setdefault('cwd', arvados.current_task().tmpdir)
    kwargs.setdefault('stdout', sys.stderr)
    execargs = ['java',
                '-Xmx%dm' % memory_limit(),
                '-Djava.io.tmpdir=' + arvados.current_task().tmpdir,
                '-jar', os.path.join(install_path(), 'GenomeAnalysisTK.jar')]
    execargs += [str(arg) for arg in kwargs.pop('args', [])]
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    return arvados.util.run_command(execargs, **kwargs)
def run(module, **kwargs):
    kwargs.setdefault('cwd', arvados.current_task().tmpdir)
    execargs = ['java',
                '-Xmx1500m',
                '-Djava.io.tmpdir=' + arvados.current_task().tmpdir,
                '-jar', os.path.join(install_path(), module + '.jar')]
    execargs += [str(arg) for arg in kwargs.pop('args', [])]
    for key, value in kwargs.pop('params', {}).items():
        execargs += [key.upper() + '=' + str(value)]
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    return arvados.util.run_command(execargs, **kwargs)
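# Usage sketch for the Picard-style wrapper above. The module and file names are
# illustrative assumptions; install_path() must resolve to a directory containing
# the named .jar. Keys in "params" are upper-cased into Picard's KEY=value form.
run('SortSam',
    params={'i': 'aln.bam',               # becomes I=aln.bam
            'o': 'aln.sorted.bam',        # becomes O=aln.sorted.bam
            'sort_order': 'coordinate'})  # becomes SORT_ORDER=coordinate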
def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each pair of fastq files in this job's input
    collection.

    Each new task will have two parameters, named "input_1" and
    "input_2", each being a manifest containing a single fastq file.

    A matching pair of files in the input collection is assumed to
    have names "x_1.y" and "x_2.y".

    Files in the input collection that are not part of a matched pair
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    all_files = []
    for s in cr.all_streams():
        all_files += list(s.all_files())
    for s in cr.all_streams():
        for left_file in s.all_files():
            left_name = left_file.name()
            right_file = None
            right_name = re.sub(r'(.*_)1\.', r'\g<1>2.', left_name)
            if right_name == left_name:
                continue
            for f2 in s.all_files():
                if right_name == f2.name():
                    right_file = f2
            if right_file is not None:
                new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1': left_file.as_manifest(),
                        'input_2': right_file.as_manifest()
                    }
                }
                arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
def main():
    current_job = arvados.current_job()

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the interval_list collection to only those files relevant to gatk
    il_input_pdh = prepare_gatk_interval_list_collection(interval_list_coll=current_job['script_parameters']['interval_list_collection'])

    # Create an interval_list file for each chunk based on the .interval_list in the interval_list collection
    output_locator = create_interval_lists(genome_chunks, il_input_pdh)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
def run_rtg(command, output_dir, command_args, **kwargs):
    global rtg_install_path
    execargs = [os.path.join(rtg_install_path, 'rtg'),
                command,
                '-o', output_dir]
    execargs += command_args
    sys.stderr.write("run_rtg: exec %s\n" % str(execargs))
    arvados.util.run_command(execargs,
                             cwd=arvados.current_task().tmpdir,
                             stderr=sys.stderr,
                             stdout=sys.stderr)

    # Exit status cannot be trusted in rtg 1.1.1.
    assert_done(output_dir)

    # Copy log files to stderr and delete them to avoid storing them
    # in Keep with the output data.
    for dirent in arvados.util.listdir_recursive(output_dir):
        if is_log_file(dirent):
            log_file = os.path.join(output_dir, dirent)
            sys.stderr.write(' '.join(['==>', dirent, '<==\n']))
            with open(log_file, 'rb') as f:
                while True:
                    buf = f.read(2**20)
                    if len(buf) == 0:
                        break
                    sys.stderr.write(buf)
            sys.stderr.write('\n')  # in case log does not end in newline
            os.unlink(log_file)
def create_chunk_tasks(f_name, chunk_input_pdh_names,
                       if_sequence, task_input_pdh, ref_input_pdh, chunk_input_pdh,
                       pool=None):
    async_results = []
    for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
        # Create task for each CRAM / chunk
        job_uuid = arvados.current_job()['uuid']
        current_task_uuid = arvados.current_task()['uuid']
        new_task_attrs = {
            'job_uuid': job_uuid,
            'created_by_job_task_uuid': current_task_uuid,
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input_pdh,
                'ref': ref_input_pdh,
                'chunk': chunk_input_pdh
            }
        }
        async_result = pool.apply_async(arv_create_task, (
            new_task_attrs,
            "Created new task to process %s with chunk interval %s (job_uuid %s)" % (f_name, chunk_input_name, job_uuid)))
        async_results.append(async_result)
    for async_result in async_results:
        async_result.wait()
        try:
            (res, report) = async_result.get()
            if (not res) or ('qsequence' not in res):
                raise InternalError("Could not create job task: %s" % res)
            else:
                print report + " qsequence %s" % res['qsequence']
        except Exception as e:
            raise InternalError("Exception creating job task: %s" % e)
def upload(source_dir, logger=None):
    if logger is None:
        logger = logging.getLogger("arvados")

    source_dir = os.path.abspath(source_dir)
    done = False
    if 'TASK_WORK' in os.environ:
        resume_cache = put.ResumeCache(os.path.join(arvados.current_task().tmpdir, "upload-output-checkpoint"))
    else:
        resume_cache = put.ResumeCache(put.ResumeCache.make_path(Args(source_dir)))
    reporter = put.progress_writer(machine_progress)
    bytes_expected = put.expected_bytes_for([source_dir])
    backoff = 1
    outuuid = None
    while not done:
        try:
            out = put.ArvPutCollectionWriter.from_cache(resume_cache, reporter, bytes_expected)
            out.do_queued_work()
            out.write_directory_tree(source_dir, max_manifest_depth=0)
            outuuid = out.finish()
            done = True
        except KeyboardInterrupt as e:
            logger.critical("caught interrupt signal 2")
            raise e
        except Exception as e:
            logger.exception("caught exception:")
            backoff *= 2
            if backoff > 256:
                logger.critical("Too many upload failures, giving up")
                raise e
            else:
                logger.warning("Sleeping for %s seconds before trying again" % backoff)
                time.sleep(backoff)
    return outuuid
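# Sketch of how upload() might be used at the end of a task: write results under
# the task tmpdir, upload them with retry/backoff, and record the returned
# locator as the task output. The "out" subdirectory name is an illustrative
# assumption, not from the original source.
out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
output_locator = upload(out_dir)
arvados.current_task().set_output(output_locator)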
def create_or_reuse_task(sequence, parameters, reusable_tasks, task_key_params, validate_task_output):
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': parameters
    }
    # See if there is a task in reusable_tasks that can be reused
    ct_index = tuple([parameters[index_param] for index_param in task_key_params])
    if len(reusable_tasks) == 0:
        print "No reusable tasks were available"
    elif ct_index in reusable_tasks:
        # have a task from which to reuse the output, prepare to create a new, but already finished, task with that output
        reuse_task = reusable_tasks[ct_index]
        if validate_task_output(reuse_task['output']):
            print "Found existing JobTask %s from Job %s. Will use output %s from that JobTask instead of re-running it." % (reuse_task['uuid'], reuse_task['job_uuid'], reuse_task['output'])
            # remove task from reusable_tasks as it won't be used more than once
            del reusable_tasks[ct_index]
            # copy relevant attrs from reuse_task so that the new tasks start already finished
            for attr in ['success', 'output', 'progress', 'started_at', 'finished_at', 'parameters']:
                new_task_attrs[attr] = reuse_task[attr]
            # crunch seems to ignore the fact that the job says it is done and queue it anyway
            # signal ourselves to just immediately exit successfully when we are run
            new_task_attrs['parameters']['reuse_job_task'] = reuse_task['uuid']
        else:
            print "Output %s for potential task reuse did not validate" % (reuse_task['output'])
    else:
        print "No reusable JobTask matched key parameters %s" % (list(ct_index))

    # Create the "new" task (may be new work or may be already finished work)
    new_task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if not new_task:
        raise errors.APIError("Attempt to create new job_task failed: [%s]" % new_task_attrs)
    return new_task
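# Minimal sketch of driving create_or_reuse_task(). The validator and parameter
# values below are illustrative assumptions; reusable_tasks would normally come
# from get_reusable_tasks() keyed by the same task_key_params.
def validate_task_output(output_locator):
    # A real validator would open the collection and inspect its contents;
    # here we only require a non-empty locator.
    return bool(output_locator)

task_key_params = ['input', 'ref', 'chunk']
params = {'input': 'task-input-pdh', 'ref': 'ref-pdh', 'chunk': 'chunk-pdh'}
reusable_tasks = {}  # e.g. {('task-input-pdh', 'ref-pdh', 'chunk-pdh'): <old task>}
create_or_reuse_task(1, params, reusable_tasks, task_key_params, validate_task_output)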
def one_task_per_bam_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each bam file in this job's input collection.

    Each new task will have an "input" parameter: a manifest
    containing one .bam file and (if available) the corresponding
    .bai index file.

    Files in the input collection that are not named *.bam or *.bai
    (as well as *.bai files that do not match any .bam file present)
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    bam = {}
    bai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.bam$', f.name()):
                bam[s.name(), f.name()] = f
            elif re.search(r'\.bai$', f.name()):
                bai[s.name(), f.name()] = f
    for ((s_name, f_name), bam_f) in bam.items():
        bai_f = bai.get((s_name, re.sub(r'bam$', 'bai', f_name)), None)
        task_input = bam_f.as_manifest()
        if bai_f:
            task_input += bai_f.as_manifest()
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input
            }
        }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
def create_task(sequence, params):
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': params
    }
    task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    return task
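# Sketch: queue a follow-on task at the next sequence number, copying this
# task's parameters and adding one more (the 'interval' value is illustrative).
this_task = arvados.current_task()
child_params = dict(this_task['parameters'])
child_params['interval'] = '20:1-1000000'
create_task(this_task['sequence'] + 1, child_params)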
def main():
    # Get object representing the current task
    this_task = arvados.current_task()

    sort_by_r = re.compile(sort_by_regex)

    ################################################################################
    # Concatenate VCFs in numerically sorted order of sort_by_regex
    ################################################################################
    vcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    output_prefix = arvados.current_job()['script_parameters']['output_prefix']
    out_file = output_prefix + ".vcf.gz"

    # Concatenate VCFs
    bcftools_concat_exit = bcftools.concat(sorted(vcf_files, key=lambda fn: int(re.search(sort_by_r, fn).group('sort_by'))), os.path.join(out_dir, out_file))

    if bcftools_concat_exit != 0:
        print "WARNING: bcftools concat exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_concat_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success': False}
                                         ).execute()
    else:
        print "bcftools concat exited successfully, indexing"

        bcftools_index_exit = bcftools.index(os.path.join(out_dir, out_file))

        if bcftools_index_exit != 0:
            print "WARNING: bcftools index exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_index_exit
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success': False}
                                             ).execute()
        else:
            print "bcftools index exited successfully, writing output to keep"

            # Write a new collection as output
            out = arvados.CollectionWriter()

            # Write out_dir to keep
            out.write_directory_tree(out_dir)

            # Commit the output to Keep.
            output_locator = out.finish()

            if validate_task_output(output_locator):
                print "Task output validated, setting output to %s" % (output_locator)

                # Use the resulting locator as the output for this task.
                this_task.set_output(output_locator)
            else:
                print "ERROR: Failed to validate task output (%s)" % (output_locator)
                arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                                 body={'success': False}
                                                 ).execute()
def main():
    current_job = arvados.current_job()

    skip_sq_sn_regex = '_decoy$'
    if 'skip_sq_sn_regex' in current_job['script_parameters']:
        skip_sq_sn_regex = current_job['script_parameters']['skip_sq_sn_regex']
    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the reference collection to only those files relevant to gatk
    ref_input_pdh = prepare_gatk_reference_collection(reference_coll=current_job['script_parameters']['reference_collection'])

    # Create an interval_list file for each chunk based on the .dict in the reference collection
    output_locator = create_interval_lists(genome_chunks, ref_input_pdh, skip_sq_sn_r)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
def one_task_per_classifier(num_classifiers_to_parameterize, if_sequence=0, and_end_task=True):
    if if_sequence != arvados.current_task()['sequence']:
        return
    api_client = arvados.api('v1')
    for i in range(num_classifiers_to_parameterize):
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'classifier_index': i,
                'time_to_wait': i * 560
            }
        }
        api_client.job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        api_client.job_tasks().update(uuid=arvados.current_task()['uuid'],
                                      body={'success': True}
                                      ).execute()
        exit(0)
def clear_tmpdir(path=None):
    """
    Ensure the given directory (or TASK_TMPDIR if none given)
    exists and is empty.
    """
    if path is None:
        path = arvados.current_task().tmpdir
    if os.path.exists(path):
        p = subprocess.Popen(['rm', '-rf', path])
        stdout, stderr = p.communicate(None)
        if p.returncode != 0:
            raise Exception('rm -rf %s: %s' % (path, stderr))
    os.mkdir(path)
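# Sketch: reset the task scratch space before producing new output (assumes the
# usual crunch TASK_TMPDIR environment; the 'work' subdirectory is illustrative).
clear_tmpdir()
work_dir = os.path.join(arvados.current_task().tmpdir, 'work')
os.mkdir(work_dir)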
def spawn_new_task_per_bed_line(script_parameter, regex, if_sequence=0, and_end_task=True):
    """
    Generalized form of one_task_per_pair_input_file from
    https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py

    Creates a new task if the file in the collection matches the regex
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters'][script_parameter]
    input_collection = Collection(job_input)
    for name in input_collection:
        if not re.search(regex, name):
            continue
        name_path = os.path.join(arvados.get_job_param_mount(script_parameter), name)
        bed_lines = (line.split() for line in open(name_path, 'r'))

        # Start the biggest regions first
        def cmp_desc_region_size(a, b):
            return ((int(b[2]) - int(b[1])) -
                    (int(a[2]) - int(a[1])))

        for bed_line in sorted(bed_lines, cmp=cmp_desc_region_size):
            print bed_line
            new_task_attrs = {
                'job_uuid': arvados.current_job()['uuid'],
                'created_by_job_task_uuid': arvados.current_task()['uuid'],
                'sequence': if_sequence + 1,
                'parameters': {
                    'chrom': bed_line[0],
                    'start': bed_line[1],
                    'end': bed_line[2]
                }
            }
            arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit()
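# Usage sketch for spawn_new_task_per_bed_line(): at sequence 0, queue one task
# per region in each BED file matching the regex and end task 0; at sequence 1,
# pick up the region from the task parameters. The script parameter name and
# regex are illustrative assumptions.
this_task = arvados.current_task()
if this_task['sequence'] == 0:
    spawn_new_task_per_bed_line('regions', r'\.bed$', if_sequence=0, and_end_task=True)
region = "%s:%s-%s" % (this_task['parameters']['chrom'],
                       this_task['parameters']['start'],
                       this_task['parameters']['end'])
print "Processing region %s" % region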
def prepare_out_dir():
    # Will write to out_dir, make sure it is empty
    out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    return out_dir
def mount_gatk_gvcf_inputs(inputs_param="inputs"):
    # Get input gVCFs for this task
    print "Mounting task input collection"
    inputs_dir = ""
    if inputs_param in arvados.current_task()['parameters']:
        inputs_dir = arvados.get_task_param_mount(inputs_param)
    else:
        inputs_dir = arvados.get_job_param_mount(inputs_param)

    # Sanity check input gVCFs
    input_gvcf_files = []
    for f in arvados.util.listdir_recursive(inputs_dir):
        if re.search(r'\.vcf\.gz$', f):
            input_gvcf_files.append(os.path.join(inputs_dir, f))
        elif re.search(r'\.tbi$', f):
            pass
        elif re.search(r'\.interval_list$', f):
            pass
        else:
            print "WARNING: collection contains unexpected file %s" % f
    if len(input_gvcf_files) == 0:
        raise errors.InvalidArgumentError("Expected one or more .vcf.gz files in collection (found 0 while recursively searching %s)" % inputs_dir)

    # Ensure we can read the gVCF files and that they each have an index
    for gvcf_file in input_gvcf_files:
        if not os.access(gvcf_file, os.R_OK):
            raise errors.FileAccessError("gVCF file not readable: %s" % gvcf_file)

        # Ensure we have corresponding .tbi index and can read it as well
        (gvcf_file_base, gvcf_file_ext) = os.path.splitext(gvcf_file)
        assert (gvcf_file_ext == ".gz")
        tbi_file = gvcf_file_base + ".gz.tbi"
        if not os.access(tbi_file, os.R_OK):
            tbi_file = gvcf_file_base + ".tbi"
            if not os.access(tbi_file, os.R_OK):
                raise errors.FileAccessError("No readable gVCF index file for gVCF file: %s" % gvcf_file)
    return input_gvcf_files
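# Sketch: mount this task's gVCF inputs (the default "inputs" parameter name
# matches the helper above) and report what was found; downstream code would
# hand the file list to a genotyping or concatenation step.
gvcf_files = mount_gatk_gvcf_inputs(inputs_param="inputs")
print "Found %s gVCF file(s): %s" % (len(gvcf_files), ' '.join(gvcf_files))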
def run(command, command_args, **kwargs):
    """
    Build and run the samtools binary.

    command is the samtools subcommand, e.g., "view" or "sort".

    command_args is a list of additional command line arguments, e.g.,
    ['-bt', 'ref_list.txt', '-o', 'aln.bam', 'aln.sam.gz']

    It is assumed that we are running in a Crunch job environment, and
    the job's "samtools_tgz" parameter is a collection containing the
    samtools source tree in a .tgz file.
    """
    execargs = [samtools_binary(),
                command]
    execargs += command_args
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    arvados.util.run_command(execargs,
                             cwd=arvados.current_task().tmpdir,
                             stdin=kwargs.get('stdin', subprocess.PIPE),
                             stderr=kwargs.get('stderr', sys.stderr),
                             stdout=kwargs.get('stdout', sys.stderr))
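# Sketch of calling the samtools wrapper above; file names are illustrative and
# samtools_binary() is assumed to resolve to a built samtools (old 0.1.x-style
# "sort" syntax, matching the docstring's examples).
run('sort', ['aln.bam', 'aln.sorted'])
run('index', ['aln.sorted.bam'])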
def run(command, command_args, **kwargs):
    """
    Build and run the bwa binary.

    command is the bwa module, e.g., "index" or "aln".

    command_args is a list of additional command line arguments, e.g.,
    ['-a', 'bwtsw', 'ref.fasta']

    It is assumed that we are running in a Crunch job environment, and
    the job's "bwa_tbz" parameter is a collection containing the bwa
    source tree in a .tbz file.
    """
    execargs = [bwa_binary(),
                command]
    execargs += command_args
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    arvados.util.run_command(execargs,
                             cwd=arvados.current_task().tmpdir,
                             stderr=sys.stderr,
                             stdin=kwargs.get('stdin', subprocess.PIPE),
                             stdout=kwargs.get('stdout', sys.stderr))
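# Sketch of calling the bwa wrapper above, mirroring the docstring example;
# 'ref.fasta' is illustrative and bwa_binary() is assumed to resolve to a built bwa.
run('index', ['-a', 'bwtsw', 'ref.fasta'])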
def one_task_per_interval(interval_count, validate_task_output,
                          if_sequence=0, and_end_task=True,
                          reuse_tasks=True,
                          interval_list_param="interval_list",
                          oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
                          task_key_params=['name', 'inputs', 'interval', 'ref'],
                          script=arvados.current_job()['script']):
    """
    Queue one task for each of interval_count intervals, splitting
    the genome chunk (described by the .interval_list file) evenly.

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param=interval_list_param)

    interval_reader = open(interval_list_file, mode="r")

    lines = interval_reader.readlines()
    sn_intervals = dict()
    sns = []
    total_len = 0
    for line in lines:
        if line[0] == '@':
            # skip all lines starting with '@'
            continue
        fields = line.split("\t")
        if len(fields) != 5:
            raise errors.InvalidArgumentError("interval_list %s has invalid line [%s] - expected 5 fields but got %s" % (interval_list_file, line, len(fields)))
        sn = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        length = int(end) - int(start) + 1
        total_len += int(length)
        sn_intervals[sn] = (start, end)
        sns.append(sn)

    print "Total chunk length is %s" % total_len
    interval_len = int(total_len / interval_count)
    intervals = []
    print "Splitting chunk into %s intervals of size ~%s" % (interval_count, interval_len)
    for interval_i in range(0, interval_count):
        interval_num = interval_i + 1
        intervals_count = 0
        remaining_len = interval_len
        interval = []
        while len(sns) > 0:
            sn = sns.pop(0)
            if not sn_intervals.has_key(sn):
                raise errors.ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_len:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_len + start - 1
                assert((end - start + 1) <= remaining_len)
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            interval.append("%s:%s-%s" % (sn, start, end))
            remaining_len -= (end - start + 1)
            intervals_count += 1
            if remaining_len <= 0:
                break
        if intervals_count > 0:
            intervals.append(interval)
        else:
            print "WARNING: skipping empty intervals for interval %s" % interval_num

    print "Have %s intervals" % (len(intervals))

    if reuse_tasks:
        # get candidates for task reuse
        job_filters = [
            ['script', '=', script],
            ['repository', '=', arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
        ]
        reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params, job_filters)
        print "Have %s potentially reusable tasks" % (len(reusable_tasks))

    for interval in intervals:
        interval_str = ' '.join(interval)
        print "Creating task to process interval: [%s]" % interval_str
        new_task_params = arvados.current_task()['parameters']
        new_task_params['interval'] = interval_str
        if reuse_tasks:
            task = create_or_reuse_task(if_sequence + 1, new_task_params, reusable_tasks, task_key_params, validate_task_output)
        else:
            task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
def one_task_per_group_combined_inputs(ref_input, job_input, interval_lists, group_by_regex,
                                       if_sequence=0, and_end_task=True,
                                       create_task_func=create_task):
    """
    Queue one task for each group of gVCFs and corresponding interval_list
    in the inputs_collection, with grouping based on two things:
      - the stream in which the gVCFs are held within the collection
      - the value of the named capture group "group_by" in the
        group_by_regex against the filename in the inputs_collection

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    group_by_r = re.compile(group_by_regex)

    # prepare interval_lists
    il_cr = arvados.CollectionReader(interval_lists)
    il_ignored_files = []
    interval_list_by_group = {}
    for s in il_cr.all_streams():
        for f in s.all_files():
            m = re.search(group_by_r, f.name())
            if m:
                group_name = m.group('group_by')
                interval_list_m = re.search(r'\.interval_list', f.name())
                if interval_list_m:
                    if group_name not in interval_list_by_group:
                        interval_list_by_group[group_name] = dict()
                    interval_list_by_group[group_name][s.name(), f.name()] = f
                    continue
            # if we make it this far, we have files that we are ignoring
            il_ignored_files.append("%s/%s" % (s.name(), f.name()))

    # prepare gVCF input collections
    cr = arvados.CollectionReader(job_input)
    ignored_files = []
    last_stream_name = ""
    gvcf_by_group = {}
    gvcf_indices = {}
    for s in sorted(cr.all_streams(), key=lambda stream: stream.name()):
        stream_name = s.name()
        # handle each stream name separately
        if stream_name != last_stream_name:
            if last_stream_name != "":
                print "Done processing files in stream %s" % last_stream_name
                one_task_per_gvcf_group_in_stream_combined_inputs(last_stream_name, gvcf_by_group, gvcf_indices, interval_list_by_group, if_sequence, ref_input, create_task_func=create_task_func)
            # now that we are done with last_stream_name, reinitialise dicts to
            # process data from new stream
            print "Processing files in stream %s" % stream_name
            gvcf_by_group = {}
            gvcf_indices = {}
            last_stream_name = stream_name

        # loop over all the files in this stream (there may be only one)
        for f in s.all_files():
            if re.search(r'\.tbi$', f.name()):
                gvcf_indices[s.name(), f.name()] = f
                continue
            m = re.search(group_by_r, f.name())
            if m:
                group_name = m.group('group_by')
                gvcf_m = re.search(r'\.vcf\.gz$', f.name())
                if gvcf_m:
                    if group_name not in gvcf_by_group:
                        gvcf_by_group[group_name] = dict()
                    gvcf_by_group[group_name][s.name(), f.name()] = f
                    continue
                interval_list_m = re.search(r'\.interval_list', f.name())
                if interval_list_m:
                    if group_name not in interval_list_by_group:
                        interval_list_by_group[group_name] = dict()
                    if (s.name(), f.name()) in interval_list_by_group[group_name]:
                        if interval_list_by_group[group_name][s.name(), f.name()].as_manifest() != f.as_manifest():
                            raise errors.InvalidArgumentError("Already have interval_list for group %s file %s/%s, but manifests are not identical!" % (group_name, s.name(), f.name()))
                    else:
                        interval_list_by_group[group_name][s.name(), f.name()] = f
                    continue
            # if we make it this far, we have files that we are ignoring
            ignored_files.append("%s/%s" % (s.name(), f.name()))

    # finally, process the last stream
    print "Processing last stream"
    one_task_per_gvcf_group_in_stream_combined_inputs(stream_name, gvcf_by_group, gvcf_indices, interval_list_by_group, if_sequence, ref_input, create_task_func=create_task_func)

    # report on any ignored files
    if len(ignored_files) > 0:
        print "WARNING: ignored non-matching files in inputs_collection: %s" % (' '.join(ignored_files))
        # TODO: could use `setmedian` from https://github.com/ztane/python-Levenshtein
        # to print most representative "median" filename (i.e. skipped 15 files like median), then compare the
        # rest of the files to that median (perhaps with `ratio`)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
def chunked_tasks_per_cram_file(ref_input, job_input, interval_lists, validate_task_output,
                                if_sequence=0, and_end_task=True,
                                reuse_tasks=True, reuse_tasks_retrieve_all=True,
                                interval_list_param="interval_list",
                                oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
                                script=arvados.current_job()['script']):
    """
    Queue one task for each cram file in this job's input collection.

    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.

    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # prepare interval lists
    cr = arvados.CollectionReader(interval_lists)
    chunk_interval_list = {}
    chunk_input_pdh_names = []
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.interval_list$', f.name()):
                chunk_interval_list[s.name(), f.name()] = f
    for ((s_name, f_name), chunk_interval_list_f) in sorted(chunk_interval_list.items()):
        chunk_input = chunk_interval_list_f.as_manifest()
        try:
            r = arvados.api().collections().create(body={"manifest_text": chunk_input}).execute()
            chunk_input_pdh = r["portable_data_hash"]
            chunk_input_name = os.path.join(s_name, f_name)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        except:
            raise
    if len(chunk_input_pdh_names) == 0:
        raise errors.InvalidArgumentError("No interval_list files found in %s" % (interval_lists))

    # prepare CRAM input collections
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        crai_f = crai.get((s_name, re.sub(r'cram$', 'crai', f_name)),
                          crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise errors.InvalidArgumentError("No corresponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={"manifest_text": task_input}).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        if reuse_tasks:
            task_key_params = ['input', 'ref', 'chunk']
            # get candidates for task reuse
            job_filters = [
                ['script', '=', script],
                ['repository', '=', arvados.current_job()['repository']],
                ['script_version', 'in git', oldest_git_commit_to_reuse],
                ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
            ]
            if reuse_tasks_retrieve_all:
                # retrieve a full set of all possible reusable tasks
                reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params, job_filters)
                print "Have %s tasks for potential reuse" % (len(reusable_tasks))
            else:
                reusable_task_jobs = get_jobs_for_task_reuse(job_filters)
                print "Have %s jobs for potential task reuse" % (len(reusable_task_jobs))
                reusable_task_job_uuids = [job['uuid'] for job in reusable_task_jobs['items']]

        for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
            # Create task for each CRAM / chunk
            new_task_params = {
                'input': task_input_pdh,
                'ref': ref_input,
                'chunk': chunk_input_pdh
            }
            print "Creating new task to process %s with chunk interval %s " % (f_name, chunk_input_name)
            if reuse_tasks:
                if reuse_tasks_retrieve_all:
                    task = create_or_reuse_task(if_sequence + 1, new_task_params, reusable_tasks, task_key_params, validate_task_output)
                else:
                    task = create_or_reuse_task_from_jobs(if_sequence + 1, new_task_params, reusable_task_job_uuids, task_key_params, validate_task_output)
            else:
                task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
import subprocess as sp
import arvados as arv

# Collect basic information about the task environment and store it
# in Keep as info.log.
job = arv.current_job()
task = arv.current_task()

of = arv.CollectionWriter()
of.set_current_file_name("info.log")

whoinfo = sp.check_output(["whoami"])
of.write("user: " + whoinfo + "\n")

pwdinfo = sp.check_output(["pwd"])
of.write("pwd: " + pwdinfo + "\n")

lsinfo = sp.check_output(["ls", "-lahR"])
of.write("directory structure:\n" + lsinfo)

dfinfo = sp.check_output(["df", "-h"])
of.write("df:\n" + dfinfo + "\n")

meminfo = sp.check_output(["free", "-hm"])
of.write("mem:\n" + meminfo)

hostinfo = sp.check_output(["hostname"])
of.write("host: " + hostinfo)
#!/usr/bin/python
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import arvados
import subprocess
import crunchutil.subst as subst
import shutil
import os
import sys
import time

if len(arvados.current_task()['parameters']) > 0:
    p = arvados.current_task()['parameters']
else:
    p = arvados.current_job()['script_parameters']

t = arvados.current_task().tmpdir

os.unlink("/usr/local/share/bcbio-nextgen/galaxy")
os.mkdir("/usr/local/share/bcbio-nextgen/galaxy")
shutil.copy("/usr/local/share/bcbio-nextgen/config/bcbio_system.yaml", "/usr/local/share/bcbio-nextgen/galaxy")

with open("/usr/local/share/bcbio-nextgen/galaxy/tool_data_table_conf.xml", "w") as f:
    f.write('''<tables>
    <!-- Locations of indexes in the BWA mapper format -->
    <table name="bwa_indexes" comment_char="#">
        <columns>value, dbkey, name, path</columns>
def one_task_per_cram_file(if_sequence=0, and_end_task=True,
                           skip_sq_sn_regex='_decoy$',
                           genome_chunks=200):
    """
    Queue one task for each cram file in this job's input collection.

    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.

    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # setup multiprocessing pool
    pool_processes = cpu_count() - 1
    print 'Using %d processes to submit tasks\n' % pool_processes
    pool = Pool(processes=pool_processes)

    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    # Ensure we have a .fa reference file with corresponding .fai index and .dict
    reference_coll = arvados.current_job()['script_parameters']['reference_collection']
    rcr = arvados.CollectionReader(reference_coll)
    ref_fasta = {}
    ref_fai = {}
    ref_dict = {}
    ref_input = None
    dict_reader = None
    for rs in rcr.all_streams():
        for rf in rs.all_files():
            if re.search(r'\.fa$', rf.name()):
                ref_fasta[rs.name(), rf.name()] = rf
            elif re.search(r'\.fai$', rf.name()):
                ref_fai[rs.name(), rf.name()] = rf
            elif re.search(r'\.dict$', rf.name()):
                ref_dict[rs.name(), rf.name()] = rf
    for ((s_name, f_name), fasta_f) in ref_fasta.items():
        fai_f = ref_fai.get((s_name, re.sub(r'fa$', 'fai', f_name)),
                            ref_fai.get((s_name, re.sub(r'fa$', 'fa.fai', f_name)), None))
        dict_f = ref_dict.get((s_name, re.sub(r'fa$', 'dict', f_name)),
                              ref_dict.get((s_name, re.sub(r'fa$', 'fa.dict', f_name)), None))
        if fasta_f and fai_f and dict_f:
            # found a set of all three!
            ref_input = fasta_f.as_manifest()
            ref_input += fai_f.as_manifest()
            ref_input += dict_f.as_manifest()
            dict_reader = dict_f
            break
    if ref_input is None:
        raise InvalidArgumentError("Expected a reference fasta with fai and dict in reference_collection. Found [%s]" % ' '.join(rf.name() for rf in rs.all_files()))
    if dict_reader is None:
        raise InvalidArgumentError("Could not find .dict file in reference_collection. Found [%s]" % ' '.join(rf.name() for rf in rs.all_files()))

    # Create a portable data hash for the ref_input manifest
    try:
        r = arvados.api().collections().create(body={"manifest_text": ref_input}).execute()
        ref_input_pdh = r["portable_data_hash"]
    except:
        raise

    # Load the dict data
    interval_header = ""
    dict_lines = dict_reader.readlines()
    dict_header = dict_lines.pop(0)
    if re.search(r'^@HD', dict_header) is None:
        raise InvalidArgumentError("Dict file in reference collection does not have correct header: [%s]" % dict_header)
    interval_header += dict_header
    print "Dict header is %s" % dict_header
    sn_intervals = dict()
    sns = []
    skip_sns = []
    total_len = 0
    for sq in dict_lines:
        if re.search(r'^@SQ', sq) is None:
            raise InvalidArgumentError("Dict file contains malformed SQ line: [%s]" % sq)
        interval_header += sq
        sn = None
        ln = None
        for tagval in sq.split("\t"):
            tv = tagval.split(":", 1)
            if tv[0] == "SN":
                sn = tv[1]
            if tv[0] == "LN":
                ln = tv[1]
            if sn and ln:
                break
        if not (sn and ln):
            raise InvalidArgumentError("Dict file SQ entry missing required SN and/or LN parameters: [%s]" % sq)
        assert(sn and ln)
        if sn_intervals.has_key(sn):
            raise InvalidArgumentError("Dict file has duplicate SQ entry for SN %s: [%s]" % (sn, sq))
        if skip_sq_sn_r.search(sn):
            skip_sns.append(sn)
            continue
        sn_intervals[sn] = (1, int(ln))
        sns.append(sn)
        total_len += int(ln)
    total_sequences = len(sns)

    print "Skipped %s SQs with SNs matching regex [%s]" % (len(skip_sns), skip_sq_sn_regex)

    # Chunk the genome into genome_chunks pieces
    # weighted by both number of base pairs and number of seqs
    print "Total sequences included: %s" % (total_sequences)
    print "Total genome length: %s" % (total_len)
    # weight_seq is a module-level constant: the per-sequence weight (in points)
    # added on top of each sequence's length
    total_points = total_len + (total_sequences * weight_seq)
    chunk_points = int(total_points / genome_chunks)
    chunk_input_pdh_names = []
    print "Chunking genome into %s chunks of ~%s points" % (genome_chunks, chunk_points)
    for chunk_i in range(0, genome_chunks):
        chunk_num = chunk_i + 1
        chunk_intervals_count = 0
        chunk_input_name = dict_reader.name() + (".%s_of_%s.region_list" % (chunk_num, genome_chunks))
        print "Creating interval file for chunk %s" % chunk_num
        chunk_c = arvados.collection.CollectionWriter(num_retries=3)
        chunk_c.start_new_file(newfilename=chunk_input_name)
        # chunk_c.write(interval_header)
        remaining_points = chunk_points
        while len(sns) > 0:
            sn = sns.pop(0)
            remaining_points -= weight_seq
            if remaining_points <= 0:
                sns.insert(0, sn)
                break
            if not sn_intervals.has_key(sn):
                raise ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_points:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_points + start - 1
                assert((end - start + 1) <= remaining_points)
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            # interval = "%s\t%s\t%s\t+\t%s\n" % (sn, start, end, "interval_%s_of_%s_%s" % (chunk_num, genome_chunks, sn))
            interval = "%s\t%s\t%s\n" % (sn, start, end)
            remaining_points -= (end - start + 1)
            chunk_c.write(interval)
            chunk_intervals_count += 1
            if remaining_points <= 0:
                break
        if chunk_intervals_count > 0:
            chunk_input_pdh = chunk_c.finish()
            print "Chunk intervals file %s saved as %s" % (chunk_input_name, chunk_input_pdh)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        else:
            print "WARNING: skipping empty intervals for %s" % chunk_input_name
    print "Have %s chunk collections: [%s]" % (len(chunk_input_pdh_names), ' '.join([x[0] for x in chunk_input_pdh_names]))

    # prepare CRAM input collections
    job_input = arvados.current_job()['script_parameters']['inputs_collection']
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        # Handle this CRAM file
        crai_f = crai.get((s_name, re.sub(r'cram$', 'crai', f_name)),
                          crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise InvalidArgumentError("No corresponding CRAI file found for CRAM file %s" % f_name)
        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={"manifest_text": task_input}).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        create_chunk_tasks(f_name, chunk_input_pdh_names,
                           if_sequence, task_input_pdh, ref_input_pdh, chunk_input_pdh,
                           pool=pool)

    print "Waiting for asynchronous requests to complete"
    pool.close()
    pool.join()

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit(0)
#!/usr/bin/python

import arvados
import subprocess
import subst
import shutil
import os
import sys

if len(arvados.current_task()["parameters"]) > 0:
    p = arvados.current_task()["parameters"]
else:
    p = arvados.current_job()["script_parameters"]

t = arvados.current_task().tmpdir

os.unlink("/usr/local/share/bcbio-nextgen/galaxy")
os.mkdir("/usr/local/share/bcbio-nextgen/galaxy")
shutil.copy("/usr/local/share/bcbio-nextgen/config/bcbio_system.yaml", "/usr/local/share/bcbio-nextgen/galaxy")

with open("/usr/local/share/bcbio-nextgen/galaxy/tool_data_table_conf.xml", "w") as f:
    f.write("""<tables>
    <!-- Locations of indexes in the BWA mapper format -->
    <table name="bwa_indexes" comment_char="#">
        <columns>value, dbkey, name, path</columns>
        <file path="tool-data/bwa_index.loc" />
    </table>
    <!-- Locations of indexes in the Bowtie2 mapper format -->
    <table name="bowtie2_indexes" comment_char="#">
        <columns>value, dbkey, name, path</columns>
def one_task_per_group_combined_inputs(ref_input, job_input, interval_lists, group_by_regex, if_sequence=0, and_end_task=True, create_task_func=create_task): """ Queue one task for each group of gVCFs and corresponding interval_list in the inputs_collection, with grouping based on three things: - the stream in which the gVCFs are held within the collection - the value of the named capture group "group_by" in the group_by_regex against the filename in the inputs_collection Each new task will have an "inputs" parameter: a manifest containing a set of one or more gVCF files and its corresponding index. Each new task will also have a "ref" parameter: a manifest containing the reference files to use. Note that all gVCFs not matching the group_by_regex are ignored. if_sequence and and_end_task arguments have the same significance as in arvados.job_setup.one_task_per_input_file(). """ if if_sequence != arvados.current_task()['sequence']: return group_by_r = re.compile(group_by_regex) # prepare interval_lists il_cr = arvados.CollectionReader(interval_lists) il_ignored_files = [] interval_list_by_group = {} for s in il_cr.all_streams(): for f in s.all_files(): m = re.search(group_by_r, f.name()) if m: group_name = m.group('group_by') interval_list_m = re.search(r'\.interval_list', f.name()) if interval_list_m: if group_name not in interval_list_by_group: interval_list_by_group[group_name] = dict() interval_list_by_group[group_name][s.name(), f.name()] = f continue # if we make it this far, we have files that we are ignoring il_ignored_files.append("%s/%s" % (s.name(), f.name())) # prepare gVCF input collections cr = arvados.CollectionReader(job_input) ignored_files = [] last_stream_name = "" gvcf_by_group = {} gvcf_indices = {} for s in sorted(cr.all_streams(), key=lambda stream: stream.name()): stream_name = s.name() # handle each stream name separately if stream_name != last_stream_name: if last_stream_name != "": print "Done processing files in stream %s" % last_stream_name one_task_per_gvcf_group_in_stream_combined_inputs( last_stream_name, gvcf_by_group, gvcf_indices, interval_list_by_group, if_sequence, ref_input, create_task_func=create_task_func) # now that we are done with last_stream_name, reinitialise dicts to # process data from new stream print "Processing files in stream %s" % stream_name gvcf_by_group = {} gvcf_indices = {} last_stream_name = stream_name # loop over all the files in this stream (there may be only one) for f in s.all_files(): if re.search(r'\.tbi$', f.name()): gvcf_indices[s.name(), f.name()] = f continue m = re.search(group_by_r, f.name()) if m: group_name = m.group('group_by') gvcf_m = re.search(r'\.vcf\.gz$', f.name()) if gvcf_m: if group_name not in gvcf_by_group: gvcf_by_group[group_name] = dict() gvcf_by_group[group_name][s.name(), f.name()] = f continue interval_list_m = re.search(r'\.interval_list', f.name()) if interval_list_m: if group_name not in interval_list_by_group: interval_list_by_group[group_name] = dict() if (s.name(), f.name()) in interval_list_by_group[group_name]: if interval_list_by_group[group_name][s.name( ), f.name()].as_manifest() != f.as_manifest(): raise errors.InvalidArgumentError( "Already have interval_list for group %s file %s/%s, but manifests are not identical!" 
% (group_name, s.name(), f.name())) else: interval_list_by_group[group_name][s.name(), f.name()] = f continue # if we make it this far, we have files that we are ignoring ignored_files.append("%s/%s" % (s.name(), f.name())) # finally, process the last stream print "Processing last stream" one_task_per_gvcf_group_in_stream_combined_inputs( stream_name, gvcf_by_group, gvcf_indices, interval_list_by_group, if_sequence, ref_input, create_task_func=create_task_func) # report on any ignored files if len(ignored_files) > 0: print "WARNING: ignored non-matching files in inputs_collection: %s" % ( ' '.join(ignored_files)) # TODO: could use `setmedian` from https://github.com/ztane/python-Levenshtein # to print most representative "median" filename (i.e. skipped 15 files like median), then compare the # rest of the files to that median (perhaps with `ratio`) if and_end_task: print "Ending task %s successfully" % if_sequence arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'], body={ 'success': True }).execute() exit(0)
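The grouping function above delegates task creation to a create_task_func callback (defaulting to create_task), which is not shown in this excerpt. Below is a minimal sketch of what such a default could look like, assuming only the standard Arvados job_tasks API; it is not the actual hgi_arvados implementation.

import arvados

def create_task(sequence, parameters):
    # Register a new job task at the given sequence, carrying the per-task
    # parameters (e.g. "inputs", "ref", "name") prepared by the caller.
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': parameters,
    }
    return arvados.api().job_tasks().create(body=new_task_attrs).execute()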
def chunked_tasks_per_cram_file( ref_input, job_input, interval_lists, validate_task_output, if_sequence=0, and_end_task=True, reuse_tasks=True, reuse_tasks_retrieve_all=True, interval_list_param="interval_list", oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2', script=arvados.current_job()['script']): """ Queue one task for each cram file in this job's input collection. Each new task will have an "input" parameter: a manifest containing one .cram file and its corresponding .crai index file. Files in the input collection that are not named *.cram or *.crai (as well as *.crai files that do not match any .cram file present) are silently ignored. if_sequence and and_end_task arguments have the same significance as in arvados.job_setup.one_task_per_input_file(). """ if if_sequence != arvados.current_task()['sequence']: return # prepare interval lists cr = arvados.CollectionReader(interval_lists) chunk_interval_list = {} chunk_input_pdh_names = [] for s in cr.all_streams(): for f in s.all_files(): if re.search(r'\.interval_list$', f.name()): chunk_interval_list[s.name(), f.name()] = f for ((s_name, f_name), chunk_interval_list_f) in sorted(chunk_interval_list.items()): chunk_input = chunk_interval_list_f.as_manifest() try: r = arvados.api().collections().create(body={ "manifest_text": chunk_input }).execute() chunk_input_pdh = r["portable_data_hash"] chunk_input_name = os.path.join(s_name, f_name) chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name)) except: raise if len(chunk_input_pdh_names) == 0: raise errors.InvalidArgumentError( "No interval_list files found in %s" % (interval_lists)) # prepare CRAM input collections cr = arvados.CollectionReader(job_input) cram = {} crai = {} for s in cr.all_streams(): for f in s.all_files(): if re.search(r'\.cram$', f.name()): cram[s.name(), f.name()] = f elif re.search(r'\.crai$', f.name()): crai[s.name(), f.name()] = f for ((s_name, f_name), cram_f) in cram.items(): crai_f = crai.get( (s_name, re.sub(r'cram$', 'crai', f_name)), crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), None)) task_input = cram_f.as_manifest() if crai_f: task_input += crai_f.as_manifest() else: # no CRAI for CRAM raise errors.InvalidArgumentError( "No correponding CRAI file found for CRAM file %s" % f_name) # Create a portable data hash for the task's subcollection try: r = arvados.api().collections().create(body={ "manifest_text": task_input }).execute() task_input_pdh = r["portable_data_hash"] except: raise if reuse_tasks: task_key_params = ['input', 'ref', 'chunk'] # get candidates for task reuse job_filters = [ ['script', '=', script], ['repository', '=', arvados.current_job()['repository']], ['script_version', 'in git', oldest_git_commit_to_reuse], [ 'docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator'] ], ] if reuse_tasks_retrieve_all: # retrieve a full set of all possible reusable tasks reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params, job_filters) print "Have %s tasks for potential reuse" % ( len(reusable_tasks)) else: reusable_task_jobs = get_jobs_for_task_reuse(job_filters) print "Have %s jobs for potential task reuse" % ( len(reusable_task_jobs)) reusable_task_job_uuids = [ job['uuid'] for job in reusable_task_jobs['items'] ] for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names: # Create task for each CRAM / chunk new_task_params = { 'input': task_input_pdh, 'ref': ref_input, 'chunk': chunk_input_pdh } print "Creating new task to process %s with chunk interval %s " % ( 
f_name, chunk_input_name) if reuse_tasks: if reuse_tasks_retrieve_all: task = create_or_reuse_task(if_sequence + 1, new_task_params, reusable_tasks, task_key_params, validate_task_output) else: task = create_or_reuse_task_from_jobs( if_sequence + 1, new_task_params, reusable_task_job_uuids, task_key_params, validate_task_output) else: task = create_task(if_sequence + 1, new_task_params) if and_end_task: print "Ending task 0 successfully" arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'], body={ 'success': True }).execute() exit(0)
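chunked_tasks_per_cram_file (and the main() functions further down) take a validate_task_output callback that is not defined here. The sketch below is one plausible, assumed implementation that merely checks the output collection for a non-empty .vcf.gz file; the real validation logic is not shown in this excerpt and may differ.

import re
import arvados

def validate_task_output(output_locator):
    # Reject empty locators outright.
    if not output_locator:
        return False
    # Open the output collection and require at least one non-empty
    # .vcf.gz file before accepting the output as valid.
    cr = arvados.CollectionReader(output_locator)
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.vcf\.gz$', f.name()) and f.size() > 0:
                return True
    return False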
def main(): ################################################################################ # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on # applying the capturing group named "group_by" in group_by_regex. # (and terminate if this is task 0) ################################################################################ ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection']) job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection'] interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection'] interval_count = 1 if "interval_count" in arvados.current_job()['script_parameters']: interval_count = arvados.current_job()['script_parameters']['interval_count'] if arvados.current_task()['sequence'] == 0: # get candidates for task reuse task_key_params=['inputs', 'ref', 'name'] # N.B. inputs collection includes input vcfs and corresponding interval_list script="gatk-genotypegvcfs.py" oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2' job_filters = [ ['script', '=', script], ['repository', '=', arvados.current_job()['repository']], ['script_version', 'in git', oldest_git_commit_to_reuse], ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']], ] # retrieve a full set of all possible reusable tasks at sequence 1 print "Retrieving all potentially reusable tasks" reusable_tasks = hgi_arvados.get_reusable_tasks(1, task_key_params, job_filters) print "Have %s tasks for potential reuse" % (len(reusable_tasks)) def create_task_with_validated_reuse(sequence, params): return hgi_arvados.create_or_reuse_task(sequence, params, reusable_tasks, task_key_params, validate_task_output) # Setup sub tasks (and terminate if this is task 0) hgi_arvados.one_task_per_group_combined_inputs(ref_input_pdh, job_input_pdh, interval_lists_pdh, group_by_regex, if_sequence=0, and_end_task=True, create_task_func=create_task_with_validated_reuse) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task sequence assert(this_task['sequence'] > 0) ################################################################################ # Phase IIa: If we are a "reuse" task, just set our output and be done with it ################################################################################ if 'reuse_job_task' in this_task['parameters']: print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task'] exit(0) ################################################################################ # Phase IIb: Genotype gVCFs! ################################################################################ ref_file = gatk_helper.mount_gatk_reference(ref_param="ref") gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs") out_dir = hgi_arvados.prepare_out_dir() interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="inputs") name = this_task['parameters'].get('name') if not name: name = "unknown" out_file = name + ".vcf.gz" # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output out_file = out_file.replace(".bcf", "._cf") # GenotypeGVCFs! 
gatk_exit = gatk.genotype_gvcfs(ref_file, interval_list_file, gvcf_files, os.path.join(out_dir, out_file), cores="32", java_mem="200g") if gatk_exit != 0: print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit arvados.api().job_tasks().update(uuid=this_task['uuid'], body={'success':False} ).execute() else: print "GATK exited successfully, writing output to keep" # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep out.write_directory_tree(out_dir) # Commit the output to Keep. output_locator = out.finish() if validate_task_output(output_locator): print "Task output validated, setting output to %s" % (output_locator) # Use the resulting locator as the output for this task. this_task.set_output(output_locator) else: print "ERROR: Failed to validate task output (%s)" % (output_locator) arvados.api().job_tasks().update(uuid=this_task['uuid'], body={'success':False} ).execute()
def main(): signal(SIGINT, sigint_handler) signal(SIGTERM, sigterm_handler) this_job = arvados.current_job() skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex'] genome_chunks = int(this_job['script_parameters']['genome_chunks']) if genome_chunks < 1: raise InvalidArgumentError("genome_chunks must be a positive integer") # Setup sub tasks 1-N (and terminate if this is task 0) one_task_per_cram_file(if_sequence=0, and_end_task=True, skip_sq_sn_regex=skip_sq_sn_regex, genome_chunks=genome_chunks) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task assert (this_task['sequence'] != 0) # Get reference FASTA ref_file = None print "Mounting reference FASTA collection" ref_dir = arvados.get_task_param_mount('ref') for f in arvados.util.listdir_recursive(ref_dir): if re.search(r'\.fa$', f): ref_file = os.path.join(ref_dir, f) if ref_file is None: raise InvalidArgumentError( "No reference fasta found in reference collection.") # Ensure we can read the reference fasta test_and_prime_input_file( ref_file, error_exception=FileAccessError("reference fasta not readable: %s" % ref_file)) # Ensure we have corresponding .fai, and that it is also readable ref_fai_file = ref_file + ".fai" test_and_prime_input_file( ref_fai_file, error_exception=FileAccessError( "reference fai index not readable: %s" % ref_fai_file)) # Get genome chunk intervals file chunk_file = None print "Mounting chunk collection" chunk_dir = arvados.get_task_param_mount('chunk') for f in arvados.util.listdir_recursive(chunk_dir): if re.search(r'\.region_list$', f): chunk_file = os.path.join(chunk_dir, f) if chunk_file is None: raise InvalidArgumentError( "No chunk intervals file found in chunk collection.") # Ensure we can read the chunk file test_and_prime_input_file( chunk_file, error_exception=FileAccessError( "Chunk intervals file not readable: %s" % chunk_file)) # Get single CRAM file for this task input_dir = None print "Mounting task input collection" input_dir = arvados.get_task_param_mount('input') input_cram_files = [] stream_name = "" for f in arvados.util.listdir_recursive(input_dir): if re.search(r'\.cram$', f): stream_name, input_file_name = os.path.split(f) input_cram_files += [os.path.join(input_dir, f)] if len(input_cram_files) != 1: raise InvalidArgumentError("Expected exactly one cram file per task.") # There is only one CRAM file cram_file = input_cram_files[0] # Ensure we can read the CRAM file test_and_prime_input_file(cram_file, error_exception=FileAccessError( "CRAM file not readable: %s" % cram_file)) # Ensure we have corresponding CRAI index and can read it as well cram_file_base, cram_file_ext = os.path.splitext(cram_file) assert (cram_file_ext == ".cram") crai_file = cram_file_base + ".crai" if not test_and_prime_input_file(crai_file, error_exception=None): crai_file = cram_file_base + ".cram.crai" if not test_and_prime_input_file(crai_file, error_exception=None): raise FileAccessError( "No readable CRAM index file for CRAM file: %s" % cram_file) # Will write to out_dir, make sure it is empty tmp_dir = arvados.current_task().tmpdir out_dir = os.path.join(tmp_dir, 'out') if os.path.exists(out_dir): old_out_dir = out_dir + ".old" print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir) try: os.rename(out_dir, old_out_dir) except: raise try: os.mkdir(out_dir) os.chdir(out_dir) except: raise output_basename = os.path.basename( cram_file_base) + "." 
+ os.path.basename(chunk_file) out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp") penultimate_out_file = os.path.join( tmp_dir, output_basename + ".provheader.g.vcf.gz") final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz") # bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file) regions = [] print "Preparing region list from chunk file [%s]" % chunk_file with open(chunk_file, 'r') as f: for line in f.readlines(): (chr, start, end) = line.rstrip().split() region = "%s:%s-%s" % (chr, start, end) regions.append(region) total_region_count = len(regions) print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count concat_noheader_fifos = dict() concat_headeronly_tmps = dict() current_region_num = 0 for region in regions: current_region_num += 1 concat_noheader_fifo = os.path.join( tmp_dir, output_basename + (".part_%s_of_%s.g.vcf" % (current_region_num, total_region_count))) try: os.mkfifo(concat_noheader_fifo, 0600) except: print "ERROR: could not mkfifo %s" % concat_noheader_fifo raise fifos_to_delete.append(concat_noheader_fifo) concat_noheader_fifos[region] = concat_noheader_fifo concat_headeronly_tmp = os.path.join( tmp_dir, output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" % (current_region_num, total_region_count))) concat_headeronly_tmps[region] = concat_headeronly_tmp region_concat_cmd = ["cat"] region_concat_cmd.extend( [concat_noheader_fifos[region] for region in regions]) # open file for output file out_file_tmp_f = open(out_file_tmp, 'wb') region_concat_p = run_child_cmd(region_concat_cmd, stdout=out_file_tmp_f, tag="bcftools concat (stderr)") current_region_num = 0 current_concat_noheader_fifo_f = None regions_to_process = list(regions) bcftools_mpileup_p = None bcftools_norm_p = None part_tee_p = None bcftools_view_headeronly_p = None bcftools_view_noheader_p = None while True: # at least one of the regional aggregation processes is still running watch_fds_and_print_output() if ((bcftools_mpileup_p is None) and (bcftools_norm_p is None) and (part_tee_p is None) and (bcftools_view_headeronly_p is None) and (bcftools_view_noheader_p is None)): # no per-region processes are running (they have finished or # have not yet started) if len(regions_to_process) > 0: # have more regions to run region = regions_to_process.pop(0) current_region_num += 1 region_label = "%s/%s [%s]" % (current_region_num, total_region_count, region) concat_noheader_fifo = concat_noheader_fifos[region] bcftools_view_noheader_input_fifo = os.path.join( tmp_dir, output_basename + (".part_%s_of_%s.noheader.g.bcf" % (current_region_num, total_region_count))) part_tee_cmd = [ "teepot", bcftools_view_noheader_input_fifo, "-" ] bcftools_view_noheader_cmd = [ "bcftools", "view", "-H", "-Ov", bcftools_view_noheader_input_fifo ] concat_headeronly_tmp = concat_headeronly_tmps[region] bcftools_view_headeronly_cmd = [ "bcftools", "view", "-h", "-Oz", "-o", concat_headeronly_tmp ] bcftools_norm_cmd = ["bcftools", "norm", "-f", ref_file, "-Ou"] bcftools_mpileup_cmd = [ "bcftools-gvcf", "mpileup", "-t", "AD,INFO/AD", "-C50", "-pm2", "-F0.1", "-d10000", "--gvcf", "1,2,3,4,5,10,15", "-f", ref_file, "-Ou", "-r", region, cram_file ] print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % ( region_label) 
bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe( ) print "Creating 'bcftools norm | tee' pipe for region %s" % ( region_label) part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe() print "Creating 'tee | bcftools view -h' pipe for region %s" % ( region_label) bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe( ) print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % ( bcftools_view_noheader_input_fifo, region_label) try: os.mkfifo(bcftools_view_noheader_input_fifo, 0600) except: print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo raise fifos_to_delete.append(bcftools_view_noheader_input_fifo) print "Opening concat fifo %s for writing" % concat_noheader_fifo if current_concat_noheader_fifo_f is not None: #print "ERROR: current_concat_noheader_fifo_f was not closed properly" #raise Exception("current_concat_noheader_fifo_f was not closed properly") current_concat_noheader_fifo_f.close() current_concat_noheader_fifo_f = open(concat_noheader_fifo, 'wb') bcftools_mpileup_p = run_child_cmd( bcftools_mpileup_cmd, stdout=bcftools_norm_stdin_pipe_write, tag="bcftools mpileup %s" % (region_label)) bcftools_norm_p = run_child_cmd( bcftools_norm_cmd, stdin=bcftools_norm_stdin_pipe_read, stdout=part_tee_stdin_pipe_write, tag="bcftools norm %s" % (region_label)) part_tee_p = run_child_cmd( part_tee_cmd, stdin=part_tee_stdin_pipe_read, stdout=bcftools_view_headeronly_stdin_pipe_write, tag="tee %s" % (region_label)) bcftools_view_headeronly_p = run_child_cmd( bcftools_view_headeronly_cmd, stdin=bcftools_view_headeronly_stdin_pipe_read, tag="bcftools view -h %s" % (region_label)) bcftools_view_noheader_p = run_child_cmd( bcftools_view_noheader_cmd, stdout=current_concat_noheader_fifo_f, tag="bcftools view %s" % (region_label)) bcftools_mpileup_p = close_process_if_finished( bcftools_mpileup_p, "bcftools mpileup %s" % (region_label), close_fds=[bcftools_norm_stdin_pipe_write]) bcftools_norm_p = close_process_if_finished( bcftools_norm_p, "bcftools norm %s" % (region_label), close_fds=[ bcftools_norm_stdin_pipe_read, part_tee_stdin_pipe_write ]) part_tee_p = close_process_if_finished( part_tee_p, "tee %s" % (region_label), close_fds=[ part_tee_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write ], ignore_error=True) bcftools_view_headeronly_p = close_process_if_finished( bcftools_view_headeronly_p, "bcftools view -h %s" % (region_label), close_fds=[bcftools_view_headeronly_stdin_pipe_read]) bcftools_view_noheader_p = close_process_if_finished( bcftools_view_noheader_p, "bcftools view %s" % (region_label), close_files=[current_concat_noheader_fifo_f]) region_concat_p = close_process_if_finished( region_concat_p, "bcftools concat", close_files=[out_file_tmp_f]) # end loop once all processes have finished if ((region_concat_p is None) and (bcftools_view_noheader_p is None) and (bcftools_view_headeronly_p is None) and (part_tee_p is None) and (bcftools_norm_p is None) and (bcftools_mpileup_p is None)): print "All region work has completed" break else: sleep(0.01) # continue to next loop iteration if len(child_pids) > 0: print "WARNING: some children are still alive: [%s]" % (child_pids) for pid in child_pids: print "Attempting to terminate %s forcefully" % (pid) try: os.kill(pid, SIGTERM) except Exception as e: print "Could not kill pid %s: %s" % (pid, e) for fifo in fifos_to_delete: try: os.remove(fifo) except: raise concat_headeronly_tmp_fofn = os.path.join(tmp_dir, output_basename + 
".fifos_fofn") tmp_files_to_delete = [] print "Preparing fofn for bcftools concat (headeronly): %s" % ( concat_headeronly_tmp_fofn) with open(concat_headeronly_tmp_fofn, 'w') as f: print "Checking files for regions: %s" % regions for concat_headeronly_tmp in [ concat_headeronly_tmps[region] for region in regions ]: if os.path.exists(concat_headeronly_tmp): print "Adding %s to fofn" % concat_headeronly_tmp f.write("%s\n" % concat_headeronly_tmp) tmp_files_to_delete.append(concat_headeronly_tmp) else: print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp final_headeronly_tmp = os.path.join(tmp_dir, output_basename + ".headeronly.g.vcf") final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb') print "Creating 'bcftools concat | grep' pipe" grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe( ) grep_headeronly_cmd = [ "egrep", "-v", "^[#][#](bcftools|mpileup|reference)" ] grep_headeronly_p = run_child_cmd(grep_headeronly_cmd, stdin=grep_headeronly_stdin_pipe_read, stdout=final_headeronly_tmp_f, tag="grep (headeronly)") bcftools_concat_headeronly_cmd = [ "bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn ] bcftools_concat_headeronly_p = run_child_cmd( bcftools_concat_headeronly_cmd, stdout=grep_headeronly_stdin_pipe_write, tag="bcftools concat (headeronly)") while True: watch_fds_and_print_output() bcftools_concat_headeronly_p = close_process_if_finished( bcftools_concat_headeronly_p, "bcftools concat (headeronly)", close_fds=[grep_headeronly_stdin_pipe_write]) grep_headeronly_p = close_process_if_finished( grep_headeronly_p, "grep (headeronly)", close_fds=[grep_headeronly_stdin_pipe_read], close_files=[final_headeronly_tmp_f]) if ((bcftools_concat_headeronly_p is None) and (grep_headeronly_p is None)): # none of the processes are still running, we're done! break else: sleep(0.01) # continue to next loop iteration if bcftools_concat_headeronly_p is not None: print "ERROR: failed to cleanly terminate bcftools concat (headeronly)" if grep_headeronly_p is not None: print "ERROR: failed to cleanly terminate grep (headeronly)" # check if there was any data output if os.stat(out_file_tmp)[6] == 0: # 0-byte data file, there is no point in concatenating and # reheader will reject the file, so we need to bgzip it ourselves print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % ( final_out_file) final_out_file_f = open(final_out_file, 'wb') final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp] final_bgzip_p = run_child_cmd(final_bgzip_cmd, tag="final bgzip", stdout=final_out_file_f) while True: watch_fds_and_print_output() final_bgzip_p = close_process_if_finished( final_bgzip_p, "final bgzip", close_files=[final_out_file_f]) if (final_bgzip_p is None): # none of the processes are still running, we're done! 
break else: sleep(0.01) # continue to next loop iteration if final_bgzip_p is not None: print "ERROR: failed to cleanly terminate final bgzip (header with no data)" else: # there is some data in the data file print "Creating final 'cat | bcftools view -Oz' pipe" final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe( ) print "Preparing penultimate output file [%s]" % (penultimate_out_file) final_bcftools_view_cmd = [ "bcftools", "view", "-Oz", "-o", penultimate_out_file ] final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp] final_bcftools_view_p = run_child_cmd( final_bcftools_view_cmd, tag="final bcftools view -Oz", stdin=final_bcftools_view_stdin_pipe_read) final_concat_p = run_child_cmd( final_concat_cmd, tag="final cat (header+data)", stdout=final_bcftools_view_stdin_pipe_write) while True: watch_fds_and_print_output() final_bcftools_view_p = close_process_if_finished( final_bcftools_view_p, "final bcftools view -Oz", close_fds=[final_bcftools_view_stdin_pipe_read]) final_concat_p = close_process_if_finished( final_concat_p, "final cat (header+data)", close_fds=[final_bcftools_view_stdin_pipe_write]) if ((final_concat_p is None) and (final_bcftools_view_p is None)): # none of the processes are still running, we're done! break else: sleep(0.01) # continue to next loop iteration if final_bcftools_view_p is not None: print "ERROR: failed to cleanly terminate final bcftools view -Oz" if final_concat_p is not None: print "ERROR: failed to cleanly terminate final cat (header+data)" print "Reheadering penultimate output file into final out file [%s]" % ( final_out_file) final_bcftools_reheader_cmd = [ "bcftools", "reheader", "-h", final_headeronly_tmp, "-o", final_out_file, penultimate_out_file ] final_bcftools_reheader_p = run_child_cmd( final_bcftools_reheader_cmd, tag="final bcftools reheader") while True: watch_fds_and_print_output() final_bcftools_reheader_p = close_process_if_finished( final_bcftools_reheader_p, "final bcftools reheader") if (final_bcftools_reheader_p is None): # none of the processes are still running, we're done! break else: sleep(0.01) # continue to next loop iteration if final_bcftools_reheader_p is not None: print "ERROR: failed to cleanly terminate final bcftools reheader" os.remove(penultimate_out_file) print "Indexing final output file [%s]" % (final_out_file) bcftools_index_cmd = ["bcftools", "index", final_out_file] bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index") while True: watch_fds_and_print_output() bcftools_index_p = close_process_if_finished(bcftools_index_p, "bcftools index") if (bcftools_index_p is None): break else: sleep(0.01) # continue to next loop iteration if bcftools_index_p is not None: print "ERROR: failed to cleanly terminate bcftools index" print "Complete, removing temporary files" os.remove(concat_headeronly_tmp_fofn) os.remove(out_file_tmp) os.remove(final_headeronly_tmp) for tmp_file in tmp_files_to_delete: os.remove(tmp_file) # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name) out.write_directory_tree(out_dir, stream_name) # Commit the output to Keep. output_locator = out.finish() print "Task output locator [%s]" % output_locator # Use the resulting locator as the output for this task. this_task.set_output(output_locator) # Done! print "Task complete!"
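The region pipeline above leans on run_child_cmd, close_process_if_finished and a shared child_pids set, none of which appear in this excerpt. The following is a rough, assumed sketch of helpers with matching call signatures built on subprocess.Popen; the real helpers also feed child stderr into watch_fds_and_print_output, which is omitted here.

import os
import subprocess

child_pids = set()

def run_child_cmd(cmd, stdin=None, stdout=None, tag="child command"):
    # Start a child process, remember its pid so it can be killed on
    # shutdown, and capture stderr for the output watcher to drain.
    print "Starting [%s]: %s" % (tag, ' '.join(cmd))
    p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout,
                         stderr=subprocess.PIPE)
    child_pids.add(p.pid)
    return p

def close_process_if_finished(p, tag="child command", close_fds=None,
                              close_files=None, ignore_error=False):
    # Return the process unchanged while it is still running; once it has
    # exited, close the pipes/files it was using and return None.
    if p is None or p.poll() is None:
        return p
    child_pids.discard(p.pid)
    for fd in (close_fds or []):
        os.close(fd)
    for f in (close_files or []):
        f.close()
    if p.returncode != 0 and not ignore_error:
        print "WARNING: %s exited with code %s" % (tag, p.returncode)
    return None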
#!/usr/bin/env python
# Import the hashlib module (part of the Python standard library) to compute md5.
import hashlib

# Import the Arvados sdk module
import arvados

# Get information about the task from the environment
this_task = arvados.current_task()

# Get the "input" field from "script_parameters" on the job creation object
this_job_input = arvados.getjobparam('input')

# Create the object access to the collection referred to in the input
collection = arvados.CollectionReader(this_job_input)

# Create an object to write a new collection as output
out = arvados.CollectionWriter()

# Set the name of output file within the collection
out.set_current_file_name("md5sum.txt")

# Get an iterator over the files listed in the collection
all_files = collection.all_files()

# Iterate over each file
for input_file in all_files:
    # Create the object that will actually compute the md5 hash
    digestor = hashlib.new('md5')
    # Feed the file contents to the digestor one chunk at a time
    for buf in input_file.readall():
        digestor.update(buf)
    # Write one "<hash> <stream>/<filename>" line per input file
    out.write("%s %s/%s\n" % (digestor.hexdigest(),
                              input_file.stream_name(), input_file.name()))

# Commit the output collection to Keep and record it as this task's output
output_locator = out.finish()
this_task.set_output(output_locator)
def one_task_per_cram_file(if_sequence=0, and_end_task=True, skip_sq_sn_regex='_decoy$', genome_chunks=200): """ Queue one task for each cram file in this job's input collection. Each new task will have an "input" parameter: a manifest containing one .cram file and its corresponding .crai index file. Files in the input collection that are not named *.cram or *.crai (as well as *.crai files that do not match any .cram file present) are silently ignored. if_sequence and and_end_task arguments have the same significance as in arvados.job_setup.one_task_per_input_file(). """ if if_sequence != arvados.current_task()['sequence']: return # setup multiprocessing pool pool_processes = cpu_count() - 1 print 'Using %d processes to submit tasks\n' % pool_processes pool = Pool(processes=pool_processes) skip_sq_sn_r = re.compile(skip_sq_sn_regex) # Ensure we have a .fa reference file with corresponding .fai index and .dict reference_coll = arvados.current_job( )['script_parameters']['reference_collection'] rcr = arvados.CollectionReader(reference_coll) ref_fasta = {} ref_fai = {} ref_dict = {} ref_input = None dict_reader = None for rs in rcr.all_streams(): for rf in rs.all_files(): if re.search(r'\.fa$', rf.name()): ref_fasta[rs.name(), rf.name()] = rf elif re.search(r'\.fai$', rf.name()): ref_fai[rs.name(), rf.name()] = rf elif re.search(r'\.dict$', rf.name()): ref_dict[rs.name(), rf.name()] = rf for ((s_name, f_name), fasta_f) in ref_fasta.items(): fai_f = ref_fai.get( (s_name, re.sub(r'fa$', 'fai', f_name)), ref_fai.get((s_name, re.sub(r'fa$', 'fa.fai', f_name)), None)) dict_f = ref_dict.get( (s_name, re.sub(r'fa$', 'dict', f_name)), ref_dict.get((s_name, re.sub(r'fa$', 'fa.dict', f_name)), None)) if fasta_f and fai_f and dict_f: # found a set of all three! ref_input = fasta_f.as_manifest() ref_input += fai_f.as_manifest() ref_input += dict_f.as_manifest() dict_reader = dict_f break if ref_input is None: raise InvalidArgumentError( "Expected a reference fasta with fai and dict in reference_collection. Found [%s]" % ' '.join(rf.name() for rf in rs.all_files())) if dict_reader is None: raise InvalidArgumentError( "Could not find .dict file in reference_collection. 
Found [%s]" % ' '.join(rf.name() for rf in rs.all_files())) # Create a portable data hash for the ref_input manifest try: r = arvados.api().collections().create(body={ "manifest_text": ref_input }).execute() ref_input_pdh = r["portable_data_hash"] except: raise # Load the dict data interval_header = "" dict_lines = dict_reader.readlines() dict_header = dict_lines.pop(0) if re.search(r'^@HD', dict_header) is None: raise InvalidArgumentError( "Dict file in reference collection does not have correct header: [%s]" % dict_header) interval_header += dict_header print "Dict header is %s" % dict_header sn_intervals = dict() sns = [] skip_sns = [] total_len = 0 for sq in dict_lines: if re.search(r'^@SQ', sq) is None: raise InvalidArgumentError( "Dict file contains malformed SQ line: [%s]" % sq) interval_header += sq sn = None ln = None for tagval in sq.split("\t"): tv = tagval.split(":", 1) if tv[0] == "SN": sn = tv[1] if tv[0] == "LN": ln = tv[1] if sn and ln: break if not (sn and ln): raise InvalidArgumentError( "Dict file SQ entry missing required SN and/or LN parameters: [%s]" % sq) assert (sn and ln) if sn_intervals.has_key(sn): raise InvalidArgumentError( "Dict file has duplicate SQ entry for SN %s: [%s]" % (sn, sq)) if skip_sq_sn_r.search(sn): skip_sns.append(sn) continue sn_intervals[sn] = (1, int(ln)) sns.append(sn) total_len += int(ln) total_sequences = len(sns) print "Skipped %s SQs with SNs matching regex [%s]" % (len(skip_sns), skip_sq_sn_regex) # Chunk the genome into genome_chunks pieces # weighted by both number of base pairs and number of seqs print "Total sequences included: %s" % (total_sequences) print "Total genome length: %s" % (total_len) total_points = total_len + (total_sequences * weight_seq) chunk_points = int(total_points / genome_chunks) chunk_input_pdh_names = [] print "Chunking genome into %s chunks of ~%s points" % (genome_chunks, chunk_points) for chunk_i in range(0, genome_chunks): chunk_num = chunk_i + 1 chunk_intervals_count = 0 chunk_input_name = dict_reader.name() + (".%s_of_%s.region_list" % (chunk_num, genome_chunks)) print "Creating interval file for chunk %s" % chunk_num chunk_c = arvados.collection.CollectionWriter(num_retries=3) chunk_c.start_new_file(newfilename=chunk_input_name) # chunk_c.write(interval_header) remaining_points = chunk_points while len(sns) > 0: sn = sns.pop(0) remaining_points -= weight_seq if remaining_points <= 0: sns.insert(0, sn) break if not sn_intervals.has_key(sn): raise ValueError("sn_intervals missing entry for sn [%s]" % sn) start, end = sn_intervals[sn] if (end - start + 1) > remaining_points: # not enough space for the whole sq, split it real_end = end end = remaining_points + start - 1 assert ((end - start + 1) <= remaining_points) sn_intervals[sn] = (end + 1, real_end) sns.insert(0, sn) #interval = "%s\t%s\t%s\t+\t%s\n" % (sn, start, end, "interval_%s_of_%s_%s" % (chunk_num, genome_chunks, sn)) interval = "%s\t%s\t%s\n" % (sn, start, end) remaining_points -= (end - start + 1) chunk_c.write(interval) chunk_intervals_count += 1 if remaining_points <= 0: break if chunk_intervals_count > 0: chunk_input_pdh = chunk_c.finish() print "Chunk intervals file %s saved as %s" % (chunk_input_name, chunk_input_pdh) chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name)) else: print "WARNING: skipping empty intervals for %s" % chunk_input_name print "Have %s chunk collections: [%s]" % (len( chunk_input_pdh_names), ' '.join([x[0] for x in chunk_input_pdh_names])) # prepare CRAM input collections job_input = 
arvados.current_job()['script_parameters']['inputs_collection']
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        # Handle this CRAM file
        crai_f = crai.get(
            (s_name, re.sub(r'cram$', 'crai', f_name)),
            crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise InvalidArgumentError(
                "No corresponding CRAI file found for CRAM file %s" % f_name)
        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={
                "manifest_text": task_input
            }).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise
        create_chunk_tasks(f_name, chunk_input_pdh_names, if_sequence,
                           task_input_pdh, ref_input_pdh, chunk_input_pdh,
                           pool=pool)
    print "Waiting for asynchronous requests to complete"
    pool.close()
    pool.join()
    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
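one_task_per_cram_file hands task submission to create_chunk_tasks, which is not included in this excerpt. A simplified sketch follows, assuming it only queues one job task per (CRAM, chunk) pair with the input/ref/chunk parameters used elsewhere in these scripts, optionally via the multiprocessing pool; the helper name _submit_job_task is introduced here purely for illustration.

import arvados

def _submit_job_task(new_task_attrs):
    # Runs in a pool worker: create one job task via the Arvados API.
    arvados.api().job_tasks().create(body=new_task_attrs).execute()

def create_chunk_tasks(f_name, chunk_input_pdh_names, if_sequence,
                       task_input_pdh, ref_input_pdh, chunk_input_pdh,
                       pool=None):
    # chunk_input_pdh is accepted for signature compatibility but unused here.
    print "Queueing %d chunk tasks for CRAM %s" % (len(chunk_input_pdh_names),
                                                   f_name)
    for chunk_pdh, chunk_name in chunk_input_pdh_names:
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input_pdh,
                'ref': ref_input_pdh,
                'chunk': chunk_pdh,
            },
        }
        if pool is not None:
            # Submit the API call asynchronously on the worker pool.
            pool.apply_async(_submit_job_task, (new_task_attrs,))
        else:
            _submit_job_task(new_task_attrs)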
import arvados as arv
import subprocess as sp

# Gather basic information about the compute node for debugging and store it
# in a Keep collection as this task's output.
job = arv.current_job()
task = arv.current_task()

of = arv.CollectionWriter()
of.set_current_file_name("info.log")

whoinfo = sp.check_output(["whoami"])
of.write("user: " + whoinfo + "\n")

pwdinfo = sp.check_output(["pwd"])
of.write("pwd: " + pwdinfo + "\n")

lsinfo = sp.check_output(["ls", "-lahR"])
of.write("directory structure:\n" + lsinfo)

dfinfo = sp.check_output(["df", "-h"])
of.write("df:\n" + dfinfo + "\n")

meminfo = sp.check_output(["free", "-hm"])
of.write("mem:\n" + meminfo + "\n")

hostinfo = sp.check_output(["hostname"])
of.write("host: " + hostinfo)

# Commit the collection to Keep and record it as the task output.
task.set_output(of.finish())
def main(): ################################################################################ # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on # applying the capturing group named "group_by" in group_by_regex. # (and terminate if this is task 0) ################################################################################ ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection']) job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection'] interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection'] interval_count = 1 if "interval_count" in arvados.current_job()['script_parameters']: interval_count = arvados.current_job()['script_parameters']['interval_count'] # Setup sub tasks 1-N (and terminate if this is task 0) hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh, job_input_pdh, interval_lists_pdh, group_by_regex, max_gvcfs_to_combine, if_sequence=0, and_end_task=True) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task sequence assert(this_task['sequence'] > 0) ################################################################################ # Phase II: Read interval_list and split into additional intervals ################################################################################ hgi_arvados.one_task_per_interval(interval_count, validate_task_output, reuse_tasks=True, oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1", if_sequence=1, and_end_task=True) # We will never reach this point if we are in the 1st task sequence assert(this_task['sequence'] > 1) ################################################################################ # Phase IIIa: If we are a "reuse" task, just set our output and be done with it ################################################################################ if 'reuse_job_task' in this_task['parameters']: print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task'] exit(0) ################################################################################ # Phase IIIb: Combine gVCFs! ################################################################################ ref_file = gatk_helper.mount_gatk_reference(ref_param="ref") gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs") out_dir = hgi_arvados.prepare_out_dir() name = this_task['parameters'].get('name') if not name: name = "unknown" interval_str = this_task['parameters'].get('interval') if not interval_str: interval_str = "" interval_strs = interval_str.split() intervals = [] for interval in interval_strs: intervals.extend(["--intervals", interval]) out_file = name + ".vcf.gz" if interval_count > 1: out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz" if len(out_file) > 255: out_file = name + "." + '_'.join([interval_strs[0], interval_strs[-1]]) + ".vcf.gz" print "Output file name was too long with full interval list, shortened it to: %s" % out_file if len(out_file) > 255: raise errors.InvalidArgumentError("Output file name is too long, cannot continue: %s" % out_file) # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output out_file = out_file.replace(".bcf", "._cf") # CombineGVCFs! 
extra_args = intervals extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"]) gatk_exit = gatk.combine_gvcfs(ref_file, gvcf_files, os.path.join(out_dir, out_file), extra_gatk_args=extra_args) if gatk_exit != 0: print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit arvados.api().job_tasks().update(uuid=this_task['uuid'], body={'success':False} ).execute() else: print "GATK exited successfully, writing output to keep" # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep out.write_directory_tree(out_dir) # Commit the output to Keep. output_locator = out.finish() if validate_task_output(output_locator): print "Task output validated, setting output to %s" % (output_locator) # Use the resulting locator as the output for this task. this_task.set_output(output_locator) else: print "ERROR: Failed to validate task output (%s)" % (output_locator) arvados.api().job_tasks().update(uuid=this_task['uuid'], body={'success':False} ).execute()
def main(): signal(SIGINT, sigint_handler) signal(SIGTERM, sigterm_handler) this_job = arvados.current_job() skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex'] genome_chunks = int(this_job['script_parameters']['genome_chunks']) if genome_chunks < 1: raise InvalidArgumentError("genome_chunks must be a positive integer") # Setup sub tasks 1-N (and terminate if this is task 0) one_task_per_cram_file(if_sequence=0, and_end_task=True, skip_sq_sn_regex=skip_sq_sn_regex, genome_chunks=genome_chunks) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task assert(this_task['sequence'] != 0) # Get reference FASTA ref_file = None print "Mounting reference FASTA collection" ref_dir = arvados.get_task_param_mount('ref') for f in arvados.util.listdir_recursive(ref_dir): if re.search(r'\.fa$', f): ref_file = os.path.join(ref_dir, f) if ref_file is None: raise InvalidArgumentError("No reference fasta found in reference collection.") # Ensure we can read the reference fasta test_and_prime_input_file(ref_file, error_exception=FileAccessError("reference fasta not readable: %s" % ref_file)) # Ensure we have corresponding .fai, and that it is also readable ref_fai_file = ref_file + ".fai" test_and_prime_input_file(ref_fai_file, error_exception=FileAccessError("reference fai index not readable: %s" % ref_fai_file)) # Get genome chunk intervals file chunk_file = None print "Mounting chunk collection" chunk_dir = arvados.get_task_param_mount('chunk') for f in arvados.util.listdir_recursive(chunk_dir): if re.search(r'\.region_list$', f): chunk_file = os.path.join(chunk_dir, f) if chunk_file is None: raise InvalidArgumentError("No chunk intervals file found in chunk collection.") # Ensure we can read the chunk file test_and_prime_input_file(chunk_file, error_exception=FileAccessError("Chunk intervals file not readable: %s" % chunk_file)) # Get single CRAM file for this task input_dir = None print "Mounting task input collection" input_dir = arvados.get_task_param_mount('input') input_cram_files = [] stream_name = "" for f in arvados.util.listdir_recursive(input_dir): if re.search(r'\.cram$', f): stream_name, input_file_name = os.path.split(f) input_cram_files += [os.path.join(input_dir, f)] if len(input_cram_files) != 1: raise InvalidArgumentError("Expected exactly one cram file per task.") # There is only one CRAM file cram_file = input_cram_files[0] # Ensure we can read the CRAM file test_and_prime_input_file(cram_file, error_exception=FileAccessError("CRAM file not readable: %s" % cram_file)) # Ensure we have corresponding CRAI index and can read it as well cram_file_base, cram_file_ext = os.path.splitext(cram_file) assert(cram_file_ext == ".cram") crai_file = cram_file_base + ".crai" if not test_and_prime_input_file(crai_file, error_exception=None): crai_file = cram_file_base + ".cram.crai" if not test_and_prime_input_file(crai_file, error_exception=None): raise FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file) # Will write to out_dir, make sure it is empty tmp_dir = arvados.current_task().tmpdir out_dir = os.path.join(tmp_dir, 'out') if os.path.exists(out_dir): old_out_dir = out_dir + ".old" print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir) try: os.rename(out_dir, old_out_dir) except: raise try: os.mkdir(out_dir) os.chdir(out_dir) except: raise output_basename = os.path.basename(cram_file_base) + "." 
+ os.path.basename(chunk_file) out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp") penultimate_out_file = os.path.join(tmp_dir, output_basename + ".provheader.g.vcf.gz") final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz") # bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file) regions = [] print "Preparing region list from chunk file [%s]" % chunk_file with open(chunk_file, 'r') as f: for line in f.readlines(): (chr, start, end) = line.rstrip().split() region = "%s:%s-%s" % (chr, start, end) regions.append(region) total_region_count = len(regions) print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count concat_noheader_fifos = dict() concat_headeronly_tmps = dict() current_region_num = 0 for region in regions: current_region_num += 1 concat_noheader_fifo = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.g.vcf" % (current_region_num, total_region_count))) try: os.mkfifo(concat_noheader_fifo, 0600) except: print "ERROR: could not mkfifo %s" % concat_noheader_fifo raise fifos_to_delete.append(concat_noheader_fifo) concat_noheader_fifos[region] = concat_noheader_fifo concat_headeronly_tmp = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" % (current_region_num, total_region_count))) concat_headeronly_tmps[region] = concat_headeronly_tmp region_concat_cmd = ["cat"] region_concat_cmd.extend([concat_noheader_fifos[region] for region in regions]) # open file for output file out_file_tmp_f = open(out_file_tmp, 'wb') region_concat_p = run_child_cmd(region_concat_cmd, stdout=out_file_tmp_f, tag="bcftools concat (stderr)") current_region_num = 0 current_concat_noheader_fifo_f = None regions_to_process = list(regions) bcftools_mpileup_p = None bcftools_norm_p = None part_tee_p = None bcftools_view_headeronly_p = None bcftools_view_noheader_p = None while True: # at least one of the regional aggregation processes is still running watch_fds_and_print_output() if ( (bcftools_mpileup_p is None) and (bcftools_norm_p is None) and (part_tee_p is None) and (bcftools_view_headeronly_p is None) and (bcftools_view_noheader_p is None) ): # no per-region processes are running (they have finished or # have not yet started) if len(regions_to_process) > 0: # have more regions to run region = regions_to_process.pop(0) current_region_num += 1 region_label = "%s/%s [%s]" % (current_region_num, total_region_count, region) concat_noheader_fifo = concat_noheader_fifos[region] bcftools_view_noheader_input_fifo = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.noheader.g.bcf" % (current_region_num, total_region_count))) part_tee_cmd = ["teepot", bcftools_view_noheader_input_fifo, "-"] bcftools_view_noheader_cmd = ["bcftools", "view", "-H", "-Ov", bcftools_view_noheader_input_fifo] concat_headeronly_tmp = concat_headeronly_tmps[region] bcftools_view_headeronly_cmd = ["bcftools", "view", "-h", "-Oz", "-o", concat_headeronly_tmp] bcftools_norm_cmd = ["bcftools", "norm", "-f", ref_file, "-Ou"] bcftools_mpileup_cmd = ["bcftools-gvcf", "mpileup", "-t", "AD,INFO/AD", "-C50", "-pm2", "-F0.1", "-d10000", "--gvcf", "1,2,3,4,5,10,15", "-f", ref_file, "-Ou", "-r", region, cram_file] print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % (region_label) 
bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe() print "Creating 'bcftools norm | tee' pipe for region %s" % (region_label) part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe() print "Creating 'tee | bcftools view -h' pipe for region %s" % (region_label) bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe() print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % (bcftools_view_noheader_input_fifo, region_label) try: os.mkfifo(bcftools_view_noheader_input_fifo, 0600) except: print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo raise fifos_to_delete.append(bcftools_view_noheader_input_fifo) print "Opening concat fifo %s for writing" % concat_noheader_fifo if current_concat_noheader_fifo_f is not None: #print "ERROR: current_concat_noheader_fifo_f was not closed properly" #raise Exception("current_concat_noheader_fifo_f was not closed properly") current_concat_noheader_fifo_f.close() current_concat_noheader_fifo_f = open(concat_noheader_fifo, 'wb') bcftools_mpileup_p = run_child_cmd(bcftools_mpileup_cmd, stdout=bcftools_norm_stdin_pipe_write, tag="bcftools mpileup %s" % (region_label)) bcftools_norm_p = run_child_cmd(bcftools_norm_cmd, stdin=bcftools_norm_stdin_pipe_read, stdout=part_tee_stdin_pipe_write, tag="bcftools norm %s" % (region_label)) part_tee_p = run_child_cmd(part_tee_cmd, stdin=part_tee_stdin_pipe_read, stdout=bcftools_view_headeronly_stdin_pipe_write, tag="tee %s" % (region_label)) bcftools_view_headeronly_p = run_child_cmd(bcftools_view_headeronly_cmd, stdin=bcftools_view_headeronly_stdin_pipe_read, tag="bcftools view -h %s" % (region_label)) bcftools_view_noheader_p = run_child_cmd(bcftools_view_noheader_cmd, stdout=current_concat_noheader_fifo_f, tag="bcftools view %s" % (region_label)) bcftools_mpileup_p = close_process_if_finished(bcftools_mpileup_p, "bcftools mpileup %s" % (region_label), close_fds=[bcftools_norm_stdin_pipe_write]) bcftools_norm_p = close_process_if_finished(bcftools_norm_p, "bcftools norm %s" % (region_label), close_fds=[bcftools_norm_stdin_pipe_read, part_tee_stdin_pipe_write]) part_tee_p = close_process_if_finished(part_tee_p, "tee %s" % (region_label), close_fds=[part_tee_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write], ignore_error=True) bcftools_view_headeronly_p = close_process_if_finished(bcftools_view_headeronly_p, "bcftools view -h %s" % (region_label), close_fds=[bcftools_view_headeronly_stdin_pipe_read]) bcftools_view_noheader_p = close_process_if_finished(bcftools_view_noheader_p, "bcftools view %s" % (region_label), close_files=[current_concat_noheader_fifo_f]) region_concat_p = close_process_if_finished(region_concat_p, "bcftools concat", close_files=[out_file_tmp_f]) # end loop once all processes have finished if ( (region_concat_p is None) and (bcftools_view_noheader_p is None) and (bcftools_view_headeronly_p is None) and (part_tee_p is None) and (bcftools_norm_p is None) and (bcftools_mpileup_p is None) ): print "All region work has completed" break else: sleep(0.01) # continue to next loop iteration if len(child_pids) > 0: print "WARNING: some children are still alive: [%s]" % (child_pids) for pid in child_pids: print "Attempting to terminate %s forcefully" % (pid) try: os.kill(pid, SIGTERM) except Exception as e: print "Could not kill pid %s: %s" % (pid, e) for fifo in fifos_to_delete: try: os.remove(fifo) except: raise concat_headeronly_tmp_fofn = os.path.join(tmp_dir, output_basename + ".fifos_fofn") 
tmp_files_to_delete = [] print "Preparing fofn for bcftools concat (headeronly): %s" % (concat_headeronly_tmp_fofn) with open(concat_headeronly_tmp_fofn, 'w') as f: print "Checking files for regions: %s" % regions for concat_headeronly_tmp in [concat_headeronly_tmps[region] for region in regions]: if os.path.exists(concat_headeronly_tmp): print "Adding %s to fofn" % concat_headeronly_tmp f.write("%s\n" % concat_headeronly_tmp) tmp_files_to_delete.append(concat_headeronly_tmp) else: print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp final_headeronly_tmp = os.path.join(tmp_dir, output_basename + ".headeronly.g.vcf") final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb') print "Creating 'bcftools concat | grep' pipe" grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe() grep_headeronly_cmd = ["egrep", "-v", "^[#][#](bcftools|mpileup|reference)"] grep_headeronly_p = run_child_cmd(grep_headeronly_cmd, stdin=grep_headeronly_stdin_pipe_read, stdout=final_headeronly_tmp_f, tag="grep (headeronly)") bcftools_concat_headeronly_cmd = ["bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn] bcftools_concat_headeronly_p = run_child_cmd(bcftools_concat_headeronly_cmd, stdout=grep_headeronly_stdin_pipe_write, tag="bcftools concat (headeronly)") while True: watch_fds_and_print_output() bcftools_concat_headeronly_p = close_process_if_finished(bcftools_concat_headeronly_p, "bcftools concat (headeronly)", close_fds=[grep_headeronly_stdin_pipe_write]) grep_headeronly_p = close_process_if_finished(grep_headeronly_p, "grep (headeronly)", close_fds=[grep_headeronly_stdin_pipe_read], close_files=[final_headeronly_tmp_f]) if ((bcftools_concat_headeronly_p is None) and (grep_headeronly_p is None)): # none of the processes are still running, we're done! break else: sleep(0.01) # continue to next loop iteration if bcftools_concat_headeronly_p is not None: print "ERROR: failed to cleanly terminate bcftools concat (headeronly)" if grep_headeronly_p is not None: print "ERROR: failed to cleanly terminate grep (headeronly)" # check if there was any data output if os.stat(out_file_tmp)[6] == 0: # 0-byte data file, there is no point in concatenating and # reheader will reject the file, so we need to bgzip it ourselves print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % (final_out_file) final_out_file_f = open(final_out_file, 'wb') final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp] final_bgzip_p = run_child_cmd(final_bgzip_cmd, tag="final bgzip", stdout=final_out_file_f) while True: watch_fds_and_print_output() final_bgzip_p = close_process_if_finished(final_bgzip_p, "final bgzip", close_files=[final_out_file_f]) if (final_bgzip_p is None): # none of the processes are still running, we're done! 
break else: sleep(0.01) # continue to next loop iteration if final_bgzip_p is not None: print "ERROR: failed to cleanly terminate final bgzip (header with no data)" else: # there is some data in the data file print "Creating final 'cat | bcftools view -Oz' pipe" final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe() print "Preparing penultimate output file [%s]" % (penultimate_out_file) final_bcftools_view_cmd = ["bcftools", "view", "-Oz", "-o", penultimate_out_file] final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp] final_bcftools_view_p = run_child_cmd(final_bcftools_view_cmd, tag="final bcftools view -Oz", stdin=final_bcftools_view_stdin_pipe_read) final_concat_p = run_child_cmd(final_concat_cmd, tag="final cat (header+data)", stdout=final_bcftools_view_stdin_pipe_write) while True: watch_fds_and_print_output() final_bcftools_view_p = close_process_if_finished(final_bcftools_view_p, "final bcftools view -Oz", close_fds=[final_bcftools_view_stdin_pipe_read]) final_concat_p = close_process_if_finished(final_concat_p, "final cat (header+data)", close_fds=[final_bcftools_view_stdin_pipe_write]) if ((final_concat_p is None) and (final_bcftools_view_p is None)): # none of the processes are still running, we're done! break else: sleep(0.01) # continue to next loop iteration if final_bcftools_view_p is not None: print "ERROR: failed to cleanly terminate final bcftools view -Oz" if final_concat_p is not None: print "ERROR: failed to cleanly terminate final cat (header+data)" print "Reheadering penultimate output file into final out file [%s]" % (final_out_file) final_bcftools_reheader_cmd = ["bcftools", "reheader", "-h", final_headeronly_tmp, "-o", final_out_file, penultimate_out_file] final_bcftools_reheader_p = run_child_cmd(final_bcftools_reheader_cmd, tag="final bcftools reheader") while True: watch_fds_and_print_output() final_bcftools_reheader_p = close_process_if_finished(final_bcftools_reheader_p, "final bcftools reheader") if (final_bcftools_reheader_p is None): # none of the processes are still running, we're done! break else: sleep(0.01) # continue to next loop iteration if final_bcftools_reheader_p is not None: print "ERROR: failed to cleanly terminate final bcftools reheader" os.remove(penultimate_out_file) print "Indexing final output file [%s]" % (final_out_file) bcftools_index_cmd = ["bcftools", "index", final_out_file] bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index") while True: watch_fds_and_print_output() bcftools_index_p = close_process_if_finished(bcftools_index_p, "bcftools index") if (bcftools_index_p is None): break else: sleep(0.01) # continue to next loop iteration if bcftools_index_p is not None: print "ERROR: failed to cleanly terminate bcftools index" print "Complete, removing temporary files" os.remove(concat_headeronly_tmp_fofn) os.remove(out_file_tmp) os.remove(final_headeronly_tmp) for tmp_file in tmp_files_to_delete: os.remove(tmp_file) # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name) out.write_directory_tree(out_dir, stream_name) # Commit the output to Keep. output_locator = out.finish() print "Task output locator [%s]" % output_locator # Use the resulting locator as the output for this task. this_task.set_output(output_locator) # Done! print "Task complete!"
def main(): ################################################################################ # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on # applying the capturing group named "group_by" in group_by_regex. # (and terminate if this is task 0) ################################################################################ ref_input_pdh = gatk_helper.prepare_gatk_reference_collection( reference_coll=arvados.current_job()["script_parameters"]["reference_collection"] ) job_input_pdh = arvados.current_job()["script_parameters"]["inputs_collection"] interval_lists_pdh = arvados.current_job()["script_parameters"]["interval_lists_collection"] interval_count = 1 if "interval_count" in arvados.current_job()["script_parameters"]: interval_count = arvados.current_job()["script_parameters"]["interval_count"] # Setup sub tasks 1-N (and terminate if this is task 0) hgi_arvados.chunked_tasks_per_cram_file( ref_input_pdh, job_input_pdh, interval_lists_pdh, validate_task_output, if_sequence=0, and_end_task=True, reuse_tasks=False, oldest_git_commit_to_reuse="6ca726fc265f9e55765bf1fdf71b86285b8a0ff2", script="gatk-haplotypecaller-cram.py", ) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task assert this_task["sequence"] != 0 ################################################################################ # Phase IIa: If we are a "reuse" task, just set our output and be done with it ################################################################################ if "reuse_job_task" in this_task["parameters"]: print "This task's work was already done by JobTask %s" % this_task["parameters"]["reuse_job_task"] exit(0) ################################################################################ # Phase IIb: Call Haplotypes! ################################################################################ ref_file = gatk_helper.mount_gatk_reference(ref_param="ref") interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="chunk") cram_file = gatk_helper.mount_gatk_cram_input(input_param="input") cram_file_base, cram_file_ext = os.path.splitext(cram_file) out_dir = hgi_arvados.prepare_out_dir() out_filename = os.path.basename(cram_file_base) + "." + os.path.basename(interval_list_file) + ".vcf.gz" # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output out_filename = out_filename.replace(".bcf", "._cf") # HaplotypeCaller! gatk_exit = gatk.haplotype_caller(ref_file, cram_file, interval_list_file, os.path.join(out_dir, out_filename)) if gatk_exit != 0: print "ERROR: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"], body={"success": False}).execute() else: print "GATK exited successfully, writing output to keep" # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep out.write_directory_tree(out_dir) # Commit the output to Keep. output_locator = out.finish() print "Task output written to keep, validating it" if validate_task_output(output_locator): print "Task output validated, setting output to %s" % (output_locator) # Use the resulting locator as the output for this task. this_task.set_output(output_locator) else: print "ERROR: Failed to validate task output (%s)" % (output_locator) arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"], body={"success": False}).execute()
def run():
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    runner = None
    try:
        job_order_object = arvados.current_job()['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], job_order_object.pop("cwl:tool"))

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            if "location" in v:
                v["location"] = keeppath(v["location"])

        for k, v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)

        output_name = None
        output_tags = None
        enable_reuse = True
        on_error = "continue"
        debug = False

        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        if "arv:output_tags" in job_order_object:
            output_tags = job_order_object["arv:output_tags"]
            del job_order_object["arv:output_tags"]

        if "arv:enable_reuse" in job_order_object:
            enable_reuse = job_order_object["arv:enable_reuse"]
            del job_order_object["arv:enable_reuse"]

        if "arv:on_error" in job_order_object:
            on_error = job_order_object["arv:on_error"]
            del job_order_object["arv:on_error"]

        if "arv:debug" in job_order_object:
            debug = job_order_object["arv:debug"]
            del job_order_object["arv:debug"]

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.safeapi.ThreadSafeApiCache(
            api_params={"model": OrderedJsonModel()}, keep_params={"num_retries": 4}),
                                          output_name=output_name, output_tags=output_tags)

        make_fs_access = functools.partial(CollectionFsAccess,
                                           collection_cache=runner.collection_cache)

        t = load_tool(toolpath, runner.arv_make_tool,
                      fetcher_constructor=functools.partial(CollectionFetcher,
                                                            api_client=runner.api,
                                                            fs_access=make_fs_access(""),
                                                            num_retries=runner.num_retries))

        if debug:
            logger.setLevel(logging.DEBUG)
            logging.getLogger('arvados').setLevel(logging.DEBUG)
            logging.getLogger("cwltool").setLevel(logging.DEBUG)

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = debug
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job = {"uuid": arvados.current_job()["uuid"], "state": arvados.current_job()["state"]}
        args.make_fs_access = make_fs_access
        args.trash_intermediate = False
        args.intermediate_output_ttl = 0
        args.priority = arvados_cwl.DEFAULT_PRIORITY
        args.do_validate = True
        args.disable_js_validation = False

        runner.arv_executor(t, job_order_object, **vars(args))
    except Exception as e:
        if isinstance(e, WorkflowException):
            logging.info("Workflow error %s", e)
        else:
            logging.exception("Unhandled exception")
        if runner and runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': outputCollection,
                                   'success': False,
                                   'progress': 1.0
                               }).execute()
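# For reference, the code above expects the job's script_parameters to carry
# the CWL job order plus a few "arv:"-prefixed runner options. A hypothetical
# example of the shape it handles (all values below are illustrative, not taken
# from a real job):
#
#   {
#     "cwl:tool": "<pdh of a collection>/workflow.cwl",        (path under TASK_KEEPMOUNT)
#     "reads": "<portable data hash of an input collection>",  (bare Keep locators become File objects)
#     "arv:output_name": "my workflow output",
#     "arv:enable_reuse": true,
#     "arv:on_error": "continue",
#     "arv:debug": false
#   }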
      % (t1 - t0))
print "Scaled, truncated, test population shape is %s, number of MB is %f" % (
    test_population.shape, test_population.nbytes / 1000000.)

with out.open('%s_training.npy' % (antigen_type), 'w') as f:
    np.save(f, population)
with out.open('%s_test.npy' % (antigen_type), 'w') as f:
    np.save(f, test_population)
with out.open('%s_training_labels.npy' % (antigen_type), 'w') as f:
    np.save(f, training_labels)
with out.open('%s_where_well_sequenced.npy' % (antigen_type), 'w') as f:
    np.save(f, where_well_sequenced)
with out.open('%s_where_homozygous.npy' % (antigen_type), 'w') as f:
    np.save(f, where_homozygous)
with out.open('python_variables.py', 'w') as f:
    f.write('training_names=%s\n' % (training_names))
    f.write('test_names=%s\n' % (test_names))

time_logging_fh.close()

# Commit the output to keep
task_output = out.save_new(create_collection_record=False)
arvados.current_task().set_output(task_output)

###########################################################################################################################################
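# Unlike the CollectionWriter-based scripts elsewhere in this file, the block
# above writes its output through the newer Collection API (out.open(...),
# out.save_new(...)). 'out' and the timing log are presumably set up earlier in
# the script, roughly along these lines (a sketch only; the actual setup code
# and file names are not shown in this excerpt):
import arvados
import arvados.collection

out = arvados.collection.Collection(api_client=arvados.api('v1'))
time_logging_fh = open('time_log.txt', 'w')  # hypothetical name for the timing log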
#!/usr/bin/python
#

import arvados
import re

arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)

this_job = arvados.current_job()
this_task = arvados.current_task()
this_task_input = this_task['parameters']['input']

input_file = list(arvados.CollectionReader(this_task_input).all_files())[0]

out = arvados.CollectionWriter()
out.set_current_file_name(input_file.decompressed_name())
out.set_current_stream_name(input_file.stream_name())

for line in input_file.readlines():
    out.write("!!!" + line.upper())

this_task.set_output(out.finish())
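# Example of the per-file transformation this script performs: each task reads
# one file from the job's input collection and writes an upper-cased copy of
# every line, prefixed with "!!!", to a file of the same (decompressed) name in
# the same stream, e.g.
#
#   input line:   hello world
#   output line:  !!!HELLO WORLD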