Example #1
def write_tmpdir(dir):
    coll_writer = arvados.CollectionWriter()
    filenames = next(os.walk(dir))[-1]
    for filename in filenames:
        coll_writer.write_file(os.path.join(dir, filename))
    pdh = coll_writer.finish()
    arvados.current_task().set_output(pdh)
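A minimal usage sketch (not from the source): assuming write_tmpdir above is available in a crunch script, the task could stage its results in a scratch directory under the task tmpdir and then hand that directory over. The 'out' subdirectory and file name are illustrative assumptions.
import os
import arvados

out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
with open(os.path.join(out_dir, 'results.txt'), 'w') as f:
    f.write('example output\n')
# Upload every file in out_dir and record the collection as this task's output.
write_tmpdir(out_dir)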
Example #2
def spawn_new_task_per_file(script_parameter, regex, if_sequence=0, and_end_task=True):
    """
    Generalized form of one_task_per_pair_input_file from
    https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py

    Creates a new task for each file in the collection that matches the regex.
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters'][script_parameter]
    input_collection = Collection(job_input)
    for name in input_collection:
        if not re.search(regex,name):
            continue
        new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1': name,
                        }
                    }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                   body={'success':True}
                                   ).execute()
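A hedged sketch of how a two-phase crunch script might drive spawn_new_task_per_file; the 'input' script parameter name and the fastq regex are assumptions, not from the source.
import arvados

this_task = arvados.current_task()

if this_task['sequence'] == 0:
    # Phase 0: fan out one task per matching file in the 'input' collection.
    spawn_new_task_per_file('input', r'\.fastq\.gz$', if_sequence=0, and_end_task=True)
    exit(0)

# Phase 1: each child task finds its assigned file name in its own parameters.
filename = this_task['parameters']['input_1']
print "Processing %s" % filename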
Example #3
def run(**kwargs):
    kwargs.setdefault('cwd', arvados.current_task().tmpdir)
    kwargs.setdefault('stdout', sys.stderr)
    execargs = ['java',
                '-Xmx%dm' % memory_limit(),
                '-Djava.io.tmpdir=' + arvados.current_task().tmpdir,
                '-jar', os.path.join(install_path(), 'GenomeAnalysisTK.jar')]
    execargs += [str(arg) for arg in kwargs.pop('args', [])]
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    return arvados.util.run_command(execargs, **kwargs)
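A hypothetical invocation of the GATK wrapper above; the walker name and file paths are illustrative, and memory_limit() / install_path() are assumed to be helpers defined elsewhere in the same module.
# 'args' is popped by run() and appended after the java/-jar boilerplate;
# every element is passed through str(), so numbers are fine too.
run(args=['-T', 'HaplotypeCaller',
          '-R', 'reference.fa',
          '-I', 'input.bam',
          '-o', 'output.g.vcf'])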
Example #4
def run(module, **kwargs):
    kwargs.setdefault('cwd', arvados.current_task().tmpdir)
    execargs = ['java',
                '-Xmx1500m',
                '-Djava.io.tmpdir=' + arvados.current_task().tmpdir,
                '-jar', os.path.join(install_path(), module + '.jar')]
    execargs += [str(arg) for arg in kwargs.pop('args', [])]
    for key, value in kwargs.pop('params', {}).items():
        execargs += [key.upper() + '=' + str(value)]
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    return arvados.util.run_command(execargs, **kwargs)
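A hypothetical call to the module runner above, assuming the .jar follows Picard's KEY=VALUE convention (the 'params' keys are upper-cased by the code); the module and file names are illustrative.
# Expands to roughly: java -Xmx1500m ... -jar SortSam.jar INPUT=in.bam OUTPUT=sorted.bam SORT_ORDER=coordinate
run('SortSam',
    params={'input': 'in.bam',
            'output': 'sorted.bam',
            'sort_order': 'coordinate'})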
Example #5
def run(module, **kwargs):
    kwargs.setdefault('cwd', arvados.current_task().tmpdir)
    execargs = [
        'java', '-Xmx1500m',
        '-Djava.io.tmpdir=' + arvados.current_task().tmpdir, '-jar',
        os.path.join(install_path(), module + '.jar')
    ]
    execargs += [str(arg) for arg in kwargs.pop('args', [])]
    for key, value in kwargs.pop('params', {}).items():
        execargs += [key.upper() + '=' + str(value)]
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    return arvados.util.run_command(execargs, **kwargs)
Example #6
def run(**kwargs):
    kwargs.setdefault('cwd', arvados.current_task().tmpdir)
    kwargs.setdefault('stdout', sys.stderr)
    execargs = [
        'java',
        '-Xmx%dm' % memory_limit(),
        '-Djava.io.tmpdir=' + arvados.current_task().tmpdir, '-jar',
        os.path.join(install_path(), 'GenomeAnalysisTK.jar')
    ]
    execargs += [str(arg) for arg in kwargs.pop('args', [])]
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    return arvados.util.run_command(execargs, **kwargs)
Example #7
def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each pair of fastq files in this job's input
    collection.

    Each new task will have two parameters, named "input_1" and
    "input_2", each being a manifest containing a single fastq file.

    A matching pair of files in the input collection is assumed to
    have names "x_1.y" and "x_2.y".

    Files in the input collection that are not part of a matched pair
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    all_files = []
    for s in cr.all_streams():
        all_files += list(s.all_files())
    for s in cr.all_streams():
        for left_file in s.all_files():
            left_name = left_file.name()
            right_file = None
            right_name = re.sub(r'(.*_)1\.', '\g<1>2.', left_name)
            if right_name == left_name:
                continue
            for f2 in s.all_files():
                if right_name == f2.name():
                    right_file = f2
            if right_file != None:
                new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1': left_file.as_manifest(),
                        'input_2': right_file.as_manifest()
                    }
                }
                arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
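A hedged sketch of the sequence-1 side of this pattern, assuming (as elsewhere in these examples) that CollectionReader accepts the manifest text stored in the task parameters.
import arvados

one_task_per_pair_input_file(if_sequence=0, and_end_task=True)

# Only sequence-1 tasks get past the call above (the sequence-0 task exits inside and_end_task).
this_task = arvados.current_task()
reads_1 = arvados.CollectionReader(this_task['parameters']['input_1'])
reads_2 = arvados.CollectionReader(this_task['parameters']['input_2'])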
Example #8
def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each pair of fastq files in this job's input
    collection.

    Each new task will have two parameters, named "input_1" and
    "input_2", each being a manifest containing a single fastq file.

    A matching pair of files in the input collection is assumed to
    have names "x_1.y" and "x_2.y".

    Files in the input collection that are not part of a matched pair
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    all_files = []
    for s in cr.all_streams():
        all_files += list(s.all_files())
    for s in cr.all_streams():
        for left_file in s.all_files():
            left_name = left_file.name()
            right_file = None
            right_name = re.sub(r'(.*_)1\.', '\g<1>2.', left_name)
            if right_name == left_name:
                continue
            for f2 in s.all_files():
                if right_name == f2.name():
                    right_file = f2
            if right_file != None:
                new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1':left_file.as_manifest(),
                        'input_2':right_file.as_manifest()
                        }
                    }
                arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                   body={'success':True}
                                   ).execute()
        exit(0)
def main():
    current_job = arvados.current_job()

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the interval_list collection to only those files relevant to gatk
    il_input_pdh = prepare_gatk_interval_list_collection(interval_list_coll=current_job['script_parameters']['interval_list_collection'])

    # Create an interval_list file for each chunk based on the .interval_list in the interval_list collection
    output_locator = create_interval_lists(genome_chunks, il_input_pdh)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
Example #10
def run_rtg(command, output_dir, command_args, **kwargs):
    global rtg_install_path
    execargs = [
        os.path.join(rtg_install_path, 'rtg'), command, '-o', output_dir
    ]
    execargs += command_args
    sys.stderr.write("run_rtg: exec %s\n" % str(execargs))
    arvados.util.run_command(execargs,
                             cwd=arvados.current_task().tmpdir,
                             stderr=sys.stderr,
                             stdout=sys.stderr)

    # Exit status cannot be trusted in rtg 1.1.1.
    assert_done(output_dir)

    # Copy log files to stderr and delete them to avoid storing them
    # in Keep with the output data.
    for dirent in arvados.util.listdir_recursive(output_dir):
        if is_log_file(dirent):
            log_file = os.path.join(output_dir, dirent)
            sys.stderr.write(' '.join(['==>', dirent, '<==\n']))
            with open(log_file, 'rb') as f:
                while True:
                    buf = f.read(2**20)
                    if len(buf) == 0:
                        break
                    sys.stderr.write(buf)
            sys.stderr.write('\n')  # in case log does not end in newline
            os.unlink(log_file)
Example #11
def run_rtg(command, output_dir, command_args, **kwargs):
    global rtg_install_path
    execargs = [os.path.join(rtg_install_path, 'rtg'),
                command,
                '-o', output_dir]
    execargs += command_args
    sys.stderr.write("run_rtg: exec %s\n" % str(execargs))
    arvados.util.run_command(
        execargs,
        cwd=arvados.current_task().tmpdir,
        stderr=sys.stderr,
        stdout=sys.stderr)

    # Exit status cannot be trusted in rtg 1.1.1.
    assert_done(output_dir)

    # Copy log files to stderr and delete them to avoid storing them
    # in Keep with the output data.
    for dirent in arvados.util.listdir_recursive(output_dir):
        if is_log_file(dirent):
            log_file = os.path.join(output_dir, dirent)
            sys.stderr.write(' '.join(['==>', dirent, '<==\n']))
            with open(log_file, 'rb') as f:
                while True:
                    buf = f.read(2**20)
                    if len(buf) == 0:
                        break
                    sys.stderr.write(buf)
            sys.stderr.write('\n') # in case log does not end in newline
            os.unlink(log_file)
def create_chunk_tasks(f_name, chunk_input_pdh_names, 
                       if_sequence, task_input_pdh, ref_input_pdh, chunk_input_pdh, 
                       pool=None):
    async_results = []
    for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
        # Create task for each CRAM / chunk
        job_uuid = arvados.current_job()['uuid']
        current_task_uuid = arvados.current_task()['uuid']
        new_task_attrs = {
            'job_uuid': job_uuid,
            'created_by_job_task_uuid': current_task_uuid,
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input_pdh,
                'ref': ref_input_pdh,
                'chunk': chunk_input_pdh
                }
            }
        async_result = pool.apply_async(arv_create_task, (
                new_task_attrs,
                "Created new task to process %s with chunk interval %s (job_uuid %s)" % (f_name, chunk_input_name, job_uuid)))
        async_results.append(async_result)

    for async_result in async_results:
        async_result.wait()
        try:
            (res, report) = async_result.get()
            if (not res) or (not 'qsequence' in res):
                raise InternalError("Could not create job task: %s" % res)
            else:
                print report + " qsequence %s" % res['qsequence']
        except Exception as e:
            raise InternalError("Exception creating job task: %s" % e)
Example #13
def upload(source_dir, logger=None):
    if logger is None:
        logger = logging.getLogger("arvados")

    source_dir = os.path.abspath(source_dir)
    done = False
    if 'TASK_WORK' in os.environ:
        resume_cache = put.ResumeCache(os.path.join(arvados.current_task().tmpdir, "upload-output-checkpoint"))
    else:
        resume_cache = put.ResumeCache(put.ResumeCache.make_path(Args(source_dir)))
    reporter = put.progress_writer(machine_progress)
    bytes_expected = put.expected_bytes_for([source_dir])
    backoff = 1
    outuuid = None
    while not done:
        try:
            out = put.ArvPutCollectionWriter.from_cache(resume_cache, reporter, bytes_expected)
            out.do_queued_work()
            out.write_directory_tree(source_dir, max_manifest_depth=0)
            outuuid = out.finish()
            done = True
        except KeyboardInterrupt as e:
            logger.critical("caught interrupt signal 2")
            raise e
        except Exception as e:
            logger.exception("caught exception:")
            backoff *= 2
            if backoff > 256:
                logger.critical("Too many upload failures, giving up")
                raise e
            else:
                logger.warning("Sleeping for %s seconds before trying again" % backoff)
                time.sleep(backoff)
    return outuuid
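A brief usage sketch (not from the source) showing how a caller might combine upload() with set_output; the out_dir path is an assumption.
import os
import arvados

out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
# ... the job writes its results into out_dir ...
output_locator = upload(out_dir)
arvados.current_task().set_output(output_locator)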
def create_or_reuse_task(sequence, parameters, reusable_tasks, task_key_params, validate_task_output):
    new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': sequence,
            'parameters': parameters
            }
    # See if there is a task in reusable_tasks that can be reused
    ct_index = tuple([parameters[index_param] for index_param in task_key_params])
    if len(reusable_tasks) == 0:
        print "No reusable tasks were available"
    elif ct_index in reusable_tasks:
        # have a task from which to reuse the output, prepare to create a new, but already finished, task with that output
        reuse_task = reusable_tasks[ct_index]
        if validate_task_output(reuse_task['output']):
            print "Found existing JobTask %s from Job %s. Will use output %s from that JobTask instead of re-running it." % (reuse_task['uuid'], reuse_task['job_uuid'], reuse_task['output'])
            # remove task from reusable_tasks as it won't be used more than once
            del reusable_tasks[ct_index]
            # copy relevant attrs from reuse_task so that the new tasks start already finished
            for attr in ['success', 'output', 'progress', 'started_at', 'finished_at', 'parameters']:
                new_task_attrs[attr] = reuse_task[attr]
            # crunch seems to ignore the fact that the job says it is done and queue it anyway
            # signal ourselves to just immediately exit successfully when we are run
            new_task_attrs['parameters']['reuse_job_task'] = reuse_task['uuid']
        else:
            print "Output %s for potential task reuse did not validate" % (reuse_task['output'])
    else:
        print "No reusable JobTask matched key parameters %s" % (list(ct_index))

    # Create the "new" task (may be new work or may be already finished work)
    new_task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if not new_task:
        raise errors.APIError("Attempt to create new job_task failed: [%s]" % new_task_attrs)
    return new_task
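Based on the comment above about signalling the worker to exit immediately, this is a hypothetical sketch of the check the worker phase of such a script might perform; it is not taken from the source.
import arvados

params = arvados.current_task()['parameters']
if 'reuse_job_task' in params:
    # Output was copied from an earlier JobTask; nothing to recompute.
    print "Reusing output from JobTask %s, exiting" % params['reuse_job_task']
    exit(0)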
Example #15
def one_task_per_bam_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each bam file in this job's input collection.

    Each new task will have an "input" parameter: a manifest
    containing one .bam file and (if available) the corresponding .bai
    index file.

    Files in the input collection that are not named *.bam or *.bai
    (as well as *.bai files that do not match any .bam file present)
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    bam = {}
    bai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.bam$', f.name()):
                bam[s.name(), f.name()] = f
            elif re.search(r'\.bai$', f.name()):
                bai[s.name(), f.name()] = f
    for ((s_name, f_name), bam_f) in bam.items():
        bai_f = bai.get((s_name, re.sub(r'bam$', 'bai', f_name)), None)
        task_input = bam_f.as_manifest()
        if bai_f:
            task_input += bai_f.as_manifest()
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input
            }
        }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
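A hedged sketch of the sequence-1 worker for this fan-out, using get_task_param_mount and listdir_recursive the same way other examples in this collection do; the file handling is illustrative.
import os
import re
import arvados

one_task_per_bam_file(if_sequence=0, and_end_task=True)

# Sequence-1 tasks mount their single-BAM 'input' manifest as a directory.
input_dir = arvados.get_task_param_mount('input')
bam_file = None
for f in arvados.util.listdir_recursive(input_dir):
    if re.search(r'\.bam$', f):
        bam_file = os.path.join(input_dir, f)
print "Input BAM is %s" % bam_file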
Example #16
def one_task_per_bam_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each bam file in this job's input collection.

    Each new task will have an "input" parameter: a manifest
    containing one .bam file and (if available) the corresponding .bai
    index file.

    Files in the input collection that are not named *.bam or *.bai
    (as well as *.bai files that do not match any .bam file present)
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    bam = {}
    bai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.bam$', f.name()):
                bam[s.name(), f.name()] = f
            elif re.search(r'\.bai$', f.name()):
                bai[s.name(), f.name()] = f
    for ((s_name, f_name), bam_f) in bam.items():
        bai_f = bai.get((s_name, re.sub(r'bam$', 'bai', f_name)), None)
        task_input = bam_f.as_manifest()
        if bai_f:
            task_input += bai_f.as_manifest()
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input
                }
            }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success':True}
                                         ).execute()
        exit(0)
Example #17
def main():
    current_job = arvados.current_job()

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the interval_list collection to only those files relevant to gatk
    il_input_pdh = prepare_gatk_interval_list_collection(
        interval_list_coll=current_job['script_parameters']
        ['interval_list_collection'])

    # Create an interval_list file for each chunk based on the .interval_list in the interval_list collection
    output_locator = create_interval_lists(genome_chunks, il_input_pdh)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
def create_task(sequence, params):
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': params
    }
    task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    return task
Example #19
def create_task(sequence, params):
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': params
    }
    task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    return task
def main():
    # Get object representing the current task
    this_task = arvados.current_task()

    sort_by_r = re.compile(sort_by_regex)

    ################################################################################
    # Concatenate VCFs in numerically sorted order of sort_by_regex
    ################################################################################
    vcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    output_prefix = arvados.current_job()['script_parameters']['output_prefix']
    out_file = output_prefix + ".vcf.gz"

    # Concatenate VCFs
    bcftools_concat_exit = bcftools.concat(sorted(vcf_files, key=lambda fn: int(re.search(sort_by_r, fn).group('sort_by'))),
                                    os.path.join(out_dir, out_file))

    if bcftools_concat_exit != 0:
        print "WARNING: bcftools concat exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_concat_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success':False}
                                         ).execute()
    else:
        print "bcftools concat exited successfully, indexing"

        bcftools_index_exit = bcftools.index(os.path.join(out_dir, out_file))

        if bcftools_index_exit != 0:
            print "WARNING: bcftools index exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_index_exit
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success':False}
                                         ).execute()
        else:
            print "bcftools index exited successfully, writing output to keep"


            # Write a new collection as output
            out = arvados.CollectionWriter()

            # Write out_dir to keep
            out.write_directory_tree(out_dir)

            # Commit the output to Keep.
            output_locator = out.finish()

            if validate_task_output(output_locator):
                print "Task output validated, setting output to %s" % (output_locator)

                # Use the resulting locator as the output for this task.
                this_task.set_output(output_locator)
            else:
                print "ERROR: Failed to validate task output (%s)" % (output_locator)
                arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                                 body={'success':False}
                                             ).execute()
def main():
    current_job = arvados.current_job()
    skip_sq_sn_regex = '_decoy$'
    if 'skip_sq_sn_regex' in current_job['script_parameters']:
        skip_sq_sn_regex = current_job['script_parameters']['skip_sq_sn_regex']
    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the reference collection to only those files relevant to gatk
    ref_input_pdh = prepare_gatk_reference_collection(reference_coll=current_job['script_parameters']['reference_collection'])

    # Create an interval_list file for each chunk based on the .dict in the reference collection
    output_locator = create_interval_lists(genome_chunks, ref_input_pdh, skip_sq_sn_r)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
def one_task_per_classifier(num_classifiers_to_parameterize, if_sequence=0, and_end_task=True):
    if if_sequence != arvados.current_task()['sequence']:
        return
    api_client = arvados.api('v1')
    for i in range(num_classifiers_to_parameterize):
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'classifier_index':i,
                'time_to_wait':i*560
            }
        }
        api_client.job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        api_client.job_tasks().update(uuid=arvados.current_task()['uuid'],
                                      body={'success':True}
                                      ).execute()
        exit(0)
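A hedged sketch of the sequence-1 side of one_task_per_classifier; the classifier count and what each worker does with its parameters are assumptions.
import time
import arvados

one_task_per_classifier(10, if_sequence=0, and_end_task=True)

# Each sequence-1 task receives its own classifier_index and time_to_wait.
params = arvados.current_task()['parameters']
time.sleep(params['time_to_wait'])
print "Parameterizing classifier %s" % params['classifier_index']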
Example #23
def clear_tmpdir(path=None):
    """
    Ensure the given directory (or TASK_TMPDIR if none given)
    exists and is empty.
    """
    if path is None:
        path = arvados.current_task().tmpdir
    if os.path.exists(path):
        p = subprocess.Popen(['rm', '-rf', path])
        stdout, stderr = p.communicate(None)
        if p.returncode != 0:
            raise Exception('rm -rf %s: %s' % (path, stderr))
    os.mkdir(path)
def main():
    current_job = arvados.current_job()
    skip_sq_sn_regex = '_decoy$'
    if 'skip_sq_sn_regex' in current_job['script_parameters']:
        skip_sq_sn_regex = current_job['script_parameters']['skip_sq_sn_regex']
    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the reference collection to only those files relevant to gatk
    ref_input_pdh = prepare_gatk_reference_collection(
        reference_coll=current_job['script_parameters']
        ['reference_collection'])

    # Create an interval_list file for each chunk based on the .dict in the reference collection
    output_locator = create_interval_lists(genome_chunks, ref_input_pdh,
                                           skip_sq_sn_r)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
Example #25
def clear_tmpdir(path=None):
    """
    Ensure the given directory (or TASK_TMPDIR if none given)
    exists and is empty.
    """
    if path is None:
        path = arvados.current_task().tmpdir
    if os.path.exists(path):
        p = subprocess.Popen(['rm', '-rf', path])
        stdout, stderr = p.communicate(None)
        if p.returncode != 0:
            raise Exception('rm -rf %s: %s' % (path, stderr))
    os.mkdir(path)
Example #26
def spawn_new_task_per_bed_line(script_parameter, regex, if_sequence=0, and_end_task=True):
    """
    Generalized form of one_task_per_pair_input_file from 
    https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py

    Creates a new task for each BED line in each file in the collection that matches the regex.
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters'][script_parameter]
    input_collection = Collection(job_input)
    for name in input_collection:
        if not re.search(regex, name):
            continue
        name_path = os.path.join(arvados.get_job_param_mount(script_parameter), name)
        bed_lines = (line.split() for line in open(name_path, 'r'))
        # Start the biggest regions first
        def cmp_desc_region_size(a, b):
            return ((int(b[2]) - int(b[1])) -
                    (int(a[2]) - int(a[1])))
        for bed_line in sorted(bed_lines, cmp=cmp_desc_region_size):
            print bed_line
            new_task_attrs = {
                'job_uuid': arvados.current_job()['uuid'],
                'created_by_job_task_uuid': arvados.current_task()['uuid'],
                'sequence': if_sequence + 1,
                'parameters': {
                    'chrom': bed_line[0],
                    'start': bed_line[1],
                    'end': bed_line[2]
                }
            }
            arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                   body={'success':True}
                                   ).execute()
        exit()
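A hedged sketch of the per-region worker spawned above; the 'regions' script parameter name and the .bed regex are assumptions.
import arvados

spawn_new_task_per_bed_line('regions', r'\.bed$', if_sequence=0, and_end_task=True)

# Sequence-1 tasks each receive a single BED region as chrom/start/end parameters.
params = arvados.current_task()['parameters']
region = "%s:%s-%s" % (params['chrom'], params['start'], params['end'])
print "Processing region %s" % region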
def prepare_out_dir():
    # Will write to out_dir, make sure it is empty
    out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    return out_dir
Example #28
def prepare_out_dir():
    # Will write to out_dir, make sure it is empty
    out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir,
                                                            old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    return out_dir
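A short sketch combining prepare_out_dir with the output-writing pattern used elsewhere in these examples; the result file name is illustrative.
import os
import arvados

out_dir = prepare_out_dir()
with open(os.path.join(out_dir, 'result.txt'), 'w') as f:
    f.write('example output\n')

# Write out_dir to Keep and record the resulting collection as this task's output.
out = arvados.CollectionWriter()
out.write_directory_tree(out_dir)
arvados.current_task().set_output(out.finish())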
Example #29
def create_or_reuse_task(sequence, parameters, reusable_tasks, task_key_params,
                         validate_task_output):
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': parameters
    }
    # See if there is a task in reusable_tasks that can be reused
    ct_index = tuple(
        [parameters[index_param] for index_param in task_key_params])
    if len(reusable_tasks) == 0:
        print "No reusable tasks were available"
    elif ct_index in reusable_tasks:
        # have a task from which to reuse the output, prepare to create a new, but already finished, task with that output
        reuse_task = reusable_tasks[ct_index]
        if validate_task_output(reuse_task['output']):
            print "Found existing JobTask %s from Job %s. Will use output %s from that JobTask instead of re-running it." % (
                reuse_task['uuid'], reuse_task['job_uuid'],
                reuse_task['output'])
            # remove task from reusable_tasks as it won't be used more than once
            del reusable_tasks[ct_index]
            # copy relevant attrs from reuse_task so that the new tasks start already finished
            for attr in [
                    'success', 'output', 'progress', 'started_at',
                    'finished_at', 'parameters'
            ]:
                new_task_attrs[attr] = reuse_task[attr]
            # crunch seems to ignore the fact that the job says it is done and queue it anyway
            # signal ourselves to just immediately exit successfully when we are run
            new_task_attrs['parameters']['reuse_job_task'] = reuse_task['uuid']
        else:
            print "Output %s for potential task reuse did not validate" % (
                reuse_task['output'])
    else:
        print "No reusable JobTask matched key parameters %s" % (
            list(ct_index))

    # Create the "new" task (may be new work or may be already finished work)
    new_task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if not new_task:
        raise errors.APIError("Attempt to create new job_task failed: [%s]" %
                              new_task_attrs)
    return new_task
def mount_gatk_gvcf_inputs(inputs_param="inputs"):
    # Get input gVCFs for this task
    print "Mounting task input collection"
    inputs_dir = ""
    if inputs_param in arvados.current_task()['parameters']:
        inputs_dir = arvados.get_task_param_mount(inputs_param)
    else:
        inputs_dir = arvados.get_job_param_mount(inputs_param)

    # Sanity check input gVCFs
    input_gvcf_files = []
    for f in arvados.util.listdir_recursive(inputs_dir):
        if re.search(r'\.vcf\.gz$', f):
            input_gvcf_files.append(os.path.join(inputs_dir, f))
        elif re.search(r'\.tbi$', f):
            pass
        elif re.search(r'\.interval_list$', f):
            pass
        else:
            print "WARNING: collection contains unexpected file %s" % f
    if len(input_gvcf_files) == 0:
        raise errors.InvalidArgumentError(
            "Expected one or more .vcf.gz files in collection (found 0 while recursively searching %s)"
            % inputs_dir)

    # Ensure we can read the gVCF files and that they each have an index
    for gvcf_file in input_gvcf_files:
        if not os.access(gvcf_file, os.R_OK):
            raise errors.FileAccessError("gVCF file not readable: %s" %
                                         gvcf_file)

        # Ensure we have corresponding .tbi index and can read it as well
        (gvcf_file_base, gvcf_file_ext) = os.path.splitext(gvcf_file)
        assert (gvcf_file_ext == ".gz")
        tbi_file = gvcf_file_base + ".gz.tbi"
        if not os.access(tbi_file, os.R_OK):
            tbi_file = gvcf_file_base + ".tbi"
            if not os.access(tbi_file, os.R_OK):
                raise errors.FileAccessError(
                    "No readable gVCF index file for gVCF file: %s" %
                    gvcf_file)
    return input_gvcf_files
Example #31
def run(command, command_args, **kwargs):
    """
    Build and run the samtools binary.

    command is the samtools subcommand, e.g., "view" or "sort".

    command_args is a list of additional command line arguments, e.g.,
    ['-bt', 'ref_list.txt', '-o', 'aln.bam', 'aln.sam.gz']

    It is assumed that we are running in a Crunch job environment, and
    the job's "samtools_tgz" parameter is a collection containing the
    samtools source tree in a .tgz file.
    """
    execargs = [samtools_binary(), command]
    execargs += command_args
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    arvados.util.run_command(execargs,
                             cwd=arvados.current_task().tmpdir,
                             stdin=kwargs.get('stdin', subprocess.PIPE),
                             stderr=kwargs.get('stderr', sys.stderr),
                             stdout=kwargs.get('stdout', sys.stderr))
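A usage sketch built from the docstring's own example arguments; the input and output file names are placeholders.
# Roughly equivalent to: samtools view -bt ref_list.txt -o aln.bam aln.sam.gz
run('view', ['-bt', 'ref_list.txt', '-o', 'aln.bam', 'aln.sam.gz'])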
Example #32
def run(command, command_args, **kwargs):
    """
    Build and run the bwa binary.

    command is the bwa module, e.g., "index" or "aln".

    command_args is a list of additional command line arguments, e.g.,
    ['-a', 'bwtsw', 'ref.fasta']

    It is assumed that we are running in a Crunch job environment, and
    the job's "bwa_tbz" parameter is a collection containing the bwa
    source tree in a .tbz file.
    """
    execargs = [bwa_binary(), command]
    execargs += command_args
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    arvados.util.run_command(execargs,
                             cwd=arvados.current_task().tmpdir,
                             stderr=sys.stderr,
                             stdin=kwargs.get('stdin', subprocess.PIPE),
                             stdout=kwargs.get('stdout', sys.stderr))
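A usage sketch built from the docstring's own example arguments; the reference file name is a placeholder.
# Roughly equivalent to: bwa index -a bwtsw ref.fasta
run('index', ['-a', 'bwtsw', 'ref.fasta'])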
def create_chunk_tasks(f_name,
                       chunk_input_pdh_names,
                       if_sequence,
                       task_input_pdh,
                       ref_input_pdh,
                       chunk_input_pdh,
                       pool=None):
    async_results = []
    for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
        # Create task for each CRAM / chunk
        job_uuid = arvados.current_job()['uuid']
        current_task_uuid = arvados.current_task()['uuid']
        new_task_attrs = {
            'job_uuid': job_uuid,
            'created_by_job_task_uuid': current_task_uuid,
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input_pdh,
                'ref': ref_input_pdh,
                'chunk': chunk_input_pdh
            }
        }
        async_result = pool.apply_async(arv_create_task, (
            new_task_attrs,
            "Created new task to process %s with chunk interval %s (job_uuid %s)"
            % (f_name, chunk_input_name, job_uuid)))
        async_results.append(async_result)

    for async_result in async_results:
        async_result.wait()
        try:
            (res, report) = async_result.get()
            if (not res) or (not 'qsequence' in res):
                raise InternalError("Could not create job task: %s" % res)
            else:
                print report + " qsequence %s" % res['qsequence']
        except Exception as e:
            raise InternalError("Exception creating job task: %s" % e)
Example #34
def run(command, command_args, **kwargs):
    """
    Build and run the samtools binary.

    command is the samtools subcommand, e.g., "view" or "sort".

    command_args is a list of additional command line arguments, e.g.,
    ['-bt', 'ref_list.txt', '-o', 'aln.bam', 'aln.sam.gz']

    It is assumed that we are running in a Crunch job environment, and
    the job's "samtools_tgz" parameter is a collection containing the
    samtools source tree in a .tgz file.
    """
    execargs = [samtools_binary(),
                command]
    execargs += command_args
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    arvados.util.run_command(
        execargs,
        cwd=arvados.current_task().tmpdir,
        stdin=kwargs.get('stdin', subprocess.PIPE),
        stderr=kwargs.get('stderr', sys.stderr),
        stdout=kwargs.get('stdout', sys.stderr))
Example #35
def run(command, command_args, **kwargs):
    """
    Build and run the bwa binary.

    command is the bwa module, e.g., "index" or "aln".

    command_args is a list of additional command line arguments, e.g.,
    ['-a', 'bwtsw', 'ref.fasta']

    It is assumed that we are running in a Crunch job environment, and
    the job's "bwa_tbz" parameter is a collection containing the bwa
    source tree in a .tbz file.
    """
    execargs = [bwa_binary(),
                command]
    execargs += command_args
    sys.stderr.write("%s.run: exec %s\n" % (__name__, str(execargs)))
    arvados.util.run_command(
        execargs,
        cwd=arvados.current_task().tmpdir,
        stderr=sys.stderr,
        stdin=kwargs.get('stdin', subprocess.PIPE),
        stdout=kwargs.get('stdout', sys.stderr))
def one_task_per_interval(interval_count, validate_task_output,
                          if_sequence=0, and_end_task=True,
                          reuse_tasks=True,
                          interval_list_param="interval_list",
                          oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
                          task_key_params=['name', 'inputs', 'interval', 'ref'],
                          script=arvados.current_job()['script']):
    """
    Queue one task for each of interval_count intervals, splitting
    the genome chunk (described by the .interval_list file) evenly.

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param=interval_list_param)

    interval_reader = open(interval_list_file, mode="r")

    lines = interval_reader.readlines()
    sn_intervals = dict()
    sns = []
    total_len = 0
    for line in lines:
        if line[0] == '@':
            # skip all lines starting with '@'
            continue
        fields = line.split("\t")
        if len(fields) != 5:
            raise errors.InvalidArgumentError("interval_list %s has invalid line [%s] - expected 5 fields but got %s" % (interval_list_file, line, len(fields)))
        sn = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        length = int(end) - int(start) + 1
        total_len += int(length)
        sn_intervals[sn] = (start, end)
        sns.append(sn)

    print "Total chunk length is %s" % total_len
    interval_len = int(total_len / interval_count)
    intervals = []
    print "Splitting chunk into %s intervals of size ~%s" % (interval_count, interval_len)
    for interval_i in range(0, interval_count):
        interval_num = interval_i + 1
        intervals_count = 0
        remaining_len = interval_len
        interval = []
        while len(sns) > 0:
            sn = sns.pop(0)
            if not sn_intervals.has_key(sn):
                raise errors.ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end-start+1) > remaining_len:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_len + start - 1
                assert((end-start+1) <= remaining_len)
                sn_intervals[sn] = (end+1, real_end)
                sns.insert(0, sn)
            interval.append("%s:%s-%s" % (sn, start, end))
            remaining_len -= (end-start+1)
            intervals_count += 1
            if remaining_len <= 0:
                break
        if intervals_count > 0:
            intervals.append(interval)
        else:
            print "WARNING: skipping empty intervals for %s" % interval_input_name
    print "Have %s intervals" % (len(intervals))

    if reuse_tasks:
        # get candidates for task reuse
        job_filters = [
            ['script', '=', script],
            ['repository', '=', arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
        ]
        reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params, job_filters)
        print "Have %s potentially reusable tasks" % (len(reusable_tasks))

    for interval in intervals:
        interval_str = ' '.join(interval)
        print "Creating task to process interval: [%s]" % interval_str
        new_task_params = arvados.current_task()['parameters']
        new_task_params['interval'] = interval_str
        if reuse_tasks:
            task = create_or_reuse_task(if_sequence + 1, new_task_params, reusable_tasks, task_key_params, validate_task_output)
        else:
            task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success':True}
                                         ).execute()
        exit(0)
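A hypothetical sketch (not from the source) of how a sequence-1 worker might turn the space-separated 'interval' parameter set above back into per-region command-line arguments, e.g. GATK -L flags.
import arvados

this_task = arvados.current_task()
interval_args = []
for interval in this_task['parameters']['interval'].split(' '):
    # Each entry looks like "chrom:start-end", as built above.
    interval_args += ['-L', interval]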
def one_task_per_group_combined_inputs(ref_input, job_input, interval_lists, group_by_regex, if_sequence=0, and_end_task=True, create_task_func=create_task):
    """
    Queue one task for each group of gVCFs and corresponding interval_list
    in the inputs_collection, with grouping based on:
      - the stream in which the gVCFs are held within the collection
      - the value of the named capture group "group_by" in the
        group_by_regex against the filename in the inputs_collection

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    group_by_r = re.compile(group_by_regex)

    # prepare interval_lists
    il_cr = arvados.CollectionReader(interval_lists)
    il_ignored_files = []
    interval_list_by_group = {}
    for s in il_cr.all_streams():
        for f in s.all_files():
            m = re.search(group_by_r, f.name())
            if m:
                group_name = m.group('group_by')
                interval_list_m = re.search(r'\.interval_list', f.name())
                if interval_list_m:
                    if group_name not in interval_list_by_group:
                        interval_list_by_group[group_name] = dict()
                    interval_list_by_group[group_name][s.name(), f.name()] = f
                    continue
            # if we make it this far, we have files that we are ignoring
            il_ignored_files.append("%s/%s" % (s.name(), f.name()))

    # prepare gVCF input collections
    cr = arvados.CollectionReader(job_input)
    ignored_files = []
    last_stream_name = ""
    gvcf_by_group = {}
    gvcf_indices = {}
    for s in sorted(cr.all_streams(), key=lambda stream: stream.name()):
        stream_name = s.name()
        # handle each stream name separately
        if stream_name != last_stream_name:
            if last_stream_name != "":
                print "Done processing files in stream %s" % last_stream_name
                one_task_per_gvcf_group_in_stream_combined_inputs(last_stream_name, gvcf_by_group, gvcf_indices, interval_list_by_group, if_sequence, ref_input, create_task_func=create_task_func)
                # now that we are done with last_stream_name, reinitialise dicts to
                # process data from new stream
                print "Processing files in stream %s" % stream_name
                gvcf_by_group = {}
                gvcf_indices = {}
            last_stream_name = stream_name

        # loop over all the files in this stream (there may be only one)
        for f in s.all_files():
            if re.search(r'\.tbi$', f.name()):
                gvcf_indices[s.name(), f.name()] = f
                continue
            m = re.search(group_by_r, f.name())
            if m:
                group_name = m.group('group_by')
                gvcf_m = re.search(r'\.vcf\.gz$', f.name())
                if gvcf_m:
                    if group_name not in gvcf_by_group:
                        gvcf_by_group[group_name] = dict()
                    gvcf_by_group[group_name][s.name(), f.name()] = f
                    continue
                interval_list_m = re.search(r'\.interval_list', f.name())
                if interval_list_m:
                    if group_name not in interval_list_by_group:
                        interval_list_by_group[group_name] = dict()
                    if (s.name(), f.name()) in interval_list_by_group[group_name]:
                        if interval_list_by_group[group_name][s.name(), f.name()].as_manifest() != f.as_manifest():
                            raise errors.InvalidArgumentError("Already have interval_list for group %s file %s/%s, but manifests are not identical!" % (group_name, s.name(), f.name()))
                    else:
                        interval_list_by_group[group_name][s.name(), f.name()] = f
                    continue
            # if we make it this far, we have files that we are ignoring
            ignored_files.append("%s/%s" % (s.name(), f.name()))
    # finally, process the last stream
    print "Processing last stream"
    one_task_per_gvcf_group_in_stream_combined_inputs(stream_name, gvcf_by_group, gvcf_indices, interval_list_by_group, if_sequence, ref_input, create_task_func=create_task_func)

    # report on any ignored files
    if len(ignored_files) > 0:
        print "WARNING: ignored non-matching files in inputs_collection: %s" % (' '.join(ignored_files))
        # TODO: could use `setmedian` from https://github.com/ztane/python-Levenshtein
        # to print most representative "median" filename (i.e. skipped 15 files like median), then compare the
        # rest of the files to that median (perhaps with `ratio`)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success':True}
                                         ).execute()
        exit(0)
def chunked_tasks_per_cram_file(ref_input, job_input, interval_lists, validate_task_output,
                                if_sequence=0, and_end_task=True,
                                reuse_tasks=True, reuse_tasks_retrieve_all=True,
                                interval_list_param="interval_list",
                                oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
                                script=arvados.current_job()['script']):
    """
    Queue one task for each cram file in this job's input collection.
    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.
    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.
    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # prepare interval lists
    cr = arvados.CollectionReader(interval_lists)
    chunk_interval_list = {}
    chunk_input_pdh_names = []
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.interval_list$', f.name()):
                chunk_interval_list[s.name(), f.name()] = f
    for ((s_name, f_name), chunk_interval_list_f) in sorted(chunk_interval_list.items()):
        chunk_input = chunk_interval_list_f.as_manifest()
        try:
            r = arvados.api().collections().create(body={"manifest_text": chunk_input}).execute()
            chunk_input_pdh = r["portable_data_hash"]
            chunk_input_name = os.path.join(s_name, f_name)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        except:
            raise

    if len(chunk_input_pdh_names) == 0:
        raise errors.InvalidArgumentError("No interval_list files found in %s" % (interval_lists))

    # prepare CRAM input collections
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        crai_f = crai.get((s_name, re.sub(r'cram$', 'crai', f_name)),
                          crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)),
                                   None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise errors.InvalidArgumentError("No correponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={"manifest_text": task_input}).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        if reuse_tasks:
            task_key_params=['input', 'ref', 'chunk']
            # get candidates for task reuse
            job_filters = [
                ['script', '=', script],
                ['repository', '=', arvados.current_job()['repository']],
                ['script_version', 'in git', oldest_git_commit_to_reuse],
                ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
            ]
            if reuse_tasks_retrieve_all:
                # retrieve a full set of all possible reusable tasks
                reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params, job_filters)
                print "Have %s tasks for potential reuse" % (len(reusable_tasks))
            else:
                reusable_task_jobs = get_jobs_for_task_reuse(job_filters)
                print "Have %s jobs for potential task reuse" % (len(reusable_task_jobs))
                reusable_task_job_uuids = [job['uuid'] for job in reusable_task_jobs['items']]

        for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
            # Create task for each CRAM / chunk
            new_task_params = {
                'input': task_input_pdh,
                'ref': ref_input,
                'chunk': chunk_input_pdh
            }
            print "Creating new task to process %s with chunk interval %s " % (f_name, chunk_input_name)
            if reuse_tasks:
                if reuse_tasks_retrieve_all:
                    task = create_or_reuse_task(if_sequence + 1, new_task_params, reusable_tasks, task_key_params, validate_task_output)
                else:
                    task = create_or_reuse_task_from_jobs(if_sequence + 1, new_task_params, reusable_task_job_uuids, task_key_params, validate_task_output)
            else:
                task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success':True}
                                         ).execute()
        exit(0)
Example #39
    lsinfo = sp.check_output(["ls", "-lahR"])
    of.write("directory structure:\n" + lsinfo)

    dfinfo = sp.check_output(["df", "-h"])
    of.write("df:\n" + dfinfo)

    meminfo = sp.check_output(["free", "-hm"])
    of.write("mem:\n" + meminfo)

    hostinfo = sp.check_output(["hostname"])
    of.write("host: " + hostinfo)


job = arv.current_job()
task = arv.current_task()

of = arv.CollectionWriter()
of.set_current_file_name("info.log")

whoinfo = sp.check_output(["whoami"])
of.write("user: "******"\n")

pwdinfo = sp.check_output(["pwd"])
of.write("pwd: " + pwdinfo + "\n")

lsinfo = sp.check_output(["ls", "-lahR"])
of.write("directory structure:\n" + lsinfo)

dfinfo = sp.check_output(["df", "-h"])
of.write("df:\n" + dfinfo + "\n")
#!/usr/bin/python
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import arvados
import subprocess
import crunchutil.subst as subst
import shutil
import os
import sys
import time

if len(arvados.current_task()['parameters']) > 0:
    p = arvados.current_task()['parameters']
else:
    p = arvados.current_job()['script_parameters']

t = arvados.current_task().tmpdir

os.unlink("/usr/local/share/bcbio-nextgen/galaxy")
os.mkdir("/usr/local/share/bcbio-nextgen/galaxy")
shutil.copy("/usr/local/share/bcbio-nextgen/config/bcbio_system.yaml",
            "/usr/local/share/bcbio-nextgen/galaxy")

with open("/usr/local/share/bcbio-nextgen/galaxy/tool_data_table_conf.xml",
          "w") as f:
    f.write('''<tables>
    <!-- Locations of indexes in the BWA mapper format -->
    <table name="bwa_indexes" comment_char="#">
        <columns>value, dbkey, name, path</columns>
def one_task_per_cram_file(if_sequence=0, and_end_task=True, 
                           skip_sq_sn_regex='_decoy$', 
                           genome_chunks=200):
    """
    Queue one task for each cram file in this job's input collection.
    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.
    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.
    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # setup multiprocessing pool
    pool_processes = cpu_count() - 1
    print 'Using %d processes to submit tasks\n' % pool_processes
    pool = Pool(processes=pool_processes)

    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    # Ensure we have a .fa reference file with corresponding .fai index and .dict
    reference_coll = arvados.current_job()['script_parameters']['reference_collection']
    rcr = arvados.CollectionReader(reference_coll)
    ref_fasta = {}
    ref_fai = {}
    ref_dict = {}
    ref_input = None
    dict_reader = None
    for rs in rcr.all_streams():
        for rf in rs.all_files():
            if re.search(r'\.fa$', rf.name()):
                ref_fasta[rs.name(), rf.name()] = rf
            elif re.search(r'\.fai$', rf.name()):
                ref_fai[rs.name(), rf.name()] = rf
            elif re.search(r'\.dict$', rf.name()):
                ref_dict[rs.name(), rf.name()] = rf
    for ((s_name, f_name), fasta_f) in ref_fasta.items():
        fai_f = ref_fai.get((s_name, re.sub(r'fa$', 'fai', f_name)), 
                            ref_fai.get((s_name, re.sub(r'fa$', 'fa.fai', f_name)), 
                                        None))
        dict_f = ref_dict.get((s_name, re.sub(r'fa$', 'dict', f_name)), 
                              ref_dict.get((s_name, re.sub(r'fa$', 'fa.dict', f_name)), 
                                           None))
        if fasta_f and fai_f and dict_f:
            # found a set of all three! 
            ref_input = fasta_f.as_manifest()
            ref_input += fai_f.as_manifest()
            ref_input += dict_f.as_manifest()
            dict_reader = dict_f
            break
    if ref_input is None:
        raise InvalidArgumentError("Expected a reference fasta with fai and dict in reference_collection. Found [%s]" % ' '.join(rf.name() for rf in rs.all_files()))
    if dict_reader is None:
        raise InvalidArgumentError("Could not find .dict file in reference_collection. Found [%s]" % ' '.join(rf.name() for rf in rs.all_files()))

    # Create a portable data hash for the ref_input manifest
    try:
        r = arvados.api().collections().create(body={"manifest_text": ref_input}).execute()
        ref_input_pdh = r["portable_data_hash"]
    except:
        raise 

    # Load the dict data
    interval_header = ""
    dict_lines = dict_reader.readlines()
    dict_header = dict_lines.pop(0)
    if re.search(r'^@HD', dict_header) is None:
        raise InvalidArgumentError("Dict file in reference collection does not have correct header: [%s]" % dict_header)
    interval_header += dict_header
    print "Dict header is %s" % dict_header
    sn_intervals = dict()
    sns = []
    skip_sns = []
    total_len = 0
    for sq in dict_lines:
        if re.search(r'^@SQ', sq) is None:
            raise InvalidArgumentError("Dict file contains malformed SQ line: [%s]" % sq)
        interval_header += sq
        sn = None
        ln = None
        for tagval in sq.split("\t"):
            tv = tagval.split(":", 1)
            if tv[0] == "SN":
                sn = tv[1]
            if tv[0] == "LN":
                ln = tv[1]
            if sn and ln:
                break
        if not (sn and ln):
            raise InvalidArgumentError("Dict file SQ entry missing required SN and/or LN parameters: [%s]" % sq)
        assert(sn and ln)
        if sn_intervals.has_key(sn):
            raise InvalidArgumentError("Dict file has duplicate SQ entry for SN %s: [%s]" % (sn, sq))
        if skip_sq_sn_r.search(sn):
            skip_sns.append(sn)
            continue
        sn_intervals[sn] = (1, int(ln))
        sns.append(sn)
        total_len += int(ln)
    total_sequences = len(sns)

    print "Skipped %s SQs with SNs matching regex [%s]" % (len(skip_sns), skip_sq_sn_regex)

    # Chunk the genome into genome_chunks pieces
    # weighted by both number of base pairs and number of seqs
    print "Total sequences included: %s" % (total_sequences)
    print "Total genome length: %s" % (total_len)
    total_points = total_len + (total_sequences * weight_seq)
    chunk_points = int(total_points / genome_chunks)
    chunk_input_pdh_names = []
    print "Chunking genome into %s chunks of ~%s points" % (genome_chunks, chunk_points)
    for chunk_i in range(0, genome_chunks):
        chunk_num = chunk_i + 1
        chunk_intervals_count = 0
        chunk_input_name = dict_reader.name() + (".%s_of_%s.region_list" % (chunk_num, genome_chunks))
        print "Creating interval file for chunk %s" % chunk_num
        chunk_c = arvados.collection.CollectionWriter(num_retries=3)
        chunk_c.start_new_file(newfilename=chunk_input_name)
        # chunk_c.write(interval_header)
        remaining_points = chunk_points
        while len(sns) > 0:
            sn = sns.pop(0)
            remaining_points -= weight_seq
            if remaining_points <= 0:
                sns.insert(0, sn)
                break
            if not sn_intervals.has_key(sn):
                raise ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end-start+1) > remaining_points:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_points + start - 1
                assert((end-start+1) <= remaining_points)
                sn_intervals[sn] = (end+1, real_end)
                sns.insert(0, sn)
            #interval = "%s\t%s\t%s\t+\t%s\n" % (sn, start, end, "interval_%s_of_%s_%s" % (chunk_num, genome_chunks, sn))
            interval = "%s\t%s\t%s\n" % (sn, start, end)
            remaining_points -= (end-start+1)
            chunk_c.write(interval)
            chunk_intervals_count += 1
            if remaining_points <= 0:
                break
        if chunk_intervals_count > 0:
            chunk_input_pdh = chunk_c.finish()
            print "Chunk intervals file %s saved as %s" % (chunk_input_name, chunk_input_pdh)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        else:
            print "WARNING: skipping empty intervals for %s" % chunk_input_name
    print "Have %s chunk collections: [%s]" % (len(chunk_input_pdh_names), ' '.join([x[0] for x in chunk_input_pdh_names]))

    # prepare CRAM input collections
    job_input = arvados.current_job()['script_parameters']['inputs_collection']
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        # Handle this CRAM file
        crai_f = crai.get((s_name, re.sub(r'cram$', 'crai', f_name)), 
                          crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), 
                                   None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise InvalidArgumentError("No correponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={"manifest_text": task_input}).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise 

        create_chunk_tasks(f_name, chunk_input_pdh_names, 
                           if_sequence, task_input_pdh, ref_input_pdh, chunk_input_pdh, 
                           pool=pool)

    print "Waiting for asynchronous requests to complete"
    pool.close()
    pool.join()

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success':True}
                                         ).execute()
        exit(0)
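The chunking above weights each chunk by base pairs plus a fixed number of points per sequence (weight_seq, a module-level constant that is not shown in this example). A self-contained sketch of that arithmetic, with a purely hypothetical weight_seq value:

weight_seq = 120000  # hypothetical; the real constant is defined elsewhere in the module

def points_per_chunk(sn_lengths, genome_chunks):
    # Total points = total genome length plus a fixed weight per sequence,
    # so many short sequences cost roughly as much as one long one.
    total_len = sum(sn_lengths.values())
    total_points = total_len + (len(sn_lengths) * weight_seq)
    return int(total_points / genome_chunks)

# e.g. two toy sequences split into 4 chunks
print points_per_chunk({'chr1': 1000000, 'chr2': 400000}, 4)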
Example #42
0
def one_task_per_interval(
        interval_count,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=True,
        interval_list_param="interval_list",
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
        task_key_params=['name', 'inputs', 'interval', 'ref'],
        script=arvados.current_job()['script']):
    """
    Queue one task for each of interval_count intervals, splitting
    the genome chunk (described by the .interval_list file) evenly.

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(
        interval_list_param=interval_list_param)

    interval_reader = open(interval_list_file, mode="r")

    lines = interval_reader.readlines()
    sn_intervals = dict()
    sns = []
    total_len = 0
    for line in lines:
        if line[0] == '@':
            # skip all lines starting with '@'
            continue
        fields = line.split("\t")
        if len(fields) != 5:
            raise errors.InvalidArgumentError(
                "interval_list %s has invalid line [%s] - expected 5 fields but got %s"
                % (interval_list_file, line, len(fields)))
        sn = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        length = int(end) - int(start) + 1
        total_len += int(length)
        sn_intervals[sn] = (start, end)
        sns.append(sn)

    print "Total chunk length is %s" % total_len
    interval_len = int(total_len / interval_count)
    intervals = []
    print "Splitting chunk into %s intervals of size ~%s" % (interval_count,
                                                             interval_len)
    for interval_i in range(0, interval_count):
        interval_num = interval_i + 1
        intervals_count = 0
        remaining_len = interval_len
        interval = []
        while len(sns) > 0:
            sn = sns.pop(0)
            if not sn_intervals.has_key(sn):
                raise ValueError(
                    "sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_len:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_len + start - 1
                assert ((end - start + 1) <= remaining_len)
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            interval.append("%s:%s-%s" % (sn, start, end))
            remaining_len -= (end - start + 1)
            intervals_count += 1
            if remaining_len <= 0:
                break
        if intervals_count > 0:
            intervals.append(interval)
        else:
            print "WARNING: skipping empty intervals for %s" % interval_input_name
    print "Have %s intervals" % (len(intervals))

    if reuse_tasks:
        # get candidates for task reuse
        job_filters = [
            ['script', '=', script],
            ['repository', '=',
             arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            [
                'docker_image_locator', 'in docker',
                arvados.current_job()['docker_image_locator']
            ],
        ]
        reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params,
                                            job_filters)
        print "Have %s potentially reusable tasks" % (len(reusable_tasks))

    for interval in intervals:
        interval_str = ' '.join(interval)
        print "Creating task to process interval: [%s]" % interval_str
        new_task_params = arvados.current_task()['parameters']
        new_task_params['interval'] = interval_str
        if reuse_tasks:
            task = create_or_reuse_task(if_sequence + 1, new_task_params,
                                        reusable_tasks, task_key_params,
                                        validate_task_output)
        else:
            task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
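one_task_per_interval expects a validate_task_output callable, which is not shown in this example. A deliberately loose sketch, assuming a valid output just means the collection is readable and contains at least one file:

def validate_task_output(output_locator):
    # A real validator would inspect file names and contents; this one
    # only checks that the collection can be read and is not empty.
    try:
        cr = arvados.CollectionReader(output_locator)
    except Exception as e:
        print "WARNING: could not read output collection %s: %s" % (output_locator, e)
        return False
    for s in cr.all_streams():
        for f in s.all_files():
            return True
    return False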
Example #43
0
#!/usr/bin/python

import arvados
import subprocess
import subst
import shutil
import os
import sys

if len(arvados.current_task()["parameters"]) > 0:
    p = arvados.current_task()["parameters"]
else:
    p = arvados.current_job()["script_parameters"]

t = arvados.current_task().tmpdir

os.unlink("/usr/local/share/bcbio-nextgen/galaxy")
os.mkdir("/usr/local/share/bcbio-nextgen/galaxy")
shutil.copy("/usr/local/share/bcbio-nextgen/config/bcbio_system.yaml", "/usr/local/share/bcbio-nextgen/galaxy")

with open("/usr/local/share/bcbio-nextgen/galaxy/tool_data_table_conf.xml", "w") as f:
    f.write(
        """<tables>
    <!-- Locations of indexes in the BWA mapper format -->
    <table name="bwa_indexes" comment_char="#">
        <columns>value, dbkey, name, path</columns>
        <file path="tool-data/bwa_index.loc" />
    </table>
    <!-- Locations of indexes in the Bowtie2 mapper format -->
    <table name="bowtie2_indexes" comment_char="#">
        <columns>value, dbkey, name, path</columns>
Example #44
0
def one_task_per_group_combined_inputs(ref_input,
                                       job_input,
                                       interval_lists,
                                       group_by_regex,
                                       if_sequence=0,
                                       and_end_task=True,
                                       create_task_func=create_task):
    """
    Queue one task for each group of gVCFs and corresponding interval_list
    in the inputs_collection, with grouping based on two things:
      - the stream in which the gVCFs are held within the collection
      - the value of the named capture group "group_by" in the
        group_by_regex against the filename in the inputs_collection

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    group_by_r = re.compile(group_by_regex)

    # prepare interval_lists
    il_cr = arvados.CollectionReader(interval_lists)
    il_ignored_files = []
    interval_list_by_group = {}
    for s in il_cr.all_streams():
        for f in s.all_files():
            m = re.search(group_by_r, f.name())
            if m:
                group_name = m.group('group_by')
                interval_list_m = re.search(r'\.interval_list', f.name())
                if interval_list_m:
                    if group_name not in interval_list_by_group:
                        interval_list_by_group[group_name] = dict()
                    interval_list_by_group[group_name][s.name(), f.name()] = f
                    continue
            # if we make it this far, we have files that we are ignoring
            il_ignored_files.append("%s/%s" % (s.name(), f.name()))

    # prepare gVCF input collections
    cr = arvados.CollectionReader(job_input)
    ignored_files = []
    last_stream_name = ""
    gvcf_by_group = {}
    gvcf_indices = {}
    for s in sorted(cr.all_streams(), key=lambda stream: stream.name()):
        stream_name = s.name()
        # handle each stream name separately
        if stream_name != last_stream_name:
            if last_stream_name != "":
                print "Done processing files in stream %s" % last_stream_name
                one_task_per_gvcf_group_in_stream_combined_inputs(
                    last_stream_name,
                    gvcf_by_group,
                    gvcf_indices,
                    interval_list_by_group,
                    if_sequence,
                    ref_input,
                    create_task_func=create_task_func)
                # now that we are done with last_stream_name, reinitialise dicts to
                # process data from new stream
                print "Processing files in stream %s" % stream_name
                gvcf_by_group = {}
                gvcf_indices = {}
            last_stream_name = stream_name

        # loop over all the files in this stream (there may be only one)
        for f in s.all_files():
            if re.search(r'\.tbi$', f.name()):
                gvcf_indices[s.name(), f.name()] = f
                continue
            m = re.search(group_by_r, f.name())
            if m:
                group_name = m.group('group_by')
                gvcf_m = re.search(r'\.vcf\.gz$', f.name())
                if gvcf_m:
                    if group_name not in gvcf_by_group:
                        gvcf_by_group[group_name] = dict()
                    gvcf_by_group[group_name][s.name(), f.name()] = f
                    continue
                interval_list_m = re.search(r'\.interval_list', f.name())
                if interval_list_m:
                    if group_name not in interval_list_by_group:
                        interval_list_by_group[group_name] = dict()
                    if (s.name(),
                            f.name()) in interval_list_by_group[group_name]:
                        if interval_list_by_group[group_name][s.name(
                        ), f.name()].as_manifest() != f.as_manifest():
                            raise errors.InvalidArgumentError(
                                "Already have interval_list for group %s file %s/%s, but manifests are not identical!"
                                % (group_name, s.name(), f.name()))
                    else:
                        interval_list_by_group[group_name][s.name(),
                                                           f.name()] = f
                    continue
            # if we make it this far, we have files that we are ignoring
            ignored_files.append("%s/%s" % (s.name(), f.name()))
    # finally, process the last stream
    print "Processing last stream"
    one_task_per_gvcf_group_in_stream_combined_inputs(
        stream_name,
        gvcf_by_group,
        gvcf_indices,
        interval_list_by_group,
        if_sequence,
        ref_input,
        create_task_func=create_task_func)

    # report on any ignored files
    if len(ignored_files) > 0:
        print "WARNING: ignored non-matching files in inputs_collection: %s" % (
            ' '.join(ignored_files))
        # TODO: could use `setmedian` from https://github.com/ztane/python-Levenshtein
        # to print most representative "median" filename (i.e. skipped 15 files like median), then compare the
        # rest of the files to that median (perhaps with `ratio`)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
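The grouping above relies entirely on group_by_regex containing a named capture group called group_by. An illustrative regex (not one used by any particular job) that puts a gVCF, its index and its interval_list into the same group:

import re

group_by_regex = r'^(?P<group_by>.+)\.(vcf\.gz|vcf\.gz\.tbi|interval_list)$'
for filename in ("NA12878.20.vcf.gz",
                 "NA12878.20.vcf.gz.tbi",
                 "NA12878.20.interval_list"):
    m = re.search(group_by_regex, filename)
    if m:
        # all three files map to group "NA12878.20"
        print "%s -> group %s" % (filename, m.group('group_by'))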
Example #45
0
def chunked_tasks_per_cram_file(
        ref_input,
        job_input,
        interval_lists,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=True,
        reuse_tasks_retrieve_all=True,
        interval_list_param="interval_list",
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
        script=arvados.current_job()['script']):
    """
    Queue one task for each cram file in this job's input collection.
    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.
    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.
    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # prepare interval lists
    cr = arvados.CollectionReader(interval_lists)
    chunk_interval_list = {}
    chunk_input_pdh_names = []
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.interval_list$', f.name()):
                chunk_interval_list[s.name(), f.name()] = f
    for ((s_name, f_name),
         chunk_interval_list_f) in sorted(chunk_interval_list.items()):
        chunk_input = chunk_interval_list_f.as_manifest()
        try:
            r = arvados.api().collections().create(body={
                "manifest_text": chunk_input
            }).execute()
            chunk_input_pdh = r["portable_data_hash"]
            chunk_input_name = os.path.join(s_name, f_name)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        except:
            raise

    if len(chunk_input_pdh_names) == 0:
        raise errors.InvalidArgumentError(
            "No interval_list files found in %s" % (interval_lists))

    # prepare CRAM input collections
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        crai_f = crai.get(
            (s_name, re.sub(r'cram$', 'crai', f_name)),
            crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise errors.InvalidArgumentError(
                "No correponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={
                "manifest_text": task_input
            }).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        if reuse_tasks:
            task_key_params = ['input', 'ref', 'chunk']
            # get candidates for task reuse
            job_filters = [
                ['script', '=', script],
                ['repository', '=',
                 arvados.current_job()['repository']],
                ['script_version', 'in git', oldest_git_commit_to_reuse],
                [
                    'docker_image_locator', 'in docker',
                    arvados.current_job()['docker_image_locator']
                ],
            ]
            if reuse_tasks_retrieve_all:
                # retrieve a full set of all possible reusable tasks
                reusable_tasks = get_reusable_tasks(if_sequence + 1,
                                                    task_key_params,
                                                    job_filters)
                print "Have %s tasks for potential reuse" % (
                    len(reusable_tasks))
            else:
                reusable_task_jobs = get_jobs_for_task_reuse(job_filters)
                print "Have %s jobs for potential task reuse" % (
                    len(reusable_task_jobs))
                reusable_task_job_uuids = [
                    job['uuid'] for job in reusable_task_jobs['items']
                ]

        for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
            # Create task for each CRAM / chunk
            new_task_params = {
                'input': task_input_pdh,
                'ref': ref_input,
                'chunk': chunk_input_pdh
            }
            print "Creating new task to process %s with chunk interval %s " % (
                f_name, chunk_input_name)
            if reuse_tasks:
                if reuse_tasks_retrieve_all:
                    task = create_or_reuse_task(if_sequence + 1,
                                                new_task_params,
                                                reusable_tasks,
                                                task_key_params,
                                                validate_task_output)
                else:
                    task = create_or_reuse_task_from_jobs(
                        if_sequence + 1, new_task_params,
                        reusable_task_job_uuids, task_key_params,
                        validate_task_output)
            else:
                task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
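get_reusable_tasks and get_jobs_for_task_reuse are referenced above but not included in the example. A minimal sketch of the latter, assuming it simply lists jobs matching the filters via the jobs API (the real helper presumably also restricts the result to jobs that completed successfully):

def get_jobs_for_task_reuse(job_filters):
    # Returns the API list response; callers above read its 'items'.
    return arvados.api().jobs().list(filters=job_filters).execute()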
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    if arvados.current_task()['sequence'] == 0:
        # get candidates for task reuse
        task_key_params=['inputs', 'ref', 'name'] # N.B. inputs collection includes input vcfs and corresponding interval_list
        script="gatk-genotypegvcfs.py"
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2'
        job_filters = [
            ['script', '=', script],
            ['repository', '=', arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
        ]

        # retrieve a full set of all possible reusable tasks at sequence 1
        print "Retrieving all potentially reusable tasks"
        reusable_tasks = hgi_arvados.get_reusable_tasks(1, task_key_params, job_filters)
        print "Have %s tasks for potential reuse" % (len(reusable_tasks))

        def create_task_with_validated_reuse(sequence, params):
            return hgi_arvados.create_or_reuse_task(sequence, params, reusable_tasks, task_key_params, validate_task_output)

        # Setup sub tasks (and terminate if this is task 0)
        hgi_arvados.one_task_per_group_combined_inputs(ref_input_pdh, job_input_pdh, interval_lists_pdh,
                                                       group_by_regex,
                                                       if_sequence=0, and_end_task=True,
                                                       create_task_func=create_task_with_validated_reuse)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert(this_task['sequence'] > 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Genotype gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="inputs")
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    out_file = name + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # GenotypeGVCFs!
    gatk_exit = gatk.genotype_gvcfs(ref_file, interval_list_file, gvcf_files, os.path.join(out_dir, out_file), cores="32", java_mem="200g")

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success':False}
                                         ).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success':False}
                                             ).execute()
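hgi_arvados.prepare_out_dir is not shown here; judging from the inline out_dir handling that appears later in this file, a sketch of it would create a fresh 'out' directory under the task tmpdir, moving any existing one aside:

def prepare_out_dir():
    out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
    if os.path.exists(out_dir):
        # move a stale out_dir out of the way rather than reuse it
        os.rename(out_dir, out_dir + ".old")
    os.mkdir(out_dir)
    return out_dir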
def main():
    signal(SIGINT, sigint_handler)
    signal(SIGTERM, sigterm_handler)

    this_job = arvados.current_job()

    skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex']

    genome_chunks = int(this_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0,
                           and_end_task=True,
                           skip_sq_sn_regex=skip_sq_sn_regex,
                           genome_chunks=genome_chunks)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError(
            "No reference fasta found in reference collection.")

    # Ensure we can read the reference fasta
    test_and_prime_input_file(
        ref_file,
        error_exception=FileAccessError("reference fasta not readable: %s" %
                                        ref_file))

    # Ensure we have corresponding .fai, and that it is also readable
    ref_fai_file = ref_file + ".fai"
    test_and_prime_input_file(
        ref_fai_file,
        error_exception=FileAccessError(
            "reference fai index not readable: %s" % ref_fai_file))

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')

    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError(
            "No chunk intervals file found in chunk collection.")
    # Ensure we can read the chunk file
    test_and_prime_input_file(
        chunk_file,
        error_exception=FileAccessError(
            "Chunk intervals file not readable: %s" % chunk_file))

    # Get single CRAM file for this task
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    stream_name = ""
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    test_and_prime_input_file(cram_file,
                              error_exception=FileAccessError(
                                  "CRAM file not readable: %s" % cram_file))

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert (cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not test_and_prime_input_file(crai_file, error_exception=None):
        crai_file = cram_file_base + ".cram.crai"
        if not test_and_prime_input_file(crai_file, error_exception=None):
            raise FileAccessError(
                "No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    tmp_dir = arvados.current_task().tmpdir
    out_dir = os.path.join(tmp_dir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir,
                                                            old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    output_basename = os.path.basename(
        cram_file_base) + "." + os.path.basename(chunk_file)
    out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp")
    penultimate_out_file = os.path.join(
        tmp_dir, output_basename + ".provheader.g.vcf.gz")
    final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz")

    #    bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view  -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)
    regions = []
    print "Preparing region list from chunk file [%s]" % chunk_file
    with open(chunk_file, 'r') as f:
        for line in f.readlines():
            (chr, start, end) = line.rstrip().split()
            region = "%s:%s-%s" % (chr, start, end)
            regions.append(region)
    total_region_count = len(regions)

    print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count

    concat_noheader_fifos = dict()
    concat_headeronly_tmps = dict()
    current_region_num = 0
    for region in regions:
        current_region_num += 1
        concat_noheader_fifo = os.path.join(
            tmp_dir,
            output_basename + (".part_%s_of_%s.g.vcf" %
                               (current_region_num, total_region_count)))
        try:
            os.mkfifo(concat_noheader_fifo, 0600)
        except:
            print "ERROR: could not mkfifo %s" % concat_noheader_fifo
            raise
        fifos_to_delete.append(concat_noheader_fifo)
        concat_noheader_fifos[region] = concat_noheader_fifo
        concat_headeronly_tmp = os.path.join(
            tmp_dir,
            output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" %
                               (current_region_num, total_region_count)))
        concat_headeronly_tmps[region] = concat_headeronly_tmp

    region_concat_cmd = ["cat"]
    region_concat_cmd.extend(
        [concat_noheader_fifos[region] for region in regions])

    # open file for output file
    out_file_tmp_f = open(out_file_tmp, 'wb')

    region_concat_p = run_child_cmd(region_concat_cmd,
                                    stdout=out_file_tmp_f,
                                    tag="bcftools concat (stderr)")

    current_region_num = 0
    current_concat_noheader_fifo_f = None
    regions_to_process = list(regions)
    bcftools_mpileup_p = None
    bcftools_norm_p = None
    part_tee_p = None
    bcftools_view_headeronly_p = None
    bcftools_view_noheader_p = None
    while True:
        # at least one of the regional aggregation processes is still running

        watch_fds_and_print_output()

        if ((bcftools_mpileup_p is None) and (bcftools_norm_p is None)
                and (part_tee_p is None)
                and (bcftools_view_headeronly_p is None)
                and (bcftools_view_noheader_p is None)):
            # no per-region processes are running (they have finished or
            # have not yet started)
            if len(regions_to_process) > 0:
                # have more regions to run
                region = regions_to_process.pop(0)
                current_region_num += 1
                region_label = "%s/%s [%s]" % (current_region_num,
                                               total_region_count, region)
                concat_noheader_fifo = concat_noheader_fifos[region]
                bcftools_view_noheader_input_fifo = os.path.join(
                    tmp_dir, output_basename +
                    (".part_%s_of_%s.noheader.g.bcf" %
                     (current_region_num, total_region_count)))
                part_tee_cmd = [
                    "teepot", bcftools_view_noheader_input_fifo, "-"
                ]
                bcftools_view_noheader_cmd = [
                    "bcftools", "view", "-H", "-Ov",
                    bcftools_view_noheader_input_fifo
                ]
                concat_headeronly_tmp = concat_headeronly_tmps[region]
                bcftools_view_headeronly_cmd = [
                    "bcftools", "view", "-h", "-Oz", "-o",
                    concat_headeronly_tmp
                ]
                bcftools_norm_cmd = ["bcftools", "norm", "-f", ref_file, "-Ou"]
                bcftools_mpileup_cmd = [
                    "bcftools-gvcf", "mpileup", "-t", "AD,INFO/AD", "-C50",
                    "-pm2", "-F0.1", "-d10000", "--gvcf", "1,2,3,4,5,10,15",
                    "-f", ref_file, "-Ou", "-r", region, cram_file
                ]

                print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % (
                    region_label)
                bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe(
                )

                print "Creating 'bcftools norm | tee' pipe for region %s" % (
                    region_label)
                part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe()

                print "Creating 'tee | bcftools view -h' pipe for region %s" % (
                    region_label)
                bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe(
                )

                print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % (
                    bcftools_view_noheader_input_fifo, region_label)
                try:
                    os.mkfifo(bcftools_view_noheader_input_fifo, 0600)
                except:
                    print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo
                    raise
                fifos_to_delete.append(bcftools_view_noheader_input_fifo)

                print "Opening concat fifo %s for writing" % concat_noheader_fifo
                if current_concat_noheader_fifo_f is not None:
                    #print "ERROR: current_concat_noheader_fifo_f was not closed properly"
                    #raise Exception("current_concat_noheader_fifo_f was not closed properly")
                    current_concat_noheader_fifo_f.close()
                current_concat_noheader_fifo_f = open(concat_noheader_fifo,
                                                      'wb')

                bcftools_mpileup_p = run_child_cmd(
                    bcftools_mpileup_cmd,
                    stdout=bcftools_norm_stdin_pipe_write,
                    tag="bcftools mpileup %s" % (region_label))

                bcftools_norm_p = run_child_cmd(
                    bcftools_norm_cmd,
                    stdin=bcftools_norm_stdin_pipe_read,
                    stdout=part_tee_stdin_pipe_write,
                    tag="bcftools norm %s" % (region_label))

                part_tee_p = run_child_cmd(
                    part_tee_cmd,
                    stdin=part_tee_stdin_pipe_read,
                    stdout=bcftools_view_headeronly_stdin_pipe_write,
                    tag="tee %s" % (region_label))

                bcftools_view_headeronly_p = run_child_cmd(
                    bcftools_view_headeronly_cmd,
                    stdin=bcftools_view_headeronly_stdin_pipe_read,
                    tag="bcftools view -h %s" % (region_label))

                bcftools_view_noheader_p = run_child_cmd(
                    bcftools_view_noheader_cmd,
                    stdout=current_concat_noheader_fifo_f,
                    tag="bcftools view %s" % (region_label))

        bcftools_mpileup_p = close_process_if_finished(
            bcftools_mpileup_p,
            "bcftools mpileup %s" % (region_label),
            close_fds=[bcftools_norm_stdin_pipe_write])

        bcftools_norm_p = close_process_if_finished(
            bcftools_norm_p,
            "bcftools norm %s" % (region_label),
            close_fds=[
                bcftools_norm_stdin_pipe_read, part_tee_stdin_pipe_write
            ])

        part_tee_p = close_process_if_finished(
            part_tee_p,
            "tee %s" % (region_label),
            close_fds=[
                part_tee_stdin_pipe_read,
                bcftools_view_headeronly_stdin_pipe_write
            ],
            ignore_error=True)

        bcftools_view_headeronly_p = close_process_if_finished(
            bcftools_view_headeronly_p,
            "bcftools view -h %s" % (region_label),
            close_fds=[bcftools_view_headeronly_stdin_pipe_read])

        bcftools_view_noheader_p = close_process_if_finished(
            bcftools_view_noheader_p,
            "bcftools view %s" % (region_label),
            close_files=[current_concat_noheader_fifo_f])

        region_concat_p = close_process_if_finished(
            region_concat_p, "bcftools concat", close_files=[out_file_tmp_f])

        # end loop once all processes have finished
        if ((region_concat_p is None) and (bcftools_view_noheader_p is None)
                and (bcftools_view_headeronly_p is None)
                and (part_tee_p is None) and (bcftools_norm_p is None)
                and (bcftools_mpileup_p is None)):
            print "All region work has completed"
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if len(child_pids) > 0:
        print "WARNING: some children are still alive: [%s]" % (child_pids)
        for pid in child_pids:
            print "Attempting to terminate %s forcefully" % (pid)
            try:
                os.kill(pid, SIGTERM)
            except Exception as e:
                print "Could not kill pid %s: %s" % (pid, e)

    for fifo in fifos_to_delete:
        try:
            os.remove(fifo)
        except:
            raise

    concat_headeronly_tmp_fofn = os.path.join(tmp_dir,
                                              output_basename + ".fifos_fofn")
    tmp_files_to_delete = []
    print "Preparing fofn for bcftools concat (headeronly): %s" % (
        concat_headeronly_tmp_fofn)
    with open(concat_headeronly_tmp_fofn, 'w') as f:
        print "Checking files for regions: %s" % regions
        for concat_headeronly_tmp in [
                concat_headeronly_tmps[region] for region in regions
        ]:
            if os.path.exists(concat_headeronly_tmp):
                print "Adding %s to fofn" % concat_headeronly_tmp
                f.write("%s\n" % concat_headeronly_tmp)
                tmp_files_to_delete.append(concat_headeronly_tmp)
            else:
                print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp

    final_headeronly_tmp = os.path.join(tmp_dir,
                                        output_basename + ".headeronly.g.vcf")
    final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb')

    print "Creating 'bcftools concat | grep' pipe"
    grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe(
    )

    grep_headeronly_cmd = [
        "egrep", "-v", "^[#][#](bcftools|mpileup|reference)"
    ]
    grep_headeronly_p = run_child_cmd(grep_headeronly_cmd,
                                      stdin=grep_headeronly_stdin_pipe_read,
                                      stdout=final_headeronly_tmp_f,
                                      tag="grep (headeronly)")
    bcftools_concat_headeronly_cmd = [
        "bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn
    ]
    bcftools_concat_headeronly_p = run_child_cmd(
        bcftools_concat_headeronly_cmd,
        stdout=grep_headeronly_stdin_pipe_write,
        tag="bcftools concat (headeronly)")
    while True:
        watch_fds_and_print_output()
        bcftools_concat_headeronly_p = close_process_if_finished(
            bcftools_concat_headeronly_p,
            "bcftools concat (headeronly)",
            close_fds=[grep_headeronly_stdin_pipe_write])
        grep_headeronly_p = close_process_if_finished(
            grep_headeronly_p,
            "grep (headeronly)",
            close_fds=[grep_headeronly_stdin_pipe_read],
            close_files=[final_headeronly_tmp_f])
        if ((bcftools_concat_headeronly_p is None)
                and (grep_headeronly_p is None)):
            # none of the processes are still running, we're done!
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_concat_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate bcftools concat (headeronly)"

    if grep_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate grep (headeronly)"

    # check if there was any data output
    if os.stat(out_file_tmp)[6] == 0:
        # 0-byte data file, there is no point in concatenating and
        # reheader will reject the file, so we need to bgzip it ourselves
        print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % (
            final_out_file)
        final_out_file_f = open(final_out_file, 'wb')
        final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp]
        final_bgzip_p = run_child_cmd(final_bgzip_cmd,
                                      tag="final bgzip",
                                      stdout=final_out_file_f)
        while True:
            watch_fds_and_print_output()
            final_bgzip_p = close_process_if_finished(
                final_bgzip_p, "final bgzip", close_files=[final_out_file_f])
            if (final_bgzip_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bgzip_p is not None:
            print "ERROR: failed to cleanly terminate final bgzip (header with no data)"
    else:
        # there is some data in the data file
        print "Creating final 'cat | bcftools view -Oz' pipe"
        final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe(
        )
        print "Preparing penultimate output file [%s]" % (penultimate_out_file)
        final_bcftools_view_cmd = [
            "bcftools", "view", "-Oz", "-o", penultimate_out_file
        ]
        final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp]
        final_bcftools_view_p = run_child_cmd(
            final_bcftools_view_cmd,
            tag="final bcftools view -Oz",
            stdin=final_bcftools_view_stdin_pipe_read)
        final_concat_p = run_child_cmd(
            final_concat_cmd,
            tag="final cat (header+data)",
            stdout=final_bcftools_view_stdin_pipe_write)
        while True:
            watch_fds_and_print_output()
            final_bcftools_view_p = close_process_if_finished(
                final_bcftools_view_p,
                "final bcftools view -Oz",
                close_fds=[final_bcftools_view_stdin_pipe_read])
            final_concat_p = close_process_if_finished(
                final_concat_p,
                "final cat (header+data)",
                close_fds=[final_bcftools_view_stdin_pipe_write])
            if ((final_concat_p is None) and (final_bcftools_view_p is None)):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration

        if final_bcftools_view_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools view -Oz"

        if final_concat_p is not None:
            print "ERROR: failed to cleanly terminate final cat (header+data)"

        print "Reheadering penultimate output file into final out file [%s]" % (
            final_out_file)
        final_bcftools_reheader_cmd = [
            "bcftools", "reheader", "-h", final_headeronly_tmp, "-o",
            final_out_file, penultimate_out_file
        ]
        final_bcftools_reheader_p = run_child_cmd(
            final_bcftools_reheader_cmd, tag="final bcftools reheader")
        while True:
            watch_fds_and_print_output()
            final_bcftools_reheader_p = close_process_if_finished(
                final_bcftools_reheader_p, "final bcftools reheader")
            if (final_bcftools_reheader_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bcftools_reheader_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools reheader"
        os.remove(penultimate_out_file)

    print "Indexing final output file [%s]" % (final_out_file)
    bcftools_index_cmd = ["bcftools", "index", final_out_file]
    bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index")
    while True:
        watch_fds_and_print_output()
        bcftools_index_p = close_process_if_finished(bcftools_index_p,
                                                     "bcftools index")
        if (bcftools_index_p is None):
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_index_p is not None:
        print "ERROR: failed to cleanly terminate bcftools index"

    print "Complete, removing temporary files"
    os.remove(concat_headeronly_tmp_fofn)
    os.remove(out_file_tmp)
    os.remove(final_headeronly_tmp)
    for tmp_file in tmp_files_to_delete:
        os.remove(tmp_file)

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()
    print "Task output locator [%s]" % output_locator

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)

    # Done!
    print "Task complete!"
Example #48
0
#!/usr/bin/env python

# Import the hashlib module (part of the Python standard library) to compute md5.
import hashlib

# Import the Arvados sdk module
import arvados

# Get information about the task from the environment
this_task = arvados.current_task()

# Get the "input" field from "script_parameters" on the job creation object
this_job_input = arvados.getjobparam('input')

# Create an object to access the collection referred to in the input
collection = arvados.CollectionReader(this_job_input)

# Create an object to write a new collection as output
out = arvados.CollectionWriter()

# Set the name of output file within the collection
out.set_current_file_name("md5sum.txt")

# Get an iterator over the files listed in the collection
all_files = collection.all_files()

# Iterate over each file
for input_file in all_files:
    # Create the object that will actually compute the md5 hash
    digestor = hashlib.new('md5')
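    # A hedged continuation of this md5 flow (readall() is assumed from
    # the older Python SDK): stream each data block into the digestor,
    # then record the digest and filename in the output collection.
    for buf in input_file.readall():
        digestor.update(buf)
    out.write("%s %s\n" % (digestor.hexdigest(), input_file.name()))

# Commit the output collection to Keep and make it this task's output.
output_locator = out.finish()
this_task.set_output(output_locator)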
def one_task_per_cram_file(if_sequence=0,
                           and_end_task=True,
                           skip_sq_sn_regex='_decoy$',
                           genome_chunks=200):
    """
    Queue one task for each cram file in this job's input collection.
    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.
    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.
    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # setup multiprocessing pool
    pool_processes = cpu_count() - 1
    print 'Using %d processes to submit tasks\n' % pool_processes
    pool = Pool(processes=pool_processes)

    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    # Ensure we have a .fa reference file with corresponding .fai index and .dict
    reference_coll = arvados.current_job(
    )['script_parameters']['reference_collection']
    rcr = arvados.CollectionReader(reference_coll)
    ref_fasta = {}
    ref_fai = {}
    ref_dict = {}
    ref_input = None
    dict_reader = None
    for rs in rcr.all_streams():
        for rf in rs.all_files():
            if re.search(r'\.fa$', rf.name()):
                ref_fasta[rs.name(), rf.name()] = rf
            elif re.search(r'\.fai$', rf.name()):
                ref_fai[rs.name(), rf.name()] = rf
            elif re.search(r'\.dict$', rf.name()):
                ref_dict[rs.name(), rf.name()] = rf
    for ((s_name, f_name), fasta_f) in ref_fasta.items():
        fai_f = ref_fai.get(
            (s_name, re.sub(r'fa$', 'fai', f_name)),
            ref_fai.get((s_name, re.sub(r'fa$', 'fa.fai', f_name)), None))
        dict_f = ref_dict.get(
            (s_name, re.sub(r'fa$', 'dict', f_name)),
            ref_dict.get((s_name, re.sub(r'fa$', 'fa.dict', f_name)), None))
        if fasta_f and fai_f and dict_f:
            # found a set of all three!
            ref_input = fasta_f.as_manifest()
            ref_input += fai_f.as_manifest()
            ref_input += dict_f.as_manifest()
            dict_reader = dict_f
            break
    if ref_input is None:
        raise InvalidArgumentError(
            "Expected a reference fasta with fai and dict in reference_collection. Found [%s]"
            % ' '.join(rf.name() for rf in rs.all_files()))
    if dict_reader is None:
        raise InvalidArgumentError(
            "Could not find .dict file in reference_collection. Found [%s]" %
            ' '.join(rf.name() for rf in rs.all_files()))

    # Create a portable data hash for the ref_input manifest
    try:
        r = arvados.api().collections().create(body={
            "manifest_text": ref_input
        }).execute()
        ref_input_pdh = r["portable_data_hash"]
    except:
        raise

    # Load the dict data
    interval_header = ""
    dict_lines = dict_reader.readlines()
    dict_header = dict_lines.pop(0)
    if re.search(r'^@HD', dict_header) is None:
        raise InvalidArgumentError(
            "Dict file in reference collection does not have correct header: [%s]"
            % dict_header)
    interval_header += dict_header
    print "Dict header is %s" % dict_header
    sn_intervals = dict()
    sns = []
    skip_sns = []
    total_len = 0
    for sq in dict_lines:
        if re.search(r'^@SQ', sq) is None:
            raise InvalidArgumentError(
                "Dict file contains malformed SQ line: [%s]" % sq)
        interval_header += sq
        sn = None
        ln = None
        for tagval in sq.split("\t"):
            tv = tagval.split(":", 1)
            if tv[0] == "SN":
                sn = tv[1]
            if tv[0] == "LN":
                ln = tv[1]
            if sn and ln:
                break
        if not (sn and ln):
            raise InvalidArgumentError(
                "Dict file SQ entry missing required SN and/or LN parameters: [%s]"
                % sq)
        assert (sn and ln)
        if sn_intervals.has_key(sn):
            raise InvalidArgumentError(
                "Dict file has duplicate SQ entry for SN %s: [%s]" % (sn, sq))
        if skip_sq_sn_r.search(sn):
            skip_sns.append(sn)
            continue
        sn_intervals[sn] = (1, int(ln))
        sns.append(sn)
        total_len += int(ln)
    total_sequences = len(sns)

    print "Skipped %s SQs with SNs matching regex [%s]" % (len(skip_sns),
                                                           skip_sq_sn_regex)

    # Chunk the genome into genome_chunks pieces
    # weighted by both number of base pairs and number of seqs
    print "Total sequences included: %s" % (total_sequences)
    print "Total genome length: %s" % (total_len)
    total_points = total_len + (total_sequences * weight_seq)
    chunk_points = int(total_points / genome_chunks)
    chunk_input_pdh_names = []
    print "Chunking genome into %s chunks of ~%s points" % (genome_chunks,
                                                            chunk_points)
    for chunk_i in range(0, genome_chunks):
        chunk_num = chunk_i + 1
        chunk_intervals_count = 0
        chunk_input_name = dict_reader.name() + (".%s_of_%s.region_list" %
                                                 (chunk_num, genome_chunks))
        print "Creating interval file for chunk %s" % chunk_num
        chunk_c = arvados.collection.CollectionWriter(num_retries=3)
        chunk_c.start_new_file(newfilename=chunk_input_name)
        # chunk_c.write(interval_header)
        remaining_points = chunk_points
        while len(sns) > 0:
            sn = sns.pop(0)
            remaining_points -= weight_seq
            if remaining_points <= 0:
                sns.insert(0, sn)
                break
            if not sn_intervals.has_key(sn):
                raise ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_points:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_points + start - 1
                assert ((end - start + 1) <= remaining_points)
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            #interval = "%s\t%s\t%s\t+\t%s\n" % (sn, start, end, "interval_%s_of_%s_%s" % (chunk_num, genome_chunks, sn))
            interval = "%s\t%s\t%s\n" % (sn, start, end)
            remaining_points -= (end - start + 1)
            chunk_c.write(interval)
            chunk_intervals_count += 1
            if remaining_points <= 0:
                break
        if chunk_intervals_count > 0:
            chunk_input_pdh = chunk_c.finish()
            print "Chunk intervals file %s saved as %s" % (chunk_input_name,
                                                           chunk_input_pdh)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        else:
            print "WARNING: skipping empty intervals for %s" % chunk_input_name
    print "Have %s chunk collections: [%s]" % (len(
        chunk_input_pdh_names), ' '.join([x[0]
                                          for x in chunk_input_pdh_names]))

    # prepare CRAM input collections
    job_input = arvados.current_job()['script_parameters']['inputs_collection']
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        # Handle this CRAM file
        crai_f = crai.get(
            (s_name, re.sub(r'cram$', 'crai', f_name)),
            crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise InvalidArgumentError(
                "No correponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={
                "manifest_text": task_input
            }).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        create_chunk_tasks(f_name,
                           chunk_input_pdh_names,
                           if_sequence,
                           task_input_pdh,
                           ref_input_pdh,
                           chunk_input_pdh,
                           pool=pool)

    print "Waiting for asynchronous requests to complete"
    pool.close()
    pool.join()

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
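The chunking logic above (splitting the genome into genome_chunks interval lists, weighted by base pairs plus a fixed per-sequence cost) is easy to lose among the Keep plumbing. A standalone, illustrative sketch of just that weighting scheme (the function name is an assumption, and weight_seq is a module-level constant in the original whose value is not shown in this excerpt):

def chunk_sequences(sn_lengths, genome_chunks, weight_seq):
    # sn_lengths: ordered list of (sequence_name, length) pairs from the .dict file.
    # Returns up to genome_chunks lists of (name, start, end) 1-based intervals.
    queue = [(sn, 1, ln) for (sn, ln) in sn_lengths]
    total_points = sum(ln for (_, ln) in sn_lengths) + weight_seq * len(sn_lengths)
    chunk_points = int(total_points / genome_chunks)
    chunks = []
    for _ in range(genome_chunks):
        remaining = chunk_points
        intervals = []
        while queue:
            sn, start, end = queue.pop(0)
            remaining -= weight_seq
            if remaining <= 0:
                # no room left in this chunk even for the per-sequence cost
                queue.insert(0, (sn, start, end))
                break
            if (end - start + 1) > remaining:
                # not enough room for the whole sequence: split it
                split_end = start + remaining - 1
                queue.insert(0, (sn, split_end + 1, end))
                end = split_end
            remaining -= (end - start + 1)
            intervals.append((sn, start, end))
            if remaining <= 0:
                break
        if intervals:
            chunks.append(intervals)
    return chunks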
Example #50
0
  lsinfo = sp.check_output(["ls", "-lahR"])
  of.write("directory structure:\n" + lsinfo)

  dfinfo = sp.check_output(["df", "-h"])
  of.write("df:\n" + dfinfo)

  meminfo = sp.check_output(["free", "-hm"])
  of.write("mem:\n" + meminfo)

  hostinfo = sp.check_output(["hostname"])
  of.write("host: " + hostinfo)


job = arv.current_job()
task = arv.current_task()

of = arv.CollectionWriter()
of.set_current_file_name("info.log")

whoinfo = sp.check_output(["whoami"])
of.write("user: "******"\n" )

pwdinfo = sp.check_output(["pwd"])
of.write("pwd: " + pwdinfo + "\n" )

lsinfo = sp.check_output(["ls", "-lahR"])
of.write("directory structure:\n" + lsinfo)

dfinfo = sp.check_output(["df", "-h"])
of.write("df:\n" + dfinfo + "\n" )
def main():
    ################################################################################
    # Phase I: Check inputs and set up sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    # Set up sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh, job_input_pdh, interval_lists_pdh,
                                                   group_by_regex, max_gvcfs_to_combine,
                                                   if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert(this_task['sequence'] > 0)

    ################################################################################
    # Phase II: Read interval_list and split into additional intervals
    ################################################################################
    hgi_arvados.one_task_per_interval(interval_count, validate_task_output,
                                      reuse_tasks=True,
                                      oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1",
                                      if_sequence=1, and_end_task=True)

    # We will never reach this point if we are in the 1st task sequence
    assert(this_task['sequence'] > 1)

    ################################################################################
    # Phase IIIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIIb: Combine gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    interval_str = this_task['parameters'].get('interval')
    if not interval_str:
        interval_str = ""
    interval_strs = interval_str.split()
    intervals = []
    for interval in interval_strs:
        intervals.extend(["--intervals", interval])
    out_file = name + ".vcf.gz"
    if interval_count > 1:
        out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz"
        if len(out_file) > 255:
            out_file = name + "." + '_'.join([interval_strs[0], interval_strs[-1]]) + ".vcf.gz"
            print "Output file name was too long with full interval list, shortened it to: %s" % out_file
        if len(out_file) > 255:
            raise errors.InvalidArgumentError("Output file name is too long, cannot continue: %s" % out_file)

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # CombineGVCFs!
    extra_args = intervals
    extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"])
    gatk_exit = gatk.combine_gvcfs(ref_file, gvcf_files, os.path.join(out_dir, out_file), extra_gatk_args=extra_args)

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success':False}
                                         ).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success':False}
                                             ).execute()
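validate_task_output() is referenced here (and in the HaplotypeCaller example further down) but is not included in this excerpt. A minimal, hypothetical validator consistent with how it is called might just check that the output collection contains at least one non-empty VCF:

import re
import arvados

def validate_task_output(output_locator):
    # Hypothetical sketch: accept the output if it contains a non-empty .vcf.gz.
    cr = arvados.CollectionReader(output_locator)
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.vcf\.gz$', f.name()) and f.size() > 0:
                return True
    return False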
def main():
    signal(SIGINT, sigint_handler)
    signal(SIGTERM, sigterm_handler)
    
    this_job = arvados.current_job()
    
    skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex']

    genome_chunks = int(this_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Set up sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True, 
                           skip_sq_sn_regex=skip_sq_sn_regex, 
                           genome_chunks=genome_chunks)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert(this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError("No reference fasta found in reference collection.")

    # Ensure we can read the reference fasta
    test_and_prime_input_file(ref_file, error_exception=FileAccessError("reference fasta not readable: %s" % ref_file))

    # Ensure we have corresponding .fai, and that it is also readable
    ref_fai_file = ref_file + ".fai"
    test_and_prime_input_file(ref_fai_file, error_exception=FileAccessError("reference fai index not readable: %s" % ref_fai_file))

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')

    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError("No chunk intervals file found in chunk collection.")
    # Ensure we can read the chunk file
    test_and_prime_input_file(chunk_file, error_exception=FileAccessError("Chunk intervals file not readable: %s" % chunk_file))

    # Get single CRAM file for this task 
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    stream_name = ""
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    test_and_prime_input_file(cram_file, error_exception=FileAccessError("CRAM file not readable: %s" % cram_file))

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert(cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not test_and_prime_input_file(crai_file, error_exception=None):
        crai_file = cram_file_base + ".cram.crai"
        if not test_and_prime_input_file(crai_file, error_exception=None):
            raise FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file)


    # Will write to out_dir, make sure it is empty
    tmp_dir = arvados.current_task().tmpdir
    out_dir = os.path.join(tmp_dir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir) 
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    output_basename = os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file)
    out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp")
    penultimate_out_file = os.path.join(tmp_dir, output_basename + ".provheader.g.vcf.gz")
    final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz")

#    bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view  -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)
    regions = []
    print "Preparing region list from chunk file [%s]" % chunk_file
    with open(chunk_file, 'r') as f:
        for line in f.readlines():
            (chr, start, end) = line.rstrip().split()
            region = "%s:%s-%s" % (chr, start, end)
            regions.append(region)
    total_region_count = len(regions)

    print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count

    concat_noheader_fifos = dict()
    concat_headeronly_tmps = dict()
    current_region_num = 0
    for region in regions:
        current_region_num += 1
        concat_noheader_fifo = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.g.vcf" % (current_region_num, total_region_count)))
        try:
            os.mkfifo(concat_noheader_fifo, 0600)
        except:
            print "ERROR: could not mkfifo %s" % concat_noheader_fifo
            raise
        fifos_to_delete.append(concat_noheader_fifo)
        concat_noheader_fifos[region] = concat_noheader_fifo
        concat_headeronly_tmp = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" % (current_region_num, total_region_count)))
        concat_headeronly_tmps[region] = concat_headeronly_tmp

    region_concat_cmd = ["cat"]
    region_concat_cmd.extend([concat_noheader_fifos[region] for region in regions])

    # open file for output file
    out_file_tmp_f = open(out_file_tmp, 'wb')
    
    region_concat_p = run_child_cmd(region_concat_cmd,
                                    stdout=out_file_tmp_f,
                                    tag="bcftools concat (stderr)")
    
    current_region_num = 0
    current_concat_noheader_fifo_f = None
    regions_to_process = list(regions)
    bcftools_mpileup_p = None
    bcftools_norm_p = None
    part_tee_p = None
    bcftools_view_headeronly_p = None
    bcftools_view_noheader_p = None
    while True:
        # at least one of the regional aggregation processes is still running

        watch_fds_and_print_output()
    
        if (
                (bcftools_mpileup_p is None) and
                (bcftools_norm_p is None) and 
                (part_tee_p is None) and
                (bcftools_view_headeronly_p is None) and 
                (bcftools_view_noheader_p is None)
        ):
            # no per-region processes are running (they have finished or 
            # have not yet started)
            if len(regions_to_process) > 0:
                # have more regions to run
                region = regions_to_process.pop(0)
                current_region_num += 1
                region_label = "%s/%s [%s]" % (current_region_num, total_region_count, region)
                concat_noheader_fifo = concat_noheader_fifos[region]
                bcftools_view_noheader_input_fifo = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.noheader.g.bcf" % (current_region_num, total_region_count)))
                part_tee_cmd = ["teepot", bcftools_view_noheader_input_fifo, "-"]
                bcftools_view_noheader_cmd = ["bcftools", "view", "-H", "-Ov", bcftools_view_noheader_input_fifo]
                concat_headeronly_tmp = concat_headeronly_tmps[region]
                bcftools_view_headeronly_cmd = ["bcftools", "view", "-h", "-Oz", "-o", concat_headeronly_tmp]
                bcftools_norm_cmd = ["bcftools", "norm", 
                                     "-f", ref_file, 
                                     "-Ou"]
                bcftools_mpileup_cmd = ["bcftools-gvcf", "mpileup",
                                        "-t", "AD,INFO/AD",
                                        "-C50", 
                                        "-pm2", 
                                        "-F0.1",
                                        "-d10000",
                                        "--gvcf", "1,2,3,4,5,10,15",
                                        "-f", ref_file,
                                        "-Ou",
                                        "-r", region,
                                        cram_file]

                print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % (region_label)
                bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe()

                print "Creating 'bcftools norm | tee' pipe for region %s" % (region_label)
                part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe()
                
                print "Creating 'tee | bcftools view -h' pipe for region %s" % (region_label)
                bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe()
                
                print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % (bcftools_view_noheader_input_fifo, region_label)
                try:
                    os.mkfifo(bcftools_view_noheader_input_fifo, 0600)
                except:
                    print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo
                    raise
                fifos_to_delete.append(bcftools_view_noheader_input_fifo)

                print "Opening concat fifo %s for writing" % concat_noheader_fifo
                if current_concat_noheader_fifo_f is not None:
                    #print "ERROR: current_concat_noheader_fifo_f was not closed properly"
                    #raise Exception("current_concat_noheader_fifo_f was not closed properly")
                    current_concat_noheader_fifo_f.close()
                current_concat_noheader_fifo_f = open(concat_noheader_fifo, 'wb')

                bcftools_mpileup_p = run_child_cmd(bcftools_mpileup_cmd,
                                                   stdout=bcftools_norm_stdin_pipe_write,
                                                   tag="bcftools mpileup %s" % (region_label))
                
                bcftools_norm_p = run_child_cmd(bcftools_norm_cmd,
                                                stdin=bcftools_norm_stdin_pipe_read,
                                                stdout=part_tee_stdin_pipe_write,
                                                tag="bcftools norm %s" % (region_label))

                part_tee_p = run_child_cmd(part_tee_cmd,
                                           stdin=part_tee_stdin_pipe_read,
                                           stdout=bcftools_view_headeronly_stdin_pipe_write,
                                           tag="tee %s" % (region_label))
                
                bcftools_view_headeronly_p = run_child_cmd(bcftools_view_headeronly_cmd,
                                                           stdin=bcftools_view_headeronly_stdin_pipe_read,
                                                           tag="bcftools view -h %s" % (region_label))

                bcftools_view_noheader_p = run_child_cmd(bcftools_view_noheader_cmd,
                                                         stdout=current_concat_noheader_fifo_f,
                                                         tag="bcftools view %s" % (region_label))

        bcftools_mpileup_p = close_process_if_finished(bcftools_mpileup_p,
                                                       "bcftools mpileup %s" % (region_label),
                                                       close_fds=[bcftools_norm_stdin_pipe_write])

        bcftools_norm_p = close_process_if_finished(bcftools_norm_p,
                                                    "bcftools norm %s" % (region_label),
                                                    close_fds=[bcftools_norm_stdin_pipe_read, 
                                                               part_tee_stdin_pipe_write])
        
        part_tee_p = close_process_if_finished(part_tee_p,
                                               "tee %s" % (region_label),
                                               close_fds=[part_tee_stdin_pipe_read,
                                                          bcftools_view_headeronly_stdin_pipe_write],
                                               ignore_error=True)

        bcftools_view_headeronly_p = close_process_if_finished(bcftools_view_headeronly_p,
                                                               "bcftools view -h %s" % (region_label),
                                                               close_fds=[bcftools_view_headeronly_stdin_pipe_read])

        bcftools_view_noheader_p = close_process_if_finished(bcftools_view_noheader_p,
                                                             "bcftools view %s" % (region_label),
                                                             close_files=[current_concat_noheader_fifo_f])

        region_concat_p = close_process_if_finished(region_concat_p,
                                                      "bcftools concat",
                                                      close_files=[out_file_tmp_f])

        # end loop once all processes have finished
        if (
            (region_concat_p is None)
            and (bcftools_view_noheader_p is None)
            and (bcftools_view_headeronly_p is None)
            and (part_tee_p is None)
            and (bcftools_norm_p is None)
            and (bcftools_mpileup_p is None)
            ):
            print "All region work has completed"
            break
        else:
            sleep(0.01)
            # continue to next loop iteration


    if len(child_pids) > 0:
        print "WARNING: some children are still alive: [%s]" % (child_pids)
        for pid in child_pids:
            print "Attempting to terminate %s forcefully" % (pid)
            try:
                os.kill(pid, SIGTERM)
            except Exception as e:
                print "Could not kill pid %s: %s" % (pid, e)

    for fifo in fifos_to_delete:
        try:
            os.remove(fifo)
        except:
            raise

    concat_headeronly_tmp_fofn = os.path.join(tmp_dir, output_basename + ".fifos_fofn")
    tmp_files_to_delete = []
    print "Preparing fofn for bcftools concat (headeronly): %s" % (concat_headeronly_tmp_fofn)
    with open(concat_headeronly_tmp_fofn, 'w') as f:
        print "Checking files for regions: %s" % regions
        for concat_headeronly_tmp in [concat_headeronly_tmps[region] for region in regions]:
            if os.path.exists(concat_headeronly_tmp):
                print "Adding %s to fofn" % concat_headeronly_tmp
                f.write("%s\n" % concat_headeronly_tmp)
                tmp_files_to_delete.append(concat_headeronly_tmp)
            else:
                print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp

    final_headeronly_tmp = os.path.join(tmp_dir, output_basename + ".headeronly.g.vcf")
    final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb')

    print "Creating 'bcftools concat | grep' pipe" 
    grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe()

    grep_headeronly_cmd = ["egrep", "-v", "^[#][#](bcftools|mpileup|reference)"]
    grep_headeronly_p = run_child_cmd(grep_headeronly_cmd,
                                      stdin=grep_headeronly_stdin_pipe_read,
                                      stdout=final_headeronly_tmp_f,
                                      tag="grep (headeronly)")
    bcftools_concat_headeronly_cmd = ["bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn]
    bcftools_concat_headeronly_p = run_child_cmd(bcftools_concat_headeronly_cmd,
                                                 stdout=grep_headeronly_stdin_pipe_write,
                                                 tag="bcftools concat (headeronly)")
    while True:
        watch_fds_and_print_output()
        bcftools_concat_headeronly_p = close_process_if_finished(bcftools_concat_headeronly_p,
                                                                 "bcftools concat (headeronly)",
                                                                 close_fds=[grep_headeronly_stdin_pipe_write])
        grep_headeronly_p = close_process_if_finished(grep_headeronly_p,
                                                      "grep (headeronly)",
                                                      close_fds=[grep_headeronly_stdin_pipe_read],
                                                      close_files=[final_headeronly_tmp_f])
        if ((bcftools_concat_headeronly_p is None)
            and (grep_headeronly_p is None)):
            # none of the processes are still running, we're done! 
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_concat_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate bcftools concat (headeronly)"

    if grep_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate grep (headeronly)"


    # check if there was any data output
    if os.stat(out_file_tmp)[6] == 0:
        # 0-byte data file, there is no point in concatenating and 
        # reheader will reject the file, so we need to bgzip it ourselves
        print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % (final_out_file)
        final_out_file_f = open(final_out_file, 'wb')
        final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp]
        final_bgzip_p = run_child_cmd(final_bgzip_cmd, tag="final bgzip", stdout=final_out_file_f)
        while True:
            watch_fds_and_print_output()
            final_bgzip_p = close_process_if_finished(final_bgzip_p,
                                                      "final bgzip",
                                                      close_files=[final_out_file_f])
            if (final_bgzip_p is None):
                # none of the processes are still running, we're done! 
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bgzip_p is not None:
            print "ERROR: failed to cleanly terminate final bgzip (header with no data)"
    else:
        # there is some data in the data file
        print "Creating final 'cat | bcftools view -Oz' pipe"
        final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe()
        print "Preparing penultimate output file [%s]" % (penultimate_out_file)
        final_bcftools_view_cmd = ["bcftools", "view", "-Oz", "-o", penultimate_out_file]
        final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp]
        final_bcftools_view_p = run_child_cmd(final_bcftools_view_cmd, tag="final bcftools view -Oz", stdin=final_bcftools_view_stdin_pipe_read)
        final_concat_p = run_child_cmd(final_concat_cmd, tag="final cat (header+data)", stdout=final_bcftools_view_stdin_pipe_write)
        while True:
            watch_fds_and_print_output()
            final_bcftools_view_p = close_process_if_finished(final_bcftools_view_p,
                                                              "final bcftools view -Oz",
                                                              close_fds=[final_bcftools_view_stdin_pipe_read])
            final_concat_p = close_process_if_finished(final_concat_p,
                                                       "final cat (header+data)",
                                                       close_fds=[final_bcftools_view_stdin_pipe_write])
            if ((final_concat_p is None)
                and (final_bcftools_view_p is None)):
                # none of the processes are still running, we're done! 
                break
            else:
                sleep(0.01)
                # continue to next loop iteration

        if final_bcftools_view_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools view -Oz"

        if final_concat_p is not None:
            print "ERROR: failed to cleanly terminate final cat (header+data)"

        print "Reheadering penultimate output file into final out file [%s]" % (final_out_file)
        final_bcftools_reheader_cmd = ["bcftools", "reheader", "-h", final_headeronly_tmp, "-o", final_out_file, penultimate_out_file]
        final_bcftools_reheader_p = run_child_cmd(final_bcftools_reheader_cmd, tag="final bcftools reheader")
        while True:
            watch_fds_and_print_output()
            final_bcftools_reheader_p = close_process_if_finished(final_bcftools_reheader_p,
                                                              "final bcftools reheader")
            if (final_bcftools_reheader_p is None):
                # none of the processes are still running, we're done! 
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bcftools_reheader_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools reheader"
        os.remove(penultimate_out_file)

    print "Indexing final output file [%s]" % (final_out_file)
    bcftools_index_cmd = ["bcftools", "index", final_out_file]
    bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index")
    while True:
        watch_fds_and_print_output()
        bcftools_index_p = close_process_if_finished(bcftools_index_p,
                                                     "bcftools index")
        if (bcftools_index_p is None):
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_index_p is not None:
        print "ERROR: failed to cleanly terminate bcftools index"

    print "Complete, removing temporary files"
    os.remove(concat_headeronly_tmp_fofn)
    os.remove(out_file_tmp)
    os.remove(final_headeronly_tmp)
    for tmp_file in tmp_files_to_delete:
        os.remove(tmp_file)

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()
    print "Task output locator [%s]" % output_locator

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)

    # Done!
    print "Task complete!"
def main():
    ################################################################################
    # Phase I: Check inputs and set up sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()["script_parameters"]["reference_collection"]
    )
    job_input_pdh = arvados.current_job()["script_parameters"]["inputs_collection"]
    interval_lists_pdh = arvados.current_job()["script_parameters"]["interval_lists_collection"]
    interval_count = 1
    if "interval_count" in arvados.current_job()["script_parameters"]:
        interval_count = arvados.current_job()["script_parameters"]["interval_count"]

    # Set up sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.chunked_tasks_per_cram_file(
        ref_input_pdh,
        job_input_pdh,
        interval_lists_pdh,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=False,
        oldest_git_commit_to_reuse="6ca726fc265f9e55765bf1fdf71b86285b8a0ff2",
        script="gatk-haplotypecaller-cram.py",
    )

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert this_task["sequence"] != 0

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if "reuse_job_task" in this_task["parameters"]:
        print "This task's work was already done by JobTask %s" % this_task["parameters"]["reuse_job_task"]
        exit(0)

    ################################################################################
    # Phase IIb: Call Haplotypes!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="chunk")
    cram_file = gatk_helper.mount_gatk_cram_input(input_param="input")
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    out_dir = hgi_arvados.prepare_out_dir()
    out_filename = os.path.basename(cram_file_base) + "." + os.path.basename(interval_list_file) + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_filename = out_filename.replace(".bcf", "._cf")

    # HaplotypeCaller!
    gatk_exit = gatk.haplotype_caller(ref_file, cram_file, interval_list_file, os.path.join(out_dir, out_filename))

    if gatk_exit != 0:
        print "ERROR: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"], body={"success": False}).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        print "Task output written to keep, validating it"
        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"], body={"success": False}).execute()
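gatk.haplotype_caller() is defined in a module that is not shown here. A purely illustrative wrapper consistent with how it is called above (the GATK 3 jar location and argument list are assumptions; the real wrapper may add gVCF and interval-padding options):

import subprocess

def haplotype_caller(ref_file, cram_file, interval_list_file, out_path,
                     gatk_jar="GenomeAnalysisTK.jar"):
    # Run GATK 3 HaplotypeCaller restricted to the given interval list and
    # return its exit code, mirroring how gatk_exit is used above.
    cmd = ["java", "-jar", gatk_jar,
           "-T", "HaplotypeCaller",
           "-R", ref_file,
           "-I", cram_file,
           "-L", interval_list_file,
           "-o", out_path]
    return subprocess.call(cmd)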
Example #54
0
def run():
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    runner = None
    try:
        job_order_object = arvados.current_job()['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], job_order_object.pop("cwl:tool"))

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            if "location" in v:
                v["location"] = keeppath(v["location"])

        for k,v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)

        output_name = None
        output_tags = None
        enable_reuse = True
        on_error = "continue"
        debug = False

        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        if "arv:output_tags" in job_order_object:
            output_tags = job_order_object["arv:output_tags"]
            del job_order_object["arv:output_tags"]

        if "arv:enable_reuse" in job_order_object:
            enable_reuse = job_order_object["arv:enable_reuse"]
            del job_order_object["arv:enable_reuse"]

        if "arv:on_error" in job_order_object:
            on_error = job_order_object["arv:on_error"]
            del job_order_object["arv:on_error"]

        if "arv:debug" in job_order_object:
            debug = job_order_object["arv:debug"]
            del job_order_object["arv:debug"]

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.safeapi.ThreadSafeApiCache(
            api_params={"model": OrderedJsonModel()}, keep_params={"num_retries": 4}),
                                          output_name=output_name, output_tags=output_tags)

        make_fs_access = functools.partial(CollectionFsAccess,
                                 collection_cache=runner.collection_cache)

        t = load_tool(toolpath, runner.arv_make_tool,
                      fetcher_constructor=functools.partial(CollectionFetcher,
                                                  api_client=runner.api,
                                                  fs_access=make_fs_access(""),
                                                  num_retries=runner.num_retries))

        if debug:
            logger.setLevel(logging.DEBUG)
            logging.getLogger('arvados').setLevel(logging.DEBUG)
            logging.getLogger("cwltool").setLevel(logging.DEBUG)

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = debug
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job={"uuid": arvados.current_job()["uuid"], "state": arvados.current_job()["state"]}
        args.make_fs_access = make_fs_access
        args.trash_intermediate = False
        args.intermediate_output_ttl = 0
        args.priority = arvados_cwl.DEFAULT_PRIORITY
        args.do_validate = True
        args.disable_js_validation = False

        runner.arv_executor(t, job_order_object, **vars(args))
    except Exception as e:
        if isinstance(e, WorkflowException):
            logging.info("Workflow error %s", e)
        else:
            logging.exception("Unhandled exception")
        if runner and runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                                             body={
                                                 'output': outputCollection,
                                                 'success': False,
                                                 'progress':1.0
                                             }).execute()
    % (t1 - t0))
print "Scaled, truncated, test population shape is %s, number of MB is %f" % (
    test_population.shape, test_population.nbytes / 1000000.)

with out.open('%s_training.npy' % (antigen_type), 'w') as f:
    np.save(f, population)

with out.open('%s_test.npy' % (antigen_type), 'w') as f:
    np.save(f, test_population)

with out.open('%s_training_labels.npy' % (antigen_type), 'w') as f:
    np.save(f, training_labels)

with out.open('%s_where_well_sequenced.npy' % (antigen_type), 'w') as f:
    np.save(f, where_well_sequenced)

with out.open('%s_where_homozygous.npy' % (antigen_type), 'w') as f:
    np.save(f, where_homozygous)

with out.open('python_variables.py', 'w') as f:
    f.write('training_names=%s\n' % (training_names))
    f.write('test_names=%s\n' % (test_names))

time_logging_fh.close()

# Commit the output to keep
task_output = out.save_new(create_collection_record=False)
arvados.current_task().set_output(task_output)

###########################################################################################################################################
Example #56
0
#!/usr/bin/python
#

import arvados
import re

arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)

this_job = arvados.current_job()
this_task = arvados.current_task()
this_task_input = this_task['parameters']['input']

input_file = list( arvados.CollectionReader(this_task_input).all_files() )[0]

out = arvados.CollectionWriter()
out.set_current_file_name(input_file.decompressed_name())
out.set_current_stream_name(input_file.stream_name())
for line in input_file.readlines():
  out.write( "!!!" + line.upper() )

this_task.set_output(out.finish())