Example #1
def spawn_new_task_per_file(script_parameter, regex, if_sequence=0, and_end_task=True):
    """
    Generalized form of one_task_per_pair_input_file from
    https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py

    Creates a new task for each file in the collection whose name matches the regex.
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters'][script_parameter]
    input_collection = Collection(job_input)
    for name in input_collection:
        if not re.search(regex,name):
            continue
        new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1': name,
                        }
                    }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                   body={'success':True}
                                   ).execute()
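
A minimal driver sketch for spawn_new_task_per_file, assuming the usual two-phase crunch-script layout; the parameter name 'input' and the regex are illustrative:

import re                                   # used by spawn_new_task_per_file
import arvados
from arvados.collection import Collection  # used by spawn_new_task_per_file

spawn_new_task_per_file('input', r'\.vcf\.gz$', if_sequence=0, and_end_task=True)

# Child tasks (sequence 1) see the matched file name as parameter 'input_1'.
this_task = arvados.current_task()
if this_task['sequence'] > 0:
    print "processing %s" % this_task['parameters']['input_1']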
Example #2
def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each pair of fastq files in this job's input
    collection.

    Each new task will have two parameters, named "input_1" and
    "input_2", each being a manifest containing a single fastq file.

    A matching pair of files in the input collection is assumed to
    have names "x_1.y" and "x_2.y".

    Files in the input collection that are not part of a matched pair
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    all_files = []
    for s in cr.all_streams():
        all_files += list(s.all_files())
    for s in cr.all_streams():
        for left_file in s.all_files():
            left_name = left_file.name()
            right_file = None
            right_name = re.sub(r'(.*_)1\.', '\g<1>2.', left_name)
            if right_name == left_name:
                continue
            for f2 in s.all_files():
                if right_name == f2.name():
                    right_file = f2
            if right_file != None:
                new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1': left_file.as_manifest(),
                        'input_2': right_file.as_manifest()
                    }
                }
                arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
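
A usage sketch for one_task_per_pair_input_file; only child tasks reach the lines after the call, because task 0 exits inside the helper:

import arvados

one_task_per_pair_input_file(if_sequence=0, and_end_task=True)

this_task = arvados.current_task()
# Each parameter is a single-file manifest for one fastq of the pair.
manifest_1 = this_task['parameters']['input_1']
manifest_2 = this_task['parameters']['input_2']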
Example #3
def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each pair of fastq files in this job's input
    collection.

    Each new task will have two parameters, named "input_1" and
    "input_2", each being a manifest containing a single fastq file.

    A matching pair of files in the input collection is assumed to
    have names "x_1.y" and "x_2.y".

    Files in the input collection that are not part of a matched pair
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    all_files = []
    for s in cr.all_streams():
        all_files += list(s.all_files())
    for s in cr.all_streams():
        for left_file in s.all_files():
            left_name = left_file.name()
            right_file = None
            right_name = re.sub(r'(.*_)1\.', '\g<1>2.', left_name)
            if right_name == left_name:
                continue
            for f2 in s.all_files():
                if right_name == f2.name():
                    right_file = f2
            if right_file != None:
                new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1':left_file.as_manifest(),
                        'input_2':right_file.as_manifest()
                        }
                    }
                arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                   body={'success':True}
                                   ).execute()
        exit(0)
Example #4
def tarball_extract(tarball, path):
    """Retrieve a tarball from Keep and extract it to a local
    directory.  Return the absolute path where the tarball was
    extracted. If the top level of the tarball contained just one
    file or directory, return the absolute path of that single
    item.

    tarball -- collection locator
    path -- where to extract the tarball: absolute, or relative to job tmp
    """
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)
    already_have_it = False
    try:
        if os.readlink(os.path.join(path, '.locator')) == tarball:
            already_have_it = True
    except OSError:
        pass
    if not already_have_it:

        # emulate "rm -f" (i.e., if the file does not exist, we win)
        try:
            os.unlink(os.path.join(path, '.locator'))
        except OSError:
            if os.path.exists(os.path.join(path, '.locator')):
                os.unlink(os.path.join(path, '.locator'))

        for f in CollectionReader(tarball).all_files():
            if re.search('\.(tbz|tar.bz2)$', f.name()):
                p = tar_extractor(path, 'j')
            elif re.search('\.(tgz|tar.gz)$', f.name()):
                p = tar_extractor(path, 'z')
            elif re.search('\.tar$', f.name()):
                p = tar_extractor(path, '')
            else:
                raise arvados.errors.AssertionError(
                    "tarball_extract cannot handle filename %s" % f.name())
            while True:
                buf = f.read(2**20)
                if len(buf) == 0:
                    break
                p.stdin.write(buf)
            p.stdin.close()
            p.wait()
            if p.returncode != 0:
                lockfile.close()
                raise arvados.errors.CommandFailedError(
                    "tar exited %d" % p.returncode)
        os.symlink(tarball, os.path.join(path, '.locator'))
    tld_extracts = [f for f in os.listdir(path) if f != '.locator']
    lockfile.close()
    if len(tld_extracts) == 1:
        return os.path.join(path, tld_extracts[0])
    return path
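
A short usage sketch for tarball_extract; 'tool_tgz' is a hypothetical script parameter naming the tarball collection:

import arvados

# Repeated calls with the same locator are cheap: the '.locator' symlink
# written above lets the function skip re-extraction.
tool_dir = tarball_extract(
    tarball=arvados.current_job()['script_parameters']['tool_tgz'],
    path='tool')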
def create_chunk_tasks(f_name, chunk_input_pdh_names, 
                       if_sequence, task_input_pdh, ref_input_pdh, chunk_input_pdh, 
                       pool=None):
    async_results = []
    for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
        # Create task for each CRAM / chunk
        job_uuid = arvados.current_job()['uuid']
        current_task_uuid = arvados.current_task()['uuid']
        new_task_attrs = {
            'job_uuid': job_uuid,
            'created_by_job_task_uuid': current_task_uuid,
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input_pdh,
                'ref': ref_input_pdh,
                'chunk': chunk_input_pdh
                }
            }
        async_result = pool.apply_async(arv_create_task, (
                new_task_attrs,
                "Created new task to process %s with chunk interval %s (job_uuid %s)" % (f_name, chunk_input_name, job_uuid)))
        async_results.append(async_result)

    for async_result in async_results:
        async_result.wait()
        try:
            (res, report) = async_result.get()
            if (not res) or (not 'qsequence' in res):
                raise InternalError("Could not create job task: %s" % res)
            else:
                print report + " qsequence %s" % res['qsequence']
        except Exception as e:
            raise InternalError("Exception creating job task: %s" % e)
Example #6
def git_checkout(url, version, path):
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    if not os.path.exists(path):
        run_command(["git", "clone", url, path], cwd=os.path.dirname(path))
    run_command(["git", "checkout", version], cwd=path)
    return path
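
A call sketch for git_checkout; the URL, tag, and build step are illustrative, and run_command is the same helper the function itself uses:

src_dir = git_checkout('https://github.com/example/some-tool.git',
                       'v1.2.3',
                       'some-tool-src')
run_command(['make', '-j4'], cwd=src_dir)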
Example #7
def tarball_extract(tarball, path):
    """Retrieve a tarball from Keep and extract it to a local
    directory.  Return the absolute path where the tarball was
    extracted. If the top level of the tarball contained just one
    file or directory, return the absolute path of that single
    item.

    tarball -- collection locator
    path -- where to extract the tarball: absolute, or relative to job tmp
    """
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)
    already_have_it = False
    try:
        if os.readlink(os.path.join(path, '.locator')) == tarball:
            already_have_it = True
    except OSError:
        pass
    if not already_have_it:

        # emulate "rm -f" (i.e., if the file does not exist, we win)
        try:
            os.unlink(os.path.join(path, '.locator'))
        except OSError:
            if os.path.exists(os.path.join(path, '.locator')):
                os.unlink(os.path.join(path, '.locator'))

        for f in CollectionReader(tarball).all_files():
            if re.search('\.(tbz|tar.bz2)$', f.name()):
                p = tar_extractor(path, 'j')
            elif re.search('\.(tgz|tar.gz)$', f.name()):
                p = tar_extractor(path, 'z')
            elif re.search('\.tar$', f.name()):
                p = tar_extractor(path, '')
            else:
                raise arvados.errors.AssertionError(
                    "tarball_extract cannot handle filename %s" % f.name())
            while True:
                buf = f.read(2**20)
                if len(buf) == 0:
                    break
                p.stdin.write(buf)
            p.stdin.close()
            p.wait()
            if p.returncode != 0:
                lockfile.close()
                raise arvados.errors.CommandFailedError("tar exited %d" %
                                                        p.returncode)
        os.symlink(tarball, os.path.join(path, '.locator'))
    tld_extracts = [f for f in os.listdir(path) if f != '.locator']
    lockfile.close()
    if len(tld_extracts) == 1:
        return os.path.join(path, tld_extracts[0])
    return path
def create_or_reuse_task(sequence, parameters, reusable_tasks, task_key_params, validate_task_output):
    new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': sequence,
            'parameters': parameters
            }
    # See if there is a task in reusable_tasks that can be reused
    ct_index = tuple([parameters[index_param] for index_param in task_key_params])
    if len(reusable_tasks) == 0:
        print "No reusable tasks were available"
    elif ct_index in reusable_tasks:
        # have a task from which to reuse the output, prepare to create a new, but already finished, task with that output
        reuse_task = reusable_tasks[ct_index]
        if validate_task_output(reuse_task['output']):
            print "Found existing JobTask %s from Job %s. Will use output %s from that JobTask instead of re-running it." % (reuse_task['uuid'], reuse_task['job_uuid'], reuse_task['output'])
            # remove task from reusable_tasks as it won't be used more than once
            del reusable_tasks[ct_index]
            # copy relevant attrs from reuse_task so that the new tasks start already finished
            for attr in ['success', 'output', 'progress', 'started_at', 'finished_at', 'parameters']:
                new_task_attrs[attr] = reuse_task[attr]
            # crunch seems to ignore the fact that the job says it is done and queue it anyway
            # signal ourselves to just immediately exit successfully when we are run
            new_task_attrs['parameters']['reuse_job_task'] = reuse_task['uuid']
        else:
            print "Output %s for potential task reuse did not validate" % (reuse_task['output'])
    else:
        print "No reusable JobTask matched key parameters %s" % (list(ct_index))

    # Create the "new" task (may be new work or may be already finished work)
    new_task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if not new_task:
        raise errors.APIError("Attempt to create new job_task failed: [%s]" % new_task_attrs)
    return new_task
Example #9
def one_task_per_bam_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each bam file in this job's input collection.

    Each new task will have an "input" parameter: a manifest
    containing one .bam file and (if available) the corresponding .bai
    index file.

    Files in the input collection that are not named *.bam or *.bai
    (as well as *.bai files that do not match any .bam file present)
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    bam = {}
    bai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.bam$', f.name()):
                bam[s.name(), f.name()] = f
            elif re.search(r'\.bai$', f.name()):
                bai[s.name(), f.name()] = f
    for ((s_name, f_name), bam_f) in bam.items():
        bai_f = bai.get((s_name, re.sub(r'bam$', 'bai', f_name)), None)
        task_input = bam_f.as_manifest()
        if bai_f:
            task_input += bai_f.as_manifest()
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input
            }
        }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
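
A worker-side sketch for one_task_per_bam_file; only child tasks (sequence 1) reach the lines after the call, because task 0 exits inside the helper:

import arvados

one_task_per_bam_file(if_sequence=0, and_end_task=True)

this_task = arvados.current_task()
# 'input' is a manifest with one .bam plus its .bai index when one was found.
task_input_manifest = this_task['parameters']['input']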
Example #10
def install_path():
    global gatk2_install_path
    if gatk2_install_path:
        return gatk2_install_path
    gatk2_install_path = arvados.util.tarball_extract(
        tarball=arvados.current_job()['script_parameters']['gatk_tbz'],
        path='gatk2')
    return gatk2_install_path
Example #11
def one_task_per_bam_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each bam file in this job's input collection.

    Each new task will have an "input" parameter: a manifest
    containing one .bam file and (if available) the corresponding .bai
    index file.

    Files in the input collection that are not named *.bam or *.bai
    (as well as *.bai files that do not match any .bam file present)
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    bam = {}
    bai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.bam$', f.name()):
                bam[s.name(), f.name()] = f
            elif re.search(r'\.bai$', f.name()):
                bai[s.name(), f.name()] = f
    for ((s_name, f_name), bam_f) in bam.items():
        bai_f = bai.get((s_name, re.sub(r'bam$', 'bai', f_name)), None)
        task_input = bam_f.as_manifest()
        if bai_f:
            task_input += bai_f.as_manifest()
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input
                }
            }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success':True}
                                         ).execute()
        exit(0)
Example #12
def install_path():
    global gatk2_install_path
    if gatk2_install_path:
        return gatk2_install_path
    gatk2_install_path = arvados.util.tarball_extract(
        tarball = arvados.current_job()['script_parameters']['gatk_tbz'],
        path = 'gatk2')
    return gatk2_install_path
Example #13
def create_task(sequence, params):
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': params
    }
    task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    return task
def create_task(sequence, params):
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': params
    }
    task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    return task
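
A fan-out sketch built on create_task; the parameter values are illustrative:

import arvados

this_sequence = arvados.current_task()['sequence']
for chrom in ['20', '21', '22']:
    create_task(this_sequence + 1, {'chrom': chrom})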
Example #15
def git_checkout(url, version, path):
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    if not os.path.exists(path):
        run_command(["git", "clone", url, path],
                    cwd=os.path.dirname(path))
    run_command(["git", "checkout", version],
                cwd=path)
    return path
def main():
    # Get object representing the current task
    this_task = arvados.current_task()

    sort_by_r = re.compile(sort_by_regex)

    ################################################################################
    # Concatentate VCFs in numerically sorted order of sort_by_regex
    ################################################################################
    vcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    output_prefix = arvados.current_job()['script_parameters']['output_prefix']
    out_file = output_prefix + ".vcf.gz"

    # Concatenate VCFs
    bcftools_concat_exit = bcftools.concat(sorted(vcf_files, key=lambda fn: int(re.search(sort_by_r, fn).group('sort_by'))),
                                    os.path.join(out_dir, out_file))

    if bcftools_concat_exit != 0:
        print "WARNING: bcftools concat exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_concat_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success':False}
                                         ).execute()
    else:
        print "bcftools concat exited successfully, indexing"

        bcftools_index_exit = bcftools.index(os.path.join(out_dir, out_file))

        if bcftools_index_exit != 0:
            print "WARNING: bcftools index exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_index_exit
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success':False}
                                         ).execute()
        else:
            print "bcftools index exited successfully, writing output to keep"


            # Write a new collection as output
            out = arvados.CollectionWriter()

            # Write out_dir to keep
            out.write_directory_tree(out_dir)

            # Commit the output to Keep.
            output_locator = out.finish()

            if validate_task_output(output_locator):
                print "Task output validated, setting output to %s" % (output_locator)

                # Use the resulting locator as the output for this task.
                this_task.set_output(output_locator)
            else:
                print "ERROR: Failed to validate task output (%s)" % (output_locator)
                arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                                 body={'success':False}
                                             ).execute()
Example #17
def setup():
    global rtg_install_path
    if rtg_install_path:
        return rtg_install_path
    rtg_path = arvados.util.zipball_extract(
        zipball = arvados.current_job()['script_parameters']['rtg_binary_zip'],
        path = 'rtg')
    rtg_license_path = arvados.util.collection_extract(
        collection = arvados.current_job()['script_parameters']['rtg_license'],
        path = 'license',
        decompress = False)

    # symlink to rtg-license.txt
    license_txt_path = os.path.join(rtg_license_path, 'rtg-license.txt')
    try:
        os.symlink(license_txt_path, os.path.join(rtg_path,'rtg-license.txt'))
    except OSError:
        if not os.path.exists(os.path.join(rtg_path,'rtg-license.txt')):
            os.symlink(license_txt_path, os.path.join(rtg_path,'rtg-license.txt'))

    rtg_install_path = rtg_path
    return rtg_path
Example #18
def spawn_new_task_per_bed_line(script_parameter, regex, if_sequence=0, and_end_task=True):
    """
    Generalized form of one_task_per_pair_input_file from 
    https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py

    Creates one new task per BED line, for every file in the collection whose name matches the regex.
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters'][script_parameter]
    input_collection = Collection(job_input)
    for name in input_collection:
        if not re.search(regex, name):
            continue
        name_path = os.path.join(arvados.get_job_param_mount(script_parameter), name)
        bed_lines = (line.split() for line in open(name_path, 'r'))
        # Start the biggest regions first
        def cmp_desc_region_size(a, b):
            return ((int(b[2]) - int(b[1])) -
                    (int(a[2]) - int(a[1])))
        for bed_line in sorted(bed_lines, cmp=cmp_desc_region_size):
            print bed_line
            new_task_attrs = {
                'job_uuid': arvados.current_job()['uuid'],
                'created_by_job_task_uuid': arvados.current_task()['uuid'],
                'sequence': if_sequence + 1,
                'parameters': {
                    'chrom': bed_line[0],
                    'start': bed_line[1],
                    'end': bed_line[2]
                }
            }
            arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                   body={'success':True}
                                   ).execute()
        exit()
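
Worker-side sketch for spawn_new_task_per_bed_line; 'regions' is a hypothetical script parameter naming a collection of BED files. Only child tasks reach the lines after the call, because task 0 exits inside the helper:

import arvados

spawn_new_task_per_bed_line('regions', r'\.bed$', if_sequence=0, and_end_task=True)

params = arvados.current_task()['parameters']
region = "%s:%s-%s" % (params['chrom'], params['start'], params['end'])
print "processing region %s" % region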
Example #19
def setup():
    global rtg_install_path
    if rtg_install_path:
        return rtg_install_path
    rtg_path = arvados.util.zipball_extract(
        zipball=arvados.current_job()['script_parameters']['rtg_binary_zip'],
        path='rtg')
    rtg_license_path = arvados.util.collection_extract(
        collection=arvados.current_job()['script_parameters']['rtg_license'],
        path='license',
        decompress=False)

    # symlink to rtg-license.txt
    license_txt_path = os.path.join(rtg_license_path, 'rtg-license.txt')
    try:
        os.symlink(license_txt_path, os.path.join(rtg_path, 'rtg-license.txt'))
    except OSError:
        if not os.path.exists(os.path.join(rtg_path, 'rtg-license.txt')):
            os.symlink(license_txt_path,
                       os.path.join(rtg_path, 'rtg-license.txt'))

    rtg_install_path = rtg_path
    return rtg_path
def main():
    current_job = arvados.current_job()

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the interval_list collection to only those files relevant to gatk
    il_input_pdh = prepare_gatk_interval_list_collection(interval_list_coll=current_job['script_parameters']['interval_list_collection'])

    # Create an interval_list file for each chunk based on the .interval_list in the interval_list collection
    output_locator = create_interval_lists(genome_chunks, il_input_pdh)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
Example #21
def install_path():
    global picard_install_path
    if picard_install_path:
        return picard_install_path
    zipball = arvados.current_job()['script_parameters']['picard_zip']
    extracted = arvados.util.zipball_extract(zipball=zipball, path='picard')
    for f in os.listdir(extracted):
        if (re.search(r'^picard-tools-[\d\.]+$', f)
                and os.path.exists(os.path.join(extracted, f, '.'))):
            picard_install_path = os.path.join(extracted, f)
            break
    if not picard_install_path:
        raise Exception("picard-tools-{version} directory not found in %s" %
                        zipball)
    return picard_install_path
Example #22
def install_path():
    global picard_install_path
    if picard_install_path:
        return picard_install_path
    zipball = arvados.current_job()['script_parameters']['picard_zip']
    extracted = arvados.util.zipball_extract(
        zipball = zipball,
        path = 'picard')
    for f in os.listdir(extracted):
        if (re.search(r'^picard-tools-[\d\.]+$', f) and
            os.path.exists(os.path.join(extracted, f, '.'))):
            picard_install_path = os.path.join(extracted, f)
            break
    if not picard_install_path:
        raise Exception("picard-tools-{version} directory not found in %s" %
                        zipball)
    return picard_install_path
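
A sketch of running a Picard tool from the extracted directory, assuming the old picard-tools layout with one jar per tool; the file names are illustrative:

import os
import arvados

picard_dir = install_path()
arvados.util.run_command(
    ['java', '-Xmx4g', '-jar', os.path.join(picard_dir, 'MarkDuplicates.jar'),
     'INPUT=input.bam', 'OUTPUT=dedup.bam', 'METRICS_FILE=dedup.metrics'])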
Example #23
def main():
    current_job = arvados.current_job()

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the interval_list collection to only those files relevant to gatk
    il_input_pdh = prepare_gatk_interval_list_collection(
        interval_list_coll=current_job['script_parameters']
        ['interval_list_collection'])

    # Create an interval_list file for each chunk based on the .interval_list in the interval_list collection
    output_locator = create_interval_lists(genome_chunks, il_input_pdh)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
Example #24
def create_or_reuse_task(sequence, parameters, reusable_tasks, task_key_params,
                         validate_task_output):
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': parameters
    }
    # See if there is a task in reusable_tasks that can be reused
    ct_index = tuple(
        [parameters[index_param] for index_param in task_key_params])
    if len(reusable_tasks) == 0:
        print "No reusable tasks were available"
    elif ct_index in reusable_tasks:
        # have a task from which to reuse the output, prepare to create a new, but already finished, task with that output
        reuse_task = reusable_tasks[ct_index]
        if validate_task_output(reuse_task['output']):
            print "Found existing JobTask %s from Job %s. Will use output %s from that JobTask instead of re-running it." % (
                reuse_task['uuid'], reuse_task['job_uuid'],
                reuse_task['output'])
            # remove task from reusable_tasks as it won't be used more than once
            del reusable_tasks[ct_index]
            # copy relevant attrs from reuse_task so that the new tasks start already finished
            for attr in [
                    'success', 'output', 'progress', 'started_at',
                    'finished_at', 'parameters'
            ]:
                new_task_attrs[attr] = reuse_task[attr]
            # crunch seems to ignore the fact that the job says it is done and queue it anyway
            # signal ourselves to just immediately exit successfully when we are run
            new_task_attrs['parameters']['reuse_job_task'] = reuse_task['uuid']
        else:
            print "Output %s for potential task reuse did not validate" % (
                reuse_task['output'])
    else:
        print "No reusable JobTask matched key parameters %s" % (
            list(ct_index))

    # Create the "new" task (may be new work or may be already finished work)
    new_task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if not new_task:
        raise errors.APIError("Attempt to create new job_task failed: [%s]" %
                              new_task_attrs)
    return new_task
def main():
    current_job = arvados.current_job()
    skip_sq_sn_regex = '_decoy$'
    if 'skip_sq_sn_regex' in current_job['script_parameters']:
        skip_sq_sn_regex = current_job['script_parameters']['skip_sq_sn_regex']
    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the reference collection to only those files relevant to gatk
    ref_input_pdh = prepare_gatk_reference_collection(reference_coll=current_job['script_parameters']['reference_collection'])

    # Create an interval_list file for each chunk based on the .dict in the reference collection
    output_locator = create_interval_lists(genome_chunks, ref_input_pdh, skip_sq_sn_r)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
def one_task_per_classifier(num_classifiers_to_parameterize, if_sequence=0, and_end_task=True):
    if if_sequence != arvados.current_task()['sequence']:
        return
    api_client = arvados.api('v1')
    for i in range(num_classifiers_to_parameterize):
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'classifier_index':i,
                'time_to_wait':i*560
            }
        }
        api_client.job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        api_client.job_tasks().update(uuid=arvados.current_task()['uuid'],
                                      body={'success':True}
                                      ).execute()
        exit(0)
Example #27
def samtools_install_path():
    """
    Extract the samtools source tree, build the samtools binary, and
    return the path to the source tree.
    """
    global samtools_path
    if samtools_path:
        return samtools_path
    samtools_path = arvados.util.tarball_extract(
        tarball = arvados.current_job()['script_parameters']['samtools_tgz'],
        path = 'samtools')

    # build "samtools" binary
    lockfile = open(os.path.split(samtools_path)[0] + '.samtools-make.lock',
                    'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    arvados.util.run_command(['make', '-j16'], cwd=samtools_path)
    lockfile.close()

    return samtools_path
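
A usage sketch, assuming the samtools binary is built at the top of the extracted source tree; the BAM filename is illustrative:

import os
import arvados

samtools_binary = os.path.join(samtools_install_path(), 'samtools')
arvados.util.run_command([samtools_binary, 'index', 'input.bam'])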
Example #28
def install_path():
    """
    Extract the bwa source tree, build the bwa binary, and return the
    path to the source tree.
    """
    global bwa_install_path
    if bwa_install_path:
        return bwa_install_path

    bwa_install_path = arvados.util.tarball_extract(
        tarball=arvados.current_job()['script_parameters']['bwa_tbz'],
        path='bwa')

    # build "bwa" binary
    lockfile = open(os.path.split(bwa_install_path)[0] + '.bwa-make.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    arvados.util.run_command(['make', '-j16'], cwd=bwa_install_path)
    lockfile.close()

    return bwa_install_path
Example #29
def samtools_install_path():
    """
    Extract the samtools source tree, build the samtools binary, and
    return the path to the source tree.
    """
    global samtools_path
    if samtools_path:
        return samtools_path
    samtools_path = arvados.util.tarball_extract(
        tarball=arvados.current_job()['script_parameters']['samtools_tgz'],
        path='samtools')

    # build "samtools" binary
    lockfile = open(
        os.path.split(samtools_path)[0] + '.samtools-make.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    arvados.util.run_command(['make', '-j16'], cwd=samtools_path)
    lockfile.close()

    return samtools_path
Example #30
def get_file_path(parameter, regex):
    """
    Return the mounted path of the file (the last one whose name matches
    regex) in the collection given by script parameter `parameter`.

    Basically to avoid boilerplate like:
        ref_collection_id = this_job['script_parameters']['reference_index']
        ref_collection = coll(ref_collection_id)
        for file in ref_collection:
            if not re.search('.*f(ast)?a(.gz)?$', file):
                continue
            ref_file = file
        ref_path = os.path.join(arvados.get_job_param_mount("reference_index"), ref_file)
    """
    collection_id = arvados.current_job()['script_parameters'][parameter]
    collection_handle = Collection(collection_id)
    for file in collection_handle:
        if not re.search(regex, file):
            continue
        out_file = file
    out_path = os.path.join(arvados.get_job_param_mount(parameter), out_file)
    return out_path
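
Example call, mirroring the reference-index case from the docstring:

ref_path = get_file_path('reference_index', '.*f(ast)?a(.gz)?$')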
Example #31
def install_path():
    """
    Extract the bwa source tree, build the bwa binary, and return the
    path to the source tree.
    """
    global bwa_install_path
    if bwa_install_path:
        return bwa_install_path

    bwa_install_path = arvados.util.tarball_extract(
        tarball = arvados.current_job()['script_parameters']['bwa_tbz'],
        path = 'bwa')

    # build "bwa" binary
    lockfile = open(os.path.split(bwa_install_path)[0] + '.bwa-make.lock',
                    'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    arvados.util.run_command(['make', '-j16'], cwd=bwa_install_path)
    lockfile.close()

    return bwa_install_path
Example #32
def stream_extract(stream, path, files=[], decompress=True):
    """Retrieve a stream from Keep and extract it to a local
    directory.  Return the absolute path where the stream was
    extracted.

    stream -- StreamReader object
    path -- where to extract: absolute, or relative to job tmp
    """
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)

    files_got = []
    for f in stream.all_files():
        if (files == []
                or ((f.name() not in files_got) and
                    (f.name() in files or
                     (decompress and f.decompressed_name() in files)))):
            outname = f.decompressed_name() if decompress else f.name()
            files_got += [outname]
            if os.path.exists(os.path.join(path, outname)):
                os.unlink(os.path.join(path, outname))
            mkdir_dash_p(os.path.dirname(os.path.join(path, outname)))
            outfile = open(os.path.join(path, outname), 'wb')
            for buf in (f.readall_decompressed()
                        if decompress else f.readall()):
                outfile.write(buf)
            outfile.close()
    if len(files_got) < len(files):
        raise arvados.errors.AssertionError(
            "Wanted files %s but only got %s from %s" %
            (files, files_got, [z.name() for z in stream.all_files()]))
    lockfile.close()
    return path
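
A sketch of extracting each stream of a job input collection with stream_extract; the parameter name 'input' is illustrative:

import arvados

cr = arvados.CollectionReader(arvados.current_job()['script_parameters']['input'])
for s in cr.all_streams():
    # files=[] extracts every file in the stream, decompressing *.gz on the way.
    extracted_dir = stream_extract(s, 'inputs', files=[], decompress=True)
    print "extracted stream %s to %s" % (s.name(), extracted_dir)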
Example #33
def stream_extract(stream, path, files=[], decompress=True):
    """Retrieve a stream from Keep and extract it to a local
    directory.  Return the absolute path where the stream was
    extracted.

    stream -- StreamReader object
    path -- where to extract: absolute, or relative to job tmp
    """
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)

    files_got = []
    for f in stream.all_files():
        if (files == [] or
            ((f.name() not in files_got) and
             (f.name() in files or
              (decompress and f.decompressed_name() in files)))):
            outname = f.decompressed_name() if decompress else f.name()
            files_got += [outname]
            if os.path.exists(os.path.join(path, outname)):
                os.unlink(os.path.join(path, outname))
            mkdir_dash_p(os.path.dirname(os.path.join(path, outname)))
            outfile = open(os.path.join(path, outname), 'wb')
            for buf in (f.readall_decompressed() if decompress
                        else f.readall()):
                outfile.write(buf)
            outfile.close()
    if len(files_got) < len(files):
        raise arvados.errors.AssertionError(
            "Wanted files %s but only got %s from %s" %
            (files, files_got, [z.name() for z in stream.all_files()]))
    lockfile.close()
    return path
def main():
    current_job = arvados.current_job()
    skip_sq_sn_regex = '_decoy$'
    if 'skip_sq_sn_regex' in current_job['script_parameters']:
        skip_sq_sn_regex = current_job['script_parameters']['skip_sq_sn_regex']
    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    genome_chunks = int(current_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Limit the scope of the reference collection to only those files relevant to gatk
    ref_input_pdh = prepare_gatk_reference_collection(
        reference_coll=current_job['script_parameters']
        ['reference_collection'])

    # Create an interval_list file for each chunk based on the .dict in the reference collection
    output_locator = create_interval_lists(genome_chunks, ref_input_pdh,
                                           skip_sq_sn_r)

    # Use the resulting locator as the output for this task.
    arvados.current_task().set_output(output_locator)
def create_chunk_tasks(f_name,
                       chunk_input_pdh_names,
                       if_sequence,
                       task_input_pdh,
                       ref_input_pdh,
                       chunk_input_pdh,
                       pool=None):
    async_results = []
    for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
        # Create task for each CRAM / chunk
        job_uuid = arvados.current_job()['uuid']
        current_task_uuid = arvados.current_task()['uuid']
        new_task_attrs = {
            'job_uuid': job_uuid,
            'created_by_job_task_uuid': current_task_uuid,
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input_pdh,
                'ref': ref_input_pdh,
                'chunk': chunk_input_pdh
            }
        }
        async_result = pool.apply_async(arv_create_task, (
            new_task_attrs,
            "Created new task to process %s with chunk interval %s (job_uuid %s)"
            % (f_name, chunk_input_name, job_uuid)))
        async_results.append(async_result)

    for async_result in async_results:
        async_result.wait()
        try:
            (res, report) = async_result.get()
            if (not res) or (not 'qsequence' in res):
                raise InternalError("Could not create job task: %s" % res)
            else:
                print report + " qsequence %s" % res['qsequence']
        except Exception as e:
            raise InternalError("Exception creating job task: %s" % e)
Example #36
    of.write("pwd: " + pwdinfo)

    lsinfo = sp.check_output(["ls", "-lahR"])
    of.write("directory structure:\n" + lsinfo)

    dfinfo = sp.check_output(["df", "-h"])
    of.write("df:\n" + dfinfo)

    meminfo = sp.check_output(["free", "-hm"])
    of.write("mem:\n" + meminfo)

    hostinfo = sp.check_output(["hostname"])
    of.write("host: " + hostinfo)


job = arv.current_job()
task = arv.current_task()

of = arv.CollectionWriter()
of.set_current_file_name("info.log")

whoinfo = sp.check_output(["whoami"])
of.write("user: "******"\n")

pwdinfo = sp.check_output(["pwd"])
of.write("pwd: " + pwdinfo + "\n")

lsinfo = sp.check_output(["ls", "-lahR"])
of.write("directory structure:\n" + lsinfo)

dfinfo = sp.check_output(["df", "-h"])
Example #37
def chunked_tasks_per_cram_file(
        ref_input,
        job_input,
        interval_lists,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=True,
        reuse_tasks_retrieve_all=True,
        interval_list_param="interval_list",
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
        script=arvados.current_job()['script']):
    """
    Queue one task for each cram file in this job's input collection.
    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.
    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.
    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # prepare interval lists
    cr = arvados.CollectionReader(interval_lists)
    chunk_interval_list = {}
    chunk_input_pdh_names = []
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.interval_list$', f.name()):
                chunk_interval_list[s.name(), f.name()] = f
    for ((s_name, f_name),
         chunk_interval_list_f) in sorted(chunk_interval_list.items()):
        chunk_input = chunk_interval_list_f.as_manifest()
        try:
            r = arvados.api().collections().create(body={
                "manifest_text": chunk_input
            }).execute()
            chunk_input_pdh = r["portable_data_hash"]
            chunk_input_name = os.path.join(s_name, f_name)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        except:
            raise

    if len(chunk_input_pdh_names) == 0:
        raise errors.InvalidArgumentError(
            "No interval_list files found in %s" % (interval_lists))

    # prepare CRAM input collections
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        crai_f = crai.get(
            (s_name, re.sub(r'cram$', 'crai', f_name)),
            crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise errors.InvalidArgumentError(
                "No correponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={
                "manifest_text": task_input
            }).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        if reuse_tasks:
            task_key_params = ['input', 'ref', 'chunk']
            # get candidates for task reuse
            job_filters = [
                ['script', '=', script],
                ['repository', '=',
                 arvados.current_job()['repository']],
                ['script_version', 'in git', oldest_git_commit_to_reuse],
                [
                    'docker_image_locator', 'in docker',
                    arvados.current_job()['docker_image_locator']
                ],
            ]
            if reuse_tasks_retrieve_all:
                # retrieve a full set of all possible reusable tasks
                reusable_tasks = get_reusable_tasks(if_sequence + 1,
                                                    task_key_params,
                                                    job_filters)
                print "Have %s tasks for potential reuse" % (
                    len(reusable_tasks))
            else:
                reusable_task_jobs = get_jobs_for_task_reuse(job_filters)
                print "Have %s jobs for potential task reuse" % (
                    len(reusable_task_jobs))
                reusable_task_job_uuids = [
                    job['uuid'] for job in reusable_task_jobs['items']
                ]

        for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
            # Create task for each CRAM / chunk
            new_task_params = {
                'input': task_input_pdh,
                'ref': ref_input,
                'chunk': chunk_input_pdh
            }
            print "Creating new task to process %s with chunk interval %s " % (
                f_name, chunk_input_name)
            if reuse_tasks:
                if reuse_tasks_retrieve_all:
                    task = create_or_reuse_task(if_sequence + 1,
                                                new_task_params,
                                                reusable_tasks,
                                                task_key_params,
                                                validate_task_output)
                else:
                    task = create_or_reuse_task_from_jobs(
                        if_sequence + 1, new_task_params,
                        reusable_task_job_uuids, task_key_params,
                        validate_task_output)
            else:
                task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
def one_task_per_interval(interval_count, validate_task_output,
                          if_sequence=0, and_end_task=True,
                          reuse_tasks=True,
                          interval_list_param="interval_list",
                          oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
                          task_key_params=['name', 'inputs', 'interval', 'ref'],
                          script=arvados.current_job()['script']):
    """
    Queue one task for each of interval_count intervals, splitting
    the genome chunk (described by the .interval_list file) evenly.

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param=interval_list_param)

    interval_reader = open(interval_list_file, mode="r")

    lines = interval_reader.readlines()
    sn_intervals = dict()
    sns = []
    total_len = 0
    for line in lines:
        if line[0] == '@':
            # skip all lines starting with '@'
            continue
        fields = line.split("\t")
        if len(fields) != 5:
            raise errors.InvalidArgumentError("interval_list %s has invalid line [%s] - expected 5 fields but got %s" % (interval_list_file, line, len(fields)))
        sn = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        length = int(end) - int(start) + 1
        total_len += int(length)
        sn_intervals[sn] = (start, end)
        sns.append(sn)

    print "Total chunk length is %s" % total_len
    interval_len = int(total_len / interval_count)
    intervals = []
    print "Splitting chunk into %s intervals of size ~%s" % (interval_count, interval_len)
    for interval_i in range(0, interval_count):
        interval_num = interval_i + 1
        intervals_count = 0
        remaining_len = interval_len
        interval = []
        while len(sns) > 0:
            sn = sns.pop(0)
            if not sn_intervals.has_key(sn):
                raise errors.ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end-start+1) > remaining_len:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_len + start - 1
                assert((end-start+1) <= remaining_len)
                sn_intervals[sn] = (end+1, real_end)
                sns.insert(0, sn)
            interval.append("%s:%s-%s" % (sn, start, end))
            remaining_len -= (end-start+1)
            intervals_count += 1
            if remaining_len <= 0:
                break
        if intervals_count > 0:
            intervals.append(interval)
        else:
            print "WARNING: skipping empty intervals for %s" % interval_input_name
    print "Have %s intervals" % (len(intervals))

    if reuse_tasks:
        # get candidates for task reuse
        job_filters = [
            ['script', '=', script],
            ['repository', '=', arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
        ]
        reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params, job_filters)
        print "Have %s potentially reusable tasks" % (len(reusable_tasks))

    for interval in intervals:
        interval_str = ' '.join(interval)
        print "Creating task to process interval: [%s]" % interval_str
        new_task_params = arvados.current_task()['parameters']
        new_task_params['interval'] = interval_str
        if reuse_tasks:
            task = create_or_reuse_task(if_sequence + 1, new_task_params, reusable_tasks, task_key_params, validate_task_output)
        else:
            task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success':True}
                                         ).execute()
        exit(0)
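
A driver sketch for one_task_per_interval with task reuse disabled; the interval count and the validator stub are illustrative:

def validate_task_output(output_locator):
    # Stand-in check; a real validator would inspect the output collection.
    return bool(output_locator)

one_task_per_interval(10, validate_task_output, if_sequence=0, and_end_task=True,
                      reuse_tasks=False)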
def chunked_tasks_per_cram_file(ref_input, job_input, interval_lists, validate_task_output,
                                if_sequence=0, and_end_task=True,
                                reuse_tasks=True, reuse_tasks_retrieve_all=True,
                                interval_list_param="interval_list",
                                oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
                                script=arvados.current_job()['script']):
    """
    Queue one task for each cram file in this job's input collection.
    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.
    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.
    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # prepare interval lists
    cr = arvados.CollectionReader(interval_lists)
    chunk_interval_list = {}
    chunk_input_pdh_names = []
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.interval_list$', f.name()):
                chunk_interval_list[s.name(), f.name()] = f
    for ((s_name, f_name), chunk_interval_list_f) in sorted(chunk_interval_list.items()):
        chunk_input = chunk_interval_list_f.as_manifest()
        try:
            r = arvados.api().collections().create(body={"manifest_text": chunk_input}).execute()
            chunk_input_pdh = r["portable_data_hash"]
            chunk_input_name = os.path.join(s_name, f_name)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        except:
            raise

    if len(chunk_input_pdh_names) == 0:
        raise errors.InvalidArgumentError("No interval_list files found in %s" % (interval_lists))

    # prepare CRAM input collections
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        crai_f = crai.get((s_name, re.sub(r'cram$', 'crai', f_name)),
                          crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)),
                                   None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise errors.InvalidArgumentError("No correponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={"manifest_text": task_input}).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        if reuse_tasks:
            task_key_params=['input', 'ref', 'chunk']
            # get candidates for task reuse
            job_filters = [
                ['script', '=', script],
                ['repository', '=', arvados.current_job()['repository']],
                ['script_version', 'in git', oldest_git_commit_to_reuse],
                ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
            ]
            if reuse_tasks_retrieve_all:
                # retrieve a full set of all possible reusable tasks
                reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params, job_filters)
                print "Have %s tasks for potential reuse" % (len(reusable_tasks))
            else:
                reusable_task_jobs = get_jobs_for_task_reuse(job_filters)
                print "Have %s jobs for potential task reuse" % (len(reusable_task_jobs))
                reusable_task_job_uuids = [job['uuid'] for job in reusable_task_jobs['items']]

        for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
            # Create task for each CRAM / chunk
            new_task_params = {
                'input': task_input_pdh,
                'ref': ref_input,
                'chunk': chunk_input_pdh
            }
            print "Creating new task to process %s with chunk interval %s " % (f_name, chunk_input_name)
            if reuse_tasks:
                if reuse_tasks_retrieve_all:
                    task = create_or_reuse_task(if_sequence + 1, new_task_params, reusable_tasks, task_key_params, validate_task_output)
                else:
                    task = create_or_reuse_task_from_jobs(if_sequence + 1, new_task_params, reusable_task_job_uuids, task_key_params, validate_task_output)
            else:
                task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success':True}
                                         ).execute()
        exit(0)
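
The non-reuse branch above calls a create_task helper that is not shown in this listing. A minimal sketch of what such a helper could look like, assuming it simply wraps the job_tasks().create() pattern used in the earlier examples (the real helper may differ):

import arvados

def create_task(sequence, parameters):
    # Sketch only: create one new job task at the given sequence with the
    # given parameters, mirroring the job_tasks().create() calls above.
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': parameters,
    }
    return arvados.api().job_tasks().create(body=new_task_attrs).execute()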
Beispiel #40
0
def zipball_extract(zipball, path):
    """Retrieve a zip archive from Keep and extract it to a local
    directory.  Return the absolute path where the archive was
    extracted. If the top level of the archive contained just one
    file or directory, return the absolute path of that single
    item.

    zipball -- collection locator
    path -- where to extract the archive: absolute, or relative to job tmp
    """
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)
    already_have_it = False
    try:
        if os.readlink(os.path.join(path, '.locator')) == zipball:
            already_have_it = True
    except OSError:
        pass
    if not already_have_it:

        # emulate "rm -f" (i.e., if the file does not exist, we win)
        try:
            os.unlink(os.path.join(path, '.locator'))
        except OSError:
            if os.path.exists(os.path.join(path, '.locator')):
                os.unlink(os.path.join(path, '.locator'))

        for f in CollectionReader(zipball).all_files():
            if not re.search('\.zip$', f.name()):
                raise arvados.errors.NotImplementedError(
                    "zipball_extract cannot handle filename %s" % f.name())
            zip_filename = os.path.join(path, os.path.basename(f.name()))
            zip_file = open(zip_filename, 'wb')
            while True:
                buf = f.read(2**20)
                if len(buf) == 0:
                    break
                zip_file.write(buf)
            zip_file.close()

            p = subprocess.Popen(
                ["unzip", "-q", "-o", "-d", path, zip_filename],
                stdout=None,
                stdin=None,
                stderr=sys.stderr,
                shell=False,
                close_fds=True)
            p.wait()
            if p.returncode != 0:
                lockfile.close()
                raise arvados.errors.CommandFailedError("unzip exited %d" %
                                                        p.returncode)
            os.unlink(zip_filename)
        os.symlink(zipball, os.path.join(path, '.locator'))
    tld_extracts = [f for f in os.listdir(path) if f != '.locator']
    lockfile.close()
    if len(tld_extracts) == 1:
        return os.path.join(path, tld_extracts[0])
    return path
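
A brief usage sketch for zipball_extract, assumed to run inside the same crunch script (so arvados is imported and the function above is in scope); the "zipball" script parameter name is a placeholder:

# Sketch: fetch a zipball locator from this job's script_parameters
# ("zipball" is an assumed parameter name) and extract it under the
# job's temporary directory.
zipball_locator = arvados.current_job()['script_parameters']['zipball']
tool_dir = zipball_extract(zipball_locator, 'tools')
print "Extracted zipball to %s" % tool_dir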
Beispiel #41
0
def zipball_extract(zipball, path):
    """Retrieve a zip archive from Keep and extract it to a local
    directory.  Return the absolute path where the archive was
    extracted. If the top level of the archive contained just one
    file or directory, return the absolute path of that single
    item.

    zipball -- collection locator
    path -- where to extract the archive: absolute, or relative to job tmp
    """
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)
    already_have_it = False
    try:
        if os.readlink(os.path.join(path, '.locator')) == zipball:
            already_have_it = True
    except OSError:
        pass
    if not already_have_it:

        # emulate "rm -f" (i.e., if the file does not exist, we win)
        try:
            os.unlink(os.path.join(path, '.locator'))
        except OSError:
            if os.path.exists(os.path.join(path, '.locator')):
                os.unlink(os.path.join(path, '.locator'))

        for f in CollectionReader(zipball).all_files():
            if not re.search('\.zip$', f.name()):
                raise arvados.errors.NotImplementedError(
                    "zipball_extract cannot handle filename %s" % f.name())
            zip_filename = os.path.join(path, os.path.basename(f.name()))
            zip_file = open(zip_filename, 'wb')
            while True:
                buf = f.read(2**20)
                if len(buf) == 0:
                    break
                zip_file.write(buf)
            zip_file.close()

            p = subprocess.Popen(["unzip",
                                  "-q", "-o",
                                  "-d", path,
                                  zip_filename],
                                 stdout=None,
                                 stdin=None, stderr=sys.stderr,
                                 shell=False, close_fds=True)
            p.wait()
            if p.returncode != 0:
                lockfile.close()
                raise arvados.errors.CommandFailedError(
                    "unzip exited %d" % p.returncode)
            os.unlink(zip_filename)
        os.symlink(zipball, os.path.join(path, '.locator'))
    tld_extracts = [f for f in os.listdir(path) if f != '.locator']
    lockfile.close()
    if len(tld_extracts) == 1:
        return os.path.join(path, tld_extracts[0])
    return path
def one_task_per_cram_file(if_sequence=0,
                           and_end_task=True,
                           skip_sq_sn_regex='_decoy$',
                           genome_chunks=200):
    """
    Queue one task for each cram file / genome chunk combination from
    this job's input collection.
    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.
    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.
    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # setup multiprocessing pool
    pool_processes = max(1, cpu_count() - 1)
    print 'Using %d processes to submit tasks\n' % pool_processes
    pool = Pool(processes=pool_processes)

    skip_sq_sn_r = re.compile(skip_sq_sn_regex)

    # Ensure we have a .fa reference file with corresponding .fai index and .dict
    reference_coll = arvados.current_job(
    )['script_parameters']['reference_collection']
    rcr = arvados.CollectionReader(reference_coll)
    ref_fasta = {}
    ref_fai = {}
    ref_dict = {}
    ref_input = None
    dict_reader = None
    for rs in rcr.all_streams():
        for rf in rs.all_files():
            if re.search(r'\.fa$', rf.name()):
                ref_fasta[rs.name(), rf.name()] = rf
            elif re.search(r'\.fai$', rf.name()):
                ref_fai[rs.name(), rf.name()] = rf
            elif re.search(r'\.dict$', rf.name()):
                ref_dict[rs.name(), rf.name()] = rf
    for ((s_name, f_name), fasta_f) in ref_fasta.items():
        fai_f = ref_fai.get(
            (s_name, re.sub(r'fa$', 'fai', f_name)),
            ref_fai.get((s_name, re.sub(r'fa$', 'fa.fai', f_name)), None))
        dict_f = ref_dict.get(
            (s_name, re.sub(r'fa$', 'dict', f_name)),
            ref_dict.get((s_name, re.sub(r'fa$', 'fa.dict', f_name)), None))
        if fasta_f and fai_f and dict_f:
            # found a set of all three!
            ref_input = fasta_f.as_manifest()
            ref_input += fai_f.as_manifest()
            ref_input += dict_f.as_manifest()
            dict_reader = dict_f
            break
    if ref_input is None:
        raise InvalidArgumentError(
            "Expected a reference fasta with fai and dict in reference_collection. Found [%s]"
            % ' '.join(rf.name() for rf in rs.all_files()))
    if dict_reader is None:
        raise InvalidArgumentError(
            "Could not find .dict file in reference_collection. Found [%s]" %
            ' '.join(rf.name() for rf in rs.all_files()))

    # Create a portable data hash for the ref_input manifest
    try:
        r = arvados.api().collections().create(body={
            "manifest_text": ref_input
        }).execute()
        ref_input_pdh = r["portable_data_hash"]
    except:
        raise

    # Load the dict data
    interval_header = ""
    dict_lines = dict_reader.readlines()
    dict_header = dict_lines.pop(0)
    if re.search(r'^@HD', dict_header) is None:
        raise InvalidArgumentError(
            "Dict file in reference collection does not have correct header: [%s]"
            % dict_header)
    interval_header += dict_header
    print "Dict header is %s" % dict_header
    sn_intervals = dict()
    sns = []
    skip_sns = []
    total_len = 0
    for sq in dict_lines:
        if re.search(r'^@SQ', sq) is None:
            raise InvalidArgumentError(
                "Dict file contains malformed SQ line: [%s]" % sq)
        interval_header += sq
        sn = None
        ln = None
        for tagval in sq.split("\t"):
            tv = tagval.split(":", 1)
            if tv[0] == "SN":
                sn = tv[1]
            if tv[0] == "LN":
                ln = tv[1]
            if sn and ln:
                break
        if not (sn and ln):
            raise InvalidArgumentError(
                "Dict file SQ entry missing required SN and/or LN parameters: [%s]"
                % sq)
        assert (sn and ln)
        if sn_intervals.has_key(sn):
            raise InvalidArgumentError(
                "Dict file has duplicate SQ entry for SN %s: [%s]" % (sn, sq))
        if skip_sq_sn_r.search(sn):
            skip_sns.append(sn)
            continue
        sn_intervals[sn] = (1, int(ln))
        sns.append(sn)
        total_len += int(ln)
    total_sequences = len(sns)

    print "Skipped %s SQs with SNs matching regex [%s]" % (len(skip_sns),
                                                           skip_sq_sn_regex)

    # Chunk the genome into genome_chunks pieces
    # weighted by both number of base pairs and number of seqs
    print "Total sequences included: %s" % (total_sequences)
    print "Total genome length: %s" % (total_len)
    total_points = total_len + (total_sequences * weight_seq)
    chunk_points = int(total_points / genome_chunks)
    chunk_input_pdh_names = []
    print "Chunking genome into %s chunks of ~%s points" % (genome_chunks,
                                                            chunk_points)
    for chunk_i in range(0, genome_chunks):
        chunk_num = chunk_i + 1
        chunk_intervals_count = 0
        chunk_input_name = dict_reader.name() + (".%s_of_%s.region_list" %
                                                 (chunk_num, genome_chunks))
        print "Creating interval file for chunk %s" % chunk_num
        chunk_c = arvados.collection.CollectionWriter(num_retries=3)
        chunk_c.start_new_file(newfilename=chunk_input_name)
        # chunk_c.write(interval_header)
        remaining_points = chunk_points
        while len(sns) > 0:
            sn = sns.pop(0)
            remaining_points -= weight_seq
            if remaining_points <= 0:
                sns.insert(0, sn)
                break
            if not sn_intervals.has_key(sn):
                raise ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_points:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_points + start - 1
                assert ((end - start + 1) <= remaining_points)
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            #interval = "%s\t%s\t%s\t+\t%s\n" % (sn, start, end, "interval_%s_of_%s_%s" % (chunk_num, genome_chunks, sn))
            interval = "%s\t%s\t%s\n" % (sn, start, end)
            remaining_points -= (end - start + 1)
            chunk_c.write(interval)
            chunk_intervals_count += 1
            if remaining_points <= 0:
                break
        if chunk_intervals_count > 0:
            chunk_input_pdh = chunk_c.finish()
            print "Chunk intervals file %s saved as %s" % (chunk_input_name,
                                                           chunk_input_pdh)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        else:
            print "WARNING: skipping empty intervals for %s" % chunk_input_name
    print "Have %s chunk collections: [%s]" % (len(
        chunk_input_pdh_names), ' '.join([x[0]
                                          for x in chunk_input_pdh_names]))

    # prepare CRAM input collections
    job_input = arvados.current_job()['script_parameters']['inputs_collection']
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        # Handle this CRAM file
        crai_f = crai.get(
            (s_name, re.sub(r'cram$', 'crai', f_name)),
            crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise InvalidArgumentError(
                "No correponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={
                "manifest_text": task_input
            }).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        create_chunk_tasks(f_name,
                           chunk_input_pdh_names,
                           if_sequence,
                           task_input_pdh,
                           ref_input_pdh,
                           chunk_input_pdh,
                           pool=pool)

    print "Waiting for asynchronous requests to complete"
    pool.close()
    pool.join()

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
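
The chunking loop above weights each chunk by base pairs plus a fixed per-sequence cost (weight_seq, a module-level constant not shown in this listing). A standalone sketch of the arithmetic with illustrative, made-up numbers:

# All numbers below are assumptions chosen only to illustrate the formula.
weight_seq = 5000000        # extra "points" charged per sequence boundary
total_len = 3100000000      # total genome length in bp
total_sequences = 84        # sequences kept after skip_sq_sn_regex filtering
genome_chunks = 200

total_points = total_len + (total_sequences * weight_seq)
chunk_points = int(total_points / genome_chunks)
# total_points = 3,520,000,000, so each chunk gets ~17,600,000 points;
# a chunk fills up from sequence lengths plus weight_seq per sequence it
# includes, and an oversized sequence is split across chunks as above.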
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process each CRAM file
    #          against each interval_list chunk
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()["script_parameters"]["reference_collection"]
    )
    job_input_pdh = arvados.current_job()["script_parameters"]["inputs_collection"]
    interval_lists_pdh = arvados.current_job()["script_parameters"]["interval_lists_collection"]
    interval_count = 1
    if "interval_count" in arvados.current_job()["script_parameters"]:
        interval_count = arvados.current_job()["script_parameters"]["interval_count"]

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.chunked_tasks_per_cram_file(
        ref_input_pdh,
        job_input_pdh,
        interval_lists_pdh,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=False,
        oldest_git_commit_to_reuse="6ca726fc265f9e55765bf1fdf71b86285b8a0ff2",
        script="gatk-haplotypecaller-cram.py",
    )

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert this_task["sequence"] != 0

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if "reuse_job_task" in this_task["parameters"]:
        print "This task's work was already done by JobTask %s" % this_task["parameters"]["reuse_job_task"]
        exit(0)

    ################################################################################
    # Phase IIb: Call Haplotypes!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="chunk")
    cram_file = gatk_helper.mount_gatk_cram_input(input_param="input")
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    out_dir = hgi_arvados.prepare_out_dir()
    out_filename = os.path.basename(cram_file_base) + "." + os.path.basename(interval_list_file) + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_filename = out_filename.replace(".bcf", "._cf")

    # HaplotypeCaller!
    gatk_exit = gatk.haplotype_caller(ref_file, cram_file, interval_list_file, os.path.join(out_dir, out_filename))

    if gatk_exit != 0:
        print "ERROR: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"], body={"success": False}).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        print "Task output written to keep, validating it"
        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"], body={"success": False}).execute()
Beispiel #44
0
#!/usr/bin/python
#

import arvados
import re

arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)

this_job = arvados.current_job()
this_task = arvados.current_task()
this_task_input = this_task['parameters']['input']

input_file = list(arvados.CollectionReader(this_task_input).all_files())[0]

out = arvados.CollectionWriter()
out.set_current_file_name(input_file.decompressed_name())
out.set_current_stream_name(input_file.stream_name())
for line in input_file.readlines():
    out.write("!!!" + line.upper())

this_task.set_output(out.finish())
Beispiel #45
0
  of.write("pwd: " + pwdinfo)

  lsinfo = sp.check_output(["ls", "-lahR"])
  of.write("directory structure:\n" + lsinfo)

  dfinfo = sp.check_output(["df", "-h"])
  of.write("df:\n" + dfinfo)

  meminfo = sp.check_output(["free", "-hm"])
  of.write("mem:\n" + meminfo)

  hostinfo = sp.check_output(["hostname"])
  of.write("host: " + hostinfo)


job = arv.current_job()
task = arv.current_task()

of = arv.CollectionWriter()
of.set_current_file_name("info.log")

whoinfo = sp.check_output(["whoami"])
of.write("user: "******"\n" )

pwdinfo = sp.check_output(["pwd"])
of.write("pwd: " + pwdinfo + "\n" )

lsinfo = sp.check_output(["ls", "-lahR"])
of.write("directory structure:\n" + lsinfo)

dfinfo = sp.check_output(["df", "-h"])
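
The snippet above is truncated at both ends. A self-contained sketch of the same diagnostic crunch script, with the module aliases and the final Keep output step filled in as assumptions based on the visible fragment:

import arvados as arv
import subprocess as sp

# Collect basic environment diagnostics into a single Keep file.
task = arv.current_task()

of = arv.CollectionWriter()
of.set_current_file_name("info.log")
of.write("user: " + sp.check_output(["whoami"]) + "\n")
of.write("pwd: " + sp.check_output(["pwd"]) + "\n")
of.write("directory structure:\n" + sp.check_output(["ls", "-lahR"]))
of.write("df:\n" + sp.check_output(["df", "-h"]))
of.write("mem:\n" + sp.check_output(["free", "-hm"]))
of.write("host: " + sp.check_output(["hostname"]))

# Commit the log to Keep and use the collection as this task's output.
task.set_output(of.finish())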
def main():
    signal(SIGINT, sigint_handler)
    signal(SIGTERM, sigterm_handler)
    
    this_job = arvados.current_job()
    
    skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex']

    genome_chunks = int(this_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True, 
                           skip_sq_sn_regex=skip_sq_sn_regex, 
                           genome_chunks=genome_chunks)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert(this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError("No reference fasta found in reference collection.")

    # Ensure we can read the reference fasta
    test_and_prime_input_file(ref_file, error_exception=FileAccessError("reference fasta not readable: %s" % ref_file))

    # Ensure we have corresponding .fai, and that it is also readable
    ref_fai_file = ref_file + ".fai"
    test_and_prime_input_file(ref_fai_file, error_exception=FileAccessError("reference fai index not readable: %s" % ref_fai_file))

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')

    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError("No chunk intervals file found in chunk collection.")
    # Ensure we can read the chunk file
    test_and_prime_input_file(chunk_file, error_exception=FileAccessError("Chunk intervals file not readable: %s" % chunk_file))

    # Get single CRAM file for this task 
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    stream_name = ""
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    test_and_prime_input_file(cram_file, error_exception=FileAccessError("CRAM file not readable: %s" % cram_file))

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert(cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not test_and_prime_input_file(crai_file, error_exception=None):
        crai_file = cram_file_base + ".cram.crai"
        if not test_and_prime_input_file(crai_file, error_exception=None):
            raise FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file)


    # Will write to out_dir, make sure it is empty
    tmp_dir = arvados.current_task().tmpdir
    out_dir = os.path.join(tmp_dir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir) 
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    output_basename = os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file)
    out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp")
    penultimate_out_file = os.path.join(tmp_dir, output_basename + ".provheader.g.vcf.gz")
    final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz")

#    bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view  -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)
    regions = []
    print "Preparing region list from chunk file [%s]" % chunk_file
    with open(chunk_file, 'r') as f:
        for line in f.readlines():
            (chr, start, end) = line.rstrip().split()
            region = "%s:%s-%s" % (chr, start, end)
            regions.append(region)
    total_region_count = len(regions)

    print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count

    concat_noheader_fifos = dict()
    concat_headeronly_tmps = dict()
    current_region_num = 0
    for region in regions:
        current_region_num += 1
        concat_noheader_fifo = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.g.vcf" % (current_region_num, total_region_count)))
        try:
            os.mkfifo(concat_noheader_fifo, 0600)
        except:
            print "ERROR: could not mkfifo %s" % concat_noheader_fifo
            raise
        fifos_to_delete.append(concat_noheader_fifo)
        concat_noheader_fifos[region] = concat_noheader_fifo
        concat_headeronly_tmp = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" % (current_region_num, total_region_count)))
        concat_headeronly_tmps[region] = concat_headeronly_tmp

    region_concat_cmd = ["cat"]
    region_concat_cmd.extend([concat_noheader_fifos[region] for region in regions])

    # open file for output file
    out_file_tmp_f = open(out_file_tmp, 'wb')
    
    region_concat_p = run_child_cmd(region_concat_cmd,
                                    stdout=out_file_tmp_f,
                                    tag="bcftools concat (stderr)")
    
    current_region_num = 0
    current_concat_noheader_fifo_f = None
    regions_to_process = list(regions)
    bcftools_mpileup_p = None
    bcftools_norm_p = None
    part_tee_p = None
    bcftools_view_headeronly_p = None
    bcftools_view_noheader_p = None
    while True:
        # at least one of the regional aggregation processes is still running

        watch_fds_and_print_output()
    
        if (
                (bcftools_mpileup_p is None) and
                (bcftools_norm_p is None) and 
                (part_tee_p is None) and
                (bcftools_view_headeronly_p is None) and 
                (bcftools_view_noheader_p is None)
        ):
            # no per-region processes are running (they have finished or 
            # have not yet started)
            if len(regions_to_process) > 0:
                # have more regions to run
                region = regions_to_process.pop(0)
                current_region_num += 1
                region_label = "%s/%s [%s]" % (current_region_num, total_region_count, region)
                concat_noheader_fifo = concat_noheader_fifos[region]
                bcftools_view_noheader_input_fifo = os.path.join(tmp_dir, output_basename + (".part_%s_of_%s.noheader.g.bcf" % (current_region_num, total_region_count)))
                part_tee_cmd = ["teepot", bcftools_view_noheader_input_fifo, "-"]
                bcftools_view_noheader_cmd = ["bcftools", "view", "-H", "-Ov", bcftools_view_noheader_input_fifo]
                concat_headeronly_tmp = concat_headeronly_tmps[region]
                bcftools_view_headeronly_cmd = ["bcftools", "view", "-h", "-Oz", "-o", concat_headeronly_tmp]
                bcftools_norm_cmd = ["bcftools", "norm", 
                                     "-f", ref_file, 
                                     "-Ou"]
                bcftools_mpileup_cmd = ["bcftools-gvcf", "mpileup",
                                        "-t", "AD,INFO/AD",
                                        "-C50", 
                                        "-pm2", 
                                        "-F0.1",
                                        "-d10000",
                                        "--gvcf", "1,2,3,4,5,10,15",
                                        "-f", ref_file,
                                        "-Ou",
                                        "-r", region,
                                        cram_file]

                print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % (region_label)
                bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe()

                print "Creating 'bcftools norm | tee' pipe for region %s" % (region_label)
                part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe()
                
                print "Creating 'tee | bcftools view -h' pipe for region %s" % (region_label)
                bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe()
                
                print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % (bcftools_view_noheader_input_fifo, region_label)
                try:
                    os.mkfifo(bcftools_view_noheader_input_fifo, 0600)
                except:
                    print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo
                    raise
                fifos_to_delete.append(bcftools_view_noheader_input_fifo)

                print "Opening concat fifo %s for writing" % concat_noheader_fifo
                if current_concat_noheader_fifo_f is not None:
                    #print "ERROR: current_concat_noheader_fifo_f was not closed properly"
                    #raise Exception("current_concat_noheader_fifo_f was not closed properly")
                    current_concat_noheader_fifo_f.close()
                current_concat_noheader_fifo_f = open(concat_noheader_fifo, 'wb')

                bcftools_mpileup_p = run_child_cmd(bcftools_mpileup_cmd,
                                                   stdout=bcftools_norm_stdin_pipe_write,
                                                   tag="bcftools mpileup %s" % (region_label))
                
                bcftools_norm_p = run_child_cmd(bcftools_norm_cmd,
                                                stdin=bcftools_norm_stdin_pipe_read,
                                                stdout=part_tee_stdin_pipe_write,
                                                tag="bcftools norm %s" % (region_label))

                part_tee_p = run_child_cmd(part_tee_cmd,
                                           stdin=part_tee_stdin_pipe_read,
                                           stdout=bcftools_view_headeronly_stdin_pipe_write,
                                           tag="tee %s" % (region_label))
                
                bcftools_view_headeronly_p = run_child_cmd(bcftools_view_headeronly_cmd,
                                                           stdin=bcftools_view_headeronly_stdin_pipe_read,
                                                           tag="bcftools view -h %s" % (region_label))

                bcftools_view_noheader_p = run_child_cmd(bcftools_view_noheader_cmd,
                                                         stdout=current_concat_noheader_fifo_f,
                                                         tag="bcftools view %s" % (region_label))

        bcftools_mpileup_p = close_process_if_finished(bcftools_mpileup_p,
                                                       "bcftools mpileup %s" % (region_label),
                                                       close_fds=[bcftools_norm_stdin_pipe_write])

        bcftools_norm_p = close_process_if_finished(bcftools_norm_p,
                                                    "bcftools norm %s" % (region_label),
                                                    close_fds=[bcftools_norm_stdin_pipe_read, 
                                                               part_tee_stdin_pipe_write])
        
        part_tee_p = close_process_if_finished(part_tee_p,
                                               "tee %s" % (region_label),
                                               close_fds=[part_tee_stdin_pipe_read,
                                                          bcftools_view_headeronly_stdin_pipe_write],
                                               ignore_error=True)

        bcftools_view_headeronly_p = close_process_if_finished(bcftools_view_headeronly_p,
                                                               "bcftools view -h %s" % (region_label),
                                                               close_fds=[bcftools_view_headeronly_stdin_pipe_read])

        bcftools_view_noheader_p = close_process_if_finished(bcftools_view_noheader_p,
                                                             "bcftools view %s" % (region_label),
                                                             close_files=[current_concat_noheader_fifo_f])

        region_concat_p = close_process_if_finished(region_concat_p,
                                                      "bcftools concat",
                                                      close_files=[out_file_tmp_f])

        # end loop once all processes have finished
        if (
            (region_concat_p is None)
            and (bcftools_view_noheader_p is None)
            and (bcftools_view_headeronly_p is None)
            and (part_tee_p is None)
            and (bcftools_norm_p is None)
            and (bcftools_mpileup_p is None)
            ):
            print "All region work has completed"
            break
        else:
            sleep(0.01)
            # continue to next loop iteration


    if len(child_pids) > 0:
        print "WARNING: some children are still alive: [%s]" % (child_pids)
        for pid in child_pids:
            print "Attempting to terminate %s forcefully" % (pid)
            try:
                os.kill(pid, SIGTERM)
            except Exception as e:
                print "Could not kill pid %s: %s" % (pid, e)

    for fifo in fifos_to_delete:
        try:
            os.remove(fifo)
        except:
            raise

    concat_headeronly_tmp_fofn = os.path.join(tmp_dir, output_basename + ".fifos_fofn")
    tmp_files_to_delete = []
    print "Preparing fofn for bcftools concat (headeronly): %s" % (concat_headeronly_tmp_fofn)
    with open(concat_headeronly_tmp_fofn, 'w') as f:
        print "Checking files for regions: %s" % regions
        for concat_headeronly_tmp in [concat_headeronly_tmps[region] for region in regions]:
            if os.path.exists(concat_headeronly_tmp):
                print "Adding %s to fofn" % concat_headeronly_tmp
                f.write("%s\n" % concat_headeronly_tmp)
                tmp_files_to_delete.append(concat_headeronly_tmp)
            else:
                print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp

    final_headeronly_tmp = os.path.join(tmp_dir, output_basename + ".headeronly.g.vcf")
    final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb')

    print "Creating 'bcftools concat | grep' pipe" 
    grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe()

    grep_headeronly_cmd = ["egrep", "-v", "^[#][#](bcftools|mpileup|reference)"]
    grep_headeronly_p = run_child_cmd(grep_headeronly_cmd,
                                      stdin=grep_headeronly_stdin_pipe_read,
                                      stdout=final_headeronly_tmp_f,
                                      tag="grep (headeronly)")
    bcftools_concat_headeronly_cmd = ["bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn]
    bcftools_concat_headeronly_p = run_child_cmd(bcftools_concat_headeronly_cmd,
                                                 stdout=grep_headeronly_stdin_pipe_write,
                                                 tag="bcftools concat (headeronly)")
    while True:
        watch_fds_and_print_output()
        bcftools_concat_headeronly_p = close_process_if_finished(bcftools_concat_headeronly_p,
                                                                 "bcftools concat (headeronly)",
                                                                 close_fds=[grep_headeronly_stdin_pipe_write])
        grep_headeronly_p = close_process_if_finished(grep_headeronly_p,
                                                      "grep (headeronly)",
                                                      close_fds=[grep_headeronly_stdin_pipe_read],
                                                      close_files=[final_headeronly_tmp_f])
        if ((bcftools_concat_headeronly_p is None)
            and (grep_headeronly_p is None)):
            # none of the processes are still running, we're done! 
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_concat_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate bcftools concat (headeronly)"

    if grep_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate grep (headeronly)"


    # check if there was any data output
    if os.stat(out_file_tmp)[6] == 0:
        # 0-byte data file, there is no point in concatenating and 
        # reheader will reject the file, so we need to bgzip it ourselves
        print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % (final_out_file)
        final_out_file_f = open(final_out_file, 'wb')
        final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp]
        final_bgzip_p = run_child_cmd(final_bgzip_cmd, tag="final bgzip", stdout=final_out_file_f)
        while True:
            watch_fds_and_print_output()
            final_bgzip_p = close_process_if_finished(final_bgzip_p,
                                                      "final bgzip",
                                                      close_files=[final_out_file_f])
            if (final_bgzip_p is None):
                # none of the processes are still running, we're done! 
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bgzip_p is not None:
            print "ERROR: failed to cleanly terminate final bgzip (header with no data)"
    else:
        # there is some data in the data file
        print "Creating final 'cat | bcftools view -Oz' pipe"
        final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe()
        print "Preparing penultimate output file [%s]" % (penultimate_out_file)
        final_bcftools_view_cmd = ["bcftools", "view", "-Oz", "-o", penultimate_out_file]
        final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp]
        final_bcftools_view_p = run_child_cmd(final_bcftools_view_cmd, tag="final bcftools view -Oz", stdin=final_bcftools_view_stdin_pipe_read)
        final_concat_p = run_child_cmd(final_concat_cmd, tag="final cat (header+data)", stdout=final_bcftools_view_stdin_pipe_write)
        while True:
            watch_fds_and_print_output()
            final_bcftools_view_p = close_process_if_finished(final_bcftools_view_p,
                                                              "final bcftools view -Oz",
                                                              close_fds=[final_bcftools_view_stdin_pipe_read])
            final_concat_p = close_process_if_finished(final_concat_p,
                                                       "final cat (header+data)",
                                                       close_fds=[final_bcftools_view_stdin_pipe_write])
            if ((final_concat_p is None)
                and (final_bcftools_view_p is None)):
                # none of the processes are still running, we're done! 
                break
            else:
                sleep(0.01)
                # continue to next loop iteration

        if final_bcftools_view_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools view -Oz"

        if final_concat_p is not None:
            print "ERROR: failed to cleanly terminate final cat (header+data)"

        print "Reheadering penultimate output file into final out file [%s]" % (final_out_file)
        final_bcftools_reheader_cmd = ["bcftools", "reheader", "-h", final_headeronly_tmp, "-o", final_out_file, penultimate_out_file]
        final_bcftools_reheader_p = run_child_cmd(final_bcftools_reheader_cmd, tag="final bcftools reheader")
        while True:
            watch_fds_and_print_output()
            final_bcftools_reheader_p = close_process_if_finished(final_bcftools_reheader_p,
                                                              "final bcftools reheader")
            if (final_bcftools_reheader_p is None):
                # none of the processes are still running, we're done! 
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bcftools_reheader_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools reheader"
        os.remove(penultimate_out_file)

    print "Indexing final output file [%s]" % (final_out_file)
    bcftools_index_cmd = ["bcftools", "index", final_out_file]
    bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index")
    while True:
        watch_fds_and_print_output()
        bcftools_index_p = close_process_if_finished(bcftools_index_p,
                                                     "bcftools index")
        if (bcftools_index_p is None):
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_index_p is not None:
        print "ERROR: failed to cleanly terminate bcftools index"

    print "Complete, removing temporary files"
    os.remove(concat_headeronly_tmp_fofn)
    os.remove(out_file_tmp)
    os.remove(final_headeronly_tmp)
    for tmp_file in tmp_files_to_delete:
        os.remove(tmp_file)

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()
    print "Task output locator [%s]" % output_locator

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)

    # Done!
    print "Task complete!"
Beispiel #47
0
def collection_extract(collection, path, files=[], decompress=True):
    """Retrieve a collection from Keep and extract it to a local
    directory.  Return the absolute path where the collection was
    extracted.

    collection -- collection locator
    path -- where to extract: absolute, or relative to job tmp
    """
    matches = re.search(r'^([0-9a-f]+)(\+[\w@]+)*$', collection)
    if matches:
        collection_hash = matches.group(1)
    else:
        collection_hash = hashlib.md5(collection).hexdigest()
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)
    already_have_it = False
    try:
        if os.readlink(os.path.join(path, '.locator')) == collection_hash:
            already_have_it = True
    except OSError:
        pass

    # emulate "rm -f" (i.e., if the file does not exist, we win)
    try:
        os.unlink(os.path.join(path, '.locator'))
    except OSError:
        if os.path.exists(os.path.join(path, '.locator')):
            os.unlink(os.path.join(path, '.locator'))

    files_got = []
    for s in CollectionReader(collection).all_streams():
        stream_name = s.name()
        for f in s.all_files():
            if (files == [] or
                ((f.name() not in files_got) and
                 (f.name() in files or
                  (decompress and f.decompressed_name() in files)))):
                outname = f.decompressed_name() if decompress else f.name()
                files_got += [outname]
                if os.path.exists(os.path.join(path, stream_name, outname)):
                    continue
                mkdir_dash_p(os.path.dirname(os.path.join(path, stream_name, outname)))
                outfile = open(os.path.join(path, stream_name, outname), 'wb')
                for buf in (f.readall_decompressed() if decompress
                            else f.readall()):
                    outfile.write(buf)
                outfile.close()
    if len(files_got) < len(files):
        raise arvados.errors.AssertionError(
            "Wanted files %s but only got %s from %s" %
            (files, files_got,
             [z.name() for z in CollectionReader(collection).all_files()]))
    os.symlink(collection_hash, os.path.join(path, '.locator'))

    lockfile.close()
    return path
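
A brief usage sketch for collection_extract, assumed to run inside a crunch script where arvados is imported and the function above is in scope; the file names below are placeholders:

# Sketch: extract two reference files from the collection named by this
# job's "reference_collection" script parameter (file names are assumed).
ref_coll = arvados.current_job()['script_parameters']['reference_collection']
ref_dir = collection_extract(ref_coll, 'ref',
                             files=['ref.fa', 'ref.fa.fai'],
                             decompress=True)
print "Reference extracted to %s" % ref_dir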
Beispiel #48
0
def collection_extract(collection, path, files=[], decompress=True):
    """Retrieve a collection from Keep and extract it to a local
    directory.  Return the absolute path where the collection was
    extracted.

    collection -- collection locator
    path -- where to extract: absolute, or relative to job tmp
    """
    matches = re.search(r'^([0-9a-f]+)(\+[\w@]+)*$', collection)
    if matches:
        collection_hash = matches.group(1)
    else:
        collection_hash = hashlib.md5(collection).hexdigest()
    if not re.search('^/', path):
        path = os.path.join(arvados.current_job().tmpdir, path)
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        os.stat(path)
    except OSError:
        os.mkdir(path)
    already_have_it = False
    try:
        if os.readlink(os.path.join(path, '.locator')) == collection_hash:
            already_have_it = True
    except OSError:
        pass

    # emulate "rm -f" (i.e., if the file does not exist, we win)
    try:
        os.unlink(os.path.join(path, '.locator'))
    except OSError:
        if os.path.exists(os.path.join(path, '.locator')):
            os.unlink(os.path.join(path, '.locator'))

    files_got = []
    for s in CollectionReader(collection).all_streams():
        stream_name = s.name()
        for f in s.all_files():
            if (files == []
                    or ((f.name() not in files_got) and
                        (f.name() in files or
                         (decompress and f.decompressed_name() in files)))):
                outname = f.decompressed_name() if decompress else f.name()
                files_got += [outname]
                if os.path.exists(os.path.join(path, stream_name, outname)):
                    continue
                mkdir_dash_p(
                    os.path.dirname(os.path.join(path, stream_name, outname)))
                outfile = open(os.path.join(path, stream_name, outname), 'wb')
                for buf in (f.readall_decompressed()
                            if decompress else f.readall()):
                    outfile.write(buf)
                outfile.close()
    if len(files_got) < len(files):
        raise arvados.errors.AssertionError(
            "Wanted files %s but only got %s from %s" %
            (files, files_got,
             [z.name() for z in CollectionReader(collection).all_files()]))
    os.symlink(collection_hash, os.path.join(path, '.locator'))

    lockfile.close()
    return path
def main():
    signal(SIGINT, sigint_handler)
    signal(SIGTERM, sigterm_handler)

    this_job = arvados.current_job()

    skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex']

    genome_chunks = int(this_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0,
                           and_end_task=True,
                           skip_sq_sn_regex=skip_sq_sn_regex,
                           genome_chunks=genome_chunks)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError(
            "No reference fasta found in reference collection.")

    # Ensure we can read the reference fasta
    test_and_prime_input_file(
        ref_file,
        error_exception=FileAccessError("reference fasta not readable: %s" %
                                        ref_file))

    # Ensure we have corresponding .fai, and that it is also readable
    ref_fai_file = ref_file + ".fai"
    test_and_prime_input_file(
        ref_fai_file,
        error_exception=FileAccessError(
            "reference fai index not readable: %s" % ref_fai_file))

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')

    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError(
            "No chunk intervals file found in chunk collection.")
    # Ensure we can read the chunk file
    test_and_prime_input_file(
        chunk_file,
        error_exception=FileAccessError(
            "Chunk intervals file not readable: %s" % chunk_file))

    # Get single CRAM file for this task
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    stream_name = ""
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    test_and_prime_input_file(cram_file,
                              error_exception=FileAccessError(
                                  "CRAM file not readable: %s" % cram_file))

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert (cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not test_and_prime_input_file(crai_file, error_exception=None):
        crai_file = cram_file_base + ".cram.crai"
        if not test_and_prime_input_file(crai_file, error_exception=None):
            raise FileAccessError(
                "No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    tmp_dir = arvados.current_task().tmpdir
    out_dir = os.path.join(tmp_dir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir,
                                                            old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    output_basename = os.path.basename(
        cram_file_base) + "." + os.path.basename(chunk_file)
    out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp")
    penultimate_out_file = os.path.join(
        tmp_dir, output_basename + ".provheader.g.vcf.gz")
    final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz")

    #    bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view  -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)
    regions = []
    print "Preparing region list from chunk file [%s]" % chunk_file
    with open(chunk_file, 'r') as f:
        for line in f.readlines():
            (chr, start, end) = line.rstrip().split()
            region = "%s:%s-%s" % (chr, start, end)
            regions.append(region)
    total_region_count = len(regions)

    print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count

    concat_noheader_fifos = dict()
    concat_headeronly_tmps = dict()
    current_region_num = 0
    for region in regions:
        current_region_num += 1
        concat_noheader_fifo = os.path.join(
            tmp_dir,
            output_basename + (".part_%s_of_%s.g.vcf" %
                               (current_region_num, total_region_count)))
        try:
            os.mkfifo(concat_noheader_fifo, 0600)
        except:
            print "ERROR: could not mkfifo %s" % concat_noheader_fifo
            raise
        fifos_to_delete.append(concat_noheader_fifo)
        concat_noheader_fifos[region] = concat_noheader_fifo
        concat_headeronly_tmp = os.path.join(
            tmp_dir,
            output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" %
                               (current_region_num, total_region_count)))
        concat_headeronly_tmps[region] = concat_headeronly_tmp

    region_concat_cmd = ["cat"]
    region_concat_cmd.extend(
        [concat_noheader_fifos[region] for region in regions])

    # open file for output file
    out_file_tmp_f = open(out_file_tmp, 'wb')

    region_concat_p = run_child_cmd(region_concat_cmd,
                                    stdout=out_file_tmp_f,
                                    tag="bcftools concat (stderr)")

    current_region_num = 0
    current_concat_noheader_fifo_f = None
    regions_to_process = list(regions)
    bcftools_mpileup_p = None
    bcftools_norm_p = None
    part_tee_p = None
    bcftools_view_headeronly_p = None
    bcftools_view_noheader_p = None
    while True:
        # at least one of the regional aggregation processes is still running

        watch_fds_and_print_output()

        if ((bcftools_mpileup_p is None) and (bcftools_norm_p is None)
                and (part_tee_p is None)
                and (bcftools_view_headeronly_p is None)
                and (bcftools_view_noheader_p is None)):
            # no per-region processes are running (they have finished or
            # have not yet started)
            if len(regions_to_process) > 0:
                # have more regions to run
                region = regions_to_process.pop(0)
                current_region_num += 1
                region_label = "%s/%s [%s]" % (current_region_num,
                                               total_region_count, region)
                concat_noheader_fifo = concat_noheader_fifos[region]
                bcftools_view_noheader_input_fifo = os.path.join(
                    tmp_dir, output_basename +
                    (".part_%s_of_%s.noheader.g.bcf" %
                     (current_region_num, total_region_count)))
                part_tee_cmd = [
                    "teepot", bcftools_view_noheader_input_fifo, "-"
                ]
                bcftools_view_noheader_cmd = [
                    "bcftools", "view", "-H", "-Ov",
                    bcftools_view_noheader_input_fifo
                ]
                concat_headeronly_tmp = concat_headeronly_tmps[region]
                bcftools_view_headeronly_cmd = [
                    "bcftools", "view", "-h", "-Oz", "-o",
                    concat_headeronly_tmp
                ]
                bcftools_norm_cmd = ["bcftools", "norm", "-f", ref_file, "-Ou"]
                bcftools_mpileup_cmd = [
                    "bcftools-gvcf", "mpileup", "-t", "AD,INFO/AD", "-C50",
                    "-pm2", "-F0.1", "-d10000", "--gvcf", "1,2,3,4,5,10,15",
                    "-f", ref_file, "-Ou", "-r", region, cram_file
                ]
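                # Per-region pipeline, plumbed together below with os.pipe()
                # and a named fifo:
                #   bcftools-gvcf mpileup -r <region> | bcftools norm | teepot
                #     teepot stdout -> bcftools view -h -Oz  (per-region header-only .gz)
                #     teepot fifo   -> bcftools view -H -Ov  (records only, into the concat fifo)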

                print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % (
                    region_label)
                bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe(
                )

                print "Creating 'bcftools norm | tee' pipe for region %s" % (
                    region_label)
                part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe()

                print "Creating 'tee | bcftools view -h' pipe for region %s" % (
                    region_label)
                bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe(
                )

                print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % (
                    bcftools_view_noheader_input_fifo, region_label)
                try:
                    os.mkfifo(bcftools_view_noheader_input_fifo, 0600)
                except:
                    print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo
                    raise
                fifos_to_delete.append(bcftools_view_noheader_input_fifo)

                print "Opening concat fifo %s for writing" % concat_noheader_fifo
                if current_concat_noheader_fifo_f is not None:
                    #print "ERROR: current_concat_noheader_fifo_f was not closed properly"
                    #raise Exception("current_concat_noheader_fifo_f was not closed properly")
                    current_concat_noheader_fifo_f.close()
                current_concat_noheader_fifo_f = open(concat_noheader_fifo,
                                                      'wb')

                bcftools_mpileup_p = run_child_cmd(
                    bcftools_mpileup_cmd,
                    stdout=bcftools_norm_stdin_pipe_write,
                    tag="bcftools mpileup %s" % (region_label))

                bcftools_norm_p = run_child_cmd(
                    bcftools_norm_cmd,
                    stdin=bcftools_norm_stdin_pipe_read,
                    stdout=part_tee_stdin_pipe_write,
                    tag="bcftools norm %s" % (region_label))

                part_tee_p = run_child_cmd(
                    part_tee_cmd,
                    stdin=part_tee_stdin_pipe_read,
                    stdout=bcftools_view_headeronly_stdin_pipe_write,
                    tag="tee %s" % (region_label))

                bcftools_view_headeronly_p = run_child_cmd(
                    bcftools_view_headeronly_cmd,
                    stdin=bcftools_view_headeronly_stdin_pipe_read,
                    tag="bcftools view -h %s" % (region_label))

                bcftools_view_noheader_p = run_child_cmd(
                    bcftools_view_noheader_cmd,
                    stdout=current_concat_noheader_fifo_f,
                    tag="bcftools view %s" % (region_label))

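        # Reap whichever pipeline children have exited and close the fds or
        # file handles they were using, so downstream readers see EOF and can
        # finish in turn.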
        bcftools_mpileup_p = close_process_if_finished(
            bcftools_mpileup_p,
            "bcftools mpileup %s" % (region_label),
            close_fds=[bcftools_norm_stdin_pipe_write])

        bcftools_norm_p = close_process_if_finished(
            bcftools_norm_p,
            "bcftools norm %s" % (region_label),
            close_fds=[
                bcftools_norm_stdin_pipe_read, part_tee_stdin_pipe_write
            ])

        part_tee_p = close_process_if_finished(
            part_tee_p,
            "tee %s" % (region_label),
            close_fds=[
                part_tee_stdin_pipe_read,
                bcftools_view_headeronly_stdin_pipe_write
            ],
            ignore_error=True)

        bcftools_view_headeronly_p = close_process_if_finished(
            bcftools_view_headeronly_p,
            "bcftools view -h %s" % (region_label),
            close_fds=[bcftools_view_headeronly_stdin_pipe_read])

        bcftools_view_noheader_p = close_process_if_finished(
            bcftools_view_noheader_p,
            "bcftools view %s" % (region_label),
            close_files=[current_concat_noheader_fifo_f])

        region_concat_p = close_process_if_finished(
            region_concat_p, "bcftools concat", close_files=[out_file_tmp_f])

        # end loop once all processes have finished
        if ((region_concat_p is None) and (bcftools_view_noheader_p is None)
                and (bcftools_view_headeronly_p is None)
                and (part_tee_p is None) and (bcftools_norm_p is None)
                and (bcftools_mpileup_p is None)):
            print "All region work has completed"
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if len(child_pids) > 0:
        print "WARNING: some children are still alive: [%s]" % (child_pids)
        for pid in child_pids:
            print "Attempting to terminate %s forcefully" % (pid)
            try:
                os.kill(pid, SIGTERM)
            except Exception as e:
                print "Could not kill pid %s: %s" % (pid, e)

    for fifo in fifos_to_delete:
        try:
            os.remove(fifo)
        except:
            raise

    concat_headeronly_tmp_fofn = os.path.join(tmp_dir,
                                              output_basename + ".fifos_fofn")
    tmp_files_to_delete = []
    print "Preparing fofn for bcftools concat (headeronly): %s" % (
        concat_headeronly_tmp_fofn)
    with open(concat_headeronly_tmp_fofn, 'w') as f:
        print "Checking files for regions: %s" % regions
        for concat_headeronly_tmp in [
                concat_headeronly_tmps[region] for region in regions
        ]:
            if os.path.exists(concat_headeronly_tmp):
                print "Adding %s to fofn" % concat_headeronly_tmp
                f.write("%s\n" % concat_headeronly_tmp)
                tmp_files_to_delete.append(concat_headeronly_tmp)
            else:
                print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp

    final_headeronly_tmp = os.path.join(tmp_dir,
                                        output_basename + ".headeronly.g.vcf")
    final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb')

    print "Creating 'bcftools concat | grep' pipe"
    grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe(
    )

    grep_headeronly_cmd = [
        "egrep", "-v", "^[#][#](bcftools|mpileup|reference)"
    ]
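    # Drop the ##bcftools_*/##mpileup_*/##reference header lines from the
    # concatenated per-region headers; these carry per-region command lines
    # and provenance that would otherwise be repeated for every region.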
    grep_headeronly_p = run_child_cmd(grep_headeronly_cmd,
                                      stdin=grep_headeronly_stdin_pipe_read,
                                      stdout=final_headeronly_tmp_f,
                                      tag="grep (headeronly)")
    bcftools_concat_headeronly_cmd = [
        "bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn
    ]
    bcftools_concat_headeronly_p = run_child_cmd(
        bcftools_concat_headeronly_cmd,
        stdout=grep_headeronly_stdin_pipe_write,
        tag="bcftools concat (headeronly)")
    while True:
        watch_fds_and_print_output()
        bcftools_concat_headeronly_p = close_process_if_finished(
            bcftools_concat_headeronly_p,
            "bcftools concat (headeronly)",
            close_fds=[grep_headeronly_stdin_pipe_write])
        grep_headeronly_p = close_process_if_finished(
            grep_headeronly_p,
            "grep (headeronly)",
            close_fds=[grep_headeronly_stdin_pipe_read],
            close_files=[final_headeronly_tmp_f])
        if ((bcftools_concat_headeronly_p is None)
                and (grep_headeronly_p is None)):
            # none of the processes are still running, we're done!
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_concat_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate bcftools concat (headeronly)"

    if grep_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate grep (headeronly)"

    # check if there was any data output
    if os.stat(out_file_tmp).st_size == 0:
        # 0-byte data file, there is no point in concatenating and
        # reheader will reject the file, so we need to bgzip it ourselves
        print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % (
            final_out_file)
        final_out_file_f = open(final_out_file, 'wb')
        final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp]
        final_bgzip_p = run_child_cmd(final_bgzip_cmd,
                                      tag="final bgzip",
                                      stdout=final_out_file_f)
        while True:
            watch_fds_and_print_output()
            final_bgzip_p = close_process_if_finished(
                final_bgzip_p, "final bgzip", close_files=[final_out_file_f])
            if (final_bgzip_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bgzip_p is not None:
            print "ERROR: failed to cleanly terminate final bgzip (header with no data)"
    else:
        # there is some data in the data file
        print "Creating final 'cat | bcftools view -Oz' pipe"
        final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe(
        )
        print "Preparing penultimate output file [%s]" % (penultimate_out_file)
        final_bcftools_view_cmd = [
            "bcftools", "view", "-Oz", "-o", penultimate_out_file
        ]
        final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp]
        final_bcftools_view_p = run_child_cmd(
            final_bcftools_view_cmd,
            tag="final bcftools view -Oz",
            stdin=final_bcftools_view_stdin_pipe_read)
        final_concat_p = run_child_cmd(
            final_concat_cmd,
            tag="final cat (header+data)",
            stdout=final_bcftools_view_stdin_pipe_write)
        while True:
            watch_fds_and_print_output()
            final_bcftools_view_p = close_process_if_finished(
                final_bcftools_view_p,
                "final bcftools view -Oz",
                close_fds=[final_bcftools_view_stdin_pipe_read])
            final_concat_p = close_process_if_finished(
                final_concat_p,
                "final cat (header+data)",
                close_fds=[final_bcftools_view_stdin_pipe_write])
            if ((final_concat_p is None) and (final_bcftools_view_p is None)):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration

        if final_bcftools_view_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools view -Oz"

        if final_concat_p is not None:
            print "ERROR: failed to cleanly terminate final cat (header+data)"

        print "Reheadering penultimate output file into final out file [%s]" % (
            final_out_file)
        final_bcftools_reheader_cmd = [
            "bcftools", "reheader", "-h", final_headeronly_tmp, "-o",
            final_out_file, penultimate_out_file
        ]
        final_bcftools_reheader_p = run_child_cmd(
            final_bcftools_reheader_cmd, tag="final bcftools reheader")
        while True:
            watch_fds_and_print_output()
            final_bcftools_reheader_p = close_process_if_finished(
                final_bcftools_reheader_p, "final bcftools reheader")
            if (final_bcftools_reheader_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bcftools_reheader_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools reheader"
        os.remove(penultimate_out_file)

    print "Indexing final output file [%s]" % (final_out_file)
    bcftools_index_cmd = ["bcftools", "index", final_out_file]
    bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index")
    while True:
        watch_fds_and_print_output()
        bcftools_index_p = close_process_if_finished(bcftools_index_p,
                                                     "bcftools index")
        if (bcftools_index_p is None):
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_index_p is not None:
        print "ERROR: failed to cleanly terminate bcftools index"

    print "Complete, removing temporary files"
    os.remove(concat_headeronly_tmp_fofn)
    os.remove(out_file_tmp)
    os.remove(final_headeronly_tmp)
    for tmp_file in tmp_files_to_delete:
        os.remove(tmp_file)

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()
    print "Task output locator [%s]" % output_locator

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)

    # Done!
    print "Task complete!"

def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh, job_input_pdh, interval_lists_pdh,
                                                   group_by_regex, max_gvcfs_to_combine,
                                                   if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert(this_task['sequence'] > 0)

    ################################################################################
    # Phase II: Read interval_list and split into additional intervals
    ################################################################################
    hgi_arvados.one_task_per_interval(interval_count, validate_task_output,
                                      reuse_tasks=True,
                                      oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1",
                                      if_sequence=1, and_end_task=True)

    # We will never reach this point if we are in the 1st task sequence
    assert(this_task['sequence'] > 1)

    ################################################################################
    # Phase IIIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIIb: Combine gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    interval_str = this_task['parameters'].get('interval')
    if not interval_str:
        interval_str = ""
    interval_strs = interval_str.split()
    intervals = []
    for interval in interval_strs:
        intervals.extend(["--intervals", interval])
    out_file = name + ".vcf.gz"
    if interval_count > 1:
        out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz"
        if len(out_file) > 255:
            out_file = name + "." + '_'.join([interval_strs[0], interval_strs[-1]]) + ".vcf.gz"
            print "Output file name was too long with full interval list, shortened it to: %s" % out_file
        if len(out_file) > 255:
            raise errors.InvalidArgumentError("Output file name is too long, cannot continue: %s" % out_file)

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # CombineGVCFs!
    extra_args = intervals
    extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"])
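    # --breakBandsAtMultiplesOf 1000000 asks CombineGVCFs to end reference
    # bands at 1 Mb boundaries so that band edges line up when the combined
    # gVCF is later genotyped per interval.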
    gatk_exit = gatk.combine_gvcfs(ref_file, gvcf_files, os.path.join(out_dir, out_file), extra_gatk_args=extra_args)

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success':False}
                                         ).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success':False}
                                             ).execute()
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import arvados
import subprocess
import crunchutil.subst as subst
import shutil
import os
import sys
import time

if len(arvados.current_task()['parameters']) > 0:
    p = arvados.current_task()['parameters']
else:
    p = arvados.current_job()['script_parameters']

t = arvados.current_task().tmpdir

os.unlink("/usr/local/share/bcbio-nextgen/galaxy")
os.mkdir("/usr/local/share/bcbio-nextgen/galaxy")
shutil.copy("/usr/local/share/bcbio-nextgen/config/bcbio_system.yaml",
            "/usr/local/share/bcbio-nextgen/galaxy")

with open("/usr/local/share/bcbio-nextgen/galaxy/tool_data_table_conf.xml",
          "w") as f:
    f.write('''<tables>
    <!-- Locations of indexes in the BWA mapper format -->
    <table name="bwa_indexes" comment_char="#">
        <columns>value, dbkey, name, path</columns>
        <file path="tool-data/bwa_index.loc" />
 def run_method(self):
     arvados.current_job()
Beispiel #53
0
def one_task_per_interval(
        interval_count,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=True,
        interval_list_param="interval_list",
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
        task_key_params=['name', 'inputs', 'interval', 'ref'],
        script=arvados.current_job()['script']):
    """
    Queue one task for each of interval_count intervals, splitting
    the genome chunk (described by the .interval_list file) evenly.

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(
        interval_list_param=interval_list_param)

    interval_reader = open(interval_list_file, mode="r")

    lines = interval_reader.readlines()
    sn_intervals = dict()
    sns = []
    total_len = 0
    for line in lines:
        if line[0] == '@':
            # skip all lines starting with '@'
            continue
        fields = line.split("\t")
        if len(fields) != 5:
            raise errors.InvalidArgumentError(
                "interval_list %s has invalid line [%s] - expected 5 fields but got %s"
                % (interval_list_file, line, len(fields)))
        sn = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        length = int(end) - int(start) + 1
        total_len += int(length)
        sn_intervals[sn] = (start, end)
        sns.append(sn)

    print "Total chunk length is %s" % total_len
    interval_len = int(total_len / interval_count)
    intervals = []
    print "Splitting chunk into %s intervals of size ~%s" % (interval_count,
                                                             interval_len)
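    # Illustrative split (assumed toy numbers): a chunk of sequences
    # 1:1-100 and 2:1-50 (total_len 150) with interval_count=3 gives
    # interval_len 50 and intervals ['1:1-50'], ['1:51-100'], ['2:1-50'];
    # a sequence is split across intervals whenever it does not fit in
    # the remaining length.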
    for interval_i in range(0, interval_count):
        interval_num = interval_i + 1
        intervals_count = 0
        remaining_len = interval_len
        interval = []
        while len(sns) > 0:
            sn = sns.pop(0)
            if not sn_intervals.has_key(sn):
                raise ValueError(
                    "sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_len:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_len + start - 1
                assert ((end - start + 1) <= remaining_len)
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            interval.append("%s:%s-%s" % (sn, start, end))
            remaining_len -= (end - start + 1)
            intervals_count += 1
            if remaining_len <= 0:
                break
        if intervals_count > 0:
            intervals.append(interval)
        else:
            print "WARNING: skipping empty intervals for %s" % interval_input_name
    print "Have %s intervals" % (len(intervals))

    if reuse_tasks:
        # get candidates for task reuse
        job_filters = [
            ['script', '=', script],
            ['repository', '=',
             arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            [
                'docker_image_locator', 'in docker',
                arvados.current_job()['docker_image_locator']
            ],
        ]
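        # Reuse candidates are limited to jobs that ran the same script from
        # the same repository and docker image, at a script_version that
        # contains oldest_git_commit_to_reuse in its history.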
        reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params,
                                            job_filters)
        print "Have %s potentially reusable tasks" % (len(reusable_tasks))

    for interval in intervals:
        interval_str = ' '.join(interval)
        print "Creating task to process interval: [%s]" % interval_str
        new_task_params = arvados.current_task()['parameters']
        new_task_params['interval'] = interval_str
        if reuse_tasks:
            task = create_or_reuse_task(if_sequence + 1, new_task_params,
                                        reusable_tasks, task_key_params,
                                        validate_task_output)
        else:
            task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
Beispiel #54
0
def run():
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    runner = None
    try:
        job_order_object = arvados.current_job()['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], job_order_object.pop("cwl:tool"))

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')
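        # A portable data hash path looks like
        # "0123456789abcdef0123456789abcdef+1234/optional/subpath" (dummy
        # example); keeppath() below rewrites such values as "keep:..."
        # references.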

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            if "location" in v:
                v["location"] = keeppath(v["location"])

        for k,v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)

        output_name = None
        output_tags = None
        enable_reuse = True
        on_error = "continue"
        debug = False

        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        if "arv:output_tags" in job_order_object:
            output_tags = job_order_object["arv:output_tags"]
            del job_order_object["arv:output_tags"]

        if "arv:enable_reuse" in job_order_object:
            enable_reuse = job_order_object["arv:enable_reuse"]
            del job_order_object["arv:enable_reuse"]

        if "arv:on_error" in job_order_object:
            on_error = job_order_object["arv:on_error"]
            del job_order_object["arv:on_error"]

        if "arv:debug" in job_order_object:
            debug = job_order_object["arv:debug"]
            del job_order_object["arv:debug"]

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.safeapi.ThreadSafeApiCache(
            api_params={"model": OrderedJsonModel()}, keep_params={"num_retries": 4}),
                                          output_name=output_name, output_tags=output_tags)

        make_fs_access = functools.partial(CollectionFsAccess,
                                 collection_cache=runner.collection_cache)

        t = load_tool(toolpath, runner.arv_make_tool,
                      fetcher_constructor=functools.partial(CollectionFetcher,
                                                  api_client=runner.api,
                                                  fs_access=make_fs_access(""),
                                                  num_retries=runner.num_retries))

        if debug:
            logger.setLevel(logging.DEBUG)
            logging.getLogger('arvados').setLevel(logging.DEBUG)
            logging.getLogger("cwltool").setLevel(logging.DEBUG)

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = debug
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job={"uuid": arvados.current_job()["uuid"], "state": arvados.current_job()["state"]}
        args.make_fs_access = make_fs_access
        args.trash_intermediate = False
        args.intermediate_output_ttl = 0
        args.priority = arvados_cwl.DEFAULT_PRIORITY
        args.do_validate = True
        args.disable_js_validation = False

        runner.arv_executor(t, job_order_object, **vars(args))
    except Exception as e:
        if isinstance(e, WorkflowException):
            logging.info("Workflow error %s", e)
        else:
            logging.exception("Unhandled exception")
        if runner and runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                                             body={
                                                 'output': outputCollection,
                                                 'success': False,
                                                 'progress':1.0
                                             }).execute()
def one_task_per_cram_file(if_sequence=0, and_end_task=True, 
                           skip_sq_sn_regex='_decoy$', 
                           genome_chunks=200):
    """
    Queue one task for each cram file in this job's input collection.
    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.
    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.
    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # setup multiprocessing pool
    pool_processes = cpu_count() - 1
    print 'Using %d processes to submit tasks\n' % pool_processes
    pool = Pool(processes=pool_processes)

    skip_sq_sn_r = re.compile(skip_sq_sn_regex)
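    # With the default skip_sq_sn_regex of '_decoy$', decoy contigs in the
    # reference .dict are left out of the generated region chunks.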

    # Ensure we have a .fa reference file with corresponding .fai index and .dict
    reference_coll = arvados.current_job()['script_parameters']['reference_collection']
    rcr = arvados.CollectionReader(reference_coll)
    ref_fasta = {}
    ref_fai = {}
    ref_dict = {}
    ref_input = None
    dict_reader = None
    for rs in rcr.all_streams():
        for rf in rs.all_files():
            if re.search(r'\.fa$', rf.name()):
                ref_fasta[rs.name(), rf.name()] = rf
            elif re.search(r'\.fai$', rf.name()):
                ref_fai[rs.name(), rf.name()] = rf
            elif re.search(r'\.dict$', rf.name()):
                ref_dict[rs.name(), rf.name()] = rf
    for ((s_name, f_name), fasta_f) in ref_fasta.items():
        fai_f = ref_fai.get((s_name, re.sub(r'fa$', 'fai', f_name)), 
                            ref_fai.get((s_name, re.sub(r'fa$', 'fa.fai', f_name)), 
                                        None))
        dict_f = ref_dict.get((s_name, re.sub(r'fa$', 'dict', f_name)), 
                              ref_dict.get((s_name, re.sub(r'fa$', 'fa.dict', f_name)), 
                                           None))
        if fasta_f and fai_f and dict_f:
            # found a set of all three! 
            ref_input = fasta_f.as_manifest()
            ref_input += fai_f.as_manifest()
            ref_input += dict_f.as_manifest()
            dict_reader = dict_f
            break
    if ref_input is None:
        raise InvalidArgumentError("Expected a reference fasta with fai and dict in reference_collection. Found [%s]" % ' '.join(rf.name() for rf in rs.all_files()))
    if dict_reader is None:
        raise InvalidArgumentError("Could not find .dict file in reference_collection. Found [%s]" % ' '.join(rf.name() for rf in rs.all_files()))

    # Create a portable data hash for the ref_input manifest
    try:
        r = arvados.api().collections().create(body={"manifest_text": ref_input}).execute()
        ref_input_pdh = r["portable_data_hash"]
    except:
        raise 

    # Load the dict data
    interval_header = ""
    dict_lines = dict_reader.readlines()
    dict_header = dict_lines.pop(0)
    if re.search(r'^@HD', dict_header) is None:
        raise InvalidArgumentError("Dict file in reference collection does not have correct header: [%s]" % dict_header)
    interval_header += dict_header
    print "Dict header is %s" % dict_header
    sn_intervals = dict()
    sns = []
    skip_sns = []
    total_len = 0
    for sq in dict_lines:
        if re.search(r'^@SQ', sq) is None:
            raise InvalidArgumentError("Dict file contains malformed SQ line: [%s]" % sq)
        interval_header += sq
        sn = None
        ln = None
        for tagval in sq.split("\t"):
            tv = tagval.split(":", 1)
            if tv[0] == "SN":
                sn = tv[1]
            if tv[0] == "LN":
                ln = tv[1]
            if sn and ln:
                break
        if not (sn and ln):
            raise InvalidArgumentError("Dict file SQ entry missing required SN and/or LN parameters: [%s]" % sq)
        assert(sn and ln)
        if sn_intervals.has_key(sn):
            raise InvalidArgumentError("Dict file has duplicate SQ entry for SN %s: [%s]" % (sn, sq))
        if skip_sq_sn_r.search(sn):
            skip_sns.append(sn)
            continue
        sn_intervals[sn] = (1, int(ln))
        sns.append(sn)
        total_len += int(ln)
    total_sequences = len(sns)

    print "Skipped %s SQs with SNs matching regex [%s]" % (len(skip_sns), skip_sq_sn_regex)

    # Chunk the genome into genome_chunks pieces
    # weighted by both number of base pairs and number of seqs
    print "Total sequences included: %s" % (total_sequences)
    print "Total genome length: %s" % (total_len)
    total_points = total_len + (total_sequences * weight_seq)
    chunk_points = int(total_points / genome_chunks)
    chunk_input_pdh_names = []
    print "Chunking genome into %s chunks of ~%s points" % (genome_chunks, chunk_points)
    for chunk_i in range(0, genome_chunks):
        chunk_num = chunk_i + 1
        chunk_intervals_count = 0
        chunk_input_name = dict_reader.name() + (".%s_of_%s.region_list" % (chunk_num, genome_chunks))
        print "Creating interval file for chunk %s" % chunk_num
        chunk_c = arvados.collection.CollectionWriter(num_retries=3)
        chunk_c.start_new_file(newfilename=chunk_input_name)
        # chunk_c.write(interval_header)
        remaining_points = chunk_points
        while len(sns) > 0:
            sn = sns.pop(0)
            remaining_points -= weight_seq
            if remaining_points <= 0:
                sns.insert(0, sn)
                break
            if not sn_intervals.has_key(sn):
                raise ValueError("sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end-start+1) > remaining_points:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_points + start - 1
                assert((end-start+1) <= remaining_points)
                sn_intervals[sn] = (end+1, real_end)
                sns.insert(0, sn)
            #interval = "%s\t%s\t%s\t+\t%s\n" % (sn, start, end, "interval_%s_of_%s_%s" % (chunk_num, genome_chunks, sn))
            interval = "%s\t%s\t%s\n" % (sn, start, end)
            remaining_points -= (end-start+1)
            chunk_c.write(interval)
            chunk_intervals_count += 1
            if remaining_points <= 0:
                break
        if chunk_intervals_count > 0:
            chunk_input_pdh = chunk_c.finish()
            print "Chunk intervals file %s saved as %s" % (chunk_input_name, chunk_input_pdh)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        else:
            print "WARNING: skipping empty intervals for %s" % chunk_input_name
    print "Have %s chunk collections: [%s]" % (len(chunk_input_pdh_names), ' '.join([x[0] for x in chunk_input_pdh_names]))

    # prepare CRAM input collections
    job_input = arvados.current_job()['script_parameters']['inputs_collection']
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        # Handle this CRAM file
        crai_f = crai.get((s_name, re.sub(r'cram$', 'crai', f_name)), 
                          crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), 
                                   None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise InvalidArgumentError("No corresponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={"manifest_text": task_input}).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise 

        create_chunk_tasks(f_name, chunk_input_pdh_names, 
                           if_sequence, task_input_pdh, ref_input_pdh, chunk_input_pdh, 
                           pool=pool)

    print "Waiting for asynchronous requests to complete"
    pool.close()
    pool.join()

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success':True}
                                         ).execute()
        exit(0)

def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    if arvados.current_task()['sequence'] == 0:
        # get candidates for task reuse
        task_key_params=['inputs', 'ref', 'name'] # N.B. inputs collection includes input vcfs and corresponding interval_list
        script="gatk-genotypegvcfs.py"
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2'
        job_filters = [
            ['script', '=', script],
            ['repository', '=', arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
        ]

        # retrieve a full set of all possible reusable tasks at sequence 1
        print "Retrieving all potentially reusable tasks"
        reusable_tasks = hgi_arvados.get_reusable_tasks(1, task_key_params, job_filters)
        print "Have %s tasks for potential reuse" % (len(reusable_tasks))

        def create_task_with_validated_reuse(sequence, params):
            return hgi_arvados.create_or_reuse_task(sequence, params, reusable_tasks, task_key_params, validate_task_output)
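        # create_or_reuse_task() matches each new sub-task against these
        # candidates on task_key_params, re-validating the candidate's
        # output, and only creates a fresh task when no valid match exists.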

        # Setup sub tasks (and terminate if this is task 0)
        hgi_arvados.one_task_per_group_combined_inputs(ref_input_pdh, job_input_pdh, interval_lists_pdh,
                                                       group_by_regex,
                                                       if_sequence=0, and_end_task=True,
                                                       create_task_func=create_task_with_validated_reuse)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert(this_task['sequence'] > 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Genotype gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="inputs")
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    out_file = name + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # GenotypeGVCFs!
    gatk_exit = gatk.genotype_gvcfs(ref_file, interval_list_file, gvcf_files, os.path.join(out_dir, out_file), cores="32", java_mem="200g")

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success':False}
                                         ).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success':False}
                                             ).execute()
Beispiel #57
0
#!/usr/bin/python
#

import arvados
import re

arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)

this_job = arvados.current_job()
this_task = arvados.current_task()
this_task_input = this_task['parameters']['input']

input_file = list( arvados.CollectionReader(this_task_input).all_files() )[0]

out = arvados.CollectionWriter()
out.set_current_file_name(input_file.decompressed_name())
out.set_current_stream_name(input_file.stream_name())
for line in input_file.readlines():
  out.write( "!!!" + line.upper() )

this_task.set_output(out.finish())