# NOTE: these snippets assume the enclosing script's imports (os, re, arvados,
# arvados.util) and an errors module providing InvalidArgumentError and
# FileAccessError, as in the original source.
def mount_gatk_cram_input(input_param="input"):
    # Get a single CRAM file for this task
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount(input_param)

    input_cram_files = []
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise errors.InvalidArgumentError(
            "Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    if not os.access(cram_file, os.R_OK):
        raise errors.FileAccessError("CRAM file not readable: %s" % cram_file)

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert (cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not os.access(crai_file, os.R_OK):
        crai_file = cram_file_base + ".cram.crai"
        if not os.access(crai_file, os.R_OK):
            raise errors.FileAccessError(
                "No readable CRAM index file for CRAM file: %s" % cram_file)
    return cram_file
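
A minimal usage sketch for the helper above (hypothetical caller, assuming a
crunch task whose 'input' parameter mounts a collection with exactly one CRAM
and its index):

cram_file = mount_gatk_cram_input(input_param="input")
print "Will process CRAM %s" % cram_file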
Example #2
def mount_gatk_gvcf_inputs(inputs_param="inputs"):
    # Get input gVCFs for this task
    print "Mounting task input collection"
    inputs_dir = arvados.get_task_param_mount(inputs_param)

    # Sanity check input gVCFs
    input_gvcf_files = []
    for f in arvados.util.listdir_recursive(inputs_dir):
        if re.search(r'\.vcf\.gz$', f):
            input_gvcf_files.append(os.path.join(inputs_dir, f))
        elif re.search(r'\.tbi$', f):
            pass
        elif re.search(r'\.interval_list$', f):
            pass
        else:
            print "WARNING: collection contains unexpected file %s" % f
    if len(input_gvcf_files) == 0:
        raise errors.InvalidArgumentError("Expected one or more .vcf.gz files in collection (found 0 while recursively searching %s)" % inputs_dir)

    # Ensure we can read the gVCF files and that they each have an index
    for gvcf_file in input_gvcf_files:
        if not os.access(gvcf_file, os.R_OK):
            raise errors.FileAccessError("gVCF file not readable: %s" % gvcf_file)

        # Ensure we have corresponding .tbi index and can read it as well
        (gvcf_file_base, gvcf_file_ext) = os.path.splitext(gvcf_file)
        assert(gvcf_file_ext == ".gz")
        tbi_file = gvcf_file_base + ".gz.tbi"
        if not os.access(tbi_file, os.R_OK):
            tbi_file = gvcf_file_base + ".tbi"
            if not os.access(tbi_file, os.R_OK):
                raise errors.FileAccessError("No readable gVCF index file for gVCF file: %s" % gvcf_file)
    return input_gvcf_files
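
As an illustration (not part of the original), the returned list might be
expanded into per-file GATK arguments:

input_gvcfs = mount_gatk_gvcf_inputs(inputs_param="inputs")
gatk_variant_args = []
for gvcf in input_gvcfs:
    gatk_variant_args.extend(["--variant", gvcf])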
Example #4
def mount_gatk_reference(ref_param="ref"):
    # Get reference FASTA
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount(ref_param)

    # Sanity check reference FASTA
    ref_file = None
    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise errors.InvalidArgumentError("No reference fasta found in reference collection.")
    # Ensure we can read the reference file
    if not os.access(ref_file, os.R_OK):
        raise errors.FileAccessError("reference FASTA file not readable: %s" % ref_file)
    # TODO: could check readability of .fai and .dict as well?
    return ref_file
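
A sketch answering the TODO above, assuming the usual samtools/Picard naming
conventions for the companion files (hypothetical helper, not in the original):

def check_reference_companions(ref_file):
    # ref.fa -> ref.fa.fai (samtools faidx) and ref.dict (Picard)
    fai_file = ref_file + ".fai"
    dict_file = os.path.splitext(ref_file)[0] + ".dict"
    for companion in (fai_file, dict_file):
        if not os.access(companion, os.R_OK):
            raise errors.FileAccessError(
                "reference companion file not readable: %s" % companion)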
def mount_gatk_gvcf_inputs(inputs_param="inputs"):
    # Get input gVCFs for this task
    print "Mounting task input collection"
    inputs_dir = ""
    if inputs_param in arvados.current_task()['parameters']:
        inputs_dir = arvados.get_task_param_mount(inputs_param)
    else:
        inputs_dir = arvados.get_job_param_mount(inputs_param)

    # Sanity check input gVCFs
    input_gvcf_files = []
    for f in arvados.util.listdir_recursive(inputs_dir):
        if re.search(r'\.vcf\.gz$', f):
            input_gvcf_files.append(os.path.join(inputs_dir, f))
        elif re.search(r'\.tbi$', f):
            pass
        elif re.search(r'\.interval_list$', f):
            pass
        else:
            print "WARNING: collection contains unexpected file %s" % f
    if len(input_gvcf_files) == 0:
        raise errors.InvalidArgumentError(
            "Expected one or more .vcf.gz files in collection (found 0 while recursively searching %s)"
            % inputs_dir)

    # Ensure we can read the gVCF files and that they each have an index
    for gvcf_file in input_gvcf_files:
        if not os.access(gvcf_file, os.R_OK):
            raise errors.FileAccessError("gVCF file not readable: %s" %
                                         gvcf_file)

        # Ensure we have corresponding .tbi index and can read it as well
        (gvcf_file_base, gvcf_file_ext) = os.path.splitext(gvcf_file)
        assert (gvcf_file_ext == ".gz")
        tbi_file = gvcf_file_base + ".gz.tbi"
        if not os.access(tbi_file, os.R_OK):
            tbi_file = gvcf_file_base + ".tbi"
            if not os.access(tbi_file, os.R_OK):
                raise errors.FileAccessError(
                    "No readable gVCF index file for gVCF file: %s" %
                    gvcf_file)
    return input_gvcf_files
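
This variant prefers a per-task parameter mount and falls back to the
job-level parameter. The same pattern as a standalone sketch (hypothetical
helper name):

def get_param_mount_with_fallback(param):
    # Prefer the task parameter when present; otherwise use the job parameter.
    if param in arvados.current_task()['parameters']:
        return arvados.get_task_param_mount(param)
    return arvados.get_job_param_mount(param)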
Example #7
def mount_single_gatk_interval_list_input(interval_list_param="interval_list"):
    # Get interval_list for this task
    print "Mounting task input collection to get interval_list"
    interval_list_dir = arvados.get_task_param_mount(interval_list_param)
    print "Interval_List collection mounted at %s" % (interval_list_dir)

    # Sanity check input interval_list (there can be only one)
    input_interval_lists = []
    for f in arvados.util.listdir_recursive(interval_list_dir):
        if re.search(r'\.interval_list$', f):
            input_interval_lists.append(os.path.join(interval_list_dir, f))
    if len(input_interval_lists) != 1:
        raise errors.InvalidArgumentError("Expected exactly one interval_list in input collection (found %s)" % len(input_interval_lists))

    assert(len(input_interval_lists) == 1)
    interval_list_file = input_interval_lists[0]

    if not os.access(interval_list_file, os.R_OK):
        raise errors.FileAccessError("interval_list file not readable: %s" % interval_list_file)

    return interval_list_file
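
Hypothetical usage; passing the interval list to GATK via -L is an assumption,
not something shown in the original:

interval_list_file = mount_single_gatk_interval_list_input()
gatk_interval_args = ["-L", interval_list_file]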
def main():
    signal(SIGINT, sigint_handler)
    signal(SIGTERM, sigterm_handler)

    this_job = arvados.current_job()

    skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex']

    genome_chunks = int(this_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0,
                           and_end_task=True,
                           skip_sq_sn_regex=skip_sq_sn_regex,
                           genome_chunks=genome_chunks)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError(
            "No reference fasta found in reference collection.")

    # Ensure we can read the reference fasta
    test_and_prime_input_file(
        ref_file,
        error_exception=FileAccessError("reference fasta not readable: %s" %
                                        ref_file))

    # Ensure we have corresponding .fai, and that it is also readable
    ref_fai_file = ref_file + ".fai"
    test_and_prime_input_file(
        ref_fai_file,
        error_exception=FileAccessError(
            "reference fai index not readable: %s" % ref_fai_file))

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')

    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError(
            "No chunk intervals file found in chunk collection.")
    # Ensure we can read the chunk file
    test_and_prime_input_file(
        chunk_file,
        error_exception=FileAccessError(
            "Chunk intervals file not readable: %s" % chunk_file))

    # Get single CRAM file for this task
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    stream_name = ""
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    test_and_prime_input_file(cram_file,
                              error_exception=FileAccessError(
                                  "CRAM file not readable: %s" % cram_file))

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert (cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not test_and_prime_input_file(crai_file, error_exception=None):
        crai_file = cram_file_base + ".cram.crai"
        if not test_and_prime_input_file(crai_file, error_exception=None):
            raise FileAccessError(
                "No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    tmp_dir = arvados.current_task().tmpdir
    out_dir = os.path.join(tmp_dir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir,
                                                            old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    output_basename = os.path.basename(
        cram_file_base) + "." + os.path.basename(chunk_file)
    out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp")
    penultimate_out_file = os.path.join(
        tmp_dir, output_basename + ".provheader.g.vcf.gz")
    final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz")

    #    bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view  -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)
    regions = []
    print "Preparing region list from chunk file [%s]" % chunk_file
    with open(chunk_file, 'r') as f:
        for line in f.readlines():
            (chr, start, end) = line.rstrip().split()
            region = "%s:%s-%s" % (chr, start, end)
            regions.append(region)
    total_region_count = len(regions)

    print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count

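    # Per-region pipeline topology, as built by the commands below:
    #
    #   bcftools-gvcf mpileup -r REGION | bcftools norm -Ou | teepot
    #       teepot -> stdout -> bcftools view -h -Oz  (per-region header-only tmp)
    #       teepot -> named fifo -> bcftools view -H -Ov -> per-region concat fifo
    #
    # A single 'cat' (started below) drains the per-region concat fifos, in
    # region order, into out_file_tmp; the headers are merged separately later.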
    concat_noheader_fifos = dict()
    concat_headeronly_tmps = dict()
    current_region_num = 0
    for region in regions:
        current_region_num += 1
        concat_noheader_fifo = os.path.join(
            tmp_dir,
            output_basename + (".part_%s_of_%s.g.vcf" %
                               (current_region_num, total_region_count)))
        try:
            os.mkfifo(concat_noheader_fifo, 0600)
        except:
            print "ERROR: could not mkfifo %s" % concat_noheader_fifo
            raise
        fifos_to_delete.append(concat_noheader_fifo)
        concat_noheader_fifos[region] = concat_noheader_fifo
        concat_headeronly_tmp = os.path.join(
            tmp_dir,
            output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" %
                               (current_region_num, total_region_count)))
        concat_headeronly_tmps[region] = concat_headeronly_tmp

    region_concat_cmd = ["cat"]
    region_concat_cmd.extend(
        [concat_noheader_fifos[region] for region in regions])

    # open the temporary data output file for writing
    out_file_tmp_f = open(out_file_tmp, 'wb')

    region_concat_p = run_child_cmd(region_concat_cmd,
                                    stdout=out_file_tmp_f,
                                    tag="bcftools concat (stderr)")

    current_region_num = 0
    current_concat_noheader_fifo_f = None
    regions_to_process = list(regions)
    bcftools_mpileup_p = None
    bcftools_norm_p = None
    part_tee_p = None
    bcftools_view_headeronly_p = None
    bcftools_view_noheader_p = None
    while True:
        # at least one of the regional aggregation processes is still running

        watch_fds_and_print_output()

        if ((bcftools_mpileup_p is None) and (bcftools_norm_p is None)
                and (part_tee_p is None)
                and (bcftools_view_headeronly_p is None)
                and (bcftools_view_noheader_p is None)):
            # no per-region processes are running (they have finished or
            # have not yet started)
            if len(regions_to_process) > 0:
                # have more regions to run
                region = regions_to_process.pop(0)
                current_region_num += 1
                region_label = "%s/%s [%s]" % (current_region_num,
                                               total_region_count, region)
                concat_noheader_fifo = concat_noheader_fifos[region]
                bcftools_view_noheader_input_fifo = os.path.join(
                    tmp_dir, output_basename +
                    (".part_%s_of_%s.noheader.g.bcf" %
                     (current_region_num, total_region_count)))
                part_tee_cmd = [
                    "teepot", bcftools_view_noheader_input_fifo, "-"
                ]
                bcftools_view_noheader_cmd = [
                    "bcftools", "view", "-H", "-Ov",
                    bcftools_view_noheader_input_fifo
                ]
                concat_headeronly_tmp = concat_headeronly_tmps[region]
                bcftools_view_headeronly_cmd = [
                    "bcftools", "view", "-h", "-Oz", "-o",
                    concat_headeronly_tmp
                ]
                bcftools_norm_cmd = ["bcftools", "norm", "-f", ref_file, "-Ou"]
                bcftools_mpileup_cmd = [
                    "bcftools-gvcf", "mpileup", "-t", "AD,INFO/AD", "-C50",
                    "-pm2", "-F0.1", "-d10000", "--gvcf", "1,2,3,4,5,10,15",
                    "-f", ref_file, "-Ou", "-r", region, cram_file
                ]

                print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % (
                    region_label)
                bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe()

                print "Creating 'bcftools norm | tee' pipe for region %s" % (
                    region_label)
                part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe()

                print "Creating 'tee | bcftools view -h' pipe for region %s" % (
                    region_label)
                bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe()

                print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % (
                    bcftools_view_noheader_input_fifo, region_label)
                try:
                    os.mkfifo(bcftools_view_noheader_input_fifo, 0600)
                except:
                    print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo
                    raise
                fifos_to_delete.append(bcftools_view_noheader_input_fifo)

                print "Opening concat fifo %s for writing" % concat_noheader_fifo
                if current_concat_noheader_fifo_f is not None:
                    #print "ERROR: current_concat_noheader_fifo_f was not closed properly"
                    #raise Exception("current_concat_noheader_fifo_f was not closed properly")
                    current_concat_noheader_fifo_f.close()
                current_concat_noheader_fifo_f = open(concat_noheader_fifo,
                                                      'wb')

                bcftools_mpileup_p = run_child_cmd(
                    bcftools_mpileup_cmd,
                    stdout=bcftools_norm_stdin_pipe_write,
                    tag="bcftools mpileup %s" % (region_label))

                bcftools_norm_p = run_child_cmd(
                    bcftools_norm_cmd,
                    stdin=bcftools_norm_stdin_pipe_read,
                    stdout=part_tee_stdin_pipe_write,
                    tag="bcftools norm %s" % (region_label))

                part_tee_p = run_child_cmd(
                    part_tee_cmd,
                    stdin=part_tee_stdin_pipe_read,
                    stdout=bcftools_view_headeronly_stdin_pipe_write,
                    tag="tee %s" % (region_label))

                bcftools_view_headeronly_p = run_child_cmd(
                    bcftools_view_headeronly_cmd,
                    stdin=bcftools_view_headeronly_stdin_pipe_read,
                    tag="bcftools view -h %s" % (region_label))

                bcftools_view_noheader_p = run_child_cmd(
                    bcftools_view_noheader_cmd,
                    stdout=current_concat_noheader_fifo_f,
                    tag="bcftools view %s" % (region_label))

        bcftools_mpileup_p = close_process_if_finished(
            bcftools_mpileup_p,
            "bcftools mpileup %s" % (region_label),
            close_fds=[bcftools_norm_stdin_pipe_write])

        bcftools_norm_p = close_process_if_finished(
            bcftools_norm_p,
            "bcftools norm %s" % (region_label),
            close_fds=[
                bcftools_norm_stdin_pipe_read, part_tee_stdin_pipe_write
            ])

        part_tee_p = close_process_if_finished(
            part_tee_p,
            "tee %s" % (region_label),
            close_fds=[
                part_tee_stdin_pipe_read,
                bcftools_view_headeronly_stdin_pipe_write
            ],
            ignore_error=True)

        bcftools_view_headeronly_p = close_process_if_finished(
            bcftools_view_headeronly_p,
            "bcftools view -h %s" % (region_label),
            close_fds=[bcftools_view_headeronly_stdin_pipe_read])

        bcftools_view_noheader_p = close_process_if_finished(
            bcftools_view_noheader_p,
            "bcftools view %s" % (region_label),
            close_files=[current_concat_noheader_fifo_f])

        region_concat_p = close_process_if_finished(
            region_concat_p, "bcftools concat", close_files=[out_file_tmp_f])

        # end loop once all processes have finished
        if ((region_concat_p is None) and (bcftools_view_noheader_p is None)
                and (bcftools_view_headeronly_p is None)
                and (part_tee_p is None) and (bcftools_norm_p is None)
                and (bcftools_mpileup_p is None)):
            print "All region work has completed"
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if len(child_pids) > 0:
        print "WARNING: some children are still alive: [%s]" % (child_pids)
        for pid in child_pids:
            print "Attempting to terminate %s forcefully" % (pid)
            try:
                os.kill(pid, SIGTERM)
            except Exception as e:
                print "Could not kill pid %s: %s" % (pid, e)

    for fifo in fifos_to_delete:
        try:
            os.remove(fifo)
        except:
            raise

    concat_headeronly_tmp_fofn = os.path.join(tmp_dir,
                                              output_basename + ".fifos_fofn")
    tmp_files_to_delete = []
    print "Preparing fofn for bcftools concat (headeronly): %s" % (
        concat_headeronly_tmp_fofn)
    with open(concat_headeronly_tmp_fofn, 'w') as f:
        print "Checking files for regions: %s" % regions
        for concat_headeronly_tmp in [
                concat_headeronly_tmps[region] for region in regions
        ]:
            if os.path.exists(concat_headeronly_tmp):
                print "Adding %s to fofn" % concat_headeronly_tmp
                f.write("%s\n" % concat_headeronly_tmp)
                tmp_files_to_delete.append(concat_headeronly_tmp)
            else:
                print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp

    final_headeronly_tmp = os.path.join(tmp_dir,
                                        output_basename + ".headeronly.g.vcf")
    final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb')

    print "Creating 'bcftools concat | grep' pipe"
    grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe()

    grep_headeronly_cmd = [
        "egrep", "-v", "^[#][#](bcftools|mpileup|reference)"
    ]
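    # Drop the ##bcftools*/##mpileup*/##reference header lines: they record
    # per-region tool provenance (e.g. command lines with -r REGION), which
    # would otherwise repeat and differ across the concatenated headers.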
    grep_headeronly_p = run_child_cmd(grep_headeronly_cmd,
                                      stdin=grep_headeronly_stdin_pipe_read,
                                      stdout=final_headeronly_tmp_f,
                                      tag="grep (headeronly)")
    bcftools_concat_headeronly_cmd = [
        "bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn
    ]
    bcftools_concat_headeronly_p = run_child_cmd(
        bcftools_concat_headeronly_cmd,
        stdout=grep_headeronly_stdin_pipe_write,
        tag="bcftools concat (headeronly)")
    while True:
        watch_fds_and_print_output()
        bcftools_concat_headeronly_p = close_process_if_finished(
            bcftools_concat_headeronly_p,
            "bcftools concat (headeronly)",
            close_fds=[grep_headeronly_stdin_pipe_write])
        grep_headeronly_p = close_process_if_finished(
            grep_headeronly_p,
            "grep (headeronly)",
            close_fds=[grep_headeronly_stdin_pipe_read],
            close_files=[final_headeronly_tmp_f])
        if ((bcftools_concat_headeronly_p is None)
                and (grep_headeronly_p is None)):
            # none of the processes are still running, we're done!
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_concat_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate bcftools concat (headeronly)"

    if grep_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate grep (headeronly)"

    # check whether there was any data output
    if os.stat(out_file_tmp).st_size == 0:
        # 0-byte data file: there is no point in concatenating, and
        # reheader will reject the file, so we need to bgzip it ourselves
        print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % (
            final_out_file)
        final_out_file_f = open(final_out_file, 'wb')
        final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp]
        final_bgzip_p = run_child_cmd(final_bgzip_cmd,
                                      tag="final bgzip",
                                      stdout=final_out_file_f)
        while True:
            watch_fds_and_print_output()
            final_bgzip_p = close_process_if_finished(
                final_bgzip_p, "final bgzip", close_files=[final_out_file_f])
            if (final_bgzip_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bgzip_p is not None:
            print "ERROR: failed to cleanly terminate final bgzip (header with no data)"
    else:
        # there is some data in the data file
        print "Creating final 'cat | bcftools view -Oz' pipe"
        final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe()
        print "Preparing penultimate output file [%s]" % (penultimate_out_file)
        final_bcftools_view_cmd = [
            "bcftools", "view", "-Oz", "-o", penultimate_out_file
        ]
        final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp]
        final_bcftools_view_p = run_child_cmd(
            final_bcftools_view_cmd,
            tag="final bcftools view -Oz",
            stdin=final_bcftools_view_stdin_pipe_read)
        final_concat_p = run_child_cmd(
            final_concat_cmd,
            tag="final cat (header+data)",
            stdout=final_bcftools_view_stdin_pipe_write)
        while True:
            watch_fds_and_print_output()
            final_bcftools_view_p = close_process_if_finished(
                final_bcftools_view_p,
                "final bcftools view -Oz",
                close_fds=[final_bcftools_view_stdin_pipe_read])
            final_concat_p = close_process_if_finished(
                final_concat_p,
                "final cat (header+data)",
                close_fds=[final_bcftools_view_stdin_pipe_write])
            if ((final_concat_p is None) and (final_bcftools_view_p is None)):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration

        if final_bcftools_view_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools view -Oz"

        if final_concat_p is not None:
            print "ERROR: failed to cleanly terminate final cat (header+data)"

        print "Reheadering penultimate output file into final out file [%s]" % (
            final_out_file)
        final_bcftools_reheader_cmd = [
            "bcftools", "reheader", "-h", final_headeronly_tmp, "-o",
            final_out_file, penultimate_out_file
        ]
        final_bcftools_reheader_p = run_child_cmd(
            final_bcftools_reheader_cmd, tag="final bcftools reheader")
        while True:
            watch_fds_and_print_output()
            final_bcftools_reheader_p = close_process_if_finished(
                final_bcftools_reheader_p, "final bcftools reheader")
            if (final_bcftools_reheader_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bcftools_reheader_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools reheader"
        os.remove(penultimate_out_file)

    print "Indexing final output file [%s]" % (final_out_file)
    bcftools_index_cmd = ["bcftools", "index", final_out_file]
    bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index")
    while True:
        watch_fds_and_print_output()
        bcftools_index_p = close_process_if_finished(bcftools_index_p,
                                                     "bcftools index")
        if (bcftools_index_p is None):
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_index_p is not None:
        print "ERROR: failed to cleanly terminate bcftools index"

    print "Complete, removing temporary files"
    os.remove(concat_headeronly_tmp_fofn)
    os.remove(out_file_tmp)
    os.remove(final_headeronly_tmp)
    for tmp_file in tmp_files_to_delete:
        os.remove(tmp_file)

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()
    print "Task output locator [%s]" % output_locator

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)

    # Done!
    print "Task complete!"
Example #11
# NOTE: in the original script, task, input_file, infile_parts (a regex match
# over input_file) and robust_put are defined earlier and are not shown here.
outdir = os.path.join(task.tmpdir, "output")
os.makedirs(outdir)
os.chdir(outdir)

if infile_parts is None:
    print >>sys.stderr, "Failed to parse input filename '%s' as a Keep file" % input_file
    sys.exit(1)

cr = arvados.CollectionReader(infile_parts.group(1))
streamname = infile_parts.group(3)[1:]
filename = infile_parts.group(4)[1:]

if streamname is not None:
    subprocess.call(["mkdir", "-p", streamname])
    os.chdir(streamname)
else:
    streamname = '.'

m = re.match(r'.*\.(gz|Z|bz2|tgz|tbz|zip|rar|7z|cab|deb|rpm|cpio|gem)$', arvados.get_task_param_mount('input'), re.IGNORECASE)

if m is not None:
    rc = subprocess.call(["dtrx", "-r", "-n", "-q", arvados.get_task_param_mount('input')])
    if rc == 0:
        task.set_output(robust_put.upload(outdir))
    else:
        sys.exit(rc)
else:
    streamreader = filter(lambda s: s.name() == streamname, cr.all_streams())[0]
    filereader = streamreader.files()[filename]
    task.set_output(streamname + filereader.as_manifest()[1:])
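
Note: in the passthrough branch above, the task output is set to a manifest
fragment for the existing file (streamname plus the file's manifest with its
leading "." stream name dropped), so the data is not re-uploaded to Keep.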
Example #12
def main():

    this_job = arvados.current_job()

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError(
            "No reference fasta found in reference collection.")
    # Ensure we can read the reference file
    if not os.access(ref_file, os.R_OK):
        raise FileAccessError("reference FASTA file not readable: %s" %
                              ref_file)
    # TODO: could check readability of .fai and .dict as well?

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')

    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list\.txt$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError(
            "No chunk intervals file found in chunk collection.")
    # Ensure we can read the chunk file
    if not os.access(chunk_file, os.R_OK):
        raise FileAccessError("Chunk intervals file not readable: %s" %
                              chunk_file)

    # Get single CRAM file for this task
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    if not os.access(cram_file, os.R_OK):
        raise FileAccessError("CRAM file not readable: %s" % cram_file)

    # Ensure we have corresponding CRAI index and can read it as well
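    # (samtools index writes <file>.cram.crai; some pipelines place the index
    # at <base>.crai instead, so both naming conventions are checked below)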
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert (cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not os.access(crai_file, os.R_OK):
        crai_file = cram_file_base + ".cram.crai"
        if not os.access(crai_file, os.R_OK):
            raise FileAccessError(
                "No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir,
                                                            old_out_dir)
        os.rename(out_dir, old_out_dir)
    os.mkdir(out_dir)
    os.chdir(out_dir)


    # out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) + ".g.vcf.gz")
    config_file = os.path.join(arvados.current_task().tmpdir, "mpileup.conf")
    lock_file = os.path.join(arvados.current_task().tmpdir,
                             "run-bt-mpileup.lock")

    if not os.path.exists(RUNNER_CONFIG_TEMPLATE):
        raise FileAccessError("No runner configuration template at %s" %
                              RUNNER_CONFIG_TEMPLATE)
    # generate config
    runner_config_text = jinja2.Environment(loader=jinja2.FileSystemLoader(
        "/")).get_template(RUNNER_CONFIG_TEMPLATE).render(
            fasta_reference=ref_file, input_cram=cram_file, regions=chunk_file)
    with open(config_file, "wb") as fh:
        fh.write(runner_config_text)
    # report configuration
    print "Generated runner config to %s:\n%s" % (config_file,
                                                  runner_config_text)

    # Call run-bt-mpileup
    runner_p = subprocess.Popen([
        "run-bt-mpileup", "+config", config_file, "+js", "mpm", "+loop", "5",
        "+lock", lock_file, "-o", out_dir
    ],
                                stdin=None,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                close_fds=True,
                                shell=False)

    # Read to EOF so output buffered when the process exits is not dropped;
    # only print '#### unfinished' lines or things that are errors or warnings
    for line in iter(runner_p.stdout.readline, ''):
        if re.search(r'\d+\s+unfinished', line) or re.search(
                r'(FATAL|ERROR|WARNING)', line, flags=re.IGNORECASE):
            print "RUNNER: %s" % line.rstrip()

    runner_exit = runner_p.wait()
    if runner_exit != 0:
        print "WARNING: runner exited with exit code %s" % runner_exit

    # clean up out_dir
    try:
        os.remove(os.path.join(out_dir, "run-bt-mpileup.lock"))
        os.remove(os.path.join(out_dir, "mpileup.conf"))
        os.remove(os.path.join(out_dir, "cleaned-job-outputs.tgz"))
    except OSError:
        print "WARNING: could not remove some output files!"

    out_bcf = os.path.join(
        out_dir,
        os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) +
        ".bcf")
    try:
        os.rename(os.path.join(out_dir, "all.bcf"), out_bcf)
        os.rename(os.path.join(out_dir, "all.bcf.csi"), out_bcf + ".csi")
        os.rename(os.path.join(out_dir, "all.bcf.filt.vchk"),
                  out_bcf + ".filt.vchk")
        os.rename(os.path.join(out_dir, "all.bcf.vchk"), out_bcf + ".vchk")
    except OSError:
        print "WARNING: could not rename some output files!"

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)
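
Example No. 12 renders its runner configuration by pointing a jinja2 FileSystemLoader at the filesystem root and rendering RUNNER_CONFIG_TEMPLATE with three variables. The template body never appears in this excerpt, so the round trip below uses a made-up template string purely to illustrate the same render call; the variable names match the render() call above, the values are hypothetical:

import jinja2

# Stand-in template body; the real mpileup.conf template is not shown
# in the source and certainly differs from this.
template_text = (
    "fasta: {{ fasta_reference }}\n"
    "cram: {{ input_cram }}\n"
    "regions: {{ regions }}\n"
)

rendered = jinja2.Template(template_text).render(
    fasta_reference="/ref/hs37d5.fa",        # hypothetical paths
    input_cram="/input/sample.cram",
    regions="/chunks/chr20.region_list.txt")
print rendered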
Example No. 14
def main():

    this_job = arvados.current_job()

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert(this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError("No reference fasta found in reference collection.")
    # Ensure we can read the reference file
    if not os.access(ref_file, os.R_OK):
        raise FileAccessError("reference FASTA file not readable: %s" % ref_file)
    # TODO: could check readability of .fai and .dict as well?

    # Get genome chunk intervals file
    # chunk_file = None
    # print "Mounting chunk collection"
    # chunk_dir = arvados.get_task_param_mount('chunk')

    # for f in arvados.util.listdir_recursive(chunk_dir):
    #     if re.search(r'\.region_list.txt$', f):
    #         chunk_file = os.path.join(chunk_dir, f)
    # if chunk_file is None:
    #     raise InvalidArgumentError("No chunk intervals file found in chunk collection.")
    # # Ensure we can read the chunk file
    # if not os.access(chunk_file, os.R_OK):
    #     raise FileAccessError("Chunk intervals file not readable: %s" % chunk_file)

    # Get single CRAM file for this task 
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    if not os.access(cram_file, os.R_OK):
        raise FileAccessError("CRAM file not readable: %s" % cram_file)

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert(cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not os.access(crai_file, os.R_OK):
        crai_file = cram_file_base + ".cram.crai"
        if not os.access(crai_file, os.R_OK):
            raise FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir)
        os.rename(out_dir, old_out_dir)
    os.mkdir(out_dir)
    os.chdir(out_dir)
    # out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) + ".g.bcf")
    out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + ".g.bcf")

    # Pipe the CRAM through samtools/bcftools to produce a normalized
    # gVCF-style BCF against the reference.
    bash_cmd_pipe = (
        "samtools view -h -u -@ 1 -T %s %s"
        " | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000"
        " --gvcf 1,2,3,4,5,10,15 -f %s -Ou -"
        " | bcftools view -Ou"
        " | bcftools norm -f %s -Ob -o %s"
    ) % (ref_file, cram_file, ref_file, ref_file, out_file)

    # Call bcftools via a shell so the pipeline string is interpreted
    runner_p = subprocess.Popen(bash_cmd_pipe,
                                stdin=None,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                close_fds=True,
                                shell=True)

    # Read to EOF so output buffered when the process exits is not dropped
    for line in iter(runner_p.stdout.readline, ''):
        print "BCFTOOLS: %s" % line.rstrip()

    runner_exit = runner_p.wait()
    if runner_exit != 0:
        print "WARNING: runner exited with exit code %s" % runner_exit

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)
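
Each main() above fans the job out via one_task_per_cram_file(if_sequence=0, and_end_task=True) before doing any per-task work, but its definition is outside this excerpt. The sketch below is modeled on the stock arvados.util.one_task_per_input_file helper and shows the general shape such a fan-out takes; the body is an assumption, not the original code:

import re

import arvados

def one_task_per_cram_file(if_sequence=0, and_end_task=True):
    # Only the task at sequence `if_sequence` performs the fan-out.
    if arvados.current_task()['sequence'] != if_sequence:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    for s in cr.all_streams():
        for f in s.all_files():
            if not re.search(r'\.cram$', f.name()):
                continue
            new_task_attrs = {
                'job_uuid': arvados.current_job()['uuid'],
                'created_by_job_task_uuid': arvados.current_task()['uuid'],
                'sequence': if_sequence + 1,
                'parameters': {
                    'input': f.as_manifest()
                }
            }
            arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        # Mark the fan-out task done and stop; only the subtasks continue.
        arvados.api().job_tasks().update(
            uuid=arvados.current_task()['uuid'],
            body={'success': True}).execute()
        exit(0)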