def download_run_and_upload(job, master_ip, inputs, spark_on_toil):
    """
    Monolithic job that calls data download, conversion, transform, and upload.
    Previously, this was not monolithic; change came in due to #126/#134.
    """
    master_ip = MasterAddress(master_ip)

    bam_name = inputs.sample.split('://')[-1].split('/')[-1]
    sample_name = ".".join(os.path.splitext(bam_name)[:-1])

    hdfs_subdir = sample_name + "-dir"

    if inputs.run_local:
        inputs.local_dir = job.fileStore.getLocalTempDir()
        if inputs.native_adam_path is None:
            hdfs_dir = "/data/"
        else:
            hdfs_dir = inputs.local_dir
    else:
        inputs.local_dir = None
        hdfs_dir = "hdfs://{0}:{1}/{2}".format(master_ip, HDFS_MASTER_PORT,
                                               hdfs_subdir)

    try:
        hdfs_prefix = hdfs_dir + "/" + sample_name
        hdfs_bam = hdfs_dir + "/" + bam_name

        hdfs_snps = hdfs_dir + "/" + inputs.dbsnp.split('://')[-1].split(
            '/')[-1]

        if not inputs.run_local:
            download_data(job, master_ip, inputs, inputs.dbsnp, inputs.sample,
                          hdfs_snps, hdfs_bam)
        else:
            copy_files([inputs.sample, inputs.dbsnp], inputs.local_dir)

        adam_input = hdfs_prefix + ".adam"
        adam_snps = hdfs_dir + "/snps.var.adam"
        adam_convert(job, master_ip, inputs, hdfs_bam, hdfs_snps, adam_input,
                     adam_snps, spark_on_toil)

        adam_output = hdfs_prefix + ".processed.bam"
        adam_transform(job, master_ip, inputs, adam_input, adam_snps, hdfs_dir,
                       adam_output, spark_on_toil)

        out_file = inputs.output_dir + "/" + sample_name + inputs.suffix + ".bam"

        if not inputs.run_local:
            upload_data(job, master_ip, inputs, adam_output, out_file,
                        spark_on_toil)
        else:
            local_adam_output = "%s/%s.processed.bam" % (inputs.local_dir,
                                                         sample_name)
            move_files([local_adam_output], inputs.output_dir)

        remove_file(master_ip, hdfs_subdir, spark_on_toil)
    except:
        remove_file(master_ip, hdfs_subdir, spark_on_toil)
        raise
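
The cleanup call above is duplicated between the happy path and the bare except. A minimal sketch of the same behaviour using try/finally, assuming the same remove_file helper; the hypothetical run_steps callable stands in for the download/convert/transform/upload body:

def run_with_hdfs_cleanup(master_ip, hdfs_subdir, spark_on_toil, run_steps):
    try:
        run_steps()  # download, convert, transform, upload
    finally:
        # runs on success and on failure, so the HDFS scratch dir is always removed
        remove_file(master_ip, hdfs_subdir, spark_on_toil)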
Example #2
def test_copy_files(tmpdir):
    from toil_lib.files import copy_files
    work_dir = str(tmpdir)
    os.mkdir(os.path.join(work_dir, 'test'))
    fpath = os.path.join(work_dir, 'output_file')
    with open(fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    copy_files([fpath], os.path.join(work_dir, 'test'))
    assert os.path.exists(os.path.join(work_dir, 'test', 'output_file'))
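
A companion test for move_files, which several pipelines below rely on, could mirror the pattern above. This is a hedged sketch that assumes toil_lib.files.move_files takes (file_paths, output_dir) like copy_files and removes the source after moving:

def test_move_files(tmpdir):
    from toil_lib.files import move_files
    work_dir = str(tmpdir)
    os.mkdir(os.path.join(work_dir, 'test'))
    fpath = os.path.join(work_dir, 'output_file')
    with open(fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    move_files([fpath], os.path.join(work_dir, 'test'))
    assert os.path.exists(os.path.join(work_dir, 'test', 'output_file'))
    # assumed move semantics: the source file should no longer exist
    assert not os.path.exists(fpath)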
Example #3
def consolidate_output(job, config, mutect, pindel, muse):
    """
    Combine the contents of separate tarball outputs into one via streaming

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str mutect: MuTect tarball FileStoreID
    :param str pindel: Pindel tarball FileStoreID
    :param str muse: MuSe tarball FileStoreID
    """
    work_dir = job.fileStore.getLocalTempDir()
    mutect_tar, pindel_tar, muse_tar = None, None, None
    if mutect:
        mutect_tar = job.fileStore.readGlobalFile(
            mutect, os.path.join(work_dir, 'mutect.tar.gz'))
    if pindel:
        pindel_tar = job.fileStore.readGlobalFile(
            pindel, os.path.join(work_dir, 'pindel.tar.gz'))
    if muse:
        muse_tar = job.fileStore.readGlobalFile(
            muse, os.path.join(work_dir, 'muse.tar.gz'))
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [mutect_tar, pindel_tar, muse_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar is mutect_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'mutect',
                                os.path.basename(tarinfo.name))
                        elif tar is pindel_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'pindel',
                                os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(
                                config.uuid, 'muse',
                                os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(
            config.uuid, config.output_dir))
        s3am_upload(job=job,
                    fpath=out_tar,
                    s3_dir=config.output_dir,
                    num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(
            config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
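
The streaming merge above re-roots every member under config.uuid without extracting anything to disk. A self-contained sketch of that technique with hypothetical paths, should it be useful outside a Toil job:

import os
import tarfile
from contextlib import closing

def merge_tarballs(out_path, tars_by_prefix):
    """Stream the members of several .tar.gz files into one, re-rooted by prefix."""
    with tarfile.open(out_path, 'w:gz') as f_out:
        for prefix, in_path in tars_by_prefix.items():
            with tarfile.open(in_path, 'r') as f_in:
                for tarinfo in f_in:
                    member = f_in.extractfile(tarinfo)
                    if member is None:  # directories have no file object
                        continue
                    with closing(member) as f_in_file:
                        tarinfo.name = os.path.join(prefix, os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)

# e.g. merge_tarballs('sample.tar.gz', {'mutect': 'mutect.tar.gz', 'pindel': 'pindel.tar.gz'})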
Example #4
def rsem_quantification(job, config, star_output):
    """
    Unpack STAR results and run RSEM (saving the BAM from STAR)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param tuple(FileID, FileID, FileID, FileID)|tuple(FileID, FileID, FileID) star_output: FileStoreIDs from STAR
    :return: FileStoreID results from RSEM postprocess and STAR log
    :rtype: tuple(FileID, FileID, FileID)
    """
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(16, config.cores)
    if config.wiggle:
        transcriptome_id, sorted_id, wiggle_id, log_id = flatten(star_output)
        wiggle_path = os.path.join(work_dir, config.uuid + '.wiggle.bg')
        job.fileStore.readGlobalFile(wiggle_id, wiggle_path)
        if urlparse(config.output_dir).scheme == 's3':
            s3am_upload(fpath=wiggle_path,
                        s3_dir=config.output_dir,
                        s3_key_path=config.ssec)
        else:
            copy_files(file_paths=[wiggle_path], output_dir=config.output_dir)
    else:
        transcriptome_id, sorted_id, log_id = star_output
    # Save sorted bam if flag is selected
    if config.save_bam and not config.bamqc:  # if config.bamqc is selected, bam is being saved in run_bam_qc
        bam_path = os.path.join(work_dir, config.uuid + '.sorted.bam')
        job.fileStore.readGlobalFile(sorted_id, bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=bam_path,
                        s3_dir=config.output_dir,
                        s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[bam_path], output_dir=config.output_dir)
    # Declare RSEM and RSEM post-process jobs
    disk = 5 * transcriptome_id.size
    rsem_output = job.wrapJobFn(run_rsem,
                                transcriptome_id,
                                config.rsem_ref,
                                paired=config.paired,
                                cores=cores,
                                disk=disk)
    rsem_postprocess = job.wrapJobFn(run_rsem_postprocess, rsem_output.rv(0),
                                     rsem_output.rv(1))
    job.addChild(rsem_output)
    rsem_output.addChild(rsem_postprocess)
    # Save STAR log
    log_path = os.path.join(work_dir, 'Log.final.out')
    job.fileStore.readGlobalFile(log_id, log_path)
    tarball_files(tar_name='star.tar.gz',
                  file_paths=[log_path],
                  output_dir=work_dir)
    star_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'star.tar.gz'))
    return rsem_postprocess.rv(), star_id
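
The RSEM wiring above is the standard Toil pattern of wrapJobFn, addChild, and rv() promises. A minimal, generic sketch of that pattern with toy job functions (not the real RSEM steps), assuming only a stock Toil installation:

from toil.common import Toil
from toil.job import Job

def produce(job, x):
    return x * 2, x * 3           # two values, addressable later as rv(0) and rv(1)

def postprocess(job, a, b):
    return a + b

def root(job, x):
    child = job.wrapJobFn(produce, x)
    follow = job.wrapJobFn(postprocess, child.rv(0), child.rv(1))
    job.addChild(child)
    child.addChild(follow)        # follow runs after produce, with resolved promises
    return follow.rv()

if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./jobstore')
    with Toil(options) as workflow:
        result = workflow.start(Job.wrapJobFn(root, 5))  # expected to resolve to 25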
Example #5
def run_bam_qc(job, aligned_bam_id, config):
    """
    Run BAM QC as specified by California Kids Cancer Comparison (CKCC)

    :param JobFunctionWrappingJob job:
    :param str aligned_bam_id: FileStoreID of sorted bam from STAR
    :param Namespace config: Argparse Namespace object containing argument inputs
        Must contain:
            config.uuid str: UUID of input sample
            config.save_bam bool: True/False depending on whether to save bam
            config.output_dir str: Path to save bam
            config.ssec str: Path to encryption key for secure upload to S3
    :return: boolean fail flag and FileStoreID for the bam_qc output tarball
    :rtype: tuple(bool, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(
        aligned_bam_id,
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    docker_call(tool='hbeale/treehouse_bam_qc:1.0',
                work_dir=work_dir,
                parameters=['runQC.sh', str(job.cores)])

    # Tar Output files
    output_names = [
        'readDist.txt', 'rnaAligned.out.md.sorted.geneBodyCoverage.curves.pdf',
        'rnaAligned.out.md.sorted.geneBodyCoverage.txt'
    ]
    if os.path.exists(os.path.join(work_dir, 'readDist.txt_PASS_qc.txt')):
        output_names.append('readDist.txt_PASS_qc.txt')
        fail_flag = False
    else:
        output_names.append('readDist.txt_FAIL_qc.txt')
        fail_flag = True
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='bam_qc.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)

    # Save output BAM
    if config.save_bam:
        bam_path = os.path.join(work_dir, 'rnaAligned.sortedByCoord.md.bam')
        new_bam_path = os.path.join(work_dir,
                                    config.uuid + '.sortedByCoord.md.bam')
        os.rename(bam_path, new_bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=new_bam_path,
                        s3_dir=config.output_dir,
                        s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[new_bam_path], output_dir=config.output_dir)

    return fail_flag, job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'bam_qc.tar.gz'))
Example #7
def consolidate_output(job, config, kallisto_output, graphical_output):
    """
    Combines the contents of the outputs into one tarball and places it in the output directory or S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str kallisto_output: FileStoreID for Kallisto output
    :param str graphical_output: FileStoreID for output of graphing step
    """
    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    graphical_tar, kallisto_tar = None, None
    # Retrieve output file paths to consolidate
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(
            kallisto_output, os.path.join(work_dir, 'kallisto_output.tar.gz'))
    if graphical_output:
        graphical_tar = job.fileStore.readGlobalFile(
            graphical_output, os.path.join(work_dir,
                                           'single_cell_plots.tar.gz'))
    # I/O
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [graphical_tar, kallisto_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == kallisto_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, os.path.basename(tarinfo.name))
                        elif tar == graphical_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'plots',
                                os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(
            config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar,
                    s3_dir=config.output_dir,
                    num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(
            config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(
            file_paths=[os.path.join(work_dir, config.uuid + '.tar.gz')],
            output_dir=config.output_dir)
Example #9
def output_file_job(job, filename, file_id, output_dir, s3_key_path=None):
    """
    Uploads a file from the FileStore to an output directory on the local filesystem or S3.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str filename: basename for file
    :param str file_id: FileStoreID
    :param str output_dir: Amazon S3 URL or local path
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    :return:
    """
    job.fileStore.logToMaster('Writing {} to {}'.format(filename, output_dir))
    work_dir = job.fileStore.getLocalTempDir()
    filepath = job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, filename))
    if urlparse(output_dir).scheme == 's3':
        s3am_upload(job=job, fpath=os.path.join(work_dir, filepath),
                    s3_dir=output_dir,
                    s3_key_path=s3_key_path)
    elif os.path.exists(os.path.join(output_dir, filename)):
        job.fileStore.logToMaster("File already exists: {}".format(filename))
    else:
        mkdir_p(output_dir)
        copy_files([filepath], output_dir)
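
A hedged sketch of how output_file_job might be fanned out from a parent job over several files; the save_outputs wrapper and the id_map argument ({filename: FileStoreID}) are hypothetical, not part of the pipeline above:

def save_outputs(job, config, id_map):
    # id_map: hypothetical {filename: FileStoreID} mapping produced by upstream jobs
    for filename, file_id in id_map.items():
        job.addChildJobFn(output_file_job, filename, file_id,
                          config.output_dir, s3_key_path=config.ssec)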
Example #10
def output_file_job(job, filename, file_id, output_dir, s3_key_path=None):
    """
    Uploads a file from the FileStore to an output directory on the local filesystem or S3.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str filename: basename for file
    :param str file_id: FileStoreID
    :param str output_dir: Amazon S3 URL or local path
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    :return:
    """
    job.fileStore.logToMaster('Writing {} to {}'.format(filename, output_dir))
    work_dir = job.fileStore.getLocalTempDir()
    filepath = job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, filename))
    if urlparse(output_dir).scheme == 's3':
        s3am_upload(fpath=os.path.join(work_dir, filepath),
                    s3_dir=output_dir,
                    s3_key_path=s3_key_path)
    elif os.path.exists(os.path.join(output_dir, filename)):
        job.fileStore.logToMaster("File already exists: {}".format(filename))
    else:
        mkdir_p(output_dir)
        copy_files([filepath], output_dir)
Example #11
def consolidate_output(job, config, chunk_infos):
    #prep
    start = time.time()
    uuid = config.uuid
    work_dir = job.fileStore.getLocalTempDir()
    out_tar = os.path.join(work_dir, '{}.tar.gz'.format(config.uuid))

    log(job, "{}".format(datetime.datetime.now()), uuid, 'consolidate_output')
    log(job, "consolidating {} files".format(len(chunk_infos)), uuid,
        'consolidate_output')

    # build tarball
    out_tars = [out_tar]
    output_file_count = 0
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for ci in chunk_infos:
            file_id = ci[CI_OUTPUT_FILE_ID]
            tar_file = os.path.join(work_dir,
                                    "{}.tar.gz".format(ci[CI_CHUNK_INDEX]))
            job.fileStore.readGlobalFile(file_id, tar_file)
            out_tars.append(tar_file)
            with tarfile.open(tar_file, 'r') as f_in:
                for tarinfo in f_in:
                    if config.minimal_output and (
                        (tarinfo.name.endswith("bam")
                         or tarinfo.name.endswith("sam")
                         or tarinfo.name.endswith("bai"))
                            and ID_MERGED not in tarinfo.name):
                        log(
                            job,
                            "(Minimal Output) Skipping output file: {}".format(
                                tarinfo.name), uuid, 'consolidate_output')
                        continue
                    if config.minimal_cpecan_output and tarinfo.name.endswith(
                            "gz"):
                        log(
                            job,
                            "(Minimal cPecan Output) Skipping output file: {}".
                            format(tarinfo.name), uuid, 'consolidate_output')
                        continue
                    log(job, "file {}".format(tarinfo.name), uuid,
                        'consolidate_output')
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        f_out.addfile(tarinfo, fileobj=f_in_file)
                        output_file_count += 1
    log(
        job,
        "Consolidated {} files in {} tarballs".format(output_file_count,
                                                      len(out_tars)), uuid,
        'consolidate_output')

    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        log(job, "Uploading {} to S3: {}".format(out_tar, config.output_dir),
            uuid, 'consolidate_output')
        s3am_upload(fpath=out_tar,
                    s3_dir=config.output_dir,
                    num_cores=config.maxCores)
    else:
        log(job, "Moving {} to output dir: {}".format(out_tar,
                                                      config.output_dir), uuid,
            'consolidate_output')
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)

    # log
    log_generic_job_debug(job,
                          config.uuid,
                          "consolidate_output",
                          work_dir=work_dir)
    log_time(job, "consolidate_output", start, config.uuid)
    log(job, "{}".format(datetime.datetime.now()), uuid, 'END')

    # return location (calculated the same whether s3:// or file://)
    return os.path.join(config.output_dir, os.path.basename(out_tar))
Example #12
def prepare_input(job, sample, config, enqueue_consolidation=True):

    # job prep
    config = argparse.Namespace(**vars(config))
    uuid, url, contig_name, reference_url, params_url = sample
    config.uuid = uuid
    config.contig_name = contig_name
    config.reference_url = reference_url
    config.params_url = params_url
    if config.intermediate_file_location is not None:
        config.intermediate_file_location = os.path.join(
            config.intermediate_file_location, uuid)
        mkdir_p(config.intermediate_file_location)
    work_dir = job.fileStore.getLocalTempDir()
    start = time.time()
    log(job, "{}".format(datetime.datetime.now()), config.uuid, 'START')
    log(
        job,
        "Preparing input with URL:{}, contig:{}, reference_url:{}, params_url:{}"
        .format(url, contig_name, reference_url,
                params_url), uuid, 'prepare_input')

    # todo global resource estimation
    config.maxCores = min(config.maxCores, multiprocessing.cpu_count())
    config.defaultCores = min(MP_CPU, config.maxCores)
    config.maxMemory = min(config.maxMemory, int(physicalMemory() * .95))
    #config.disk

    # download references - TOIL_JOBSTORE_PROTOCOL URLs are handled so this function can be reused when imported

    #ref fasta
    if reference_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        ref_genome_fileid = reference_url.replace(TOIL_JOBSTORE_PROTOCOL, '',
                                                  1)
        ref_genome_filename = "{}.reference.{}.fa".format(uuid, contig_name)
        job.fileStore.readGlobalFile(
            ref_genome_fileid, os.path.join(work_dir, ref_genome_filename))
    else:
        download_url(reference_url, work_dir=work_dir)
        ref_genome_filename = os.path.basename(reference_url)
        ref_genome_fileid = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, ref_genome_filename))
    ref_genome_size = os.stat(os.path.join(work_dir,
                                           ref_genome_filename)).st_size
    config.reference_genome_fileid = ref_genome_fileid

    #params
    if params_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        params_fileid = params_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
    else:
        download_url(params_url, work_dir=work_dir)
        params_filename = os.path.basename(params_url)
        params_fileid = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, params_filename))
    config.params_fileid = params_fileid

    # download bam
    if url.startswith(TOIL_JOBSTORE_PROTOCOL):
        bam_filename = "{}.input.{}.bam".format(uuid, contig_name)
        job.fileStore.readGlobalFile(
            url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1),
            os.path.join(work_dir, bam_filename))
    else:
        download_url(url, work_dir=work_dir)
        bam_filename = os.path.basename(url)
    data_bam_location = os.path.join("/data", bam_filename)
    workdir_bam_location = os.path.join(work_dir, bam_filename)

    # index the bam
    _index_bam(job, config, work_dir, bam_filename)

    # sanity check
    workdir_bai_location = os.path.join(work_dir, bam_filename + ".bai")
    if not os.path.isfile(workdir_bai_location):
        raise UserError("BAM index file not created for {}: {}".format(
            bam_filename, workdir_bai_location))

    # get start and end location
    start_idx = sys.maxint
    end_idx = 0
    with closing(
            pysam.AlignmentFile(
                workdir_bam_location,
                'rb' if bam_filename.endswith("bam") else 'r')) as aln:
        for read in aln.fetch():
            align_start = read.reference_start
            align_end = read.reference_end
            start_idx = min([start_idx, align_start])
            end_idx = max([end_idx, align_end])
    log(job, "start_pos:{}, end_pos:{}".format(config.uuid, start_idx,
                                               end_idx), uuid, 'prepare_input')

    # get reads from positions
    chunk_infos = list()
    idx = start_idx
    while idx < end_idx:
        ci = {CI_UUID: uuid}
        ci[CI_CHUNK_BOUNDARY_START] = idx
        chunk_start = idx - config.partition_margin
        ci[CI_CHUNK_START] = chunk_start
        idx += config.partition_size
        ci[CI_CHUNK_BOUNDARY_END] = idx
        chunk_end = idx + config.partition_margin
        ci[CI_CHUNK_END] = chunk_end
        chunk_infos.append(ci)

    # enqueue jobs
    log(job, "Enqueueing {} jobs".format(len(chunk_infos)), uuid,
        'prepare_input')
    idx = 0
    enqueued_jobs = 0
    returned_tarballs = list()
    for ci in chunk_infos:
        #prep
        ci[CI_CHUNK_INDEX] = idx
        chunk_start = ci[CI_CHUNK_START]
        chunk_end = ci[CI_CHUNK_END]
        chunk_position_description = "{}:{}-{}".format(config.contig_name,
                                                       chunk_start, chunk_end)
        bam_split_command = [
            "view", "-b", data_bam_location, chunk_position_description
        ]
        chunk_name = "{}.{}.bam".format(config.uuid, idx)

        #write chunk
        chunk_location = os.path.join(work_dir, chunk_name)
        with open(chunk_location, 'w') as out:
            docker_call(job,
                        config,
                        work_dir,
                        bam_split_command,
                        DOCKER_SAMTOOLS_IMG,
                        DOCKER_SAMTOOLS_TAG,
                        outfile=out)

        #document read count
        chunk_size = os.stat(chunk_location).st_size
        ci[CI_CHUNK_SIZE] = chunk_size
        ci[CI_REF_FA_SIZE] = ref_genome_size
        read_count = prepare_input__get_bam_read_count(job, work_dir,
                                                       chunk_name)
        ci[CI_READ_COUNT] = read_count
        log(
            job,
            "chunk from {} for idx {} is {}b ({}mb) and has {} reads".format(
                chunk_position_description, idx, chunk_size,
                int(chunk_size / 1024 / 1024),
                read_count), uuid, 'prepare_input')
        if config.intermediate_file_location is not None:
            copy_files(file_paths=[chunk_location],
                       output_dir=config.intermediate_file_location)

        # enqueue marginPhase job
        if read_count > 0:
            chunk_fileid = job.fileStore.writeGlobalFile(chunk_location)
            mp_cores = config.defaultCores
            mp_mem = int(
                min(
                    int(chunk_size * MP_MEM_BAM_FACTOR +
                        ref_genome_size * MP_MEM_REF_FACTOR),
                    config.maxMemory))
            mp_disk = int(
                min(
                    int(chunk_size * MP_DSK_BAM_FACTOR +
                        ref_genome_size * MP_DSK_REF_FACTOR +
                        (0 if config.cpecan_probabilities else
                         MP_DSK_CPECAN_FACTOR) * chunk_size), config.maxDisk))
            log(
                job,
                "requesting {} cores, {}b ({}mb) disk, {}b ({}gb) mem".format(
                    mp_cores, mp_disk, int(mp_disk / 1024 / 1024), mp_mem,
                    int(mp_mem / 1024 / 1024 / 1024)),
                "{}.{}".format(uuid, idx), 'prepare_input')
            mp_mem = str(int(mp_mem / 1024)) + "K"
            mp_disk = str(int(mp_disk / 1024)) + "K"
            margin_phase_job = job.addChildJobFn(run_margin_phase,
                                                 config,
                                                 chunk_fileid,
                                                 ci,
                                                 memory=mp_mem,
                                                 cores=mp_cores,
                                                 disk=mp_disk)
            returned_tarballs.append(margin_phase_job.rv())
            enqueued_jobs += 1
        idx += 1

    log(job, "Enqueued {} jobs".format(enqueued_jobs), uuid, 'prepare_input')

    # enqueue merging and consolidation job
    merge_job = job.addFollowOnJobFn(merge_chunks, config, returned_tarballs)
    final_return_value = merge_job.rv()
    if enqueue_consolidation:
        consolidation_job = merge_job.addFollowOnJobFn(consolidate_output,
                                                       config, merge_job.rv())
        final_return_value = consolidation_job.rv()

    # log
    log_generic_job_debug(job, config.uuid, 'prepare_input', work_dir=work_dir)
    log_time(job, "prepare_input", start, config.uuid)

    # return appropriate output
    return final_return_value
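
The chunking loop in prepare_input walks [start_idx, end_idx) in partition_size strides and pads each chunk by partition_margin on both sides, so neighbouring chunks overlap by twice the margin. A standalone sketch of just that arithmetic (plain dict keys instead of the CI_* constants):

def compute_chunk_infos(start_idx, end_idx, partition_size, partition_margin):
    chunk_infos = []
    idx = start_idx
    while idx < end_idx:
        ci = {'boundary_start': idx,
              'chunk_start': idx - partition_margin}   # padded read window start
        idx += partition_size
        ci['boundary_end'] = idx
        ci['chunk_end'] = idx + partition_margin       # padded read window end
        chunk_infos.append(ci)
    return chunk_infos

# compute_chunk_infos(0, 250000, 100000, 5000) yields three chunks whose padded
# windows overlap their neighbours by 2 * 5000 bases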
Example #13
def run_margin_phase(job, config, chunk_file_id, chunk_info):
    # prep
    start = time.time()
    work_dir = job.fileStore.getLocalTempDir()
    chunk_idx = chunk_info[CI_CHUNK_INDEX]
    chunk_identifier = "{}.{}".format(config.uuid, chunk_idx)
    chunk_name = "{}.in.bam".format(chunk_identifier)
    chunk_location = os.path.join(work_dir, chunk_name)
    log(job, str(datetime.datetime.now()), chunk_identifier,
        'run_margin_phase')

    # download bam chunk
    job.fileStore.readGlobalFile(chunk_file_id, chunk_location)
    if not os.path.isfile(chunk_location):
        raise UserError("Failed to download chunk {} from {}".format(
            chunk_name, chunk_file_id))

    # download references
    #ref genome
    genome_reference_name = "reference.fa"
    genome_reference_location = os.path.join(work_dir, genome_reference_name)
    job.fileStore.readGlobalFile(config.reference_genome_fileid,
                                 genome_reference_location)
    if not os.path.isfile(genome_reference_location):
        raise UserError(
            "Failed to download genome reference {} from {}".format(
                os.path.basename(config.reference_url),
                config.reference_genome_fileid))
    # params
    params_name = "params.json"
    params_location = os.path.join(work_dir, params_name)
    job.fileStore.readGlobalFile(config.params_fileid, params_location)
    if not os.path.isfile(params_location):
        raise UserError("Failed to download params {} from {}".format(
            os.path.basename(config.params_url), config.params_fileid))

    # do we want to run cPecan?
    cpecan_prob_location = None
    if config.cpecan_probabilities:
        cpecan_prob_location = run_margin_phase__run_cpecan_alignment(
            job, config, chunk_identifier, work_dir, chunk_name,
            genome_reference_name)

    # run marginPhase
    params = [
        os.path.join("/data", chunk_name),
        os.path.join("/data", genome_reference_name),
        os.path.join("/data", params_name), "-o",
        os.path.join("/data", "{}.out".format(chunk_identifier)), '--tag',
        "{},{}-{}".format(chunk_idx, chunk_info[CI_CHUNK_BOUNDARY_START],
                          chunk_info[CI_CHUNK_BOUNDARY_END])
    ]
    if cpecan_prob_location is not None:
        params.extend([
            '--singleNuclProbDir',
            os.path.join("/data", cpecan_prob_location)
        ])
    docker_call(job, config, work_dir, params, config.margin_phase_image,
                config.margin_phase_tag)
    log_debug_from_docker(job, os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG),
                          chunk_identifier, 'margin_phase',
                          [chunk_location, genome_reference_location])
    log_location = os.path.join(work_dir,
                                "marginPhase.{}.log".format(chunk_identifier))
    os.rename(os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG), log_location)

    # document output
    log(job, "Output files after marginPhase:", chunk_identifier,
        'run_margin_phase')
    output_file_locations = glob.glob(
        os.path.join(work_dir, "{}*".format(chunk_identifier)))
    output_file_locations.append(log_location)
    found_vcf, found_sam = False, False
    for f in output_file_locations:
        log(job, "\t\t{}".format(os.path.basename(f)), chunk_identifier,
            'run_margin_phase')
        if f.endswith(VCF_SUFFIX): found_vcf = True
        if f.endswith(SAM_UNIFIED_SUFFIX): found_sam = True
    if cpecan_prob_location is not None:
        cpecan_tarball = glob.glob(
            os.path.join(work_dir, cpecan_prob_location, "*.tar.gz"))
        if len(cpecan_tarball) == 0:
            # todo why has tarball_files failed in this location?
            log(job, "Found no cpecan output tarball! Trying alt location.",
                chunk_identifier, 'run_margin_phase')
            cpecan_tarball = glob.glob(os.path.join(work_dir, "*.tar.gz"))

        if len(cpecan_tarball) == 0:
            log(job, "Found no cpecan output tarball!", chunk_identifier,
                'run_margin_phase')
        elif len(cpecan_tarball) > 1:
            log(
                job, "Found {} cpecan output tarballs: {}".format(
                    len(cpecan_tarball), cpecan_tarball), chunk_identifier,
                'run_margin_phase')
        else:
            log(job,
                "Saving cpecan output tarball: {}".format(cpecan_tarball[0]),
                chunk_identifier, 'run_margin_phase')
            output_file_locations.append(cpecan_tarball[0])

    # tarball the output and save
    tarball_name = "{}.tar.gz".format(chunk_identifier)
    tarball_files(tar_name=tarball_name,
                  file_paths=output_file_locations,
                  output_dir=work_dir)

    # validate output, retry if not
    if not (found_sam and found_vcf):
        if "retry_attempts" not in config:
            config.retry_attempts = 1
        else:
            config.retry_attempts += 1
            if config.retry_attempts > MAX_RETRIES:
                log(job, "", chunk_identifier, 'run_margin_phase')
                error = "Failed to generate appropriate output files {} times".format(
                    MAX_RETRIES)
                log(job, error, chunk_identifier, 'run_margin_phase')
                # this enables us to "recover" in the face of failure during a run
                if CONTINUE_AFTER_FAILURE:
                    output_file_id = job.fileStore.writeGlobalFile(
                        os.path.join(work_dir, tarball_name))
                    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id
                    return chunk_info
                raise UserError("{}:{}".format(chunk_identifier, error))

        log(
            job, "Missing output files.  Attepmting retry {}".format(
                config.retry_attempts), chunk_identifier, 'run_margin_phase')
        log(job, "Failed job log file:", chunk_identifier, 'run_margin_phase')
        log(job, "", chunk_identifier, 'run_margin_phase')
        with open(log_location, 'r') as log_file:
            for line in log_file:
                log(job, "\t\t{}".format(line.rstrip()), chunk_identifier,
                    'run_margin_phase')

        # new job
        retry_job = job.addChildJobFn(
            run_margin_phase,
            config,
            chunk_file_id,
            chunk_info,
            memory=str(int(config.maxMemory / 1024)) + "K",
            cores=job.cores,
            disk=job.disk)
        # save failed output
        if config.intermediate_file_location is not None:
            tarball_fail_name = "{}.FAILURE.{}.tar.gz".format(
                chunk_identifier, config.retry_attempts)
            os.rename(os.path.join(work_dir, tarball_name),
                      os.path.join(work_dir, tarball_fail_name))
            copy_files(file_paths=[os.path.join(work_dir, tarball_fail_name)],
                       output_dir=config.intermediate_file_location)

        log_generic_job_debug(job,
                              config.uuid,
                              'run_margin_phase',
                              work_dir=work_dir)
        return retry_job.rv()

    # if successful, save output
    if config.intermediate_file_location is not None:
        copy_files(file_paths=[os.path.join(work_dir, tarball_name)],
                   output_dir=config.intermediate_file_location)
    output_file_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, tarball_name))
    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id

    # log
    log_generic_job_debug(job,
                          config.uuid,
                          'run_margin_phase',
                          work_dir=work_dir)
    log_time(job, "run_margin_phase", start, chunk_identifier)
    return chunk_info
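
run_margin_phase retries itself by re-adding the same job function as a child and handing back the child's promise. A stripped-down sketch of that self-retry pattern; do_work is a hypothetical step, and the retry bookkeeping lives in plain arguments rather than config:

def flaky_job(job, config, attempt=1, max_retries=3):
    if do_work(job, config):          # hypothetical step that may fail to produce output
        return 'done'
    if attempt >= max_retries:
        raise UserError("gave up after {} attempts".format(attempt))
    # re-enqueue the same job function; the parent returns the child's promise
    retry = job.addChildJobFn(flaky_job, config,
                              attempt=attempt + 1, max_retries=max_retries)
    return retry.rv()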
Example #14
def consolidate_output(job, config, kallisto_output, rsem_star_output,
                       fastqc_output):
    """
    Combines the contents of the outputs into one tarball and places it in the output directory or S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param FileID kallisto_output: FileStoreID for Kallisto output
    :param tuple(FileID, FileID, FileID)|tuple(FileID, FileID, FileID, bool, FileID) rsem_star_output:
            FileStoreIDs for RSEM and STAR output, and a flag/FileID if run with bamQC
    :param FileID fastqc_output: FileStoreID for FastQC output
    """
    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    config.uuid = 'SINGLE-END.' + config.uuid if not config.paired else config.uuid
    # Retrieve output file paths to consolidate
    rsem_tar, hugo_tar, kallisto_tar, fastqc_tar, bamqc_tar, star_tar = None, None, None, None, None, None
    if rsem_star_output:
        if config.bamqc:
            rsem_id, hugo_id, star_id, fail_flag, bamqc_id = flatten(
                rsem_star_output)
            bamqc_tar = job.fileStore.readGlobalFile(
                bamqc_id, os.path.join(work_dir, 'bamqc.tar.gz'))
            config.uuid = 'FAIL.' + config.uuid if fail_flag else config.uuid
        else:
            rsem_id, hugo_id, star_id = flatten(rsem_star_output)
        rsem_tar = job.fileStore.readGlobalFile(
            rsem_id, os.path.join(work_dir, 'rsem.tar.gz'))
        hugo_tar = job.fileStore.readGlobalFile(
            hugo_id, os.path.join(work_dir, 'rsem_hugo.tar.gz'))
        star_tar = job.fileStore.readGlobalFile(
            star_id, os.path.join(work_dir, 'star.tar.gz'))
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(
            kallisto_output, os.path.join(work_dir, 'kallisto.tar.gz'))
    if fastqc_output:
        fastqc_tar = job.fileStore.readGlobalFile(
            fastqc_output, os.path.join(work_dir, 'fastqc.tar.gz'))
    # I/O
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [
        x for x in
        [rsem_tar, hugo_tar, kallisto_tar, fastqc_tar, bamqc_tar, star_tar]
        if x is not None
    ]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == rsem_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'RSEM',
                                os.path.basename(tarinfo.name))
                        elif tar == hugo_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'RSEM', 'Hugo',
                                os.path.basename(tarinfo.name))
                        elif tar == kallisto_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'Kallisto',
                                os.path.basename(tarinfo.name))
                        elif tar == bamqc_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'QC', 'bamQC',
                                os.path.basename(tarinfo.name))
                        elif tar == fastqc_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'QC', 'fastQC',
                                os.path.basename(tarinfo.name))
                        elif tar == star_tar:
                            tarinfo.name = os.path.join(
                                config.uuid, 'QC', 'STAR',
                                os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(
            config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar,
                    s3_dir=config.output_dir,
                    num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(
            config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(
            file_paths=[os.path.join(work_dir, config.uuid + '.tar.gz')],
            output_dir=config.output_dir)
Example #15
def download_run_and_upload(job, master_ip, inputs, spark_on_toil):
    """
    Monolithic job that calls data download, conversion, variant calling, and upload.
    """
    master_ip = MasterAddress(master_ip)

    bam_name = inputs.sample.split('://')[-1].split('/')[-1]
    sample_name = ".".join(os.path.splitext(bam_name)[:-1])

    hdfs_subdir = sample_name + "-dir"

    if inputs.run_local:
        inputs.local_dir = job.fileStore.getLocalTempDir()
        hdfs_dir = inputs.local_dir
    else:
        inputs.local_dir = None
        hdfs_dir = "hdfs://{0}:{1}/{2}".format(master_ip, HDFS_MASTER_PORT,
                                               hdfs_subdir)

    try:
        hdfs_prefix = hdfs_dir + "/" + sample_name
        hdfs_bam = hdfs_dir + "/" + bam_name

        if not inputs.run_local:
            _log.info("Downloading input BAM %s to %s.", bam_name, hdfs_bam)
            call_conductor(job,
                           master_ip,
                           inputs.sample,
                           hdfs_bam,
                           container='fnothaft/conductor',
                           memory=inputs.memory)
        else:
            copy_files([inputs.sample], inputs.local_dir)

        adam_input = hdfs_prefix + ".adam"
        _log.info("Converting input BAM to ADAM.")
        call_adam(job,
                  master_ip, ["transform", hdfs_bam, adam_input],
                  memory=inputs.memory,
                  run_local=inputs.run_local,
                  container='fnothaft/adam')

        avocado_output = hdfs_prefix + ".gt.adam"
        _log.info("Calling variants with avocado.")
        call_avocado(
            job,
            master_ip,
            ["biallelicGenotyper", "-is_not_grc", adam_input, avocado_output],
            memory=inputs.memory,
            container='fnothaft/avocado')

        output_vcf = hdfs_prefix + ".vcf"
        _log.info("Converting output ADAM Genotypes to VCF.")
        call_adam(job,
                  master_ip, [
                      "adam2vcf", avocado_output, output_vcf, "-single",
                      "-sort_on_save", "-stringency", "LENIENT"
                  ],
                  memory=inputs.memory,
                  run_local=inputs.run_local,
                  container='fnothaft/adam')

        out_file = inputs.output_dir + "/" + sample_name + inputs.suffix + ".vcf"

        if not inputs.run_local:
            _log.info("Uploading output VCF %s to %s.", output_vcf, out_file)
            call_conductor(job,
                           master_ip,
                           output_vcf,
                           out_file,
                           memory=inputs.memory,
                           container='fnothaft/conductor')
            remove_file(master_ip, output_vcf, spark_on_toil)
        else:
            local_adam_output = "%s/%s.processed.bam" % (inputs.local_dir,
                                                         sample_name)
            move_files([local_adam_output], inputs.output_dir)

        remove_file(master_ip, hdfs_subdir, spark_on_toil)
    except:
        remove_file(master_ip, hdfs_subdir, spark_on_toil)
        raise
Example #16
def download_run_and_upload(job, master_ip, inputs, spark_on_toil):
    """
    Monolithic job that calls data download, alignment, and upload.
    """
    master_ip = MasterAddress(master_ip)

    fastq_name = inputs.sample.split('://')[-1].split('/')[-1]
    sample_name = ".".join(os.path.splitext(fastq_name)[:-1])

    hdfs_subdir = sample_name + "-dir"

    if inputs.run_local:
        inputs.local_dir = job.fileStore.getLocalTempDir()
        hdfs_dir = inputs.local_dir
    else:
        inputs.local_dir = None
        hdfs_dir = "hdfs://{0}:{1}/{2}".format(master_ip, HDFS_MASTER_PORT,
                                               hdfs_subdir)

    try:
        hdfs_prefix = hdfs_dir + "/" + sample_name
        hdfs_reads = hdfs_dir + "/" + fastq_name

        if not inputs.run_local:
            _log.info("Downloading input reads %s to %s.", fastq_name,
                      hdfs_reads)
            call_conductor(job,
                           master_ip,
                           inputs.sample,
                           hdfs_reads,
                           container='fnothaft/conductor',
                           memory=inputs.memory)

            index_exts = ['', '.amb', '.ann', '.bwt', '.fai', '.pac', '.sa']
            hdfs_index = os.path.join(hdfs_prefix, 'reference.fa')
            for ext in index_exts:
                index_path = inputs.index + ext
                hdfs_index_ext = hdfs_index + ext
                _log.info("Downloading index file %s to %s.", index_path,
                          hdfs_index_ext)
                call_conductor(job,
                               master_ip,
                               index_path,
                               hdfs_index_ext,
                               container='fnothaft/conductor',
                               memory=inputs.memory)

            sd_path = inputs.index.replace('.fa', '.dict')
            hdfs_sd = hdfs_index.replace('.fa', '.dict')
            _log.info("Downloading sequence dictionary %s to %s.", sd_path,
                      hdfs_sd)
            call_conductor(job,
                           master_ip,
                           sd_path,
                           hdfs_sd,
                           container='fnothaft/conductor',
                           memory=inputs.memory)

        else:
            copy_files([inputs.sample], inputs.local_dir)

        aligned_output = hdfs_prefix + ".bam"
        _log.info("Aligning reads with Cannoli and BWA.")
        call_cannoli(job,
                     master_ip, [
                         "bwa", "-single", hdfs_reads, aligned_output,
                         inputs.sample_id, '-use_docker', '-docker_image',
                         'fnothaft/bwa:debug-3', '-index', hdfs_index,
                         '-add_indices', '-sequence_dictionary', hdfs_sd
                     ],
                     memory=inputs.memory,
                     container='fnothaft/cannoli:1508-1509')
        out_file = inputs.output_dir + "/" + sample_name + inputs.suffix + ".bam"

        if not inputs.run_local:
            _log.info("Uploading output BAM %s to %s.", aligned_output,
                      out_file)
            call_conductor(job,
                           master_ip,
                           aligned_output,
                           out_file,
                           memory=inputs.memory,
                           container='fnothaft/conductor')
            remove_file(master_ip, aligned_output, spark_on_toil)
        else:
            local_adam_output = "%s/%s.bam" % (inputs.local_dir, sample_name)
            move_files([local_adam_output], inputs.output_dir)

        remove_file(master_ip, hdfs_subdir, spark_on_toil)
    except:
        remove_file(master_ip, hdfs_subdir, spark_on_toil)
        raise