def test_upload_and_download_with_encryption(tmpdir):
    import filecmp
    import os
    import subprocess
    from toil_scripts.lib.urls import s3am_upload
    from toil_scripts.lib.urls import download_url
    from boto.s3.connection import S3Connection, Bucket, Key
    work_dir = str(tmpdir)
    # Create temporary encryption key
    key_path = os.path.join(work_dir, 'foo.key')
    subprocess.check_call(['dd', 'if=/dev/urandom', 'bs=1', 'count=32',
                           'of={}'.format(key_path)])
    # Create test file
    upload_fpath = os.path.join(work_dir, 'upload_file')
    with open(upload_fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    # Upload file
    s3_dir = 's3://cgl-driver-projects/test'
    s3am_upload(fpath=upload_fpath, s3_dir=s3_dir, s3_key_path=key_path)
    # Download the file
    url = 'https://s3-us-west-2.amazonaws.com/cgl-driver-projects/test/upload_file'
    download_url(url=url, name='download_file', work_dir=work_dir, s3_key_path=key_path)
    download_fpath = os.path.join(work_dir, 'download_file')
    assert os.path.exists(download_fpath)
    assert filecmp.cmp(upload_fpath, download_fpath)
    # Delete the Key
    conn = S3Connection()
    b = Bucket(conn, 'cgl-driver-projects')
    k = Key(b)
    k.key = 'test/upload_file'
    k.delete()
def rsem_quantification(job, config, star_output):
    """
    Unpack STAR results and run RSEM (and save the sorted BAM from STAR)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param tuple(str, str) star_output: FileStoreIDs from STAR's output
    :return: FileStoreID results from RSEM postprocess
    :rtype: str
    """
    cores = min(16, config.cores)
    disk = '2G' if config.ci_test else '40G'
    transcriptome_id, sorted_id = star_output
    # Save sorted bam if flag is selected
    if config.save_bam:
        work_dir = job.fileStore.getLocalTempDir()
        bam_path = os.path.join(work_dir, '{}.sorted.bam'.format(config.uuid))
        sorted_bam = job.fileStore.readGlobalFile(sorted_id, bam_path)
        if config.s3_output_dir and config.ssec:
            s3am_upload(fpath=sorted_bam, s3_dir=config.s3_output_dir, s3_key_path=config.ssec)
        if config.output_dir:
            move_files(file_paths=[sorted_bam], output_dir=config.output_dir)
    # Declare RSEM and RSEM post-process jobs
    rsem_output = job.wrapJobFn(run_rsem, config.cores, transcriptome_id, config.rsem_ref, paired=config.paired,
                                cores=cores, disk=disk)
    rsem_postprocess = job.wrapJobFn(run_rsem_postprocess, config.uuid, rsem_output.rv(0), rsem_output.rv(1))
    job.addChild(rsem_output)
    rsem_output.addChild(rsem_postprocess)
    return rsem_postprocess.rv()
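# A minimal sketch of how rsem_quantification might be wired downstream of a STAR
# alignment job in a Toil workflow. The star_alignment_stub placeholder and the fields
# set on the example config Namespace are assumptions for illustration; run_rsem and
# run_rsem_postprocess are expected to be defined as in the snippet above.
from argparse import Namespace

from toil.job import Job


def star_alignment_stub(job, config):
    # Placeholder standing in for the real STAR alignment job; it must return a
    # (transcriptome_bam_id, sorted_bam_id) tuple of FileStoreIDs.
    raise NotImplementedError


def build_rnaseq_pipeline(config):
    star = Job.wrapJobFn(star_alignment_stub, config)
    star.addChildJobFn(rsem_quantification, config, star.rv())
    return star


if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./jobstore')
    config = Namespace(uuid='sample-1', cores=8, ci_test=False, save_bam=False, paired=True,
                       rsem_ref='s3://example-bucket/rsem_ref.tar.gz',
                       s3_output_dir=None, ssec=None, output_dir='/tmp/output')
    Job.Runner.startToil(build_rnaseq_pipeline(config), options)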
def test_upload_and_download_with_encryption(tmpdir):
    import filecmp
    import os
    import subprocess
    from contextlib import closing
    from uuid import uuid4
    from toil_scripts.lib.urls import s3am_upload
    from toil_scripts.lib.urls import download_url
    from boto.s3.connection import S3Connection, Bucket, Key
    work_dir = str(tmpdir)
    # Create temporary encryption key
    key_path = os.path.join(work_dir, 'foo.key')
    subprocess.check_call(['dd', 'if=/dev/urandom', 'bs=1', 'count=32',
                           'of={}'.format(key_path)])
    # Create test file
    upload_fpath = os.path.join(work_dir, 'upload_file')
    with open(upload_fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    # Upload file
    random_key = os.path.join('test/', str(uuid4()), 'upload_file')
    s3_url = os.path.join('s3://cgl-driver-projects/', random_key)
    try:
        s3_dir = os.path.split(s3_url)[0]
        s3am_upload(fpath=upload_fpath, s3_dir=s3_dir, s3_key_path=key_path)
        # Download the file
        download_url(url=s3_url, name='download_file', work_dir=work_dir, s3_key_path=key_path)
        download_fpath = os.path.join(work_dir, 'download_file')
        assert os.path.exists(download_fpath)
        assert filecmp.cmp(upload_fpath, download_fpath)
    finally:
        # Delete the Key. Key deletion never fails so we don't need to catch any exceptions
        with closing(S3Connection()) as conn:
            b = Bucket(conn, 'cgl-driver-projects')
            k = Key(b)
            k.key = random_key
            k.delete()
def upload_or_move(job, work_dir, output_dir, output, ssec=None):

    # are we moving this into a local dir, or up to s3?
    if output_dir.startswith('s3://'):
        s3am_upload(fpath=os.path.join(work_dir, output),
                    s3_dir=output_dir,
                    s3_key_path=ssec)
    else:
        # FIXME: undefined function
        make_directory(output_dir)
        move_to_output_dir(work_dir, output_dir, output)
def upload_or_move_hc(work_dir, output_dir, output, ssec=None):
    # are we moving this into a local dir, or up to s3?
    if output_dir.startswith('s3://'):
        #if ssec is None:
        #    raise ValueError('s3 output_dir provided, but ssec is missing')
        s3am_upload(fpath=os.path.join(work_dir, output),
                    s3_dir=output_dir,
                    s3_key_path=ssec)
    else:
        # FIXME: undefined function (a possible sketch of these helpers follows this function)
        make_directory(output_dir)
        move_to_output_dir(work_dir, output_dir, output)
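# The FIXME comments in upload_or_move and upload_or_move_hc above refer to helpers
# that are not defined in these snippets. A minimal sketch of what they might look
# like, assuming `output` is a file name relative to work_dir:
import errno
import os
import shutil


def make_directory(path):
    # mkdir -p: create the directory and any missing parents, tolerating an existing one
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def move_to_output_dir(work_dir, output_dir, *filenames):
    # Move each named file out of work_dir and into output_dir
    for filename in filenames:
        shutil.move(os.path.join(work_dir, filename), os.path.join(output_dir, filename))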
def run_bwa(job, inputs, ids):
    """
    Aligns two FASTQ files into a BAM file via BWA

    :param JobFunctionWrappingJob job: Passed by Toil automatically
    :param Namespace inputs: Input arguments (see main)
    :param list ids: list of FileStore IDs (R1, R2, reference inputs)
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_names = ['r1.fq.gz', 'r2.fq.gz', 'ref.fa', 'ref.fa.amb', 'ref.fa.ann',
                  'ref.fa.bwt', 'ref.fa.pac', 'ref.fa.sa', 'ref.fa.fai']
    if inputs.alt:
        file_names.append('ref.fa.alt')

    for fileStoreID, name in zip(ids, file_names):
        job.fileStore.readGlobalFile(fileStoreID, os.path.join(work_dir, name))

    # Add read group line
    rg = "@RG\\tID:{0}\\tLB:{1}\\tPL:{2}\\tPU:{3}\\tSM:{0}".format(inputs.uuid, inputs.library,
                                                                   inputs.platform, inputs.program_unit)

    # BWA Options
    opt_args = []
    if not inputs.skip_sort:
        opt_args.append('-s')
    if inputs.trim:
        opt_args.append('-a')
    # Call: bwakit
    parameters = (['-t', str(inputs.cores),
                   '-R', rg] +
                  opt_args +
                  ['-o', '/data/aligned',
                   '/data/ref.fa',
                   '/data/r1.fq.gz',
                   '/data/r2.fq.gz'])
    outputs = {'aligned.aln.bam': inputs.mock_bam}

    docker_call(tool='quay.io/ucsc_cgl/bwakit:0.7.12--528bb9bf73099a31e74a7f5e6e3f2e0a41da486e',
                parameters=parameters, inputs=file_names, outputs=outputs, work_dir=work_dir)

    # BWA insists on adding an `*.aln.bam` suffix, so rename the output file
    output_file = os.path.join(work_dir, '{}.bam'.format(inputs.uuid))
    os.rename(os.path.join(work_dir, 'aligned.aln.bam'),
              output_file)

    # Either write file to local output directory or upload to S3 cloud storage
    job.fileStore.logToMaster('Aligned sample: {}'.format(inputs.uuid))
    if inputs.output_dir:
        move_files([output_file], inputs.output_dir)
    if inputs.s3_dir:
        s3am_upload(output_file, inputs.s3_dir, s3_key_path=inputs.ssec)
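# docker_call comes from the toil_scripts library; the call above corresponds roughly to
# a docker run invocation with work_dir mounted at /data, sketched below. This is only an
# approximation for illustration; the real helper also handles mock outputs, parameter
# assembly and container cleanup.
import subprocess


def run_bwakit_directly(work_dir, parameters):
    # Approximate docker run equivalent of the docker_call above
    base = ['docker', 'run', '--rm', '-v', '{}:/data'.format(work_dir),
            'quay.io/ucsc_cgl/bwakit:0.7.12--528bb9bf73099a31e74a7f5e6e3f2e0a41da486e']
    subprocess.check_call(base + parameters)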
def consolidate_output(job, config, kallisto_output, rsem_output, fastqc_output):
    """
    Combines the contents of the outputs into one tarball and places in output directory or s3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str kallisto_output: FileStoreID for Kallisto output
    :param tuple(str, str) rsem_output: FileStoreIDs for RSEM output
    :param str fastqc_output: FileStoreID for FastQC output
    """
    job.fileStore.logToMaster('Consolidating input: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve output file paths to consolidate
    rsem_tar, hugo_tar, kallisto_tar, fastqc_tar = None, None, None, None
    if rsem_output:
        rsem_id, hugo_id = rsem_output
        rsem_tar = job.fileStore.readGlobalFile(rsem_id, os.path.join(work_dir, 'rsem.tar.gz'))
        hugo_tar = job.fileStore.readGlobalFile(hugo_id, os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(kallisto_output, os.path.join(work_dir, 'kallisto.tar.gz'))
    if fastqc_output:
        fastqc_tar = job.fileStore.readGlobalFile(fastqc_output, os.path.join(work_dir, 'fastqc.tar.gz'))
    # I/O
    if not config.paired:
        config.uuid = 'SINGLE-END.{}'.format(config.uuid)
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate the separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [rsem_tar, hugo_tar, kallisto_tar, fastqc_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == rsem_tar:
                            tarinfo.name = os.path.join(config.uuid, 'RSEM', os.path.basename(tarinfo.name))
                        elif tar == hugo_tar:
                            tarinfo.name = os.path.join(config.uuid, 'RSEM', 'Hugo', os.path.basename(tarinfo.name))
                        elif tar == kallisto_tar:
                            tarinfo.name = os.path.join(config.uuid, 'Kallisto', os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(config.uuid, 'QC', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output directory
    if config.output_dir:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
    # Upload to S3
    if config.s3_output_dir:
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.s3_output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.s3_output_dir, num_cores=config.cores)
def upload_or_move(job, work_dir, input_args, output):

    # are we moving this into a local dir, or up to s3?
    if input_args['output_dir']:
        # get the local output path and move the file there
        output_dir = input_args['output_dir']
        # FIXME: undefined function
        make_directory(output_dir)
        move_to_output_dir(work_dir, output_dir, output)

    elif input_args['s3_dir']:
        s3am_upload(fpath=os.path.join(work_dir, output),
                    s3_dir=input_args['s3_dir'],
                    s3_key_path=input_args['ssec'])

    else:
        raise ValueError('No output_directory or s3_dir defined. Cannot determine where to store %s' % output)
def consolidate_output(job, config, mutect, pindel, muse):
    """
    Combine the contents of separate tarball outputs into one via streaming

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str mutect: MuTect tarball FileStoreID
    :param str pindel: Pindel tarball FileStoreID
    :param str muse: MuSE tarball FileStoreID
    """
    work_dir = job.fileStore.getLocalTempDir()
    mutect_tar, pindel_tar, muse_tar = None, None, None
    if mutect:
        mutect_tar = job.fileStore.readGlobalFile(mutect, os.path.join(work_dir, 'mutect.tar.gz'))
    if pindel:
        pindel_tar = job.fileStore.readGlobalFile(pindel, os.path.join(work_dir, 'pindel.tar.gz'))
    if muse:
        muse_tar = job.fileStore.readGlobalFile(muse, os.path.join(work_dir, 'muse.tar.gz'))
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate the separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [mutect_tar, pindel_tar, muse_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar is mutect_tar:
                            tarinfo.name = os.path.join(config.uuid, 'mutect', os.path.basename(tarinfo.name))
                        elif tar is pindel_tar:
                            tarinfo.name = os.path.join(config.uuid, 'pindel', os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(config.uuid, 'muse', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
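# mkdir_p, copy_files and move_files are used in the functions above but not defined in
# these snippets (they are presumably imported from the pipeline's library modules).
# A minimal sketch of compatible implementations, assuming file_paths are full paths:
import errno
import os
import shutil


def mkdir_p(path):
    # mkdir -p: create the directory and any missing parents, tolerating an existing one
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def copy_files(file_paths, output_dir):
    # Copy each file into output_dir, keeping its base name
    for file_path in file_paths:
        shutil.copy(file_path, os.path.join(output_dir, os.path.basename(file_path)))


def move_files(file_paths, output_dir):
    # Move each file into output_dir, keeping its base name
    for file_path in file_paths:
        shutil.move(file_path, os.path.join(output_dir, os.path.basename(file_path)))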