Example 1
def test_upload_and_download_with_encryption(tmpdir):
    from toil_lib.urls import s3am_upload
    from toil_lib.urls import download_url
    from boto.s3.connection import S3Connection, Bucket, Key
    work_dir = str(tmpdir)
    # Create temporary encryption key
    key_path = os.path.join(work_dir, 'foo.key')
    subprocess.check_call([
        'dd', 'if=/dev/urandom', 'bs=1', 'count=32', 'of={}'.format(key_path)
    ])
    # Create test file
    upload_fpath = os.path.join(work_dir, 'upload_file')
    with open(upload_fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    # Upload file
    random_key = os.path.join('test/', str(uuid4()), 'upload_file')
    s3_url = os.path.join('s3://cgl-driver-projects/', random_key)
    try:
        s3_dir = os.path.split(s3_url)[0]
        s3am_upload(fpath=upload_fpath, s3_dir=s3_dir, s3_key_path=key_path)
        # Download the file
        download_url(url=s3_url,
                     name='download_file',
                     work_dir=work_dir,
                     s3_key_path=key_path)
        download_fpath = os.path.join(work_dir, 'download_file')
        assert os.path.exists(download_fpath)
        assert filecmp.cmp(upload_fpath, download_fpath)
    finally:
        # Delete the Key. Key deletion never fails so we don't need to catch any exceptions
        with closing(S3Connection()) as conn:
            b = Bucket(conn, 'cgl-driver-projects')
            k = Key(b)
            k.key = random_key
            k.delete()
Example 2
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir,
                                                      'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id,
                                 os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'),
                  (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'),
                  (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(job=job, url=url, work_dir=work_dir, name=fname)

    # Part 1: Variant Calling
    variant_command = [
        'mpileup', '-f', 'genome.fa', '-l', 'positions.tsv', '-v',
        'alignment.bam', '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR,SP',
        '-o', '/data/output.vcf.gz'
    ]
    docker_call(
        job=job,
        work_dir=work_dir,
        parameters=variant_command,
        tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')

    # Part 2: QC
    qc_command = [
        '-o', 'qc', '-n', 'alignment.bam', '-a', 'annotation.gtf', '-m',
        'annotation.m53'
    ]
    docker_call(
        job=job,
        work_dir=work_dir,
        parameters=qc_command,
        tool='jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz',
                  file_paths=[output_tsv, output_vcf],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
Example 3
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir,
                                                      'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id,
                                 os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input file
    download_url(job=job,
                 url=inputs.gtf,
                 work_dir=work_dir,
                 name='annotation.gtf')
    download_url(job=job,
                 url=inputs.gtf_pickle,
                 work_dir=work_dir,
                 name='annotation.gtf.pickle')
    # Call Spladder
    command = [
        '--insert_ir=y', '--insert_es=y', '--insert_ni=y', '--remove_se=n',
        '--validate_sg=n', '-b', 'alignment.bam', '-o ', '/data', '-a',
        'annotation.gtf', '-v', 'y', '-c', '3', '-M', 'single', '-T', 'n',
        '-n', '50', '-P', 'y', '-p', 'n', '--sparse_bam', 'y'
    ]
    docker_call(job=job,
                work_dir=work_dir,
                parameters=command,
                sudo=inputs.sudo,
                tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, ' ', 'spladder',
                                 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    print os.listdir(work_dir)
    tarball_files('spladder.tar.gz',
                  file_paths=[output_pickle, output_filt, output],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'spladder.tar.gz'))
Example 4
def run_star(job, r1_id, r2_id, star_index_url, wiggle=False):
    """
    Performs alignment of fastqs to bam via STAR

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :return: FileStoreIDs for the transcriptome bam and coordinate-sorted bam (plus the wiggle file if requested)
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    os.remove(os.path.join(work_dir, 'starIndex.tar.gz'))
    # Determine tarball structure - the STAR index contents are either in a subdirectory or at the tarball root
    star_index = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # Parameter handling for paired / single-end data
    parameters = ['--runThreadN', str(job.cores),
                  '--genomeDir', star_index,
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1']
    if wiggle:
        parameters.extend(['--outWigType', 'bedGraph',
                           '--outWigStrand', 'Unstranded',
                           '--outWigReferencesPrefix', 'chr'])
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq'])
    # Call: STAR Mapping
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Write to fileStore
    transcriptome_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    sorted_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    if wiggle:
        wiggle_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaSignal.UniqueMultiple.str1.out.bg'))
        return transcriptome_id, sorted_id, wiggle_id
    else:
        return transcriptome_id, sorted_id
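The "Determine tarball structure" one-liner above reappears in the RSEM example below: if the extracted archive produced a single top-level directory, the index is addressed inside the container as /data/<that directory>, otherwise as /data itself. A minimal sketch of that check as a standalone helper (resolve_index_dir is hypothetical, not part of toil_lib; it only restates the logic of the one-liner):

import os

def resolve_index_dir(work_dir, container_root='/data'):
    """Return the in-container path of an index extracted into work_dir."""
    entries = os.listdir(work_dir)
    # A single extracted directory means the archive wrapped its contents in a subdir
    if len(entries) == 1 and os.path.isdir(os.path.join(work_dir, entries[0])):
        return os.path.join(container_root, entries[0])
    # Otherwise the index files sit at the tarball root, mounted at /data
    return container_root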
Example 5
def run_rsem(job, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=rsem_ref_url, name='rsem_ref.tar.gz', work_dir=work_dir)
    subprocess.check_call([
        'tar', '-xvf',
        os.path.join(work_dir, 'rsem_ref.tar.gz'), '-C', work_dir
    ])
    os.remove(os.path.join(work_dir, 'rsem_ref.tar.gz'))
    # Determine tarball structure - based on it, ascertain folder name and rsem reference prefix
    rsem_files = []
    for root, directories, files in os.walk(work_dir):
        rsem_files.extend([os.path.join(root, x) for x in files])
    # "grp" is a required RSEM extension that should exist in the RSEM reference
    ref_prefix = [
        os.path.basename(os.path.splitext(x)[0]) for x in rsem_files
        if 'grp' in x
    ][0]
    ref_folder = os.path.join('/data',
                              os.listdir(work_dir)[0]) if len(
                                  os.listdir(work_dir)) == 1 else '/data'
    # I/O
    job.fileStore.readGlobalFile(bam_id,
                                 os.path.join(work_dir, 'transcriptome.bam'))
    output_prefix = 'rsem'
    # Call: RSEM
    parameters = [
        '--quiet', '--no-qualities', '-p',
        str(job.cores), '--forward-prob', '0.5', '--seed-length', '25',
        '--fragment-length-mean', '-1.0', '--bam', '/data/transcriptome.bam',
        os.path.join(ref_folder, ref_prefix), output_prefix
    ]
    if paired:
        parameters = ['--paired-end'] + parameters
    docker_call(
        tool='quay.io/ucsc_cgl/rsem:1.2.25--d4275175cc8df36967db460b06337a14f40d2f21',
        parameters=parameters,
        work_dir=work_dir)
    os.rename(os.path.join(work_dir, output_prefix + '.genes.results'),
              os.path.join(work_dir, 'rsem_gene.tab'))
    os.rename(os.path.join(work_dir, output_prefix + '.isoforms.results'),
              os.path.join(work_dir, 'rsem_isoform.tab'))
    # Write to FileStore
    gene_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rsem_gene.tab'))
    isoform_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rsem_isoform.tab'))
    return gene_id, isoform_id
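Taken together, the two functions above chain naturally in a Toil workflow: run_star's first return value is the transcriptome bam, which run_rsem consumes. A hedged wiring sketch (align_and_quantify is hypothetical; it assumes run_star and run_rsem are in scope, that r1_id/r2_id are fastq FileStoreIDs produced by an earlier import step, and that the resource values are illustrative):

def align_and_quantify(job, r1_id, r2_id, star_index_url, rsem_ref_url):
    # Align reads with STAR as a child job
    star = job.addChildJobFn(run_star, r1_id, r2_id, star_index_url,
                             cores=8, disk='60G')
    # star.rv(0) resolves to the transcriptome bam FileStoreID once run_star finishes
    rsem = star.addFollowOnJobFn(run_rsem, star.rv(0), rsem_ref_url,
                                 paired=True, cores=8)
    # Promise of (gene_id, isoform_id) from run_rsem
    return rsem.rv()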
Example 6
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input file
    download_url(url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call Spladder
    command = ['--insert_ir=y',
               '--insert_es=y',
               '--insert_ni=y',
               '--remove_se=n',
               '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o ', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(work_dir=work_dir, parameters=command, sudo=inputs.sudo, tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, ' ', 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    print os.listdir(work_dir)
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
Example 7
def star(job, inputs, r1_cutadapt, r2_cutadapt):
    """
    Performs alignment of fastqs to BAM via STAR

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str r1_cutadapt: FileStore ID of read 1 fastq
    :param str r2_cutadapt: FileStore ID of read 2 fastq
    """
    job.fileStore.logToMaster('Aligning with STAR: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(inputs.cores, 16)
    # Retrieve files
    job.fileStore.readGlobalFile(r1_cutadapt, os.path.join(work_dir, 'R1_cutadapt.fastq'))
    job.fileStore.readGlobalFile(r2_cutadapt, os.path.join(work_dir, 'R2_cutadapt.fastq'))
    # Get starIndex
    download_url(inputs.star_index, work_dir, 'starIndex.tar.gz')
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    # Parameters
    parameters = ['--runThreadN', str(cores),
                  '--genomeDir', '/data/starIndex',
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1',
                  '--readFilesIn', '/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq']
    # Call: STAR Map
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Call Samtools Index
    index_command = ['index', '/data/rnaAligned.sortedByCoord.out.bam']
    docker_call(work_dir=work_dir, parameters=index_command,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # fileStore
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    bai_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam.bai'))
    job.fileStore.deleteGlobalFile(r1_cutadapt)
    job.fileStore.deleteGlobalFile(r2_cutadapt)
    # Launch children and follow-on
    vcqc_id = job.addChildJobFn(variant_calling_and_qc, inputs, bam_id, bai_id, cores=2, disk='30G').rv()
    spladder_id = job.addChildJobFn(spladder, inputs, bam_id, bai_id, disk='30G').rv()
    job.addFollowOnJobFn(consolidate_output_tarballs, inputs, vcqc_id, spladder_id, disk='30G')
Example 8
def run_kallisto(job, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of the Kallisto index file
    :return: FileStoreID from Kallisto output
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=kallisto_index_url,
                 name='kallisto_hg38.idx',
                 work_dir=work_dir)
    # Retrieve files
    parameters = [
        'quant', '-i', '/data/kallisto_hg38.idx', '-t',
        str(job.cores), '-o', '/data/', '-b', '100'
    ]
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(
            r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        job.fileStore.readGlobalFile(
            r2_id, os.path.join(work_dir, 'R2_cutadapt.fastq'))
        parameters.extend(
            ['/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq'])
    else:
        job.fileStore.readGlobalFile(
            r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        parameters.extend(
            ['--single', '-l', '200', '-s', '15', '/data/R1_cutadapt.fastq'])

    # Call: Kallisto
    docker_call(
        job=job,
        tool='quay.io/ucsc_cgl/kallisto:0.42.4--35ac87df5b21a8e8e8d159f26864ac1e1db8cf86',
        work_dir=work_dir,
        parameters=parameters)
    # Tar output files together and store in fileStore
    output_files = [
        os.path.join(work_dir, x)
        for x in ['run_info.json', 'abundance.tsv', 'abundance.h5']
    ]
    tarball_files(tar_name='kallisto.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'kallisto.tar.gz'))
Example 9
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'), (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'), (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(url, work_dir=work_dir, name=fname)

    # Part 1: Variant Calling
    variant_command = ['mpileup',
                       '-f', 'genome.fa',
                       '-l', 'positions.tsv',
                       '-v', 'alignment.bam',
                       '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR,SP',
                       '-o', '/data/output.vcf.gz']
    docker_call(work_dir=work_dir, parameters=variant_command, sudo=inputs.sudo,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')

    # Part 2: QC
    qc_command = ['-o', 'qc',
                  '-n', 'alignment.bam',
                  '-a', 'annotation.gtf',
                  '-m', 'annotation.m53']
    docker_call(work_dir=work_dir, parameters=qc_command,
                tool='jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz', file_paths=[output_tsv, output_vcf], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
Example 10
def prepare_input(job, sample, config, enqueue_consolidation=True):

    # job prep
    config = argparse.Namespace(**vars(config))
    uuid, url, contig_name, reference_url, params_url = sample
    config.uuid = uuid
    config.contig_name = contig_name
    config.reference_url = reference_url
    config.params_url = params_url
    if config.intermediate_file_location is not None:
        config.intermediate_file_location = os.path.join(
            config.intermediate_file_location, uuid)
        mkdir_p(config.intermediate_file_location)
    work_dir = job.fileStore.getLocalTempDir()
    start = time.time()
    log(job, "{}".format(datetime.datetime.now()), config.uuid, 'START')
    log(
        job,
        "Preparing input with URL:{}, contig:{}, reference_url:{}, params_url:{}"
        .format(url, contig_name, reference_url,
                params_url), uuid, 'prepare_input')

    # todo global resource estimation
    config.maxCores = min(config.maxCores, multiprocessing.cpu_count())
    config.defaultCores = min(MP_CPU, config.maxCores)
    config.maxMemory = min(config.maxMemory, int(physicalMemory() * .95))
    #config.disk

    # download references - TOIL_JOBSTORE_PROTOCOL URLs are also handled so this function can be imported and reused by other workflows

    #ref fasta
    if reference_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        ref_genome_fileid = reference_url.replace(TOIL_JOBSTORE_PROTOCOL, '',
                                                  1)
        ref_genome_filename = "{}.reference.{}.fa".format(uuid, contig_name)
        job.fileStore.readGlobalFile(
            ref_genome_fileid, os.path.join(work_dir, ref_genome_filename))
    else:
        download_url(reference_url, work_dir=work_dir)
        ref_genome_filename = os.path.basename(reference_url)
        ref_genome_fileid = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, ref_genome_filename))
    ref_genome_size = os.stat(os.path.join(work_dir,
                                           ref_genome_filename)).st_size
    config.reference_genome_fileid = ref_genome_fileid

    #params
    if params_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        params_fileid = params_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
    else:
        download_url(params_url, work_dir=work_dir)
        params_filename = os.path.basename(params_url)
        params_fileid = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, params_filename))
    config.params_fileid = params_fileid

    # download bam
    if url.startswith(TOIL_JOBSTORE_PROTOCOL):
        bam_filename = "{}.input.{}.bam".format(uuid, contig_name)
        job.fileStore.readGlobalFile(
            url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1),
            os.path.join(work_dir, bam_filename))
    else:
        download_url(url, work_dir=work_dir)
        bam_filename = os.path.basename(url)
    data_bam_location = os.path.join("/data", bam_filename)
    workdir_bam_location = os.path.join(work_dir, bam_filename)

    # index the bam
    _index_bam(job, config, work_dir, bam_filename)

    # sanity check
    workdir_bai_location = os.path.join(work_dir, bam_filename + ".bai")
    if not os.path.isfile(workdir_bai_location):
        raise UserError("BAM index file not created for {}: {}".format(
            bam_filename, workdir_bai_location))

    # get start and end location
    start_idx = sys.maxint
    end_idx = 0
    with closing(
            pysam.AlignmentFile(
                workdir_bam_location,
                'rb' if bam_filename.endswith("bam") else 'r')) as aln:
        for read in aln.fetch():
            align_start = read.reference_start
            align_end = read.reference_end
            start_idx = min([start_idx, align_start])
            end_idx = max([end_idx, align_end])
    log(job, "start_pos:{}, end_pos:{}".format(config.uuid, start_idx,
                                               end_idx), uuid, 'prepare_input')

    # get reads from positions
    chunk_infos = list()
    idx = start_idx
    while idx < end_idx:
        ci = {CI_UUID: uuid}
        ci[CI_CHUNK_BOUNDARY_START] = idx
        chunk_start = idx - config.partition_margin
        ci[CI_CHUNK_START] = chunk_start
        idx += config.partition_size
        ci[CI_CHUNK_BOUNDARY_END] = idx
        chunk_end = idx + config.partition_margin
        ci[CI_CHUNK_END] = chunk_end
        chunk_infos.append(ci)

    # enqueue jobs
    log(job, "Enqueueing {} jobs".format(len(chunk_infos)), uuid,
        'prepare_input')
    idx = 0
    enqueued_jobs = 0
    returned_tarballs = list()
    for ci in chunk_infos:
        #prep
        ci[CI_CHUNK_INDEX] = idx
        chunk_start = ci[CI_CHUNK_START]
        chunk_end = ci[CI_CHUNK_END]
        chunk_position_description = "{}:{}-{}".format(config.contig_name,
                                                       chunk_start, chunk_end)
        bam_split_command = [
            "view", "-b", data_bam_location, chunk_position_description
        ]
        chunk_name = "{}.{}.bam".format(config.uuid, idx)

        #write chunk
        chunk_location = os.path.join(work_dir, chunk_name)
        with open(chunk_location, 'w') as out:
            docker_call(job,
                        config,
                        work_dir,
                        bam_split_command,
                        DOCKER_SAMTOOLS_IMG,
                        DOCKER_SAMTOOLS_TAG,
                        outfile=out)

        #document read count
        chunk_size = os.stat(chunk_location).st_size
        ci[CI_CHUNK_SIZE] = chunk_size
        ci[CI_REF_FA_SIZE] = ref_genome_size
        read_count = prepare_input__get_bam_read_count(job, work_dir,
                                                       chunk_name)
        ci[CI_READ_COUNT] = read_count
        log(
            job,
            "chunk from {} for idx {} is {}b ({}mb) and has {} reads".format(
                chunk_position_description, idx, chunk_size,
                int(chunk_size / 1024 / 1024),
                read_count), uuid, 'prepare_input')
        if config.intermediate_file_location is not None:
            copy_files(file_paths=[chunk_location],
                       output_dir=config.intermediate_file_location)

        # enqueue marginPhase job
        if read_count > 0:
            chunk_fileid = job.fileStore.writeGlobalFile(chunk_location)
            mp_cores = config.defaultCores
            mp_mem = int(
                min(
                    int(chunk_size * MP_MEM_BAM_FACTOR +
                        ref_genome_size * MP_MEM_REF_FACTOR),
                    config.maxMemory))
            mp_disk = int(
                min(
                    int(chunk_size * MP_DSK_BAM_FACTOR +
                        ref_genome_size * MP_DSK_REF_FACTOR +
                        (0 if config.cpecan_probabilities else
                         MP_DSK_CPECAN_FACTOR) * chunk_size), config.maxDisk))
            log(
                job,
                "requesting {} cores, {}b ({}mb) disk, {}b ({}gb) mem".format(
                    mp_cores, mp_disk, int(mp_disk / 1024 / 1024), mp_mem,
                    int(mp_mem / 1024 / 1024 / 1024)),
                "{}.{}".format(uuid, idx), 'prepare_input')
            mp_mem = str(int(mp_mem / 1024)) + "K"
            mp_disk = str(int(mp_disk) / 1024) + "K"
            margin_phase_job = job.addChildJobFn(run_margin_phase,
                                                 config,
                                                 chunk_fileid,
                                                 ci,
                                                 memory=mp_mem,
                                                 cores=mp_cores,
                                                 disk=mp_disk)
            returned_tarballs.append(margin_phase_job.rv())
            enqueued_jobs += 1
        idx += 1

    log(job, "Enqueued {} jobs".format(enqueued_jobs), uuid, 'prepare_input')

    # enqueue merging and consolidation job
    merge_job = job.addFollowOnJobFn(merge_chunks, config, returned_tarballs)
    final_return_value = merge_job.rv()
    if enqueue_consolidation:
        consolidation_job = merge_job.addFollowOnJobFn(consolidate_output,
                                                       config, merge_job.rv())
        final_return_value = consolidation_job.rv()

    # log
    log_generic_job_debug(job, config.uuid, 'prepare_input', work_dir=work_dir)
    log_time(job, "prepare_input", start, config.uuid)

    # return appropriate output
    return final_return_value
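The chunking loop in the middle of this function advances the chunk boundary by config.partition_size and widens each chunk by config.partition_margin on both sides, so neighbouring chunks overlap by twice the margin and reads spanning a boundary land in both chunks. A small worked sketch with made-up numbers (chunk_boundaries is hypothetical and only mirrors the loop above):

def chunk_boundaries(start_idx, end_idx, partition_size, partition_margin):
    chunks = []
    idx = start_idx
    while idx < end_idx:
        boundary_start = idx
        idx += partition_size
        chunks.append({
            'boundary_start': boundary_start,
            'boundary_end': idx,
            'chunk_start': boundary_start - partition_margin,  # CI_CHUNK_START
            'chunk_end': idx + partition_margin,                # CI_CHUNK_END
        })
    return chunks

# e.g. chunk_boundaries(100000, 350000, 100000, 5000) yields three chunks whose
# samtools regions are 95000-205000, 195000-305000 and 295000-405000.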
Example 11
def test_download_url(tmpdir):
    from toil_lib.urls import download_url
    work_dir = str(tmpdir)
    download_url(work_dir=work_dir, url='www.google.com', name='testy')
    assert os.path.exists(os.path.join(work_dir, 'testy'))
Example 12
def run_star(job, r1_id, r2_id, star_index_url, wiggle=False, sort=True):
    """
    Performs alignment of fastqs to bam via STAR

    --limitBAMsortRAM step added to deal with memory explosion when sorting certain samples.
    The value was chosen to complement the recommended amount of memory to have when running STAR (60G)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :param bool sort: If True, sorts the output bam by coordinate
    :return: FileStoreIDs for the transcriptome bam, aligned bam, STAR log, and splice junction file (plus the wiggle file if requested)
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(job, url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    os.remove(os.path.join(work_dir, 'starIndex.tar.gz'))
    # Determine tarball structure - the STAR index contents are either in a subdirectory or at the tarball root
    star_index = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # Parameter handling for paired / single-end data
    parameters = ['--runThreadN', str(job.cores),
                  '--genomeDir', star_index,
                  '--outFileNamePrefix', 'rna',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1',
                  '--limitBAMsortRAM', '49268954168']
    # Modify parameters based on function arguments
    if sort:
        parameters.extend(['--outSAMtype', 'BAM', 'SortedByCoordinate'])
        aligned_bam = 'rnaAligned.sortedByCoord.out.bam'
    else:
        parameters.extend(['--outSAMtype', 'BAM', 'Unsorted'])
        aligned_bam = 'rnaAligned.out.bam'
    if wiggle:
        parameters.extend(['--outWigType', 'bedGraph',
                           '--outWigStrand', 'Unstranded',
                           '--outWigReferencesPrefix', 'chr'])
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq'])
    # Call: STAR Mapping
    dockerCall(job=job, tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
               workDir=work_dir, parameters=parameters)
    # Check that the output bam isn't size zero if sorted
    aligned_bam_path = os.path.join(work_dir, aligned_bam)
    if sort:
        assert os.stat(aligned_bam_path).st_size > 0, 'Aligned bam failed to sort. Ensure sufficient memory is free.'
    # Write to fileStore
    aligned_id = job.fileStore.writeGlobalFile(aligned_bam_path)
    transcriptome_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    log_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaLog.final.out'))
    sj_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaSJ.out.tab'))
    if wiggle:
        wiggle_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaSignal.UniqueMultiple.str1.out.bg'))
        return transcriptome_id, aligned_id, wiggle_id, log_id, sj_id
    else:
        return transcriptome_id, aligned_id, log_id, sj_id
Example 13
def star(job, inputs, r1_cutadapt, r2_cutadapt):
    """
    Performs alignment of fastqs to BAM via STAR

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str r1_cutadapt: FileStore ID of read 1 fastq
    :param str r2_cutadapt: FileStore ID of read 2 fastq
    """
    job.fileStore.logToMaster('Aligning with STAR: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(inputs.cores, 16)
    # Retrieve files
    job.fileStore.readGlobalFile(r1_cutadapt,
                                 os.path.join(work_dir, 'R1_cutadapt.fastq'))
    job.fileStore.readGlobalFile(r2_cutadapt,
                                 os.path.join(work_dir, 'R2_cutadapt.fastq'))
    # Get starIndex
    download_url(job=job,
                 url=inputs.star_index,
                 work_dir=work_dir,
                 name='starIndex.tar.gz')
    subprocess.check_call([
        'tar', '-xvf',
        os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir
    ])
    # Parameters
    parameters = [
        '--runThreadN',
        str(cores), '--genomeDir', '/data/starIndex', '--outFileNamePrefix',
        'rna', '--outSAMtype', 'BAM', 'SortedByCoordinate', '--outSAMunmapped',
        'Within', '--quantMode', 'TranscriptomeSAM', '--outSAMattributes',
        'NH', 'HI', 'AS', 'NM', 'MD', '--outFilterType', 'BySJout',
        '--outFilterMultimapNmax', '20', '--outFilterMismatchNmax', '999',
        '--outFilterMismatchNoverReadLmax', '0.04', '--alignIntronMin', '20',
        '--alignIntronMax', '1000000', '--alignMatesGapMax', '1000000',
        '--alignSJoverhangMin', '8', '--alignSJDBoverhangMin', '1',
        '--sjdbScore', '1', '--readFilesIn', '/data/R1_cutadapt.fastq',
        '/data/R2_cutadapt.fastq'
    ]
    # Call: STAR Map
    docker_call(
        job=job,
        tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
        work_dir=work_dir,
        parameters=parameters)
    # Call Samtools Index
    index_command = ['index', '/data/rnaAligned.sortedByCoord.out.bam']
    docker_call(
        job=job,
        work_dir=work_dir,
        parameters=index_command,
        tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # fileStore
    bam_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    bai_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam.bai'))
    job.fileStore.deleteGlobalFile(r1_cutadapt)
    job.fileStore.deleteGlobalFile(r2_cutadapt)
    # Launch children and follow-on
    vcqc_id = job.addChildJobFn(variant_calling_and_qc,
                                inputs,
                                bam_id,
                                bai_id,
                                cores=2,
                                disk='30G').rv()
    spladder_id = job.addChildJobFn(spladder,
                                    inputs,
                                    bam_id,
                                    bai_id,
                                    disk='30G').rv()
    job.addFollowOnJobFn(consolidate_output_tarballs,
                         inputs,
                         vcqc_id,
                         spladder_id,
                         disk='30G')
Example 14
def run_single_cell(job, sample, config):
    """
    Performs single cell analysis through the quay.io/ucsc_cgl/kallisto_sc image (which uses code from the repo:
    https://github.com/pachterlab/scRNA-Seq-TCC-prep).  Output includes TCC matrix from kallisto process.

    :param job: toil job
    :param config: configuration for toil job
    :param sample: a [UUID, url(s)] pair as constructed by parse_samples
    """
    # Common logic (for handling pre- and post- Kallisto data)
    config = argparse.Namespace(**vars(config))  # shallow copy so per-sample changes don't mutate the shared config
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    work_dir = job.fileStore.getLocalTempDir()
    # Get input files
    uuid, type, urls = sample
    config.uuid = uuid
    # Handle kallisto output file (only works w/ one file for now)
    if type == "plot":
        filename = os.path.basename(urls[0])
        download_url(job, url=urls[0], name=filename, work_dir=work_dir)
        tar = tarfile.open(name=os.path.join(work_dir, filename))
        root_dir = rstrip(
            os.path.basename(urls[0]), ".tar.gz"
        )  # post, kallisto, plots folders are in this root folder, with same name as the archive
        kallisto_output = None  # could just forward the kallisto output
        post_processing_output = None  # same with this

        # method that, given the location of the file in the tar, writes it to the global job store
        def tarToGlobal(folder, path):
            with closing(tar.extractfile(os.path.join(root_dir, folder,
                                                      path))) as file:
                data = file.read()
                with job.fileStore.writeGlobalFileStream() as (stream, id):
                    stream.write(data)
                    return id

        tcc_matrix_id = tarToGlobal("post", TCC_MATRIX_FILENAME)
        pwise_dist_l1_id = tarToGlobal("post", PWISE_DIST_FILENAME)
        nonzero_ec_id = tarToGlobal("post", NONZERO_EC_FILENAME)
        kallisto_matrix_id = tarToGlobal("post", KALLISTO_MATRIX_FILENAME)
        matrix_tsv_id = tarToGlobal("kallisto", "matrix.tsv")
        matrix_cells_id = tarToGlobal("kallisto", "matrix.cells")
    # Handle fastq file(s)
    else:
        input_location = os.path.join(work_dir, "fastq_input")
        os.mkdir(input_location)
        for url in urls:
            if url.endswith('.tar') or url.endswith('.tar.gz'):
                tar_path = os.path.join(work_dir, os.path.basename(url))
                download_url(job, url=url, work_dir=work_dir)
                subprocess.check_call(
                    ['tar', '-xvf', tar_path, '-C', input_location])
                os.remove(tar_path)
            elif url.endswith('.gz'):
                download_url(job, url=url, work_dir=input_location)
                subprocess.check_call([
                    'gunzip',
                    os.path.join(input_location, os.path.basename(url))
                ])
            else:
                job.fileStore.logToMaster("Download url " + str(url))
                download_url(job, url=url, work_dir=input_location)
        # Generate configuration JSON
        with open(os.path.join(work_dir, "config.json"), 'w') as config_file:
            config_file.write(build_patcherlab_config(config))
        # Get Kallisto index
        download_url(job,
                     url=config.kallisto_index,
                     name='kallisto_index.idx',
                     work_dir=work_dir)
        # Create other locations for patcherlab stuff
        os.mkdir(os.path.join(work_dir, "tcc"))
        os.mkdir(os.path.join(work_dir, "output"))
        if type == "pseudo":
            # Call docker image
            dockerCall(job,
                       tool='quay.io/ucsc_cgl/kallisto_sc:latest',
                       workDir=work_dir,
                       parameters=["/data/config.json"])
        else:  # quantification of quake brain-style paired end fastqs, each for a different cell
            require(type == "quant",
                    "invalid type " + type + " found in manifest ")
            os.mkdir(os.path.join(work_dir, "quant_output"))
            # Call docker image
            dockerCall(job,
                       tool='kallisto_sc_quant',
                       workDir=work_dir,
                       parameters=[
                           "/data/kallisto_index.idx", "/data/quant_output",
                           str(config.cores), "/data/fastq_input"
                       ])
            # Consolidate abundances for the various cells
            quant_output = os.path.join(work_dir, "quant_output")
            consolidated = os.path.join(work_dir, "quant_consolidated")
            os.mkdir(consolidated)
            for output_folder in os.listdir(quant_output):
                shutil.copy(
                    os.path.join(quant_output, output_folder, "abundance.tsv"),
                    os.path.join(consolidated, output_folder + ".tsv"))
            # quant to pseudo
            quant_to_pseudo(None, consolidated, os.path.join(work_dir, "tcc"))
            # run post-processing
            save_dir = os.path.join(work_dir, "save")
            os.mkdir(save_dir)
            prep_tcc_matrix(
                job,
                threads=config.cores,
                tcc_output_dir=os.path.join(work_dir, "tcc"),
                save_dir=save_dir
            )  # this should be the same as specified in build_pachterlab_config. It may be worth refactoring so that these don't have to be manually synced, although there's no reason for these values to ever change and thus become desynced.
        # Irrespective of whether quant or pseudo, because of quant-to-pseudo conversion
        # Build tarfile of output
        output_files = glob(os.path.join(work_dir, "tcc", "*"))
        tarball_files(tar_name='kallisto_output.tar.gz',
                      file_paths=output_files,
                      output_dir=work_dir)
        kallisto_output = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'kallisto_output.tar.gz'))
        # Consolidate post-processing output
        tcc_matrix_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', TCC_MATRIX_FILENAME))
        pwise_dist_l1_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', PWISE_DIST_FILENAME))
        nonzero_ec_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', NONZERO_EC_FILENAME))
        kallisto_matrix_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'tcc', 'matrix.ec'))
        post_processing_output = {
            TCC_MATRIX_FILENAME: tcc_matrix_id,
            PWISE_DIST_FILENAME: pwise_dist_l1_id,
            NONZERO_EC_FILENAME: nonzero_ec_id,
            KALLISTO_MATRIX_FILENAME:
            kallisto_matrix_id  # technically redundant
        }
        # Prepare files to send to plots for SC3
        matrix_tsv_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, "tcc", "matrix.tsv"))
        matrix_cells_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, "tcc", "matrix.cells"))
    # Graphing step
    if config.generate_graphs:
        graphical_output = job.addChildJobFn(run_data_analysis, config,
                                             tcc_matrix_id, pwise_dist_l1_id,
                                             nonzero_ec_id, kallisto_matrix_id,
                                             matrix_tsv_id,
                                             matrix_cells_id).rv()
        job.addFollowOnJobFn(consolidate_output, config, kallisto_output,
                             graphical_output, post_processing_output)
    else:
        # converts to UUID name scheme and transfers to output location
        consolidate_output(job,
                           config,
                           kallisto_output=kallisto_output,
                           graphical_output=None,
                           post_processing_output=post_processing_output)
Example 15
def download_run_and_upload(job, master_ip, inputs, spark_on_toil):
    """
    Monolithic job that calls data download, conversion, transform, upload.
    Previously, this was not monolithic; change came in due to #126/#134.
    """
    master_ip = MasterAddress(master_ip)

    bam_name = inputs.sample.split('://')[-1].split('/')[-1]
    sample_name = ".".join(os.path.splitext(bam_name)[:-1])

    hdfs_subdir = sample_name + "-dir"

    if inputs.run_local:
        inputs.local_dir = job.fileStore.getLocalTempDir()
        if inputs.native_adam_path is None:
            hdfs_dir = "/data/"
        else:
            hdfs_dir = inputs.local_dir
    else:
        inputs.local_dir = None
        hdfs_dir = "hdfs://{0}:{1}/{2}".format(master_ip, HDFS_MASTER_PORT,
                                               hdfs_subdir)

    try:
        hdfs_prefix = hdfs_dir + "/" + sample_name
        hdfs_bam = hdfs_dir + "/" + bam_name

        hdfs_snps = hdfs_dir + "/" + inputs.dbsnp.split('://')[-1].split(
            '/')[-1]

        if not inputs.run_local:
            download_data(job, master_ip, inputs, inputs.dbsnp, inputs.sample,
                          hdfs_snps, hdfs_bam)
        else:
            download_url(job, inputs.sample, work_dir=inputs.local_dir)
            download_url(job, inputs.dbsnp, work_dir=inputs.local_dir)

        adam_input = hdfs_prefix + ".adam"
        adam_snps = hdfs_dir + "/snps.var.adam"
        adam_convert(job, master_ip, inputs, hdfs_bam, hdfs_snps, adam_input,
                     adam_snps, spark_on_toil)

        adam_output = hdfs_prefix + ".processed.bam"
        adam_transform(job, master_ip, inputs, adam_input, adam_snps, hdfs_dir,
                       adam_output, spark_on_toil)

        if inputs.output_dir:
            out_file = inputs.output_dir + "/" + sample_name + inputs.suffix + ".bam"

            if not inputs.run_local:
                upload_data(job, master_ip, inputs, adam_output, out_file,
                            spark_on_toil)
            else:
                local_adam_output = "%s/%s.processed.bam" % (inputs.local_dir,
                                                             sample_name)
                move_files([local_adam_output], inputs.output_dir)

        remove_file(master_ip, hdfs_subdir, spark_on_toil)
    except:
        remove_file(master_ip, hdfs_subdir, spark_on_toil)
        raise
Example 16
def run_single_cell(job, sample, config):
    """
    Performs single cell analysis through the quay.io/ucsc_cgl/kallisto_sc image (which uses code from the repo:
    https://github.com/pachterlab/scRNA-Seq-TCC-prep).  Output includes TCC matrix from kallisto process.

    :param job: toil job
    :param config: configuration for toil job
    :param sample: a [UUID, url(s)] pair as constructed by the 'parse_samples' function
    """
    config = argparse.Namespace(**vars(config))
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    work_dir = job.fileStore.getLocalTempDir()
    # Generate configuration JSON
    with open(os.path.join(work_dir, "config.json"), 'w') as config_file:
        config_file.write(build_patcherlab_config(config))
    # Get Kallisto index
    download_url(job,
                 url=config.kallisto_index,
                 name='kallisto_index.idx',
                 work_dir=work_dir)
    # Get input files
    input_location = os.path.join(work_dir, "fastq_input")
    os.mkdir(input_location)
    uuid, urls = sample
    config.uuid = uuid
    for url in urls:
        if url.endswith('.tar') or url.endswith('.tar.gz'):
            tar_path = os.path.join(work_dir, os.path.basename(url))
            download_url(job, url=url, work_dir=work_dir)
            subprocess.check_call(
                ['tar', '-xvf', tar_path, '-C', input_location])
            os.remove(tar_path)
        else:
            download_url(job, url=url, work_dir=input_location)
    # Create other locations for patcherlab stuff
    os.mkdir(os.path.join(work_dir, "tcc"))
    os.mkdir(os.path.join(work_dir, "output"))

    # Call docker image
    dockerCall(job,
               tool='quay.io/ucsc_cgl/kallisto_sc:latest',
               workDir=work_dir,
               parameters=["/data/config.json"])

    # Build tarfile of output
    output_files = [
        os.path.join(work_dir, "tcc", x)
        for x in ['run_info.json', 'matrix.tsv', 'matrix.ec', 'matrix.cells']
    ]
    tarball_files(tar_name='kallisto_output.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    kallisto_output = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'kallisto_output.tar.gz'))
    # Graphing step
    if config.generate_graphs:
        tcc_matrix_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', 'TCC_matrix.dat'))
        pwise_dist_l1_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', 'pwise_dist_L1.dat'))
        nonzero_ec_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', 'nonzero_ec.dat'))
        kallisto_matrix_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'tcc', 'matrix.ec'))

        graphical_output = job.addChildJobFn(run_data_analysis, config,
                                             tcc_matrix_id, pwise_dist_l1_id,
                                             nonzero_ec_id,
                                             kallisto_matrix_id).rv()

        job.addFollowOnJobFn(consolidate_output, config, kallisto_output,
                             graphical_output)
    else:
        # converts to UUID name scheme and transfers to output location
        consolidate_output(job,
                           config,
                           kallisto_output=kallisto_output,
                           graphical_output=None)
Example 17
def docker_call(job,
                tool,
                parameters=None,
                work_dir='.',
                rm=True,
                detached=False,
                env=None,
                outfile=None,
                inputs=None,
                outputs=None,
                docker_parameters=None,
                check_output=False,
                mock=None,
                defer=None,
                container_name=None,
                mounts=None):
    """
    Calls Docker, passing along parameters and tool.

    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    :param list[str] parameters: Command line arguments to be passed to the tool
    :param str work_dir: Directory to mount into the container via `-v`. Destination convention is /data
    :param bool rm: Should the container be run with the --rm flag (Should it be removed upon
           container exit)? rm and detached are mutually exclusive in Docker.  This is the flag
           passed to docker and is independent of the defer flag.  If this is set to True and
           `defer` is None, `defer` takes the value `docker_call.RM`.
    :param bool detached: Should the container be run with the --detached flag (Should it be run in
           detached mode)? See `rm` above.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe output of Docker call to file handle
    :param list[str] inputs: A list of the input files.
    :param dict[str,str] outputs: A dictionary with output file names as keys and either None
           or a url as the value. The value is only used if mock=True
    :param dict[str,str] docker_parameters: Parameters to pass to docker
    :param bool check_output: When True, this function returns docker's output
    :param bool mock: Whether to run in mock mode. If this variable is unset, its value is determined from
           the environment via mock_mode().
    :param int defer: What action should be taken on the container upon job completion?
           docker_call.FORGO will leave the container untouched.
           docker_call.STOP will attempt to stop the container with `docker stop` (useful for
           debugging).
           docker_call.RM will stop the container and then forcefully remove it from the system
           using `docker rm -f`.
           The default value is None, which behaves like docker_call.FORGO unless rm is True (in which case it defaults to docker_call.RM).
    :param str container_name: An optional name for your container.
    :param dict mounts: A dictionary of data volumes to mount into the Docker container containing host paths
           as keys and the corresponding container paths as values
    """
    from toil_lib.urls import download_url

    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    # Docker does not allow the --rm flag to be used when the container is run in detached mode.
    require(not (rm and detached), "Conflicting options 'rm' and 'detached'.")
    # Ensure the user has passed a valid value for defer
    require(
        defer in (None, docker_call.FORGO, docker_call.STOP, docker_call.RM),
        'Please provide a valid value for defer.')

    for filename in inputs:
        assert (os.path.isfile(os.path.join(work_dir, filename)))

    if mock:
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if url is None:
                # create mock file
                if not os.path.exists(file_path):
                    f = open(file_path, 'w')
                    f.write("contents")  # FIXME
                    f.close()

            else:
                file_path = os.path.join(work_dir, filename)
                if not os.path.exists(file_path):
                    outfile = download_url(job,
                                           url,
                                           work_dir=work_dir,
                                           name=filename,
                                           mock=False)
                assert os.path.exists(file_path)
        return

    if not container_name:
        container_name = _get_container_name(job)
    base_docker_call = [
        'docker', 'run', '--log-driver=none', '-v',
        '{}:/data'.format(os.path.abspath(work_dir))
    ]
    if mounts:
        require(isinstance(mounts, dict),
                "'mounts' parameter must be a dictionary object")
        for k, v in mounts.items():
            base_docker_call.extend(['-v', k + ':' + v])

    # Defer the permission fixing function.  We call this explicitly later on in this function, but
    # we defer it as well to handle unexpected job failure.
    job.defer(_fix_permissions, base_docker_call, tool, work_dir)

    base_docker_call.extend(['--name', container_name])
    if rm:
        base_docker_call.append('--rm')
        if defer is None:
            defer = docker_call.RM
    elif detached:
        base_docker_call += ['-d']
    # Defer the container on-exit action
    job.defer(_docker_kill, container_name, action=defer)

    if env:
        for e, v in env.items():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        base_docker_call += docker_parameters

    _log.debug("Calling docker with %s." %
               " ".join(base_docker_call + [tool] + parameters))

    call = base_docker_call + [tool] + parameters

    if outfile:
        subprocess.check_call(call, stdout=outfile)
    else:
        if check_output:
            return subprocess.check_output(call)
        else:
            subprocess.check_call(call)
    # Fix root ownership of output files
    _fix_permissions(base_docker_call, tool, work_dir)

    for filename in outputs.keys():
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert (os.path.isfile(filename))
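
The following is a minimal usage sketch, not part of the library source, showing how the variant above might be driven from inside a Toil job. The samtools image tag is reused from the earlier examples; the file names, the extra reference mount, and the Toil wiring are illustrative assumptions.

# Hypothetical usage of the docker_call variant defined above (not library code).
import os


def sort_bam(job, bam_id):
    """Sort a BAM with samtools inside Docker; paths and mounts are placeholders."""
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    docker_call(
        job=job,
        tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c',
        work_dir=work_dir,
        parameters=['sort', '-o', '/data/sorted.bam', '/data/alignment.bam'],
        rm=True,                             # defer is None, so it is promoted to docker_call.RM
        mounts={'/mnt/references': '/ref'},  # extra host:container volume besides work_dir:/data
        inputs=['alignment.bam'],
        outputs={'sorted.bam': None})        # sorted.bam is asserted to exist after the call
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sorted.bam'))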
Example 18
0
def docker_call(tool,
                parameters=None,
                work_dir='.',
                rm=True,
                env=None,
                outfile=None,
                inputs=None,
                outputs=None,
                docker_parameters=None,
                check_output=False,
                mock=None):
    """
    Calls Docker, passing along parameters and tool.

    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    :param list[str] parameters: Command line arguments to be passed to the tool
    :param str work_dir: Directory to mount into the container via `-v`. Destination convention is /data
    :param bool rm: Set to True to pass `--rm` flag.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe output of Docker call to file handle
    :param list[str] inputs: A list of the input files.
    :param dict[str,str] outputs: A dictionary containing the output files as keys, with either None
                                  or a URL as the value. The value is only used if mock=True.
    :param dict[str,str] docker_parameters: Parameters to pass to docker
    :param bool check_output: When True, this function returns docker's output
    :param bool mock: Whether to run in mock mode. If this is unset, its value is determined by
                      `mock_mode()`, which reads the corresponding environment variable. A usage
                      sketch follows this example.
    """
    from toil_lib.urls import download_url

    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    for filename in inputs:
        assert (os.path.isfile(os.path.join(work_dir, filename)))

    if mock:
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if url is None:
                # create mock file
                if not os.path.exists(file_path):
                    with open(file_path, 'w') as f:
                        f.write("contents")  # FIXME
            else:
                # fetch the expected output from the given URL instead of running the tool
                if not os.path.exists(file_path):
                    download_url(url, work_dir=work_dir, name=filename)
                assert os.path.exists(file_path)
        return

    base_docker_call = [
        'docker', 'run', '--log-driver=none', '-v',
        '{}:/data'.format(os.path.abspath(work_dir))
    ]
    if rm:
        base_docker_call.append('--rm')
    if env:
        for e, v in env.items():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        base_docker_call += docker_parameters

    _log.debug("Calling docker with %s." %
               " ".join(base_docker_call + [tool] + parameters))

    call = base_docker_call + [tool] + parameters

    try:
        if outfile:
            subprocess.check_call(call, stdout=outfile)
        else:
            if check_output:
                return subprocess.check_output(call)
            else:
                subprocess.check_call(call)
    except:
        # Panic avoids hiding the exception raised in the try block
        with panic():
            # Fix root ownership of output files even when the call fails
            _fix_permissions(base_docker_call, tool, work_dir)
    else:
        # Fix root ownership of output files
        _fix_permissions(base_docker_call, tool, work_dir)

    for filename in outputs.keys():
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert (os.path.isfile(filename))
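
For the simpler variant directly above, here is a short, hedged sketch of the two output modes it supports: piping stdout to a file handle via `outfile`, and capturing stdout via `check_output=True`. The image tag is reused from the earlier examples; the work directory and the BAM file are assumed to exist already.

# Illustrative sketch only; assumes the docker_call defined directly above, a local
# Docker daemon, and an 'alignment.bam' already present in work_dir.
import os

work_dir = os.path.abspath('scratch')
samtools = 'quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c'

# Mode 1: pipe the container's stdout into a file handle on the host.
with open(os.path.join(work_dir, 'alignment.sam'), 'w') as sam_out:
    docker_call(tool=samtools,
                work_dir=work_dir,
                parameters=['view', '-h', '/data/alignment.bam'],
                inputs=['alignment.bam'],
                outfile=sam_out)

# Mode 2: capture stdout in Python instead of writing it to a file.
flagstat = docker_call(tool=samtools,
                       work_dir=work_dir,
                       parameters=['flagstat', '/data/alignment.bam'],
                       inputs=['alignment.bam'],
                       check_output=True)
print(flagstat)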
Example 19
0
def docker_call(tool=None,
                tools=None,
                parameters=None,
                work_dir='.',
                rm=True,
                env=None,
                outfile=None,
                errfile=None,
                inputs=None,
                outputs=None,
                docker_parameters=None,
                check_output=False,
                return_stderr=False,
                mock=None):
    """
    Calls Docker, passing along parameters and tool.

    :param (str tool | str tools): Either `tool`, the name of the Docker image to be used
                     (e.g. tool='quay.io/ucsc_cgl/samtools'), OR `tools`, the Docker image in which a
                     pipe of commands is run as a single container (e.g. tools='quay.io/ucsc_cgl/samtools').
                     `tool` and `tools` are mutually exclusive parameters to docker_call.
    :param list[str] parameters: Command line arguments to be passed to the tool
    :param str work_dir: Directory to mount into the container via `-v`. Destination convention is /data
    :param bool rm: Set to True to pass `--rm` flag.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe stdout of Docker call to file handle
    :param file errfile: Pipe stderr of Docker call to file handle
    :param list[str] inputs: A list of the input files.
    :param dict[str,str] outputs: A dictionary containing the output files as keys, with either None
                                  or a URL as the value. The value is only used if mock=True.
    :param dict[str,str] docker_parameters: Parameters to pass to docker
    :param bool check_output: When True, this function returns docker's output
    :param bool return_stderr: When True, this function includes stderr in docker's output
    :param bool mock: Whether to run in mock mode. If this is unset, its value is determined by
                      `mock_mode()`, which reads the corresponding environment variable.

    Pipes in docker commands:
    Running a pipe in docker in 'pipe-in-single-container' mode produces the command structure
        docker ... image '... | ... | ...', where each '...' sub-command corresponds to one element of
        the 'parameters' argument and all sub-commands run inside the same container. This is the most
        efficient method for running a pipe of commands that all use the same docker image (a runnable
        restatement of the example below follows this code listing).
    
    Example for running command 'head -c 1M /dev/urandom | gzip | gunzip | md5sum 1>&2':
        Running 'pipe-in-single-container' mode:
            command= ['head -c 1M /dev/urandom', 'gzip', 'gunzip', 'md5sum 1>&2']
            docker_work_dir=curr_work_dir
            docker_tools='ubuntu'
            stdout = docker_call(work_dir=docker_work_dir, parameters=command, tools=docker_tools, check_output=True)
    """
    from toil_lib.urls import download_url

    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    for filename in inputs:
        assert(os.path.isfile(os.path.join(work_dir, filename)))

    if mock:
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if url is None:
                # create mock file
                if not os.path.exists(file_path):
                    with open(file_path, 'w') as f:
                        f.write("contents")  # FIXME
            else:
                # fetch the expected output from the given URL instead of running the tool
                if not os.path.exists(file_path):
                    download_url(url, work_dir=work_dir, name=filename)
                assert os.path.exists(file_path)
        return
    
    base_docker_call = ['docker', 'run',
                        '--log-driver=none',
                        '-v', '{}:/data'.format(os.path.abspath(work_dir))]
    if rm:
        base_docker_call.append('--rm')
    if env:
        for e, v in env.items():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    
    if docker_parameters:
        base_docker_call += docker_parameters
   
    require(bool(tools) != bool(tool), 'Either "tool" or "tools" must contain a value, but not both')

    # Pipe functionality
    #   each element in the parameters list must represent a sub-pipe command
    if tools:
        # If tools is set, format the docker call in 'pipe-in-single-container' mode:
        # the sub-commands are joined with ' | ' and run inside a single container.
        call = " ".join(base_docker_call + ['--entrypoint /bin/bash', tools,
                                            '-c \'{}\''.format(" | ".join(parameters))])
    else:
        call = " ".join(base_docker_call + [tool] + parameters)
    _log.debug("Calling docker with %s." % call)

    
    try:
        if outfile:
            if errfile:
                subprocess.check_call(call, stdout=outfile, stderr=errfile, shell=True)
            else:
                subprocess.check_call(call, stdout=outfile, shell=True)
        else:
            if check_output:
                if return_stderr:
                    return subprocess.check_output(call, shell=True, stderr=subprocess.STDOUT)
                else:
                    return subprocess.check_output(call, shell=True)
            else:
                subprocess.check_call(call, shell=True)
    except:
        # Panic avoids hiding the exception raised in the try block
        with panic():
            # Fix root ownership of output files even when the call fails
            _fix_permissions(base_docker_call, tool, tools, work_dir)
    else:
        # Fix root ownership of output files
        _fix_permissions(base_docker_call, tool, tools, work_dir)

    for filename in outputs.keys():
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert(os.path.isfile(filename))
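
Finally, a runnable restatement of the docstring's 'pipe-in-single-container' example for this last variant. It assumes the docker_call defined directly above and a local Docker daemon with the ubuntu image available; return_stderr=True is added because the example pipeline redirects md5sum's digest to stderr.

# Restates the docstring example: head -c 1M /dev/urandom | gzip | gunzip | md5sum 1>&2,
# run as one pipe inside a single 'ubuntu' container.
import tempfile

curr_work_dir = tempfile.mkdtemp()
command = ['head -c 1M /dev/urandom', 'gzip', 'gunzip', 'md5sum 1>&2']
stdout = docker_call(work_dir=curr_work_dir,
                     parameters=command,   # joined with ' | ' inside /bin/bash -c
                     tools='ubuntu',
                     check_output=True,
                     return_stderr=True)   # md5sum writes its digest to stderr here
print(stdout)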