def chunk_vg(xg_path, path_name, out_dir, chunks, chunk_i, overwrite):
    """ use vg find to make one chunk of the graph """
    chunk = chunks[chunk_i]
    vg_chunk_path = chunk_base_name(chunk[0], out_dir, chunk_i, ".vg")
    if overwrite or not os.path.isfile(vg_chunk_path):
        first_node = xg_path_node_id(xg_path, chunk[0], int(chunk[1]), out_dir)
        # xg_path query takes 0-based inclusive coordinates, so we
        # subtract 1 below to convert from BED chunk (0-based exclusive)
        last_node = xg_path_node_id(xg_path, chunk[0], chunk[2] - 1, out_dir)
        assert first_node > 0 and last_node >= first_node
        # todo: would be cleaner to not have to pad context here
        
        with open(vg_chunk_path, "w") as vg_chunk_path_stream:
            command = ['find', '-x', os.path.basename(xg_path), '-r', str(first_node)+':'+str(last_node), '-c', '1']
            docker_call(work_dir=out_dir, parameters=command,
                        tool='quay.io/ucsc_cgl/vg:latest',
                        outfile=vg_chunk_path_stream)
        
        # but because we got a context, manually go in and make sure
        # our path starts at first_node by deleting everything before
        left_path_padding = xg_path_predecessors(xg_path, path_name, first_node,
                                                 out_dir, context = 1)
        for destroy_id in left_path_padding:
            # destroy should take node list
            destroy_list = vg_chunk_path + ".destroy"

            with open(destroy_list, "w") as destroy_list_stream:
                command = ['vg mod -y {} {}'.format(str(destroy_id), os.path.basename(vg_chunk_path)),
                            'vg mod -o -']
                docker_call(work_dir=out_dir, parameters=command,
                            tools='quay.io/ucsc_cgl/vg:latest',
                            outfile=destroy_list_stream)
            
            run("mv {} {}".format(
                vg_chunk_path + ".destroy", vg_chunk_path))
Example #2
def call_conductor(master_ip, src, dst, memory=None, override_parameters=None):
    """
    Invokes the Conductor container to copy files between S3 and HDFS and vice versa.
    Find Conductor at https://github.com/BD2KGenomics/conductor.

    :param master_ip: The Spark leader IP address.
    :param src: URL of file to copy.
    :param dst: URL of location to copy file to.
    :param memory: Gigabytes of memory to provision for Spark driver/worker.
    :param override_parameters: Parameters passed by the user, that override our defaults.

    :type master_ip: MasterAddress
    :type src: string
    :type dst: string
    :type memory: int or None
    :type override_parameters: list of string or None
    """

    arguments = ["-C", src, dst]

    docker_call(
        rm=False,
        tool="quay.io/ucsc_cgl/conductor",
        docker_parameters=master_ip.docker_parameters(["--net=host"]),
        parameters=_make_parameters(
            master_ip,
            [],  # no conductor specific spark configuration
            memory,
            arguments,
            override_parameters),
        mock=False)
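# A minimal usage sketch for call_conductor; the S3 and HDFS URLs are placeholders
# and master_ip is assumed to be a MasterAddress as documented above.
def _example_conductor_copy(master_ip):
    call_conductor(master_ip,
                   src='s3://example-bucket/sample.bam',
                   dst='hdfs:///data/sample.bam',
                   memory=8)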
Example #3
def run_fastqc(job, r1_id, r2_id):
    """
    Run Fastqc on the input reads

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2
    :return: FileStoreID of fastQC output (tarball)
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
    parameters = ['/data/R1.fastq']
    output_names = ['R1_fastqc.html', 'R1_fastqc.zip']
    if r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['-t', '2', '/data/R2.fastq'])
        output_names.extend(['R2_fastqc.html', 'R2_fastqc.zip'])
    docker_call(
        tool=
        'quay.io/ucsc_cgl/fastqc:0.11.5--be13567d00cd4c586edf8ae47d991815c8c72a49',
        work_dir=work_dir,
        parameters=parameters)
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='fastqc.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'fastqc.tar.gz'))
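# Sketch of wiring run_fastqc into a Toil workflow as a child job; how r1_id and
# r2_id were produced is assumed, and single-end samples pass r2_id=None.
def _example_fastqc(job, r1_id, r2_id=None):
    return job.addChildJobFn(run_fastqc, r1_id, r2_id, cores=2, disk='10G').rv()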
Example #4
def run_bwa_index(job, ref_id):
    """
    Use BWA to create reference index files

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome
    :return: FileStoreIDs for BWA index files
    :rtype: tuple(str, str, str, str, str)
    """
    job.fileStore.logToMaster('Creating BWA index files')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fa'))
    command = ['index', '/data/ref.fa']
    docker_call(
        job=job,
        work_dir=work_dir,
        parameters=command,
        tool=
        'quay.io/ucsc_cgl/bwa:0.7.12--256539928ea162949d8a65ca5c79a72ef557ce7c'
    )
    ids = {}
    for output in [
            'ref.fa.amb', 'ref.fa.ann', 'ref.fa.bwt', 'ref.fa.pac', 'ref.fa.sa'
    ]:
        ids[output.split('.')[-1]] = (job.fileStore.writeGlobalFile(
            os.path.join(work_dir, output)))
    return ids['amb'], ids['ann'], ids['bwt'], ids['pac'], ids['sa']
Example #5
def run_print_reads(job,
                    table,
                    indel_bam,
                    indel_bai,
                    ref,
                    ref_dict,
                    fai,
                    mem,
                    unsafe=False):
    """
    Creates BAM that has had the base quality scores recalibrated

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str table: Recalibration table FileStoreID
    :param str indel_bam: Indel interval FileStoreID
    :param str indel_bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :param bool unsafe: If True, runs gatk UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreIDs for the recalibrated BAM and its index
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, table, indel_bam, indel_bai]
    inputs = [
        'ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.recal.table',
        'sample.indel.bam', 'sample.indel.bai'
    ]
    for file_store_id, name in zip(file_ids, inputs):
        job.fileStore.readGlobalFile(file_store_id,
                                     os.path.join(work_dir, name))
    # Call: GATK -- PrintReads
    parameters = [
        '-T', 'PrintReads', '-nct',
        str(job.cores), '-R', '/data/ref.fasta', '--emit_original_quals', '-I',
        '/data/sample.indel.bam', '-BQSR', '/data/sample.recal.table', '-o',
        '/data/sample.bqsr.bam'
    ]
    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])
    docker_call(
        tool=
        'quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
        inputs=inputs,
        outputs={
            'sample.bqsr.bam': None,
            'sample.bqsr.bai': None
        },
        work_dir=work_dir,
        parameters=parameters,
        env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write output to file store
    bam_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'sample.bqsr.bam'))
    bai_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'sample.bqsr.bai'))
    return bam_id, bai_id
Example #6
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir,
                                                      'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id,
                                 os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'),
                  (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'),
                  (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(job=job, url=url, work_dir=work_dir, name=fname)

    # Part 1: Variant Calling
    variant_command = [
        'mpileup', '-f', 'genome.fa', '-l', 'positions.tsv', '-v',
        'alignment.bam', '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR,SP',
        '-o', '/data/output.vcf.gz'
    ]
    docker_call(
        job=job,
        work_dir=work_dir,
        parameters=variant_command,
        tool=
        'quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c'
    )

    # Part 2: QC
    qc_command = [
        '-o', 'qc', '-n', 'alignment.bam', '-a', 'annotation.gtf', '-m',
        'annotation.m53'
    ]
    docker_call(
        job=job,
        work_dir=work_dir,
        parameters=qc_command,
        tool=
        'jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz',
                  file_paths=[output_tsv, output_vcf],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
Example #7
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir,
                                                      'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id,
                                 os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input file
    download_url(job=job,
                 url=inputs.gtf,
                 work_dir=work_dir,
                 name='annotation.gtf')
    download_url(job=job,
                 url=inputs.gtf_pickle,
                 work_dir=work_dir,
                 name='annotation.gtf.pickle')
    # Call Spladder
    command = [
        '--insert_ir=y', '--insert_es=y', '--insert_ni=y', '--remove_se=n',
        '--validate_sg=n', '-b', 'alignment.bam', '-o ', '/data', '-a',
        'annotation.gtf', '-v', 'y', '-c', '3', '-M', 'single', '-T', 'n',
        '-n', '50', '-P', 'y', '-p', 'n', '--sparse_bam', 'y'
    ]
    docker_call(job=job,
                work_dir=work_dir,
                parameters=command,
                sudo=inputs.sudo,
                tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, ' ', 'spladder',
                                 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    print os.listdir(work_dir)
    tarball_files('spladder.tar.gz',
                  file_paths=[output_pickle, output_filt, output],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'spladder.tar.gz'))
Example #8
def run_rsem(job, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=rsem_ref_url, name='rsem_ref.tar.gz', work_dir=work_dir)
    subprocess.check_call([
        'tar', '-xvf',
        os.path.join(work_dir, 'rsem_ref.tar.gz'), '-C', work_dir
    ])
    os.remove(os.path.join(work_dir, 'rsem_ref.tar.gz'))
    # Determine tarball structure - based on it, ascertain folder name and rsem reference prefix
    rsem_files = []
    for root, directories, files in os.walk(work_dir):
        rsem_files.extend([os.path.join(root, x) for x in files])
    # "grp" is a required RSEM extension that should exist in the RSEM reference
    ref_prefix = [
        os.path.basename(os.path.splitext(x)[0]) for x in rsem_files
        if 'grp' in x
    ][0]
    ref_folder = os.path.join('/data',
                              os.listdir(work_dir)[0]) if len(
                                  os.listdir(work_dir)) == 1 else '/data'
    # I/O
    job.fileStore.readGlobalFile(bam_id,
                                 os.path.join(work_dir, 'transcriptome.bam'))
    output_prefix = 'rsem'
    # Call: RSEM
    parameters = [
        '--quiet', '--no-qualities', '-p',
        str(job.cores), '--forward-prob', '0.5', '--seed-length', '25',
        '--fragment-length-mean', '-1.0', '--bam', '/data/transcriptome.bam',
        os.path.join(ref_folder, ref_prefix), output_prefix
    ]
    if paired:
        parameters = ['--paired-end'] + parameters
    docker_call(
        tool=
        'quay.io/ucsc_cgl/rsem:1.2.25--d4275175cc8df36967db460b06337a14f40d2f21',
        parameters=parameters,
        work_dir=work_dir)
    os.rename(os.path.join(work_dir, output_prefix + '.genes.results'),
              os.path.join(work_dir, 'rsem_gene.tab'))
    os.rename(os.path.join(work_dir, output_prefix + '.isoforms.results'),
              os.path.join(work_dir, 'rsem_isoform.tab'))
    # Write to FileStore
    gene_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rsem_gene.tab'))
    isoform_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rsem_isoform.tab'))
    return gene_id, isoform_id
Example #9
def run_star(job, r1_id, r2_id, star_index_url, wiggle=False):
    """
    Performs alignment of fastqs to bam via STAR

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: URL of STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :return: FileStoreIDs for the transcriptome BAM, sorted BAM, and wiggle file (if requested)
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    os.remove(os.path.join(work_dir, 'starIndex.tar.gz'))
    # Determine tarball structure - star index contents are either in a subdirectory or at the top level of the tarball
    star_index = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # Parameter handling for paired / single-end data
    parameters = ['--runThreadN', str(job.cores),
                  '--genomeDir', star_index,
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1']
    if wiggle:
        parameters.extend(['--outWigType', 'bedGraph',
                           '--outWigStrand', 'Unstranded',
                           '--outWigReferencesPrefix', 'chr'])
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq'])
    # Call: STAR Mapping
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Write to fileStore
    transcriptome_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    sorted_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    if wiggle:
        wiggle_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaSignal.UniqueMultiple.str1.out.bg'))
        return transcriptome_id, sorted_id, wiggle_id
    else:
        return transcriptome_id, sorted_id
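# Sketch of chaining run_star into run_rsem within Toil; the resource values are
# placeholders and this wiring is an assumption, not the pipeline's actual layout.
def _example_star_then_rsem(job, r1_id, r2_id, star_index_url, rsem_ref_url):
    star = job.addChildJobFn(run_star, r1_id, r2_id, star_index_url,
                             cores=8, disk='100G')
    # run_star returns the transcriptome BAM first, so rv(0) selects it
    return star.addFollowOnJobFn(run_rsem, star.rv(0), rsem_ref_url,
                                 paired=bool(r2_id)).rv()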
Example #10
def run_rsem_postprocess(job, uuid, rsem_gene_id, rsem_isoform_id):
    """
    Parses RSEM's output to produce separate .tab files (TPM, FPKM, counts) for both gene and isoform.
    These are two-column files: Genes and Quantifications.
    HUGO-mapped versions are also provided, converted from Gencode/ENSEMBL names.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: UUID to mark the samples with
    :param str rsem_gene_id: FileStoreID of RSEM gene-level output
    :param str rsem_isoform_id: FileStoreID of RSEM isoform-level output
    :return: FileStoreIDs for the RSEM and HUGO post-process tarballs
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    # I/O
    job.fileStore.readGlobalFile(rsem_gene_id,
                                 os.path.join(work_dir, 'rsem_gene.tab'),
                                 mutable=True)
    job.fileStore.readGlobalFile(rsem_isoform_id,
                                 os.path.join(work_dir, 'rsem_isoform.tab'),
                                 mutable=True)
    # Convert RSEM files into individual .tab files.
    docker_call(tool='jvivian/rsem_postprocess',
                parameters=[uuid],
                work_dir=work_dir)
    os.rename(os.path.join(work_dir, 'rsem_gene.tab'),
              os.path.join(work_dir, 'rsem_genes.results'))
    os.rename(os.path.join(work_dir, 'rsem_isoform.tab'),
              os.path.join(work_dir, 'rsem_isoforms.results'))
    output_files = [
        'rsem.genes.norm_counts.tab', 'rsem.genes.raw_counts.tab',
        'rsem.isoform.norm_counts.tab', 'rsem.isoform.raw_counts.tab',
        'rsem_genes.results', 'rsem_isoforms.results'
    ]
    # Perform HUGO gene / isoform name mapping
    genes = [x for x in output_files if 'rsem.genes' in x]
    isoforms = [x for x in output_files if 'rsem.isoform' in x]
    command = ['-g'] + genes + ['-i'] + isoforms
    docker_call(tool='jvivian/gencode_hugo_mapping',
                parameters=command,
                work_dir=work_dir)
    hugo_files = [
        os.path.splitext(x)[0] + '.hugo' + os.path.splitext(x)[1]
        for x in genes + isoforms
    ]
    # Create tarballs for outputs
    tarball_files('rsem.tar.gz',
                  file_paths=[os.path.join(work_dir, x) for x in output_files],
                  output_dir=work_dir)
    tarball_files('rsem_hugo.tar.gz',
                  [os.path.join(work_dir, x) for x in hugo_files],
                  output_dir=work_dir)
    rsem_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rsem.tar.gz'))
    hugo_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    return rsem_id, hugo_id
Example #11
def run_realigner_target_creator(job,
                                 bam,
                                 bai,
                                 ref,
                                 ref_dict,
                                 fai,
                                 phase,
                                 mills,
                                 mem,
                                 unsafe=False):
    """
    Creates intervals file needed for indel realignment

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: Sample BAM FileStoreID
    :param str bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str phase: Phase VCF FileStoreID
    :param str mills: Mills VCF FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :param bool unsafe: If True, runs gatk UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreID for the realignment intervals file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, bam, bai, phase, mills]
    inputs = [
        'ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.bam',
        'sample.bam.bai', 'phase.vcf', 'mills.vcf'
    ]
    for file_store_id, name in zip(file_ids, inputs):
        job.fileStore.readGlobalFile(file_store_id,
                                     os.path.join(work_dir, name))
    # Call: GATK -- RealignerTargetCreator
    parameters = [
        '-T', 'RealignerTargetCreator', '-nt',
        str(job.cores), '-R', '/data/ref.fasta', '-I', '/data/sample.bam',
        '-known', '/data/phase.vcf', '-known', '/data/mills.vcf',
        '--downsampling_type', 'NONE', '-o', '/data/sample.intervals'
    ]
    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])
    docker_call(
        tool=
        'quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
        inputs=inputs,
        outputs={'sample.intervals': None},
        work_dir=work_dir,
        parameters=parameters,
        env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write to fileStore
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'sample.intervals'))
Example #12
def run_bam_qc(job, aligned_bam_id, config):
    """
    Run BAM QC as specified by California Kids Cancer Comparison (CKCC)

    :param JobFunctionWrappingJob job:
    :param str aligned_bam_id: FileStoreID of sorted bam from STAR
    :param Namespace config: Argparse Namespace object containing argument inputs
        Must contain:
            config.uuid str: UUID of input sample
            config.save_bam bool: True/False depending on whether to save bam
            config.output_dir str: Path to save bam
            config.ssec str: Path to encryption key for secure upload to S3
    :return: boolean fail flag and FileStoreID for the QC output tarball
    :rtype: tuple(bool, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(
        aligned_bam_id,
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    docker_call(tool='hbeale/treehouse_bam_qc:1.0',
                work_dir=work_dir,
                parameters=['runQC.sh', str(job.cores)])

    # Tar Output files
    output_names = [
        'readDist.txt', 'rnaAligned.out.md.sorted.geneBodyCoverage.curves.pdf',
        'rnaAligned.out.md.sorted.geneBodyCoverage.txt'
    ]
    if os.path.exists(os.path.join(work_dir, 'readDist.txt_PASS_qc.txt')):
        output_names.append('readDist.txt_PASS_qc.txt')
        fail_flag = False
    else:
        output_names.append('readDist.txt_FAIL_qc.txt')
        fail_flag = True
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='bam_qc.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)

    # Save output BAM
    if config.save_bam:
        bam_path = os.path.join(work_dir, 'rnaAligned.sortedByCoord.md.bam')
        new_bam_path = os.path.join(work_dir,
                                    config.uuid + '.sortedByCoord.md.bam')
        os.rename(bam_path, new_bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=new_bam_path,
                        s3_dir=config.output_dir,
                        s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[new_bam_path], output_dir=config.output_dir)

    return fail_flag, job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'bam_qc.tar.gz'))
Example #13
def run_base_recalibration(job,
                           indel_bam,
                           indel_bai,
                           ref,
                           ref_dict,
                           fai,
                           dbsnp,
                           mem,
                           unsafe=False):
    """
    Creates recal table used in Base Quality Score Recalibration

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str indel_bam: Indel interval FileStoreID
    :param str indel_bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str dbsnp: DBSNP VCF FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :param bool unsafe: If True, runs gatk UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreID for the recalibration table
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, indel_bam, indel_bai, dbsnp]
    inputs = [
        'ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.indel.bam',
        'sample.indel.bai', 'dbsnp.vcf'
    ]
    for file_store_id, name in zip(file_ids, inputs):
        job.fileStore.readGlobalFile(file_store_id,
                                     os.path.join(work_dir, name))
    # Call: GATK -- BaseRecalibrator
    parameters = [
        '-T', 'BaseRecalibrator', '-nct',
        str(job.cores), '-R', '/data/ref.fasta', '-I',
        '/data/sample.indel.bam', '-knownSites', '/data/dbsnp.vcf', '-o',
        '/data/sample.recal.table'
    ]
    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])
    docker_call(
        tool=
        'quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
        inputs=inputs,
        outputs={'sample.recal.table': None},
        work_dir=work_dir,
        parameters=parameters,
        env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write output to file store
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'sample.recal.table'))
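# Sketch of the BQSR chain: run_base_recalibration produces the recalibration table
# consumed by run_print_reads defined above. The memory value is a placeholder.
def _example_bqsr(job, indel_bam, indel_bai, ref, ref_dict, fai, dbsnp):
    table = job.addChildJobFn(run_base_recalibration, indel_bam, indel_bai,
                              ref, ref_dict, fai, dbsnp, mem='10G')
    return table.addFollowOnJobFn(run_print_reads, table.rv(), indel_bam, indel_bai,
                                  ref, ref_dict, fai, mem='10G').rv()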
def download_bam(job, gdc_id, disk='40G'):
    work_dir = job.fileStore.getLocalTempDir()
    output_dir = os.path.join(work_dir, gdc_id)

    job.fileStore.logToMaster('Downloading: ' + gdc_id)
    parameters = ['download', '-d', '/data', gdc_id]
    docker_call(tool='jvivian/gdc-client', work_dir=work_dir, parameters=parameters)

    sample = glob(os.path.join(output_dir, '*.bam'))[0]
    bam_id = job.fileStore.writeGlobalFile(sample)

    job.addChildJobFn(process_bam_and_upload, bam_id, gdc_id, disk='80G')
def star(job, inputs, r1_cutadapt, r2_cutadapt):
    """
    Performs alignment of fastqs to BAM via STAR

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str r1_cutadapt: FileStore ID of read 1 fastq
    :param str r2_cutadapt: FileStore ID of read 2 fastq
    """
    job.fileStore.logToMaster('Aligning with STAR: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(inputs.cores, 16)
    # Retrieve files
    job.fileStore.readGlobalFile(r1_cutadapt, os.path.join(work_dir, 'R1_cutadapt.fastq'))
    job.fileStore.readGlobalFile(r2_cutadapt, os.path.join(work_dir, 'R2_cutadapt.fastq'))
    # Get starIndex
    download_url(inputs.star_index, work_dir, 'starIndex.tar.gz')
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    # Parameters
    parameters = ['--runThreadN', str(cores),
                  '--genomeDir', '/data/starIndex',
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1',
                  '--readFilesIn', '/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq']
    # Call: STAR Map
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Call Samtools Index
    index_command = ['index', '/data/rnaAligned.sortedByCoord.out.bam']
    docker_call(work_dir=work_dir, parameters=index_command,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # fileStore
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    bai_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam.bai'))
    job.fileStore.deleteGlobalFile(r1_cutadapt)
    job.fileStore.deleteGlobalFile(r2_cutadapt)
    # Launch children and follow-on
    vcqc_id = job.addChildJobFn(variant_calling_and_qc, inputs, bam_id, bai_id, cores=2, disk='30G').rv()
    spladder_id = job.addChildJobFn(spladder, inputs, bam_id, bai_id, disk='30G').rv()
    job.addFollowOnJobFn(consolidate_output_tarballs, inputs, vcqc_id, spladder_id, disk='30G')
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input file
    download_url(url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call Spladder
    command = ['--insert_ir=y',
               '--insert_es=y',
               '--insert_ni=y',
               '--remove_se=n',
               '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o ', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(work_dir=work_dir, parameters=command, sudo=inputs.sudo, tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, ' ', 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    print os.listdir(work_dir)
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
Example #17
def gatk_combine_variants(job,
                          vcfs,
                          ref_fasta,
                          ref_fai,
                          ref_dict,
                          merge_option='UNIQUIFY'):
    """
    Merges VCF files using GATK CombineVariants

    :param JobFunctionWrappingJob job: Toil Job instance
    :param dict vcfs: Dictionary of VCF FileStoreIDs {sample identifier: FileStoreID}
    :param str ref_fasta: FileStoreID for reference genome fasta
    :param str ref_fai: FileStoreID for reference genome index file
    :param str ref_dict: FileStoreID for reference genome sequence dictionary file
    :param str merge_option: Value for --genotypemergeoption flag (Default: 'UNIQUIFY')
                            'UNIQUIFY': Multiple variants at a single site are merged into a
                                        single variant record.
                            'UNSORTED': Used to merge VCFs from the same sample
    :return: FileStoreID for merged VCF file
    :rtype: str
    """
    job.fileStore.logToMaster('Running GATK CombineVariants')

    inputs = {
        'genome.fa': ref_fasta,
        'genome.fa.fai': ref_fai,
        'genome.dict': ref_dict
    }
    inputs.update(vcfs)

    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id,
                                     os.path.join(work_dir, name))

    command = [
        '-T', 'CombineVariants', '-R', '/data/genome.fa', '-o',
        '/data/merged.vcf', '--genotypemergeoption', merge_option
    ]

    for uuid, vcf_id in vcfs.iteritems():
        command.extend(['--variant', os.path.join('/data', uuid)])

    docker_call(
        work_dir=work_dir,
        env={'JAVA_OPTS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
        parameters=command,
        tool=
        'quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
        inputs=inputs.keys(),
        outputs={'merged.vcf': None})

    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'merged.vcf'))
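# Sketch of the vcfs dictionary gatk_combine_variants expects: per-sample VCF file
# names mapped to FileStoreIDs. The names and the wiring below are placeholders.
def _example_combine_variants(job, mutect_vcf_id, pindel_vcf_id,
                              ref_fasta, ref_fai, ref_dict):
    vcfs = {'mutect.vcf': mutect_vcf_id, 'pindel.vcf': pindel_vcf_id}
    return job.addChildJobFn(gatk_combine_variants, vcfs, ref_fasta, ref_fai,
                             ref_dict, merge_option='UNIQUIFY').rv()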
Example #18
def run_merge_vcf(job, options, index_dir_id, vcf_file_key_list):

    RealTimeLogger.get().info("Completed gam merging and gam path variant calling.")
    RealTimeLogger.get().info("Starting vcf merging vcf files.")
    # Set up the IO stores each time, since we can't unpickle them on Azure for
    # some reason.
    input_store = IOStore.get(options.input_store)
    out_store = IOStore.get(options.out_store)

    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()
    
    # Download local input files from the remote storage container
    graph_dir = work_dir
    read_global_directory(job.fileStore, index_dir_id, graph_dir)

    vcf_merging_file_key_list = [] 
    for vcf_file_key in vcf_file_key_list:
        vcf_file = "{}/{}.gz".format(work_dir, vcf_file_key)
        vcf_file_idx = "{}.tbi".format(vcf_file)
        out_store.read_input_file(vcf_file_key+".gz", vcf_file)
        out_store.read_input_file(vcf_file_key+".gz"+ ".tbi", vcf_file_idx)
        vcf_merging_file_key_list.append(os.path.basename(vcf_file))

    vcf_merged_file_key = "" 
    if len(vcf_merging_file_key_list) > 1:
        # merge vcf files
        vcf_merged_file_key = "{}.vcf.gz".format(options.sample_name)
        command=['bcftools', 'concat', '-O', 'z', '-o', os.path.basename(vcf_merged_file_key), ' '.join(vcf_merging_file_key_list)]
        docker_call(work_dir=work_dir, parameters=command,
                    tool='quay.io/cmarkello/bcftools')
        command=['bcftools', 'tabix', '-f', '-p', 'vcf', os.path.basename(vcf_merged_file_key)]
        docker_call(work_dir=work_dir, parameters=command,
                    tool='quay.io/cmarkello/bcftools')
    else:
        vcf_merged_file_key = vcf_merging_file_key_list[0]

    # save variant calling results to the output store
    vcf_file = "{}/{}".format(work_dir, vcf_merged_file_key)
    vcf_file_idx = "{}/{}.tbi".format(work_dir, vcf_merged_file_key)

    out_store.write_output_file(vcf_file, vcf_merged_file_key)
    out_store.write_output_file(vcf_file_idx, vcf_merged_file_key + ".tbi")

    
    #Run downloader to download output IO store files to local output directory.
    vcf_file_id = job.fileStore.writeGlobalFile(vcf_file)
    vcf_file_idx_id = job.fileStore.writeGlobalFile(vcf_file_idx) 
    downloadList = [[vcf_file_id, vcf_merged_file_key], [vcf_file_idx_id, vcf_merged_file_key+".tbi"]]

    return downloadList
def merge_vcf_chunks(job, options, index_dir_id, path_name, path_size, chunks, overwrite):
    """ merge a bunch of clipped vcfs created above, taking care to 
    fix up the headers.  everything expected to be sorted already """
    
    # Set up the IO stores each time, since we can't unpickle them on Azure for
    # some reason.
    input_store = IOStore.get(options.input_store)
    out_store = IOStore.get(options.out_store)   

    # Define work directory for docker calls
    out_dir = job.fileStore.getLocalTempDir()
    
    # Download local input files from the remote storage container
    read_global_directory(job.fileStore, index_dir_id, out_dir)

    vcf_path = os.path.join(out_dir, path_name + ".vcf")
    
    if overwrite or not os.path.isfile(vcf_path):
        first = True
        for chunk_i, chunk in enumerate(chunks):
            clip_path = chunk_base_name(path_name, out_dir, chunk_i, "_clip.vcf")
            # Download clip.vcf file
            out_store.read_input_file(os.path.basename(clip_path), clip_path)

            if os.path.isfile(clip_path):
                if first is True:
                    # copy everything including the header
                    run("cat {} > {}".format(clip_path, vcf_path))
                    first = False
                else:
                    # add on everything but the header
                    run("grep -v \"^#\" {} >> {}".format(clip_path, vcf_path), check=False)

    # add a compressed indexed version
    if overwrite or not os.path.isfile(vcf_path + ".gz"):
        vcf_gz_file = vcf_path + ".gz"
        with open(vcf_gz_file, "w") as vcf_gz_file_stream:
            command=['bgzip', '-c', '{}'.format(os.path.basename(vcf_path))]
            docker_call(work_dir=out_dir, parameters=command,
                        tool='quay.io/cmarkello/htslib:latest',
                        outfile=vcf_gz_file_stream)
        command=['bcftools', 'tabix', '-f', '-p', 'vcf', '{}'.format(os.path.basename(vcf_path+".gz"))]
        docker_call(work_dir=out_dir, parameters=command,
                    tool='quay.io/cmarkello/bcftools:latest')

    # Save merged vcf files to the output store
    out_store.write_output_file(vcf_path, os.path.basename(vcf_path))
    out_store.write_output_file(vcf_path+".gz", os.path.basename(vcf_path+".gz"))
    out_store.write_output_file(vcf_path+".gz.tbi", os.path.basename(vcf_path+".gz.tbi"))
    
    return os.path.basename(vcf_path)
Example #20
def test_docker_call(tmpdir):
    from toil_lib.programs import docker_call
    work_dir = str(tmpdir)
    parameter = ['--help']
    tool = 'quay.io/ucsc_cgl/samtools'
    docker_call(work_dir=work_dir, parameters=parameter, tool=tool)
    # Test outfile
    fpath = os.path.join(work_dir, 'test')
    with open(fpath, 'w') as f:
        docker_call(tool='ubuntu',
                    env=dict(foo='bar'),
                    parameters=['printenv', 'foo'],
                    outfile=f)
    assert open(fpath).read() == 'bar\n'
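# Companion sketch: capturing container stdout instead of writing it to a file,
# assuming this docker_call supports check_output=True as the vg helpers further
# down use it; the tool and parameters are placeholders.
def _example_docker_stdout(work_dir):
    return docker_call(work_dir=work_dir,
                       parameters=['--help'],
                       tool='quay.io/ucsc_cgl/samtools',
                       check_output=True)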
Example #21
def run_kallisto(job, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of Kallisto index file
    :return: FileStoreID from Kallisto output
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=kallisto_index_url,
                 name='kallisto_hg38.idx',
                 work_dir=work_dir)
    # Retrieve files
    parameters = [
        'quant', '-i', '/data/kallisto_hg38.idx', '-t',
        str(job.cores), '-o', '/data/', '-b', '100'
    ]
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(
            r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        job.fileStore.readGlobalFile(
            r2_id, os.path.join(work_dir, 'R2_cutadapt.fastq'))
        parameters.extend(
            ['/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq'])
    else:
        job.fileStore.readGlobalFile(
            r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        parameters.extend(
            ['--single', '-l', '200', '-s', '15', '/data/R1_cutadapt.fastq'])

    # Call: Kallisto
    docker_call(
        job=job,
        tool=
        'quay.io/ucsc_cgl/kallisto:0.42.4--35ac87df5b21a8e8e8d159f26864ac1e1db8cf86',
        work_dir=work_dir,
        parameters=parameters)
    # Tar output files together and store in fileStore
    output_files = [
        os.path.join(work_dir, x)
        for x in ['run_info.json', 'abundance.tsv', 'abundance.h5']
    ]
    tarball_files(tar_name='kallisto.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'kallisto.tar.gz'))
Example #22
def gatk_variant_filtration(job, vcf_id, filter_name, filter_expression, ref_fasta, ref_fai, ref_dict):
    """
    Filters VCF file using GATK VariantFiltration. Fixes extra pair of quotation marks in VCF header that
    may interfere with other VCF tools.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str vcf_id: FileStoreID for input VCF file
    :param str filter_name: Name of filter for VCF header
    :param str filter_expression: JEXL filter expression
    :param str ref_fasta: FileStoreID for reference genome fasta
    :param str ref_fai: FileStoreID for reference genome index file
    :param str ref_dict: FileStoreID for reference genome sequence dictionary file
    :return: FileStoreID for filtered VCF file
    :rtype: str
    """
    inputs = {'genome.fa': ref_fasta,
              'genome.fa.fai': ref_fai,
              'genome.dict': ref_dict,
              'input.vcf': vcf_id}

    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))

    command = ['-T', 'VariantFiltration',
               '-R', 'genome.fa',
               '-V', 'input.vcf',
               '--filterName', filter_name,   # Documents filter name in header
               '--filterExpression', filter_expression,
               '-o', 'filtered_variants.vcf']

    job.fileStore.logToMaster('Running GATK VariantFiltration using {name}: '
                              '{expression}'.format(name=filter_name, expression=filter_expression))

    docker_call(job=job, work_dir=work_dir,
                env={'JAVA_OPTS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
                parameters=command,
                tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs.keys(),
                outputs={'filtered_variants.vcf': None})

    # Remove extra quotation marks around filter expression.
    malformed_header = os.path.join(work_dir, 'filtered_variants.vcf')
    fixed_header = os.path.join(work_dir, 'fixed_header.vcf')
    filter_regex = re.escape('"%s"' % filter_expression)
    with open(malformed_header, 'r') as f, open(fixed_header, 'w') as g:
        for line in f:
            g.write(re.sub(filter_regex, filter_expression, line))

    return job.fileStore.writeGlobalFile(fixed_header)
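# Usage sketch for gatk_variant_filtration with a simple JEXL expression; the
# filter name and expression are illustrative, not a recommended filtering policy.
def _example_variant_filtration(job, vcf_id, ref_fasta, ref_fai, ref_dict):
    return job.addChildJobFn(gatk_variant_filtration, vcf_id,
                             'LowQD', 'QD < 2.0',
                             ref_fasta, ref_fai, ref_dict).rv()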
Example #23
def run_pindel(job, normal_bam, normal_bai, tumor_bam, tumor_bai, ref, fai):
    """
    Calls Pindel to detect indels / deletions

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str normal_bam: Normal BAM FileStoreID
    :param str normal_bai: Normal BAM index FileStoreID
    :param str tumor_bam: Tumor BAM FileStoreID
    :param str tumor_bai: Tumor BAM Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str fai: Reference index FileStoreID
    :return: Pindel output (tarball) FileStoreID
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [normal_bam, normal_bai, tumor_bam, tumor_bai, ref, fai]
    file_names = [
        'normal.bam', 'normal.bai', 'tumor.bam', 'tumor.bai', 'ref.fasta',
        'ref.fasta.fai'
    ]
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id,
                                     os.path.join(work_dir, name))
    # Create Pindel config
    with open(os.path.join(work_dir, 'pindel-config.txt'), 'w') as f:
        for bam in ['normal', 'tumor']:
            f.write('/data/{} {} {}\n'.format(
                bam + '.bam', get_mean_insert_size(work_dir, bam + '.bam'),
                bam))
    # Call: Pindel
    parameters = [
        '-f', '/data/ref.fasta', '-i', '/data/pindel-config.txt',
        '--number_of_threads',
        str(job.cores), '--minimum_support_for_event', '3',
        '--report_long_insertions', 'true', '--report_breakpoints', 'true',
        '-o', 'pindel'
    ]
    docker_call(
        tool=
        'quay.io/ucsc_cgl/pindel:0.2.5b6--4e8d1b31d4028f464b3409c6558fb9dfcad73f88',
        work_dir=work_dir,
        parameters=parameters)
    # Collect output files and write to file store
    output_files = glob(os.path.join(work_dir, 'pindel*'))
    tarball_files('pindel.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'pindel.tar.gz'))
Example #24
def download_and_transfer_sample(job, sample, inputs):
    """
    Downloads a sample from CGHub via GeneTorrent, then uses S3AM to transfer it to S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param tuple sample: Sample tuple; the first element is a CGHub analysis ID
    :param dict inputs: Dictionary of input arguments
    """
    analysis_id = sample[0]
    work_dir = job.fileStore.getLocalTempDir()
    folder_path = os.path.join(work_dir, os.path.basename(analysis_id))
    # Acquire genetorrent key and download sample
    shutil.copy(inputs['genetorrent_key'], os.path.join(work_dir, 'cghub.key'))
    parameters = ['-vv', '-c', 'cghub.key', '-d', analysis_id]
    docker_call(
        job=job,
        tool=
        'quay.io/ucsc_cgl/genetorrent:3.8.7--9911761265b6f08bc3ef09f53af05f56848d805b',
        work_dir=work_dir,
        parameters=parameters)
    try:
        sample = glob.glob(os.path.join(folder_path, '*tar*'))[0]
    except IndexError as e:
        print 'No tarfile found inside of folder: {}'.format(e)
        raise
    # Upload sample to S3AM
    key_path = inputs['ssec']
    if sample.endswith('gz'):
        sample_name = analysis_id + '.tar.gz'
        shutil.move(sample, os.path.join(work_dir, sample_name))
    else:
        sample_name = analysis_id + '.tar'
        shutil.move(sample, os.path.join(work_dir, sample_name))
    # Parse s3_dir to get bucket and s3 path
    s3_dir = inputs['s3_dir']
    bucket_name = s3_dir.lstrip('/').split('/')[0]
    base_url = 'https://s3-us-west-2.amazonaws.com/'
    url = os.path.join(base_url, bucket_name, sample_name)
    # Generate keyfile for upload
    with open(os.path.join(work_dir, 'temp.key'), 'wb') as f_out:
        f_out.write(generate_unique_key(key_path, url))
    # Upload to S3 via S3AM
    s3am_command = [
        's3am', 'upload', '--sse-key-file',
        os.path.join(work_dir, 'temp.key'),
        'file://{}'.format(os.path.join(work_dir, sample_name)),
        's3://' + bucket_name + '/'
    ]
    subprocess.check_call(s3am_command)
Example #25
def run_muse(job, normal_bam, normal_bai, tumor_bam, tumor_bai, ref, ref_dict,
             fai, dbsnp):
    """
    Calls MuSE to find variants

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str normal_bam: Normal BAM FileStoreID
    :param str normal_bai: Normal BAM index FileStoreID
    :param str tumor_bam: Tumor BAM FileStoreID
    :param str tumor_bai: Tumor BAM Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference genome dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str dbsnp: DBSNP VCF FileStoreID
    :return: MuSe output (tarball) FileStoreID
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [
        normal_bam, normal_bai, tumor_bam, tumor_bai, ref, ref_dict, fai, dbsnp
    ]
    file_names = [
        'normal.bam', 'normal.bai', 'tumor.bam', 'tumor.bai', 'ref.fasta',
        'ref.dict', 'ref.fasta.fai', 'dbsnp.vcf'
    ]
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id,
                                     os.path.join(work_dir, name))
    # Call: MuSE
    parameters = [
        '--mode', 'wxs', '--dbsnp', '/data/dbsnp.vcf', '--fafile',
        '/data/ref.fasta', '--tumor-bam', '/data/tumor.bam',
        '--tumor-bam-index', '/data/tumor.bai', '--normal-bam',
        '/data/normal.bam', '--normal-bam-index', '/data/normal.bai',
        '--outfile', '/data/muse.vcf', '--cpus',
        str(job.cores)
    ]
    docker_call(
        tool=
        'quay.io/ucsc_cgl/muse:1.0--6add9b0a1662d44fd13bbc1f32eac49326e48562',
        work_dir=work_dir,
        parameters=parameters)
    # Return fileStore ID
    tarball_files('muse.tar.gz',
                  file_paths=[os.path.join(work_dir, 'muse.vcf')],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'muse.tar.gz'))
def xg_path_predecessors(xg_path, path_name, node_id, out_dir, context = 1):
    """ get nodes before given node in a path. """
    
    stdout = ''
    command = ['vg find -x {} -n {} -c {}'.format(os.path.basename(xg_path), str(node_id), str(context)),
                'vg view -j -']
    stdout = docker_call(work_dir=out_dir, parameters=command,
                tools='quay.io/ucsc_cgl/vg:latest',
                check_output=True)
    
    # get our json graph
    j = json.loads(stdout)
    paths = j["path"]
    path = [x for x in paths if x["name"] == path_name][0]
    mappings = path["mapping"]
    assert len(mappings) > 0
    # check that we have a node_mapping
    assert len([x for x in mappings if x["position"]["node_id"] == node_id]) == 1
    # collect mappings that come before
    out_ids = []
    for mapping in mappings:
        if mapping["position"]["node_id"] == node_id:
            break
        out_ids.append(mapping["position"]["node_id"])
    return out_ids
def process_bam_and_upload(job, bam_id, gdc_id, disk='80G'):
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'input.bam'))

    parameters = ['fastq', '-1', '/data/R1.fastq', '-2', '/data/R2.fastq', '/data/input.bam']
    docker_call(tool='quay.io/ucsc_cgl/samtools', work_dir=work_dir, parameters=parameters)

    subprocess.check_call(['gzip', os.path.join(work_dir, 'R1.fastq')])
    subprocess.check_call(['gzip', os.path.join(work_dir, 'R2.fastq')])

    out_tar = os.path.join(work_dir, gdc_id + '.tar.gz')
    with tarfile.open(out_tar, 'w:gz') as tar:
        for name in [os.path.join(work_dir, x) for x in ['R1.fastq.gz', 'R2.fastq.gz']]:
            tar.add(name, arcname=os.path.basename(name))

    s3am_upload(out_tar, s3_dir='s3://cgl-ccle-data/')
Example #28
def run_oncotator(job, vcf_id, oncotator_db):
    """
    Uses Oncotator to add cancer relevant variant annotations to a VCF file. Oncotator can accept
    other genome builds, but the output VCF is based on hg19.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str vcf_id: FileStoreID for VCF file
    :param str oncotator_db: FileStoreID for Oncotator database
    :return: Annotated VCF FileStoreID
    :rtype: str
    """
    job.fileStore.logToMaster('Running Oncotator')

    inputs = {'input.vcf': vcf_id, 'oncotator_db': oncotator_db}

    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        inputs[name] = job.fileStore.readGlobalFile(
            file_store_id, os.path.join(work_dir, name))

    # The Oncotator database may be tar/gzipped
    if tarfile.is_tarfile(inputs['oncotator_db']):
        tar = tarfile.open(inputs['oncotator_db'])
        tar.extractall(path=work_dir)
        # Get the extracted database directory name
        inputs['oncotator_db'] = tar.getmembers()[0].name
        tar.close()

    command = [
        '-i', 'VCF', '-o', 'VCF', '--db-dir', inputs['oncotator_db'],
        'input.vcf', 'annotated.vcf', 'hg19'
    ]  # Oncotator annotations are based on hg19

    docker_call(
        job=job,
        work_dir=work_dir,
        env={
            '_JAVA_OPTIONS':
            '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)
        },
        parameters=command,
        tool='jpfeil/oncotator:1.9--8fffc356981862d50cfacd711b753700b886b605',
        inputs=inputs.keys(),
        outputs={'annotated.vcf': None})

    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'annotated.vcf'))
Example #29
def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
    """
    Adapter trimming for RNA-seq data

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2 (if paired data)
    :param str fwd_3pr_adapter: Adapter sequence for the forward 3' adapter
    :param str rev_3pr_adapter: Adapter sequence for the reverse 3' adapter (second fastq pair)
    :return: R1 and R2 FileStoreIDs
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    if r2_id:
        require(rev_3pr_adapter,
                "Paired end data requires a reverse 3' adapter sequence.")
    # Retrieve files
    parameters = ['-a', fwd_3pr_adapter, '-m', '35']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend([
            '-A', rev_3pr_adapter, '-o', '/data/R1_cutadapt.fastq', '-p',
            '/data/R2_cutadapt.fastq', '/data/R1.fastq', '/data/R2.fastq'
        ])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])
    # Call: CutAdapt
    docker_call(
        job=job,
        tool='quay.io/ucsc_cgl/cutadapt:1.9--6bd44edd2b8f8f17e25c5a268fedaab65fa851d2',
        work_dir=work_dir,
        parameters=parameters)
    # Write to fileStore
    r1_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1_cutadapt.fastq'))
    r2_cut_id = None
    if r1_id and r2_id:
        r2_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2_cutadapt.fastq'))
    return r1_cut_id, r2_cut_id
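
# Hypothetical composition sketch, not in the original source: the (r1, r2)
# tuple returned above can feed a downstream job via indexed promises.
# `run_star_alignment` is an assumed downstream job name and the resource
# hints are placeholders.
def trim_then_align(job, r1_id, r2_id, fwd_adapter, rev_adapter, star_index_id):
    cutadapt = job.addChildJobFn(run_cutadapt, r1_id, r2_id,
                                 fwd_adapter, rev_adapter,
                                 disk='50G', memory='8G')
    align = cutadapt.addFollowOnJobFn(run_star_alignment,
                                      cutadapt.rv(0), cutadapt.rv(1),
                                      star_index_id, cores=16, memory='40G')
    return align.rv()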
def xg_path_node_id(xg_path, path_name, offset, out_dir):
    """ use vg find to get the node containing a given path position """
    #NOTE: vg find -p range offsets are 0-based inclusive.  
    tmp_out_filename = "{}/tmp_out_{}".format(out_dir, uuid4())
    with open(tmp_out_filename, "w") as tmp_out_file:
        command = ['vg find -x {} -p {}:{}-{}'.format(os.path.basename(xg_path), str(path_name), str(offset), str(offset)), 
                    'vg mod -o -', 'vg view -j -'] 
        docker_call(work_dir=out_dir, parameters=command,
                    tools='quay.io/ucsc_cgl/vg:latest',
                    outfile=tmp_out_file)
        
    command = ['cat data/{}'.format(os.path.basename(tmp_out_filename)), 'jq .node[0].id -']
    stdout = docker_call(work_dir=out_dir, parameters=command,
                    tools='devorbitus/ubuntu-bash-jq-curl',
                    check_output=True)
    
    return int(stdout)
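
# Alternative sketch, not from the original code: the jq container call above
# extracts `.node[0].id`; the same lookup could be done in-process with
# Python's json module once the `vg view -j` output is on disk.
import json

def first_node_id_from_json(json_path):
    """Return the id of the first node in a JSON graph dump from `vg view -j`."""
    with open(json_path) as f:
        graph = json.load(f)
    return int(graph['node'][0]['id'])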
Exemple #31
0
def _download_with_genetorrent(url, file_path, cghub_key_path):
    parsed_url = urlparse(url)
    analysis_id = parsed_url.path[1:]
    assert parsed_url.scheme == 'gnos', 'Improper format. gnos://cghub/ID. User supplied: {}'.format(
        parsed_url)
    work_dir = os.path.dirname(file_path)
    folder_path = os.path.join(work_dir, os.path.basename(analysis_id))
    parameters = ['-vv', '-c', cghub_key_path, '-d', analysis_id]
    docker_call(
        tool='quay.io/ucsc_cgl/genetorrent:3.8.7--9911761265b6f08bc3ef09f53af05f56848d805b',
        work_dir=work_dir,
        parameters=parameters)
    sample = glob.glob(os.path.join(folder_path, '*tar*'))
    assert len(sample) == 1, 'Expected exactly one sample tar in CGHub download: {}'.format(analysis_id)
    # Move the downloaded tarball to the requested destination path
    shutil.move(sample[0], file_path)

def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'), (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'), (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(url, work_dir=work_dir, name=fname)

    # Part 1: Variant Calling
    variant_command = ['mpileup',
                       '-f', 'genome.fa',
                       '-l', 'positions.tsv',
                       '-v', 'alignment.bam',
                       '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR',
                       '-o', '/data/output.vcf.gz']
    docker_call(work_dir=work_dir, parameters=variant_command, sudo=inputs.sudo,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')

    # Part 2: QC
    qc_command = ['-o', 'qc',
                  '-n', 'alignment.bam',
                  '-a', 'annotation.gtf',
                  '-m', 'annotation.m53']
    docker_call(work_dir=work_dir, parameters=qc_command,
                tool='jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz', file_paths=[output_tsv, output_vcf], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
def chunk_gam(gam_path, xg_path, path_name, out_dir, chunks, filter_opts, overwrite):
    """ use vg filter to chunk up the gam """
    RealTimeLogger.get().info("Starting chunk_gam")
    # make bed chunks
    chunk_path = os.path.join(out_dir, path_name + "_chunks.bed")
    with open(chunk_path, "w") as f:
        for chunk in chunks:
            f.write("{}\t{}\t{}\n".format(chunk[0], chunk[1], chunk[2]))
    # run vg filter on the gam
    stdout = ''
    if overwrite or not any(
            os.path.isfile(chunk_base_name(path_name, out_dir, i, ".gam")) \
               for i in range(len(chunks))):
        
        out_file = os.path.join(out_dir, path_name + "-chunk")
        command = ['filter', os.path.basename(gam_path), '-x', os.path.basename(xg_path), '-R', os.path.basename(chunk_path), '-B', os.path.basename(out_file)] + filter_opts.split(" ")
        docker_call(work_dir=out_dir, parameters=command,
            tool='quay.io/ucsc_cgl/vg:latest')
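
# Hypothetical helper (not part of the original example) showing one way the
# `chunks` argument could be built: split a path of `path_length` bases into
# half-open, BED-style windows of at most `chunk_size` bases.
def make_path_chunks(path_name, path_length, chunk_size):
    chunks = []
    for start in range(0, path_length, chunk_size):
        chunks.append((path_name, start, min(start + chunk_size, path_length)))
    return chunks

# e.g. make_path_chunks('ref', 1000000, 250000)
# -> [('ref', 0, 250000), ('ref', 250000, 500000), ('ref', 500000, 750000), ('ref', 750000, 1000000)]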
def download_and_transfer_sample(job, sample, inputs):
    """
    Downloads a sample from CGHub via GeneTorrent, then uses S3AM to transfer it to S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list sample: Sample record whose first element is the CGHub analysis ID
    :param dict inputs: Dictionary of input arguments
    """
    analysis_id = sample[0]
    work_dir = job.fileStore.getLocalTempDir()
    folder_path = os.path.join(work_dir, os.path.basename(analysis_id))
    # Acquire genetorrent key and download sample
    shutil.copy(inputs['genetorrent_key'], os.path.join(work_dir, 'cghub.key'))
    parameters = ['-vv', '-c', 'cghub.key', '-d', analysis_id]
    docker_call(job=job, tool='quay.io/ucsc_cgl/genetorrent:3.8.7--9911761265b6f08bc3ef09f53af05f56848d805b',
                work_dir=work_dir, parameters=parameters)
    try:
        sample = glob.glob(os.path.join(folder_path, '*tar*'))[0]
    except IndexError:
        print 'No tarfile found inside of folder: {}'.format(folder_path)
        raise
    # Upload sample to S3AM
    key_path = inputs['ssec']
    if sample.endswith('gz'):
        sample_name = analysis_id + '.tar.gz'
        shutil.move(sample, os.path.join(work_dir, sample_name))
    else:
        sample_name = analysis_id + '.tar'
        shutil.move(sample, os.path.join(work_dir, sample_name))
    # Parse s3_dir to get bucket and s3 path
    s3_dir = inputs['s3_dir']
    bucket_name = s3_dir.lstrip('/').split('/')[0]
    base_url = 'https://s3-us-west-2.amazonaws.com/'
    url = os.path.join(base_url, bucket_name, sample_name)
    # Generate keyfile for upload
    with open(os.path.join(work_dir, 'temp.key'), 'wb') as f_out:
        f_out.write(generate_unique_key(key_path, url))
    # Upload to S3 via S3AM
    s3am_command = ['s3am',
                    'upload',
                    '--sse-key-file', os.path.join(work_dir, 'temp.key'),
                    'file://{}'.format(os.path.join(work_dir, sample_name)),
                    's3://' + bucket_name + '/']
    subprocess.check_call(s3am_command)
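
# Hedged sketch, not from the original: the upload above always targets the
# bucket root, so any key prefix present in inputs['s3_dir'] is dropped. A
# helper like this (hypothetical) would preserve the prefix when building the
# S3AM destination.
def split_s3_dir(s3_dir):
    """'bucket/some/prefix' -> ('bucket', 'some/prefix/')"""
    parts = s3_dir.lstrip('/').split('/', 1)
    bucket = parts[0]
    prefix = parts[1].rstrip('/') + '/' if len(parts) > 1 and parts[1] else ''
    return bucket, prefix

# bucket_name, key_prefix = split_s3_dir(inputs['s3_dir'])
# destination = 's3://' + bucket_name + '/' + key_prefix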
Exemple #35
0
def picard_mark_duplicates(job, bam, bai, validation_stringency='LENIENT'):
    """
    Runs Picard MarkDuplicates on a BAM file. Requires that the BAM file be coordinate sorted.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str validation_stringency: BAM file validation stringency, default is LENIENT
    :return: FileStoreIDs for BAM and BAI files
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()

    # Retrieve file path
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'sorted.bam'))
    job.fileStore.readGlobalFile(bai, os.path.join(work_dir, 'sorted.bai'))

    # Call: picardtools
    command = [
        'MarkDuplicates', 'INPUT=sorted.bam', 'OUTPUT=mkdups.bam',
        'METRICS_FILE=metrics.txt', 'ASSUME_SORTED=true', 'CREATE_INDEX=true',
        'VALIDATION_STRINGENCY=%s' % validation_stringency.upper()
    ]

    docker_call(
        job=job,
        work_dir=work_dir,
        parameters=command,
        # The picard-tools container doesn't read a JAVA_OPTS variable, so use
        # _JAVA_OPTIONS to point java.io.tmpdir at /data instead of filling /tmp
        env={'_JAVA_OPTIONS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
        tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e',
        outputs={
            'mkdups.bam': None,
            'mkdups.bai': None
        })

    bam = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'mkdups.bam'))
    bai = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'mkdups.bai'))
    return bam, bai
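
# Optional, hedged sketch (not in the original): pull PERCENT_DUPLICATION out
# of the metrics.txt file produced above. Assumes the standard Picard metrics
# layout of '#'-prefixed header lines followed by a tab-separated header row
# and one data row per library.
def parse_duplication_rate(metrics_path):
    with open(metrics_path) as f:
        lines = [l.rstrip('\n') for l in f if l.strip() and not l.startswith('#')]
    header = lines[0].split('\t')
    values = lines[1].split('\t')
    return float(dict(zip(header, values))['PERCENT_DUPLICATION'])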
Exemple #36
0
def gatk_select_variants(job, mode, vcf_id, ref_fasta, ref_fai, ref_dict):
    """
    Isolates a particular variant type from a VCF file using GATK SelectVariants

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str mode: variant type (i.e. SNP or INDEL)
    :param str vcf_id: FileStoreID for input VCF file
    :param str ref_fasta: FileStoreID for reference genome fasta
    :param str ref_fai: FileStoreID for reference genome index file
    :param str ref_dict: FileStoreID for reference genome sequence dictionary file
    :return: FileStoreID for filtered VCF
    :rtype: str
    """
    job.fileStore.logToMaster('Running GATK SelectVariants to select %ss' % mode)

    inputs = {
        'genome.fa': ref_fasta,
        'genome.fa.fai': ref_fai,
        'genome.dict': ref_dict,
        'input.vcf': vcf_id
    }

    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id,
                                     os.path.join(work_dir, name))

    command = [
        '-T', 'SelectVariants', '-R', 'genome.fa', '-V', 'input.vcf', '-o',
        'output.vcf', '-selectType', mode
    ]

    docker_call(
        work_dir=work_dir,
        env={'JAVA_OPTS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
        parameters=command,
        tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
        inputs=inputs.keys(),
        outputs={'output.vcf': None})

    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.vcf'))
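
# Hypothetical parent job, not taken from the original source: SelectVariants
# can be fanned out to isolate SNPs and INDELs from the same VCF in parallel
# child jobs; the memory hint is a placeholder.
def split_variants(job, vcf_id, ref_fasta, ref_fai, ref_dict):
    snps = job.addChildJobFn(gatk_select_variants, 'SNP', vcf_id,
                             ref_fasta, ref_fai, ref_dict, memory='8G')
    indels = job.addChildJobFn(gatk_select_variants, 'INDEL', vcf_id,
                               ref_fasta, ref_fai, ref_dict, memory='8G')
    return snps.rv(), indels.rv()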
Exemple #37
0
def gatk_haplotype_caller(job,
                          bam, bai,
                          ref, fai, ref_dict,
                          annotations=None,
                          emit_threshold=10.0, call_threshold=30.0,
                          unsafe_mode=False,
                          hc_output=None):
    """
    Uses GATK HaplotypeCaller to identify SNPs and INDELs. Outputs variants in a Genomic VCF file.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str ref: FileStoreID for reference genome fasta file
    :param str ref_dict: FileStoreID for reference sequence dictionary file
    :param str fai: FileStoreID for reference fasta index file
    :param list[str] annotations: List of GATK variant annotations, default is None
    :param float emit_threshold: Minimum phred-scale confidence threshold for a variant to be emitted, default is 10.0
    :param float call_threshold: Minimum phred-scale confidence threshold for a variant to be called, default is 30.0
    :param bool unsafe_mode: If True, runs gatk UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :param str hc_output: URL or local path to pre-cooked VCF file, default is None
    :return: FileStoreID for GVCF file
    :rtype: str
    """
    job.fileStore.logToMaster('Running GATK HaplotypeCaller')

    inputs = {'genome.fa': ref,
              'genome.fa.fai': fai,
              'genome.dict': ref_dict,
              'input.bam': bam,
              'input.bam.bai': bai}

    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))

    # Call GATK -- HaplotypeCaller with parameters to produce a genomic VCF file:
    # https://software.broadinstitute.org/gatk/documentation/article?id=2803
    command = ['-T', 'HaplotypeCaller',
               '-nct', str(job.cores),
               '-R', 'genome.fa',
               '-I', 'input.bam',
               '-o', 'output.g.vcf',
               '-stand_call_conf', str(call_threshold),
               '-stand_emit_conf', str(emit_threshold),
               '-variant_index_type', 'LINEAR',
               '-variant_index_parameter', '128000',
               '--genotyping_mode', 'Discovery',
               '--emitRefConfidence', 'GVCF']

    if unsafe_mode:
        command = ['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'] + command

    if annotations:
        for annotation in annotations:
            command.extend(['-A', annotation])

    # Uses docker_call mock mode to replace output with hc_output file
    outputs = {'output.g.vcf': hc_output}
    docker_call(work_dir=work_dir,
                env={'JAVA_OPTS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
                parameters=command,
                tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs.keys(),
                outputs=outputs,
                mock=True if outputs['output.g.vcf'] else False)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.g.vcf'))
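
# A hedged follow-on sketch, not part of the original example: genotyping the
# GVCF produced above with GATK 3.x GenotypeGVCFs, reusing the same container
# and file-staging pattern. The output name 'genotyped.vcf' is an assumption.
def gatk_genotype_gvcfs(job, gvcf, ref, fai, ref_dict):
    inputs = {'genome.fa': ref,
              'genome.fa.fai': fai,
              'genome.dict': ref_dict,
              'input.g.vcf': gvcf}
    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    command = ['-T', 'GenotypeGVCFs',
               '-R', 'genome.fa',
               '--variant', 'input.g.vcf',
               '-o', 'genotyped.vcf']
    docker_call(work_dir=work_dir,
                env={'JAVA_OPTS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
                parameters=command,
                tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs.keys(),
                outputs={'genotyped.vcf': None})
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'genotyped.vcf'))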