def _get_test_fastq_files(tmpdir, tarball=True):
    # set up paths
    tmpdir_str = str(tmpdir)
    tmpdir_repo = os.path.join(tmpdir_str, "single_cell")
    # sparse-checkout the example fastqs from the pachterlab scRNA-Seq-TCC-prep repo
    subprocess.check_call(
        "git clone --no-checkout https://github.com/pachterlab/scRNA-Seq-TCC-prep.git single_cell"
        .split(),
        cwd=tmpdir_str)
    subprocess.check_call("git config core.sparseCheckout true".split(),
                          cwd=tmpdir_repo)
    with open(os.path.join(tmpdir_repo, ".git", "info", "sparse-checkout"),
              'w') as sparse:
        sparse.write("example_dataset/fastq_files/*ATCGCTCC*")
    # subprocess.check_call('echo "example_dataset/fastq_files/*ATCGCTCC*" > .git/info/sparse-checkout'.split(),cwd=tmpdir_repo)
    subprocess.check_call(
        "git checkout 0469873bdadcc48e34782882dbd24c3939c0542a".split(),
        cwd=tmpdir_repo)
    # return location if not tarballed
    fastqs_location = os.path.join(tmpdir_str, "single_cell",
                                   "example_dataset", "fastq_files")
    if not tarball:
        return fastqs_location
    # else, tarball and return that location
    tarball_files(output_dir=tmpdir_str,
                  tar_name='test_fastq.tar.gz',
                  file_paths=[
                      os.path.join(fastqs_location, x)
                      for x in os.listdir(fastqs_location)
                  ])
    return os.path.join(tmpdir_str, 'test_fastq.tar.gz')
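The helper above clones a pinned commit of the pachterlab scRNA-Seq-TCC-prep repository and optionally tarballs the example fastqs. A minimal sketch of how pytest tests might consume it, assuming git and network access and that the helper is importable (the test names are illustrative, not from the source):

import os
import tarfile

def test_get_fastqs_as_directory(tmpdir):
    # tarball=False returns the sparse-checked-out fastq directory
    fastq_dir = _get_test_fastq_files(tmpdir, tarball=False)
    assert os.path.isdir(fastq_dir)
    assert os.listdir(fastq_dir)

def test_get_fastqs_as_tarball(tmpdir):
    # tarball=True returns the path to a gzipped tar of the same files
    tar_path = _get_test_fastq_files(tmpdir, tarball=True)
    assert tarfile.is_tarfile(tar_path)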
Example #2
def run_fastqc(job, r1_id, r2_id):
    """
    Run FastQC on the input reads

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2
    :return: FileStoreID of FastQC output (tarball)
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
    parameters = ['/data/R1.fastq']
    output_names = ['R1_fastqc.html', 'R1_fastqc.zip']
    if r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['-t', '2', '/data/R2.fastq'])
        output_names.extend(['R2_fastqc.html', 'R2_fastqc.zip'])
    docker_call(
        tool=
        'quay.io/ucsc_cgl/fastqc:0.11.5--be13567d00cd4c586edf8ae47d991815c8c72a49',
        work_dir=work_dir,
        parameters=parameters)
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='fastqc.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'fastqc.tar.gz'))
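For context, a hedged sketch of how run_fastqc could be driven as a standalone Toil workflow; the jobstore path, fastq names, and export location are illustrative, and it assumes Docker plus the pipeline's docker_call and tarball_files helpers are importable alongside run_fastqc:

import os
from toil.common import Toil
from toil.job import Job

if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./fastqc-jobstore')  # illustrative jobstore
    with Toil(options) as toil:
        # Import local reads into the job store, run FastQC, then export the tarball
        r1_id = toil.importFile('file://' + os.path.abspath('R1.fastq'))
        r2_id = toil.importFile('file://' + os.path.abspath('R2.fastq'))
        fastqc_tar = toil.start(Job.wrapJobFn(run_fastqc, r1_id, r2_id))
        toil.exportFile(fastqc_tar, 'file://' + os.path.abspath('fastqc.tar.gz'))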
Example #3
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir,
                                                      'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id,
                                 os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'),
                  (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'),
                  (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(job=job, url=url, work_dir=work_dir, name=fname)

    # Part 1: Variant Calling
    variant_command = [
        'mpileup', '-f', 'genome.fa', '-l', 'positions.tsv', '-v',
        'alignment.bam', '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR,SP',
        '-o', '/data/output.vcf.gz'
    ]
    docker_call(
        job=job,
        work_dir=work_dir,
        parameters=variant_command,
        tool=
        'quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c'
    )

    # Part 2: QC
    qc_command = [
        '-o', 'qc', '-n', 'alignment.bam', '-a', 'annotation.gtf', '-m',
        'annotation.m53'
    ]
    docker_call(
        job=job,
        work_dir=work_dir,
        parameters=qc_command,
        tool=
        'jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz',
                  file_paths=[output_tsv, output_vcf],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
Example #4
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir,
                                                      'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id,
                                 os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input file
    download_url(job=job,
                 url=inputs.gtf,
                 work_dir=work_dir,
                 name='annotation.gtf')
    download_url(job=job,
                 url=inputs.gtf_pickle,
                 work_dir=work_dir,
                 name='annotation.gtf.pickle')
    # Call Spladder
    command = [
        '--insert_ir=y', '--insert_es=y', '--insert_ni=y', '--remove_se=n',
        '--validate_sg=n', '-b', 'alignment.bam', '-o', '/data', '-a',
        'annotation.gtf', '-v', 'y', '-c', '3', '-M', 'single', '-T', 'n',
        '-n', '50', '-P', 'y', '-p', 'n', '--sparse_bam', 'y'
    ]
    docker_call(job=job,
                work_dir=work_dir,
                parameters=command,
                sudo=inputs.sudo,
                tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, 'spladder',
                                 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    print os.listdir(work_dir)
    tarball_files('spladder.tar.gz',
                  file_paths=[output_pickle, output_filt, output],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'spladder.tar.gz'))
Example #5
def test_tarball_files(tmpdir):
    from toil_lib.files import tarball_files
    work_dir = str(tmpdir)
    fpath = os.path.join(work_dir, 'output_file')
    with open(fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    tarball_files(output_dir=work_dir, tar_name='test.tar', file_paths=[fpath])
    assert os.path.exists(os.path.join(work_dir, 'test.tar'))
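The test only checks that toil_lib.files.tarball_files produces an archive. As a rough stand-in (not the toil_lib source), the helper can be approximated as below; the gzip mode and basename arcnames are assumptions:

import os
import tarfile

def tarball_files_sketch(tar_name, file_paths, output_dir='.', prefix=''):
    """Approximate stand-in for toil_lib.files.tarball_files: bundle file_paths
    into output_dir/tar_name, storing each member under its (optionally
    prefixed) basename.  The compression mode is an assumption."""
    with tarfile.open(os.path.join(output_dir, tar_name), 'w:gz') as tar:
        for path in file_paths:
            tar.add(path, arcname=prefix + os.path.basename(path))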
Example #6
def run_rsem_postprocess(job, uuid, rsem_gene_id, rsem_isoform_id):
    """
    Parses RSEM's output to produce separate .tab files (TPM, FPKM, counts) for both gene and isoform.
    These are two-column files: Genes and Quantifications.
    HUGO files are also provided that have been mapped from Gencode/ENSEMBL names.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: UUID to mark the samples with
    :param str rsem_gene_id: FileStoreID of rsem_gene_ids
    :param str rsem_isoform_id: FileStoreID of rsem_isoform_ids
    :return: FileStoreIDs of the RSEM and RSEM HUGO output tarballs
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    # I/O
    job.fileStore.readGlobalFile(rsem_gene_id,
                                 os.path.join(work_dir, 'rsem_gene.tab'),
                                 mutable=True)
    job.fileStore.readGlobalFile(rsem_isoform_id,
                                 os.path.join(work_dir, 'rsem_isoform.tab'),
                                 mutable=True)
    # Convert RSEM files into individual .tab files.
    docker_call(tool='jvivian/rsem_postprocess',
                parameters=[uuid],
                work_dir=work_dir)
    os.rename(os.path.join(work_dir, 'rsem_gene.tab'),
              os.path.join(work_dir, 'rsem_genes.results'))
    os.rename(os.path.join(work_dir, 'rsem_isoform.tab'),
              os.path.join(work_dir, 'rsem_isoforms.results'))
    output_files = [
        'rsem.genes.norm_counts.tab', 'rsem.genes.raw_counts.tab',
        'rsem.isoform.norm_counts.tab', 'rsem.isoform.raw_counts.tab',
        'rsem_genes.results', 'rsem_isoforms.results'
    ]
    # Perform HUGO gene / isoform name mapping
    genes = [x for x in output_files if 'rsem.genes' in x]
    isoforms = [x for x in output_files if 'rsem.isoform' in x]
    command = ['-g'] + genes + ['-i'] + isoforms
    docker_call(tool='jvivian/gencode_hugo_mapping',
                parameters=command,
                work_dir=work_dir)
    hugo_files = [
        os.path.splitext(x)[0] + '.hugo' + os.path.splitext(x)[1]
        for x in genes + isoforms
    ]
    # Create tarballs for outputs
    tarball_files('rsem.tar.gz',
                  file_paths=[os.path.join(work_dir, x) for x in output_files],
                  output_dir=work_dir)
    tarball_files('rsem_hugo.tar.gz',
                  [os.path.join(work_dir, x) for x in hugo_files],
                  output_dir=work_dir)
    rsem_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rsem.tar.gz'))
    hugo_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    return rsem_id, hugo_id
Example #7
def rsem_quantification(job, config, star_output):
    """
    Unpack STAR results and run RSEM (saving the BAM from STAR if requested)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param tuple(FileID, FileID, FileID, FileID)|tuple(FileID, FileID, FileID) star_output: FileStoreIDs from STAR
    :return: FileStoreID results from RSEM postprocess and STAR log
    :rtype: tuple(FileID, FileID, FileID)
    """
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(16, config.cores)
    if config.wiggle:
        transcriptome_id, sorted_id, wiggle_id, log_id = flatten(star_output)
        wiggle_path = os.path.join(work_dir, config.uuid + '.wiggle.bg')
        job.fileStore.readGlobalFile(wiggle_id, wiggle_path)
        if urlparse(config.output_dir).scheme == 's3':
            s3am_upload(fpath=wiggle_path,
                        s3_dir=config.output_dir,
                        s3_key_path=config.ssec)
        else:
            copy_files(file_paths=[wiggle_path], output_dir=config.output_dir)
    else:
        transcriptome_id, sorted_id, log_id = star_output
    # Save sorted bam if flag is selected
    if config.save_bam and not config.bamqc:  # if config.bamqc is selected, bam is being saved in run_bam_qc
        bam_path = os.path.join(work_dir, config.uuid + '.sorted.bam')
        job.fileStore.readGlobalFile(sorted_id, bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=bam_path,
                        s3_dir=config.output_dir,
                        s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[bam_path], output_dir=config.output_dir)
    # Declare RSEM and RSEM post-process jobs
    disk = 5 * transcriptome_id.size
    rsem_output = job.wrapJobFn(run_rsem,
                                transcriptome_id,
                                config.rsem_ref,
                                paired=config.paired,
                                cores=cores,
                                disk=disk)
    rsem_postprocess = job.wrapJobFn(run_rsem_postprocess, rsem_output.rv(0),
                                     rsem_output.rv(1))
    job.addChild(rsem_output)
    rsem_output.addChild(rsem_postprocess)
    # Save STAR log
    log_path = os.path.join(work_dir, 'Log.final.out')
    job.fileStore.readGlobalFile(log_id, log_path)
    tarball_files(tar_name='star.tar.gz',
                  file_paths=[log_path],
                  output_dir=work_dir)
    star_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'star.tar.gz'))
    return rsem_postprocess.rv(), star_id
Example #8
def run_bam_qc(job, aligned_bam_id, config):
    """
    Run BAM QC as specified by California Kids Cancer Comparison (CKCC)

    :param JobFunctionWrappingJob job:
    :param str aligned_bam_id: FileStoreID of sorted bam from STAR
    :param Namespace config: Argparse Namespace object containing argument inputs
        Must contain:
            config.uuid str: UUID of input sample
            config.save_bam bool: True/False depending on whether to save bam
            config.output_dir str: Path to save bam
            config.ssec str: Path to encryption key for secure upload to S3
    :return: flag indicating whether QC failed, and FileStoreID of the output tarball
    :rtype: tuple(bool, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(
        aligned_bam_id,
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    docker_call(tool='hbeale/treehouse_bam_qc:1.0',
                work_dir=work_dir,
                parameters=['runQC.sh', str(job.cores)])

    # Tar Output files
    output_names = [
        'readDist.txt', 'rnaAligned.out.md.sorted.geneBodyCoverage.curves.pdf',
        'rnaAligned.out.md.sorted.geneBodyCoverage.txt'
    ]
    if os.path.exists(os.path.join(work_dir, 'readDist.txt_PASS_qc.txt')):
        output_names.append('readDist.txt_PASS_qc.txt')
        fail_flag = False
    else:
        output_names.append('readDist.txt_FAIL_qc.txt')
        fail_flag = True
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='bam_qc.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)

    # Save output BAM
    if config.save_bam:
        bam_path = os.path.join(work_dir, 'rnaAligned.sortedByCoord.md.bam')
        new_bam_path = os.path.join(work_dir,
                                    config.uuid + '.sortedByCoord.md.bam')
        os.rename(bam_path, new_bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=new_bam_path,
                        s3_dir=config.output_dir,
                        s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[new_bam_path], output_dir=config.output_dir)

    return fail_flag, job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'bam_qc.tar.gz'))
Example #9
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input file
    download_url(url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call Spladder
    command = ['--insert_ir=y',
               '--insert_es=y',
               '--insert_ni=y',
               '--remove_se=n',
               '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(work_dir=work_dir, parameters=command, sudo=inputs.sudo, tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    print os.listdir(work_dir)
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
Example #10
def run_kallisto(job, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of the Kallisto index file
    :return: FileStoreID from Kallisto output
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=kallisto_index_url,
                 name='kallisto_hg38.idx',
                 work_dir=work_dir)
    # Retrieve files
    parameters = [
        'quant', '-i', '/data/kallisto_hg38.idx', '-t',
        str(job.cores), '-o', '/data/', '-b', '100'
    ]
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(
            r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        job.fileStore.readGlobalFile(
            r2_id, os.path.join(work_dir, 'R2_cutadapt.fastq'))
        parameters.extend(
            ['/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq'])
    else:
        job.fileStore.readGlobalFile(
            r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        parameters.extend(
            ['--single', '-l', '200', '-s', '15', '/data/R1_cutadapt.fastq'])

    # Call: Kallisto
    docker_call(
        job=job,
        tool=
        'quay.io/ucsc_cgl/kallisto:0.42.4--35ac87df5b21a8e8e8d159f26864ac1e1db8cf86',
        work_dir=work_dir,
        parameters=parameters)
    # Tar output files together and store in fileStore
    output_files = [
        os.path.join(work_dir, x)
        for x in ['run_info.json', 'abundance.tsv', 'abundance.h5']
    ]
    tarball_files(tar_name='kallisto.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'kallisto.tar.gz'))
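Downstream steps need to unpack the kallisto.tar.gz produced above. A hedged, Python 2-style sketch of pulling TPM values out of the archived abundance.tsv, assuming members are stored under their basenames as in the other tarball_files examples:

import tarfile

def read_kallisto_tpm(tar_path):
    """Return a dict mapping target_id -> TPM from a kallisto output tarball."""
    tpm = {}
    with tarfile.open(tar_path, 'r:gz') as tar:
        member = tar.extractfile('abundance.tsv')
        header = member.readline().rstrip().split('\t')
        target_col, tpm_col = header.index('target_id'), header.index('tpm')
        for line in member:
            fields = line.rstrip().split('\t')
            tpm[fields[target_col]] = float(fields[tpm_col])
    return tpm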
Example #11
def run_pindel(job, normal_bam, normal_bai, tumor_bam, tumor_bai, ref, fai):
    """
    Calls Pindel to detect insertions and deletions (indels)

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str normal_bam: Normal BAM FileStoreID
    :param str normal_bai: Normal BAM index FileStoreID
    :param str tumor_bam: Tumor BAM FileStoreID
    :param str tumor_bai: Tumor BAM Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str fai: Reference index FileStoreID
    :return: Pindel output (tarball) FileStoreID
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [normal_bam, normal_bai, tumor_bam, tumor_bai, ref, fai]
    file_names = [
        'normal.bam', 'normal.bai', 'tumor.bam', 'tumor.bai', 'ref.fasta',
        'ref.fasta.fai'
    ]
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id,
                                     os.path.join(work_dir, name))
    # Create Pindel config
    with open(os.path.join(work_dir, 'pindel-config.txt'), 'w') as f:
        for bam in ['normal', 'tumor']:
            f.write('/data/{} {} {}\n'.format(
                bam + '.bam', get_mean_insert_size(work_dir, bam + '.bam'),
                bam))
    # Call: Pindel
    parameters = [
        '-f', '/data/ref.fasta', '-i', '/data/pindel-config.txt',
        '--number_of_threads',
        str(job.cores), '--minimum_support_for_event', '3',
        '--report_long_insertions', 'true', '--report_breakpoints', 'true',
        '-o', 'pindel'
    ]
    docker_call(
        tool=
        'quay.io/ucsc_cgl/pindel:0.2.5b6--4e8d1b31d4028f464b3409c6558fb9dfcad73f88',
        work_dir=work_dir,
        parameters=parameters)
    # Collect output files and write to file store
    output_files = glob(os.path.join(work_dir, 'pindel*'))
    tarball_files('pindel.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'pindel.tar.gz'))
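get_mean_insert_size is a helper from the pipeline's library and is not shown here. Purely as a hypothetical, Python 2-style illustration of what such a helper could look like (it samples the TLEN column via samtools, which is assumed to be on PATH):

import os
import subprocess

def get_mean_insert_size_sketch(work_dir, bam_name, sample_size=1000000):
    """Hypothetical stand-in: average the absolute template length (BAM column 9)
    over the first `sample_size` paired records."""
    proc = subprocess.Popen(['samtools', 'view', os.path.join(work_dir, bam_name)],
                            stdout=subprocess.PIPE)
    total, n = 0, 0
    for line in proc.stdout:
        tlen = abs(int(line.split('\t')[8]))
        if tlen > 0:
            total += tlen
            n += 1
        if n >= sample_size:
            break
    proc.stdout.close()
    proc.wait()
    return float(total) / n if n else 0.0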
Example #12
def run_muse(job, normal_bam, normal_bai, tumor_bam, tumor_bai, ref, ref_dict,
             fai, dbsnp):
    """
    Calls MuSE to find variants

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str normal_bam: Normal BAM FileStoreID
    :param str normal_bai: Normal BAM index FileStoreID
    :param str tumor_bam: Tumor BAM FileStoreID
    :param str tumor_bai: Tumor BAM Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference genome dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str dbsnp: DBSNP VCF FileStoreID
    :return: MuSe output (tarball) FileStoreID
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [
        normal_bam, normal_bai, tumor_bam, tumor_bai, ref, ref_dict, fai, dbsnp
    ]
    file_names = [
        'normal.bam', 'normal.bai', 'tumor.bam', 'tumor.bai', 'ref.fasta',
        'ref.dict', 'ref.fasta.fai', 'dbsnp.vcf'
    ]
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id,
                                     os.path.join(work_dir, name))
    # Call: MuSE
    parameters = [
        '--mode', 'wxs', '--dbsnp', '/data/dbsnp.vcf', '--fafile',
        '/data/ref.fasta', '--tumor-bam', '/data/tumor.bam',
        '--tumor-bam-index', '/data/tumor.bai', '--normal-bam',
        '/data/normal.bam', '--normal-bam-index', '/data/normal.bai',
        '--outfile', '/data/muse.vcf', '--cpus',
        str(job.cores)
    ]
    docker_call(
        tool=
        'quay.io/ucsc_cgl/muse:1.0--6add9b0a1662d44fd13bbc1f32eac49326e48562',
        work_dir=work_dir,
        parameters=parameters)
    # Return fileStore ID
    tarball_files('muse.tar.gz',
                  file_paths=[os.path.join(work_dir, 'muse.vcf')],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'muse.tar.gz'))
Example #13
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'), (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'), (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(url, work_dir=work_dir, name=fname)

    # Part 1: Variant Calling
    variant_command = ['mpileup',
                       '-f', 'genome.fa',
                       '-l', 'positions.tsv',
                       '-v', 'alignment.bam',
                       '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR,SP',
                       '-o', '/data/output.vcf.gz']
    docker_call(work_dir=work_dir, parameters=variant_command, sudo=inputs.sudo,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')

    # Part 2: QC
    qc_command = ['-o', 'qc',
                  '-n', 'alignment.bam',
                  '-a', 'annotation.gtf',
                  '-m', 'annotation.m53']
    docker_call(work_dir=work_dir, parameters=qc_command,
                tool='jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz', file_paths=[output_tsv, output_vcf], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
Example #14
def run_rsem_postprocess(job, rsem_gene_id, rsem_isoform_id):
    """
    Parses RSEM's output to produce separate .tab files (TPM, FPKM, counts) for both gene and isoform.
    These are two-column files: Genes and Quantifications.
    HUGO files are also provided that have been mapped from Gencode/ENSEMBL names.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str rsem_gene_id: FileStoreID of rsem_gene_ids
    :param str rsem_isoform_id: FileStoreID of rsem_isoform_ids
    :return: FileStoreIDs of the RSEM and RSEM HUGO output tarballs
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    # I/O
    genes = job.fileStore.readGlobalFile(
        rsem_gene_id, os.path.join(work_dir, 'rsem_genes.results'))
    iso = job.fileStore.readGlobalFile(
        rsem_isoform_id, os.path.join(work_dir, 'rsem_isoforms.results'))
    # Perform HUGO gene / isoform name mapping
    command = ['-g', 'rsem_genes.results', '-i', 'rsem_isoforms.results']
    docker_call(
        job=job,
        tool=
        'quay.io/ucsc_cgl/gencode_hugo_mapping:1.0--cb4865d02f9199462e66410f515c4dabbd061e4d',
        parameters=command,
        work_dir=work_dir)
    hugo_files = [
        os.path.join(work_dir, x)
        for x in ['rsem_genes.hugo.results', 'rsem_isoforms.hugo.results']
    ]
    # Create tarballs for outputs
    tarball_files('rsem.tar.gz',
                  file_paths=[genes, iso],
                  output_dir=work_dir)
    tarball_files('rsem_hugo.tar.gz',
                  file_paths=[os.path.join(work_dir, x) for x in hugo_files],
                  output_dir=work_dir)
    rsem_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rsem.tar.gz'))
    hugo_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    return rsem_id, hugo_id
Example #15
def run_mutect(job, normal_bam, normal_bai, tumor_bam, tumor_bai, ref, ref_dict, fai, cosmic, dbsnp):
    """
    Calls MuTect to perform variant analysis

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str normal_bam: Normal BAM FileStoreID
    :param str normal_bai: Normal BAM index FileStoreID
    :param str tumor_bam: Tumor BAM FileStoreID
    :param str tumor_bai: Tumor BAM Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str cosmic: Cosmic VCF FileStoreID
    :param str dbsnp: DBSNP VCF FileStoreID
    :return: MuTect output (tarball) FileStoreID
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [normal_bam, normal_bai, tumor_bam, tumor_bai, ref, fai, ref_dict, cosmic, dbsnp]
    file_names = ['normal.bam', 'normal.bai', 'tumor.bam', 'tumor.bai', 'ref.fasta',
                  'ref.fasta.fai', 'ref.dict', 'cosmic.vcf', 'dbsnp.vcf']
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: MuTect
    parameters = ['--analysis_type', 'MuTect',
                  '--reference_sequence', 'ref.fasta',
                  '--cosmic', '/data/cosmic.vcf',
                  '--dbsnp', '/data/dbsnp.vcf',
                  '--input_file:normal', '/data/normal.bam',
                  '--input_file:tumor', '/data/tumor.bam',
                  '--tumor_lod', str(10), # Taken from MC3 pipeline
                  '--initial_tumor_lod', str(4.0), # Taken from MC3 pipeline
                  '--out', 'mutect.out',
                  '--coverage_file', 'mutect.cov',
                  '--vcf', 'mutect.vcf']
    docker_call(job=job, work_dir=work_dir, parameters=parameters,
                tool='quay.io/ucsc_cgl/mutect:1.1.7--e8bf09459cf0aecb9f55ee689c2b2d194754cbd3')
    # Write output to file store
    output_file_names = ['mutect.vcf', 'mutect.cov', 'mutect.out']
    output_file_paths = [os.path.join(work_dir, x) for x in output_file_names]
    tarball_files('mutect.tar.gz', file_paths=output_file_paths, output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'mutect.tar.gz'))
Example #16
def run_margin_phase(job, config, chunk_file_id, chunk_info):
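    """
    Run marginPhase on a single BAM chunk inside Docker, optionally preceded by a
    cPecan alignment step, and tarball the chunk's outputs.  If the expected VCF and
    SAM outputs are missing, the chunk is retried as a child job up to MAX_RETRIES times.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param config: pipeline configuration (see main)
    :param chunk_file_id: FileStoreID of the chunked BAM
    :param chunk_info: chunk metadata; CI_OUTPUT_FILE_ID is filled in on success
    :return: chunk_info with the output tarball's FileStoreID, or the retry job's promise
    """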
    # prep
    start = time.time()
    work_dir = job.fileStore.getLocalTempDir()
    chunk_idx = chunk_info[CI_CHUNK_INDEX]
    chunk_identifier = "{}.{}".format(config.uuid, chunk_idx)
    chunk_name = "{}.in.bam".format(chunk_identifier)
    chunk_location = os.path.join(work_dir, chunk_name)
    log(job, str(datetime.datetime.now()), chunk_identifier,
        'run_margin_phase')

    # download bam chunk
    job.fileStore.readGlobalFile(chunk_file_id, chunk_location)
    if not os.path.isfile(chunk_location):
        raise UserError("Failed to download chunk {} from {}".format(
            chunk_name, chunk_file_id))

    # download references
    #ref genome
    genome_reference_name = "reference.fa"
    genome_reference_location = os.path.join(work_dir, genome_reference_name)
    job.fileStore.readGlobalFile(config.reference_genome_fileid,
                                 genome_reference_location)
    if not os.path.isfile(genome_reference_location):
        raise UserError(
            "Failed to download genome reference {} from {}".format(
                os.path.basename(config.reference_genome),
                config.reference_genome_fileid))
    # params
    params_name = "params.json"
    params_location = os.path.join(work_dir, params_name)
    job.fileStore.readGlobalFile(config.params_fileid, params_location)
    if not os.path.isfile(params_location):
        raise UserError("Failed to download params {} from {}".format(
            os.path.basename(config.params), config.params_fileid))

    # do we want to run cPecan?
    cpecan_prob_location = None
    if config.cpecan_probabilities:
        cpecan_prob_location = run_margin_phase__run_cpecan_alignment(
            job, config, chunk_identifier, work_dir, chunk_name,
            genome_reference_name)

    # run marginPhase
    params = [
        os.path.join("/data", chunk_name),
        os.path.join("/data", genome_reference_name),
        os.path.join("/data", params_name), "-o",
        os.path.join("/data", "{}.out".format(chunk_identifier)), '--tag',
        "{},{}-{}".format(chunk_idx, chunk_info[CI_CHUNK_BOUNDARY_START],
                          chunk_info[CI_CHUNK_BOUNDARY_END])
    ]
    if cpecan_prob_location is not None:
        params.extend([
            '--singleNuclProbDir',
            os.path.join("/data", cpecan_prob_location)
        ])
    docker_call(job, config, work_dir, params, config.margin_phase_image,
                config.margin_phase_tag)
    log_debug_from_docker(job, os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG),
                          chunk_identifier, 'margin_phase',
                          [chunk_location, genome_reference_location])
    log_location = os.path.join(work_dir,
                                "marginPhase.{}.log".format(chunk_identifier))
    os.rename(os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG), log_location)

    # document output
    log(job, "Output files after marginPhase:", chunk_identifier,
        'run_margin_phase')
    output_file_locations = glob.glob(
        os.path.join(work_dir, "{}*".format(chunk_identifier)))
    output_file_locations.append(log_location)
    found_vcf, found_sam = False, False
    for f in output_file_locations:
        log(job, "\t\t{}".format(os.path.basename(f)), chunk_identifier,
            'run_margin_phase')
        if f.endswith(VCF_SUFFIX): found_vcf = True
        if f.endswith(SAM_UNIFIED_SUFFIX): found_sam = True
    if cpecan_prob_location is not None:
        cpecan_tarball = glob.glob(
            os.path.join(work_dir, cpecan_prob_location, "*.tar.gz"))
        if len(cpecan_tarball) == 0:
            # todo why has tarball_files failed in this location?
            log(job, "Found no cpecan output tarball! Trying alt location.",
                chunk_identifier, 'run_margin_phase')
            cpecan_tarball = glob.glob(os.path.join(work_dir, "*.tar.gz"))

        if len(cpecan_tarball) == 0:
            log(job, "Found no cpecan output tarball!", chunk_identifier,
                'run_margin_phase')
        elif len(cpecan_tarball) > 1:
            log(
                job, "Found {} cpecan output tarballs: {}".format(
                    len(cpecan_tarball), cpecan_tarball), chunk_identifier,
                'run_margin_phase')
        else:
            log(job,
                "Saving cpecan output tarball: {}".format(cpecan_tarball[0]),
                chunk_identifier, 'run_margin_phase')
            output_file_locations.append(cpecan_tarball[0])

    # tarball the output and save
    tarball_name = "{}.tar.gz".format(chunk_identifier)
    tarball_files(tar_name=tarball_name,
                  file_paths=output_file_locations,
                  output_dir=work_dir)

    # validate output, retry if not
    if not (found_sam and found_vcf):
        if "retry_attempts" not in config:
            config.retry_attempts = 1
        else:
            config.retry_attempts += 1
            if config.retry_attempts > MAX_RETRIES:
                log(job, "", chunk_identifier, 'run_margin_phase')
                error = "Failed to generate appropriate output files {} times".format(
                    MAX_RETRIES)
                log(job, error, chunk_identifier, 'run_margin_phase')
                # this enables us to "recover" in the face of failure during a run
                if CONTINUE_AFTER_FAILURE:
                    output_file_id = job.fileStore.writeGlobalFile(
                        os.path.join(work_dir, tarball_name))
                    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id
                    return chunk_info
                raise UserError("{}:{}".format(chunk_identifier, error))

        log(
            job, "Missing output files.  Attempting retry {}".format(
                config.retry_attempts), chunk_identifier, 'run_margin_phase')
        log(job, "Failed job log file:", chunk_identifier, 'run_margin_phase')
        log(job, "", chunk_identifier, 'run_margin_phase')
        with open(log_location, 'r') as input:
            for line in input:
                log(job, "\t\t{}".format(line.rstrip()), chunk_identifier,
                    'run_margin_phase')

        # new job
        retry_job = job.addChildJobFn(
            run_margin_phase,
            config,
            chunk_file_id,
            chunk_info,
            memory=str(int(config.maxMemory / 1024)) + "K",
            cores=job.cores,
            disk=job.disk)
        # save failed output
        if config.intermediate_file_location is not None:
            tarball_fail_name = "{}.FAILURE.{}.tar.gz".format(
                chunk_identifier, config.retry_attempts)
            os.rename(os.path.join(work_dir, tarball_name),
                      os.path.join(work_dir, tarball_fail_name))
            copy_files(file_paths=[os.path.join(work_dir, tarball_fail_name)],
                       output_dir=config.intermediate_file_location)

        log_generic_job_debug(job,
                              config.uuid,
                              'run_margin_phase',
                              work_dir=work_dir)
        return retry_job.rv()

    # if successful, save output
    if config.intermediate_file_location is not None:
        copy_files(file_paths=[os.path.join(work_dir, tarball_name)],
                   output_dir=config.intermediate_file_location)
    output_file_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, tarball_name))
    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id

    # log
    log_generic_job_debug(job,
                          config.uuid,
                          'run_margin_phase',
                          work_dir=work_dir)
    log_time(job, "run_margin_phase", start, chunk_identifier)
    return chunk_info
Example #17
def run_data_analysis(job, config, tcc_matrix_id, pwise_dist_l1_id,
                      nonzero_ec_id, kallisto_matrix_id, matrix_tsv_id,
                      matrix_cells_id):
    """
    Generates graphs and plots of results.  Uploads images to savedir location.
    :param job: toil job
    :param config: toil job configuration
    :param tcc_matrix_id: jobstore location of TCC matrix (.dat)
    :param pwise_dist_l1_id: jobstore location of L1 pairwise distance (.dat)
    :param nonzero_ec_id: jobstore loation of nonzero ec (.dat)
    :param kallisto_matrix_id: id of kallisto output matrix (.ec)
    :param matrix_tsv_id: id of kallisto output matrix (.tsv)
    :param matrix_cells_id: id of kallisto output matrix (.cells)
    """
    # source: https://github.com/pachterlab/scRNA-Seq-TCC-prep (/blob/master/notebooks/10xResults.ipynb)
    # extract output
    job.fileStore.logToMaster('Performing data analysis')
    # read files
    work_dir = job.fileStore.getLocalTempDir()
    tcc_matrix = job.fileStore.readGlobalFile(
        tcc_matrix_id, os.path.join(work_dir, "TCC_matrix.dat"))
    pwise_dist_l1 = job.fileStore.readGlobalFile(
        pwise_dist_l1_id, os.path.join(work_dir, "pwise_dist_L1.dat"))
    nonzero_ec = job.fileStore.readGlobalFile(
        nonzero_ec_id, os.path.join(work_dir, "nonzero_ec.dat"))
    kallisto_matrix = job.fileStore.readGlobalFile(
        kallisto_matrix_id, os.path.join(work_dir, 'kallisto_matrix.ec'))
    matrix_tsv = job.fileStore.readGlobalFile(
        matrix_tsv_id, os.path.join(work_dir, MATRIX_TSV_FILENAME))
    matrix_cells = job.fileStore.readGlobalFile(
        matrix_cells_id, os.path.join(work_dir, MATRIX_CELLS_FILENAME))
    ##############################################################
    # load dataset
    with open(os.path.join(work_dir, "TCC_matrix.dat"), 'rb') as f:
        tcc_matrix = pickle.load(f)
    with open(os.path.join(work_dir, "pwise_dist_L1.dat"), 'rb') as f:
        pwise_dist_l1 = pickle.load(f)
    with open(os.path.join(work_dir, "nonzero_ec.dat"), 'rb') as f:
        nonzero_ec = pickle.load(f)

    ecfile_dir = os.path.join(work_dir, 'kallisto_matrix.ec')
    eclist = np.loadtxt(ecfile_dir, dtype=str)

    tcc = tcc_matrix.T
    T_norm = normalize(tcc_matrix, norm='l1', axis=0)
    t_normt = T_norm.transpose()

    num_of_cells = np.shape(tcc_matrix)[1]
    print("NUM_OF_CELLS =", num_of_cells)
    print("NUM_OF_nonzero_EC =", np.shape(tcc_matrix)[0])

    #################################

    EC_dict = {}
    for i in range(np.shape(eclist)[0]):
        EC_dict[i] = [int(x) for x in eclist[i, 1].split(',')]

    union = set()
    for i in nonzero_ec:
        new = [tx for tx in EC_dict[i]
               if tx not in union]  # filter out previously seen transcripts
        union.update(new)
    NUM_OF_TX_inTCC = len(union)
    print("NUM_OF_Transcripts =", NUM_OF_TX_inTCC
          )  # number of distinct transcripts in nonzero eq. classes

    ##############################################################
    # inspect

    # sort eq. classes based on size
    size_of_ec = [len(EC_dict[i]) for i in nonzero_ec]
    ec_idx = [i[0] for i in sorted(enumerate(size_of_ec), key=lambda x: x[1])]
    index_ec = np.array(ec_idx)

    ec_sort_map = {}
    nonzero_ec_srt = []  # init
    for i in range(len(nonzero_ec)):
        nonzero_ec_srt += [nonzero_ec[index_ec[i]]]
        ec_sort_map[nonzero_ec[index_ec[i]]] = i

    sumi = np.array(tcc_matrix.sum(axis=1))
    sumi_sorted = sumi[index_ec]
    total_num_of_umis = int(sumi_sorted.sum())
    total_num_of_umis_per_cell = np.array(tcc_matrix.sum(axis=0))[0, :]

    print("Total number of UMIs =", total_num_of_umis)

    #################################

    fig, ax1 = plt.subplots()
    ax1.plot(sorted(total_num_of_umis_per_cell)[::-1], 'b-', linewidth=2.0)
    ax1.set_title('UMI counts per cell')
    ax1.set_xlabel('cells (sorted by UMI counts)')
    ax1.set_ylabel('UMI counts')
    ax1.set_yscale("log", nonposy='clip')
    ax1.grid(True)
    ax1.grid(True, 'minor')
    umi_counts_per_cell = os.path.join(work_dir, "UMI_counts_per_cell.png")
    plt.savefig(umi_counts_per_cell, format='png')

    fig, ax1 = plt.subplots()
    ax1.plot(sorted(sumi.reshape(np.shape(sumi)[0]))[::-1],
             'r-',
             linewidth=2.0)
    ax1.set_title('UMI counts per eq. class')
    ax1.set_xlabel('ECs (sorted by UMI counts)')
    ax1.set_ylabel('UMI counts')
    ax1.set_yscale("log", nonposy='clip')
    ax1.grid(True)
    ax1.grid(True, 'minor')
    umi_counts_per_class = os.path.join(work_dir, "UMI_counts_per_class.png")
    plt.savefig(umi_counts_per_class, format='png')

    cell_nonzeros = np.array(((T_norm != 0)).sum(axis=0))[0]

    fig, ax1 = plt.subplots()
    ax1.plot(total_num_of_umis_per_cell, cell_nonzeros, '.g', linewidth=2.0)
    ax1.set_title('UMI counts vs nonzero ECs')
    ax1.set_xlabel('total num of umis per cell')
    ax1.set_ylabel('total num of nonzero ecs per cell')
    ax1.set_yscale("log", nonposy='clip')
    ax1.set_xscale("log", nonposy='clip')
    ax1.grid(True)
    ax1.grid(True, 'minor')
    umi_counts_vs_nonzero_ecs = os.path.join(work_dir,
                                             "UMI_counts_vs_nonzero_ECs.png")
    plt.savefig(umi_counts_vs_nonzero_ecs, format='png')

    # TCC MEAN-VARIANCE
    #todo verify this works
    TCC_var = np.var(tcc.todense(), axis=0)
    TCC_mean = np.mean(tcc.todense(), axis=0)
    TCC_mean = np.array(TCC_mean)[0]
    TCC_var = np.array(TCC_var)[0]
    fig = plt.figure()
    N = tcc.sum()
    C = tcc.shape[0]
    ax = plt.gca()
    ax.plot(TCC_mean,
            TCC_var,
            '.',
            c='blue',
            alpha=0.5,
            markeredgecolor='none')
    xlims = [0.0001, 10 * TCC_mean.max()]
    ax.set_xlim(xlims)
    ax.set_ylim([0.0001, 10 * TCC_var.max()])
    ax.set_yscale('symlog')
    ax.set_xscale('symlog')
    ax.plot(xlims, [(C - 1) * (xlims[0])**2, (C - 1) * (xlims[1])**2],
            color='g',
            linestyle='-',
            linewidth=2)
    ax.plot(xlims, [(xlims[0]), (xlims[1])],
            color='k',
            linestyle='--',
            linewidth=1)
    ax.set_title("TCC Mean-Variance [" + str(tcc.shape[1]) + " TCCs in " +
                 str(C) + " Cells]")
    ax.set_xlabel("mean(TCC)")
    ax.set_ylabel("var(TCC)")
    tcc_mean_variance = os.path.join(work_dir, "TCC_mean_variance.png")
    plt.savefig(tcc_mean_variance, format='png')

    ##############################################################
    # clustering

    #################################
    # t-SNE
    x_tsne = tSNE_pairwise(2, pwise_dist_l1)

    #################################
    # spectral clustering
    n_clusters = config.n_clusters
    similarity_mat = pwise_dist_l1.max() - pwise_dist_l1
    labels_spectral = spectral(n_clusters, similarity_mat)

    spectral_clustering = stain_plot(x_tsne,
                                     labels_spectral, [],
                                     "TCC -- tSNE, spectral clustering with " +
                                     str(n_clusters) + " n_clusters",
                                     work_dir=work_dir,
                                     filename="spectral_clustering_tSNE")

    #################################
    # affinity propagation
    pref = -np.median(pwise_dist_l1) * np.ones(num_of_cells)
    labels_aff = AffinityProp(-pwise_dist_l1, pref, 0.5)
    np.unique(labels_aff)

    affinity_propagation_tsne = stain_plot(
        x_tsne, labels_aff, [], "TCC -- tSNE, affinity propagation", work_dir,
        "affinity_propagation_tSNE")

    #################################
    # pca
    pca = PCA(n_components=2)
    x_pca = pca.fit_transform(t_normt.todense())

    affinity_propagation_pca = stain_plot(x_pca, labels_aff, [],
                                          "TCC -- PCA, affinity propagation",
                                          work_dir, "affinity_propagation_PCA")

    # SC3
    outfilePath = job.fileStore.getLocalTempFile()
    SC3OutputPath = os.path.join(work_dir, SC3_OUTPUT_DIRECTORY)
    os.mkdir(SC3OutputPath)
    shouldUseSC3Output = True
    with open(outfilePath, "r+") as outfile:

        def dockerPathTo(resource):
            return os.path.join(DOCKER_WORK_DIR, resource)

        def boolForR(aBool):
            return "TRUE" if aBool else "FALSE"

        try:
            dockerCall(job,
                       tool='rscript',
                       workDir=work_dir,
                       parameters=map(str, [
                           config.min_k, config.max_k,
                           dockerPathTo(MATRIX_TSV_FILENAME),
                           dockerPathTo(MATRIX_CELLS_FILENAME),
                           dockerPathTo(SC3_OUTPUT_DIRECTORY),
                           boolForR(config.use_estimated_k),
                           boolForR(config.debug)
                       ]),
                       outfile=outfile)
            pass
        except CalledProcessError:
            outfile.seek(0, 0)
            job.fileStore.logToMaster(
                "Docker failed with the following log:  " +
                str(outfile.read()))
            shouldUseSC3Output = False
    # build tarfile of output plots
    output_files = [
        umi_counts_per_cell, umi_counts_per_class, umi_counts_vs_nonzero_ecs,
        tcc_mean_variance, spectral_clustering, affinity_propagation_tsne,
        affinity_propagation_pca, outfilePath
    ] + ([
        os.path.join(work_dir, SC3_OUTPUT_DIRECTORY, x)
        for x in os.listdir(SC3OutputPath)
    ] if shouldUseSC3Output else [])
    tarball_files(tar_name='single_cell_plots.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    # return file id for consolidation
    return job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'single_cell_plots.tar.gz'))
Example #18
def run_single_cell(job, sample, config):
    """
    Performs single cell analysis through the quay.io/ucsc_cgl/kallisto_sc image (which uses code from the repo:
    https://github.com/pachterlab/scRNA-Seq-TCC-prep).  Output includes TCC matrix from kallisto process.

    :param job: toil job
    :param sample: a [UUID, type, url(s)] triple as constructed by parse_samples
    :param config: configuration for toil job
    """
    # Common logic (for handling pre- and post- Kallisto data)
    config = argparse.Namespace(**vars(config))  # copy the config so per-sample changes (uuid, cores) don't leak into other samples
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    work_dir = job.fileStore.getLocalTempDir()
    # Get input files
    uuid, type, urls = sample
    config.uuid = uuid
    # Handle kallisto output file (only works w/ one file for now)
    if type == "plot":
        filename = os.path.basename(urls[0])
        download_url(job, url=urls[0], name=filename, work_dir=work_dir)
        tar = tarfile.open(name=os.path.join(work_dir, filename))
        root_dir = rstrip(
            os.path.basename(urls[0]), ".tar.gz"
        )  # post, kallisto, plots folders are in this root folder, with same name as the archive
        kallisto_output = None  # could just forward the kallisto output
        post_processing_output = None  # same with this

        # method that, given the location of the file in the tar, writes it to the global job store
        def tarToGlobal(folder, path):
            with closing(tar.extractfile(os.path.join(root_dir, folder,
                                                      path))) as file:
                data = file.read()
                with job.fileStore.writeGlobalFileStream() as (stream, id):
                    stream.write(data)
                    return id

        tcc_matrix_id = tarToGlobal("post", TCC_MATRIX_FILENAME)
        pwise_dist_l1_id = tarToGlobal("post", PWISE_DIST_FILENAME)
        nonzero_ec_id = tarToGlobal("post", NONZERO_EC_FILENAME)
        kallisto_matrix_id = tarToGlobal("post", KALLISTO_MATRIX_FILENAME)
        matrix_tsv_id = tarToGlobal("kallisto", "matrix.tsv")
        matrix_cells_id = tarToGlobal("kallisto", "matrix.cells")
    # Handle fastq file(s)
    else:
        input_location = os.path.join(work_dir, "fastq_input")
        os.mkdir(input_location)
        for url in urls:
            if url.endswith('.tar') or url.endswith('.tar.gz'):
                tar_path = os.path.join(work_dir, os.path.basename(url))
                download_url(job, url=url, work_dir=work_dir)
                subprocess.check_call(
                    ['tar', '-xvf', tar_path, '-C', input_location])
                os.remove(tar_path)
            elif url.endswith('.gz'):
                download_url(job, url=url, work_dir=input_location)
                subprocess.check_call([
                    'gunzip',
                    os.path.join(input_location, os.path.basename(url))
                ])
            else:
                job.fileStore.logToMaster("Download url " + str(url))
                download_url(job, url=url, work_dir=input_location)
        # Generate configuration JSON
        with open(os.path.join(work_dir, "config.json"), 'w') as config_file:
            config_file.write(build_patcherlab_config(config))
        # Get Kallisto index
        download_url(job,
                     url=config.kallisto_index,
                     name='kallisto_index.idx',
                     work_dir=work_dir)
        # Create other locations for patcherlab stuff
        os.mkdir(os.path.join(work_dir, "tcc"))
        os.mkdir(os.path.join(work_dir, "output"))
        if type == "pseudo":
            # Call docker image
            dockerCall(job,
                       tool='quay.io/ucsc_cgl/kallisto_sc:latest',
                       workDir=work_dir,
                       parameters=["/data/config.json"])
        else:  # quantification of quake brain-style paired end fastqs, each for a different cell
            require(type == "quant",
                    "invalid type " + type + " found in manifest ")
            os.mkdir(os.path.join(work_dir, "quant_output"))
            # Call docker image
            dockerCall(job,
                       tool='kallisto_sc_quant',
                       workDir=work_dir,
                       parameters=[
                           "/data/kallisto_index.idx", "/data/quant_output",
                           str(config.cores), "/data/fastq_input"
                       ])
            # Consolidate abundances for the various cells
            quant_output = os.path.join(work_dir, "quant_output")
            consolidated = os.path.join(work_dir, "quant_consolidated")
            os.mkdir(consolidated)
            for output_folder in os.listdir(quant_output):
                shutil.copy(
                    os.path.join(quant_output, output_folder, "abundance.tsv"),
                    os.path.join(consolidated, output_folder + ".tsv"))
            # quant to pseudo
            quant_to_pseudo(None, consolidated, os.path.join(work_dir, "tcc"))
            # run post-processing
            save_dir = os.path.join(work_dir, "save")
            os.mkdir(save_dir)
            # save_dir here should be the same as specified in build_pachterlab_config.
            # It may be worth refactoring so that these don't have to be manually synced,
            # although there's no reason for these values to ever change and thus become
            # desynced.
            prep_tcc_matrix(job,
                            threads=config.cores,
                            tcc_output_dir=os.path.join(work_dir, "tcc"),
                            save_dir=save_dir)
        # Irrespective of whether quant or pseudo, because of quant-to-pseudo conversion
        # Build tarfile of output
        output_files = glob(os.path.join(work_dir, "tcc", "*"))
        tarball_files(tar_name='kallisto_output.tar.gz',
                      file_paths=output_files,
                      output_dir=work_dir)
        kallisto_output = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'kallisto_output.tar.gz'))
        # Consolidate post-processing output
        tcc_matrix_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', TCC_MATRIX_FILENAME))
        pwise_dist_l1_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', PWISE_DIST_FILENAME))
        nonzero_ec_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', NONZERO_EC_FILENAME))
        kallisto_matrix_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'tcc', 'matrix.ec'))
        post_processing_output = {
            TCC_MATRIX_FILENAME: tcc_matrix_id,
            PWISE_DIST_FILENAME: pwise_dist_l1_id,
            NONZERO_EC_FILENAME: nonzero_ec_id,
            KALLISTO_MATRIX_FILENAME:
            kallisto_matrix_id  # technically redundant
        }
        # Prepare files to send to plots for SC3
        matrix_tsv_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, "tcc", "matrix.tsv"))
        matrix_cells_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, "tcc", "matrix.cells"))
    # Graphing step
    if config.generate_graphs:
        graphical_output = job.addChildJobFn(run_data_analysis, config,
                                             tcc_matrix_id, pwise_dist_l1_id,
                                             nonzero_ec_id, kallisto_matrix_id,
                                             matrix_tsv_id,
                                             matrix_cells_id).rv()
        job.addFollowOnJobFn(consolidate_output, config, kallisto_output,
                             graphical_output, post_processing_output)
    else:
        # converts to UUID name scheme and transfers to output location
        consolidate_output(job,
                           config,
                           kallisto_output=kallisto_output,
                           graphical_output=None,
                           post_processing_output=post_processing_output)
Example #19
def archiveBatchAndUploadToFileStore(parent_job, batch, workdir):
    tarname = "%s.tmp" % uuid.uuid4().hex
    tarpath = os.path.join(workdir, tarname)
    tarball_files(tar_name=tarname, file_paths=batch, output_dir=workdir)
    require(os.path.exists(tarpath), "[archiveBatchAndUploadToFileStore] Didn't make batch tarball")
    return parent_job.fileStore.writeGlobalFile(tarpath)
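A minimal usage sketch for the helper above (the caller name, batch size, and list of file paths are hypothetical; only archiveBatchAndUploadToFileStore itself is assumed):

def upload_in_batches(parent_job, file_paths, workdir, batch_size=100):
    # Hypothetical caller: split a potentially large list of local files into
    # fixed-size batches and archive/upload each batch as its own tarball,
    # collecting the resulting FileStoreIDs.
    batch_ids = []
    for i in range(0, len(file_paths), batch_size):
        batch = file_paths[i:i + batch_size]
        batch_ids.append(
            archiveBatchAndUploadToFileStore(parent_job, batch, workdir))
    return batch_ids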
Exemple #20
def run_margin_phase__run_cpecan_alignment(job, config, chunk_identifier,
                                           work_dir, alignment_filename,
                                           reference_filename):
    # prep
    start = time.time()
    fcn_identifier = "run_margin_phase:run_cpecan_alignment"
    log(job, "{}".format(datetime.datetime.now()), chunk_identifier,
        fcn_identifier)
    log(
        job, "Running cPecan positional probabilities on {}".format(
            alignment_filename), chunk_identifier, fcn_identifier)

    # index bam
    _index_bam(job, config, work_dir, alignment_filename)

    # build cPecan args
    out_dir_name = "cPecan_out"
    params = [
        '--ref',
        os.path.join("/data", reference_filename), '--alignment_file',
        os.path.join("/data", alignment_filename), '--workdir_directory',
        '/data/tmp', '--output_directory',
        os.path.join("/data", out_dir_name), '--validate', '--threads',
        str(job.cores)
    ]
    hmm_location = run_margin_phase__infer_cpecan_hmm_location(
        chunk_identifier)
    if hmm_location is not None: params.extend(['--realign_hmm', hmm_location])

    # run cpecan
    docker_call(job, config, work_dir, params, config.cpecan_image,
                config.cpecan_tag)

    # document output
    log_debug_from_docker(job, os.path.join(work_dir, DOCKER_CPECAN_LOG),
                          chunk_identifier, fcn_identifier, [
                              os.path.join(work_dir, alignment_filename),
                              os.path.join(work_dir, reference_filename)
                          ])
    require_docker_file_output(job,
                               config,
                               work_dir,
                               [os.path.join(work_dir, out_dir_name)],
                               fcn_identifier,
                               log_filename=DOCKER_CPECAN_LOG)
    output_files = glob.glob(os.path.join(work_dir, out_dir_name, "*"))
    dir_count = len(list(filter(lambda x: os.path.isdir(x), output_files)))
    file_count = len(list(filter(lambda x: os.path.isfile(x), output_files)))
    log(
        job, "cPecan generated {} outputs ({} directories, {} files)".format(
            len(output_files), dir_count, file_count), chunk_identifier,
        fcn_identifier)
    if os.path.isfile(os.path.join(work_dir, DOCKER_CPECAN_LOG)):
        output_files.append(os.path.join(work_dir, DOCKER_CPECAN_LOG))

    # tarball the output and save
    tarball_name = "{}.nuc_pos_prob.tar.gz".format(chunk_identifier)
    try:
        tarball_files(tar_name=tarball_name,
                      file_paths=output_files,
                      output_dir=os.path.join(work_dir, out_dir_name))
    except Exception as e:
        log(job, "{} error making cPecan tarball: {}".format(type(e), e),
            chunk_identifier, fcn_identifier)
        tarball_files(tar_name=tarball_name,
                      file_paths=output_files,
                      output_dir=work_dir)
        log(job,
            "created tarball in work_dir: {}".format(os.path.join(work_dir)),
            chunk_identifier, fcn_identifier)
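Note that the try/except above writes the tarball into out_dir_name on success and falls back to writing it directly into work_dir if that fails, so a caller that wants to persist the result has to check both locations. A minimal sketch of such a caller (the function name is hypothetical; only os and the Toil fileStore API are assumed):

import os


def locate_and_upload_cpecan_tarball(job, work_dir, out_dir_name, tarball_name):
    # Hypothetical follow-up: the tarball may sit in either out_dir_name or,
    # on the fallback path, directly in work_dir; upload whichever exists.
    for candidate in (os.path.join(work_dir, out_dir_name, tarball_name),
                      os.path.join(work_dir, tarball_name)):
        if os.path.isfile(candidate):
            return job.fileStore.writeGlobalFile(candidate)
    raise RuntimeError("cPecan tarball {} not found".format(tarball_name))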
Exemple #21
def merge_chunks(job, config, chunk_infos):
    # prep
    start = time.time()
    uuid = config.uuid
    work_dir = job.fileStore.getLocalTempDir()
    log(job, "{}".format(datetime.datetime.now()), uuid, 'merge_chunks')
    log(job, "Merging {} chunks".format(len(chunk_infos)), uuid,
        'merge_chunks')
    if config.minimal_output:
        log(
            job,
            "Minimal output is configured, will only save full chromosome vcf and merged BAMs",
            uuid, 'merge_chunks')

    # work directory for tar management
    # output files
    merged_chunks_directory = os.path.join(work_dir, ID_MERGED)
    os.mkdir(merged_chunks_directory)
    full_merged_vcf_file = os.path.join(merged_chunks_directory,
                                        "{}.merged.vcf".format(config.uuid))
    full_merged_sam_file = os.path.join(merged_chunks_directory,
                                        "{}.merged.sam".format(config.uuid))

    # sort by chunk index and validate
    chunk_infos.sort(key=(lambda x: x[CI_CHUNK_INDEX]))
    idx = 0
    missing_indices = []
    for ci in chunk_infos:
        while ci[CI_CHUNK_INDEX] > idx:
            missing_indices.append(idx)
            idx += 1
        idx += 1
    if len(missing_indices) > 0:
        log(
            job, "Found {} missing indices: {}".format(len(missing_indices),
                                                       missing_indices), uuid,
            'merge_chunks')

    # prep for iteration
    merge_decisions = dict()
    prev_chunk_workdir = ""
    prev_chunk_sam_file = None
    prev_chunk_vcf_file = None
    prev_chunk = {CI_CHUNK_INDEX: "start"}
    prev_written_reads = set()
    prev_vcf_split_pos = None
    prev_vcf_phase_action = None

    # iterate over all chunks
    for chunk in chunk_infos:

        # get current chunk info/files
        chunk_idx = chunk[CI_CHUNK_INDEX]
        chunk_boundary = chunk[CI_CHUNK_BOUNDARY_START]
        merging_step_identifier = "{}:{}-{}".format(config.uuid,
                                                    prev_chunk[CI_CHUNK_INDEX],
                                                    chunk[CI_CHUNK_INDEX])
        curr_chunk_workdir = os.path.join(work_dir, "tmp-{}".format(chunk_idx))
        curr_chunk_sam_file, curr_chunk_vcf_file = merge_chunks__extract_chunk_tarball(
            job, config, curr_chunk_workdir, chunk)
        log(
            job, "merging {} and {} across boundary {}".format(
                prev_chunk[CI_CHUNK_INDEX], chunk_idx, chunk_boundary), uuid,
            'merge_chunks')

        # error out if missing files
        if curr_chunk_sam_file is None or curr_chunk_vcf_file is None:
            error = "{}: Missing expected output file, sam:{}, vcf:{}, chunk_info:{}".format(
                chunk_idx, curr_chunk_sam_file, curr_chunk_vcf_file, chunk)
            log(job, error, uuid, 'merge_chunks')
            job.fileStore.logToMaster(error)
            if CONTINUE_AFTER_FAILURE:
                # prev chunk info is maintained, and will be written during next chunk
                continue
            raise UserError("{}:{}".format(uuid, error))

        # skip writing the first chunk
        if prev_chunk_sam_file is None:
            curr_written_reads = set()
            curr_vcf_split_pos = 0
            curr_vcf_phase_action = dict()

        # write the rest of the chunks
        else:
            # get chunk splitting
            prev_reads, curr_reads, curr_vcf_split_pos, curr_vcf_phase_action, decision_summary =\
                merge_chunks__determine_chunk_splitting(job, merging_step_identifier, prev_chunk_sam_file,
                                                        curr_chunk_sam_file, chunk_boundary)
            merge_decisions[decision_summary] =\
                merge_decisions[decision_summary] + 1 if decision_summary in merge_decisions else 1

            # write sam
            curr_written_reads = merge_chunks__append_sam_reads(
                job, merging_step_identifier, prev_chunk_sam_file,
                full_merged_sam_file, prev_reads, prev_written_reads)
            if len(curr_reads) > 0:
                curr_written_right_reads = merge_chunks__append_sam_reads(
                    job, merging_step_identifier, curr_chunk_sam_file,
                    full_merged_sam_file, curr_reads, curr_written_reads)
                curr_written_reads = curr_written_reads.union(
                    curr_written_right_reads)

            # write vcf
            merge_chunks__append_vcf_calls(
                job,
                merging_step_identifier,
                prev_chunk_vcf_file,
                full_merged_vcf_file,
                prev_vcf_split_pos,
                curr_vcf_split_pos,
                prev_vcf_phase_action,
                mp_identifier=prev_chunk[CI_CHUNK_INDEX])

        # cleanup
        if os.path.isdir(prev_chunk_workdir):
            shutil.rmtree(prev_chunk_workdir)

        # iterate
        prev_chunk = chunk
        prev_chunk_workdir = curr_chunk_workdir
        prev_chunk_sam_file = curr_chunk_sam_file
        prev_chunk_vcf_file = curr_chunk_vcf_file
        prev_written_reads = curr_written_reads
        prev_vcf_split_pos = curr_vcf_split_pos
        prev_vcf_phase_action = curr_vcf_phase_action

    # write the final reads and calls
    merging_step_identifier = "{}:{}-{}".format(config.uuid,
                                                prev_chunk[CI_CHUNK_INDEX],
                                                "end")
    merge_chunks__append_sam_reads(job, merging_step_identifier,
                                   prev_chunk_sam_file, full_merged_sam_file,
                                   {None: None}, prev_written_reads)
    merge_chunks__append_vcf_calls(job,
                                   merging_step_identifier,
                                   prev_chunk_vcf_file,
                                   full_merged_vcf_file,
                                   prev_vcf_split_pos,
                                   sys.maxsize,
                                   prev_vcf_phase_action,
                                   mp_identifier=prev_chunk[CI_CHUNK_INDEX])

    # loggit
    log(job, "Finished merge with following matches:", uuid, 'merge_chunks')
    job.fileStore.logToMaster("{}:merge_chunks: ".format(config.uuid))
    for decision, count in merge_decisions.items():
        log(job, "\t\t{}: \t{}".format(decision, count), uuid, 'merge_chunks')

    # tarball the output and save
    log(job, "Output files for merge:".format(), uuid, 'merge_chunks')
    output_file_locations = glob.glob(
        os.path.join(merged_chunks_directory, "*.*"))
    output_file_locations.sort()
    tmp = output_file_locations
    output_file_locations = list()
    for f in tmp:
        if os.path.isdir(f):
            log(job, "\t\t{} (skipped, directory)".format(os.path.basename(f)),
                uuid, 'merge_chunks')
        else:
            log(job, "\t\t{}".format(os.path.basename(f)), uuid,
                'merge_chunks')
            output_file_locations.append(f)
    tarball_name = "{}.merged.tar.gz".format(config.uuid)
    tarball_files(tar_name=tarball_name,
                  file_paths=output_file_locations,
                  output_dir=work_dir)
    output_file_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, tarball_name))
    # we need to return the input list of chunk infos for consolidation
    chunk_infos.append({
        CI_UUID: config.uuid,
        CI_OUTPUT_FILE_ID: output_file_id,
        CI_CHUNK_INDEX: ID_MERGED
    })

    log_generic_job_debug(job, config.uuid, "merge_chunks", work_dir=work_dir)
    log_time(job, "merge_chunks", start, config.uuid)
    return chunk_infos
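merge_chunks returns the input chunk_infos with one extra entry describing the merged tarball (its CI_CHUNK_INDEX is set to ID_MERGED and its CI_OUTPUT_FILE_ID holds the tarball's FileStoreID). A downstream consolidation step could pick that entry out with something like the sketch below (the function name is hypothetical; the CI_* constants and ID_MERGED are the ones used above):

def get_merged_output_file_id(chunk_infos):
    # Hypothetical consumer of merge_chunks' return value: find the entry
    # appended for the merged output and return its FileStoreID, or None if
    # no merged entry is present.
    for chunk_info in chunk_infos:
        if chunk_info.get(CI_CHUNK_INDEX) == ID_MERGED:
            return chunk_info.get(CI_OUTPUT_FILE_ID)
    return None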
Exemple #22
def run_single_cell(job, sample, config):
    """
    Performs single-cell analysis through the quay.io/ucsc_cgl/kallisto_sc image (which uses code from the repo:
    https://github.com/pachterlab/scRNA-Seq-TCC-prep). Output includes the TCC matrix from the kallisto process.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param sample: a sample entry as constructed by the 'parse_samples' function, unpacked as (uuid, urls)
    :param config: configuration for the Toil job
    """
    config = argparse.Namespace(**vars(config))
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    work_dir = job.fileStore.getLocalTempDir()
    # Generate configuration JSON
    with open(os.path.join(work_dir, "config.json"), 'w') as config_file:
        config_file.write(build_patcherlab_config(config))
    # Get Kallisto index
    download_url(job,
                 url=config.kallisto_index,
                 name='kallisto_index.idx',
                 work_dir=work_dir)
    # Get input files
    input_location = os.path.join(work_dir, "fastq_input")
    os.mkdir(input_location)
    uuid, urls = sample
    config.uuid = uuid
    for url in urls:
        if url.endswith('.tar') or url.endswith('.tar.gz'):
            tar_path = os.path.join(work_dir, os.path.basename(url))
            download_url(job, url=url, work_dir=work_dir)
            subprocess.check_call(
                ['tar', '-xvf', tar_path, '-C', input_location])
            os.remove(tar_path)
        else:
            download_url(job, url=url, work_dir=input_location)
    # Create the other output locations expected by the pachterlab pipeline
    os.mkdir(os.path.join(work_dir, "tcc"))
    os.mkdir(os.path.join(work_dir, "output"))

    # Call docker image
    dockerCall(job,
               tool='quay.io/ucsc_cgl/kallisto_sc:latest',
               workDir=work_dir,
               parameters=["/data/config.json"])

    # Build tarfile of output
    output_files = [
        os.path.join(work_dir, "tcc", x)
        for x in ['run_info.json', 'matrix.tsv', 'matrix.ec', 'matrix.cells']
    ]
    tarball_files(tar_name='kallisto_output.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    kallisto_output = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'kallisto_output.tar.gz'))
    # Graphing step
    if config.generate_graphs:
        tcc_matrix_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', 'TCC_matrix.dat'))
        pwise_dist_l1_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', 'pwise_dist_L1.dat'))
        nonzero_ec_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', 'nonzero_ec.dat'))
        kallisto_matrix_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'tcc', 'matrix.ec'))

        graphical_output = job.addChildJobFn(run_data_analysis, config,
                                             tcc_matrix_id, pwise_dist_l1_id,
                                             nonzero_ec_id,
                                             kallisto_matrix_id).rv()

        job.addFollowOnJobFn(consolidate_output, config, kallisto_output,
                             graphical_output)
    else:
        # converts to UUID name scheme and transfers to output location
        consolidate_output(job,
                           config,
                           kallisto_output=kallisto_output,
                           graphical_output=None)
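Since run_single_cell unpacks sample as (uuid, urls), each entry produced by parse_samples is expected to be a pair of a sample identifier and a list of FASTQ (or FASTQ tarball) URLs. A hypothetical example of such an entry and how it might be scheduled (the UUID, bucket path, and root_job are made up for illustration):

# Hypothetical sample entry: (uuid, urls); the bucket path is not real.
sample = ('sample-001',
          ['s3://example-bucket/sample-001/test_fastq.tar.gz'])
# From a root Toil job, with config built by the workflow's setup code:
# root_job.addChildJobFn(run_single_cell, sample, config)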