Example #1
0
def run_phlat(job, fastqs, sample_type, univ_options, phlat_options):
    """
    Call HLA haplotypes with PHLAT on a pair of input fastqs of type `sample_type`.

    :param list fastqs: Input fastq files; entries 0 and 1 are used as the read pair
    :param str sample_type: Description of the sample type. Passed to PHLAT via `-tag` and used
           as the prefix of the `<sample_type>_HLA.sum` file that is stored
    :param dict univ_options: Dict of universal options used by almost all tools ('patient' and
           'dockerhub' are read here)
    :param dict phlat_options: Options specific to PHLAT ('index', 'n' and 'version' are read
           here)
    :return: fsID for the HLA haplotype called from the input fastqs
    :rtype: toil.fileStore.FileID
    """
    job.fileStore.logToMaster(
        'Running phlat on %s:%s' % (univ_options['patient'], sample_type))
    work_dir = os.getcwd()
    staged = get_files_from_filestore(
        job,
        {
            'input_1.fastq': fastqs[0],
            'input_2.fastq': fastqs[1],
            'phlat_index.tar.gz': phlat_options['index']
        },
        work_dir,
        docker=False)
    # Handle gzipped files: only the first mate is inspected and the same suffix is then applied
    # to both mates through a '.gz' symlink.
    suffix = ''
    if is_gzipfile(staged['input_1.fastq']):
        suffix = '.gz'
        for fq_name in ('input_1.fastq', 'input_2.fastq'):
            gz_name = fq_name + suffix
            os.symlink(fq_name, gz_name)
            staged[gz_name] = staged[fq_name] + suffix
    # Untar the index
    staged['phlat_index'] = untargz(staged['phlat_index.tar.gz'], work_dir)
    # Every staged path goes through docker_path before being handed to the tool
    in_docker = {}
    for name, path in staged.items():
        in_docker[name] = docker_path(path)

    parameters = ['-1', in_docker['input_1.fastq' + suffix],
                  '-2', in_docker['input_2.fastq' + suffix],
                  '-index', in_docker['phlat_index'],
                  '-b2url', '/usr/local/bin/bowtie2',
                  '-tag', sample_type,
                  '-e', '/home/phlat-1.0',  # Phlat directory home
                  '-o', '/data',  # Output directory
                  '-p', str(phlat_options['n'])]  # Number of threads
    docker_call(tool='phlat',
                tool_parameters=parameters,
                work_dir=work_dir,
                dockerhub=univ_options['dockerhub'],
                tool_version=phlat_options['version'])
    # The summary file is picked up from the working directory
    summary_path = work_dir + '/' + sample_type + '_HLA.sum'
    return job.fileStore.writeGlobalFile(summary_path)
Example #2
0
def run_bwa(job, fastqs, sample_type, univ_options, bwa_options):
    """
    Align a pair of fastqs with `bwa mem` and store the resulting sam.

    :param list fastqs: The input fastqs for alignment; entries 0 and 1 are used as the pair
    :param str sample_type: Description of the sample to inject into the filename; the sam is
           written to `<sample_type>.sam`
    :param dict univ_options: Dict of universal options used by almost all tools ('ref',
           'dockerhub' and 'patient' are read here)
    :param dict bwa_options: Options specific to bwa ('index', 'n' and 'version' are read here)
    :return: fsID for the generated sam
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    staged = get_files_from_filestore(
        job,
        {
            'dna_1.fastq': fastqs[0],
            'dna_2.fastq': fastqs[1],
            'bwa_index.tar.gz': bwa_options['index']
        },
        work_dir,
        docker=False)
    # Handle gzipped file: the first mate decides the suffix used for both mates
    suffix = ''
    if is_gzipfile(staged['dna_1.fastq']):
        suffix = '.gz'
        for fq_name in ('dna_1.fastq', 'dna_2.fastq'):
            gz_name = fq_name + suffix
            os.symlink(fq_name, gz_name)
            staged[gz_name] = staged[fq_name] + suffix
    # Untar the index
    staged['bwa_index'] = untargz(staged['bwa_index.tar.gz'], work_dir)
    in_docker = {}
    for name, path in staged.items():
        in_docker[name] = docker_path(path)

    parameters = ['mem',
                  '-t', str(bwa_options['n']),
                  '-v', '1',  # Don't print INFO messages to the stderr
                  # Index prefix inside the untarred index directory
                  '/'.join([in_docker['bwa_index'], univ_options['ref']]),
                  in_docker['dna_1.fastq' + suffix],
                  in_docker['dna_2.fastq' + suffix]]
    sam_path = work_dir + '/' + sample_type + '.sam'
    # The open handle is passed to docker_call as `outfile`
    with open(sam_path, 'w') as samfile:
        docker_call(tool='bwa',
                    tool_parameters=parameters,
                    work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'],
                    outfile=samfile,
                    tool_version=bwa_options['version'])
    output_file = job.fileStore.writeGlobalFile(sam_path)

    job.fileStore.logToMaster(
        'Ran bwa on %s:%s successfully' % (univ_options['patient'], sample_type))
    return output_file
Example #3
0
def run_bwa(job, fastqs, sample_type, univ_options, bwa_options):
    """
    This module aligns the SAMPLE_TYPE dna fastqs to the reference using `bwa mem`

    ARGUMENTS
    1. fastqs: Indexable pair of input WGS/WXS fastqs
         [<JSid for 1.fastq> , <JSid for 2.fastq>]
    2. sample_type: string used as the prefix of the output sam and in the log message
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient identifier used in the log message>
                +- 'dockerhub': <dockerhub to use>
    4. bwa_options: Dict of parameters specific to bwa
         bwa_options
              |- 'tool_index': <JSid for the bwa index tarball>
              +- 'n': <number of threads to allocate>

    RETURN VALUES
    1. output_file: <JSid for '<sample_type>_aligned.sam'>
    """
    job.fileStore.logToMaster(
        'Running bwa on %s:%s' % (univ_options['patient'], sample_type))
    work_dir = os.getcwd()
    staged = get_files_from_filestore(
        job,
        {'dna_1.fastq': fastqs[0],
         'dna_2.fastq': fastqs[1],
         'bwa_index.tar.gz': bwa_options['tool_index']},
        work_dir,
        docker=False)
    # Handle gzipped file: the first mate decides the suffix used for both mates
    suffix = ''
    if is_gzipfile(staged['dna_1.fastq']):
        suffix = '.gz'
        for fq_name in ('dna_1.fastq', 'dna_2.fastq'):
            gz_name = fq_name + suffix
            os.symlink(fq_name, gz_name)
            staged[gz_name] = staged[fq_name] + suffix
    # Untar the index
    staged['bwa_index'] = untargz(staged['bwa_index.tar.gz'], work_dir)
    in_docker = {}
    for name, path in staged.items():
        in_docker[name] = docker_path(path)

    parameters = [
        'mem',
        '-t', str(bwa_options['n']),
        '-v', '1',  # Don't print INFO messages to the stderr
        # The index prefix is hard-coded to 'hg19' inside the untarred index directory
        '/'.join([in_docker['bwa_index'], 'hg19']),
        in_docker['dna_1.fastq' + suffix],
        in_docker['dna_2.fastq' + suffix],
    ]
    sam_path = work_dir + '/' + sample_type + '_aligned.sam'
    # The open handle is passed to docker_call as `outfile`
    with open(sam_path, 'w') as samfile:
        docker_call(tool='bwa', tool_parameters=parameters, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], outfile=samfile)
    return job.fileStore.writeGlobalFile(sam_path)
Example #4
0
def run_phlat(job, fastqs, sample_type, univ_options, phlat_options):
    """
    This module will run PHLAT on SAMPLE_TYPE fastqs.

    ARGUMENTS
    1. fastqs: Indexable pair of input fastqs
         [<JSid for 1.fastq> , <JSid for 2.fastq>]
    2. sample_type: string passed to PHLAT as `-tag` and used as the prefix of the
                    '<sample_type>_HLA.sum' file that is stored
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient identifier used in the log message>
                +- 'dockerhub': <dockerhub to use>
    4. phlat_options: Dict of parameters specific to phlat
         phlat_options
              |- 'tool_index': <JSid for the PHLAT index tarball>
              +- 'n': <number of threads to allocate>

    RETURN VALUES
    1. output_file: <JSid for the allele predictions for sample_type>
    """
    job.fileStore.logToMaster('Running phlat on %s:%s' % (univ_options['patient'], sample_type))
    # NOTE: a leftover debugging dump of `phlat_options` to stderr used to live here. It was
    # dropped because it only added noise to the worker logs.
    work_dir = os.getcwd()
    input_files = {
        'input_1.fastq': fastqs[0],
        'input_2.fastq': fastqs[1],
        'phlat_index.tar.gz': phlat_options['tool_index']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    # Handle gzipped files: only the first mate is inspected and the same suffix is applied to
    # both mates through a '.gz' symlink.
    gz = '.gz' if is_gzipfile(input_files['input_1.fastq']) else ''
    if gz:
        for read_file in 'input_1.fastq', 'input_2.fastq':
            os.symlink(read_file, read_file + gz)
            input_files[read_file + gz] = input_files[read_file] + gz
    # Untar the index
    input_files['phlat_index'] = untargz(input_files['phlat_index.tar.gz'], work_dir)
    # Every staged path goes through docker_path before being handed to the tool
    input_files = {key: docker_path(path) for key, path in input_files.items()}

    parameters = ['-1', input_files['input_1.fastq' + gz],
                  '-2', input_files['input_2.fastq' + gz],
                  '-index', input_files['phlat_index'],
                  '-b2url', '/usr/local/bin/bowtie2',
                  '-tag', sample_type,
                  '-e', '/home/phlat-1.0',  # Phlat directory home
                  '-o', '/data',  # Output directory
                  '-p', str(phlat_options['n'])]  # Number of threads
    docker_call(tool='phlat', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    # The summary file is picked up from the working directory
    output_file = job.fileStore.writeGlobalFile(''.join([work_dir, '/', sample_type, '_HLA.sum']))
    return output_file
Example #5
0
def run_phlat(job, fastqs, sample_type, univ_options, phlat_options):
    """
    This module will run PHLAT on SAMPLE_TYPE fastqs.

    ARGUMENTS
    1. fastqs: Indexable pair of input fastqs
         [<JSid for 1.fastq> , <JSid for 2.fastq>]
    2. sample_type: string passed to PHLAT as `-tag` and used as the prefix of the
                    '<sample_type>_HLA.sum' file that is stored
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient identifier used in the log message>
                +- 'dockerhub': <dockerhub to use>
    4. phlat_options: Dict of parameters specific to phlat
         phlat_options
              |- 'tool_index': <JSid for the PHLAT index tarball>
              +- 'n': <number of threads to allocate>

    RETURN VALUES
    1. output_file: <JSid for the allele predictions for sample_type>
    """
    job.fileStore.logToMaster(
        'Running phlat on %s:%s' % (univ_options['patient'], sample_type))
    work_dir = os.getcwd()
    staged = get_files_from_filestore(
        job,
        {'input_1.fastq': fastqs[0],
         'input_2.fastq': fastqs[1],
         'phlat_index.tar.gz': phlat_options['tool_index']},
        work_dir,
        docker=False)
    # Handle gzipped files: the first mate decides the suffix used for both mates
    suffix = ''
    if is_gzipfile(staged['input_1.fastq']):
        suffix = '.gz'
        for fq_name in ('input_1.fastq', 'input_2.fastq'):
            gz_name = fq_name + suffix
            os.symlink(fq_name, gz_name)
            staged[gz_name] = staged[fq_name] + suffix
    # Untar the index
    staged['phlat_index'] = untargz(staged['phlat_index.tar.gz'], work_dir)
    in_docker = {}
    for name, path in staged.items():
        in_docker[name] = docker_path(path)

    parameters = [
        '-1', in_docker['input_1.fastq' + suffix],
        '-2', in_docker['input_2.fastq' + suffix],
        '-index', in_docker['phlat_index'],
        '-b2url', '/usr/local/bin/bowtie2',
        '-tag', sample_type,
        '-e', '/home/phlat-1.0',  # Phlat directory home
        '-o', '/data',  # Output directory
        '-p', str(phlat_options['n']),  # Number of threads
    ]
    docker_call(tool='phlat', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    # The summary file is picked up from the working directory
    summary_path = work_dir + '/' + sample_type + '_HLA.sum'
    return job.fileStore.writeGlobalFile(summary_path)
Example #6
0
def run_cutadapt(job, fastqs, univ_options, cutadapt_options):
    """
    Trim adapters from the input RNA fastq pair with cutadapt.

    :param list fastqs: List of fsIDs for an input RNA-Seq fastq pair (entries 0 and 1 are used)
    :param dict univ_options: Dict of universal options used by almost all tools ('dockerhub' and
           'patient' are read here)
    :param dict cutadapt_options: Options specific to cutadapt ('a', 'A' and 'version' are read
           here)
    :return: List of fsIDs of cutadapted fastqs, in the order [R1, R2]
    :rtype: list[toil.fileStore.FileID]
    """
    work_dir = os.getcwd()
    staged = get_files_from_filestore(
        job,
        {'rna_1.fastq': fastqs[0], 'rna_2.fastq': fastqs[1]},
        work_dir,
        docker=False)
    # Handle gzipped file: the first mate decides the suffix used for both mates
    suffix = ''
    if is_gzipfile(staged['rna_1.fastq']):
        suffix = '.gz'
        for fq_name in ('rna_1.fastq', 'rna_2.fastq'):
            gz_name = fq_name + suffix
            os.symlink(fq_name, gz_name)
            staged[gz_name] = staged[fq_name] + suffix
    in_docker = {}
    for name, path in staged.items():
        in_docker[name] = docker_path(path)

    trimmed_names = ['rna_cutadapt_1.fastq.gz', 'rna_cutadapt_2.fastq.gz']
    parameters = ['-a', cutadapt_options['a'],  # Fwd read 3' adapter
                  '-A', cutadapt_options['A'],  # Rev read 3' adapter
                  '-m', '35',  # Minimum size of read
                  '-o', docker_path(trimmed_names[0]),  # Output for R1
                  '-p', docker_path(trimmed_names[1]),  # Output for R2
                  in_docker['rna_1.fastq' + suffix],
                  in_docker['rna_2.fastq' + suffix]]
    docker_call(tool='cutadapt',
                tool_parameters=parameters,
                work_dir=work_dir,
                dockerhub=univ_options['dockerhub'],
                tool_version=cutadapt_options['version'])
    # Store both trimmed fastqs from the working directory, R1 first
    output_files = [job.fileStore.writeGlobalFile(work_dir + '/' + trimmed)
                    for trimmed in trimmed_names]
    job.fileStore.logToMaster(
        'Ran cutadapt on %s successfully' % univ_options['patient'])
    return output_files
Example #7
0
def get_patient_vcf(job, patient_dict):
    """
    Convenience function to get the vcf from the patient dict, decompressing it if necessary.

    The file referenced by patient_dict['mutation_vcf'] is read into the working directory. If it
    is gzipped, it is decompressed, the decompressed copy is written to the file store and the
    gzipped entry is deleted from the file store. Otherwise the original fsID is returned as is.

    :param dict patient_dict: dict of patient info; 'mutation_vcf' holds the fsID of the vcf
    :return: The vcf
    :rtype: toil.fileStore.FileID
    """
    vcf_fsid = patient_dict['mutation_vcf']
    temp = job.fileStore.readGlobalFile(vcf_fsid, os.path.join(os.getcwd(), 'temp.gz'))
    if not is_gzipfile(temp):
        return vcf_fsid
    # gunzip only ever sees the local copy, so its result cannot be a file store ID. Write the
    # decompressed file back to the file store so that both branches return a FileID, and do it
    # before the gzipped original is deleted.
    # NOTE(review): assumes gunzip returns the path of the decompressed file -- confirm.
    outfile = job.fileStore.writeGlobalFile(gunzip(temp))
    job.fileStore.deleteGlobalFile(vcf_fsid)
    return outfile
Example #8
0
def run_cutadapt(job, fastqs, univ_options, cutadapt_options):
    """
    This module runs cutadapt on the input RNA fastq files.

    ARGUMENTS
    1. fastqs: List of input RNA-Seq fastqs [<JSid for 1.fastq> , <JSid for 2.fastq>]
    2. univ_options: Dict of universal arguments used by almost all tools
         univ_options
              |- 'patient': <patient identifier used in the log message>
              +- 'dockerhub': <dockerhub to use>
    3. cutadapt_options: Dict of parameters specific to cutadapt
         cutadapt_options
              |- 'a': <sequence of 3' adapter to trim from fwd read>
              +- 'A': <sequence of 3' adapter to trim from rev read>
    RETURN VALUES
    1. output_files: List of cutadapted fastqs
         [<JSid for rna_cutadapt_1.fastq.gz>, <JSid for rna_cutadapt_2.fastq.gz>]
    """
    job.fileStore.logToMaster('Running cutadapt on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    staged = get_files_from_filestore(
        job,
        {'rna_1.fastq': fastqs[0], 'rna_2.fastq': fastqs[1]},
        work_dir,
        docker=False)
    # Handle gzipped file: the first mate decides the suffix used for both mates
    suffix = ''
    if is_gzipfile(staged['rna_1.fastq']):
        suffix = '.gz'
        for fq_name in ('rna_1.fastq', 'rna_2.fastq'):
            gz_name = fq_name + suffix
            os.symlink(fq_name, gz_name)
            staged[gz_name] = staged[fq_name] + suffix
    in_docker = {}
    for name, path in staged.items():
        in_docker[name] = docker_path(path)

    trimmed_names = ['rna_cutadapt_1.fastq.gz', 'rna_cutadapt_2.fastq.gz']
    parameters = [
        '-a', cutadapt_options['a'],  # Fwd read 3' adapter
        '-A', cutadapt_options['A'],  # Rev read 3' adapter
        '-m', '35',  # Minimum size of read
        '-o', docker_path(trimmed_names[0]),  # Output for R1
        '-p', docker_path(trimmed_names[1]),  # Output for R2
        in_docker['rna_1.fastq' + suffix],
        in_docker['rna_2.fastq' + suffix],
    ]
    docker_call(tool='cutadapt', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    # Store both trimmed fastqs from the working directory, R1 first
    return [job.fileStore.writeGlobalFile(work_dir + '/' + trimmed)
            for trimmed in trimmed_names]
Example #9
0
def run_cutadapt(job, fastqs, univ_options, cutadapt_options):
    """
    This module runs cutadapt on the input RNA fastq files.

    ARGUMENTS
    1. fastqs: List of input RNA-Seq fastqs [<JSid for 1.fastq> , <JSid for 2.fastq>]
    2. univ_options: Dict of universal arguments used by almost all tools
         univ_options
              |- 'patient': <patient identifier used in the log message>
              +- 'dockerhub': <dockerhub to use>
    3. cutadapt_options: Dict of parameters specific to cutadapt
         cutadapt_options
              |- 'a': <sequence of 3' adapter to trim from fwd read>
              +- 'A': <sequence of 3' adapter to trim from rev read>
    RETURN VALUES
    1. output_files: List of cutadapted fastqs
         [<JSid for rna_cutadapt_1.fastq.gz>, <JSid for rna_cutadapt_2.fastq.gz>]
    """
    job.fileStore.logToMaster('Running cutadapt on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    read_ids = {'rna_1.fastq': fastqs[0], 'rna_2.fastq': fastqs[1]}
    local_reads = get_files_from_filestore(job, read_ids, work_dir, docker=False)
    # Handle gzipped file: only the first mate is inspected; both mates get a '.gz' symlink
    gz_ext = ''
    if is_gzipfile(local_reads['rna_1.fastq']):
        gz_ext = '.gz'
        for mate in ('rna_1.fastq', 'rna_2.fastq'):
            os.symlink(mate, mate + gz_ext)
            local_reads[mate + gz_ext] = local_reads[mate] + gz_ext
    container_reads = {}
    for key, host_path in local_reads.items():
        container_reads[key] = docker_path(host_path)

    out_r1 = 'rna_cutadapt_1.fastq.gz'
    out_r2 = 'rna_cutadapt_2.fastq.gz'
    parameters = ['-a', cutadapt_options['a'],  # Fwd read 3' adapter
                  '-A', cutadapt_options['A'],  # Rev read 3' adapter
                  '-m', '35',  # Minimum size of read
                  '-o', docker_path(out_r1),  # Output for R1
                  '-p', docker_path(out_r2),  # Output for R2
                  container_reads['rna_1.fastq' + gz_ext],
                  container_reads['rna_2.fastq' + gz_ext]]
    docker_call(tool='cutadapt', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    # Store both trimmed fastqs from the working directory, R1 first
    output_files = []
    for trimmed in (out_r1, out_r2):
        output_files.append(job.fileStore.writeGlobalFile(work_dir + '/' + trimmed))
    return output_files
Example #10
0
def run_bwa(job, fastqs, sample_type, univ_options, bwa_options):
    """
    This module aligns the SAMPLE_TYPE dna fastqs to the reference using `bwa mem`

    ARGUMENTS
    1. fastqs: Indexable pair of input WGS/WXS fastqs
         [<JSid for 1.fastq> , <JSid for 2.fastq>]
    2. sample_type: string used as the prefix of the output sam and in the log message
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient identifier used in the log message>
                +- 'dockerhub': <dockerhub to use>
    4. bwa_options: Dict of parameters specific to bwa
         bwa_options
              |- 'tool_index': <JSid for the bwa index tarball>
              +- 'n': <number of threads to allocate>

    RETURN VALUES
    1. output_file: <JSid for '<sample_type>_aligned.sam'>
    """
    job.fileStore.logToMaster(
        'Running bwa on %s:%s' % (univ_options['patient'], sample_type))
    work_dir = os.getcwd()
    requested = {
        'dna_1.fastq': fastqs[0],
        'dna_2.fastq': fastqs[1],
        'bwa_index.tar.gz': bwa_options['tool_index']
    }
    local_files = get_files_from_filestore(job, requested, work_dir, docker=False)
    # Handle gzipped file: only the first mate is inspected; both mates get a '.gz' symlink
    gz_ext = ''
    if is_gzipfile(local_files['dna_1.fastq']):
        gz_ext = '.gz'
        for mate in ('dna_1.fastq', 'dna_2.fastq'):
            os.symlink(mate, mate + gz_ext)
            local_files[mate + gz_ext] = local_files[mate] + gz_ext
    # Untar the index
    local_files['bwa_index'] = untargz(local_files['bwa_index.tar.gz'], work_dir)
    container_files = {}
    for key, host_path in local_files.items():
        container_files[key] = docker_path(host_path)

    # The index prefix is hard-coded to 'hg19' inside the untarred index directory
    index_prefix = '/'.join([container_files['bwa_index'], 'hg19'])
    parameters = ['mem',
                  '-t', str(bwa_options['n']),
                  '-v', '1',  # Don't print INFO messages to the stderr
                  index_prefix,
                  container_files['dna_1.fastq' + gz_ext],
                  container_files['dna_2.fastq' + gz_ext]]
    sam_path = work_dir + '/' + sample_type + '_aligned.sam'
    # The open handle is passed to docker_call as `outfile`
    with open(sam_path, 'w') as samfile:
        docker_call(tool='bwa',
                    tool_parameters=parameters,
                    work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'],
                    outfile=samfile)
    return job.fileStore.writeGlobalFile(sam_path)
Example #11
0
def run_star(job, fastqs, univ_options, star_options):
    """
    Align a pair of fastqs with STAR.

    :param list fastqs: The input fastqs for alignment (entries 0 and 1 are used as the pair)
    :param dict univ_options: Dict of universal options used by almost all tools ('dockerhub' and
           'patient' are read here; the whole dict is forwarded to export_results)
    :param dict star_options: Options specific to star ('type', 'index', 'n' and 'version' are
           read here; 'type' must be 'star' or 'starlong')
    :return: Dict containing output transcriptome bam, genome bam and chimeric junctions
                 output_files:
                    |- 'rnaAligned.toTranscriptome.out.bam': fsID
                    |- 'rnaAligned.out.bam': fsID
                    +- 'rnaChimeric.out.junction': fsID
    :rtype: dict
    """
    assert star_options['type'] in ('star', 'starlong')
    work_dir = os.getcwd()
    input_files = {
        'rna_cutadapt_1.fastq': fastqs[0],
        'rna_cutadapt_2.fastq': fastqs[1],
        'star_index.tar.gz': star_options['index']}
    input_files = get_files_from_filestore(job, input_files, work_dir,
                                           docker=False)
    # Handle gzipped file: only the first mate is inspected and the same suffix is applied to
    # both mates through a '.gz' symlink.
    gz = '.gz' if is_gzipfile(input_files['rna_cutadapt_1.fastq']) else ''
    if gz:
        for read_file in 'rna_cutadapt_1.fastq', 'rna_cutadapt_2.fastq':
            os.symlink(read_file, read_file + gz)
            input_files[read_file + gz] = input_files[read_file] + gz
    # Untar the index
    input_files['star_index'] = untargz(input_files['star_index.tar.gz'], work_dir)

    # Check to see if user is using a STAR-Fusion index
    star_fusion_idx = os.path.join(input_files['star_index'], 'ref_genome.fa.star.idx')
    if os.path.exists(star_fusion_idx):
        input_files['star_index'] = star_fusion_idx

    input_files = {key: docker_path(path, work_dir=work_dir) for key, path in input_files.items()}

    # Using recommended STAR-Fusion parameters:
    # https://github.com/STAR-Fusion/STAR-Fusion/wiki
    # NOTE: --chimSegmentReadGapMax takes a single integer. A stray literal 'parameter' token
    # used to precede the '3' here and has been dropped.
    parameters = ['--runThreadN', str(star_options['n']),
                  '--genomeDir', input_files['star_index'],
                  '--twopassMode', 'Basic',
                  '--outReadsUnmapped', 'None',
                  '--chimSegmentMin', '12',
                  '--chimJunctionOverhangMin', '12',
                  '--alignSJDBoverhangMin', '10',
                  '--alignMatesGapMax', '200000',
                  '--alignIntronMax', '200000',
                  '--chimSegmentReadGapMax', '3',
                  '--alignSJstitchMismatchNmax', '5', '-1', '5', '5',
                  '--outFileNamePrefix', 'rna',
                  '--readFilesIn',
                  input_files['rna_cutadapt_1.fastq' + gz],
                  input_files['rna_cutadapt_2.fastq' + gz],
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outSAMtype', 'BAM', 'Unsorted',
                  '--quantMode', 'TranscriptomeSAM']
    if gz:
        # Gzipped reads are decompressed through zcat
        parameters.extend(['--readFilesCommand', 'zcat'])

    # Anything other than 'star' runs the 'starlong' tool (see the assert above)
    tool = 'star' if star_options['type'] == 'star' else 'starlong'
    docker_call(tool=tool, tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], tool_version=star_options['version'])
    output_files = defaultdict()
    for output_file in ['rnaAligned.toTranscriptome.out.bam',
                        'rnaAligned.out.bam',
                        'rnaChimeric.out.junction']:
        output_files[output_file] = job.fileStore.writeGlobalFile('/'.join([work_dir, output_file]))
    # Only the transcriptome bam and the chimeric junctions are exported here
    export_results(job, output_files['rnaAligned.toTranscriptome.out.bam'], 'rna_transcriptome.bam',
                   univ_options, subfolder='alignments')
    export_results(job, output_files['rnaChimeric.out.junction'], 'rna_chimeric.junction',
                   univ_options, subfolder='mutations/fusions')
    job.fileStore.logToMaster('Ran STAR on %s successfully' % univ_options['patient'])
    return output_files
Example #12
0
def run_star(job, fastqs, univ_options, star_options):
    """
    This module uses STAR to align the RNA fastqs to the reference

    ARGUMENTS
    1. fastqs: Indexable pair of RNA fastqs (REFER RETURN VALUE of run_cutadapt())
    2. univ_options: Dict of universal arguments used by almost all tools
         univ_options
              |- 'patient': <patient identifier used in the log message>
              +- 'dockerhub': <dockerhub to use>
    3. star_options: Dict of parameters specific to STAR
         star_options
             |- 'type': 'star' or 'starlong'
             |- 'tool_index': <JSid for the STAR index tarball>
             +- 'n': <number of threads to allocate>
    RETURN VALUES
    1. output_files: Dict of aligned bams
         output_files
             |- 'rnaAligned.toTranscriptome.out.bam': <JSid>
             +- 'rnaAligned.sortedByCoord.out.bam': <JSid>
    """
    assert star_options['type'] in ('star', 'starlong')
    job.fileStore.logToMaster('Running STAR on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    staged = get_files_from_filestore(
        job,
        {'rna_cutadapt_1.fastq': fastqs[0],
         'rna_cutadapt_2.fastq': fastqs[1],
         'star_index.tar.gz': star_options['tool_index']},
        work_dir,
        docker=False)
    # Handle gzipped file: the first mate decides the suffix used for both mates
    suffix = ''
    if is_gzipfile(staged['rna_cutadapt_1.fastq']):
        suffix = '.gz'
        for fq_name in ('rna_cutadapt_1.fastq', 'rna_cutadapt_2.fastq'):
            gz_name = fq_name + suffix
            os.symlink(fq_name, gz_name)
            staged[gz_name] = staged[fq_name] + suffix
    # Untar the index
    staged['star_index'] = untargz(staged['star_index.tar.gz'], work_dir)
    in_docker = {}
    for name, path in staged.items():
        in_docker[name] = docker_path(path)

    parameters = [
        '--runThreadN', str(star_options['n']),
        '--genomeDir', in_docker['star_index'],
        '--outFileNamePrefix', 'rna',
        '--readFilesIn',
        in_docker['rna_cutadapt_1.fastq' + suffix],
        in_docker['rna_cutadapt_2.fastq' + suffix],
        '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
        '--outSAMtype', 'BAM', 'SortedByCoordinate',
        '--quantMode', 'TranscriptomeSAM',
    ]
    if suffix:
        # Gzipped reads are decompressed through zcat
        parameters += ['--readFilesCommand', 'zcat']
    # Anything other than 'star' runs the 'starlong' tool (see the assert above)
    aligner = 'star' if star_options['type'] == 'star' else 'starlong'
    docker_call(tool=aligner, tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    output_files = defaultdict()
    for bam_name in ('rnaAligned.toTranscriptome.out.bam',
                     'rnaAligned.sortedByCoord.out.bam'):
        output_files[bam_name] = job.fileStore.writeGlobalFile(work_dir + '/' + bam_name)
    return output_files
Example #13
0
def run_star(job, fastqs, univ_options, star_options):
    """
    This module uses STAR to align the RNA fastqs to the reference

    ARGUMENTS
    1. fastqs: Indexable pair of RNA fastqs (REFER RETURN VALUE of run_cutadapt())
    2. univ_options: Dict of universal arguments used by almost all tools
         univ_options
              |- 'patient': <patient identifier used in the log message>
              +- 'dockerhub': <dockerhub to use>
    3. star_options: Dict of parameters specific to STAR
         star_options
             |- 'type': 'star' or 'starlong'
             |- 'tool_index': <JSid for the STAR index tarball>
             +- 'n': <number of threads to allocate>
    RETURN VALUES
    1. output_files: Dict of aligned bams
         output_files
             |- 'rnaAligned.toTranscriptome.out.bam': <JSid>
             +- 'rnaAligned.sortedByCoord.out.bam': <JSid>
    """
    assert star_options['type'] in ('star', 'starlong')
    job.fileStore.logToMaster('Running STAR on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    requested = {
        'rna_cutadapt_1.fastq': fastqs[0],
        'rna_cutadapt_2.fastq': fastqs[1],
        'star_index.tar.gz': star_options['tool_index']
    }
    local_files = get_files_from_filestore(job, requested, work_dir, docker=False)
    # Handle gzipped file: only the first mate is inspected; both mates get a '.gz' symlink
    gz_ext = ''
    if is_gzipfile(local_files['rna_cutadapt_1.fastq']):
        gz_ext = '.gz'
        for mate in ('rna_cutadapt_1.fastq', 'rna_cutadapt_2.fastq'):
            os.symlink(mate, mate + gz_ext)
            local_files[mate + gz_ext] = local_files[mate] + gz_ext
    # Untar the index
    local_files['star_index'] = untargz(local_files['star_index.tar.gz'], work_dir)
    container_files = {}
    for key, host_path in local_files.items():
        container_files[key] = docker_path(host_path)

    parameters = ['--runThreadN', str(star_options['n']),
                  '--genomeDir', container_files['star_index'],
                  '--outFileNamePrefix', 'rna',
                  '--readFilesIn',
                  container_files['rna_cutadapt_1.fastq' + gz_ext],
                  container_files['rna_cutadapt_2.fastq' + gz_ext],
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--quantMode', 'TranscriptomeSAM']
    if gz_ext:
        # Gzipped reads are decompressed through zcat
        parameters += ['--readFilesCommand', 'zcat']
    # Anything other than 'star' runs the 'starlong' tool (see the assert above)
    if star_options['type'] == 'star':
        tool_name = 'star'
    else:
        tool_name = 'starlong'
    docker_call(tool=tool_name,
                tool_parameters=parameters,
                work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    output_files = defaultdict()
    bam_names = ('rnaAligned.toTranscriptome.out.bam',
                 'rnaAligned.sortedByCoord.out.bam')
    for bam_name in bam_names:
        output_files[bam_name] = job.fileStore.writeGlobalFile(work_dir + '/' + bam_name)
    return output_files