Example #1
0
def process_parameters(params):
    '''
    Conduct error handling for all parameters passed to the program.

    Validates the STAR/STARlong (and, when indexing is needed, twoBitToFa)
    executables, determines the read length from the input fastq, picks the
    STAR binary appropriate for that read length, and resolves or builds the
    STAR index.

    RETURN VALUES
    star_executable - path to the STAR binary chosen for the read length
    index_path - path to the directory holding the STAR indexes
    '''
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') +
          ': Processing input parameters.',
          file=params.logfile)
    # Do the STAR executables point to valid files?
    params.STAR_executable = pi_errors.test_param_value(
        params.STAR_executable, 'STAR', '--STAR', params.logfile)
    # BUGFIX: the validated STARlong path was previously stored back into
    # params.STAR_executable, clobbering the STAR path validated above and
    # leaving STARlong_executable unvalidated.
    params.STARlong_executable = pi_errors.test_param_value(
        params.STARlong_executable, 'STARlong', '--STARlong', params.logfile)
    #  If Indexing is required, does twoBitToFa point to a valid file?
    if params.index_location is None:
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)
    #  Obtain read length from the second non-empty line of the file (the
    #  sequence line of the first fastq record).
    with open(''.join([params.file_path, '/', params.file_prefix, '_1.fastq']),
              'r') as fastq_1_file:
        line_num = 0
        for line in fastq_1_file:
            line = line.strip()
            if len(line) == 0:
                continue
            line_num += 1
            if line_num == 2:
                read_length = len(line)
                break
    #  Set up the executable based on the read length (STARlong is meant for
    #  reads longer than 300bp).
    if int(read_length) > 300:
        star_executable = params.STARlong_executable
    else:
        star_executable = params.STAR_executable
    # Check for indexes. If the user has specified that indexes need to
    # be created then do so.
    if params.index_location is None:
        index_path = star_indexing(star_executable, read_length, params)
    else:
        params.index_location = os.path.abspath(params.index_location)
        if not os.path.exists(''.join([params.index_location, '/SA'])):
            raise pi_errors.InputFileError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': Index file not found', params.logfile)
        else:
            index_path = params.index_location
    return star_executable, index_path
def process_parameters(params):
    '''
    Conduct error handling for all parameters passed to the program.

    Validates the STAR/STARlong (and, when indexing is needed, twoBitToFa)
    executables, determines the read length from the input fastq, picks the
    STAR binary appropriate for that read length, and resolves or builds the
    STAR index.

    RETURN VALUES
    star_executable - path to the STAR binary chosen for the read length
    index_path - path to the directory holding the STAR indexes
    '''
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') +
          ': Processing input parameters.', file=params.logfile)
    # Do the STAR executables point to valid files?
    params.STAR_executable = pi_errors.test_param_value(
        params.STAR_executable, 'STAR', '--STAR', params.logfile)
    # BUGFIX: the validated STARlong path was previously stored back into
    # params.STAR_executable, clobbering the STAR path validated above and
    # leaving STARlong_executable unvalidated.
    params.STARlong_executable = pi_errors.test_param_value(
        params.STARlong_executable, 'STARlong', '--STARlong', params.logfile)
    #  If Indexing is required, does twoBitToFa point to a valid file?
    if params.index_location is None:
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)
    #  Obtain read length from the second non-empty line of the file (the
    #  sequence line of the first fastq record).
    with open(''.join([params.file_path, '/', params.file_prefix,
                       '_1.fastq']), 'r') as fastq_1_file:
        line_num = 0
        for line in fastq_1_file:
            line = line.strip()
            if len(line) == 0:
                continue
            line_num += 1
            if line_num == 2:
                read_length = len(line)
                break
    #  Set up the executable based on the read length (STARlong is meant for
    #  reads longer than 300bp).
    if int(read_length) > 300:
        star_executable = params.STARlong_executable
    else:
        star_executable = params.STAR_executable
    # Check for indexes. If the user has specified that indexes need to
    # be created then do so.
    if params.index_location is None:
        index_path = star_indexing(star_executable, read_length, params)
    else:
        params.index_location = os.path.abspath(params.index_location)
        if not os.path.exists(''.join([params.index_location, '/SA'])):
            raise pi_errors.InputFileError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': Index file not found', params.logfile)
        else:
            index_path = params.index_location
    return star_executable, index_path
Example #3
0
def rsem_index(rsem_index_executable, fasta_input, bowtie_info, params):
    '''
    This module will create the rsem indexes at params.index_destination using
    RSEM_INDEX_EXECUTABLE. If FASTA_INPUT = True, it will use the bowtie version
    to make bowtie indexes as well.
    bowtie_info is a tuple of (bowtie_path, bowtie_version)
    params contains
    index_destination - Folder to store the indexes
    n - number of cores to use
    genome_fasta - path to genomic fasta file. Can also specify DOWNLOAD.
    genome_version - hg19/hg38
    logfile - Open file handle to a log file
    RETURN VALUES
    index_path - Path to directory where indexes were stored
    '''
    # BUGFIX: bowtie_path and bowtie_version were referenced below without
    # ever being unpacked from bowtie_info, raising NameError whenever
    # fasta_input was True.
    bowtie_path, bowtie_version = bowtie_info
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Creating rsem references...',
          file=params.logfile)
    index_path = os.path.abspath(params.index_destination)
    #  If the directory doesn't exist, create it
    if not os.path.exists(index_path):
        prepare.py_mkdir(index_path)
    if params.genome_fasta == 'DOWNLOAD':
        params.genome_fasta = prepare.get_genome(params.genome_version,
                                                 index_path,
                                                 params.tbtf_executable,
                                                 params.logfile)
    else:
        params.genome_fasta = pi_errors.test_param_value(
            params.genome_fasta, 'Genomic Fasta', '--genome_fasta',
            params.logfile)
    #  If the gtf file is required, download it
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ':    ' +
          'Running rsem-prepare-reference on fasta reference.',
          file=params.logfile)
    rsem_prepref_call = [rsem_index_executable]  # base call
    rsem_prepref_call.extend(['--gtf', gencode_file])  # gtf file
    if fasta_input:
        # e.g. --bowtie2 --bowtie2-path /path/to/bowtie2
        rsem_prepref_call.extend([
            ''.join(['--', bowtie_version]),
            ''.join(['--', bowtie_version, '-path']), bowtie_path
        ])
    else:
        rsem_prepref_call.append('--no-bowtie')
    rsem_prepref_call.append(params.genome_fasta)
    rsem_prepref_call.extend(
        [''.join([index_path, '/', params.genome_version])])
    print(rsem_prepref_call, file=params.logfile)
    return_value = call(rsem_prepref_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': Indexing Failed', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Indexing completed.',
          file=params.logfile)
    return index_path
def rsem_index(rsem_index_executable, fasta_input, bowtie_info, params):
    '''
    This module will create the rsem indexes at params.index_destination using
    RSEM_INDEX_EXECUTABLE. If FASTA_INPUT = True, it will use the bowtie version
    to make bowtie indexes as well.
    bowtie_info is a tuple of (bowtie_path, bowtie_version)
    params contains
    index_destination - Folder to store the indexes
    n - number of cores to use
    genome_fasta - path to genomic fasta file. Can also specify DOWNLOAD.
    genome_version - hg19/hg38
    logfile - Open file handle to a log file
    RETURN VALUES
    index_path - Path to directory where indexes were stored
    '''
    # BUGFIX: bowtie_path and bowtie_version were referenced below without
    # ever being unpacked from bowtie_info, raising NameError whenever
    # fasta_input was True.
    bowtie_path, bowtie_version = bowtie_info
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Creating rsem references...', file=params.logfile)
    index_path = os.path.abspath(params.index_destination)
    #  If the directory doesn't exist, create it
    if not os.path.exists(index_path):
        prepare.py_mkdir(index_path)
    if params.genome_fasta == 'DOWNLOAD':
        params.genome_fasta = prepare.get_genome(params.genome_version,
                                                 index_path,
                                                 params.tbtf_executable,
                                                 params.logfile)
    else:
        params.genome_fasta = pi_errors.test_param_value(params.genome_fasta,
                                                         'Genomic Fasta',
                                                         '--genome_fasta',
                                                         params.logfile)
    #  If the gtf file is required, download it
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ':    ' +
          'Running rsem-prepare-reference on fasta reference.',
          file=params.logfile)
    rsem_prepref_call = [rsem_index_executable] # base call
    rsem_prepref_call.extend(['--gtf', gencode_file]) # gtf file
    if fasta_input:
        # e.g. --bowtie2 --bowtie2-path /path/to/bowtie2
        rsem_prepref_call.extend([''.join(['--', bowtie_version]),
                                  ''.join(['--', bowtie_version, '-path']),
                                  bowtie_path])
    else:
        rsem_prepref_call.append('--no-bowtie')
    rsem_prepref_call.append(params.genome_fasta)
    rsem_prepref_call.extend([''.join([index_path, '/',
                                       params.genome_version])])
    print(rsem_prepref_call, file=params.logfile)
    return_value = call(rsem_prepref_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': Indexing Failed', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Indexing completed.', file=params.logfile)
    return index_path
def main():
    """
    This wrapper script will run the tool cutadapt within the  cutadapt docker
    container for the precision immuno project. The wrapper requires
    1. cutadapt
    2. GNU sed (Tested on version 4.2.1)

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.
    """
    #  Parse the command line (the docstring above doubles as the argparse
    #  description).
    params = prepare.parse_args(main.__doc__, 'cutadapt', 'adapter_fixed')

    # params ERROR handling
    params.cutadapt_executable = pi_errors.test_param_value(
        params.cutadapt_executable, 'cutadapt', '--cutadapt', params.logfile)
    #  Both adapters may contain only IUPAC bases A/C/T/G plus N.
    adapters_are_valid = (set(params.fwd_3pr_adapter).issubset(set("ACTGN"))
                          and
                          set(params.rev_3pr_adapter).issubset(set("ACTGN")))
    if not adapters_are_valid:
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': Adapter sequences can only contain A, C, T, G, and N.',
            params.logfile)

    #  Move to working directory before doing I/O intensive work
    os.chdir(params.working_dir)

    #  Remove adapter contamination
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + \
          ': Trimming adapters using cutadapt.', file=params.logfile)
    in_fastq_1 = ''.join([params.file_path, '/', params.file_prefix,
                          '_1.fastq'])
    in_fastq_2 = ''.join([params.file_path, '/', params.file_prefix,
                          '_2.fastq'])
    cutadapt_call = [
        params.cutadapt_executable,                            # base call
        '-a', params.fwd_3pr_adapter,                          # Fwd 3' adapter
        '-A', params.rev_3pr_adapter,                          # Rev 3' adapter
        '-m', '35',                                            # Min read size
        '-o', ''.join([params.file_prefix, '_cutadapt_1.fastq']),
        '-p', ''.join([params.file_prefix, '_cutadapt_2.fastq']),
        in_fastq_1,
        in_fastq_2,
    ]
    print(' '.join(cutadapt_call), file=params.logfile)
    if call(cutadapt_call) != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': cutadapt failed', params.logfile)
    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': Process ' +
          'completed', file=params.logfile)
    params.logfile.close()
def process_parameters(params):
    '''
    This module conducts the error handling for all parameters passed to the
    program (MHC binding prediction).

    NOTE(review): this function appears truncated upstream — only the
    single-peplen/.faa case of the --file_prefix validation is visible here.
    '''
    #  Is mhc_executable set up correctly?
    if os.path.split(params.mhc_executable)[1] == 'predict_binding.py':
        params.mhc_executable = pi_errors.test_param_value(
            params.mhc_executable, 'predict_binding.py', '--mhc_predictor',
            params.logfile)
    else:
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': --mhc_predictor has to be predict_binding.py', params.logfile)
    #  List of acceptable prediction methods.  Used to ensure the correct
    #  prediction method has been provided.
    prediction_methods = set(["ann", "comblib_sidney2008", "consensus",
                              "IEDB_recommended", "netmhcpan", "smm",
                              "smmpmbec"])
    if params.pred_meth not in prediction_methods:
        #  BUGFIX: joining the set with '' produced an unreadable
        #  run-together list of method names in the error message.
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': --prediction_method has to be one of ' + \
            ', '.join(prediction_methods), params.logfile)
    #  Test the value of peplen. For MHCI it can only be 9, 10 or 11.
    if params.peplen == "mhc dependent":
        params.peplen = [9, 10]
    elif set([int(x) for x in params.peplen]).difference(set([9, 10, 11])):
        #  This is true iff params.peplen contains an item not in [9, 10, 11].
        #  The values are stored in params as strings and need to be converted
        #  to ints hence the list comprehension.
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': --peplen must be provided space separated values in [9, 10, 11]',
            params.logfile)
    else:
        #  This means the user has passed acceptable values to --peplen
        params.peplen = [int(x) for x in params.peplen]
    #  Once we have the peptide lengths, we need to ensure the --file_prefix has
    #  been set up right
    peplen_filenames = Counter()  # maps peptide length -> validated input file
    if len(params.peplen) == 1 and params.file_prefix.endswith('.faa'):
        #  There is only 1 peplen and the full filename has been provided.
        #  BUGFIX: '/'.join was called with two positional arguments and a
        #  stray ']' (a SyntaxError); it takes a single iterable argument.
        peplen_filenames[params.peplen[0]] = pi_errors.test_param_value(
            '/'.join([params.file_path, params.file_prefix]), 'Input file',
            '--file_prefix', params.logfile)
def main():
    '''
    This wrapper script will run the entire alignment pipeline for genomic DNA
    (WGS or WXS) from alignment of fastqs, to sorting, indexing, and Read Group
    incorporation. The wrapper can even download and produce bwa references if
    required. The wrapper requires
    1. bwa (For aligning reads)
    2. java (For picard)
    3. picard tools (For read groups)
    4. samtools (For sam/bam manipulation)
    5. twoBitToFa from the kent tools library (For extracting the reference
            genome in case indexing is required)

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.
    '''
    #  Parse the arguments using prepare.parse_args()
    params = prepare.parse_args(main.__doc__, 'bwa', 'bwa_alignment')

    #  Params ERROR handling
    #  The memory option for java should be of the form Xmx10G or Xmx10M
    if not (params.java_Xmx.endswith('G') or params.java_Xmx.endswith('M')):
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': Please use a suitable value for --Xmx.', params.logfile)
    params.bwa_executable = pi_errors.test_param_value(params.bwa_executable,
                                                       'bwa',
                                                       '--bwa',
                                                       params.logfile)
    params.samtools_executable = pi_errors.test_param_value(
        params.samtools_executable, 'samtools', '--samtools', params.logfile)
    params.java_executable = pi_errors.test_param_value(params.java_executable,
                                                        'java',
                                                        '--java',
                                                        params.logfile)
    #  If Indexing is required, does twoBitToFa point to a valid file?
    if params.index_location is None:
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)
    if not params.picard_jar.endswith('jar'):
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': Please specify a valid jar file for picard!', params.logfile)
    else:
        params.picard_jar = pi_errors.test_param_value(params.picard_jar,
                                                       'picard',
                                                       '--picard_jar',
                                                       params.logfile)

    if params.RGID is None:
        params.RGID = params.file_prefix

    # Check for indexes. If the user has specified that indexes need to
    # be created then do so.
    if params.index_location is None:
        print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
              'Indexing fasta...', file=params.logfile)
        if not os.path.exists(params.index_destination):
            prepare.py_mkdir(params.index_destination)
        index_path = params.index_destination
        # BUGFIX: this previously read params.twoBitToFa_executable, but the
        # attribute validated above (and used elsewhere in this file) is
        # params.tbtf_executable.
        genome_fasta = prepare.get_genome(params.genome_version, index_path,
                                          params.tbtf_executable,
                                          params.logfile)
        print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ':    ' +
              'Running BWA index on fasta reference.', file=params.logfile)
        return_value = call([params.bwa_executable, 'index', genome_fasta])
        if return_value != 0:
            raise pi_errors.MyRuntimeError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': bwa index failed.', params.logfile)
        print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ':    ' +
              'Running samtools faidx.', file=params.logfile)
        return_value = call([params.samtools_executable, 'faidx', genome_fasta])
        if return_value != 0:
            raise pi_errors.MyRuntimeError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': samtools faidx failed', params.logfile)
        index_prefix = genome_fasta
        print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
              'Indexing completed.', file=params.logfile)
    else:
        if params.index_location.endswith('.fa'):
            assert os.path.exists(params.index_location), 'Index file not found'
            index_prefix = params.index_location
        else:
            fastas = [x for x in os.listdir(params.index_location) if
                      x.endswith(".fa")]
            if len(fastas) == 1:
                index_prefix = "".join([params.index_location, '/', fastas[0]])
            elif len(fastas) == 0:
                raise pi_errors.InputFileError(
                    dt.now().strftime('%I:%M %p %b %d, %Y') + \
                    ': No valid fasta found in provided index folder',
                    params.logfile)
            else:
                raise pi_errors.InputFileError(
                    dt.now().strftime('%I:%M %p %b %d, %Y') + \
                    ':Multiple fastas found in provided index folder. Try ' + \
                    'running with --index_location /path/to/file/filename.fa',
                    params.logfile)

    # Move to working directory before doing I/O intensive alignment
    os.chdir(params.working_dir)

    # Align reads to sam file
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': Aligning' +
          ' reads to reference.', file=params.logfile)
    bwa_call = [params.bwa_executable, 'mem'] # base call
    bwa_call.extend(['-t', str(params.n)])  # Number of threads
    bwa_call.append(index_prefix)  # bwa index
    bwa_call.append(''.join([params.file_path, '/', params.file_prefix,
                             '_1.fastq']))
    bwa_call.append(''.join([params.file_path, '/', params.file_prefix,
                             '_2.fastq']))
    print(' '.join(bwa_call), file=params.logfile)
    with open(''.join([params.file_prefix, '.sam']), 'w') as samfile, \
            open(''.join([params.file_prefix, '_bwa_log.txt']), 'w') as logfile:
        return_value = call(bwa_call, stdout=samfile, stderr=logfile)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': bwa mem failed.', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Alignment completed. Converting to bam', file=params.logfile)
    # Convert the sam to a bam file
    with open(''.join([params.file_prefix, '.bam']), 'w') as bamfile:
        call([params.samtools_executable, 'view', '-bS',
              ''.join([params.file_prefix, '.sam'])], stdout=bamfile)
    call(['rm', ''.join([params.file_prefix, '.sam'])])
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': bam file' +
          ' created. Preparing file for inserting RG into header.',
          file=params.logfile)
    # Fix PG line
    sam_header = check_output([params.samtools_executable, 'view', '-H',
                               ''.join([params.file_prefix, '.bam'])])
    sam_header = sam_header.strip().split('\n')  # Strip whitespace and separate
    pg_line = sam_header[-1].split('\t')  # Grab @PG line + split by tab
    # Then remove the CL field form the PG line
    sam_header[-1] = '\t'.join([x for x in pg_line if not x.startswith('CL')])
    with open(''.join([params.file_prefix, '_sam.header']), 'w') as hdr_file:
        print('\n'.join(sam_header), file=hdr_file)
    with open(''.join([params.file_prefix, '_fixPG.bam']), 'w') as \
              fixpg_bamfile:
        return_value = call([params.samtools_executable, 'reheader',
                             ''.join([params.file_prefix, '_sam.header']),
                             ''.join([params.file_prefix, '.bam'])],
                            stdout=fixpg_bamfile)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': samtools reheader failed', params.logfile)
    call(['rm', ''.join([params.file_prefix, '.bam']),
          ''.join([params.file_prefix, '_sam.header'])])
    # Sort and Index the _fixPG.bam file
    return_value = call([params.samtools_executable, 'sort',
                         ''.join([params.file_prefix, '_fixPG.bam']),
                         ''.join([params.file_prefix, '_fixPG_sorted'])])
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': samtools sort failed.', params.logfile)
    return_value = call([params.samtools_executable, 'index',
                         ''.join([params.file_prefix, '_fixPG_sorted.bam'])])
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': samtools index failed.', params.logfile)
    call(['rm', ''.join([params.file_prefix, '_fixPG.bam'])])
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Inserting @RG tag into header.', file=params.logfile)
    # Reheader the indexed _fixPG_sorted.bam to prepare for mutect
    picard_call = [params.java_executable, ''.join(['-Xmx', params.java_Xmx]),
                   '-jar'] #  Base java call
    picard_call.append(params.picard_jar)  # picard
    picard_call.append('AddOrReplaceReadGroups')  # module
    picard_call.append('CREATE_INDEX=true')
    picard_call.append(''.join(['I=', params.file_prefix, '_fixPG_sorted.bam']))
    picard_call.append(''.join(['O=', params.file_prefix,
                                '_fixPG_sorted_reheader.bam']))
    picard_call.append('SO=coordinate')
    picard_call.append('ID=1')
    picard_call.append(''.join(['LB=', params.file_prefix]))
    picard_call.append('PL=ILLUMINA')
    picard_call.append('PU=12345')
    picard_call.append(''.join(['SM=', params.sample_type]))
    with open(''.join([params.file_prefix, '_picard_log.txt']), 'w') as logfile:
        return_value = call(picard_call, stdout=logfile)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': picard AddOrReplaceReadGroups failed.', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': @RG ' +
          'inserted. Indexing bam', file=params.logfile)
    # Index _fixPG_sorted_reheader.bam file
    return_value = call([params.samtools_executable, 'index',
                         ''.join([params.file_prefix,
                                  '_fixPG_sorted_reheader.bam'])])
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': samtools index failed.', params.logfile)
    # Remove intermediate files
    call(['rm', ''.join([params.file_prefix, '_fixPG_sorted.bam']),
          ''.join([params.file_prefix, '_fixPG_sorted.bam.bai'])])

    # BUGFIX: this print was missing the file= keyword, so the logfile handle
    # itself was printed to stdout instead of the message going to the log.
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Alignment completed. Finishing up...', file=params.logfile)
    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': Process ' +
          'completed', file=params.logfile)
    params.logfile.close()
def process_parameters(params):
    '''
    This module conducts the error handling for all parameters passed to the
    program (radia variant calling).

    RETURN VALUES
    database_map - dict mapping radia database names to their resolved paths
    '''
    #  Does the provided radia binary provided exist?
    params.radia_executable = pi_errors.test_param_value(
        params.radia_executable, 'radia', '--radia', params.logfile)
    #  Setup filterRadia.py (lives next to the radia executable)
    params.filter_radia_executable = '/'.join([os.path.split(
        params.radia_executable)[0], 'filterRadia.py'])
    params.filter_radia_executable = pi_errors.test_param_value(
        params.filter_radia_executable, 'filterradia', '--radia',
        params.logfile)
    #  Test input files
    params.tum_d_file = pi_errors.test_param_value(params.tum_d_file,
                                                   'Tumor DNA',
                                                   '--tum_dna_file',
                                                   params.logfile)
    params.norm_d_file = pi_errors.test_param_value(params.norm_d_file,
                                                    'Normal DNA',
                                                    '--norm_dna_file',
                                                    params.logfile)
    if params.tum_r_file is not None:
        params.tum_r_file = pi_errors.test_param_value(params.tum_r_file,
                                                       'Tumor RNA',
                                                       '--tum_rna_file',
                                                       params.logfile)

    #  If you don't have a reference, you need twoBitToFasta
    if params.index_location is None:
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)
    #  Are dbsnp or cosmic vcf required?
    if params.dbsnp_file == 'DOWNLOAD' or params.cosmic_file == 'DOWNLOAD' or \
            params.genome_fasta == 'DOWNLOAD':
        # Ensure the vcf storage location has been provided
        if params.vcf_location is None:
            raise pi_errors.ParameterError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': --vcf_location cannot be empty if either --cosmic, ' + \
                '--dbsnp, or --genome_fasta are empty.', params.logfile)
        else:
            params.vcf_location = os.path.abspath(params.vcf_location)
        # Download dbsnp file if required
        if params.dbsnp_file == 'DOWNLOAD':
            if os.path.exists('/'.join([params.vcf_location, '00-All.vcf'])):
                params.dbsnp_file = '/'.join([params.vcf_location,
                                              '00-All.vcf'])
            else:
                params.dbsnp_file = prepare.download_vcf('dbsnp', params)
        # Download cosmic file if required
        if params.cosmic_file == 'DOWNLOAD':
            if os.path.exists('/'.join([params.vcf_location,
                                        'Cosmic_sorted.vcf'])):
                params.cosmic_file = '/'.join([params.vcf_location,
                                               'Cosmic_sorted.vcf'])
            else:
                params.cosmic_file = prepare.download_vcf('cosmic', params)
        # Download genome fasta if required
        if params.genome_fasta == 'DOWNLOAD' or not \
                os.path.exists(params.genome_fasta):
            if os.path.exists(''.join([params.vcf_location, '/',
                                       params.genome_version, '.fa'])):
                params.genome_fasta = ''.join([params.vcf_location, '/',
                                               params.genome_version, '.fa'])
            else:
                # BUGFIX: this previously read params.twoBitToFa_executable,
                # but the attribute validated above is params.tbtf_executable.
                # NOTE(review): logging goes to sys.stderr here, not
                # params.logfile as everywhere else — confirm this is intended.
                params.genome_fasta = prepare.get_genome(
                    params.genome_version, params.vcf_location,
                    params.tbtf_executable, sys.stderr)
        else:
            params.genome_fasta = os.path.abspath(params.genome_fasta)
    #  Set up the value for rna_fasta
    if params.rna_fasta == 'GENOME_FASTA':
        params.rna_fasta = params.genome_fasta
    else:
        params.rna_fasta = pi_errors.test_param_value(params.rna_fasta,
                                                      'RNA Fasta',
                                                      '--rna_fasta',
                                                      params.logfile)
    #  Ensure the other databases are set up correctly
    #  The package path is 2 levels above the executable
    radia_pkg_path = os.path.split(os.path.split(params.radia_executable)[0])[0]
    database_map = defaultdict()
    test_database(params.blacklist, 'blacklist', radia_pkg_path,
                  params.vcf_location, 'data/hg19/blacklists/1000Genomes/' + \
                  'phase1/', database_map)
    test_database(params.retrogenes, 'retrogenes', radia_pkg_path,
                  params.vcf_location, 'data/hg19/retroGenes/', database_map)
    # NOTE(review): 'peudoGenes' looks like a typo for 'pseudoGenes', but it
    # is a runtime path — confirm against the radia data layout before fixing.
    test_database(params.pseudogenes, 'pseudogenes', radia_pkg_path,
                  params.vcf_location, 'data/hg19/peudoGenes/', database_map)
    test_database(params.broad_targets, 'broad_targets', radia_pkg_path,
                  params.vcf_location, 'data/hg19/broadTargets/',
                  database_map)
    test_database(params.rna_blacklist, 'rna_blacklist', radia_pkg_path,
                  params.vcf_location, 'data/rnaGeneBlacklist.tab',
                  database_map)
    test_database(params.rna_family_blacklist, 'rna_family_blacklist',
                  radia_pkg_path, params.vcf_location,
                  'data/rnaGeneFamilyBlacklist.tab', database_map)
    #  If any of the above were returned as 'DOWNLOAD' then download the radia
    #  data folder to a temp directory from git and set the values for the
    #  invalid ones.
    if len([db for db, val in database_map.items() if val == 'DOWNLOAD']) > 0:
        download_databases(database_map, params.logfile)
    # if the -C all option was specified, expand params.chromosome
    if params.chromosome == 'all':
        # BUGFIX: range(1, 23) + ['X', 'Y'] raises TypeError on Python 3,
        # where range() no longer returns a list.
        params.chromosome = [''.join(['chr', str(i)]) for i in
                             list(range(1, 23)) + ['X', 'Y']]
    return database_map
Example #9
0
def main():
    '''
    This wrapper script will run the entire alignment pipeline for genomic DNA
    (WGS or WXS) from alignment of fastqs, to sorting, indexing, and Read Group
    incorporation. The wrapper can even download and produce bwa references if
    required. The wrapper requires
    1. bwa (For aligning reads)
    2. java (For picard)
    3. picard tools (For read groups)
    4. samtools (For sam/bam manipulation)
    5. twoBitToFa from the kent tools library (For extracting the reference
            genome in case indexing is required)

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.
    '''
    #  Parse the arguments using prepare.parse_args()
    params = prepare.parse_args(main.__doc__, 'bwa', 'bwa_alignment')

    #  Params ERROR handling
    #  The memory option for java should be of the form Xmx10G or Xmx10M
    if not (params.java_Xmx.endswith('G') or params.java_Xmx.endswith('M')):
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': Please use a suitable value for --Xmx.', params.logfile)
    #  Do the required executables point to valid files?
    params.bwa_executable = pi_errors.test_param_value(params.bwa_executable,
                                                       'bwa', '--bwa',
                                                       params.logfile)
    params.samtools_executable = pi_errors.test_param_value(
        params.samtools_executable, 'samtools', '--samtools', params.logfile)
    params.java_executable = pi_errors.test_param_value(
        params.java_executable, 'java', '--java', params.logfile)
    #  If Indexing is required, does twoBitToFa point to a valid file?
    if params.index_location is None:
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)
    if not params.picard_jar.endswith('jar'):
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': Please specify a valid jar file for picard!', params.logfile)
    else:
        params.picard_jar = pi_errors.test_param_value(params.picard_jar,
                                                       'picard',
                                                       '--picard_jar',
                                                       params.logfile)

    #  Default the Read Group ID to the file prefix if none was given
    if params.RGID is None:
        params.RGID = params.file_prefix

    #read_group = ''.join(['\'@RG\\tID:', params.RGID, '\\tPL:ILLUMINA\\tSM:',
    #                      params.sample_type, '\''])
    # Check for indexes. If the user has specified that indexes need to
    # be created then do so.
    if params.index_location is None:
        print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
              'Indexing fasta...',
              file=params.logfile)
        if not os.path.exists(params.index_destination):
            prepare.py_mkdir(params.index_destination)
        index_path = params.index_destination
        #  Use the twoBitToFa executable validated above.  (This previously
        #  read params.twoBitToFa_executable, which is never set on params —
        #  the validated attribute is params.tbtf_executable.)
        genome_fasta = prepare.get_genome(params.genome_version, index_path,
                                          params.tbtf_executable,
                                          params.logfile)
        print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ':    ' +
              'Running BWA index on fasta reference.',
              file=params.logfile)
        return_value = call([params.bwa_executable, 'index', genome_fasta])
        if return_value != 0:
            raise pi_errors.MyRuntimeError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': bwa index failed.', params.logfile)
        print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ':    ' +
              'Running samtools faidx.',
              file=params.logfile)
        return_value = call(
            [params.samtools_executable, 'faidx', genome_fasta])
        if return_value != 0:
            raise pi_errors.MyRuntimeError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': samtools faidx failed', params.logfile)
        index_prefix = genome_fasta
        print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
              'Indexing completed.',
              file=params.logfile)
    else:
        #  An existing index was provided: either a .fa file directly, or a
        #  directory expected to contain exactly one .fa file.
        if params.index_location.endswith('.fa'):
            assert os.path.exists(
                params.index_location), 'Index file not found'
            index_prefix = params.index_location
        else:
            fastas = [
                x for x in os.listdir(params.index_location)
                if x.endswith(".fa")
            ]
            if len(fastas) == 1:
                index_prefix = "".join([params.index_location, '/', fastas[0]])
            elif len(fastas) == 0:
                raise pi_errors.InputFileError(
                    dt.now().strftime('%I:%M %p %b %d, %Y') + \
                    ': No valid fasta found in provided index folder',
                    params.logfile)
            else:
                raise pi_errors.InputFileError(
                    dt.now().strftime('%I:%M %p %b %d, %Y') + \
                    ':Multiple fastas found in provided index folder. Try ' + \
                    'running with --index_location /path/to/file/filename.fa',
                    params.logfile)

    # Move to working directory before doing I/O intensive alignment
    os.chdir(params.working_dir)

    # Align reads to sam file
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') +
          ': Aligning' + ' reads to reference.',
          file=params.logfile)
    bwa_call = [params.bwa_executable, 'mem']  # base call
    bwa_call.extend(['-t', str(params.n)])  # Number of threads
    #bwa_call.extend(['-R', read_group])  # Read group
    bwa_call.append(index_prefix)  # bwa index
    bwa_call.append(''.join(
        [params.file_path, '/', params.file_prefix, '_1.fastq']))
    bwa_call.append(''.join(
        [params.file_path, '/', params.file_prefix, '_2.fastq']))
    print(' '.join(bwa_call), file=params.logfile)
    with open(''.join([params.file_prefix, '.sam']), 'w') as samfile, \
            open(''.join([params.file_prefix, '_bwa_log.txt']), 'w') as logfile:
        return_value = call(bwa_call, stdout=samfile, stderr=logfile)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': bwa mem failed.', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Alignment completed. Converting to bam',
          file=params.logfile)
    # Convert the sam to a bam file.  Check the return value so a failed
    # conversion is not silently ignored — every other samtools call in this
    # function is checked the same way.
    with open(''.join([params.file_prefix, '.bam']), 'w') as bamfile:
        return_value = call([
            params.samtools_executable, 'view', '-bS', ''.join(
                [params.file_prefix, '.sam'])
        ],
             stdout=bamfile)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': samtools view failed.', params.logfile)
    call(['rm', ''.join([params.file_prefix, '.sam'])])
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') +
          ': bam file' +
          ' created. Preparing file for inserting RG into header.',
          file=params.logfile)
    # Fix PG line
    # NOTE(review): check_output returns bytes on python3; the str methods
    # below assume the python2 str output this codebase was written for.
    sam_header = check_output([
        params.samtools_executable, 'view', '-H',
        ''.join([params.file_prefix, '.bam'])
    ])
    sam_header = sam_header.strip().split(
        '\n')  # Strip whitespace and separate
    pg_line = sam_header[-1].split('\t')  # Grab @PG line + split by tab
    # Then remove the CL field form the PG line
    sam_header[-1] = '\t'.join([x for x in pg_line if not x.startswith('CL')])
    with open(''.join([params.file_prefix, '_sam.header']), 'w') as hdr_file:
        print('\n'.join(sam_header), file=hdr_file)
    with open(''.join([params.file_prefix, '_fixPG.bam']), 'w') as \
              fixpg_bamfile:
        return_value = call([
            params.samtools_executable, 'reheader', ''.join([
                params.file_prefix, '_sam.header'
            ]), ''.join([params.file_prefix, '.bam'])
        ],
                            stdout=fixpg_bamfile)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': samtools reheader failed', params.logfile)
    call([
        'rm', ''.join([params.file_prefix, '.bam']),
        ''.join([params.file_prefix, '_sam.header'])
    ])
    # Sort and Index the _fixPG.bam file
    return_value = call([
        params.samtools_executable, 'sort',
        ''.join([params.file_prefix,
                 '_fixPG.bam']), ''.join([params.file_prefix, '_fixPG_sorted'])
    ])
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': samtools sort failed.', params.logfile)
    return_value = call([
        params.samtools_executable, 'index',
        ''.join([params.file_prefix, '_fixPG_sorted.bam'])
    ])
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': samtools index failed.', params.logfile)
    call(['rm', ''.join([params.file_prefix, '_fixPG.bam'])])
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Inserting @RG tag into header.',
          file=params.logfile)
    # Reheader the indexed _fixPG_sorted.bam to prepare for mutect
    picard_call = [
        params.java_executable, ''.join(['-Xmx', params.java_Xmx]), '-jar'
    ]  #  Base java call
    picard_call.append(params.picard_jar)  # picard
    picard_call.append('AddOrReplaceReadGroups')  # module
    picard_call.append('CREATE_INDEX=true')
    picard_call.append(''.join(['I=', params.file_prefix,
                                '_fixPG_sorted.bam']))
    picard_call.append(''.join(
        ['O=', params.file_prefix, '_fixPG_sorted_reheader.bam']))
    picard_call.append('SO=coordinate')
    picard_call.append('ID=1')
    picard_call.append(''.join(['LB=', params.file_prefix]))
    picard_call.append('PL=ILLUMINA')
    picard_call.append('PU=12345')
    picard_call.append(''.join(['SM=', params.sample_type]))
    with open(''.join([params.file_prefix, '_picard_log.txt']),
              'w') as logfile:
        return_value = call(picard_call, stdout=logfile)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': picard AddOrReplaceReadGroups failed.', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': @RG ' +
          'inserted. Indexing bam',
          file=params.logfile)
    # Index _fixPG_sorted_reheader.bam file
    return_value = call([
        params.samtools_executable, 'index',
        ''.join([params.file_prefix, '_fixPG_sorted_reheader.bam'])
    ])
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': samtools index failed.', params.logfile)
    # Remove intermediate files
    call([
        'rm', ''.join([params.file_prefix, '_fixPG_sorted.bam']),
        ''.join([params.file_prefix, '_fixPG_sorted.bam.bai'])
    ])

    #  This print was missing the file= keyword and wrote the logfile object
    #  to stdout instead of writing the message to the log.
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Alignment completed. Finishing up...',
          file=params.logfile)
    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': Process ' +
          'completed',
          file=params.logfile)
    params.logfile.close()
 else:
     #  This means the user has passed acceptable values to --peplen
     params.peplen = [int(x) for x in params.peplen]
 #  Once we have the peptide lengths, we need to ensure the --file_prefix has
 #  been set up right
 peplen_filenames = Counter()  #  The 
 if len(params.peplen) == 1 and params.file_prefix.endswith('.faa'):
     #  There is only 1 peplen and the full filename has been provided
     peplen_filenames[params.peplen[0]] = pi_errors.test_param_value(
         '/'.join(params.file_path, params.file_prefix]), 'Input file',
         '--file_prefix', params.logfile)
 elif not params.file_prefix.endswith('.faa')):
     #  1 or more peptide lengths provided and file_prefix is a prefix and
     #  not a path to a .faa file
     for peplen in params.peplen:
         peplen_filenames[params.peplen[0]] = pi_errors.test_param_value(
             ''.join([params.file_path, '/', params.file_prefix, '_',
                      str(params.peplen), '_mer_snpeffed.faa']),
             'Input file', '--file_prefix', params.logfile)
 else:
     #  More that 1 peptide provided and file_prefix points to a single file
     raise pi_errors.ParameterError(
         dt.now().strftime('%I:%M %p %b %d, %Y') + \
         ': If more than 1 peplen is provided, --file_prefix must be a ' + \
         'prefix to the individual .faa files and not the path to a ' + \
         'single .faa file itself.', params.logfile)
 #  Test the input alleles for validity
 #  First, if an input .allele file is provided, parse it to obtain the list
 #  of alleles
 if length(params.alleles) == 1 and params.alleles[0].endswith(".alleles"):
     #  This means the user has provided a .alleles file
def process_parameters(params):
    '''
    This module conducts the error handling for all parameters passed to the
    program.

    Validates the rsem binaries and input bam/fastq files, probes for bowtie
    and bowtie2, and locates or builds the rsem index.

    :param params: argparse Namespace from prepare.parse_args (has rsem_path,
                   file_path, file_prefix, index_location, logfile, etc.).
    :returns: (bowtie_path, bowtie_version, index_path, index_prefix);
              bowtie_path and bowtie_version are None when the input is a bam.
    :raises pi_errors.ParameterError: on missing inputs or missing bowtie.
    :raises SystemExit: when no .chrlist file is found at the index location.
    '''
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') +
          ': Processing input parameters.', file=params.logfile)
    #  Does the rsem path point to a valid file?
    params.rsem_path = pi_errors.test_param_value(params.rsem_path,
                                                  'rsem binaries',
                                                  '--rsem_path',
                                                  params.logfile)
    #  Does the file prefix point to a bam or a fastq file?
    if not (os.path.exists(''.join([params.file_path, '/', params.file_prefix,
                                    '.bam'])) or
            os.path.exists(''.join([params.file_path, '/', params.file_prefix,
                                    '_1.fastq']))):
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' + \
            'Please check input files. Neither bam nor fastq found at:\n' + \
            params.file_path, params.logfile)
    #  If the input is not a bam file, bowtie will be required
    if not os.path.exists(''.join([params.file_path, '/', params.file_prefix,
                                   '.bam'])):
        fasta_input = True
        bowtie_status = [1, 1]  #  Assume both exist [bt1, bt2]
        #  Try bowtie first then bowtie 2.  If both exist, bowtie_path and
        #  bowtie_version will take the values for bowtie2.  If only one exists,
        #  then they will take the values from the one that exists.  Since
        #  test_param_value closes the logfile, the except cases in both try
        #  instances reopen params.logfile.
        try:
            params.bowtie_executable = pi_errors.test_param_value(
                params.bowtie_executable, 'bowtie', '--bowtie',
                params.logfile)
        except pi_errors.ParameterError:
            bowtie_status[0] = 0  #  bt1 doesn't exist
            if params.logfile.name != '<stderr>':
                #  NOTE(review): buffering=0 (unbuffered) is only valid for
                #  binary-mode files on python3; this reopen assumes python2.
                params.logfile = open(params.logfile.name, params.logfile.mode,
                                      0)
        else:
            #  bowtie found: remember where it lives and its binary name
            bowtie_path, bowtie_version = \
                os.path.split(params.bowtie_executable)
        try:
            params.bowtie2_executable = pi_errors.test_param_value(
                params.bowtie2_executable, 'bowtie2', '--bowtie2',
                params.logfile)
        except pi_errors.ParameterError:
            bowtie_status[1] = 0  #  bt2 doesn't exist
            if params.logfile.name != '<stderr>':
                params.logfile = open(params.logfile.name, params.logfile.mode,
                                      0)
        else:
            #  bowtie2 found: overrides bowtie1's values (preferred version)
            bowtie_path, bowtie_version = \
                os.path.split(params.bowtie2_executable)
        if bowtie_status == [0, 0]:  # This means that both aren't present
            raise pi_errors.ParameterError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': bowtie binaries not found on $PATH. Please specify' + \
                ' one explicitly with --bowtie or --bowtie2.', params.logfile)
    else:
        #  bam input needs no alignment, hence no bowtie
        fasta_input = False
        bowtie_path, bowtie_version = None, None # Set these to None

    # Check for indexes. If the user has specified that indexes need to
    # be created then do so.
    if params.index_location is None:
        #  If you need indexes, you need twoBitToFa
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)
        index_path = rsem_index(''.join([params.rsem_path,
                                         '/rsem-prepare-reference']),
                                         fasta_input,
                                         (bowtie_path, bowtie_version),
                                         params)
    else:
        index_path = os.path.abspath(params.index_location)

    try:
        #  One of the files crucial to the index is GENOME_FASTA_PREFIX.chrlist
        index_prefix = [x.split('.')[0] for x in os.listdir(index_path) if \
                        x.endswith('chrlist')][0]
    except IndexError:
        #  No .chrlist file means the index location is not a valid rsem index
        if params.logfile.name != '<stderr>':
            params.logfile.close()
        raise SystemExit('ERROR ' + dt.now().strftime('%I:%M %p %b %d, %Y') +
                         ': Indexes not found at specified location')
    return bowtie_path, bowtie_version, index_path, index_prefix
def main():
    '''
    This wrapper script will run the tool PHLAT within the phlat docker
    container for the precision immuno project. The wrapper requires:
    1. PHLAT.py
    2. bowtie2
    3. gdown.pl (For donwloading the PHLAT Index - available from
       https://raw.githubusercontent.com/Nanolx/patchimage/master/tools/gdown.pl)

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.
    '''
    # Parse the arguments using prepare.parse_args()
    params = prepare.parse_args(main.__doc__, 'phlat', 'MHC_typing')

    # params ERROR handling
    # --phlat may point to the PHLAT.py script itself or to its directory;
    # normalize to the full script path before validating it.
    if not params.phlat_executable.endswith('PHLAT.py'):
        params.phlat_executable = '/'.join([params.phlat_executable,
                                            'PHLAT.py'])
    params.phlat_executable = pi_errors.test_param_value(
        params.phlat_executable, 'PHLAT', '--phlat', params.logfile)
    params.bowtie2_executable = pi_errors.test_param_value(
        params.bowtie2_executable, 'bowtie2', '--bowtie2', params.logfile)
    # PHLAT's home dir is two levels above the script (passed via -e below)
    phlat_dir = os.path.split(os.path.split(params.phlat_executable)[0])[0]
    params.gdownpl_executable = pi_errors.test_param_value(
        params.gdownpl_executable, 'gdown.pl', '--gdownpl', params.logfile)

    if params.index_location is None:
        # No index provided: download the PHLAT index tarball from Google
        # Drive via gdown.pl and unpack it into --index_destination.
        print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') +': ' +
              'Downloading Indexes...', file=params.logfile)
        params.index_destination = os.path.abspath(params.index_destination)
        if not os.path.exists(params.index_destination):
            prepare.py_mkdir(params.index_destination)
        getindex_call = [params.gdownpl_executable, 'https://drive.google.com' +
                         '/uc?export=download&confirm=yAjx&id=0Bz-w5tutuZIYY3' +
                         'h5YlMzTjhnbGM', ''.join([params.index_destination,
                                                   '/index4phlat.tar.gz'])]
        print(getindex_call, file=params.logfile)
        return_value = call(getindex_call)
        if return_value != 0:
            raise pi_errors.MyRuntimeError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': Could not download indexes. Try manually downloading.',
                params.logfile)
        extract_call = ['tar', '-C', params.index_destination, '-zxvf',
                        '/'.join([params.index_destination,
                                  'index4phlat.tar.gz'])]
        return_value = call(extract_call)
        if return_value != 0:
            raise pi_errors.MyRuntimeError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': Index4phlat could not be extracted.', params.logfile)
        else:
            # Extraction succeeded: the tarball is no longer needed
            call(['rm', '/'.join([params.index_destination,
                                  'index4phlat.tar.gz'])])
        index_path = '/'.join([params.index_destination, 'index4phlat'])
        print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
              'Indexes Downloaded.', file=params.logfile)
    else:
        # Sanity-check the provided index directory via one expected bt2 file
        params.index_location = os.path.abspath(params.index_location)
        if not os.path.exists(''.join([params.index_location,
                                       '/ucsc.artHLA.1.bt2'])):
            raise pi_errors.InputFileError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': Index file not found.', params.logfile)
        else:
            index_path = params.index_location

    # Move to working directory before doing I/O intensive alignment
    os.chdir(params.working_dir)

    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') +':    ' +
          'Begining MHC Haplotyping', file=params.logfile)
    # PHLAT is a python2 tool; run it explicitly under python2.7 -O
    system_call = ['/usr/bin/env', 'python2.7', '-O', params.phlat_executable]
    system_call.extend(['-1', ''.join([params.file_path, '/',
                                       params.file_prefix, '_1.fastq'])]) # Fq1
    system_call.extend(['-2', ''.join([params.file_path, "/",
                                       params.file_prefix, '_2.fastq'])]) # Fq2
    system_call.extend(['-index', index_path]) # Index files
    system_call.extend(['-b2url', params.bowtie2_executable]) # Bowtie2
    system_call.extend(['-tag', ''.join([params.out_prefix])]) # DNA/RNA
    system_call.extend(['-e', phlat_dir]) # Phlat directory home
    system_call.extend(['-o', params.outdir]) # Output directory
    system_call.extend(['-p', str(params.n)]) # Number of threads

    # Call the program
    return_value = call(system_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': MHC Haplotyping failed.', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Alignment completed. Finishing up...', file=params.logfile)
    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': Process ' +
          'completed', file=params.logfile)
    params.logfile.close()
Exemple #13
0
def process_parameters(params):
    '''
    This module conducts the error handling for all parameters passed to the
    program.

    Validates the rsem binaries and input bam/fastq files, probes for bowtie
    and bowtie2, and locates or builds the rsem index.

    :param params: argparse Namespace from prepare.parse_args (has rsem_path,
                   file_path, file_prefix, index_location, logfile, etc.).
    :returns: (bowtie_path, bowtie_version, index_path, index_prefix);
              bowtie_path and bowtie_version are None when the input is a bam.
    :raises pi_errors.ParameterError: on missing inputs or missing bowtie.
    :raises SystemExit: when no .chrlist file is found at the index location.
    '''
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') +
          ': Processing input parameters.',
          file=params.logfile)
    #  Does the rsem path point to a valid file?
    params.rsem_path = pi_errors.test_param_value(params.rsem_path,
                                                  'rsem binaries',
                                                  '--rsem_path',
                                                  params.logfile)
    #  Does the file prefix point to a bam or a fastq file?
    if not (os.path.exists(''.join(
        [params.file_path, '/', params.file_prefix, '.bam']))
            or os.path.exists(''.join(
                [params.file_path, '/', params.file_prefix, '_1.fastq']))):
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' + \
            'Please check input files. Neither bam nor fastq found at:\n' + \
            params.file_path, params.logfile)
    #  If the input is not a bam file, bowtie will be required
    if not os.path.exists(''.join(
        [params.file_path, '/', params.file_prefix, '.bam'])):
        fasta_input = True
        bowtie_status = [1, 1]  #  Assume both exist [bt1, bt2]
        #  Try bowtie first then bowtie 2.  If both exist, bowtie_path and
        #  bowtie_version will take the values for bowtie2.  If only one exists,
        #  then they will take the values from the one that exists.  Since
        #  test_param_value closes the logfile, the except cases in both try
        #  instances reopen params.logfile.
        try:
            params.bowtie_executable = pi_errors.test_param_value(
                params.bowtie_executable, 'bowtie', '--bowtie', params.logfile)
        except pi_errors.ParameterError:
            bowtie_status[0] = 0  #  bt1 doesn't exist
            if params.logfile.name != '<stderr>':
                #  NOTE(review): buffering=0 (unbuffered) is only valid for
                #  binary-mode files on python3; this reopen assumes python2.
                params.logfile = open(params.logfile.name, params.logfile.mode,
                                      0)
        else:
            #  bowtie found: remember where it lives and its binary name
            bowtie_path, bowtie_version = \
                os.path.split(params.bowtie_executable)
        try:
            params.bowtie2_executable = pi_errors.test_param_value(
                params.bowtie2_executable, 'bowtie2', '--bowtie2',
                params.logfile)
        except pi_errors.ParameterError:
            bowtie_status[1] = 0  #  bt2 doesn't exist
            if params.logfile.name != '<stderr>':
                params.logfile = open(params.logfile.name, params.logfile.mode,
                                      0)
        else:
            #  bowtie2 found: overrides bowtie1's values (preferred version)
            bowtie_path, bowtie_version = \
                os.path.split(params.bowtie2_executable)
        if bowtie_status == [0, 0]:  # This means that both aren't present
            raise pi_errors.ParameterError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': bowtie binaries not found on $PATH. Please specify' + \
                ' one explicitly with --bowtie or --bowtie2.', params.logfile)
    else:
        #  bam input needs no alignment, hence no bowtie
        fasta_input = False
        bowtie_path, bowtie_version = None, None  # Set these to None

    # Check for indexes. If the user has specified that indexes need to
    # be created then do so.
    if params.index_location is None:
        #  If you need indexes, you need twoBitToFa
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)
        index_path = rsem_index(
            ''.join([params.rsem_path, '/rsem-prepare-reference']),
            fasta_input, (bowtie_path, bowtie_version), params)
    else:
        index_path = os.path.abspath(params.index_location)

    try:
        #  One of the files crucial to the index is GENOME_FASTA_PREFIX.chrlist
        index_prefix = [x.split('.')[0] for x in os.listdir(index_path) if \
                        x.endswith('chrlist')][0]
    except IndexError:
        #  No .chrlist file means the index location is not a valid rsem index
        if params.logfile.name != '<stderr>':
            params.logfile.close()
        raise SystemExit('ERROR ' + dt.now().strftime('%I:%M %p %b %d, %Y') +
                         ': Indexes not found at specified location')
    return bowtie_path, bowtie_version, index_path, index_prefix
def main():
    """
    This wrapper script will run the tool mutect within the  mutect docker
    container for the precision immuno project. The wrapper requires
    1. mutect
    2. java (For running mutect)
    3. twoBitToFa from the kent tools library (For extracting the reference
            genome in case indexing is required)
    4. lftp for downloading the cosmic vcf

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.
    """
    # Parse the arguments using prepare.parse_args()
    params = prepare.parse_args(main.__doc__, 'mutect', 'mutect_calls')
    # params ERROR handling
    #  The memory option for java should be of the form Xmx10G or Xmx10M
    if not (params.java_Xmx.endswith('G') or params.java_Xmx.endswith('M')):
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': Please use a suitable value for --Xmx.', params.logfile)
    params.java_executable = pi_errors.test_param_value(params.java_executable,
                                                        'java',
                                                        '--java',
                                                        params.logfile)
    params.mutect_jar = pi_errors.test_param_value(params.mutect_jar,
                                                   'Mutect jar',
                                                   '--mutect_jar',
                                                   params.logfile)
    #  If Indexing is required, does twoBitToFa point to a valid file?
    if params.index_location is None:
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)
    #  Do the dnsnp and cosmic vcfs exist?
    if params.dbsnp_file == 'DOWNLOAD' or params.cosmic_file == 'DOWNLOAD':
        #  First ensure the vcf storage location has been provided
        if params.vcf_location is None:
            raise pi_errors.ParameterError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': --vcf_location cannot be empty if either --cosmic, ' + \
                '--dbsnp, or --genome_fasta are empty.', params.logfile)
        else:
            params.vcf_location = os.path.abspath(params.vcf_location)
        # Download dbsnp file if required (reuse a previously downloaded copy)
        if params.dbsnp_file == 'DOWNLOAD':
            if os.path.exists('/'.join([params.vcf_location, '00-All.vcf'])):
                params.dbsnp_file = '/'.join([params.vcf_location,
                                              '00-All.vcf'])
            else:
                params.dbsnp_file = prepare.download_vcf('dbsnp', params)
        # Download cosmic file if required (reuse a previously downloaded copy)
        if params.cosmic_file == 'DOWNLOAD':
            if os.path.exists('/'.join([params.vcf_location,
                                        'Cosmic_sorted.vcf'])):
                params.cosmic_file = '/'.join([params.vcf_location,
                                               'Cosmic_sorted.vcf'])
            else:
                params.cosmic_file = prepare.download_vcf('cosmic', params)
    # Download genome fasta if required
    if params.genome_fasta == 'DOWNLOAD':
        if params.vcf_location is None:
            #  If params.vcf_location is None, set it to the output directory
            params.vcf_location = params.outdir
        #  Does the fasta exist in the vcf_location directory?
        if os.path.exists(''.join([params.vcf_location, '/',
                                   params.genome_version, '.fa'])):
            params.genome_fasta = ''.join([params.vcf_location, '/',
                                           params.genome_version, '.fa'])
        else:
            params.genome_fasta = prepare.get_genome(params.genome_version,
                                                     params.vcf_location,
                                                     params.tbtf_executable,
                                                     params.logfile)
    else:
        params.genome_fasta = pi_errors.test_param_value(params.genome_fasta,
                                                         'Genomic Fasta',
                                                         '--genome_fasta',
                                                         params.logfile)

    # Move to working directory before doing I/O intensive work
    os.chdir(params.working_dir)

    # Call the program
    mutect_call = [params.java_executable, ''.join(['-Xmx', params.java_Xmx]),
                   '-jar'] #  Base java call
    mutect_call.append(params.mutect_jar)
    mutect_call.extend(['-T', 'MuTect'])
    mutect_call.extend(['-R', params.genome_fasta])
    mutect_call.extend(['--cosmic', params.cosmic_file])
    mutect_call.extend(['--dbsnp', params.dbsnp_file])
    mutect_call.extend(['--input_file:normal', params.norm_d_file])
    mutect_call.extend(['--input_file:tumor', params.tum_d_file])
    mutect_call.extend(['--out', ''.join([params.out_prefix, '.out'])])
    return_value = call(mutect_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': MuTect failed.', params.logfile)

    #  Filter the mutect output: keep header lines as-is, comment out the
    #  column-title line, and drop records mutect marked as REJECT.
    with open(''.join([params.out_prefix, '.out']), 'r') as mutect_file, \
            open(''.join([params.out_prefix, 'non_rejected.out']), 'w') as \
            nr_file:
        for line in mutect_file:
            line = line.strip()
            if line.startswith('#'):
                print(line, file=nr_file)
                continue
            if line.startswith('contig'):
                print('#', line, sep='', file=nr_file)
                continue
            line = line.split('\t')
            #  Column 50 is assumed to be mutect's judgement field —
            #  TODO confirm against the mutect version in use.
            if line[50] == 'REJECT':
                continue
            else:
                #  Re-join the split record with tabs.  Printing the list
                #  directly (print(line, sep='\t', ...)) wrote its repr()
                #  — e.g. ['chr1', '123', ...] — corrupting the output file;
                #  sep has no effect on a single argument.
                print('\t'.join(line), file=nr_file)

    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Mutect run completed. Finishing up...', file=params.logfile)
    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': Process ' +
          'completed', file=params.logfile)
    params.logfile.close()
Exemple #15
0
def process_parameters(params):
    '''
    Conduct error handling for all parameters passed to the radia run.

    Validates the radia/filterRadia executables and input BAMs, downloads the
    dbsnp/cosmic VCFs and genome fasta on demand, resolves the radia support
    databases, and expands ``--chromosome all``.

    :param params: argparse-style namespace holding all pipeline options;
        mutated in place (executable paths, file paths, chromosome list).
    :return: dict mapping database name -> resolved location (or 'DOWNLOAD').
    :raises pi_errors.ParameterError: if --vcf_location is needed but absent.
    '''
    #  Does the provided radia binary exist?
    params.radia_executable = pi_errors.test_param_value(
        params.radia_executable, 'radia', '--radia', params.logfile)
    #  filterRadia.py lives in the same directory as the radia executable
    params.filter_radia_executable = '/'.join(
        [os.path.split(params.radia_executable)[0], 'filterRadia.py'])
    params.filter_radia_executable = pi_errors.test_param_value(
        params.filter_radia_executable, 'filterradia', '--radia',
        params.logfile)
    #  Test input files
    params.tum_d_file = pi_errors.test_param_value(params.tum_d_file,
                                                   'Tumor DNA',
                                                   '--tum_dna_file',
                                                   params.logfile)
    params.norm_d_file = pi_errors.test_param_value(params.norm_d_file,
                                                    'Normal DNA',
                                                    '--norm_dna_file',
                                                    params.logfile)
    if params.tum_r_file is not None:
        params.tum_r_file = pi_errors.test_param_value(params.tum_r_file,
                                                       'Tumor RNA',
                                                       '--tum_rna_file',
                                                       params.logfile)

    #  If you don't have a reference, you need twoBitToFa to build one
    if params.index_location is None:
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)
    #  Are dbsnp or cosmic vcfs (or the genome fasta) required?
    if params.dbsnp_file == 'DOWNLOAD' or params.cosmic_file == 'DOWNLOAD' or \
            params.genome_fasta == 'DOWNLOAD':
        # Ensure the vcf storage location has been provided
        if params.vcf_location is None:
            raise pi_errors.ParameterError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': --vcf_location cannot be empty if either --cosmic, ' + \
                '--dbsnp, or --genome_fasta are empty.', params.logfile)
        else:
            params.vcf_location = os.path.abspath(params.vcf_location)
        # Download dbsnp file if required (reuse a previously downloaded copy)
        if params.dbsnp_file == 'DOWNLOAD':
            if os.path.exists('/'.join([params.vcf_location, '00-All.vcf'])):
                params.dbsnp_file = '/'.join(
                    [params.vcf_location, '00-All.vcf'])
            else:
                params.dbsnp_file = prepare.download_vcf('dbsnp', params)
        # Download cosmic file if required (reuse a previously downloaded copy)
        if params.cosmic_file == 'DOWNLOAD':
            if os.path.exists('/'.join(
                [params.vcf_location, 'Cosmic_sorted.vcf'])):
                params.cosmic_file = '/'.join(
                    [params.vcf_location, 'Cosmic_sorted.vcf'])
            else:
                params.cosmic_file = prepare.download_vcf('cosmic', params)
        # Download genome fasta if required
        if params.genome_fasta == 'DOWNLOAD' or not \
                os.path.exists(params.genome_fasta):
            if os.path.exists(''.join(
                [params.vcf_location, '/', params.genome_version, '.fa'])):
                params.genome_fasta = ''.join(
                    [params.vcf_location, '/', params.genome_version, '.fa'])
            else:
                # BUGFIX: the params attribute is named tbtf_executable (set
                # just above and used by every sibling function), not
                # twoBitToFa_executable; also log to params.logfile like all
                # other calls instead of sys.stderr.
                params.genome_fasta = prepare.get_genome(
                    params.genome_version, params.vcf_location,
                    params.tbtf_executable, params.logfile)
        else:
            params.genome_fasta = os.path.abspath(params.genome_fasta)
    #  Set up the value for rna_fasta
    if params.rna_fasta == 'GENOME_FASTA':
        params.rna_fasta = params.genome_fasta
    else:
        params.rna_fasta = pi_errors.test_param_value(params.rna_fasta,
                                                      'RNA Fasta',
                                                      '--rna_fasta',
                                                      params.logfile)
    #  Ensure the other databases are set up correctly.
    #  The radia package path is 2 levels above the executable.
    radia_pkg_path = os.path.split(os.path.split(
        params.radia_executable)[0])[0]
    database_map = defaultdict()
    test_database(params.blacklist, 'blacklist', radia_pkg_path,
                  params.vcf_location, 'data/hg19/blacklists/1000Genomes/' + \
                  'phase1/', database_map)
    test_database(params.retrogenes, 'retrogenes', radia_pkg_path,
                  params.vcf_location, 'data/hg19/retroGenes/', database_map)
    # NOTE(review): 'peudoGenes' looks like a typo for 'pseudoGenes' — confirm
    # against the radia data directory layout before changing.
    test_database(params.pseudogenes, 'pseudogenes', radia_pkg_path,
                  params.vcf_location, 'data/hg19/peudoGenes/', database_map)
    test_database(params.broad_targets, 'broad_targets', radia_pkg_path,
                  params.vcf_location, 'data/hg19/broadTargets/', database_map)
    test_database(params.rna_blacklist, 'rna_blacklist', radia_pkg_path,
                  params.vcf_location, 'data/rnaGeneBlacklist.tab',
                  database_map)
    test_database(params.rna_family_blacklist, 'rna_family_blacklist',
                  radia_pkg_path, params.vcf_location,
                  'data/rnaGeneFamilyBlacklist.tab', database_map)
    #  If any of the above were returned as 'DOWNLOAD' then download the radia
    #  data folder to a temp directory from git and set the values for the
    #  invalid ones.
    if any(val == 'DOWNLOAD' for val in database_map.values()):
        download_databases(database_map, params.logfile)
    # If the -C all option was specified, expand params.chromosome to
    # chr1..chr22 + chrX/chrY.
    # BUGFIX: range(1, 23) + ['X', 'Y'] raises TypeError on Python 3; a range
    # object must be materialised before concatenation.
    if params.chromosome == 'all':
        params.chromosome = [''.join(['chr', str(i)])
                             for i in list(range(1, 23)) + ['X', 'Y']]
    return database_map
def process_parameters(params):
    '''
    Conduct error handling for all parameters passed to the snpEff run.

    Validates the input vcf, java/snpEff executables and -Xmx value, then
    either prepares a snpEff "data" directory (downloading the genome fasta
    and gencode GTF when needed) or validates an existing index/config pair.

    :param params: argparse-style namespace holding all pipeline options;
        mutated in place (paths, reference name, data directory).
    :return: None
    :raises pi_errors.InputFileError: if the input vcf is missing.
    :raises pi_errors.ParameterError: on bad --Xmx, or missing
        --snpeff_reference when required.
    '''
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') +
          ': Processing input parameters.', file=params.logfile)
    # Does the input vcf file exist?
    if not os.path.exists(''.join([params.file_path, '/', params.file_prefix,
                                   '.vcf'])):
        raise pi_errors.InputFileError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': Please provide a valid input file using --file_prefix',
            params.logfile)
    #  The memory option for java should be of the form Xmx10G or Xmx10M
    if not (params.java_Xmx.endswith('G') or params.java_Xmx.endswith('M')):
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': Please use a suitable value for --Xmx.', params.logfile)
    params.java_executable = pi_errors.test_param_value(params.java_executable,
                                                        'java',
                                                        '--java',
                                                        params.logfile)
    #  Does the provided snpeff jar exist?
    params.snpeff_jar = pi_errors.test_param_value(params.snpeff_jar,
                                                   'snpeff',
                                                   '--snpeff_jar',
                                                   params.logfile)
    params.use_snpeff_db = False
    #  Does the user want a snpEff packaged database?
    if params.config_file == 'PACKAGED':
        params.use_snpeff_db = True
        #  Has the snpeff reference to be used been provided?
        if params.reference_name is None:
            raise pi_errors.ParameterError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': --snp_reference is required if --config=PACKAGED.',
                params.logfile)
    # If a custom database is desired, does it need to be created?
    if params.index_location is None:
        #  If the user has provided the location to the parent directory of
        #  the data directory, make DATA_DIRECTORY point to data. If they have
        #  provided the link to data, make INDEX_DESTINATION point to the
        #  parent and DATA_DIRECTORY point to data.
        if os.path.split(params.index_destination.rstrip('/'))[1] != 'data':
            params.data_directory = '/'.join([params.index_destination, 'data'])
        else:
            params.data_directory = params.index_destination
            # BUGFIX: rstrip('/data') strips the *character set* {'/', 'd',
            # 'a', 't'}, not the literal suffix, and would mangle parents
            # ending in those characters (e.g. '/my/dat/data' -> '/my').
            # Drop the trailing 'data' component with os.path.split instead.
            params.index_destination = os.path.split(
                params.index_destination.rstrip('/'))[0]
        #  Create the data directory if needed
        if not os.path.exists(params.data_directory):
            prepare.py_mkdir(params.data_directory)
        #  If we're using a packaged database, there is nothing more to do
        if params.use_snpeff_db:
            return None
        #  Initialise the reference name
        params.reference_name = ''.join([params.genome_version, '_custom'])
        # Make a variable to hold GENOME_VERSION_custom
        genome_folder = '/'.join([params.data_directory, params.reference_name])
        prepare.py_mkdir(genome_folder)
        #  If the genome fasta isn't provided or is provided a wrong value,
        #  download it
        if params.genome_fasta == 'DOWNLOAD' or not \
                os.path.exists(params.genome_fasta):
            #  Does the provided tbtf binary point to a valid file?
            params.tbtf_executable = pi_errors.test_param_value(
                params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
                params.logfile)
            params.genome_fasta = prepare.get_genome(
                params.genome_version, genome_folder,
                params.tbtf_executable, params.logfile)
            #  Rename genome fasta to the name snpEff expects
            call(['mv', params.genome_fasta, '/'.join([genome_folder,
                                                       'sequences.fa'])])
        else:
            params.genome_fasta = os.path.abspath(params.genome_fasta)
            #  Link sequences.fa to the genome fasta
            call(['ln', '-s', '-T', params.genome_fasta,
                  '/'.join([genome_folder, 'sequences.fa'])])
        #  Download the gencode GTF file
        params.gtf_file = prepare.get_gtf(params.genome_version,
                                          genome_folder,
                                          params.logfile)
        #  Rename gtf file to the name snpEff expects
        call(['mv', params.gtf_file, '/'.join([genome_folder, 'genes.gtf'])])
    # If an index has been provided, set up the config file
    else:
        #  If the user has provided the location to the parent directory of
        #  the data directory, make DATA_DIRECTORY point to data. If they have
        #  provided the link to data, make INDEX_LOCATION point to the parent
        #  and DATA_DIRECTORY point to data.
        if os.path.split(params.index_location.rstrip('/'))[1] != 'data':
            params.data_directory = '/'.join([params.index_location, 'data'])
        else:
            params.data_directory = params.index_location
            # BUGFIX: same character-set rstrip bug as above — use
            # os.path.split to obtain the parent directory safely.
            params.index_location = os.path.split(
                params.index_location.rstrip('/'))[0]
        #  If we're using a packaged database, there is nothing more to do
        if params.use_snpeff_db:
            return None
        #  If the config file hasn't been provided, is it in INDEX_LOCATION
        #  AND does GENOME_VERSION_custom exist (i.e. was it created by
        #  this script?)
        if params.config_file is None:
            params.config_file = pi_errors.test_param_value(
                '/'.join([params.index_location, 'snpEff.config']),
                'snpEff.config', '--config', params.logfile)
            #  Dummy variable to ensure the GENOME_VERSION_custom exists
            _ = pi_errors.test_param_value(
                ''.join([params.data_directory, '/', params.genome_version,
                         '_custom']), '_'.join([params.genome_version, 'custom'
                                               ]), '--snpeff_reference and' +
                '--config', params.logfile)
            params.reference_name = '_'.join([params.genome_version, 'custom'])
        #  If a config file has been provided, does it point to a legit file
        #  and has the reference name also been provided?
        else:
            params.config_file = pi_errors.test_param_value(
                params.config_file, 'snpEff config file', '--config',
                params.logfile)
            if params.reference_name is None:
                raise pi_errors.ParameterError(
                    dt.now().strftime('%I:%M %p %b %d, %Y') + \
                    ': --snpeff_reference is required if --config points to' + \
                    ' a custom file.', params.logfile)
    return None
def process_parameters(params):
    '''
    Conduct error handling for all parameters passed to the MHCII run.

    Validates the predictor executables, the prediction method, the input
    .faa file, and the provided alleles (optionally parsed from a .alleles
    file).

    :param params: argparse-style namespace holding all pipeline options;
        mutated in place (executables, peplen, alleles).
    :return: validated path to the input peptide .faa file.
    :raises pi_errors.ParameterError: on a bad predictor, prediction method,
        file prefix, or malformed allele.
    '''
    #  Are predictor executables set up correctly?
    if os.path.split(params.mhc_executable)[1] == 'mhc_II_binding.py':
        params.mhc_executable = pi_errors.test_param_value(
            params.mhc_executable, 'mhc_II_binding.py', '--mhc_predictor',
            params.logfile)
        params.netmhciipan_executable = pi_errors.test_param_value(
            params.netmhciipan_executable, 'netMHCIIpan', '--netMHCIIpan',
            params.logfile)
    else:
        # BUGFIX: the message previously named predict_binding.py (the MHCI
        # predictor); the accepted value here is mhc_II_binding.py.
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': --mhc_predictor has to be mhc_II_binding.py', params.logfile)
    #  List of acceptable prediction methods.  Used to ensure the correct
    #  prediction method has been provided.
    prediction_methods = {'comblib', 'consensus3', 'IEDB_recommended',
                          'NetMHCIIpan', 'nn_align', 'smm_align',
                          'tepitope'}
    if params.pred_meth not in prediction_methods:
        # BUGFIX: ''.join over a set mashed the options together in arbitrary
        # order; list them comma-separated and sorted for a readable,
        # deterministic message.
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': --prediction_method has to be one of ' + \
            ', '.join(sorted(prediction_methods)), params.logfile)
    #  For MHCII, peplen can only be 15.
    params.peplen = [15]
    #  Ensure the --file_prefix has been set up right
    if params.file_prefix.endswith('.faa'):
        pepfilename = pi_errors.test_param_value(
            '/'.join([params.file_path, params.file_prefix]), 'Input file',
            '--file_prefix', params.logfile)
    else:
        #  file_prefix must point to a single .faa file
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') + \
            ': --file_prefix must point to a valid .faa file.', params.logfile)
    #  Test the input alleles for validity
    #  First, if an input .alleles file is provided, parse it to obtain the
    #  list of alleles
    if len(params.alleles) == 1 and params.alleles[0].endswith(".alleles"):
        #  This means the user has provided a .alleles file
        pi_errors.test_param_value(params.alleles[0], params.alleles[0],
                                   '--alleles', params.logfile)
        with open(params.alleles[0], 'r') as allele_file:
            params.alleles = []
            for line in allele_file:
                params.alleles.append(line.strip())
    #  Once the .allele file has been parsed, params.alleles now either
    #  contains the parsed alleles, or the list of alleles provided by the
    #  user.  Now we need to ensure that all alleles have been formatted in
    #  the required format Eg. HLA-DRB1*01:04, HLA-DQA1*03:01/DQB1*03:02
    #  IMPORTANT: HLA-DP isn't implemented yet -- none of the predictors use it
    MHCII = re.compile(
        r'HLA-D((RB\d\*\d{2}:\d{2})|'
        r'([PQ]A\d\*\d{2}:\d{2}/D[PQ]B\d\*\d{2}:\d{2}))')

    for allele in params.alleles:
        #  For each allele in params.alleles, check if it matches the regex
        #  for a generic MHCII molecule
        if not MHCII.match(allele):
            # BUGFIX: the exception was previously *returned*, not raised, so
            # a malformed allele silently yielded an exception object to the
            # caller instead of aborting.
            raise pi_errors.ParameterError(
                dt.now().strftime('%I:%M %p %b %d, %Y') + \
                ': Alleles for mhcii must be in the form HLA-DRB1*XX:XX or' + \
                ' HLA-DQA1*XX:XX/DQB1*XX:XX where X is in [0, 9]',
                params.logfile)
    return pepfilename