Beispiel #1
0
def rsem_index(rsem_index_executable, fasta_input, bowtie_info, params):
    '''
    Create the RSEM references in params.index_destination by running
    RSEM_INDEX_EXECUTABLE. If FASTA_INPUT is true, the bowtie version given in
    BOWTIE_INFO is passed to RSEM as `--<bowtie_version>` together with
    `--<bowtie_version>-path <bowtie_path>`; otherwise RSEM is run with
    `--no-bowtie`.

    bowtie_info is a tuple of (bowtie_path, bowtie_version). It is only read
    when FASTA_INPUT is true.
    params contains
    index_destination - Folder to store the indexes
    genome_fasta - path to genomic fasta file. Can also specify DOWNLOAD.
                   It is overwritten with the value returned by
                   prepare.get_genome (DOWNLOAD) or by
                   pi_errors.test_param_value (anything else).
    genome_version - hg19/hg38
    tbtf_executable - handed to prepare.get_genome when genome_fasta is
                      DOWNLOAD
    logfile - Open file handle to a log file

    RETURN VALUES
    index_path - Absolute path to the directory where indexes were stored

    Raises pi_errors.MyRuntimeError if RSEM_INDEX_EXECUTABLE exits with a
    non-zero status.
    '''
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Creating rsem references...',
          file=params.logfile)
    index_path = os.path.abspath(params.index_destination)
    #  If the directory doesn't exist, create it
    if not os.path.exists(index_path):
        prepare.py_mkdir(index_path)
    if params.genome_fasta == 'DOWNLOAD':
        params.genome_fasta = prepare.get_genome(params.genome_version,
                                                 index_path,
                                                 params.tbtf_executable,
                                                 params.logfile)
    else:
        params.genome_fasta = pi_errors.test_param_value(
            params.genome_fasta, 'Genomic Fasta', '--genome_fasta',
            params.logfile)
    #  Obtain the annotation gtf for this genome version
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ':    ' +
          'Running rsem-prepare-reference on fasta reference.',
          file=params.logfile)
    rsem_prepref_call = [rsem_index_executable]  # base call
    rsem_prepref_call.extend(['--gtf', gencode_file])  # gtf file
    if fasta_input:
        #  Unpack only in this branch so callers that do not need bowtie are
        #  not forced to supply a valid tuple.
        bowtie_path, bowtie_version = bowtie_info
        rsem_prepref_call.extend([
            ''.join(['--', bowtie_version]),
            ''.join(['--', bowtie_version, '-path']), bowtie_path
        ])
    else:
        rsem_prepref_call.append('--no-bowtie')
    rsem_prepref_call.append(params.genome_fasta)
    #  Reference name: files are written as <index_path>/<genome_version>.*
    rsem_prepref_call.append(''.join([index_path, '/', params.genome_version]))
    print(rsem_prepref_call, file=params.logfile)
    return_value = call(rsem_prepref_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') +
            ': Indexing Failed', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Indexing completed.',
          file=params.logfile)
    return index_path
def rsem_index(rsem_index_executable, fasta_input, bowtie_info, params):
    '''
    Create the RSEM references in params.index_destination by running
    RSEM_INDEX_EXECUTABLE. If FASTA_INPUT is true, the bowtie version given in
    BOWTIE_INFO is passed to RSEM as `--<bowtie_version>` together with
    `--<bowtie_version>-path <bowtie_path>`; otherwise RSEM is run with
    `--no-bowtie`.

    bowtie_info is a tuple of (bowtie_path, bowtie_version). It is only read
    when FASTA_INPUT is true.
    params contains
    index_destination - Folder to store the indexes
    genome_fasta - path to genomic fasta file. Can also specify DOWNLOAD.
                   It is overwritten with the value returned by
                   prepare.get_genome (DOWNLOAD) or by
                   pi_errors.test_param_value (anything else).
    genome_version - hg19/hg38
    tbtf_executable - handed to prepare.get_genome when genome_fasta is
                      DOWNLOAD
    logfile - Open file handle to a log file

    RETURN VALUES
    index_path - Absolute path to the directory where indexes were stored

    Raises pi_errors.MyRuntimeError if RSEM_INDEX_EXECUTABLE exits with a
    non-zero status.
    '''
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Creating rsem references...', file=params.logfile)
    index_path = os.path.abspath(params.index_destination)
    #  If the directory doesn't exist, create it
    if not os.path.exists(index_path):
        prepare.py_mkdir(index_path)
    if params.genome_fasta == 'DOWNLOAD':
        params.genome_fasta = prepare.get_genome(params.genome_version,
                                                 index_path,
                                                 params.tbtf_executable,
                                                 params.logfile)
    else:
        params.genome_fasta = pi_errors.test_param_value(params.genome_fasta,
                                                         'Genomic Fasta',
                                                         '--genome_fasta',
                                                         params.logfile)
    #  Obtain the annotation gtf for this genome version
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ':    ' +
          'Running rsem-prepare-reference on fasta reference.',
          file=params.logfile)
    rsem_prepref_call = [rsem_index_executable] # base call
    rsem_prepref_call.extend(['--gtf', gencode_file]) # gtf file
    if fasta_input:
        #  Unpack only in this branch so callers that do not need bowtie are
        #  not forced to supply a valid tuple.
        bowtie_path, bowtie_version = bowtie_info
        rsem_prepref_call.extend([''.join(['--', bowtie_version]),
                                  ''.join(['--', bowtie_version, '-path']),
                                  bowtie_path])
    else:
        rsem_prepref_call.append('--no-bowtie')
    rsem_prepref_call.append(params.genome_fasta)
    #  Reference name: files are written as <index_path>/<genome_version>.*
    rsem_prepref_call.append(''.join([index_path, '/',
                                      params.genome_version]))
    print(rsem_prepref_call, file=params.logfile)
    return_value = call(rsem_prepref_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') +
            ': Indexing Failed', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Indexing completed.', file=params.logfile)
    return index_path
Beispiel #3
0
def star_indexing(star_executable, read_length, params):
    '''
    Build a STAR genome index with STAR_EXECUTABLE. READ_LENGTH is rounded to
    a multiple of 50 (never below 50) to name the index directory, and is
    passed unrounded to STAR as --sjdbOverhang.

    params contains
    index_destination - The location where the index should be stored. It is
                        replaced by its absolute path.
    logfile - Open file handle to a log file
    genome_version - hg19/hg38
    n - number of cores to use (passed as --runThreadN)
    tbtf_executable - path to twoBitToFa

    RETURN VALUES
    index_path - path to the directory where indexes were stored
                 (<index_destination>/STAR_<edge_size>_references)

    Raises pi_errors.MyRuntimeError if STAR exits with a non-zero status.
    '''
    def _stamp():
        #  Wall-clock time in the format used by every log line
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    print('PROGRESS ' + _stamp() + ': ' + 'Indexing fasta...',
          file=params.logfile)

    destination = os.path.abspath(params.index_destination)
    params.index_destination = destination
    if not os.path.exists(destination):
        prepare.py_mkdir(destination)

    #  Round to a multiple of 50 and clamp at the minimum edge size of 50
    rounded_length = int(round(read_length / 50, 0) * 50)
    edge_size = max(50, rounded_length)

    #  One reference directory per edge size
    index_path = destination + '/STAR_' + str(edge_size) + '_references'
    if not os.path.exists(index_path):
        prepare.py_mkdir(index_path)

    genome_fasta = prepare.get_genome(params.genome_version, index_path,
                                      params.tbtf_executable, params.logfile)
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)

    print('PROGRESS ' + _stamp() + ':    ' +
          'Running STAR index on fasta reference.',
          file=params.logfile)
    starindex_call = [
        star_executable,                    # Base call
        '--runThreadN', str(params.n),      # Threads
        '--runMode', 'genomeGenerate',      # Indexing module
        '--genomeDir', index_path,          # index directory
        '--genomeFastaFiles', genome_fasta, # Genomic fa
        '--sjdbGTFfile', gencode_file,      # gencode annots
        '--sjdbOverhang', str(read_length), # edge size
    ]
    print(starindex_call, file=params.logfile)

    if call(starindex_call) != 0:
        raise pi_errors.MyRuntimeError(_stamp() + ': Indexing Failed',
                                       params.logfile)
    print('PROGRESS ' + _stamp() + ': ' + 'Indexing completed.',
          file=params.logfile)
    return index_path
def star_indexing(star_executable, read_length, params):
    '''
    Build a STAR genome index with STAR_EXECUTABLE. READ_LENGTH is rounded to
    a multiple of 50 (never below 50) to name the index directory, and is
    passed unrounded to STAR as --sjdbOverhang.

    params contains
    index_destination - The location where the index should be stored. It is
                        replaced by its absolute path.
    logfile - Open file handle to a log file
    genome_version - hg19/hg38
    n - number of cores to use (passed as --runThreadN)
    tbtf_executable - path to twoBitToFa

    RETURN VALUES
    index_path - path to the directory where indexes were stored
                 (<index_destination>/STAR_<edge_size>_references)

    Raises pi_errors.MyRuntimeError if STAR exits with a non-zero status.
    '''
    def _stamp():
        #  Wall-clock time in the format used by every log line
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    print('PROGRESS ' + _stamp() + ': ' + 'Indexing fasta...',
          file=params.logfile)

    destination = os.path.abspath(params.index_destination)
    params.index_destination = destination
    if not os.path.exists(destination):
        prepare.py_mkdir(destination)

    #  Round to a multiple of 50 and clamp at the minimum edge size of 50
    rounded_length = int(round(read_length / 50, 0) * 50)
    edge_size = max(50, rounded_length)

    #  One reference directory per edge size
    index_path = destination + '/STAR_' + str(edge_size) + '_references'
    if not os.path.exists(index_path):
        prepare.py_mkdir(index_path)

    genome_fasta = prepare.get_genome(params.genome_version, index_path,
                                      params.tbtf_executable,
                                      params.logfile)
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)

    print('PROGRESS ' + _stamp() + ':    ' +
          'Running STAR index on fasta reference.', file=params.logfile)
    starindex_call = [
        star_executable,                    # Base call
        '--runThreadN', str(params.n),      # Threads
        '--runMode', 'genomeGenerate',      # Indexing module
        '--genomeDir', index_path,          # index directory
        '--genomeFastaFiles', genome_fasta, # Genomic fa
        '--sjdbGTFfile', gencode_file,      # gencode annots
        '--sjdbOverhang', str(read_length), # edge size
    ]
    print(starindex_call, file=params.logfile)

    if call(starindex_call) != 0:
        raise pi_errors.MyRuntimeError(_stamp() + ': Indexing Failed',
                                       params.logfile)
    print('PROGRESS ' + _stamp() + ': ' + 'Indexing completed.',
          file=params.logfile)
    return index_path
def _resolve_data_directory(location):
    '''
    Split LOCATION into (parent_directory, data_directory).

    If the last component of LOCATION (ignoring trailing slashes) is `data`,
    LOCATION itself is the data directory and its parent is returned as the
    parent directory (`.` when LOCATION is a bare relative `data`). Otherwise
    LOCATION is the parent and `<LOCATION>/data` is the data directory.
    '''
    parent, leaf = os.path.split(location.rstrip('/'))
    if leaf == 'data':
        #  os.path.split removes exactly one path component. str.rstrip
        #  with '/data' would instead strip any trailing run of the
        #  characters '/', 'd', 'a' and 't' and mangle the parent path.
        return parent or os.curdir, location
    return location, '/'.join([location, 'data'])


def process_parameters(params):
    '''
    This module conducts the error handling for all parameters passed to the
    program and fills in the derived values on PARAMS.

    params attributes read
    logfile - Open file handle to a log file
    file_path, file_prefix - <file_path>/<file_prefix>.vcf must exist
    java_Xmx - must end in G or M
    java_executable, snpeff_jar - checked with pi_errors.test_param_value
    config_file - PACKAGED, None, or a path to a snpEff config file
    reference_name - snpEff reference; required with PACKAGED or with a
                     user supplied config file
    index_location - existing index location, or None to create one
    index_destination - where to create the index if index_location is None
    genome_version, genome_fasta, tbtf_executable - used when a custom
                     database has to be created

    params attributes set
    java_executable, snpeff_jar, use_snpeff_db, data_directory, and,
    depending on the branch taken, index_destination or index_location,
    reference_name, genome_fasta, tbtf_executable, gtf_file, config_file.

    Raises pi_errors.InputFileError if the input vcf is missing and
    pi_errors.ParameterError for an unsuitable java_Xmx or a missing
    reference name.

    RETURN VALUES
    None
    '''
    def _now():
        #  Wall-clock time in the format used by every log line
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    print('PROGRESS ' + _now() + ': Processing input parameters.',
          file=params.logfile)
    # Does the input vcf file exist?
    input_vcf = ''.join([params.file_path, '/', params.file_prefix, '.vcf'])
    if not os.path.exists(input_vcf):
        raise pi_errors.InputFileError(
            _now() + ': Please provide a valid input file using --file_prefix',
            params.logfile)
    #  The memory option for java should be of the form Xmx10G or Xmx10M
    if not params.java_Xmx.endswith(('G', 'M')):
        raise pi_errors.ParameterError(
            _now() + ': Please use a suitable value for --Xmx.',
            params.logfile)
    params.java_executable = pi_errors.test_param_value(
        params.java_executable, 'java', '--java', params.logfile)
    #  Does the provided snpeff binary exist?
    params.snpeff_jar = pi_errors.test_param_value(
        params.snpeff_jar, 'snpeff', '--snpeff_jar', params.logfile)
    #  Does the user want a snpEff packaged database?
    params.use_snpeff_db = params.config_file == 'PACKAGED'
    #  Has the snpeff reference to be used been provided?
    if params.use_snpeff_db and params.reference_name is None:
        raise pi_errors.ParameterError(
            _now() + ': --snp_reference is required if --config=PACKAGED.',
            params.logfile)
    # If a custom database is desired, does it need to be created?
    if params.index_location is None:
        #  If the user has provided the location to the parent directory of
        #  data directory, make DATA_DIRECTORY point to data. If they have
        #  provided the link to data, make INDEX_DESTINATION point to the
        #  parent and DATA_DIRECTORY point to data.
        params.index_destination, params.data_directory = \
            _resolve_data_directory(params.index_destination)
        #  Create the data directory if needed
        if not os.path.exists(params.data_directory):
            prepare.py_mkdir(params.data_directory)
        #  If we're using a packaged database, there is nothing more to do
        if params.use_snpeff_db:
            return None
        #  Initialise the reference name
        params.reference_name = ''.join([params.genome_version, '_custom'])
        #  Folder that holds GENOME_VERSION_custom
        genome_folder = '/'.join([params.data_directory, params.reference_name])
        prepare.py_mkdir(genome_folder)
        sequences_fasta = '/'.join([genome_folder, 'sequences.fa'])
        #  If the genome fasta isn't provided or is provided a wrong value,
        #  download it
        if params.genome_fasta is None or \
                params.genome_fasta == 'DOWNLOAD' or \
                not os.path.exists(params.genome_fasta):
            #  Does the provided tbtf binary point to a valid file?
            params.tbtf_executable = pi_errors.test_param_value(
                params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
                params.logfile)
            params.genome_fasta = prepare.get_genome(
                params.genome_version, genome_folder,
                params.tbtf_executable, params.logfile)
            #  Rename genome fasta
            call(['mv', params.genome_fasta, sequences_fasta])
        else:
            params.genome_fasta = os.path.abspath(params.genome_fasta)
            #  Link sequences.fa to genome fasta
            call(['ln', '-s', '-T', params.genome_fasta, sequences_fasta])
        #  Download the gencode GTF file
        params.gtf_file = prepare.get_gtf(params.genome_version,
                                          genome_folder,
                                          params.logfile)
        #  Rename gtf file
        call(['mv', params.gtf_file, '/'.join([genome_folder, 'genes.gtf'])])
    # If it has been provided, set up the config file
    else:
        #  If the user has provided the location to the parent directory of
        #  data directory, make DATA_DIRECTORY point to data. If they have
        #  provided the link to data, make INDEX_LOCATION point to the parent
        #  and DATA_DIRECTORY point to data.
        params.index_location, params.data_directory = \
            _resolve_data_directory(params.index_location)
        #  If we're using a packaged database, there is nothing more to do
        if params.use_snpeff_db:
            return None
        #  If the config file hasn't been provided, is it in INDEX_LOCATION
        #  AND does GENOME_VERSION_custom exist (i.e. was it created by
        #  this script?)
        if params.config_file is None:
            params.config_file = pi_errors.test_param_value(
                '/'.join([params.index_location, 'snpEff.config']),
                'snpEff.config', '--config', params.logfile)
            custom_reference = '_'.join([params.genome_version, 'custom'])
            #  Only called for its check that GENOME_VERSION_custom exists;
            #  the return value is not needed.
            pi_errors.test_param_value(
                '/'.join([params.data_directory, custom_reference]),
                custom_reference, '--snpeff_reference and --config',
                params.logfile)
            params.reference_name = custom_reference
        #  If a config file has been provided, does it point to a legit file
        #  and has the reference name also been provided?
        else:
            params.config_file = pi_errors.test_param_value(
                params.config_file, 'snpEff config file', '--config',
                params.logfile)
            if params.reference_name is None:
                raise pi_errors.ParameterError(
                    _now() +
                    ': --snpeff_reference is required if --config points to' +
                    ' a custom file.', params.logfile)
    return None