Example #1
def rmdup_and_blacklist(input_files, genome, output_path, disable_parallel=False):
    primary_logger = _logshim.getLogger('rmdup_blacklist')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=20)

    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))
        # This is extremely fast and has minimal memory usage. Yay!
        # TODO: Allow adjustable windowing (-w %d) to blacklist larger/surrounding regions?
        command = "%s rmdup %s - 2>%s | %s window -abam - -b %s -v -w 0 > %s"

        shell_job_runner.run(command % (CONFIG['binaries']['samtools_legacy'],  # TODO: Update this when samtools is fixed.
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam',  # TODO: CLEAN THIS
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam.log',
                                        CONFIG['binaries']['bedtools'],
                                        os.path.dirname(os.path.realpath(__file__)) + '/' + CONFIG['blacklists'][genome],  # TODO: CLEAN THIS
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam'))

    shell_job_runner.finish()

    primary_logger.info('Removing temporary files from stage 1 ...')
    for filename in input_files:
        os.unlink(output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam')

    primary_logger.info('Completed rmdup and blacklist')

def generate_index(input_files, output_path, disable_parallel=False):
    """
    Many peak pickers want indexed .bams. Let's build indexes! (yay!)

    :param input_files: list of .bam files to index (samtools writes a .bai next to each input)
    :param output_path: accepted for signature consistency, but unused; indexes land beside the inputs
    :param disable_parallel: run jobs serially instead of staggering them in parallel
    :return: None
    """
    primary_logger = _logshim.getLogger('index')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=10)

    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))
        command = "%s index %s"

        shell_job_runner.run(command % (CONFIG['binaries']['samtools'], filename))

    shell_job_runner.finish()
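
All of these examples lean on helpers the listing doesn't show: `import os` and `import tempfile`, the project-local `_logshim` and `_script_helpers` modules, and module-level `CONFIG` / `MAX_MEM` globals. As a rough sketch of the ShellJobRunner contract the call sites imply (constructed without delay_seconds when disable_parallel is set, so no delay is treated here as run-serially), this is a hypothetical stand-in, not the real implementation:

import subprocess
import time


class ShellJobRunner(object):
    """Hypothetical stand-in, inferred purely from the call sites above."""

    def __init__(self, logger, delay_seconds=None):
        self.logger = logger
        self.delay_seconds = delay_seconds
        self.jobs = []

    def run(self, command):
        # shell=True is implied by the pipe-heavy commands; callers must
        # sanitize filenames (hence the asserts in the examples below).
        self.logger.debug('Spawning: %s' % (command))
        process = subprocess.Popen(command, shell=True)
        if self.delay_seconds is None:
            process.wait()  # no delay configured: behave serially (disable_parallel)
        else:
            self.jobs.append((command, process))
            time.sleep(self.delay_seconds)  # stagger parallel job starts

    def finish(self):
        # Block until every spawned job exits; flag non-zero statuses.
        for command, process in self.jobs:
            if process.wait() != 0:
                self.logger.error('Job failed: %s' % (command))
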
Example #3
def fastq_merge(merge_strategy, output_path, disable_parallel=False):
    """
    Concatenate multiple fastq files (from multiple lanes) into one.

    :param merge_strategy: dict mapping each merged output name to its list of input .fastq.gz files
    :param output_path: directory for the merged .fastq.gz outputs
    :param disable_parallel: run jobs serially instead of staggering them in parallel
    :return: True on completion
    """
    merge_log = _logshim.getLogger('fastq_merge')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(merge_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(merge_log, delay_seconds=45)

    for merged_name, merge_inputs in merge_strategy.items():
        merge_input_files = ' '.join(merge_inputs)
        merge_log.info('Spawning niced process to merge: %s' % (merged_name))
        for filename in merge_inputs:
            assert(" " not in filename)
            assert(";" not in filename)  # Vague sanity testing for input filenames
            merge_log.debug('    Input: %s' % (filename))

        # WARNING: Using shell has security implications! Don't work on untrusted input filenames.
        command = "zcat %s | gzip -1 > %s/%s.fastq.gz" % (merge_input_files, output_path, merged_name)

        shell_job_runner.run(command)

    shell_job_runner.finish()

    return True
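
For context, merge_strategy maps each merged output name to its per-lane inputs. A hypothetical invocation (filenames invented for illustration):

merge_strategy = {
    'sampleA': ['sampleA_L001.fastq.gz', 'sampleA_L002.fastq.gz'],
    'sampleB': ['sampleB_L001.fastq.gz', 'sampleB_L002.fastq.gz'],
}
fastq_merge(merge_strategy, output_path='merged')
# Each job runs, e.g.:
#   zcat sampleA_L001.fastq.gz sampleA_L002.fastq.gz | gzip -1 > merged/sampleA.fastq.gz
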
Example #4
def large_filter_fixmate_and_sort(input_files, genome, output_path, disable_parallel=False):
    primary_logger = _logshim.getLogger('first_pass')

    output_suffix = ".tmp"

    if disable_parallel:  # Doesn't change parallelism in last samtools sort
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=60)

    # We do a few things here:
    #  - View only mapping quality >= 10
    #  - Remove chrM
    #  - Sort by name for fixmate
    #     We don't parallelize here (-@ N) because fixmate blocks, and parallelism seems to help only with compressed output.
    #  - Fixmate (needed for rmdup)
    #  - Re-sort by position
    tempfiles = []
    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))
        command = 'export LANG=C; %s view -h -q 10 %s | grep -vF "chrM" | %s view -u -b - | ' \
                  '%s sort -l 0 -n -m %s -T %s -O bam | %s fixmate -O bam - - | %s sort -@ 8 -m %s - %s'

        # A super evil user could modify TMPDIR and make this generate evil strings. That's evil.
        temporary_file = tempfile.mkstemp('.tmp.bam')
        tempfiles.append(temporary_file)

        shell_job_runner.run(command % (CONFIG['binaries']['samtools'],
                                        filename,
                                        CONFIG['binaries']['samtools'],
                                        CONFIG['binaries']['samtools'],
                                        MAX_MEM,
                                        temporary_file[1],
                                        CONFIG['binaries']['samtools'],
                                        CONFIG['binaries']['samtools'],
                                        MAX_MEM,
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + output_suffix))

    shell_job_runner.finish()

    # Clean up our temporary files.
    primary_logger.info('Removing temporary files ...')
    for fd, fname in tempfiles:
        os.close(fd)
        os.unlink(fname)

    primary_logger.info('First large stage complete! Saved as .tmp.bam for next stage.')
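
For reference, with CONFIG['binaries']['samtools'] set to plain 'samtools' and MAX_MEM set to '4G' (both are illustrative assumptions; the real values come from configuration not shown here), the pipeline spawned for a hypothetical 'sample.bam' renders roughly as:

# Illustration only; binary paths, memory values, and filenames are invented.
#   export LANG=C; samtools view -h -q 10 sample.bam | grep -vF "chrM" \
#     | samtools view -u -b - \
#     | samtools sort -l 0 -n -m 4G -T /tmp/tmpXXXXXX.tmp.bam -O bam \
#     | samtools fixmate -O bam - - \
#     | samtools sort -@ 8 -m 4G - out/sample.tmp
# The final (legacy-style) sort appends '.bam', yielding out/sample.tmp.bam.
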
def run_bowtie2(paired_end_mapping, genome, output_path, disable_parallel=False):
    bowtie2_logger = _logshim.getLogger('run_bowtie2')

    # Import the config file to get genome locations
    config = _script_helpers.get_config()

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger, delay_seconds=60)

    for output_prefix, paired_ends in paired_end_mapping.items():
        bowtie2_logger.info('Spawning niced process for bowtie2 on: %s' % (output_prefix))
        for filename in paired_ends:
            assert(" " not in filename)
            assert(";" not in filename)  # Vague sanity testing for input filenames
            bowtie2_logger.debug('    Input: %s' % (filename))

        # bowtie2 options:
        # --end-to-end: this is the default, but let's explicitly specify it
        # --sensitive: again, the default (consider switching to --fast?)
        # --no-unal: Suppress unaligned reads from the output .sam
        # --no-discordant: These are paired-end reads. We expect them to be non-discordant.
        # --mm: mmap MAP_SHARED (other processes can use our genome, cool!)
        # --met-stderr: Write metrics to stderr
        # --time: output the time things took
        # -x: target genome
        command = "bowtie2 --end-to-end --sensitive --no-unal --no-discordant --mm --met-stderr --time -x %s -1 %s -2 %s 2>%s | samtools view -bS - >%s"

        shell_job_runner.run(command % (config['bowtie2_genomes'][genome],
                                        paired_ends[0],
                                        paired_ends[1],
                                        output_path + "/" + output_prefix + ".bt2.log",
                                        output_path + "/" + output_prefix + ".bt2.bam"))

    shell_job_runner.finish()
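
Here, paired_end_mapping keys become output prefixes, and each value list supplies the R1/R2 mates, in that order, to bowtie2's -1/-2 flags. A hypothetical call (the filenames and the 'mm9' genome key are invented; it assumes config['bowtie2_genomes']['mm9'] points at a bowtie2 index):

paired_end_mapping = {
    'sampleA': ['sampleA_R1.fastq.gz', 'sampleA_R2.fastq.gz'],  # [R1, R2]: order matters
}
run_bowtie2(paired_end_mapping, genome='mm9', output_path='aligned')
# Writes aligned/sampleA.bt2.bam, with bowtie2's stderr in aligned/sampleA.bt2.log
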
def run_macs14(input_files, output_path, genome, disable_parallel=False):
    macs14_log = _logshim.getLogger('run_macs14')

    macs14_log.info('Spawning MACS14 jobs...')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(macs14_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(macs14_log, delay_seconds=20)

    for filename in input_files:
        macs14_log.debug('Working on: %s' % (filename))

        # macs14 is old, but we try it anyway, since it's sometimes useful.
        # -t: input
        # -n: output name
        # -f: format
        # -g: genome
        # -p: pvalue for peak cutoff
        # --wig: save .wig outputs
        # --single-profile: make one single wiggle
        # --space=50: wiggle resolution (default: 10)
        #
        # Note: This CD hack is because MACS1.4 can't specify an output path :(
        command = "cd %s && %s -t %s -n %s -f BAM -g %s -p 1e-9 --wig --single-profile --space=50 2>%s"

        filename_without_extension = os.path.splitext(filename)[0] + '.macs14'

        shell_job_runner.run(command % (
            output_path,  # for cd hack
            'macs14',  # This must be pre-installed by the user. It's a big, complex package.
            os.getcwd() + '/' + filename,  # input file # TODO: Fix this path hack. MACS14 cannot specify an output path :/
            os.path.basename(filename_without_extension),
            genome,  # for genome size
            os.path.basename(filename_without_extension) + '.macs14.log'))

    shell_job_runner.finish()

    macs14_log.info('MACS14 peak calling complete.')

def run_macs2(input_files, output_path, genome, disable_parallel=False):
    macs2_log = _logshim.getLogger('run_macs2')

    macs2_log.info('Spawning MACS2 jobs...')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(macs2_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(macs2_log, delay_seconds=0.1)

    for filename in input_files:
        macs2_log.debug('Working on: %s' % (filename))

        # --bdg: generate .bed graph output
        # --nomodel: We'll be shifting manually!
        # --extsize 200: See long discussion at: @@@
        # --shift -100: As per above.
        # --slocal: Look at a local window of 20kb to build peak models
        # --keep-dup: We already removed duplicates with samtools.
        # TODO: Consider allowing tweaks to these settings with flags?
        command = "%s callpeak -t %s -n %s --outdir %s -g %s --bdg --nomodel --extsize 200 --shift -100 --slocal 20000 --llocal 20000 --keep-dup all 2>%s"

        filename_without_extension = os.path.splitext(filename)[0] + '.macs2'

        shell_job_runner.run(command % (
            'macs2',  # This must be pre-installed by the user. It's a big, complex package.
            filename,  # input file
            os.path.basename(filename_without_extension),
            output_path,
            genome,  # for genome size; unclear if this actually matters with nolambda/nomodel
            output_path + "/" + os.path.basename(filename_without_extension) + '.log'))

    shell_job_runner.finish()

    macs2_log.info('MACS2 peak calling complete.')
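
A hypothetical call (the input path is invented; 'mm' is macs2's effective-genome-size shorthand for mouse):

run_macs2(['sample.srt.rmdup.bam'], output_path='peaks', genome='mm')
# Renders roughly as:
#   macs2 callpeak -t sample.srt.rmdup.bam -n sample.srt.rmdup.macs2 --outdir peaks -g mm \
#     --bdg --nomodel --extsize 200 --shift -100 --slocal 20000 --llocal 20000 \
#     --keep-dup all 2>peaks/sample.srt.rmdup.macs2.log
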
Example #8
def merge_and_rmdup(input_files, output_path, disable_parallel=False):
    primary_logger = _logshim.getLogger('merge_and_rmdup')

    # Sanity checks on the input files list
    assert(len(input_files) > 1)
    # Check files are readable
    for filename in input_files:
        if not os.access(filename, os.R_OK):
            primary_logger.fatal("Unable to read input file: %s" % (filename))
            raise IOError("Unable to read input file: %s" % (filename))

    output_file_name = '-AND-'.join([os.path.basename(os.path.splitext(filename)[0]) for filename in input_files])

    # Sanity check: maximum output filename length
    max_filename_length = os.statvfs(output_path).f_namemax
    if max_filename_length < 100:
        primary_logger.fatal("Cannot deal with short filename length limit. Maybe namemax is broken?")
        raise IOError

    if (len(output_file_name) + 10) > max_filename_length:  # roughly truncate the filename for sanity.
        primary_logger.critical("Very long filename! Truncating!")
        output_file_name = output_file_name[:max_filename_length - 30]  # Leave some extra room for downstream suffixes.

    output_file_name += ".merged.bam"

    input_file_string = ' '.join(input_files)

    shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)

    primary_logger.debug('Input file string: %s' % (input_file_string))
    primary_logger.debug('Working on merge as: %s' % (output_file_name))
    # This is pretty fast and has minimal memory usage. Yay!
    # We're probably re-rmduping some files if we're merging. That's ok since this is speedy.
    command = "%s merge -u - %s | %s rmdup - %s 2>%s"

    shell_job_runner.run(command % (CONFIG['binaries']['samtools'],
                                    input_file_string,
                                    CONFIG['binaries']['samtools_legacy'],  # TODO: Update this when samtools is fixed.
                                    output_path + "/" + output_file_name,
                                    output_path + "/" + os.path.basename(os.path.splitext(output_file_name)[0]) + '-rmdup.log'))

    shell_job_runner.finish()

    primary_logger.info('Merge and rmdup complete!')
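
A hypothetical call, merging two replicates (paths invented for illustration):

merge_and_rmdup(['rep1.srt.rmdup.bam', 'rep2.srt.rmdup.bam'], output_path='merged')
# Produces merged/rep1.srt.rmdup-AND-rep2.srt.rmdup.merged.bam, with the rmdup
# log at merged/rep1.srt.rmdup-AND-rep2.srt.rmdup.merged-rmdup.log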