def rmdup_and_blacklist(input_files, genome, output_path, disable_parallel=False):
    """
    Remove duplicate reads with samtools rmdup, then drop reads overlapping the
    genome's blacklist regions using bedtools window.
    """
    primary_logger = _logshim.getLogger('rmdup_blacklist')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=20)

    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))

        # This is extremely fast and has minimal memory usage. Yay!
        # TODO: Allow adjustable windowing (-w %d) to blacklist larger/surrounding regions?
        command = "%s rmdup %s - 2>%s | %s window -abam - -b %s -v -w 0 > %s"

        shell_job_runner.run(command % (CONFIG['binaries']['samtools_legacy'],  # TODO: Update this when samtools is fixed.
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam',  # TODO: CLEAN THIS
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam.log',
                                        CONFIG['binaries']['bedtools'],
                                        os.path.dirname(os.path.realpath(__file__)) + '/' + CONFIG['blacklists'][genome],  # TODO: CLEAN THIS
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam'))

    shell_job_runner.finish()

    primary_logger.info('Removing temporary files from stage 1 ...')
    for filename in input_files:
        os.unlink(output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam')

    primary_logger.info('Completed rmdup and blacklist')


def generate_index(input_files, output_path, disable_parallel=False):
    """
    Many peak pickers want indexed .bams. Let's build indexes! (yay!)

    :param input_files:
    :param output_path:
    :param disable_parallel:
    :return:
    """
    primary_logger = _logshim.getLogger('index')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=10)

    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))

        command = "%s index %s"

        shell_job_runner.run(command % (CONFIG['binaries']['samtools'], filename))

    shell_job_runner.finish()


def fastq_merge(merge_strategy, output_path, disable_parallel=False):
    """
    Concatenate multiple fastq files (from multiple lanes) into one.

    :param merge_strategy:
    :param output_path:
    :return:
    """
    merge_log = _logshim.getLogger('fastq_merge')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(merge_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(merge_log, delay_seconds=45)

    for merged_name, merge_inputs in merge_strategy.iteritems():
        merge_input_files = ' '.join(merge_inputs)

        merge_log.info('Spawning niced process to merge: %s' % (merged_name))
        for filename in merge_inputs:
            assert(" " not in filename)
            assert(";" not in filename)  # Vague sanity testing for input filenames
            merge_log.debug(' Input: %s' % (filename))

        # WARNING: Using shell has security implications! Don't work on untrusted input filenames.
        command = "zcat %s | gzip -1 > %s/%s.fastq.gz" % (merge_input_files, output_path, merged_name)

        shell_job_runner.run(command)

    shell_job_runner.finish()

    return True


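# A hypothetical example of the merge_strategy mapping consumed by fastq_merge() above.
# The sample names and filenames here are illustrative only (not from the original
# pipeline): each output basename maps to the per-lane .fastq.gz inputs that get
# concatenated into <output_path>/<merged_name>.fastq.gz.
#
#     merge_strategy = {
#         'sampleA': ['sampleA_L001_R1.fastq.gz', 'sampleA_L002_R1.fastq.gz'],
#         'sampleB': ['sampleB_L001_R1.fastq.gz', 'sampleB_L002_R1.fastq.gz'],
#     }
#     fastq_merge(merge_strategy, output_path='merged')

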
def large_filter_fixmate_and_sort(input_files, genome, output_path, disable_parallel=False):
    """
    First large pass over the raw .bam files: quality-filter, strip chrM, fixmate,
    and sort by position. Results are saved as .tmp.bam for the next stage.
    """
    primary_logger = _logshim.getLogger('first_pass')

    output_suffix = ".tmp"

    if disable_parallel:  # Doesn't change parallelism in last samtools sort
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=60)

    # We do a few things here:
    #  - View only mapping quality >= 10
    #  - Remove chrM
    #  - Sort by name for fixmate
    #    We don't parallelize here (-@ #) because fixmate blocks & parallel seems to only help for compressed.
    #  - Fixmate (needed for rmdup)
    #  - Re-sort by position
    tempfiles = []
    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))

        command = 'export LANG=C; %s view -h -q 10 %s | grep -vF "chrM" | %s view -u -b - | ' \
                  '%s sort -l 0 -n -m %s -T %s -O bam | %s fixmate -O bam - - | %s sort -@ 8 -m %s - %s'

        # A super evil user could modify TMPDIR and make this generate evil strings. That's evil.
        temporary_file = tempfile.mkstemp('.tmp.bam')
        tempfiles.append(temporary_file)

        shell_job_runner.run(command % (CONFIG['binaries']['samtools'],
                                        filename,
                                        CONFIG['binaries']['samtools'],
                                        CONFIG['binaries']['samtools'],
                                        MAX_MEM,
                                        temporary_file[1],
                                        CONFIG['binaries']['samtools'],
                                        CONFIG['binaries']['samtools'],
                                        MAX_MEM,
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + output_suffix))

    shell_job_runner.finish()

    # Clean up our temporary files.
    primary_logger.info('Removing temporary files ...')
    for fd, fname in tempfiles:
        os.close(fd)
        os.unlink(fname)

    primary_logger.info('First large stage complete! Saved as .tmp.bam for next stage.')


def run_bowtie2(paired_end_mapping, genome, output_path, disable_parallel=False):
    """
    Align paired-end .fastq files against the given genome with bowtie2, piping the
    output through samtools view to produce .bt2.bam files.
    """
    bowtie2_logger = _logshim.getLogger('run_bowtie2')

    # Import the config file to get genome locations
    config = _script_helpers.get_config()

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger, delay_seconds=60)

    for output_prefix, paired_ends in paired_end_mapping.iteritems():
        bowtie2_logger.info('Spawning niced process for bowtie2 on: %s' % (output_prefix))
        for filename in paired_ends:
            assert(" " not in filename)
            assert(";" not in filename)  # Vague sanity testing for input filenames
            bowtie2_logger.debug(' Input: %s' % (filename))

        # bowtie2 options:
        # --end-to-end: this is the default, but let's explicitly specify it
        # --sensitive: again, the default (consider switching to --fast?)
        # --no-unal: Suppress unaligned reads from the output .sam
        # --no-discordant: These are paired-end reads. We expect them to be non-discordant.
        # --mm: mmap MAP_SHARED (other processes can use our genome, cool!)
        # --met-stderr: Write metrics to stderr
        # --time: output the time things took
        # -x: target genome
        command = "bowtie2 --end-to-end --sensitive --no-unal --no-discordant --mm --met-stderr --time " \
                  "-x %s -1 %s -2 %s 2>%s | samtools view -bS - >%s"

        shell_job_runner.run(command % (config['bowtie2_genomes'][genome],
                                        paired_ends[0],
                                        paired_ends[1],
                                        output_path + "/" + output_prefix + ".bt2.log",
                                        output_path + "/" + output_prefix + ".bt2.bam"))

    shell_job_runner.finish()


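# A hypothetical example of the paired_end_mapping argument expected by run_bowtie2()
# above (names are illustrative, not from the original source): each output prefix maps
# to its [R1, R2] fastq pair, producing <output_path>/<prefix>.bt2.bam and .bt2.log.
# The genome key is assumed to exist in config['bowtie2_genomes'].
#
#     paired_end_mapping = {
#         'sampleA': ['sampleA_R1.fastq.gz', 'sampleA_R2.fastq.gz'],
#     }
#     run_bowtie2(paired_end_mapping, genome='mm9', output_path='aligned')

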
def run_macs14(input_files, output_path, genome, disable_parallel=False):
    """
    Call peaks on each input .bam with the legacy MACS 1.4 peak caller.
    """
    macs14_log = _logshim.getLogger('run_macs14')

    macs14_log.info('Spawning MACS14 jobs...')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(macs14_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(macs14_log, delay_seconds=20)

    for filename in input_files:
        macs14_log.debug('Working on: %s' % (filename))

        # macs14 is old, but we try it anyway, since it's sometimes useful.
        # -t: input
        # -n: output name
        # -f: format
        # -g: genome
        # -p: pvalue for peak cutoff
        # --wig: save .wig outputs
        # --single-profile: make one single wiggle
        # --space=50: wiggle resolution (default: 10)
        #
        # Note: This CD hack is because MACS1.4 can't specify an output path :(
        command = "cd %s && %s -t %s -n %s -f BAM -g %s -p 1e-9 --wig --single-profile --space=50 2>%s"

        filename_without_extension = os.path.splitext(filename)[0] + '.macs14'

        # TODO: Fix this path hack. MACS14 cannot specify an output path :/
        shell_job_runner.run(command % (output_path,  # for cd hack
                                        'macs14',  # This must be pre-installed by the user. It's a big, complex package.
                                        os.getcwd() + '/' + filename,  # input file
                                        os.path.basename(filename_without_extension),
                                        genome,  # for genome size
                                        os.path.basename(filename_without_extension) + '.macs14.log'))

    shell_job_runner.finish()

    macs14_log.info('MACS14 peak calling complete.')


def run_macs2(input_files, output_path, genome, disable_parallel=False):
    """
    Call peaks on each input .bam with MACS2, using fixed shift/extension settings
    instead of MACS2's model building.
    """
    macs2_log = _logshim.getLogger('run_macs2')

    macs2_log.info('Spawning MACS2 jobs...')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(macs2_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(macs2_log, delay_seconds=0.1)

    for filename in input_files:
        macs2_log.debug('Working on: %s' % (filename))

        # --bdg: generate .bed graph output
        # --nomodel: We'll be shifting manually!
        # --extsize 200: See long discussion at: @@@
        # --shift -100: As per above.
        # --slocal: Look at a local window of 20kb to build peak models
        # --keep-dup: We already removed duplicates with samtools.
        # TODO: Consider allowing tweaks to these settings with flags?
        command = "%s callpeak -t %s -n %s --outdir %s -g %s --bdg --nomodel --extsize 200 --shift -100 --slocal 20000 --llocal 20000 --keep-dup all 2>%s"

        filename_without_extension = os.path.splitext(filename)[0] + '.macs2'

        shell_job_runner.run(command % ('macs2',  # This must be pre-installed by the user. It's a big, complex package.
                                        filename,  # input file
                                        os.path.basename(filename_without_extension),
                                        output_path,
                                        genome,  # for genome size; unclear if this actually matters with nolambda/nomodel
                                        output_path + "/" + os.path.basename(filename_without_extension) + '.log'))

    shell_job_runner.finish()

    macs2_log.info('MACS2 peak calling complete.')


def merge_and_rmdup(input_files, output_path, disable_parallel=False):
    """
    Merge multiple .bam files into one and remove duplicates from the merged result.
    """
    primary_logger = _logshim.getLogger('merge_and_rmdup')

    # Sanity checks on the input files list
    assert(len(input_files) > 1)

    # Check files are readable
    for filename in input_files:
        if not os.access(filename, os.R_OK):
            primary_logger.fatal("Unable to read input files.")
            raise IOError

    output_file_name = '-AND-'.join([os.path.basename(os.path.splitext(filename)[0]) for filename in input_files])

    # Sanity check: maximum output filename length
    max_filename_length = os.statvfs(output_path).f_namemax
    if max_filename_length < 100:
        primary_logger.fatal("Cannot deal with short filename length limit. Maybe namemax is broken?")
        raise IOError

    if (len(output_file_name) + 10) > max_filename_length:
        # Roughly truncate the filename for sanity, leaving some extra room for downstream suffixes.
        primary_logger.critical("Very long filename! Truncating!")
        output_file_name = output_file_name[:max_filename_length - 20]

    output_file_name += ".merged.bam"

    input_file_string = ' '.join(input_files)

    shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)

    primary_logger.debug('Input file string: %s' % (input_file_string))
    primary_logger.debug('Working on merge as: %s' % (output_file_name))

    # This is pretty fast and has minimal memory usage. Yay!
    # We're probably re-rmduping some files if we're merging. That's ok since this is speedy.
    command = "%s merge -u - %s | %s rmdup - %s 2>%s"

    shell_job_runner.run(command % (CONFIG['binaries']['samtools'],
                                    input_file_string,
                                    CONFIG['binaries']['samtools_legacy'],  # TODO: Update this when samtools is fixed.
                                    output_path + "/" + output_file_name,
                                    output_path + "/" + os.path.basename(os.path.splitext(output_file_name)[0]) + '-rmdup.log'))

    shell_job_runner.finish()

    primary_logger.info('Merge and rmdup complete!')