def rmdup_and_blacklist(input_files, genome, output_path, disable_parallel=False):
    primary_logger = _logshim.getLogger('rmdup_blacklist')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=20)

    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))
        # This is extremely fast and has minimal memory usage. Yay!
        # TODO: Allow adjustable windowing (-w %d) to blacklist larger/surrounding regions?
        command = "%s rmdup %s - 2>%s | %s window -abam - -b %s -v -w 0 > %s"

        shell_job_runner.run(command % (CONFIG['binaries']['samtools_legacy'],  # TODO: Update this when samtools is fixed.
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam',  # TODO: CLEAN THIS
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam.log',
                                        CONFIG['binaries']['bedtools'],
                                        os.path.dirname(os.path.realpath(__file__)) + '/' + CONFIG['blacklists'][genome],  # TODO: CLEAN THIS
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam'))

    shell_job_runner.finish()

    primary_logger.info('Removing temporary files from stage 1 ...')
    for filename in input_files:
        os.unlink(output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam')

    primary_logger.info('Completed rmdup and blacklist')
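# For orientation only: with hypothetical paths (an 'out' output directory, a sample named 'sample',
# genome 'hg19', and CONFIG pointing the binaries at plain 'samtools'/'bedtools'), the command above
# renders roughly as:
#
#   samtools rmdup out/sample.tmp.bam - 2>out/sample.srt.rmdup.bam.log \
#       | bedtools window -abam - -b <this_script_dir>/<CONFIG['blacklists']['hg19']> -v -w 0 \
#       > out/sample.srt.rmdup.bam
#
# i.e. duplicates are removed first, then any read overlapping a blacklisted region (-w 0, no extra
# window) is dropped (-v).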
def fastq_merge(merge_strategy, output_path, disable_parallel=False):
    """
    Concatenate multiple fastq files (from multiple lanes) into one.

    :param merge_strategy:
    :param output_path:
    :return:
    """
    merge_log = _logshim.getLogger('fastq_merge')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(merge_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(merge_log, delay_seconds=45)

    for merged_name, merge_inputs in merge_strategy.iteritems():
        merge_input_files = ' '.join(merge_inputs)

        merge_log.info('Spawning niced process to merge: %s' % (merged_name))
        for filename in merge_inputs:
            assert(" " not in filename)
            assert(";" not in filename)  # Vague sanity testing for input filenames
            merge_log.debug('  Input: %s' % (filename))

        # WARNING: Using shell has security implications! Don't work on untrusted input filenames.
        command = "zcat %s | gzip -1 > %s/%s.fastq.gz" % (merge_input_files, output_path, merged_name)

        shell_job_runner.run(command)

    shell_job_runner.finish()

    return True
def generate_index(input_files, output_path, disable_parallel=False):
    """
    Many peak pickers want indexed .bams. Let's build indexes! (yay!)

    :param input_files:
    :param output_path:
    :param disable_parallel:
    :return:
    """
    primary_logger = _logshim.getLogger('index')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=10)

    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))

        command = "%s index %s"

        shell_job_runner.run(command % (CONFIG['binaries']['samtools'], filename))

    shell_job_runner.finish()
def flatten_tsv(filename):
    """
    Flatten a TSV file -- parse and concatenate identical row names, by summing their values.
    """
    flatlog = _logshim.getLogger('flatten_tsv')
    flatlog.debug('Flattening input file: %s' % (filename))

    data_dict = OrderedDict()

    with open(filename, 'r') as tsv_ro_fh:
        tsv_input = csv.reader(tsv_ro_fh, delimiter=str("\t"))

        header = next(tsv_input, None)

        for row in tsv_input:
            row_key = row[0]
            these_row_values_as_int = map(int, row[1:])
            if row_key in data_dict:
                # Add the current row values to the existing values
                data_dict[row_key] = map(operator.add, data_dict[row_key], these_row_values_as_int)
            else:
                data_dict[row_key] = these_row_values_as_int

    # Write back the parsed dict
    with open(filename, 'wb') as tsv_rw_fh:
        tsv_writer = csv.writer(tsv_rw_fh, delimiter=str("\t"))
        tsv_writer.writerow(header)

        for key, val in data_dict.iteritems():
            tsv_writer.writerow([key] + val)
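# A minimal sketch of what flatten_tsv() does -- rows sharing a name are summed column-wise.
# (Region names and counts below are made up for illustration.)
#
#   region   s1   s2             region   s1   s2
#   Myc      10    3    ---->    Myc      15    4
#   Actb      7    2             Actb      7    2
#   Myc       5    1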
def run_macs2(input_files, output_path, genome, disable_parallel=False):
    macs2_log = _logshim.getLogger('run_macs2')

    macs2_log.info('Spawning MACS2 jobs...')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(macs2_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(macs2_log, delay_seconds=0.1)

    for filename in input_files:
        macs2_log.debug('Working on: %s' % (filename))

        # --bdg: generate .bed graph output
        # --nomodel: We'll be shifting manually!
        # --extsize 200: See long discussion at: @@@
        # --shift -100: As per above.
        # --slocal: Look at a local window of 20kb to build peak models
        # --keep-dup: We already removed duplicates with samtools.
        # TODO: Consider allowing tweaks to these settings with flags?
        command = "%s callpeak -t %s -n %s --outdir %s -g %s --bdg --nomodel --extsize 200 --shift -100 --slocal 20000 --llocal 20000 --keep-dup all 2>%s"

        filename_without_extension = os.path.splitext(filename)[0] + '.macs2'

        shell_job_runner.run(command % ('macs2',  # This must be pre-installed by the user. It's a big, complex package.
                                        filename,  # input file
                                        os.path.basename(filename_without_extension),
                                        output_path,
                                        genome,  # for genome size; unclear if this actually matters with nolambda/nomodel
                                        output_path + "/" + os.path.basename(filename_without_extension) + '.log'))

    shell_job_runner.finish()

    macs2_log.info('MACS2 peak calling complete.')
def large_filter_fixmate_and_sort(input_files, genome, output_path, disable_parallel=False):
    primary_logger = _logshim.getLogger('first_pass')

    output_suffix = ".tmp"

    if disable_parallel:  # Doesn't change parallelism in the final samtools sort
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=60)

    # We do a few things here:
    # - View only mapping quality >= 10
    # - Remove chrM
    # - Sort by name for fixmate
    #   We don't parallelize here (-@ #) because fixmate blocks & parallel seems to only help for compressed.
    # - Fixmate (needed for rmdup)
    # - Re-sort by position
    tempfiles = []
    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))

        command = 'export LANG=C; %s view -h -q 10 %s | grep -vF "chrM" | %s view -u -b - | ' \
                  '%s sort -l 0 -n -m %s -T %s -O bam | %s fixmate -O bam - - | %s sort -@ 8 -m %s - %s'

        # A super evil user could modify TMPDIR and make this generate evil strings. That's evil.
        temporary_file = tempfile.mkstemp('.tmp.bam')
        tempfiles.append(temporary_file)

        shell_job_runner.run(command % (CONFIG['binaries']['samtools'],
                                        filename,
                                        CONFIG['binaries']['samtools'],
                                        CONFIG['binaries']['samtools'],
                                        MAX_MEM,
                                        temporary_file[1],
                                        CONFIG['binaries']['samtools'],
                                        CONFIG['binaries']['samtools'],
                                        MAX_MEM,
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + output_suffix))

    shell_job_runner.finish()

    # Clean up our temporary files.
    primary_logger.info('Removing temporary files ...')
    for fd, fname in tempfiles:
        os.close(fd)
        os.unlink(fname)

    primary_logger.info('First large stage complete! Saved as .tmp.bam for next stage.')
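# For orientation only: with hypothetical values (input 'in/sample.bam', output directory 'out',
# MAX_MEM of '2G', and CONFIG pointing at plain 'samtools'), the pipeline above renders roughly as:
#
#   export LANG=C; samtools view -h -q 10 in/sample.bam | grep -vF "chrM" | samtools view -u -b - \
#       | samtools sort -l 0 -n -m 2G -T <mkstemp path> -O bam \
#       | samtools fixmate -O bam - - \
#       | samtools sort -@ 8 -m 2G - out/sample.tmp
#
# i.e. quality-filter, drop chrM, name-sort for fixmate, fixmate, then coordinate-sort; the final
# (legacy-style) sort writes out/sample.tmp.bam for the next stage.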
def merge_and_rmdup(input_files, output_path, disable_parallel=False):
    primary_logger = _logshim.getLogger('merge_and_rmdup')

    # Sanity checks on the input files list
    assert(len(input_files) > 1)

    # Check files are readable
    for filename in input_files:
        if not os.access(filename, os.R_OK):
            primary_logger.fatal("Unable to read input files.")
            raise IOError

    output_file_name = '-AND-'.join([os.path.basename(os.path.splitext(filename)[0]) for filename in input_files])

    # Sanity check: maximum output filename length
    max_filename_length = os.statvfs(output_path).f_namemax
    if max_filename_length < 100:
        primary_logger.fatal("Cannot deal with short filename length limit. Maybe namemax is broken?")
        raise IOError
    if (len(output_file_name) + 10) > max_filename_length:  # roughly truncate filename for sanity.
        primary_logger.critical("Very long filename! Truncating!")
        output_file_name = output_file_name[:-20]  # Give us some extra room for downstream stuff?

    output_file_name += ".merged.bam"

    input_file_string = ' '.join(input_files)

    shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)

    primary_logger.debug('Input file string: %s' % (input_file_string))
    primary_logger.debug('Working on merge as: %s' % (output_file_name))

    # This is pretty fast and has minimal memory usage. Yay!
    # We're probably re-rmduping some files if we're merging. That's ok since this is speedy.
    command = "%s merge -u - %s | %s rmdup - %s 2>%s"

    shell_job_runner.run(command % (CONFIG['binaries']['samtools'],
                                    input_file_string,
                                    CONFIG['binaries']['samtools_legacy'],  # TODO: Update this when samtools is fixed.
                                    output_path + "/" + output_file_name,
                                    output_path + "/" + os.path.basename(os.path.splitext(output_file_name)[0]) + '-rmdup.log'))

    shell_job_runner.finish()

    primary_logger.info('Merge and rmdup complete!')
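# For orientation only: merging two hypothetical inputs a.srt.rmdup.bam and b.srt.rmdup.bam produces
# 'a.srt.rmdup-AND-b.srt.rmdup.merged.bam' in output_path, with rmdup's stderr captured in the
# matching '...merged-rmdup.log'.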
def run_bowtie2(paired_end_mapping, genome, output_path, disable_parallel=False):
    bowtie2_logger = _logshim.getLogger('run_bowtie2')

    # Import the config file to get genome locations
    config = _script_helpers.get_config()

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger, delay_seconds=60)

    for output_prefix, paired_ends in paired_end_mapping.iteritems():
        bowtie2_logger.info('Spawning niced process for bowtie2 on: %s' % (output_prefix))
        for filename in paired_ends:
            assert(" " not in filename)
            assert(";" not in filename)  # Vague sanity testing for input filenames
            bowtie2_logger.debug('  Input: %s' % (filename))

        # bowtie2 options:
        # --end-to-end: this is the default, but let's explicitly specify it
        # --sensitive: again, the default (consider switching to --fast?)
        # --no-unal: Suppress unaligned reads from the output .sam
        # --no-discordant: These are paired-end reads. We expect them to be non-discordant.
        # --mm: mmap MAP_SHARED (other processes can use our genome, cool!)
        # --met-stderr: Write metrics to stderr
        # --time: output the time things took
        # -x: target genome
        command = "bowtie2 --end-to-end --sensitive --no-unal --no-discordant --mm --met-stderr --time -x %s -1 %s -2 %s 2>%s | samtools view -bS - >%s"

        shell_job_runner.run(command % (config['bowtie2_genomes'][genome],
                                        paired_ends[0],
                                        paired_ends[1],
                                        output_path + "/" + output_prefix + ".bt2.log",
                                        output_path + "/" + output_prefix + ".bt2.bam"))

    shell_job_runner.finish()
def run_macs14(input_files, output_path, genome, disable_parallel=False):
    macs14_log = _logshim.getLogger('run_macs14')

    macs14_log.info('Spawning MACS14 jobs...')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(macs14_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(macs14_log, delay_seconds=20)

    for filename in input_files:
        macs14_log.debug('Working on: %s' % (filename))

        # macs14 is old, but we try it anyway, since it's sometimes useful.
        # -t: input
        # -n: output name
        # -f: format
        # -g: genome
        # -p: pvalue for peak cutoff
        # --wig: save .wig outputs
        # --single-profile: make one single wiggle
        # --space=50: wiggle resolution (default: 10)
        #
        # Note: This cd hack is because MACS1.4 can't specify an output path :(
        command = "cd %s && %s -t %s -n %s -f BAM -g %s -p 1e-9 --wig --single-profile --space=50 2>%s"

        filename_without_extension = os.path.splitext(filename)[0] + '.macs14'

        shell_job_runner.run(command % (output_path,  # for cd hack
                                        'macs14',  # This must be pre-installed by the user. It's a big, complex package.
                                        os.getcwd() + '/' + filename,  # input file
                                        # TODO: Fix this path hack. MACS14 cannot specify an output path :/
                                        os.path.basename(filename_without_extension),
                                        genome,  # for genome size
                                        os.path.basename(filename_without_extension) + '.macs14.log'))

    shell_job_runner.finish()

    macs14_log.info('MACS14 peak calling complete.')
def parse_h5files(input_files, annotationBedTool, overwrite, flatten, density, normalized, sizescaled):
    h5logger = _logshim.getLogger('parse_h5files')

    assert(not (density and normalized))

    total_file_count = len(input_files)
    h5logger.info('Parsing a total of: %d file(s)' % (total_file_count))

    output_suffix_list = ['tsv']

    annotating_regions = False
    if annotationBedTool:
        annotating_regions = True
        output_suffix_list.append('annotated')

    if normalized:
        output_suffix_list.append('normalized')
    elif density:
        output_suffix_list.append('density')
    elif sizescaled:
        output_suffix_list.append('sizescaled')

    output_suffix = '.'.join(reversed(output_suffix_list))

    # Cache regions that we're annotating, maybe.
    region_annotation_cache = {}

    for this_file_count, file in enumerate(input_files):
        h5logger.info('\tParsing: %s (%d/%d)' % (file, this_file_count + 1, total_file_count))

        output_filename = file + '.' + output_suffix

        if not overwrite and os.path.isfile(output_filename):
            h5logger.warn('Skipping this .h5 as output .tsv already exists: %s' % (output_filename))
            continue

        # TODO: Modularize H5FD_CORE (the in-memory driver?)
        with tables.open_file(file, mode="r", driver="H5FD_CORE") as h5_object:
            assert(h5_object.title.startswith("bam liquidator genome read counts"))  # Some sanity checking
            assert(h5_object.root.file_names[0] == "*")

            bam_filename_header = h5_object.root.file_names[1:]
            bam_filename_header.insert(0, 'region')

            # Note: len(files) = len(file_names) - 1, since file_names has a 'wildcard' first entry.
            number_of_regions = int(len(h5_object.root.region_counts) / len(h5_object.root.files))

            # We expect this .h5 object's region_counts to contain:
            # /region_counts (Table(SIZE,)) 'region counts'
            #   description := {
            #     "file_key": UInt32Col(shape=(), dflt=0, pos=0),
            #     "chromosome": StringCol(itemsize=64, shape=(), dflt='', pos=1),
            #     "region_name": StringCol(itemsize=64, shape=(), dflt='', pos=2),
            #     "start": UInt64Col(shape=(), dflt=0, pos=3),
            #     "stop": UInt64Col(shape=(), dflt=0, pos=4),
            #     "strand": StringCol(itemsize=1, shape=(), dflt='', pos=5),
            #     "count": UInt64Col(shape=(), dflt=0, pos=6),
            #     "normalized_count": Float64Col(shape=(), dflt=0.0, pos=7)}
            #   byteorder := 'little'
            #   chunkshape := (NNN,)
            counts = h5_object.root.region_counts

            with open(output_filename, 'wb') as tsv_output:
                tsvwriter = csv.writer(tsv_output, delimiter=str("\t"))
                tsvwriter.writerow(bam_filename_header)

                if annotating_regions:
                    h5logger.debug('Generating .bed annotations from provided genome.')
                    region_to_gene = {}

                    # Perform one annotation rapidly for all regions in the .hdf5
                    hdf5_positions_only = []
                    for region_number in range(0, number_of_regions):
                        hdf5_positions_only.append(counts[region_number][1] + ' ' +
                                                   str(counts[region_number][3]) + ' ' +
                                                   str(counts[region_number][4]))

                    hdf5_positions_only_hashkey = ''.join(hdf5_positions_only)

                    if hdf5_positions_only_hashkey in region_annotation_cache:
                        # The genome doesn't change mid-run, so we cache only on hdf5_positions
                        region_to_gene = region_annotation_cache[hdf5_positions_only_hashkey]
                        h5logger.debug('Annotation from cache.')
                    else:
                        hdf5_stub_bed = pybedtools.BedTool('\n'.join(hdf5_positions_only), from_string=True)
                        annotated_bed = hdf5_stub_bed.closest(annotationBedTool, t='first')

                        for locus in annotated_bed:
                            region_to_gene[locus.chrom + ':' + str(locus.start) + '-' + str(locus.end)] = locus.fields[11].split('"')[1]

                        region_annotation_cache[hdf5_positions_only_hashkey] = region_to_gene
                        h5logger.debug('Annotation completed.')

                # We're going to aggressively access the hdf5 at a bunch of fixed offsets, e.g.:
                # rowarray = [counts[number_of_regions*0 + i], counts[number_of_regions*1 + i], counts[number_of_regions*2 + i], ...]
                number_of_files = len(h5_object.root.files)
                working_deque = deque(maxlen=number_of_files + 1)

                # Here, we loop over every "region"/locus (every entry in the first column of the .tsv)
                # and then (within this loop) jump to each individual "file" (the hdf5 can contain multiple
                # separate samples) to build the data for every row.
                for region_number in range(0, number_of_regions):
                    # Prefix the row with chrN:bpSTART-bpEND, e.g. chr4:100-2000
                    locus_name = counts[region_number][1] + ':' + str(counts[region_number][3]) + '-' + str(counts[region_number][4])

                    # Sanity checking, in case the input is nuts
                    feature_width = counts[region_number][4] - counts[region_number][3]
                    assert(feature_width > 0)

                    # DESeq2 requires each region have a unique name.
                    # You can either append a unique value, or aggregate identical loci.
                    # We address this later by re-opening and aggregating.
                    if annotating_regions:
                        working_deque.append(region_to_gene[locus_name])
                    else:
                        working_deque.append(locus_name)
                        #rowarray = [counts[region_number][1] + ':' + str(counts[region_number][3]) + '-' + str(counts[region_number][4])]

                    for file_number in range(0, number_of_files):
                        if normalized:
                            # Standard normalized (counts/mreads)
                            # bamliquidator gives us (counts/mreads)/width so we multiply by width
                            working_deque.append(int(counts[number_of_regions * file_number + region_number][7] * feature_width))
                        elif density:
                            # (counts/mreads)/width
                            # We upscale the fractional normalized count values by an arbitrary amount,
                            # because subsequent analyses like integers.
                            working_deque.append(int(counts[number_of_regions * file_number + region_number][7] * 10000))
                        elif sizescaled:
                            # counts/width
                            # We upscale the fractional normalized count values by an arbitrary amount,
                            # because subsequent analyses like integers.
                            working_deque.append(int(counts[number_of_regions * file_number + region_number][6] / feature_width * 100))
                        else:
                            working_deque.append(int(counts[number_of_regions * file_number + region_number][6]))

                    tsvwriter.writerow(working_deque)

        if flatten:
            flatten_tsv(output_filename)

    h5logger.info('Completed.')
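# Layout assumption behind the fixed-offset indexing above: region_counts stores every region for
# file 0, then every region for file 1, and so on. With (illustrative numbers) number_of_regions == 3
# and number_of_files == 2, the output row for region_number == 1 is built from counts[3*0 + 1]
# (file 0) and counts[3*1 + 1] (file 1).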
def find_paired_ends(input_path, verbose=False):
    """
    Given an input path, return a merge strategy mapping each sample prefix to its paired-end .fastq files.

    :param input_path:
    :return:
    """
    find_pe_logger = _logshim.getLogger('find_paired_ends')

    # TODO: Modularize all this!
    if not os.path.isdir(input_path):
        raise ValueError("Input must be a directory. You gave: %s" % (input_path))

    all_files = glob.glob(input_path + "/*.PE1.fastq.gz")  # Must have .PEX. in title
    all_files.extend(glob.glob(input_path + "/*.PE2.fastq.gz"))
    all_files.extend(glob.glob(input_path + "/*.PE1.fastq"))
    all_files.extend(glob.glob(input_path + "/*.PE2.fastq"))

    if len(all_files) == 0:
        raise ValueError("Input directory is empty!")

    # Given paired ends, we must always have an even number of input files.
    if len(all_files) % 2 != 0:
        raise ValueError("Input directory contains an odd number of files.")

    re_pattern = re.compile(r'^(.*)\.PE(\d)(\.fastq|\.fastq\.gz)$')

    file_dict = OrderedDict()
    prefixes_seen = []
    pe_seen = []

    for file in sorted(all_files):
        if not os.access(file, os.R_OK):
            raise OSError("Cannot read file: %s" % (file))

        filename_only = file.rsplit('/', 1)[-1]
        result = re.match(re_pattern, filename_only)

        file_dict[file] = {'prefix': str(result.group(1)), 'PE': int(result.group(2))}
        prefixes_seen.append(file_dict[file]['prefix'])
        pe_seen.append(file_dict[file]['PE'])

    if len(set(pe_seen)) != 2:
        raise ValueError("Saw %d paired ends, expecting exactly two. That's confusing!" % (len(set(pe_seen))))

    if pe_seen.count(1) != pe_seen.count(2):
        raise ValueError("Uneven pairing of paired ends (are you missing a file)? PE1 count: %d, PE2 count: %d" %
                         (pe_seen.count(1), pe_seen.count(2)))

    find_pe_logger.info("Files seen: %d" % (len(all_files)))
    find_pe_logger.info("Samples seen: %d" % (len(set(prefixes_seen))))

    merge_strategy = {}

    find_pe_logger.info("Sample IDs:")
    for prefix in sorted(set(prefixes_seen)):
        find_pe_logger.info("  %s" % (prefix))

    for file in file_dict.iterkeys():
        merge_strategy.setdefault(file_dict[file]['prefix'], []).append(file)

    if verbose:
        find_pe_logger.debug("Merge strategy is:")
        find_pe_logger.debug(pprint.pformat(merge_strategy))

    return merge_strategy
def fastq_map_predict(input_path, verbose=False):
    """
    Determine a sane .fastq multi-lane merge strategy.
    Fail if we can't merge correctly, if there are remaining files, etc.

    Sample file name: Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7_L001_R1_001.fastq.gz

    Args:
        input_path: An input path containing .fastq / .fastq.gz files

    Returns:
        A dict of mappings.
    """
    fastq_map_logger = _logshim.getLogger('fastq_map_predict')

    if not os.path.isdir(input_path):
        raise ValueError("Input must be a directory. You gave: %s" % (input_path))

    all_files = glob.glob(input_path + "/*_R*.fastq.gz")  # Ignore index files, must have _R in title
    all_files.extend(glob.glob(input_path + "/*_R*.fastq"))

    if len(all_files) == 0:
        raise ValueError("Input directory is empty!")

    # Given paired ends, we must always have an even number of input files.
    if len(all_files) % 2 != 0:
        raise ValueError("Input directory contains an odd number of files.")

    re_pattern = re.compile(r'^(.*)_L(\d+)_R(\d)_\d+(\.fastq|\.fastq\.gz)$')

    file_dict = OrderedDict()
    prefixes_seen = []
    lanes_seen = []
    pe_seen = []

    for file in sorted(all_files):
        if not os.access(file, os.R_OK):
            raise OSError("Cannot read file: %s" % (file))

        filename_only = file.rsplit('/', 1)[-1]
        result = re.match(re_pattern, filename_only)

        file_dict[file] = {'prefix': str(result.group(1)), 'L': int(result.group(2)), 'R': int(result.group(3))}
        prefixes_seen.append(file_dict[file]['prefix'])
        lanes_seen.append(file_dict[file]['L'])
        pe_seen.append(file_dict[file]['R'])

    # Sanity checking here. Missing files? Other oddities?
    if len(file_dict) % len(set(lanes_seen)) != 0:
        raise ValueError("Missing or extra file(s)? Saw %d lanes, and %d input files." %
                         (len(set(lanes_seen)), len(file_dict)))

    if len(set(pe_seen)) != 2:
        raise ValueError("Saw %d paired ends, expecting exactly two. That's confusing!" % (len(set(pe_seen))))

    if pe_seen.count(1) != pe_seen.count(2):
        raise ValueError("Uneven pairing of paired ends (are you missing a file)? R1 count: %d, R2 count: %d" %
                         (pe_seen.count(1), pe_seen.count(2)))

    fastq_map_logger.info("Files seen: %d" % (len(all_files)))
    fastq_map_logger.info("Samples seen: %d" % (len(set(prefixes_seen))))
    fastq_map_logger.info("Lanes seen: %d" % (len(set(lanes_seen))))

    merge_strategy = {}

    fastq_map_logger.info("Sample IDs:")
    for prefix in sorted(set(prefixes_seen)):
        fastq_map_logger.info("  %s" % (prefix))

    for file in file_dict.iterkeys():
        merge_strategy.setdefault(file_dict[file]['prefix'] + ".PE" + str(file_dict[file]['R']), []).append(file)

    if verbose:
        fastq_map_logger.debug("Merge strategy is:")
        fastq_map_logger.debug(pprint.pformat(merge_strategy))

    return merge_strategy
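# For orientation only: given the sample file name in the docstring above plus a second (hypothetical)
# lane L002, fastq_map_predict() would return something like:
#
#   {'Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7.PE1': ['<input_path>/Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7_L001_R1_001.fastq.gz',
#                                               '<input_path>/Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7_L002_R1_001.fastq.gz'],
#    'Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7.PE2': ['<input_path>/Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7_L001_R2_001.fastq.gz',
#                                               '<input_path>/Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7_L002_R2_001.fastq.gz']}
#
# fastq_merge() then concatenates each list into <output_path>/<key>.fastq.gz.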