def _trim_barcode(self, input):
    """Trim the leading barcode off every read using ``fastx_trimmer``.

    Args:
        input: Path of the FASTQ file to trim.

    Returns:
        ``input`` unchanged when ``self.config.trim_barcode`` is falsy;
        otherwise the path ``<self.tmp>/<base name of input>_trimmed.fq``.

    The external tool is only invoked when ``self.config.force_preprocess``
    is set or the trimmed file is not reported as existing by
    ``is_existing_file``; otherwise the earlier result is reused.
    """
    if not self.config.trim_barcode:
        return input

    print("Trimming barcodes from 5' end...")
    trimmed_path = path.join(self.tmp, base_file_name(input) + "_trimmed.fq")

    must_run = self.config.force_preprocess or not is_existing_file(trimmed_path)
    if must_run:
        # Reads are kept from the base right after the barcode, hence
        # barcode_len + 1 for fastx_trimmer's -f option.
        first_kept_base = str(self.config.barcode_len + 1)
        command = [
            "fastx_trimmer",
            "-f", first_kept_base,
            "-Q33",
            "-i", input,
            "-o", trimmed_path,
        ]
        # Block until the tool exits. NOTE(review): the exit status is not
        # inspected, so a failed run is not reported here.
        subprocess.Popen(command).wait()

    print("Barcodes trimmed...\n")
    return trimmed_path
def _collapse_reads(self, input):
    """Collapse identical reads into one record per distinct sequence.

    Every read in the FASTQ file ``input`` whose length does not exceed
    ``self.config.max_seq_len`` is counted by sequence. Each distinct
    sequence is then written as a two-line record::

        >{index}-{sequence length}-{number of copies}
        {sequence}

    Args:
        input: Path of the FASTQ file to collapse.

    Returns:
        The path ``<self.tmp>/<base name of input>_collapsed.fq``. The file
        is only (re)generated when ``self.config.force_preprocess`` is set
        or ``is_existing_file`` does not report it as existing.
    """
    print("Collapsing reads...")
    output = path.join(self.tmp, base_file_name(input) + "_collapsed.fq")

    if self.config.force_preprocess or not is_existing_file(output):
        total_reads = 0
        too_long_reads = 0
        reads = defaultdict(int)

        # Open through a context manager so the input handle is closed as
        # soon as parsing ends (it was previously left open).
        with open(input, "rU") as input_handle:
            for read in SeqIO.parse(input_handle, "fastq"):
                total_reads += 1
                if len(read) > self.config.max_seq_len:
                    too_long_reads += 1
                    continue
                reads[str(read.seq)] += 1

        copied_reads = 0
        unique_reads = 0
        with open(output, "w") as output_handle:
            for idx, (read, read_count) in enumerate(reads.iteritems()):
                # Header encodes: running index, sequence length, copy count.
                output_handle.write(">%i-%i-%i\n%s\n" % (idx, len(read), read_count, read))
                copied_reads += read_count
                unique_reads += 1

        print(
            "Max. Length: %i\nInput: %i reads.\nOutput: %i reads (%i "
            "unique).\ndiscarded %i too-long reads."
            % (
                self.config.max_seq_len,
                total_reads,
                copied_reads,
                unique_reads,
                too_long_reads,
            )
        )

    print("Reads collapsed!\n")
    return output
def _clip_adapter(self, input, barcode):
    """Clip adapter sequences from a FASTQ file with ``fastx_clipper``.

    The adapter passed to ``fastx_clipper -a`` is
    ``self.config.template % barcode``. Because the tool is slow on large
    files, the input is split into one chunk file per CPU core, each chunk
    is clipped in its own process, and the per-chunk results are then
    concatenated (in chunk order) into a single output file. Temporary
    chunk files are removed as they are consumed.

    Args:
        input: Path of the FASTQ file to clip.
        barcode: Value substituted into ``self.config.template`` to build
            the adapter sequence.

    Returns:
        The path ``<self.tmp>/<base name of input>_clipped.fq``. Clipping is
        skipped (and the existing file reused) unless
        ``self.config.force_preprocess`` is set or ``is_existing_file`` does
        not report the output as existing.
    """
    start_time = time.time()
    base_name = base_file_name(input)
    output = path.join(self.tmp, base_name + "_clipped.fq")

    if self.config.force_preprocess or not is_existing_file(output):
        # Problem: fastx_clipper is not multithreaded and SUPER slow on large fastq files.
        # Solution: break up fastq file into smaller chunks, process each chunk on a separate core.

        # NOTE(review): a nested function as a Process target presumably
        # relies on fork-style process creation - confirm before running on
        # platforms that spawn fresh interpreters.
        def fastx_clipper(chunk_input, chunk_output):
            """Clip one chunk file, then delete the chunk input."""
            clip_seq = subprocess.Popen(
                [
                    "fastx_clipper",
                    "-a", self.config.template % barcode,
                    "-c",
                    "-M", str(self.config.min_overlap),
                    "-l", str(self.config.min_seq_len),
                    "-v",
                    "-Q33",
                    "-i", chunk_input,
                    "-o", chunk_output,
                ]
            )
            clip_seq.wait()
            # Clean up
            os.remove(chunk_input)

        # Figure out chunk size for each core to handle
        LINES_PER_RECORD = 4
        with open(input) as input_handle:
            number_of_lines = sum(1 for _ in input_handle)
        number_of_records = number_of_lines // LINES_PER_RECORD
        number_of_cores = multiprocessing.cpu_count()
        # Clamp to 1: with fewer records than cores the quotient is 0 and the
        # modulo in the chunking loop below would raise ZeroDivisionError.
        number_of_records_per_core = max(1, number_of_records // number_of_cores)

        print("Clipping adapter sequences across {0} cores...\n".format(number_of_cores))

        # Prepare for multicore processing
        processes = []
        chunk_output_paths = []

        def start_clipping(chunk_index, chunk_input_path):
            """Launch fastx_clipper on a fully written chunk file."""
            print("Starting fastx_clipper on core {0}".format(chunk_index + 1))
            chunk_output_path = path.join(
                self.tmp, base_name + "_{0}_clipped.fq".format(chunk_index)
            )
            chunk_output_paths.append(chunk_output_path)
            process = Process(target=fastx_clipper, args=(chunk_input_path, chunk_output_path))
            process.start()
            processes.append(process)

        # Break up fastq into smaller chunks
        chunk_index = 0
        chunk_input_path = None
        chunk_input_handle = None
        with open(input) as input_handle:
            for index, line in enumerate(input_handle):
                # Lazy create chunked fastq file
                if chunk_input_handle is None:
                    chunk_input_path = path.join(
                        self.tmp, base_name + "_{0}_chunk.fq".format(chunk_index)
                    )
                    chunk_input_handle = open(chunk_input_path, "w")

                # Write line to chunk file to be processed later
                chunk_input_handle.write(line)

                # If this is the last line of the last record for a chunk,
                # start processing the file. The final chunk is never cut
                # here, so it absorbs any remainder records.
                record_index = index // LINES_PER_RECORD
                is_last_line_in_record = (index + 1) % LINES_PER_RECORD == 0
                is_last_record_in_chunk = (record_index + 1) % number_of_records_per_core == 0
                is_last_chunk = chunk_index == number_of_cores - 1
                if is_last_line_in_record and is_last_record_in_chunk and not is_last_chunk:
                    # Finish writing the chunk before the worker reads it
                    chunk_input_handle.close()
                    chunk_input_handle = None
                    start_clipping(chunk_index, chunk_input_path)
                    chunk_index += 1

        # Whatever is still open is the final (possibly larger) chunk
        if chunk_input_handle is not None:
            chunk_input_handle.close()
            start_clipping(chunk_index, chunk_input_path)

        # Report the number of processes actually started; it can be lower
        # than the core count when the input has fewer records than cores.
        print(
            "\nWaiting for {0} processes to finish, this could take a while...\n".format(
                len(processes)
            )
        )

        # Wait for all processes to finish
        for process in processes:
            process.join()

        print("Clipping complete, combining results into a single output file...\n")

        # Build back up a single combined output file
        with open(output, "w") as combined_output_handle:
            for chunk_output_path in chunk_output_paths:
                with open(chunk_output_path) as chunk_output_handle:
                    combined_output_handle.writelines(chunk_output_handle)
                # Clean up
                os.remove(chunk_output_path)
    else:
        print("Using previously-clipped adapter sequence at {0}\n".format(output))

    # Report the elapsed time exactly once, whichever branch ran
    elapsed_seconds = time.time() - start_time
    print("Adapters clipped in {0:.0f} seconds\n".format(elapsed_seconds))
    return output