def intersect_fastq(fastq_files, out_file, remove_temp_output=None):
    """ Intersect multiple fastq files with one another.

    Writes to out_file only the reads (4 lines long each) whose first line
    (the sequence identifier) is present in every input file.

    Args:
        fastq_files (list): paths to the fastq files to intersect.
        out_file (str): path to write the intersected fastq output.
        remove_temp_output (bool): when a single input file is given, move it
            (removing the original) instead of copying it.
    """
    # optimize for the common case, where we are intersecting 1 file
    if len(fastq_files) == 1:
        if remove_temp_output:
            shutil.move(fastq_files[0], out_file)
        else:
            shutil.copyfile(fastq_files[0], out_file)
    else:
        # store the number of FILES that contain each sequence; collect the
        # ids of each file into a set first so a read duplicated within a
        # single file is not counted twice (which would let a sequence reach
        # num_files without actually appearing in every file)
        sequence_count = {}
        for fname in fastq_files:
            ids_in_file = set()
            for lines in utilities.read_file_n_lines(fname, 4):
                ids_in_file.add(lines[0])
            for seq_id in ids_in_file:
                sequence_count[seq_id] = sequence_count.get(seq_id, 0) + 1
        num_files = len(fastq_files)
        with open(out_file, "w") as file_handle:
            # read through one of the files, writing out each sequence that
            # is found in all of the files
            for lines in utilities.read_file_n_lines(fastq_files[0], 4):
                if sequence_count.get(lines[0], 0) >= num_files:
                    file_handle.write("".join(lines))
def write_tagged_sequences_from_fastq(input_fastq, bmtagger_output, output_fastq, verbose):
    """ Find the sequences bmtagger has tagged as contaminates from the
    extract output file and write them to output_fastq.

    A read from input_fastq is considered tagged (a contaminate) when its
    identifier line is NOT present in bmtagger_output (the reads bmtagger
    passed through). Logs the total count, printing it as well if verbose.

    Args:
        input_fastq (str): path to the original fastq input.
        bmtagger_output (str): path to the bmtagger extract output fastq.
        output_fastq (str): path to write the tagged (contaminate) reads.
        verbose (bool): also print the summary message to stdout.
    """
    # store all of the sequences bmtagger has not tagged as contaminates
    untagged_sequences = set()
    for lines in utilities.read_file_n_lines(bmtagger_output, 4):
        untagged_sequences.add(lines[0])

    try:
        file_handle_write = open(output_fastq, "w")
    except EnvironmentError:
        sys.exit("ERROR: Unable to open file: " + output_fastq)

    tagged_sequences = 0
    # use the handle as a context manager so it is always closed (the
    # original left it open)
    with file_handle_write:
        for lines in utilities.read_file_n_lines(input_fastq, 4):
            # check if the sequence was identified by bmtagger
            if lines[0] not in untagged_sequences:
                tagged_sequences += 1
                file_handle_write.write("".join(lines))

    # log the number of sequences
    message = "Total contaminate sequences in file ( " + output_fastq + " ): " + str(
        tagged_sequences)
    logger.info(message)
    if verbose:
        print(message)
def remove_repeats_from_fastq(input_fastq, trf_output, output_fastq):
    """ Remove the sequences from TRF that contain repeats from the output files.

    Reads the TRF output to collect the identifier lines (those starting with
    "@") of reads containing repeats, then writes to output_fastq every read
    from input_fastq that is not in that set. Logs the number removed.

    Args:
        input_fastq (str): path to the fastq input to filter.
        trf_output (str): path to the TRF output listing repeat-containing reads.
        output_fastq (str): path to write the repeat-free reads.
    """
    sequences_with_repeats = set()
    try:
        with open(trf_output) as file_handle:
            for line in file_handle:
                # sequences start with "@"
                if line[0] == "@":
                    sequences_with_repeats.add(line)
    except EnvironmentError:
        # best-effort: a missing/unreadable TRF output means no reads are
        # treated as repeats (deliberate — do not turn this into a crash)
        pass

    try:
        file_handle_write = open(output_fastq, "w")
    except EnvironmentError:
        sys.exit("ERROR: Unable to open file: " + output_fastq)

    removed_sequences = 0
    # use the handle as a context manager so it is always closed (the
    # original left it open)
    with file_handle_write:
        for lines in utilities.read_file_n_lines(input_fastq, 4):
            # check if the sequence was identified by TRF
            if lines[0] in sequences_with_repeats:
                removed_sequences += 1
            else:
                file_handle_write.write("".join(lines))

    # log the number of sequences removed for repeats
    logger.info("Total number of sequences with repeats removed from file ( " +
                input_fastq + " ): " + str(removed_sequences))
def test_read_in_file_n_lines(self):
    """ Test the function that reads in a file n lines at a time """
    # Collect the sequence ids from the first merge file, stripping the
    # trailing end-line and the leading fastq "@" character from each
    sequence_ids = []
    for read in utilities.read_file_n_lines(cfg.merge_files[0], 4):
        sequence_ids.append(read[0].rstrip()[1:])

    self.assertEqual(sorted(sequence_ids), sorted(cfg.merge_files_1_sequences))
def test_intersect_fastq(self):
    """ Test the intersect_fastq function """
    # local import: the module's import block is not visible from here
    import os

    file_handle, temp_output_file = tempfile.mkstemp(prefix="kneaddata_test")
    # mkstemp returns an open OS-level file descriptor which the caller is
    # responsible for closing — the original leaked it
    os.close(file_handle)

    run.intersect_fastq(cfg.merge_files, temp_output_file)

    # read in the sequences from the fastq output file
    # Get the sequences from the file, removing end-lines and the fastq "@" character
    sequences = [
        lines[0].rstrip()[1:]
        for lines in utilities.read_file_n_lines(temp_output_file, 4)
    ]

    # remove the temp output file
    utils.remove_temp_file(temp_output_file)

    self.assertEqual(sorted(sequences), sorted(cfg.merge_files_sequences_intersect))
def run_trf(input,trf_path,trf_options,nproc,output,verbose=True):
    """ Run trf with the options provided.

    With nproc == 1 runs trf directly on the input file; otherwise splits the
    input (read 2 lines at a time, i.e. fasta records) into nproc temp files,
    runs trf on each in parallel via utilities.start_processes, merges the
    per-chunk .dat outputs into the final output file, and removes temp files.

    Args:
        input (str): path to the sequence file to scan (note: shadows builtin).
        trf_path (str): path to the trf executable.
        trf_options (str): space-separated trf options (numeric params first,
            then "-" flags — the .dat filename is rebuilt from them below).
        nproc (int): number of parallel trf processes.
        output (str): path for the merged trf output.
        verbose (bool): passed through to utilities.start_processes.
    """
    tempfile_list=[]
    datfile_list=[]
    commands=[]
    # check for one process and if so just run trf directly
    if nproc == 1:
        # command tuple layout follows utilities.start_processes:
        # [argv, name, input files, output files, final output]
        commands.append([[trf_path,input]+trf_options.split(" "),"trf",[input],[output],output])
        utilities.start_processes(commands,nproc,verbose)
    else:
        # get the total number of reads (counted as raw lines, 2 per record)
        total_lines=0
        with open(input) as file_handle:
            for line in file_handle:
                total_lines+=1
        # split the input into multiple files and run in parallel
        for i in range(int(nproc)):
            file_out, new_file = tempfile.mkstemp(prefix=os.path.basename(output)+'_'+str(i)+'_temp_trf_output',dir=os.path.dirname(output))
            os.close(file_out)
            tempfile_list.append(new_file)
            # reconstruct the .dat filename trf will produce for this chunk:
            # trf names its output <input>.<param>.<param>...dat from the
            # numeric options (the part of trf_options before the first "-").
            # NOTE(review): the trailing space in that slice yields the final
            # "." before "dat"; presumably trf's own naming includes a "."
            # right after the input name too — verify against actual trf output
            datfile_list.append(new_file+".".join(trf_options.split("-")[0].split(" "))+"dat")
        # write the input file into all temp output files
        output_file_number=0
        lines_per_file = int(total_lines/int(nproc))
        lines_written=0
        file_handle_write=None
        # distribute records (2 lines each) round the temp files; a new chunk
        # file is started as soon as the current one exceeds lines_per_file
        for read_line in utilities.read_file_n_lines(input,2):
            if not file_handle_write:
                file_handle_write = open(tempfile_list[output_file_number],"wt")
            file_handle_write.write("".join(read_line))
            lines_written+=2
            if lines_written > lines_per_file:
                file_handle_write.close()
                lines_written=0
                output_file_number+=1
                # NOTE(review): if the threshold trips on the very last record,
                # this indexes tempfile_list[nproc] -> IndexError; also an empty
                # input leaves file_handle_write as None so close() below would
                # raise — confirm inputs are non-empty and sized so this holds
                file_handle_write = open(tempfile_list[output_file_number],"wt")
        file_handle_write.close()
        # run commands (one trf invocation per temp chunk)
        for i, temp_in, temp_out in zip(range(len(tempfile_list)), tempfile_list, datfile_list):
            trf_command=[trf_path,temp_in]+trf_options.split(" ")
            commands.append([trf_command,"trf{}".format(i),[temp_in],[temp_out],temp_out])
        utilities.start_processes(commands,nproc,verbose)
        # merge all of the outputs to the final output file
        with open(output,"w") as file_write:
            for datfile in datfile_list:
                with open(datfile) as file_read:
                    for line in file_read:
                        file_write.write(line)
        # remove temp files (best-effort; failure is only reported)
        for filename in tempfile_list+datfile_list:
            try:
                os.remove(filename)
            except EnvironmentError:
                print("Unable to remove temp file: " + filename)