def main():
    """Run bowtie2 on the input reads and split the reads by alignment.

    When no SAM output is requested (``args.sam`` is ``None`` or the null
    device), the alignments are written to a temporary file created in the
    directory of ``args.un_pair``; that file is removed before returning,
    including when bowtie2 or the alignment processing fails.
    """
    # parse the command line arguments
    args = parse_arguments(sys.argv)

    # if no sam output is provided or it is set to dev/null, write to temp file
    output_dir = os.path.dirname(args.un_pair)
    temp_files = []
    if args.sam is None or args.sam == os.devnull:
        # mkstemp's positional order is (suffix, prefix, dir); use keywords so
        # the file is named kneaddata_<random>_temp.sam as intended
        file_out, args.sam = tempfile.mkstemp(
            suffix="_temp.sam", prefix="kneaddata_", dir=output_dir)
        os.close(file_out)
        temp_files.append(args.sam)

    try:
        # run bowtie2
        run_bowtie2(args.bowtie2, args.pair1, args.pair2, args.orphan,
                    args.index, args.sam, args.threads, args.bowtie2_options,
                    args.reorder)

        # write output files
        process_alignments(args.sam, args.al_pair, args.un_pair,
                           args.al_single, args.un_single, args.mode)
    finally:
        # remove the temp files even if one of the steps above failed
        for temp_file in temp_files:
            utilities.remove_file(temp_file)
def main():
    """Align each mate (and any orphan reads) separately, then split reads.

    pair1 and pair2 are each aligned with the unpaired bowtie2 command into
    their own temporary SAM file; orphan reads are aligned the same way when
    ``args.orphan`` is set. The alignments are handed to
    ``process_alignments`` and, if ``args.sam`` names a real file, the
    temporary SAM files are concatenated into it. Temporary SAM files are
    removed before returning, including when a step fails.
    """
    # parse the command line arguments
    args = parse_arguments(sys.argv)
    temp_files = []

    # log to the console, at debug level when running verbose
    ch = logging.StreamHandler()
    ch.setFormatter(logging.Formatter('%(asctime)s kneaddata_bowtie2_discordant_pairs - %(message)s'))
    logger.addHandler(ch)
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    logger.debug("info")

    try:
        # pair inputs are always aligned with input ordering maintained
        pair1_sam = temp_sam_path(args, "pair1")
        temp_files.append(pair1_sam)
        command = bowtie2_unpaired_command(args,
                                           input_fastq_path=args.pair1,
                                           output_sam_path=pair1_sam,
                                           maintain_input_ordering=True)
        logger.debug("Aligning pair1 reads: " + " ".join(command))
        run_command(command)

        pair2_sam = temp_sam_path(args, "pair2")
        temp_files.append(pair2_sam)
        command = bowtie2_unpaired_command(args,
                                           input_fastq_path=args.pair2,
                                           output_sam_path=pair2_sam,
                                           maintain_input_ordering=True)
        logger.debug("Aligning pair2 reads: " + " ".join(command))
        run_command(command)

        # orphan reads are optional and only reordered on request
        orphan_sam = None
        if args.orphan:
            orphan_sam = temp_sam_path(args, "orphan")
            temp_files.append(orphan_sam)
            command = bowtie2_unpaired_command(args,
                                               input_fastq_path=args.orphan,
                                               output_sam_path=orphan_sam,
                                               maintain_input_ordering=args.reorder)
            logger.debug("Aligning orphan reads: " + " ".join(command))
            run_command(command)

        logger.debug("Processing the alignments")
        process_alignments(pair1_sam, pair2_sam, orphan_sam,
                           aligned_pair=args.al_pair,
                           unaligned_pair=args.un_pair,
                           aligned_orphan=args.al_single,
                           unaligned_orphan=args.un_single,
                           treat_pair_as_aligned_if_either_read_aligned=(args.mode == "strict"))

        if args.sam and args.sam != os.devnull:
            logger.debug("Aggregating .sam output to " + args.sam)
            command = ["cat"]
            command.extend(temp_files)
            with open(args.sam, 'w') as fh:
                run_command(command, stdout=fh)
    finally:
        # clean up even when an alignment step failed; a path may have been
        # registered before its file was actually written, so check first
        logger.debug("Removing temporary files")
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                utilities.remove_file(temp_file)
def combine_fastq_output_files(files_to_combine, out_prefix,
                               remove_temp_output, database_names):
    """
    Merge the per-database fastq outputs into one file per mate.

    Each entry of files_to_combine is indexed as [pair1_file] or
    [pair1_file, pair2_file]. The pair 1 files are intersected into a single
    output file and, when every entry also has a pair 2 file, those are
    intersected into a second one. Read counts are logged for the inputs and
    for the merged outputs. The per-database files are removed afterwards if
    remove_temp_output is set.

    Returns the list of merged output files (one or two paths).
    """

    # log the read counts of every per-database file
    utilities.log_read_count_for_files(
        files_to_combine, "decontaminated",
        "Total reads after removing those found in reference database",
        database_names)

    # split the inputs by mate; if any entry has no second file the whole
    # set is treated as unpaired
    pair1_inputs = [entry[0] for entry in files_to_combine]
    pair2_inputs = []
    try:
        for entry in files_to_combine:
            pair2_inputs.append(entry[1])
    except IndexError:
        pair2_inputs = []

    # the "_1" suffix is only used when there is a matching "_2" file
    if pair2_inputs:
        merged_pair1 = out_prefix + "_1" + config.fastq_file_extension
    else:
        merged_pair1 = out_prefix + config.fastq_file_extension

    # intersect the pair 1 files
    intersect_fastq(pair1_inputs, merged_pair1, remove_temp_output)
    output_files = [merged_pair1]

    # intersect the pair 2 files, if present
    if pair2_inputs:
        merged_pair2 = out_prefix + "_2" + config.fastq_file_extension
        intersect_fastq(pair2_inputs, merged_pair2, remove_temp_output)
        output_files.append(merged_pair2)

    # log the read counts of the merged files
    utilities.log_read_count_for_files(
        output_files, "final",
        "Total reads after merging results from multiple databases")

    # remove the per-database files if set
    if remove_temp_output:
        for filename in pair1_inputs + pair2_inputs:
            utilities.remove_file(filename)

    return output_files
def main():
    """Run the full workflow: prepare inputs, trim, TRF, decontaminate.

    Steps, in order: parse arguments and update the configuration; convert
    compressed/bam/sam inputs and reformat identifiers (tracking the temp
    files created); optionally run fastqc on the inputs; trim (unless
    bypassed); remove tandem repeats (unless bypassed); decontaminate against
    the reference databases (when provided); optionally concatenate the final
    outputs; remove temp files; optionally run fastqc on the final outputs.
    Exits with a message if the (converted) first input is not fastq.
    """
    # Parse the arguments from the user
    args = parse_arguments(sys.argv)

    # Update the configuration
    args = update_configuration(args)

    # set the prefix for the output files
    full_path_output_prefix = os.path.join(args.output_dir,
                                           args.output_prefix)

    # Start logging
    setup_logging(args)

    temp_output_files = []

    # Keep the input paths as provided by the user: args.input is rewritten
    # in place below, and the fastqc-at-start step works from the original
    # names. This name was previously read without being assigned here.
    # NOTE(review): assumes fastqc should run on the files exactly as given
    # (the report-path logic below strips a trailing extension such as a
    # compression suffix from these names) -- confirm.
    original_input_files = list(args.input)

    # Check for compressed files, bam files, or sam files
    for index in range(len(args.input)):
        args.input[index] = utilities.get_decompressed_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
        args.input[index] = utilities.get_sam_from_bam_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
        args.input[index] = utilities.get_fastq_from_sam_file(
            args.input[index], args.output_dir, temp_output_files, args.input)

    # Get the format of the first input file
    file_format = utilities.get_file_format(args.input[0])

    if file_format != "fastq":
        message = "Your input file is of type: " + file_format + ". Please provide an input file of fastq format."
        logger.critical(message)
        sys.exit(message)

    # if this is the new illumina identifier format, create temp files after reformatting the headers
    for index in range(len(args.input)):
        args.input[index] = utilities.get_reformatted_identifiers(
            args.input[index], index, args.output_dir, temp_output_files,
            args.input)

    # check for reads that are not ordered and order if needed (if trimmomatic is run)
    if not args.bypass_trim and len(args.input) == 2:
        args.input = utilities.check_and_reorder_reads(
            args.input, args.output_dir, temp_output_files)

    # remove any temp files from decompress/reformat that are no longer needed
    utilities.update_temp_output_files(temp_output_files, [], args.input)

    # set trimmomatic options
    # this is done after the decompression and conversions from sam/bam
    # as the default requires the read length from the input sequences
    if args.trimmomatic_options:
        # parse the options from the user into an array of options
        args.trimmomatic_options = utilities.format_options_to_list(
            args.trimmomatic_options)
    else:
        # if trimmomatic options not set by user, then set to default options
        # use read length of input file for minlen
        args.trimmomatic_options = utilities.get_default_trimmomatic_options(
            utilities.get_read_length_fastq(args.input[0]),
            path=config.trimmomatic_adapter_folder,
            type="PE" if len(args.input) == 2 else "SE",
            sequencer_source=args.sequencer_source)

    # Get the number of reads initially
    utilities.log_read_count_for_files(args.input, "raw",
                                       "Initial number of reads",
                                       args.verbose)

    # Run fastqc if set to run at start of workflow
    if args.fastqc_start or args.run_trim_repetitive:
        run.fastqc(args.fastqc_path, args.output_dir, original_input_files,
                   args.threads, args.verbose)

        # Setting fastqc output zip and txt file path
        output_txt_files = []
        for input_file_name in original_input_files:
            temp_file = os.path.splitext(input_file_name)[0]
            # strip a second extension when a fastq/fq marker is still
            # present in the remaining name
            if (temp_file.count('fastq') > 0 or temp_file.count('fq') > 0):
                temp_file = os.path.splitext(temp_file)[0]
            output_txt_files.append(args.output_dir + "/fastqc/" +
                                    temp_file.split('/')[-1] +
                                    "_fastqc/fastqc_data.txt")

    if not args.bypass_trim:
        if args.run_trim_repetitive:
            # Get the Min Overrepresented Seq Length
            # (output_txt_files is always set here: run_trim_repetitive also
            # triggers the fastqc step above)
            args.trimmomatic_options = utilities.get_updated_trimmomatic_parameters(
                output_txt_files, args.output_dir, args.trimmomatic_options)

        trimmomatic_output_files = run.trim(
            args.input, full_path_output_prefix, args.trimmomatic_path,
            args.trimmomatic_quality_scores, args.max_memory,
            args.trimmomatic_options, args.threads, args.verbose)
    else:
        message = "Bypass trimming"
        logger.info(message)
        print(message)
        trimmomatic_output_files = [args.input]

    # Get the number of reads after trimming
    utilities.log_read_count_for_files(trimmomatic_output_files, "trimmed",
                                       "Total reads after trimming",
                                       args.verbose)

    # run TRF, if set
    if not args.bypass_trf:
        # run trf on all output files
        trf_output_files = run.tandem(
            trimmomatic_output_files, full_path_output_prefix, args.match,
            args.mismatch, args.delta, args.pm, args.pi, args.minscore,
            args.maxperiod, args.trf_path, args.processes, args.verbose,
            args.remove_temp_output, args.threads)
        # remove the alignment files, if intermediate output files should be removed
        if args.reference_db and args.remove_intermediate_output:
            temp_output_files += utilities.resolve_sublists(
                trimmomatic_output_files)
    else:
        trf_output_files = trimmomatic_output_files

    # If a reference database is not provided, then bypass decontamination step
    if not args.reference_db:
        message = "Bypass decontamination"
        logger.info(message)
        print(message)
        # resolve sub-lists if present
        final_output_files = trf_output_files
    else:
        final_output_files = run.decontaminate(args, full_path_output_prefix,
                                               trf_output_files)
        # remove trimmed output files, if set to remove intermediate output
        if not args.bypass_trim and args.remove_intermediate_output:
            temp_output_files += utilities.resolve_sublists(trf_output_files)

    # If set, concat the final output files if there is more than one
    final_output_files = utilities.resolve_sublists(final_output_files)
    if args.cat_final_output and len(final_output_files) > 1:
        cat_output_file = full_path_output_prefix + config.fastq_file_extension
        utilities.cat_files(final_output_files, cat_output_file)

        # if removing intermediate output, then remove the files that were merged
        if args.remove_intermediate_output:
            temp_output_files += final_output_files
            final_output_files = [cat_output_file]
        else:
            final_output_files.append(cat_output_file)

    # Remove any temp output files, if set
    if not args.store_temp_output:
        for file in temp_output_files:
            utilities.remove_file(file)

    # Run fastqc if set to run at end of workflow
    if args.fastqc_end:
        run.fastqc(args.fastqc_path, args.output_dir, final_output_files,
                   args.threads, args.verbose)

    if len(final_output_files) > 1:
        message = "\nFinal output files created: \n"
    else:
        message = "\nFinal output file created: \n"

    message = message + "\n".join(final_output_files) + "\n"
    logger.info(message)
    print(message)
def tandem(input_files, output_prefix, match, mismatch, delta, pm, pi,
           minscore, maxperiod, trf_path, processors, verbose,
           remove_temp_output, threads):
    """ Run TRF on all input files

    Args:
        input_files: list of file groups; each group is a list of fastq
            paths (more than one path marks the group as paired).
        output_prefix: prefix for the output files; ".repeats.removed" is
            appended to it.
        match, mismatch, delta, pm, pi, minscore, maxperiod: TRF options,
            converted to strings and passed through "--trf-options" in this
            order.
        trf_path: value passed as "--trf-path".
        processors: number of processes handed to utilities.start_processes.
        verbose: verbosity flag handed to utilities.start_processes.
        remove_temp_output: if set, the TRF output (.dat) files are removed
            once the fastq output has been written.
        threads: value passed as "--nproc" to kneaddata_trf_parallel.

    Returns:
        A list with one entry per input group: [file1, file2] for a paired
        group that produced two output files, otherwise [first_output_file].
    """

    # Convert all arguments to strings
    trf_args = list(
        map(str, [match, mismatch, delta, pm, pi, minscore, maxperiod]))

    output_files = []
    pairs = False
    unmatched = 1
    output_prefix += ".repeats.removed"
    for input_fastq_files in input_files:
        # Get the names for the output files
        if len(input_fastq_files) > 1:
            pairs = True
            output_fastq_files = [
                output_prefix + "." + str(i) + config.fastq_file_extension
                for i in range(1, len(input_fastq_files) + 1)
            ]
        elif pairs:
            # single-file groups seen after a paired group are numbered
            # as unmatched reads
            output_fastq_files = [
                output_prefix + ".unmatched." + str(unmatched) +
                config.fastq_file_extension
            ]
            unmatched += 1
        else:
            output_fastq_files = [output_prefix + config.fastq_file_extension]

        commands = []
        temp_fasta_files = []
        trf_output_files = []
        for input_fastq in input_fastq_files:
            # create a temp fasta file from the fastq file
            # swap only the trailing extension: str.replace would rewrite
            # every occurrence in the path and mangles paths that have no
            # extension at all
            input_fasta = (os.path.splitext(input_fastq)[0] +
                           config.fasta_file_extension)
            utilities.fastq_to_fasta(input_fastq, input_fasta)
            temp_fasta_files.append(input_fasta)

            trf_output_file = (input_fasta + ".trf.parameters." +
                               ".".join(trf_args) + ".dat")
            trf_output_files.append(trf_output_file)

            # suppress html output and write reduced data file to standard output
            trf_command = [
                "kneaddata_trf_parallel", "--input", input_fasta, "--output",
                trf_output_file, "--trf-path", trf_path, "--trf-options",
                "'" + " ".join(trf_args + ["-h", "-ngs"]) + "'", "--nproc",
                str(threads)
            ]

            # only run trf if the fasta file is not empty
            if os.path.getsize(input_fasta) > 0:
                commands.append([
                    trf_command, "trf", [input_fasta], [trf_output_file],
                    trf_output_file
                ])

        # run the trf commands with the number of processes specified
        utilities.start_processes(commands, processors, verbose)

        # remove all fasta files when complete
        for fasta_file in temp_fasta_files:
            utilities.remove_file(fasta_file)

        # use the trf output to print the final fastq output files
        for input_fastq, trf_output_file, output_fastq in zip(
                input_fastq_files, trf_output_files, output_fastq_files):
            remove_repeats_from_fastq(input_fastq, trf_output_file,
                                      output_fastq)

        # remove trf output if remove temp output is set
        if remove_temp_output:
            for trf_output_file in trf_output_files:
                utilities.remove_file(trf_output_file)

        # sets for running the alignment steps
        if pairs and len(output_fastq_files) == 2:
            output_files.append(
                [output_fastq_files[0], output_fastq_files[1]])
        else:
            output_files.append([output_fastq_files[0]])

    return output_files
def trim(infiles, outfiles_prefix, trimmomatic_path, quality_scores,
         java_memory, additional_options, threads, verbose):
    """
    Build and run the trimmomatic command for the input files.

    Two input files select paired-end mode ("PE", four output files); any
    other number selects single-end mode ("SE", one output file). After the
    run, empty output files are removed and the non-empty ones are returned
    as a list of file groups: a two-file group when both paired outputs have
    content, single-file groups otherwise. Exits if every output is empty.
    """

    command = ["java", "-Xmx" + java_memory, "-jar", trimmomatic_path]

    # two input files means paired end
    paired_end = len(infiles) == 2

    if paired_end:
        mode = "PE"
        # the configured endings are used in the order 0, 2, 1, 3
        outfiles = [
            outfiles_prefix + config.trimomatic_pe_endings[position]
            for position in (0, 2, 1, 3)
        ]
    else:
        mode = "SE"
        outfiles = [outfiles_prefix + config.trimomatic_se_ending]

    # positional arguments first, then the optional ones
    command.extend([mode, "-threads", str(threads), quality_scores] +
                   infiles + outfiles)
    command.extend(additional_options)

    # run trimmomatic command
    utilities.run_command(command,
                          "Trimmomatic",
                          infiles,
                          outfiles,
                          None,
                          verbose,
                          exit_on_error=True)

    # collect the non-empty outputs as sets for running the alignment steps
    sizes = [utilities.file_size(path) for path in outfiles]
    kept = []
    if paired_end:
        # outputs 0 and 2 are the two mates of the surviving pairs
        first_has_reads = sizes[0] > 0
        second_has_reads = sizes[2] > 0
        if first_has_reads and second_has_reads:
            # both mates remain after trimming, preserve pairing
            kept.append([outfiles[0], outfiles[2]])
        elif first_has_reads:
            kept.append([outfiles[0]])
            # drop the empty mate file
            utilities.remove_file(outfiles[2])
        elif second_has_reads:
            kept.append([outfiles[2]])
            # drop the empty mate file
            utilities.remove_file(outfiles[0])

        # outputs 1 and 3 hold the sequences without pairs
        for position in (1, 3):
            if sizes[position] > 0:
                kept.append([outfiles[position]])
            else:
                # remove the file if empty
                utilities.remove_file(outfiles[position])
    elif sizes[0] > 0:
        kept = [[outfiles[0]]]
    else:
        # remove the file if empty
        utilities.remove_file(outfiles[0])

    if not kept:
        sys.exit("ERROR: Trimmomatic created empty output files.")

    return kept