Code example #1
def main():
    # parse the command line arguments
    args = parse_arguments(sys.argv)

    # if no sam output is provided or it is set to /dev/null, write to a temp file
    output_dir = os.path.dirname(args.un_pair)
    temp_files = []
    if args.sam is None or args.sam == os.devnull:
        file_out, args.sam = tempfile.mkstemp(suffix="_temp.sam",
                                              prefix="kneaddata_",
                                              dir=output_dir)
        os.close(file_out)
        temp_files.append(args.sam)

    # run bowtie2
    run_bowtie2(args.bowtie2, args.pair1, args.pair2, args.orphan, args.index,
                args.sam, args.threads, args.bowtie2_options, args.reorder)

    # write output files
    process_alignments(args.sam, args.al_pair, args.un_pair, args.al_single,
                       args.un_single, args.mode)

    # remove the temp files
    for file in temp_files:
        utilities.remove_file(file)
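A note on the temp-file handling above: tempfile.mkstemp returns an open OS-level file descriptor together with the path, which is why the descriptor is closed immediately and only the path is reused. A minimal standalone sketch of that pattern (the function name and file layout here are illustrative, not part of kneaddata):

import os
import tempfile


def make_placeholder_sam(output_dir):
    # mkstemp returns (os-level file descriptor, absolute path); close the
    # descriptor right away and keep only the path for bowtie2 to write to later
    fd, sam_path = tempfile.mkstemp(suffix="_temp.sam", prefix="kneaddata_",
                                    dir=output_dir)
    os.close(fd)
    return sam_path


if __name__ == "__main__":
    path = make_placeholder_sam(".")
    print("placeholder SAM file:", path)
    os.remove(path)  # clean up, as utilities.remove_file does in the example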
Code example #2
def main():
    # parse the command line arguments
    args = parse_arguments(sys.argv)
    temp_files = []

    ch = logging.StreamHandler()
    ch.setFormatter(logging.Formatter('%(asctime)s kneaddata_bowtie2_discordant_pairs - %(message)s'))
    logger.addHandler(ch)
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    logger.debug("info")
    pair1_sam = temp_sam_path(args, "pair1")
    temp_files.append(pair1_sam)
    command = bowtie2_unpaired_command(args, input_fastq_path=args.pair1,
                                       output_sam_path=pair1_sam,
                                       maintain_input_ordering=True)
    logger.debug("Aligning pair1 reads: " + " ".join(command))
    run_command(command)

    pair2_sam = temp_sam_path(args, "pair2")
    temp_files.append(pair2_sam)
    command = bowtie2_unpaired_command(args, input_fastq_path=args.pair2,
                                       output_sam_path=pair2_sam,
                                       maintain_input_ordering=True)
    logger.debug("Aligning pair2 reads: " + " ".join(command))
    run_command(command)

    orphan_sam = None
    if args.orphan:
        orphan_sam = temp_sam_path(args, "orphan")
        temp_files.append(orphan_sam)
        command = bowtie2_unpaired_command(args, input_fastq_path=args.orphan,
                                           output_sam_path=orphan_sam,
                                           maintain_input_ordering=args.reorder)
        logger.debug("Aligning orphan reads: " + " ".join(command))
        run_command(command)

    logger.debug("Processing the alignments")
    process_alignments(pair1_sam, pair2_sam, orphan_sam,
      aligned_pair = args.al_pair,
      unaligned_pair = args.un_pair,
      aligned_orphan = args.al_single,
      unaligned_orphan = args.un_single,
      treat_pair_as_aligned_if_either_read_aligned = (args.mode == "strict")
    )

    if args.sam and args.sam != os.devnull:
        logger.debug("Aggregating .sam output to " + args.sam)
        command = ["cat"]
        command.extend(temp_files)
        with open(args.sam, 'w') as fh: 
            run_command(command, stdout = fh)

    logger.debug("Removing temporary files")
    for file in temp_files:
        utilities.remove_file(file)
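run_command above is a helper defined elsewhere in this module; in the aggregation step it effectively streams the temporary SAM files into args.sam via cat. A rough standalone equivalent using subprocess (assuming a Unix environment where cat is available; the function name is illustrative):

import subprocess


def concatenate_sam_files(temp_sam_paths, combined_sam_path):
    # stream each temporary SAM file, in order, into the combined output file
    with open(combined_sam_path, "w") as fh:
        subprocess.run(["cat"] + list(temp_sam_paths), stdout=fh, check=True)


# example usage with hypothetical file names:
# concatenate_sam_files(["pair1_temp.sam", "pair2_temp.sam", "orphan_temp.sam"],
#                       "kneaddata_output.sam")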
Code example #3
def combine_fastq_output_files(files_to_combine, out_prefix,
                               remove_temp_output, database_names):
    """ Combines fastq output created by BMTagger/bowtie2 on multiple databases and 
    returns a list of output files. Also updates the log file with read counts for the 
    input and output files.
    """

    # print out the reads for all files
    utilities.log_read_count_for_files(
        files_to_combine, "decontaminated",
        "Total reads after removing those found in reference database",
        database_names)

    # create lists of all of the output files for pair 1 and for pair 2
    files_for_pair1 = [f[0] for f in files_to_combine]
    try:
        files_for_pair2 = [f[1] for f in files_to_combine]
    except IndexError:
        files_for_pair2 = []

    # select the output file name based on whether the outputs are paired or not
    output_file = out_prefix + "_1" + config.fastq_file_extension
    if not files_for_pair2:
        output_file = out_prefix + config.fastq_file_extension

    # create intersect file from all output files for pair 1
    intersect_fastq(files_for_pair1, output_file, remove_temp_output)
    output_files = [output_file]

    # create an intersect file from all output files for pair 2
    if files_for_pair2:
        output_file = out_prefix + "_2" + config.fastq_file_extension
        intersect_fastq(files_for_pair2, output_file, remove_temp_output)
        output_files.append(output_file)

    # Get the read counts for the newly merged files
    utilities.log_read_count_for_files(
        output_files, "final",
        "Total reads after merging results from multiple databases")

    # remove temp files if set
    if remove_temp_output:
        for group in [files_for_pair1, files_for_pair2]:
            for filename in group:
                utilities.remove_file(filename)

    return output_files
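intersect_fastq is a kneaddata utility; conceptually, a read survives decontamination only if it appears in the output for every reference database. A simplified, standalone illustration of that intersection by read identifier (not kneaddata's actual implementation):

def read_ids(fastq_path):
    """Collect the read identifiers (header lines) from a fastq file."""
    ids = set()
    with open(fastq_path) as fh:
        for line_number, line in enumerate(fh):
            if line_number % 4 == 0:  # every record starts with an @header line
                ids.add(line.rstrip("\n").split()[0])
    return ids


def surviving_read_ids(per_database_fastq_paths):
    """Return the ids present in every per-database output, i.e. the reads
    that were not flagged as contaminants by any reference database."""
    surviving = None
    for path in per_database_fastq_paths:
        ids = read_ids(path)
        surviving = ids if surviving is None else surviving & ids
    return surviving or set()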
Code example #4
File: knead_data.py  Project: biobakery/kneaddata
def main():
    # Parse the arguments from the user
    args = parse_arguments(sys.argv)

    # Update the configuration
    args = update_configuration(args)

    # set the prefix for the output files
    full_path_output_prefix = os.path.join(args.output_dir, args.output_prefix)

    # Start logging
    setup_logging(args)

    temp_output_files = []
    # keep a copy of the original input files for the fastqc report step
    original_input_files = args.input[:]
    # Check for compressed files, bam files, or sam files
    for index in range(len(args.input)):
        args.input[index] = utilities.get_decompressed_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
        args.input[index] = utilities.get_sam_from_bam_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
        args.input[index] = utilities.get_fastq_from_sam_file(
            args.input[index], args.output_dir, temp_output_files, args.input)

    # Get the format of the first input file
    file_format = utilities.get_file_format(args.input[0])

    if file_format != "fastq":
        message = "Your input file is of type: " + file_format + ". Please provide an input file of fastq format."
        logger.critical(message)
        sys.exit(message)

    # if this is the new illumina identifier format, create temp files after reformatting the headers
    for index in range(len(args.input)):
        args.input[index] = utilities.get_reformatted_identifiers(
            args.input[index], index, args.output_dir, temp_output_files,
            args.input)

    # check for reads that are not ordered and order if needed (if trimmomatic is run)
    if not args.bypass_trim and len(args.input) == 2:
        args.input = utilities.check_and_reorder_reads(args.input,
                                                       args.output_dir,
                                                       temp_output_files)

    # remove any temp files from decompress/reformat that are no longer needed
    utilities.update_temp_output_files(temp_output_files, [], args.input)

    # set trimmomatic options
    # this is done after the decompression and conversions from sam/bam
    # as the default requires the read length from the input sequences
    if args.trimmomatic_options:
        # parse the options from the user into an array of options
        args.trimmomatic_options = utilities.format_options_to_list(
            args.trimmomatic_options)
    else:
        # if trimmomatic options not set by user, then set to default options
        # use read length of input file for minlen
        args.trimmomatic_options = utilities.get_default_trimmomatic_options(
            utilities.get_read_length_fastq(args.input[0]),
            path=config.trimmomatic_adapter_folder,
            type="PE" if len(args.input) == 2 else "SE",
            sequencer_source=args.sequencer_source)

    # Get the number of reads initially
    utilities.log_read_count_for_files(args.input, "raw",
                                       "Initial number of reads", args.verbose)

    # Run fastqc if set to run at start of workflow
    if args.fastqc_start or args.run_trim_repetitive:
        run.fastqc(args.fastqc_path, args.output_dir, original_input_files,
                   args.threads, args.verbose)
        # set the paths to the fastqc_data.txt files in the fastqc output
        output_txt_files = []
        for input_file_name in original_input_files:
            temp_file = os.path.splitext(input_file_name)[0]
            if (temp_file.count('fastq') > 0 or temp_file.count('fq') > 0):
                temp_file = os.path.splitext(temp_file)[0]
            output_txt_files.append(args.output_dir + "/fastqc/" +
                                    temp_file.split('/')[-1] +
                                    "_fastqc/fastqc_data.txt")

    if not args.bypass_trim:
        if args.run_trim_repetitive:
            # Get the Min Overrepresented Seq Length
            args.trimmomatic_options = utilities.get_updated_trimmomatic_parameters(
                output_txt_files, args.output_dir, args.trimmomatic_options)

        trimmomatic_output_files = run.trim(
            args.input, full_path_output_prefix, args.trimmomatic_path,
            args.trimmomatic_quality_scores, args.max_memory,
            args.trimmomatic_options, args.threads, args.verbose)

    else:
        message = "Bypass trimming"
        logger.info(message)
        print(message)
        trimmomatic_output_files = [args.input]

    # Get the number of reads after trimming
    utilities.log_read_count_for_files(trimmomatic_output_files, "trimmed",
                                       "Total reads after trimming",
                                       args.verbose)

    # run TRF, if set
    if not args.bypass_trf:
        # run trf on all output files
        trf_output_files = run.tandem(
            trimmomatic_output_files, full_path_output_prefix, args.match,
            args.mismatch, args.delta, args.pm, args.pi, args.minscore,
            args.maxperiod, args.trf_path, args.processes, args.verbose,
            args.remove_temp_output, args.threads)
        # remove the trimmed output files, if intermediate output files should be removed
        if args.reference_db and args.remove_intermediate_output:
            temp_output_files += utilities.resolve_sublists(
                trimmomatic_output_files)
    else:
        trf_output_files = trimmomatic_output_files
    # If a reference database is not provided, then bypass decontamination step
    if not args.reference_db:
        message = "Bypass decontamination"
        logger.info(message)
        print(message)
        # pass the trf outputs through as the final outputs (sub-lists are resolved below)
        final_output_files = trf_output_files
    else:
        final_output_files = run.decontaminate(args, full_path_output_prefix,
                                               trf_output_files)
        # remove the trimmed/trf output files, if set to remove intermediate output
        if not args.bypass_trim and args.remove_intermediate_output:
            temp_output_files += utilities.resolve_sublists(trf_output_files)

    # If set, concat the final output files if there is more than one
    final_output_files = utilities.resolve_sublists(final_output_files)
    if args.cat_final_output and len(final_output_files) > 1:
        cat_output_file = full_path_output_prefix + config.fastq_file_extension
        utilities.cat_files(final_output_files, cat_output_file)

        # if removing intermediate output, then remove the files that were merged
        if args.remove_intermediate_output:
            temp_output_files += final_output_files
            final_output_files = [cat_output_file]
        else:
            final_output_files.append(cat_output_file)

    # Remove any temp output files, if set
    if not args.store_temp_output:
        for file in temp_output_files:
            utilities.remove_file(file)

    # Run fastqc if set to run at end of workflow
    if args.fastqc_end:
        run.fastqc(args.fastqc_path, args.output_dir, final_output_files,
                   args.threads, args.verbose)

    if len(final_output_files) > 1:
        message = "\nFinal output files created: \n"
    else:
        message = "\nFinal output file created: \n"

    message = message + "\n".join(final_output_files) + "\n"
    logger.info(message)
    print(message)
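The FastQC report path construction in the fastqc_start block above (strip one extension such as ".gz", strip a remaining ".fastq"/".fq" extension, then point at fastqc_data.txt inside the per-file report folder) is easier to follow pulled out on its own. An illustrative standalone version of the same logic (the helper name is hypothetical):

import os


def fastqc_data_txt_path(input_file_name, output_dir):
    # strip the outer extension (e.g. ".gz"); if the remaining name still has a
    # fastq-style extension, strip that as well, mirroring the checks above
    base = os.path.splitext(input_file_name)[0]
    if "fastq" in base or "fq" in base:
        base = os.path.splitext(base)[0]
    # FastQC writes one "<name>_fastqc" report folder per input file
    return os.path.join(output_dir, "fastqc",
                        os.path.basename(base) + "_fastqc", "fastqc_data.txt")


# e.g. fastqc_data_txt_path("/data/sample1.fastq.gz", "/out")
#   -> "/out/fastqc/sample1_fastqc/fastqc_data.txt"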
Code example #5
def tandem(input_files, output_prefix, match, mismatch, delta, pm, pi,
           minscore, maxperiod, trf_path, processors, verbose,
           remove_temp_output, threads):
    """ Run TRF on all input files """

    # Convert all arguments to strings
    trf_args = list(
        map(str, [match, mismatch, delta, pm, pi, minscore, maxperiod]))

    output_files = []
    pairs = False
    unmatched = 1
    output_prefix += ".repeats.removed"
    for input_fastq_files in input_files:
        # Get the names for the output files
        if len(input_fastq_files) > 1:
            pairs = True
            output_fastq_files = [
                output_prefix + "." + str(i) + config.fastq_file_extension
                for i in range(1,
                               len(input_fastq_files) + 1)
            ]
        elif pairs:
            output_fastq_files = [
                output_prefix + ".unmatched." + str(unmatched) +
                config.fastq_file_extension
            ]
            unmatched += 1
        else:
            output_fastq_files = [output_prefix + config.fastq_file_extension]

        commands = []
        temp_fasta_files = []
        trf_output_files = []
        for input_fastq in input_fastq_files:
            # create a temp fasta file from the fastq file
            input_fasta = input_fastq.replace(
                os.path.splitext(input_fastq)[-1], config.fasta_file_extension)
            utilities.fastq_to_fasta(input_fastq, input_fasta)
            temp_fasta_files.append(input_fasta)

            trf_output_file = input_fasta + ".trf.parameters." + ".".join(
                trf_args) + ".dat"
            trf_output_files.append(trf_output_file)

            # suppress html output and write reduced data file to standard output
            trf_command = [
                "kneaddata_trf_parallel", "--input", input_fasta, "--output",
                trf_output_file, "--trf-path", trf_path, "--trf-options",
                "'" + " ".join(trf_args + ["-h", "-ngs"]) + "'", "--nproc",
                str(threads)
            ]

            # only run trf if the fasta file is not empty
            if os.path.getsize(input_fasta) > 0:
                commands.append([
                    trf_command, "trf", [input_fasta], [trf_output_file],
                    trf_output_file
                ])

        # run the trf commands with the number of processes specified
        utilities.start_processes(commands, processors, verbose)

        # remove all fasta files when complete
        for file in temp_fasta_files:
            utilities.remove_file(file)

        # use the trf output to print the final fastq output files
        for i in range(len(input_fastq_files)):
            remove_repeats_from_fastq(input_fastq_files[i],
                                      trf_output_files[i],
                                      output_fastq_files[i])

        # remove trf output if remove temp output is set
        if remove_temp_output:
            for file in trf_output_files:
                utilities.remove_file(file)

        # group the output files into sets for the downstream alignment steps
        if pairs:
            if (len(output_fastq_files) == 2):
                output_files.append(
                    [output_fastq_files[0], output_fastq_files[1]])
            else:
                output_files.append([output_fastq_files[0]])
        else:
            output_files.append([output_fastq_files[0]])

    return output_files
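TRF takes fasta input, which is why each fastq file is first converted with utilities.fastq_to_fasta before the TRF commands are built. A simplified illustration of that conversion (kneaddata's own implementation may differ):

def fastq_to_fasta(fastq_path, fasta_path):
    """Write a fasta file holding the id and sequence lines of a fastq file."""
    with open(fastq_path) as fastq, open(fasta_path, "w") as fasta:
        for line_number, line in enumerate(fastq):
            position = line_number % 4
            if position == 0:
                # "@read_id ..." fastq header becomes a ">read_id ..." fasta header
                fasta.write(">" + line[1:])
            elif position == 1:
                # the sequence line is copied through unchanged
                fasta.write(line)


# example usage with hypothetical file names:
# fastq_to_fasta("sample.trimmed.fastq", "sample.trimmed.fasta")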
Code example #6
def trim(infiles, outfiles_prefix, trimmomatic_path, quality_scores,
         java_memory, additional_options, threads, verbose):
    """ Creates and runs trimmomatic commands based on input files and options. 
    Returns a list of the output files.
    """

    command = ["java", "-Xmx" + java_memory, "-jar", trimmomatic_path]

    # determine if paired end input files
    paired_end = False
    if len(infiles) == 2:
        paired_end = True

    if paired_end:
        # set options for paired end input files
        mode = "PE"
        outfiles = [
            outfiles_prefix + config.trimomatic_pe_endings[0],
            outfiles_prefix + config.trimomatic_pe_endings[2],
            outfiles_prefix + config.trimomatic_pe_endings[1],
            outfiles_prefix + config.trimomatic_pe_endings[3]
        ]
    else:
        # set options for single input file
        mode = "SE"
        outfiles = [outfiles_prefix + config.trimomatic_se_ending]

    # add positional arguments to command
    command += [mode, "-threads",
                str(threads), quality_scores] + infiles + outfiles
    # add optional arguments to command
    command += additional_options

    # run trimmomatic command
    utilities.run_command(command,
                          "Trimmomatic",
                          infiles,
                          outfiles,
                          None,
                          verbose,
                          exit_on_error=True)

    # now check all of the output files to find which are non-empty and return as
    # sets for running the alignment steps
    nonempty_outfiles = []
    outfile_size = [utilities.file_size(file) for file in outfiles]
    if paired_end:
        # if paired fastq files remain after trimming, preserve pairing
        if outfile_size[0] > 0 and outfile_size[2] > 0:
            nonempty_outfiles.append([outfiles[0], outfiles[2]])
        elif outfile_size[0] > 0:
            nonempty_outfiles.append([outfiles[0]])
            # remove the second paired file if empty
            utilities.remove_file(outfiles[2])
        elif outfile_size[2] > 0:
            nonempty_outfiles.append([outfiles[2]])
            # remove the first paired file since it is empty
            utilities.remove_file(outfiles[0])

        # add sequences without pairs, if present
        if outfile_size[1] > 0:
            nonempty_outfiles.append([outfiles[1]])
        else:
            # remove the file if empty
            utilities.remove_file(outfiles[1])

        if outfile_size[3] > 0:
            nonempty_outfiles.append([outfiles[3]])
        else:
            # remove the file if empty
            utilities.remove_file(outfiles[3])

    else:
        if outfile_size[0] > 0:
            nonempty_outfiles = [[outfiles[0]]]
        else:
            # remove the file if empty
            utilities.remove_file(outfiles[0])

    if not nonempty_outfiles:
        sys.exit("ERROR: Trimmomatic created empty output files.")

    return nonempty_outfiles
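For reference, the paired-end branch assembles a standard Trimmomatic PE invocation, where the four output paths must be given in the order forward-paired, forward-unpaired, reverse-paired, reverse-unpaired, which is presumably why the example interleaves the indices of config.trimomatic_pe_endings. A sketch of the resulting command with made-up placeholder paths, endings, and options:

java_memory = "500m"                                      # placeholder value
trimmomatic_path = "/opt/trimmomatic/trimmomatic.jar"     # placeholder path
quality_scores = "-phred33"
threads = 4
infiles = ["sample_R1.fastq", "sample_R2.fastq"]          # placeholder inputs
outfiles = ["sample.trimmed.1.fastq",         # forward reads that kept their mate
            "sample.trimmed.single.1.fastq",  # forward reads whose mate was dropped
            "sample.trimmed.2.fastq",         # reverse reads that kept their mate
            "sample.trimmed.single.2.fastq"]  # reverse reads whose mate was dropped
additional_options = ["SLIDINGWINDOW:4:20", "MINLEN:60"]  # placeholder options

command = (["java", "-Xmx" + java_memory, "-jar", trimmomatic_path,
            "PE", "-threads", str(threads), quality_scores]
           + infiles + outfiles + additional_options)
print(" ".join(command))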