Beispiel #1
0
def run_bowtie2(bowtie2_path, pair1, pair2, orphans, database, sam, threads,
                options, reorder):
    """ Run bowtie2 with the options provided

    All reads are aligned as unpaired (single end) reads: the two pair files
    and the orphan file are joined into one comma-separated list for "-U".

    Args:
        bowtie2_path: bowtie2 executable to run.
        pair1: first read file of the pair (skipped if empty or None).
        pair2: second read file of the pair (skipped if empty or None).
        orphans: file of unpaired reads (skipped if empty or None).
        database: index passed to "-x".
        sam: output file passed to "-S".
        threads: value passed to "--threads".
        options: extra options, expanded with
            utilities.format_options_to_list (skipped if empty or None).
        reorder: if true, "--reorder" is added to the command.

    Returns:
        None.

    Exits the program with an error message (sys.exit) if bowtie2 cannot be
    started or returns a non-zero exit status.
    """

    # if pairs are to be run as single end, then provide them all as orphans
    # inputs that were not provided are skipped, so a missing file neither
    # breaks the join nor leaves an empty entry in the comma-separated list
    unpaired_reads = ",".join(
        [reads for reads in (pair1, pair2, orphans) if reads])

    command = [
        bowtie2_path, "--threads",
        str(threads), "-x", database, "-S", sam, "--no-head"
    ]
    if unpaired_reads:
        command += ["-U", unpaired_reads]
    if options:
        command += utilities.format_options_to_list([options])
    if reorder:
        command += ["--reorder"]

    try:
        subprocess.check_call(command)
    except (EnvironmentError, subprocess.CalledProcessError) as e:
        message = "Unable to run bowtie2: " + " ".join(command)
        # only some exceptions carry captured output, and it may be bytes
        output = getattr(e, 'output', None)
        if output:
            if isinstance(output, bytes):
                output = output.decode("utf-8", "replace")
            message += "\nError message returned from bowtie2:\n" + output
        sys.exit(message)
Beispiel #2
0
def bowtie2_unpaired_command(args, input_fastq_path, output_sam_path, maintain_input_ordering):
    """ Build the bowtie2 command to align unpaired reads

    The command reads input_fastq_path with "-U", writes output_sam_path
    with "-S" and uses args.index as the index ("-x"). The thread count is
    taken from args.threads, user options from args.bowtie2_options,
    "--quiet" is added unless args.verbose is set and "--reorder" is added
    when maintain_input_ordering is true.

    Returns the command as a list of strings; nothing is executed here.
    """

    # Executable, input/output/index, then the runtime options
    bowtie2_command = [
        args.bowtie2,
        "-U", input_fastq_path,
        "-S", output_sam_path,
        "-x", args.index,
        "--threads", str(args.threads),
    ]

    # Options supplied by the user
    if args.bowtie2_options:
        bowtie2_command.extend(
            utilities.format_options_to_list([args.bowtie2_options]))

    # Silence bowtie2 unless running verbose
    if not args.verbose:
        bowtie2_command.append("--quiet")

    # Keep the output in the same order as the input fastq, if required
    if maintain_input_ordering:
        bowtie2_command.append("--reorder")

    return bowtie2_command
Beispiel #3
0
def main():
    """ Run the full workflow from the command line arguments

    Steps, in order: parse and update the arguments, start logging, convert
    the input files to fastq (decompress, bam to sam, sam to fastq), check
    the format, reformat identifiers, optionally run fastqc on the original
    input, trim (unless bypassed), run TRF (unless bypassed), decontaminate
    (if a reference database is set), optionally concatenate the final
    files, remove temp files, optionally run fastqc on the final files and
    report the final output files.

    Exits with an error message if the first input file is not fastq after
    conversion.

    NOTE: original_input_files is not defined in this function; it is a
    name from the enclosing module that update_configuration appends the
    input paths to.
    """
    # Parse the arguments from the user
    args = parse_arguments(sys.argv)

    # Update the configuration
    args = update_configuration(args)

    # set the prefix for the output files
    full_path_output_prefix = os.path.join(args.output_dir, args.output_prefix)

    # Start logging
    setup_logging(args)

    # files created along the way that are candidates for removal at the end
    temp_output_files = []
    # Check for compressed files, bam files, or sam files
    # each helper returns the file to use from then on, which replaces the
    # entry in args.input
    for index in range(len(args.input)):
        args.input[index] = utilities.get_decompressed_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
        args.input[index] = utilities.get_sam_from_bam_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
        args.input[index] = utilities.get_fastq_from_sam_file(
            args.input[index], args.output_dir, temp_output_files, args.input)

    # Get the format of the first input file
    # (only the first file is checked)
    file_format = utilities.get_file_format(args.input[0])

    if file_format != "fastq":
        message = "Your input file is of type: " + file_format + ". Please provide an input file of fastq format."
        logger.critical(message)
        sys.exit(message)

    # if this is the new illumina identifier format, create temp files after reformatting the headers
    for index in range(len(args.input)):
        args.input[index] = utilities.get_reformatted_identifiers(
            args.input[index], index, args.output_dir, temp_output_files,
            args.input)

    # check for reads that are not ordered and order if needed (if trimmomatic is run)
    if not args.bypass_trim and len(args.input) == 2:
        args.input = utilities.check_and_reorder_reads(args.input,
                                                       args.output_dir,
                                                       temp_output_files)

    # remove any temp files from decompress/reformat that are no longer needed
    utilities.update_temp_output_files(temp_output_files, [], args.input)

    # set trimmomatic options
    # this is done after the decompression and conversions from sam/bam
    # as the default requires the read length from the input sequences
    if args.trimmomatic_options:
        # parse the options from the user into an array of options
        args.trimmomatic_options = utilities.format_options_to_list(
            args.trimmomatic_options)
    else:
        # if trimmomatic options not set by user, then set to default options
        # use read length of input file for minlen
        # two input files are treated as paired end ("PE"), otherwise
        # single end ("SE")
        args.trimmomatic_options = utilities.get_default_trimmomatic_options(
            utilities.get_read_length_fastq(args.input[0]),
            path=config.trimmomatic_adapter_folder,
            type="PE" if len(args.input) == 2 else "SE",
            sequencer_source=args.sequencer_source)

    # Get the number of reads initially
    utilities.log_read_count_for_files(args.input, "raw",
                                       "Initial number of reads", args.verbose)

    # Run fastqc if set to run at start of workflow
    # (also run when repetitive trimming is set, as it reads the fastqc
    # report paths collected below)
    if args.fastqc_start or args.run_trim_repetitive:
        run.fastqc(args.fastqc_path, args.output_dir, original_input_files,
                   args.threads, args.verbose)
        # Build the path of the fastqc_data.txt report for each original
        # input file: <output_dir>/fastqc/<basename>_fastqc/fastqc_data.txt
        output_txt_files = []
        for input_file_name in original_input_files:
            temp_file = os.path.splitext(input_file_name)[0]
            # a remaining "fastq"/"fq" means the first extension removed was
            # not the fastq one (for example a compression extension), so
            # strip one more extension
            if (temp_file.count('fastq') > 0 or temp_file.count('fq') > 0):
                temp_file = os.path.splitext(temp_file)[0]
            output_txt_files.append(args.output_dir + "/fastqc/" +
                                    temp_file.split('/')[-1] +
                                    "_fastqc/fastqc_data.txt")

    if not args.bypass_trim:
        if args.run_trim_repetitive:
            # Get the Min Overrepresented Seq Length
            # (output_txt_files is always set here, since the fastqc block
            # above also runs when run_trim_repetitive is set)
            args.trimmomatic_options = utilities.get_updated_trimmomatic_parameters(
                output_txt_files, args.output_dir, args.trimmomatic_options)

        trimmomatic_output_files = run.trim(
            args.input, full_path_output_prefix, args.trimmomatic_path,
            args.trimmomatic_quality_scores, args.max_memory,
            args.trimmomatic_options, args.threads, args.verbose)

    else:
        message = "Bypass trimming"
        logger.info(message)
        print(message)
        # keep the same shape as the trimming branch: the input list is
        # wrapped in an outer list
        trimmomatic_output_files = [args.input]

    # Get the number of reads after trimming
    utilities.log_read_count_for_files(trimmomatic_output_files, "trimmed",
                                       "Total reads after trimming",
                                       args.verbose)

    # run TRF, if set
    if not args.bypass_trf:
        # run trf on all output files
        trf_output_files = run.tandem(
            trimmomatic_output_files, full_path_output_prefix, args.match,
            args.mismatch, args.delta, args.pm, args.pi, args.minscore,
            args.maxperiod, args.trf_path, args.processes, args.verbose,
            args.remove_temp_output, args.threads)
        # mark the trimmed files for removal, if intermediate output files
        # should be removed and decontamination follows
        if args.reference_db and args.remove_intermediate_output:
            temp_output_files += utilities.resolve_sublists(
                trimmomatic_output_files)
    else:
        trf_output_files = trimmomatic_output_files
    # If a reference database is not provided, then bypass decontamination step
    if not args.reference_db:
        message = "Bypass decontamination"
        logger.info(message)
        print(message)
        # the TRF output is the final output (sub-lists are resolved below)
        final_output_files = trf_output_files
    else:
        final_output_files = run.decontaminate(args, full_path_output_prefix,
                                               trf_output_files)
        # mark the files given to decontamination for removal, if set to
        # remove intermediate output
        if not args.bypass_trim and args.remove_intermediate_output:
            temp_output_files += utilities.resolve_sublists(trf_output_files)

    # If set, concat the final output files if there is more than one
    final_output_files = utilities.resolve_sublists(final_output_files)
    if args.cat_final_output and len(final_output_files) > 1:
        cat_output_file = full_path_output_prefix + config.fastq_file_extension
        utilities.cat_files(final_output_files, cat_output_file)

        # if removing intermediate output, then remove the files that were merged
        if args.remove_intermediate_output:
            temp_output_files += final_output_files
            final_output_files = [cat_output_file]
        else:
            final_output_files.append(cat_output_file)

    # Remove any temp output files, if set
    if not args.store_temp_output:
        for file in temp_output_files:
            utilities.remove_file(file)

    # Run fastqc if set to run at end of workflow
    if args.fastqc_end:
        run.fastqc(args.fastqc_path, args.output_dir, final_output_files,
                   args.threads, args.verbose)

    # report the final output files, in the log and on the screen
    if len(final_output_files) > 1:
        message = "\nFinal output files created: \n"
    else:
        message = "\nFinal output file created: \n"

    message = message + "\n".join(final_output_files) + "\n"
    logger.info(message)
    print(message)
Beispiel #4
0
def update_configuration(args):
    """ Update the run settings based on the arguments provided

    Validates the input files, creates the output directory, resolves the
    locations of the external tools that are set to run and fills in the
    derived settings on args (input, remove_temp_output, bowtie2_options,
    discordant, trimmomatic_quality_scores, output_prefix, reference_db and
    the tool paths).

    Args:
        args: the parsed command line arguments; updated in place.

    Returns:
        The same args object, updated.

    Exits the program with an error message (sys.exit) if no input files
    or more than two input files are provided.

    Side effect: the absolute input paths are appended to the module-level
    list original_input_files.
    """

    # get the full path for the output directory
    args.output_dir = os.path.abspath(args.output_dir)

    # set if temp output should be removed
    args.remove_temp_output = not args.store_temp_output

    # if intermediate output should be removed, then also remove temp output
    if args.remove_intermediate_output:
        args.remove_temp_output = True

    # gather the input files (a pair is only used if both files are given)
    args.input = []
    if args.input1 and args.input2:
        args.input.append(os.path.abspath(args.input1))
        args.input.append(os.path.abspath(args.input2))
    if args.unpaired:
        args.input.append(os.path.abspath(args.unpaired))

    # check the number of input files first, so a run without input files
    # gets the error message instead of an IndexError
    if not args.input:
        sys.exit(
            "ERROR: Please provide --input1/--input2 or --unpaired (input) files."
        )
    if len(args.input) > 2:
        sys.exit("ERROR: Please provide at most 2 input files.")

    # check the input files are non-empty and readable
    for input_file in args.input:
        utilities.is_file_readable(input_file, exit_on_error=True)

    #Store original file paths for FASTQC
    original_input_files.extend(args.input)

    # create the output directory if needed
    utilities.create_directory(args.output_dir)

    # set bowtie2 options
    if args.bowtie2_options:
        # parse the options from the user into any array of options
        bowtie2_options = utilities.format_options_to_list(
            args.bowtie2_options)
    else:
        # if not set by user, then set to default options
        bowtie2_options = config.bowtie2_options

    # add the quality scores to the bowtie2 options
    # a new list is built so the default options in config are not modified
    args.bowtie2_options = list(bowtie2_options) + [
        config.bowtie2_flag_start + args.trimmomatic_quality_scores
    ]

    # set the mode for single end input file
    if len(args.input) == 1:
        args.decontaminate_pairs = "unpaired"

    # set the bowtie2 mode based on the pairs input
    args.discordant = args.decontaminate_pairs != "lenient"

    # update the quality score option into a flag for trimmomatic
    # (done after the bowtie2 options, which use the unprefixed value)
    args.trimmomatic_quality_scores = config.trimmomatic_flag_start + args.trimmomatic_quality_scores

    # find the location of trimmomatic, trimmomatic does not need to be executable
    if not args.bypass_trim:
        args.trimmomatic_path = utilities.find_dependency(
            args.trimmomatic_path,
            config.trimmomatic_jar,
            "trimmomatic",
            "--trimmomatic",
            bypass_permissions_check=True)

    # find the location of bmtagger, if set to run
    if args.reference_db:
        if args.bmtagger:
            args.bmtagger_path = utilities.find_dependency(
                args.bmtagger_path,
                config.bmtagger_exe,
                "bmtagger",
                "--bmtagger",
                bypass_permissions_check=False)
            # add this folder to path, so as to be able to find other dependencies like bmfilter
            utilities.add_exe_to_path(os.path.dirname(args.bmtagger_path))
        else:
            # find the location of bowtie2, if not running with bmtagger
            args.bowtie2_path = utilities.find_dependency(
                args.bowtie2_path,
                config.bowtie2_exe,
                "bowtie2",
                "--bowtie2",
                bypass_permissions_check=False)

    # find the location of trf, if set to run
    if not args.bypass_trf:
        args.trf_path = utilities.find_dependency(
            args.trf_path,
            config.trf_exe,
            "trf",
            "--trf",
            bypass_permissions_check=False)

    # if fastqc is set to be run, check if the executable can be found
    # fastqc is also run when repetitive trimming is set, so that option
    # needs the executable too
    if (args.fastqc_start or args.fastqc_end
            or getattr(args, "run_trim_repetitive", False)):
        args.fastqc_path = utilities.find_dependency(
            args.fastqc_path,
            config.fastqc_exe,
            "fastqc",
            "--fastqc",
            bypass_permissions_check=False)

    # set the default output prefix
    if args.output_prefix is None:
        infile_base = os.path.splitext(os.path.basename(args.input[0]))[0]
        if args.input[0].endswith((".gz", ".bz2")):
            # remove the next extension too, as the first one removed was
            # the compression extension
            infile_base = os.path.splitext(infile_base)[0]
        args.output_prefix = infile_base + "_kneaddata"

    # find the bowtie2 indexes for each of the reference databases
    # reference database inputs can be directories, indexes, or index files
    if args.reference_db:
        database_type = "bmtagger" if args.bmtagger else "bowtie2"
        args.reference_db = [
            utilities.find_database_index(os.path.abspath(directory),
                                          database_type)
            for directory in args.reference_db
        ]

    return args