Esempio n. 1
0
def parse_arguments(args):
    """
    Build the kneaddata command-line parser and parse the user's arguments.

    Options are organized into help groups: global options, trimmomatic,
    bowtie2, bmtagger, trf and fastqc arguments. Most defaults (and the
    default values printed in the help text) are read from the ``config``
    module.

    Args:
        args: Argument vector in ``sys.argv`` form, i.e. the program name
            followed by the options. When it is ``None`` or empty, argparse
            falls back to reading ``sys.argv`` itself.

    Returns:
        argparse.Namespace: The parsed options. argparse exits the process
        with a usage message on invalid input or when the required
        ``-o/--output`` option is missing.
    """

    parser = argparse.ArgumentParser(
        description="KneadData\n",
        formatter_class=argparse.RawTextHelpFormatter,
        prog="kneaddata")

    # ---- global options ----
    group1 = parser.add_argument_group("global options")
    group1.add_argument("--version",
                        action="version",
                        version="%(prog)s v" + VERSION)
    # registered on the parser itself (not group1), so it is listed with
    # argparse's default options in the help output
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help="additional output is printed\n")
    group1.add_argument("-i1",
                        "--input1",
                        help="Pair 1 input FASTQ file",
                        dest='input1')
    group1.add_argument("-i2",
                        "--input2",
                        help="Pair 2 input FASTQ file",
                        dest='input2')
    group1.add_argument("-un",
                        "--unpaired",
                        help="unpaired input FASTQ file",
                        dest='unpaired')
    group1.add_argument("-o",
                        "--output",
                        dest='output_dir',
                        help="directory to write output files",
                        required=True)
    # may be given several times; each occurrence appends one database
    group1.add_argument(
        "-db",
        "--reference-db",
        default=[],
        action="append",
        help=
        "location of reference database (additional arguments add databases)")
    group1.add_argument("--bypass-trim",
                        action="store_true",
                        help="bypass the trim step")
    group1.add_argument(
        "--output-prefix",
        help="prefix for all output files\n[ DEFAULT : $SAMPLE_kneaddata ]")
    group1.add_argument("-t",
                        "--threads",
                        type=int,
                        default=config.threads,
                        metavar="<" + str(config.threads) + ">",
                        help="number of threads\n[ Default : " +
                        str(config.threads) + " ]")
    group1.add_argument("-p",
                        "--processes",
                        type=int,
                        default=config.processes,
                        metavar="<" + str(config.processes) + ">",
                        help="number of processes\n[ Default : " +
                        str(config.processes) + " ]")
    group1.add_argument("-q",
                        "--quality-scores",
                        default=config.quality_scores,
                        choices=config.quality_scores_options,
                        dest='trimmomatic_quality_scores',
                        help="quality scores\n[ DEFAULT : " +
                        config.quality_scores + " ]")
    group1.add_argument(
        "--run-bmtagger",
        default=False,
        action="store_true",
        dest='bmtagger',
        help="run BMTagger instead of Bowtie2 to identify contaminant reads")
    group1.add_argument("--bypass-trf",
                        action="store_true",
                        help="option to bypass the removal of tandem repeats")
    group1.add_argument(
        "--run-trf",
        action="store_true",
        help=
        "legacy option to run the removal of tandem repeats (now run by default)"
    )
    group1.add_argument("--run-fastqc-start",
                        default=False,
                        dest='fastqc_start',
                        action="store_true",
                        help="run fastqc at the beginning of the workflow")
    group1.add_argument("--run-fastqc-end",
                        default=False,
                        dest='fastqc_end',
                        action="store_true",
                        help="run fastqc at the end of the workflow")
    group1.add_argument(
        "--store-temp-output",
        action="store_true",
        help=
        "store temp output files\n[ DEFAULT : temp output files are removed ]")
    group1.add_argument(
        "--remove-intermediate-output",
        action="store_true",
        help=
        "remove intermediate output files\n[ DEFAULT : intermediate output files are stored ]"
    )
    group1.add_argument(
        "--cat-final-output",
        action="store_true",
        help=
        "concatenate all final output files\n[ DEFAULT : final output is not concatenated ]"
    )
    group1.add_argument("--log-level",
                        default=config.log_level,
                        choices=config.log_level_choices,
                        help="level of log messages\n[ DEFAULT : " +
                        config.log_level + " ]")
    group1.add_argument(
        "--log",
        help="log file\n[ DEFAULT : $OUTPUT_DIR/$SAMPLE_kneaddata.log ]")

    # ---- trimmomatic arguments ----
    group2 = parser.add_argument_group("trimmomatic arguments")
    group2.add_argument("--trimmomatic",
                        dest='trimmomatic_path',
                        help="path to trimmomatic\n[ DEFAULT : $PATH ]")
    group2.add_argument(
        "--run-trim-repetitive",
        default=False,
        dest='run_trim_repetitive',
        action="store_true",
        help="Trim fastqc generated overrepresented sequences\n")
    group2.add_argument("--max-memory",
                        default=config.trimmomatic_memory,
                        help="max amount of memory\n[ DEFAULT : " +
                        config.trimmomatic_memory + " ]")
    group2.add_argument(
        "--trimmomatic-options",
        action="append",
        help=("options for trimmomatic\n[ DEFAULT : " +
              " ".join(utilities.get_default_trimmomatic_options()) + " ]\n" +
              "MINLEN is set to " + str(config.trimmomatic_min_len_percent) +
              " percent of total input read length. The user can alternatively specify a length (in bases) for MINLEN."))
    group2.add_argument("--sequencer-source",
                        dest='sequencer_source',
                        default=config.trimmomatic_provided_sequencer_default,
                        choices=config.trimmomatic_provided_sequencer_source,
                        help="options for sequencer-source\n[ DEFAULT : " +
                        config.trimmomatic_provided_sequencer_default + "]")

    # ---- bowtie2 arguments ----
    group3 = parser.add_argument_group("bowtie2 arguments")
    group3.add_argument("--bowtie2",
                        dest='bowtie2_path',
                        help="path to bowtie2\n[ DEFAULT : $PATH ]")
    group3.add_argument("--bowtie2-options",
                        action="append",
                        help="options for bowtie2\n[ DEFAULT : " +
                        " ".join(config.bowtie2_options) + " ]")
    group3.add_argument(
        "--decontaminate-pairs",
        choices=["strict", "lenient", "unpaired"],
        default="strict",
        help=("options for filtering of paired end reads (strict='remove both R1+R2 if either align', lenient='remove only if both R1+R2 align', unpaired='ignore pairing and remove as single end')\n" +
              "[ DEFAULT : %(default)s ]"))
    group3.add_argument(
        "--reorder",
        action="store_true",
        help=
        "order the sequences in the same order as the input\n[ DEFAULT : Sequences are not ordered ]"
    )
    group3.add_argument(
        "--serial",
        action="store_true",
        help=
        "filter the input in serial for multiple databases so a subset of reads are processed in each database search"
    )

    # ---- bmtagger arguments ----
    group4 = parser.add_argument_group("bmtagger arguments")
    group4.add_argument("--bmtagger",
                        dest='bmtagger_path',
                        help="path to BMTagger\n[ DEFAULT : $PATH ]")

    # ---- trf arguments ----
    group5 = parser.add_argument_group("trf arguments")
    group5.add_argument("--trf",
                        dest='trf_path',
                        help="path to TRF\n[ DEFAULT : $PATH ]")
    group5.add_argument("--match",
                        type=int,
                        default=config.trf_match,
                        help="matching weight\n[ DEFAULT : " +
                        str(config.trf_match) + " ]")
    group5.add_argument("--mismatch",
                        type=int,
                        default=config.trf_mismatch,
                        help="mismatching penalty\n[ DEFAULT : " +
                        str(config.trf_mismatch) + " ]")
    group5.add_argument("--delta",
                        type=int,
                        default=config.trf_delta,
                        help="indel penalty\n[ DEFAULT : " +
                        str(config.trf_delta) + " ]")
    group5.add_argument("--pm",
                        type=int,
                        default=config.trf_match_probability,
                        help="match probability\n[ DEFAULT : " +
                        str(config.trf_match_probability) + " ]")
    group5.add_argument("--pi",
                        type=int,
                        default=config.trf_pi,
                        help="indel probability\n[ DEFAULT : " +
                        str(config.trf_pi) + " ]")
    group5.add_argument(
        "--minscore",
        type=int,
        default=config.trf_minscore,
        help="minimum alignment score to report\n[ DEFAULT : " +
        str(config.trf_minscore) + " ]")
    group5.add_argument("--maxperiod",
                        type=int,
                        default=config.trf_maxperiod,
                        help="maximum period size to report\n[ DEFAULT : " +
                        str(config.trf_maxperiod) + " ]")

    # ---- fastqc arguments ----
    group6 = parser.add_argument_group("fastqc arguments")
    group6.add_argument("--fastqc",
                        dest='fastqc_path',
                        help="path to fastqc\n[ DEFAULT : $PATH ]")

    # Honor the caller-supplied argument vector instead of silently ignoring
    # it. ``args`` uses the sys.argv layout, so the leading program name is
    # dropped; passing None lets argparse read sys.argv[1:] on its own, which
    # keeps the previous behavior when nothing usable is supplied.
    argv = list(args[1:]) if args else None
    return parser.parse_args(argv)
Esempio n. 2
0
def _fastqc_data_files(input_files, output_dir):
    """
    Return the expected ``fastqc_data.txt`` path for each input file.

    The report folder name is derived from the input file's base name with
    its last extension removed, plus a preceding ``.fastq``/``.fq`` extension
    when present (e.g. ``sample.fastq.gz`` -> ``sample``). Only the real
    extension of the base name is inspected, so a directory or file stem that
    merely contains "fastq" or "fq" does not cause an extra extension to be
    stripped.
    NOTE(review): this assumes fastqc names its report folder
    ``<name>_fastqc`` after removing those suffixes -- confirm against the
    installed fastqc version.
    """
    data_files = []
    for input_file in input_files:
        name = os.path.splitext(os.path.basename(input_file))[0]
        stem, extension = os.path.splitext(name)
        if extension in (".fastq", ".fq"):
            name = stem
        data_files.append(output_dir + "/fastqc/" + name +
                          "_fastqc/fastqc_data.txt")
    return data_files


def main():
    """
    Run the kneaddata workflow from the command line.

    Steps, in order: parse and update the configuration, set up logging,
    convert compressed/bam/sam inputs to fastq, reformat identifiers and
    reorder paired reads, optionally run fastqc on the input, trim (unless
    bypassed), remove tandem repeats (unless bypassed), decontaminate against
    the reference databases (when provided), optionally concatenate the final
    output, remove temp files, and optionally run fastqc on the final output.

    Exits through ``sys.exit`` with an error message when the first input
    file is not in fastq format after conversion.
    """
    # Parse the arguments from the user
    args = parse_arguments(sys.argv)

    # Update the configuration
    args = update_configuration(args)

    # set the prefix for the output files
    full_path_output_prefix = os.path.join(args.output_dir, args.output_prefix)

    # Start logging
    setup_logging(args)

    # Keep a snapshot of the files as provided by the user. The entries of
    # args.input are replaced below by converted/reformatted temp files (some
    # of which are later removed), while fastqc at the start of the workflow
    # is run on the original inputs.
    # NOTE(review): this name was previously read without being assigned in
    # this function; confirm it was not meant to be a module-level global.
    original_input_files = list(args.input)

    temp_output_files = []
    # Check for compressed files, bam files, or sam files
    for index in range(len(args.input)):
        args.input[index] = utilities.get_decompressed_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
        args.input[index] = utilities.get_sam_from_bam_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
        args.input[index] = utilities.get_fastq_from_sam_file(
            args.input[index], args.output_dir, temp_output_files, args.input)

    # Get the format of the first input file
    file_format = utilities.get_file_format(args.input[0])

    if file_format != "fastq":
        message = "Your input file is of type: " + file_format + ". Please provide an input file of fastq format."
        logger.critical(message)
        sys.exit(message)

    # if this is the new illumina identifier format, create temp files after reformatting the headers
    for index in range(len(args.input)):
        args.input[index] = utilities.get_reformatted_identifiers(
            args.input[index], index, args.output_dir, temp_output_files,
            args.input)

    # check for reads that are not ordered and order if needed (if trimmomatic is run)
    if not args.bypass_trim and len(args.input) == 2:
        args.input = utilities.check_and_reorder_reads(args.input,
                                                       args.output_dir,
                                                       temp_output_files)

    # remove any temp files from decompress/reformat that are no longer needed
    utilities.update_temp_output_files(temp_output_files, [], args.input)

    # set trimmomatic options
    # this is done after the decompression and conversions from sam/bam
    # as the default requires the read length from the input sequences
    if args.trimmomatic_options:
        # parse the options from the user into an array of options
        args.trimmomatic_options = utilities.format_options_to_list(
            args.trimmomatic_options)
    else:
        # if trimmomatic options not set by user, then set to default options
        # use read length of input file for minlen
        args.trimmomatic_options = utilities.get_default_trimmomatic_options(
            utilities.get_read_length_fastq(args.input[0]),
            path=config.trimmomatic_adapter_folder,
            type="PE" if len(args.input) == 2 else "SE",
            sequencer_source=args.sequencer_source)

    # Get the number of reads initially
    utilities.log_read_count_for_files(args.input, "raw",
                                       "Initial number of reads", args.verbose)

    # Run fastqc if set to run at start of workflow
    # (also required when trimming overrepresented sequences, which reads
    # the fastqc reports below)
    if args.fastqc_start or args.run_trim_repetitive:
        run.fastqc(args.fastqc_path, args.output_dir, original_input_files,
                   args.threads, args.verbose)
        # Locations of the fastqc text reports, one per original input file
        output_txt_files = _fastqc_data_files(original_input_files,
                                              args.output_dir)

    if not args.bypass_trim:
        if args.run_trim_repetitive:
            # Get the Min Overrepresented Seq Length
            args.trimmomatic_options = utilities.get_updated_trimmomatic_parameters(
                output_txt_files, args.output_dir, args.trimmomatic_options)

        trimmomatic_output_files = run.trim(
            args.input, full_path_output_prefix, args.trimmomatic_path,
            args.trimmomatic_quality_scores, args.max_memory,
            args.trimmomatic_options, args.threads, args.verbose)

    else:
        message = "Bypass trimming"
        logger.info(message)
        print(message)
        # wrapped in a list so later steps see one set of input files
        trimmomatic_output_files = [args.input]

    # Get the number of reads after trimming
    utilities.log_read_count_for_files(trimmomatic_output_files, "trimmed",
                                       "Total reads after trimming",
                                       args.verbose)

    # run TRF, if set
    if not args.bypass_trf:
        # run trf on all output files
        trf_output_files = run.tandem(
            trimmomatic_output_files, full_path_output_prefix, args.match,
            args.mismatch, args.delta, args.pm, args.pi, args.minscore,
            args.maxperiod, args.trf_path, args.processes, args.verbose,
            args.remove_temp_output, args.threads)
        # remove the trimmed files, if intermediate output files should be removed
        if args.reference_db and args.remove_intermediate_output:
            temp_output_files += utilities.resolve_sublists(
                trimmomatic_output_files)
    else:
        trf_output_files = trimmomatic_output_files

    # If a reference database is not provided, then bypass decontamination step
    if not args.reference_db:
        message = "Bypass decontamination"
        logger.info(message)
        print(message)
        # sub-lists, if present, are resolved below
        final_output_files = trf_output_files
    else:
        final_output_files = run.decontaminate(args, full_path_output_prefix,
                                               trf_output_files)
        # remove trf output files, if set to remove intermediate output
        if not args.bypass_trim and args.remove_intermediate_output:
            temp_output_files += utilities.resolve_sublists(trf_output_files)

    # If set, concat the final output files if there is more than one
    final_output_files = utilities.resolve_sublists(final_output_files)
    if args.cat_final_output and len(final_output_files) > 1:
        cat_output_file = full_path_output_prefix + config.fastq_file_extension
        utilities.cat_files(final_output_files, cat_output_file)

        # if removing intermediate output, then remove the files that were merged
        if args.remove_intermediate_output:
            temp_output_files += final_output_files
            final_output_files = [cat_output_file]
        else:
            final_output_files.append(cat_output_file)

    # Remove any temp output files, if set
    if not args.store_temp_output:
        for temp_file in temp_output_files:
            utilities.remove_file(temp_file)

    # Run fastqc if set to run at end of workflow
    if args.fastqc_end:
        run.fastqc(args.fastqc_path, args.output_dir, final_output_files,
                   args.threads, args.verbose)

    if len(final_output_files) > 1:
        message = "\nFinal output files created: \n"
    else:
        message = "\nFinal output file created: \n"

    message = message + "\n".join(final_output_files) + "\n"
    logger.info(message)
    print(message)