def get_parser():
    """Build the command-line parser for ``trim-low-abund.py``.

    The parser is created by ``build_counting_args`` and then extended with
    the positional input filenames, the ``-C``/``-Z`` thresholds, the
    optional single output file, ``-V``, ``-s`` and the expert options.
    ``add_loadgraph_args`` and ``add_output_compression_type`` register
    their own options on the same parser (both are defined elsewhere).

    Returns:
        The parser object returned by ``build_counting_args`` with all of
        the arguments above added.
    """
    epilog = """
    The output is one file for each input file, <input file>.abundtrim, placed
    in the current directory. This output contains the input sequences
    trimmed at low-abundance k-mers.

    The ``-V/--variable-coverage`` parameter will, if specified,
    prevent elimination of low-abundance reads by only trimming
    low-abundance k-mers from high-abundance reads; use this for
    non-genomic data sets that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use ``load-into-counting.py`` and ``filter-abund.py``. However, read
    pairs will be kept together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff', '-C', type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--normalize-to', '-Z', type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('-o', '--output', metavar="output_filename",
                        type=argparse.FileType('wb'),
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadgraph_args(parser)
    # The adjacent literals are concatenated verbatim, so the first piece
    # must end with a space ("after all " + "reads"), not "allreads".
    parser.add_argument('-s', '--savegraph', metavar="filename", default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')
    add_output_compression_type(parser)

    return parser
def get_parser():
    """Build the command-line parser for ``correct-reads.py``.

    The parser is created by ``build_counting_args`` and then extended with
    the positional input filenames, the ``-C``/``-Z`` thresholds, the
    optional single output file (``-o``/``--out``), ``-V``, ``-s``, the
    expert options and ``--theta`` (stored as ``bits_theta``).
    ``add_loadgraph_args`` registers its own options on the same parser
    (defined elsewhere).

    Returns:
        The parser object returned by ``build_counting_args`` with all of
        the arguments above added.
    """
    epilog = """
    The output is one file for each input file, <input file>.corr, placed
    in the current directory. This output contains the input sequences,
    corrected at low-abundance k-mers.

    Note that the output reads will not necessarily be in the same
    order as the reads in the input files. However, read pairs will be
    kept together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        correct-reads.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Correct reads using a semi-streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff', '-C', type=int,
                        help='k-mers below this abundance are not trusted',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--normalize-to', '-Z', type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('-o', '--out', metavar="filename",
                        type=argparse.FileType('w'),
                        default=None,
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only correct sequences that have high coverage.')

    add_loadgraph_args(parser)
    # The adjacent literals are concatenated verbatim, so the first piece
    # must end with a space ("after all " + "reads"), not "allreads".
    parser.add_argument('-s', '--savegraph', metavar="filename", default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)

    return parser
def get_parser():
    """Build the command-line parser for ``normalize-by-median.py``.

    The parser is created by ``build_counting_args`` (with the ``diginorm``
    citation) and extended with the quiet/pairing flags, ``-u``, ``-s``,
    the report options, ``-f``, the single output file (``-o``, stored as
    ``single_output_file``), the positional input filenames and
    ``-z``/``--loadgraph2``.  ``add_loadgraph_args`` and
    ``add_output_compression_type`` register their own options on the same
    parser (both are defined elsewhere).

    Returns:
        The parser object returned by ``build_counting_args`` with all of
        the arguments above added.
    """
    # The leading backslash suppresses the initial newline of the literal;
    # "\\" yields a single backslash in the rendered shell examples.
    epilog = """\
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force_single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph will be saved
    to the specified file after all sequences have been processed.
    :option:`-l`/:option:`--loadgraph` will load the
    specified k-mer countgraph before processing the specified
    files. Note that these graphs are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--output -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        normalize-by-median.py -p -k 17 \\
        tests/test-data/test-abund-read-paired.fa

    Example::

        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq \\
        >> appended-output.fq

    Example::

        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq \\
        tests/test-data/test-fastq-reads.fq

    Example::

        normalize-by-median.py -k 17 -s test.ct \\
        tests/test-data/test-abund-read-2.fa \\
        tests/test-data/test-fastq-reads.fq"""

    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog),
        citations=['diginorm'])

    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-s', '--savegraph', metavar="filename", default=None,
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    parser.add_argument('-R', '--report',
                        help='write progress report to report_filename',
                        metavar='report_filename',
                        type=argparse.FileType('w'))
    parser.add_argument('--report-frequency',
                        metavar='report_frequency', type=int, default=100000,
                        help='report progress every report_frequency reads')
    parser.add_argument('-f', '--force', dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=khFileType('wb'),
                        default=None, dest='single_output_file',
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    add_loadgraph_args(parser)
    parser.add_argument('-z', '--loadgraph2', metavar="filename", default=None,
                        help='load a second k-mer graph')
    add_output_compression_type(parser)

    return parser
def get_parser():
    """Build the command-line parser for ``normalize-by-median.py``.

    The parser is created by ``build_counting_args`` (with the ``diginorm``
    citation) and extended with ``-q``, ``-C`` (validated by
    ``check_argument_range(0, 256, "cutoff")``), the pairing flags, ``-u``,
    ``-s``, the report options, ``-f``, the single output file (``-o``,
    stored as ``single_output_file``) and the positional input filenames.
    ``add_loadgraph_args`` and ``add_output_compression_type`` register
    their own options on the same parser (both are defined elsewhere).

    Returns:
        The parser object returned by ``build_counting_args`` with all of
        the arguments above added.
    """
    # The leading backslash suppresses the initial newline of the literal;
    # "\\" yields a single backslash in the rendered shell examples.
    epilog = """\
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force_single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph will be saved
    to the specified file after all sequences have been processed.
    :option:`-l`/:option:`--loadgraph` will load the
    specified k-mer countgraph before processing the specified
    files. Note that these graphs are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--output -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        normalize-by-median.py -p -k 17 \\
        tests/test-data/test-abund-read-paired.fa

    Example::

        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq \\
        >> appended-output.fq

    Example::

        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq \\
        tests/test-data/test-fastq-reads.fq

    Example::

        normalize-by-median.py -k 17 -s test.ct \\
        tests/test-data/test-abund-read-2.fa \\
        tests/test-data/test-fastq-reads.fq"""

    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog),
        citations=['diginorm'])

    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    parser.add_argument('-C', '--cutoff', help="when the median "
                        "k-mer coverage level is above this number the "
                        "read is not kept.",
                        type=check_argument_range(0, 256, "cutoff"),
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-s', '--savegraph', metavar="filename", default=None,
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    parser.add_argument('-R', '--report',
                        help='write progress report to report_filename',
                        metavar='report_filename',
                        type=argparse.FileType('w'))
    parser.add_argument('--report-frequency',
                        metavar='report_frequency', type=int, default=100000,
                        help='report progress every report_frequency reads')
    parser.add_argument('-f', '--force', dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=khFileType('wb'),
                        default=None, dest='single_output_file',
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    add_loadgraph_args(parser)
    add_output_compression_type(parser)

    return parser
def get_parser():
    """Build the command-line parser for ``saturate-by-median.py``.

    The parser is created by ``build_counting_args`` and extended with
    ``-C``, ``-p``, ``-s``, the report options, ``-f``/``--fault-tolerant``
    (stored as ``force``), ``-o``/``--out`` (stored as
    ``single_output_filename``) and the positional input filenames.
    ``add_loadgraph_args`` registers its own options on the same parser
    (defined elsewhere).

    Returns:
        The parser object returned by ``build_counting_args`` with all of
        the arguments above added.
    """
    # NOTE(review): the epilog documents a ``-d`` option and uses ``-d 2`` in
    # its last example, but no ``-d`` argument is registered in this
    # function -- confirm whether a helper supplies it or the text is stale.
    epilog = ("""
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    Paired end reads will be considered together if :option:`-p` is set. If
    either read will be kept, then both will be kept. This should result in
    keeping (or discarding) each sequencing fragment. This helps with retention
    of repeats, especially.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph will be saved
    to the specified file after all sequences have been processed.
    With :option:`-d`, the k-mer countgraph will be saved every d files for
    multifile runs; if :option:`-s` is set, the specified name will be used,
    and if not, the name `backup.ct` will be used.
    :option:`-l`/:option:`--loadgraph` will load the
    specified k-mer countgraph before processing the specified files. Note
    that these tables are in the same format as those produced by
    :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    :option:`-f`/:option:`--fault-tolerant` will force the program to continue
    upon encountering a formatting error in a sequence file; the k-mer counting
    table up to that point will be dumped, and processing will continue on the
    next file.

    Example::

        saturate-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

"""
              "        saturate-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
              """

    Example::

"""
              "        saturate-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
              """

    Example::

"""
              "        saturate-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads")  # noqa

    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog))

    parser.add_argument('-C', '--cutoff', type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savegraph', metavar="filename", default='')
    parser.add_argument('-R', '--report',
                        metavar='filename', type=argparse.FileType('w'))
    parser.add_argument('--report-frequency',
                        metavar='report_frequency',
                        default=100000,
                        type=int)
    # Adjacent literals instead of a backslash continuation inside the
    # string, which would embed the next line's indentation in the help text.
    parser.add_argument('-f', '--fault-tolerant', dest='force',
                        help='continue on next file if read errors are '
                        'encountered', action='store_true')
    parser.add_argument('-o', '--out', metavar="filename",
                        dest='single_output_filename',
                        default='', help='only output a single'
                        ' file with the specified filename')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    add_loadgraph_args(parser)

    return parser
def get_parser():
    """Build the command-line parser for ``trim-low-abund.py``.

    The parser is created by ``build_counting_args`` and extended with the
    positional input filenames, ``-C``, ``-Z`` (``--trim-at-coverage``, with
    ``--normalize-to`` as an additional spelling), the optional single
    output file, ``-V``, ``-s``, ``-q``, the expert options and the
    ``--diginorm``/``--diginorm-coverage``/``--single-pass`` switches.
    ``add_loadgraph_args`` and ``add_output_compression_type`` register
    their own options on the same parser (both are defined elsewhere).

    Returns:
        The parser object returned by ``build_counting_args`` with all of
        the arguments above added.
    """
    # The leading backslash suppresses the initial newline of the literal.
    epilog = """\
    The output is one file for each input file, ``<input file>.abundtrim``,
    placed in the current directory. This output contains the input sequences
    trimmed at low-abundance k-mers.

    The :option:`-V`/:option:`--variable-coverage` parameter will, if
    specified, prevent elimination of low-abundance reads by only trimming
    low-abundance k-mers from high-abundance reads; use this for
    non-genomic data sets that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use :program:`load-into-counting.py` and :program:`filter-abund.py`.
    However, read pairs will be kept together, in "broken-paired" format; you
    can use :program:`extract-paired-reads.py` to extract read pairs and
    orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff', '-C', type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--trim-at-coverage', '-Z', '--normalize-to',
                        type=int,
                        help='trim reads when entire read above this coverage',
                        default=DEFAULT_TRIM_AT_COVERAGE)

    parser.add_argument('-o', '--output', metavar="output_filename",
                        type=argparse.FileType('wb'),
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadgraph_args(parser)
    # The adjacent literals are concatenated verbatim, so the first piece
    # must end with a space ("after all " + "reads"), not "allreads".
    parser.add_argument('-s', '--savegraph', metavar="filename", default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./',
                        help="Set location of temporary directory for "
                        "second pass")
    add_output_compression_type(parser)

    parser.add_argument('--diginorm', default=False, action='store_true',
                        help="Eliminate high-coverage reads altogether "
                        "(digital normalization).")
    parser.add_argument('--diginorm-coverage', type=int,
                        default=DEFAULT_DIGINORM_COVERAGE,
                        help="Coverage threshold for --diginorm")
    parser.add_argument('--single-pass', default=False, action='store_true',
                        help="Do not do a second pass across the low coverage "
                        "data")

    return parser
def get_parser():
    """Build the command-line parser for ``normalize-by-median.py``.

    The parser is created by ``build_counting_args`` and extended with
    ``-q``, ``-C``, the pairing flags (``-p``, ``--force-single``), ``-u``,
    ``-s``, the report options, ``-f``, the single output file (``-o``,
    stored as ``single_output_file``) and the positional input filenames.
    ``add_loadgraph_args`` and ``add_output_compression_type`` register
    their own options on the same parser (both are defined elsewhere).

    Returns:
        The parser object returned by ``build_counting_args`` with all of
        the arguments above added.
    """
    # NOTE(review): the last example uses ``-d 2`` but no ``-d`` argument is
    # registered in this function -- confirm whether a helper supplies it or
    # the example is stale.
    epilog = (
        """
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force-single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph will be saved
    to the specified file after all sequences have been processed.
    :option:`-l`/:option:`--loadgraph` will load the
    specified k-mer countgraph before processing the specified
    files. Note that these graphs are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--output -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

"""
        "        normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq >> appended-output.fq"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads"  # noqa
    )

    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog),
    )

    parser.add_argument(
        "-q", "--quiet", dest="quiet", default=False, action="store_true"
    )
    parser.add_argument(
        "-C", "--cutoff", type=int, default=DEFAULT_DESIRED_COVERAGE
    )
    parser.add_argument(
        "-p",
        "--paired",
        action="store_true",
        help="require that all sequences be properly paired",
    )
    parser.add_argument(
        "--force-single",
        dest="force_single",
        action="store_true",
        help="treat all sequences as single-ended/unpaired",
    )
    parser.add_argument(
        "-u",
        "--unpaired-reads",
        metavar="unpaired_reads_filename",
        help="include a file of unpaired reads to which "
        "-p/--paired does not apply.",
    )
    # The adjacent literals are concatenated verbatim, so the first piece
    # must end with a space ("after all " + "reads"), not "allreads".
    parser.add_argument(
        "-s",
        "--savegraph",
        metavar="filename",
        default="",
        help="save the k-mer countgraph to disk after all "
        "reads are loaded.",
    )
    parser.add_argument(
        "-R",
        "--report",
        metavar="report_filename",
        type=argparse.FileType("w"),
    )
    parser.add_argument(
        "--report-frequency",
        metavar="report_frequency",
        type=int,
        default=100000,
    )
    parser.add_argument(
        "-f",
        "--force",
        dest="force",
        help="continue past file reading errors",
        action="store_true",
    )
    parser.add_argument(
        "-o",
        "--output",
        metavar="filename",
        type=argparse.FileType("wb"),
        default=None,
        dest="single_output_file",
        help="only output a single file with "
        'the specified filename; use a single dash "-" to '
        "specify that output should go to STDOUT (the "
        "terminal)",
    )
    parser.add_argument(
        "input_filenames",
        metavar="input_sequence_filename",
        help="Input FAST[AQ] sequence filename.",
        nargs="+",
    )
    add_loadgraph_args(parser)
    add_output_compression_type(parser)

    return parser