Ejemplo n.º 1
0
def get_parser():
    epilog = """
    The output is one file for each input file, <input file>.abundtrim, placed
    in the current directory.  This output contains the input sequences
    trimmed at low-abundance k-mers.

    The ``-V/--variable-coverage`` parameter will, if specified,
    prevent elimination of low-abundance reads by only trimming
    low-abundance k-mers from high-abundance reads; use this for
    non-genomic data sets that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use ``load-into-counting.py`` and ``filter-abund.py``.  However, read
    pairs will be kept together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff', '-C', type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--normalize-to', '-Z', type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('-o', '--out', metavar="filename",
                        type=argparse.FileType('w'),
                        default=None, help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadhash_args(parser)
    parser.add_argument('-s', '--savetable', metavar="filename", default='',
                        help='save the k-mer counting table to disk after all'
                        'reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')

    return parser
Ejemplo n.º 2
0
def main():
    parser = build_counting_args()
    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print >> sys.stderr, 'file with ht: %s' % counting_ht

    print >> sys.stderr, 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    aligner = khmer.new_readaligner(
        ht, args.trusted_cov, args.theta
    )  # counting hash, trusted kmer coverage cutoff, bits theta (threshold value for terminating unproductive alignemnts)

    ### the filtering loop
    for infile in infiles:
        print >> sys.stderr, 'aligning', infile
        for n, record in enumerate(screed.open(infile)):

            name = record['name']
            seq = record['sequence'].upper()
            print >> sys.stderr, name
            print >> sys.stderr, seq

            score, graph_alignment, read_alignment, truncated = aligner.align(
                seq)
            print >> sys.stderr, score
            print >> sys.stderr, graph_alignment
            print >> sys.stderr, read_alignment
            print >> sys.stderr, truncated
            print ">{0}\n{1}".format(name, graph_alignment)
Ejemplo n.º 3
0
def main():
    parser = build_counting_args()
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int, default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print >>sys.stderr, 'file with ht: %s' % counting_ht

    print >>sys.stderr, 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    aligner = khmer.new_readaligner(ht, args.trusted_cov, args.theta) # counting hash, trusted kmer coverage cutoff, bits theta (threshold value for terminating unproductive alignemnts)
    
    ### the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'aligning', infile
        for n, record in enumerate(screed.open(infile)):

            name = record['name']
            seq = record['sequence'].upper()
            print >>sys.stderr, name
            print >>sys.stderr, seq

            score, graph_alignment, read_alignment, truncated = aligner.align(seq)
            print >>sys.stderr, score
            print >>sys.stderr, graph_alignment
            print >>sys.stderr, read_alignment
            print >>sys.stderr, truncated
            print ">{0}\n{1}".format(name, graph_alignment)
Ejemplo n.º 4
0
def get_parser():
    epilog = ("""
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    Paired end reads will be considered together if :option:`-p` is set. If
    either read will be kept, then both will be kept. This should result in
    keeping (or discarding) each sequencing fragment. This helps with retention
    of repeats, especially.

    With :option:`-s`/:option:`--savetable`, the k-mer counting table
    will be saved to the specified file after all sequences have been
    processed. With :option:`-d`, the k-mer counting table will be
    saved every d files for multifile runs; if :option:`-s` is set,
    the specified name will be used, and if not, the name `backup.ct`
    will be used.  :option:`-l`/:option:`--loadtable` will load the
    specified k-mer counting table before processing the specified
    files.  Note that these tables are are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    :option:`-f`/:option:`--fault-tolerant` will force the program to continue
    upon encountering a formatting error in a sequence file; the k-mer counting
    table up to that point will be dumped, and processing will continue on the
    next file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

""" "        normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
    """

    Example::

""" "        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
    """

    Example::

""" "        normalize-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads")   # noqa
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('-C', '--cutoff', type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savetable', metavar="filename", default='')
    parser.add_argument('-R', '--report',
                        metavar='filename', type=argparse.FileType('w'))
    parser.add_argument('-f', '--fault-tolerant', dest='force',
                        help='continue on next file if read errors are \
                         encountered', action='store_true')
    parser.add_argument('--save-on-failure', dest='fail_save',
                        action='store_false', default=True,
                        help='Save k-mer counting table when an error occurs')
    parser.add_argument('-d', '--dump-frequency', dest='dump_frequency',
                        type=int, help='dump k-mer counting table every d '
                        'files', default=-1)
    parser.add_argument('-o', '--out', metavar="filename",
                        dest='single_output_filename',
                        default='', help='only output a single'
                        ' file with the specified filename')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers"
                        " post-normalization to stderr")
    parser.add_argument('--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_loadhash_args(parser)
    return parser
Ejemplo n.º 5
0
def get_parser():
    epilog = (
        """
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    Paired end reads will be considered together if :option:`-p` is set. If
    either read will be kept, then both will be kept. This should result in
    keeping (or discarding) each sequencing fragment. This helps with retention
    of repeats, especially.

    With :option:`-s`/:option:`--savetable`, the k-mer counting table
    will be saved to the specified file after all sequences have been
    processed. With :option:`-d`, the k-mer counting table will be
    saved every d files for multifile runs; if :option:`-s` is set,
    the specified name will be used, and if not, the name `backup.ct`
    will be used.  :option:`-l`/:option:`--loadtable` will load the
    specified k-mer counting table before processing the specified
    files.  Note that these tables are are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    :option:`-f`/:option:`--fault-tolerant` will force the program to continue
    upon encountering a formatting error in a sequence file; the k-mer counting
    table up to that point will be dumped, and processing will continue on the
    next file.

    Example::

        saturate-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

"""
        "        saturate-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
        """

    Example::

"""
        "        saturate-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
        """

    Example::

"""
        "        saturate-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads"
    )  # noqa
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savetable', metavar="filename", default='')
    parser.add_argument('-R',
                        '--report',
                        metavar='filename',
                        type=argparse.FileType('w'))
    parser.add_argument('--report-frequency',
                        metavar='report_frequency',
                        default=100000,
                        type=int)
    parser.add_argument('-f',
                        '--fault-tolerant',
                        dest='force',
                        help='continue on next file if read errors are \
                         encountered',
                        action='store_true')
    parser.add_argument('-o',
                        '--out',
                        metavar="filename",
                        dest='single_output_filename',
                        default='',
                        help='only output a single'
                        ' file with the specified filename')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.',
                        nargs='+')
    add_loadhash_args(parser)
    return parser
Ejemplo n.º 6
0
def get_parser():
    epilog = """
    The output is one file for each input file, <input file>.abundtrim, placed
    in the current directory.  This output contains the input sequences
    trimmed at low-abundance k-mers.

    The ``-V/--variable-coverage`` parameter will, if specified,
    prevent elimination of low-abundance reads by only trimming
    low-abundance k-mers from high-abundance reads; use this for
    non-genomic data sets that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use ``load-into-counting.py`` and ``filter-abund.py``.  However, read
    pairs will be kept together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff',
                        '-C',
                        type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('-o',
                        '--out',
                        metavar="filename",
                        type=argparse.FileType('w'),
                        default=None,
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage',
                        '-V',
                        action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadhash_args(parser)
    parser.add_argument('-s',
                        '--savetable',
                        metavar="filename",
                        default='',
                        help='save the k-mer counting table to disk after all'
                        'reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')

    return parser
Ejemplo n.º 7
0
def get_parser():
    epilog = ("""
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force-single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savetable`, the k-mer counting table
    will be saved to the specified file after all sequences have been
    processed. With :option:`-d`, the k-mer counting table will be
    saved every d files for multifile runs; if :option:`-s` is set,
    the specified name will be used, and if not, the name `backup.ct`
    will be used.  :option:`-l`/:option:`--loadtable` will load the
    specified k-mer counting table before processing the specified
    files.  Note that these tables are are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--out -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

""" "        normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
    """

    Example::

""" "        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq >> appended-output.fq"  # noqa
    """

    Example::

""" "        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
    """

    Example::

""" "        normalize-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads")   # noqa
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('-C', '--cutoff', type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force-single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-s', '--savetable', metavar="filename", default='',
                        help='save the k-mer counting table to disk after all'
                        'reads are loaded.')
    parser.add_argument('-R', '--report',
                        metavar='filename', type=argparse.FileType('w'))
    parser.add_argument('-f', '--force', dest='force',
                        help='continue on next file if read errors are \
                         encountered', action='store_true')
    parser.add_argument('-o', '--out', metavar="filename",
                        dest='single_output_file',
                        type=argparse.FileType('w'),
                        default=None, help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    add_loadhash_args(parser)
    return parser
Ejemplo n.º 8
0
def get_parser():
    epilog = (
        """
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force-single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savetable`, the k-mer counting table
    will be saved to the specified file after all sequences have been
    processed. With :option:`-d`, the k-mer counting table will be
    saved every d files for multifile runs; if :option:`-s` is set,
    the specified name will be used, and if not, the name `backup.ct`
    will be used.  :option:`-l`/:option:`--loadtable` will load the
    specified k-mer counting table before processing the specified
    files.  Note that these tables are are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--out -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

"""
        "        normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq >> appended-output.fq"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads"
    )  # noqa
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p',
                        '--paired',
                        action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force-single',
                        dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u',
                        '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-s',
                        '--savetable',
                        metavar="filename",
                        default='',
                        help='save the k-mer counting table to disk after all'
                        'reads are loaded.')
    parser.add_argument('-R',
                        '--report',
                        metavar='filename',
                        type=argparse.FileType('w'))
    parser.add_argument('-f',
                        '--force',
                        dest='force',
                        help='continue on next file if read errors are \
                         encountered',
                        action='store_true')
    parser.add_argument('-o',
                        '--out',
                        metavar="filename",
                        dest='single_output_file',
                        type=argparse.FileType('w'),
                        default=None,
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.',
                        nargs='+')
    add_loadhash_args(parser)
    return parser
Ejemplo n.º 9
0
def get_parser():
    epilog = (
        """
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    Paired end reads will be considered together if :option:`-p` is set. If
    either read will be kept, then both will be kept. This should result in
    keeping (or discarding) each sequencing fragment. This helps with retention
    of repeats, especially. With :option: `-u`/:option:`--unpaired-reads`, 
    unpaired reads from the specified file will be read after the paired data
    is read. 

    With :option:`-s`/:option:`--savetable`, the k-mer counting table
    will be saved to the specified file after all sequences have been
    processed. With :option:`-d`, the k-mer counting table will be
    saved every d files for multifile runs; if :option:`-s` is set,
    the specified name will be used, and if not, the name `backup.ct`
    will be used.  :option:`-l`/:option:`--loadtable` will load the
    specified k-mer counting table before processing the specified
    files.  Note that these tables are are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    :option:`-f`/:option:`--fault-tolerant` will force the program to continue
    upon encountering a formatting error in a sequence file; the k-mer counting
    table up to that point will be dumped, and processing will continue on the
    next file.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--out -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

"""
        "        normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq >> appended-output.fq"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads"
    )  # noqa
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-u',
                        '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='with paired data only,\
                        include an unpaired file')
    parser.add_argument('-s',
                        '--savetable',
                        metavar="filename",
                        default='',
                        help='save the k-mer counting table to disk after all'
                        'reads are loaded.')
    parser.add_argument('-R',
                        '--report',
                        metavar='filename',
                        type=argparse.FileType('w'))
    parser.add_argument('-f',
                        '--fault-tolerant',
                        dest='force',
                        help='continue on next file if read errors are \
                         encountered',
                        action='store_true')
    parser.add_argument('--save-on-failure',
                        dest='fail_save',
                        action='store_false',
                        default=True,
                        help='Save k-mer counting table when an error occurs')
    parser.add_argument('-d',
                        '--dump-frequency',
                        dest='dump_frequency',
                        type=int,
                        help='dump k-mer counting table every d '
                        'files',
                        default=-1)
    parser.add_argument('-o',
                        '--out',
                        metavar="filename",
                        dest='single_output_file',
                        type=argparse.FileType('w'),
                        default=None,
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.',
                        nargs='+')
    parser.add_argument('--report-total-kmers',
                        '-t',
                        action='store_true',
                        help="Prints the total number of k-mers"
                        " post-normalization to stderr")
    parser.add_argument('--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    add_loadhash_args(parser)
    return parser
Ejemplo n.º 10
0
def get_parser():
    epilog = (
        """
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    Paired end reads will be considered together if :option:`-p` is set. If
    either read will be kept, then both will be kept. This should result in
    keeping (or discarding) each sequencing fragment. This helps with retention
    of repeats, especially.

    With :option:`-s`/:option:`--savetable`, the k-mer counting table
    will be saved to the specified file after all sequences have been
    processed. With :option:`-d`, the k-mer counting table will be
    saved every d files for multifile runs; if :option:`-s` is set,
    the specified name will be used, and if not, the name `backup.ct`
    will be used.  :option:`-l`/:option:`--loadtable` will load the
    specified k-mer counting table before processing the specified
    files.  Note that these tables are are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    :option:`-f`/:option:`--fault-tolerant` will force the program to continue
    upon encountering a formatting error in a sequence file; the k-mer counting
    table up to that point will be dumped, and processing will continue on the
    next file.

    Example::

        saturate-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

"""
        "        saturate-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
        """

    Example::

"""
        "        saturate-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
        """

    Example::

"""
        "        saturate-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads"
    )  # noqa
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)", epilog=textwrap.dedent(epilog)
    )
    parser.add_argument("-C", "--cutoff", type=int, default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument("-p", "--paired", action="store_true")
    parser.add_argument("-s", "--savetable", metavar="filename", default="")
    parser.add_argument("-R", "--report", metavar="filename", type=argparse.FileType("w"))
    parser.add_argument("--report-frequency", metavar="report_frequency", default=100000, type=int)
    parser.add_argument(
        "-f",
        "--fault-tolerant",
        dest="force",
        help="continue on next file if read errors are \
                         encountered",
        action="store_true",
    )
    parser.add_argument(
        "-o",
        "--out",
        metavar="filename",
        dest="single_output_filename",
        default="",
        help="only output a single" " file with the specified filename",
    )
    parser.add_argument(
        "input_filenames", metavar="input_sequence_filename", help="Input FAST[AQ] sequence filename.", nargs="+"
    )
    add_loadhash_args(parser)
    return parser