Esempio n. 1
0
def get_parser():
    """Build the command-line parser for trim-low-abund.py.

    The base parser is obtained from ``build_counting_args``.  The input
    filenames, the trimming options and a few expert options are registered
    here; ``add_loadgraph_args`` and ``add_output_compression_type`` are
    called to register their own options on the same parser.

    Returns:
        The configured argument parser.
    """
    epilog = """
    The output is one file for each input file, <input file>.abundtrim, placed
    in the current directory.  This output contains the input sequences
    trimmed at low-abundance k-mers.

    The ``-V/--variable-coverage`` parameter will, if specified,
    prevent elimination of low-abundance reads by only trimming
    low-abundance k-mers from high-abundance reads; use this for
    non-genomic data sets that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use ``load-into-counting.py`` and ``filter-abund.py``.  However, read
    pairs will be kept together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff', '-C', type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--normalize-to', '-Z', type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('-o', '--output', metavar="output_filename",
                        type=argparse.FileType('wb'),
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadgraph_args(parser)
    # The help fragments are joined by implicit concatenation, so the first
    # one must end with a space to keep "all reads" as two words.
    parser.add_argument('-s', '--savegraph', metavar="filename", default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')
    add_output_compression_type(parser)

    return parser
Esempio n. 2
0
def get_parser():
    """Build the command-line parser for extract-long-sequences.py.

    Registers the input filenames, the output file and the minimum length,
    then calls ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``KhmerArgumentParser``.
    """
    usage_example = """\
    Example::

        extract-long-sequences.py --length 10 tests/test-data/paired-mixed.fa
    """
    argp = KhmerArgumentParser(
        description=('Extract FASTQ or FASTA sequences longer than'
                     ' specified length (default: 200 bp).'),
        epilog=textwrap.dedent(usage_example),
    )

    argp.add_argument('input_filenames', nargs='+',
                      help='Input FAST[AQ] sequence filename.')
    argp.add_argument('-o', '--output', metavar='output',
                      type=argparse.FileType('wb'), default=sys.stdout,
                      help='The name of the output sequence file.')
    argp.add_argument('-l', '--length', type=int, default=200,
                      help='The minimum length of the sequence file.')
    add_output_compression_type(argp)
    return argp
Esempio n. 3
0
def get_parser():
    """Build the command-line parser for the reservoir-sampling script.

    Registers the input filenames, the sampling controls (``-N``, ``-M``,
    ``-S``, ``-R``), the pairing/force flags and the output file, then calls
    ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``KhmerArgumentParser``.
    """
    epilog = """\
    Take a list of files containing sequences, and subsample 100,000
    sequences (:option:`-N`/:option:`--num_reads`) uniformly, using
    reservoir sampling.  Stop after first 100m sequences
    (:option:`-M`/:option:`--max_reads`). By default take one subsample,
    but take :option:`-S`/:option:`--samples` samples if specified.

    The output is placed in :option:`-o`/:option:`--output` <file>
    (for a single sample) or in ``<file>.subset.0`` to ``<file>.subset.S-1``
    (for more than one sample).

    This script uses the `reservoir sampling
    <http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.
    """

    parser = KhmerArgumentParser(
        description="Uniformly subsample sequences from a collection of files",
        epilog=textwrap.dedent(epilog))

    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-N',
                        '--num_reads',
                        type=int,
                        dest='num_reads',
                        default=DEFAULT_NUM_READS,
                        help='samples the '
                        'number of sequences or pairs specified with -N')
    parser.add_argument('-M',
                        '--max_reads',
                        type=int,
                        dest='max_reads',
                        default=DEFAULT_MAX_READS)
    parser.add_argument('-S',
                        '--samples',
                        type=int,
                        dest='num_samples',
                        default=1)
    parser.add_argument('-R',
                        '--random-seed',
                        type=int,
                        dest='random_seed',
                        help='Provide a random seed for the generator')
    parser.add_argument('--force_single',
                        default=False,
                        action='store_true',
                        help='Ignore read pair information if present')
    parser.add_argument('-o',
                        '--output',
                        dest='output_file',
                        type=argparse.FileType('wb'),
                        metavar="filename",
                        default=None)
    # Help text previously read "if it exits"; the intended word is "exists".
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Esempio n. 4
0
def get_parser():
    """Build the command-line parser for filter-abund-single.py.

    The base parser is obtained from ``build_counting_args``;
    ``add_threading_args`` and ``add_output_compression_type`` register
    their options on it, and the script-specific arguments are added here.

    Returns:
        The configured argument parser.
    """
    epilog_text = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    argp = build_counting_args(
        descr=("Trims sequences at a minimum k-mer abundance "
               "(in memory version)."),
        epilog=textwrap.dedent(epilog_text),
    )
    add_threading_args(argp)

    argp.add_argument(
        '--cutoff', '-C',
        type=int,
        default=DEFAULT_CUTOFF,
        help="Trim at k-mers below this abundance.",
    )
    argp.add_argument(
        '--savegraph',
        metavar="filename",
        default='',
        help=("If present, the name of the file to save the "
              "k-mer countgraph to"),
    )
    argp.add_argument(
        'datafile',
        metavar='input_sequence_filename',
        help="FAST[AQ] sequence file to trim",
    )
    argp.add_argument(
        '-f', '--force',
        action='store_true',
        default=False,
        help='Overwrite output file if it exists',
    )
    add_output_compression_type(argp)
    return argp
Esempio n. 5
0
def get_parser():
    """Build the command-line parser for interleave-reads.py.

    Registers the two positional inputs (``left`` and ``right``), the output
    file, the version flag and the force flag, then calls
    ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    epilog_text = """\
    The output is an interleaved set of reads, with each read in <R1> paired
    with a read in <R2>. By default, the output goes to stdout unless
    :option:`-o`/:option:`--output` is specified.

    As a "bonus", this file ensures that if read names are not already
    formatted properly, they are reformatted consistently, such that
    they look like the pre-1.8 Casava format (`@name/1`, `@name/2`).

    Example::

        interleave-reads.py tests/test-data/paired.fq.1 \\
                tests/test-data/paired.fq.2 -o paired.fq"""
    argp = argparse.ArgumentParser(
        formatter_class=ComboFormatter,
        description='Produce interleaved files from R1/R2 paired files',
        epilog=textwrap.dedent(epilog_text),
    )

    argp.add_argument('left')
    argp.add_argument('right')
    argp.add_argument(
        '-o', '--output',
        default=sys.stdout,
        type=argparse.FileType('wb'),
        metavar="filename",
    )
    version_text = 'khmer {v}'.format(v=__version__)
    argp.add_argument('--version', action=_VersionStdErrAction,
                      version=version_text)
    argp.add_argument(
        '-f', '--force',
        action='store_true',
        default=False,
        help='Overwrite output file if it exists',
    )
    add_output_compression_type(argp)
    return argp
def get_parser():
    """Build the command-line parser for interleave-reads.py.

    Registers the two positional inputs (``left`` and ``right``), the output
    file, the version flag, the ``--no-reformat`` switch and the force flag,
    then calls ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    epilog = """\
    The output is an interleaved set of reads, with each read in <R1> paired
    with a read in <R2>. By default, the output goes to stdout unless
    :option:`-o`/:option:`--output` is specified.

    As a "bonus", this file ensures that if read names are not already
    formatted properly, they are reformatted consistently, such that
    they look like the pre-1.8 Casava format (`@name/1`, `@name/2`).
    This reformatting can be switched off with the
    :option:`--no-reformat` flag.

    Example::

        interleave-reads.py tests/test-data/paired.fq.1 \\
                tests/test-data/paired.fq.2 -o paired.fq"""
    parser = argparse.ArgumentParser(
        description='Produce interleaved files from R1/R2 paired files',
        epilog=textwrap.dedent(epilog), formatter_class=ComboFormatter)

    parser.add_argument('left')
    parser.add_argument('right')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=argparse.FileType('wb'),
                        default=sys.stdout)
    parser.add_argument('--version', action=_VersionStdErrAction,
                        version='khmer {v}'.format(v=__version__))
    # Adjacent literals instead of a backslash continuation inside the
    # string: the continuation pulled the source indentation into the help
    # text as a long run of spaces between "enforce" and "consistency".
    parser.add_argument('--no-reformat', default=False, action='store_true',
                        help='Do not reformat read names or enforce '
                        'consistency')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Esempio n. 7
0
def get_parser():
    """Build the command-line parser for interleave-reads.py.

    Registers the two positional inputs (``left`` and ``right``), the output
    file, the ``--no-reformat`` switch and the force flag, then calls
    ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``KhmerArgumentParser``.
    """
    epilog = """\
    The output is an interleaved set of reads, with each read in <R1> paired
    with a read in <R2>. By default, the output goes to stdout unless
    :option:`-o`/:option:`--output` is specified.

    As a "bonus", this file ensures that if read names are not already
    formatted properly, they are reformatted consistently, such that
    they look like the pre-1.8 Casava format (`@name/1`, `@name/2`).
    This reformatting can be switched off with the
    :option:`--no-reformat` flag.

    Example::

        interleave-reads.py tests/test-data/paired.fq.1 \\
                tests/test-data/paired.fq.2 -o paired.fq"""
    parser = KhmerArgumentParser(
        description='Produce interleaved files from R1/R2 paired files',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('left')
    parser.add_argument('right')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=khFileType('wb'),
                        default=sys.stdout)
    # Adjacent literals instead of a backslash continuation inside the
    # string: the continuation pulled the source indentation into the help
    # text as a long run of spaces between "enforce" and "consistency".
    parser.add_argument('--no-reformat', default=False, action='store_true',
                        help='Do not reformat read names or enforce '
                        'consistency')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Esempio n. 8
0
def get_parser():
    """Build the command-line parser for the FASTQ-to-FASTA converter.

    Registers the input file, the output file, the ``-n/--n_keep`` switch
    and the version flag, then calls ``add_output_compression_type`` on the
    parser.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    argp = argparse.ArgumentParser(
        formatter_class=ComboFormatter,
        description='Converts FASTQ format (.fq) files to FASTA format (.fa).')

    argp.add_argument('input_sequence',
                      help='The name of the input FASTQ sequence file.')
    argp.add_argument('-o', '--output', metavar="filename",
                      default=sys.stdout, type=argparse.FileType('wb'),
                      help='The name of the output FASTA sequence file.')
    argp.add_argument('-n', '--n_keep', action='store_true', default=False,
                      help="Option to keep reads containing 'N's in "
                           "input_sequence file. Default is to drop reads")
    argp.add_argument('--version', action=_VersionStdErrAction,
                      version='khmer {v}'.format(v=__version__))
    add_output_compression_type(argp)
    return argp
Esempio n. 9
0
def get_parser():
    """Build the command-line parser for extract-paired-reads.py.

    Registers the optional input file (defaulting to ``/dev/stdin``), the
    output directory, the explicit paired/single output files and the force
    flag, then calls ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``KhmerArgumentParser``.
    """
    epilog_text = """\
    Many read-handling programs (assemblers, mappers, etc.) require
    that you give them either perfectly interleaved files, or files
    containing only single reads. This script takes files that were
    originally interleaved but where reads may have been orphaned (via
    error filtering, application of abundance filtering, digital
    normalization in non-paired mode, or partitioning) and separates
    the interleaved reads from the orphaned reads.

    The default output is two files, `<input file>.pe` and `<input
    file>.se`, placed in the current directory. The .pe file contains
    interleaved and properly paired sequences, while the .se file
    contains orphan sequences.

    The directory into which the interleaved and orphaned reads are
    output may be specified using :option:`-d`/:option:`--output-dir`.
    This directory will be created if it does not already exist.

    Alternatively, you can specify the filenames directly with
    :option:`-p`/:option:`--output-paired` and
    :option:`-s`/:option:`--output-single`, which will override the
    :option:`-d`/:option:`--output-dir` option.

    Example::

        extract-paired-reads.py tests/test-data/paired.fq
    """
    argp = KhmerArgumentParser(
        description=('Take a mixture of reads and split into pairs and '
                     'orphans.'),
        epilog=textwrap.dedent(epilog_text))
    argp.add_argument('infile', nargs='?', default='/dev/stdin')
    argp.add_argument('-d', '--output-dir', default='',
                      help='Output split reads to specified directory. '
                           'Creates directory if necessary')
    argp.add_argument('-p', '--output-paired', metavar="filename",
                      type=khFileType('wb'), default=None,
                      help='Output paired reads to this file')
    argp.add_argument('-s', '--output-single', metavar="filename",
                      type=khFileType('wb'), default=None,
                      help='Output orphaned reads to this file')
    argp.add_argument('-f', '--force', action='store_true', default=False,
                      help='Overwrite output file if it exists')
    add_output_compression_type(argp)
    return argp
Esempio n. 10
0
def get_parser():
    """Build the command-line parser for extract-partitions.py.

    Registers the output prefix, the partition input files, the size
    thresholds, the output switches, the version flag and the force flag,
    then calls ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    epilog_text = """
    Example (results will be in ``example.group0000.fa``)::

        load-graph.py -k 20 example tests/test-data/random-20-a.fa
        partition-graph.py example
        merge-partitions.py -k 20 example
        annotate-partitions.py -k 20 example tests/test-data/random-20-a.fa
        extract-partitions.py example random-20-a.fa.part

    (:program:`extract-partitions.py` will produce a partition size
    distribution in <base>.dist. The columns are: (1) number of reads,
    (2) count of partitions with n reads, (3) cumulative sum of partitions,
    (4) cumulative sum of reads.)
    """
    argp = argparse.ArgumentParser(
        formatter_class=ComboFormatter,
        description=("Separate sequences that are annotated with partitions "
                     "into grouped files."),
        epilog=textwrap.dedent(epilog_text))
    argp.add_argument('prefix', metavar='output_filename_prefix')
    argp.add_argument('part_filenames', nargs='+',
                      metavar='input_partition_filename')
    argp.add_argument('--max-size', '-X', dest='max_size', type=int,
                      default=DEFAULT_MAX_SIZE,
                      help='Max group size (n sequences)')
    argp.add_argument('--min-partition-size', '-m', dest='min_part_size',
                      type=int, default=DEFAULT_THRESHOLD,
                      help='Minimum partition size worth keeping')
    argp.add_argument('--no-output-groups', '-n', dest='output_groups',
                      action='store_false', default=True,
                      help='Do not actually output groups files.')
    argp.add_argument('--output-unassigned', '-U', action='store_true',
                      default=False,
                      help='Output unassigned sequences, too')
    argp.add_argument('--version', action=_VersionStdErrAction,
                      version='khmer {v}'.format(v=__version__))
    argp.add_argument('-f', '--force', action='store_true', default=False,
                      help='Overwrite output file if it exists')
    add_output_compression_type(argp)
    return argp
Esempio n. 11
0
def get_parser():
    """Build the command-line parser for split-paired-reads.py.

    Registers the optional input file (defaulting to ``/dev/stdin``), the
    output directory, the orphaned/first/second output files, the version
    flag and the force flag, then calls ``add_output_compression_type`` on
    the parser.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    # The epilog must only cite options that are registered below: the
    # orphan option is ``--output-orphaned`` and the output directory is
    # selected with ``-d`` (there is no ``-o`` option on this parser).
    epilog = """
    Some programs want paired-end read input in the One True Format, which is
    interleaved; other programs want input in the Insanely Bad Format, with
    left- and right- reads separated. This reformats the former to the latter.

    The directory into which the left- and right- reads are output may be
    specified using :option:`-d`/:option:`--output-dir`. This directory will be
    created if it does not already exist.

    Alternatively, you can specify the filenames directly with
    :option:`-1`/:option:`--output-first` and
    :option:`-2`/:option:`--output-second`, which will override the
    :option:`-d`/:option:`--output-dir` setting on a file-specific basis.

    :option:`-0`/:option:`--output-orphaned` will allow broken-paired format,
    and orphaned reads will be saved separately, to the specified file.

    Example::

        split-paired-reads.py tests/test-data/paired.fq

    Example::

        split-paired-reads.py -d ~/reads-go-here tests/test-data/paired.fq

    Example::

        split-paired-reads.py -1 reads.1 -2 reads.2 tests/test-data/paired.fq
    """
    parser = argparse.ArgumentParser(
        description='Split interleaved reads into two files, left and right.',
        epilog=textwrap.dedent(epilog),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('infile', nargs='?', default='/dev/stdin')

    parser.add_argument('-d', '--output-dir', metavar="output_directory",
                        dest='output_directory', default='', help='Output '
                        'split reads to specified directory. Creates '
                        'directory if necessary')
    parser.add_argument('-0', '--output-orphaned', metavar='output_orphaned',
                        help='Allow "orphaned" reads and extract them to ' +
                        'this file',
                        type=argparse.FileType('wb'))
    parser.add_argument('-1', '--output-first', metavar='output_first',
                        default=None, help='Output "left" reads to this '
                        'file', type=argparse.FileType('wb'))
    parser.add_argument('-2', '--output-second', metavar='output_second',
                        default=None, help='Output "right" reads to this '
                        'file', type=argparse.FileType('wb'))
    parser.add_argument('--version', action='version', version='%(prog)s ' +
                        khmer.__version__)
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Esempio n. 12
0
def get_parser():
    """Build the command-line parser for extract-paired-reads.py.

    Registers the optional input file (defaulting to ``/dev/stdin``), the
    version flag, the output directory, the explicit paired/single output
    files and the force flag, then calls ``add_output_compression_type`` on
    the parser.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    epilog_text = """
    Many read-handling programs (assemblers, mappers, etc.) require
    that you give them either perfectly interleaved files, or files
    containing only single reads. This script takes files that were
    originally interleaved but where reads may have been orphaned (via
    error filtering, application of abundance filtering, digital
    normalization in non-paired mode, or partitioning) and separates
    the interleaved reads from the orphaned reads.

    The default output is two files, `<input file>.pe` and `<input
    file>.se`, placed in the current directory. The .pe file contains
    interleaved and properly paired sequences, while the .se file
    contains orphan sequences.

    The directory into which the interleaved and orphaned reads are
    output may be specified using :option:`-d`/:option:`--output-dir`.
    This directory will be created if it does not already exist.

    Alternatively, you can specify the filenames directly with
    :option:`-p`/:option:`--output-paired` and
    :option:`-s`/:option:`--output-single`, which will override the
    :option:`-d`/:option:`--output-dir` option.

    Example::

        extract-paired-reads.py tests/test-data/paired.fq
    """
    argp = argparse.ArgumentParser(
        description=('Take a mixture of reads and split into pairs and '
                     'orphans.'),
        epilog=textwrap.dedent(epilog_text),
    )
    argp.add_argument('infile', nargs='?', default='/dev/stdin')
    argp.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + khmer.__version__,
    )

    argp.add_argument(
        '-d', '--output-dir',
        default='',
        help=('Output split reads to specified directory. '
              'Creates directory if necessary'),
    )

    argp.add_argument(
        '--output-paired', '-p',
        metavar="filename",
        type=argparse.FileType('wb'),
        default=None,
        help='Output paired reads to this file',
    )
    argp.add_argument(
        '--output-single', '-s',
        metavar="filename",
        type=argparse.FileType('wb'),
        default=None,
        help='Output orphaned reads to this file',
    )
    argp.add_argument(
        '-f', '--force',
        action='store_true',
        default=False,
        help='Overwrite output file if it exists',
    )
    add_output_compression_type(argp)
    return argp
Esempio n. 13
0
def get_parser():
    """Build the command-line parser for extract-partitions.py.

    Registers the output prefix, the partition input files, the size
    thresholds, the output switches and the force flag, then calls
    ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``KhmerArgumentParser``.
    """
    epilog_text = """
    Example (results will be in ``example.group0000.fa``)::

        load-graph.py -k 20 example tests/test-data/random-20-a.fa
        partition-graph.py example
        merge-partitions.py -k 20 example
        annotate-partitions.py -k 20 example tests/test-data/random-20-a.fa
        extract-partitions.py example random-20-a.fa.part

    (:program:`extract-partitions.py` will produce a partition size
    distribution in <base>.dist. The columns are: (1) number of reads,
    (2) count of partitions with n reads, (3) cumulative sum of partitions,
    (4) cumulative sum of reads.)
    """
    argp = KhmerArgumentParser(
        description=('Separate sequences that are annotated with partitions '
                     'into grouped files.'),
        epilog=textwrap.dedent(epilog_text),
        citations=['graph'])
    argp.add_argument('prefix', metavar='output_filename_prefix')
    argp.add_argument('part_filenames', nargs='+',
                      metavar='input_partition_filename')
    argp.add_argument('--max-size', '-X', dest='max_size', type=int,
                      default=DEFAULT_MAX_SIZE,
                      help='Max group size (n sequences)')
    argp.add_argument('--min-partition-size', '-m', dest='min_part_size',
                      type=int, default=DEFAULT_THRESHOLD,
                      help='Minimum partition size worth keeping')
    argp.add_argument('--no-output-groups', '-n', dest='output_groups',
                      action='store_false', default=True,
                      help='Do not actually output groups files.')
    argp.add_argument('--output-unassigned', '-U', action='store_true',
                      default=False,
                      help='Output unassigned sequences, too')
    argp.add_argument('-f', '--force', action='store_true', default=False,
                      help='Overwrite output file if it exists')
    add_output_compression_type(argp)
    return argp
Esempio n. 14
0
def get_parser():
    """Build the command-line parser for split-paired-reads.py.

    Registers the optional input file (defaulting to ``/dev/stdin``), the
    output directory, the orphaned/first/second output files, the version
    flag and the force flag, then calls ``add_output_compression_type`` on
    the parser.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    # The orphan option is cited by its registered name, ``--output-orphaned``,
    # with a properly opened backtick in the ``:option:`` role.
    epilog = """\
    Some programs want paired-end read input in the One True Format, which is
    interleaved; other programs want input in the Insanely Bad Format, with
    left- and right- reads separated. This reformats the former to the latter.
    The directory into which the left- and right- reads are output may be
    specified using :option:`-d`/:option:`--output-dir`. This directory will be
    created if it does not already exist.
    Alternatively, you can specify the filenames directly with
    :option:`-1`/:option:`--output-first` and
    :option:`-2`/:option:`--output-second`, which will override the
    :option:`-d`/:option:`--output-dir` setting on a file-specific basis.
    :option:`-0`/:option:`--output-orphaned` will allow broken-paired format,
    and orphaned reads will be saved separately, to the specified file.
    Example::
        split-paired-reads.py tests/test-data/paired.fq
    Example::
        split-paired-reads.py -0 reads-output-file tests/test-data/paired.fq
    Example::
        split-paired-reads.py -1 reads.1 -2 reads.2 tests/test-data/paired.fq
    """
    parser = argparse.ArgumentParser(
        description='Split interleaved reads into two files, left and right.',
        epilog=textwrap.dedent(epilog),
        formatter_class=ComboFormatter)

    parser.add_argument('infile', nargs='?', default='/dev/stdin')

    parser.add_argument('-d', '--output-dir', metavar="output_directory",
                        dest='output_directory', default='', help='Output '
                        'split reads to specified directory. Creates '
                        'directory if necessary')
    parser.add_argument('-0', '--output-orphaned', metavar='output_orphaned',
                        help='Allow "orphaned" reads and extract them to ' +
                        'this file',
                        type=argparse.FileType('wb'))
    parser.add_argument('-1', '--output-first', metavar='output_first',
                        default=None, help='Output "left" reads to this '
                        'file', type=argparse.FileType('wb'))
    parser.add_argument('-2', '--output-second', metavar='output_second',
                        default=None, help='Output "right" reads to this '
                        'file', type=argparse.FileType('wb'))
    parser.add_argument('--version', action=_VersionStdErrAction,
                        version='khmer {v}'.format(v=__version__))
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Esempio n. 15
0
def get_parser():
    """Build the command-line parser for the FASTQ-to-FASTA converter.

    Registers the input file, the output file and the ``-n/--n_keep``
    switch, then calls ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``KhmerArgumentParser``.
    """
    argp = KhmerArgumentParser(
        description='Converts FASTQ format (.fq) files to FASTA format (.fa).',
    )

    argp.add_argument(
        'input_sequence',
        help='The name of the input FASTQ sequence file.',
    )
    argp.add_argument(
        '-o', '--output',
        default=sys.stdout,
        type=khFileType('wb'),
        metavar="filename",
        help='The name of the output FASTA sequence file.',
    )
    argp.add_argument(
        '-n', '--n_keep',
        action='store_true',
        default=False,
        help=("Option to keep reads containing 'N's in "
              "input_sequence file. Default is to drop reads"),
    )
    add_output_compression_type(argp)
    return argp
Esempio n. 16
0
def get_parser():
    """Build the command-line parser for the FASTQ-to-FASTA converter.

    Registers the input file, the output file and the ``-n/--n_keep``
    switch, then calls ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``KhmerArgumentParser``.
    """
    argp = KhmerArgumentParser(
        description='Converts FASTQ format (.fq) files to FASTA format (.fa).',
    )

    argp.add_argument(
        'input_sequence',
        help='The name of the input FASTQ sequence file.',
    )
    argp.add_argument(
        '-o', '--output',
        default=sys.stdout,
        type=khFileType('wb'),
        metavar="filename",
        help='The name of the output FASTA sequence file.',
    )
    argp.add_argument(
        '-n', '--n_keep',
        action='store_true',
        default=False,
        help=("Option to keep reads containing 'N's in "
              "input_sequence file. Default is to drop reads"),
    )
    add_output_compression_type(argp)
    return argp
Esempio n. 17
0
def get_parser():
    """Build the command-line parser for the long-sequence extraction script.

    Registers the input filenames, the output file and the minimum length,
    then calls ``add_output_compression_type`` on the parser.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    argp = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=('Extract FASTQ or FASTA sequences longer than'
                     ' specified length (default: 200 bp).'),
    )

    argp.add_argument(
        'input_filenames',
        nargs='+',
        help='Input FAST[AQ] sequence filename.',
    )
    argp.add_argument(
        '-o', '--output',
        metavar='output',
        type=argparse.FileType('wb'),
        default=sys.stdout,
        help='The name of the output sequence file.',
    )
    argp.add_argument(
        '-l', '--length',
        type=int,
        default=200,
        help='The minimum length of the sequence file.',
    )
    add_output_compression_type(argp)
    return argp
Esempio n. 18
0
def get_parser():
    """Build the command-line parser for filter-abund.py.

    Registers the countgraph and sequence inputs, the trimming options, the
    single-output override and the version/force/quiet flags.
    ``add_threading_args`` and ``add_output_compression_type`` are called to
    register their own options on the same parser.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    epilog_text = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt`` for each input sequence file. If
    the input sequences are from RNAseq or metagenome sequencing then
    :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 countgraph data/100k-filtered.fa
        filter-abund.py -C 2 countgraph data/100k-filtered.fa
    """
    argp = argparse.ArgumentParser(
        formatter_class=ComboFormatter,
        description='Trim sequences at a minimum k-mer abundance.',
        epilog=textwrap.dedent(epilog_text),
    )
    argp.add_argument(
        'input_graph',
        metavar='input_count_graph_filename',
        help='The input k-mer countgraph filename',
    )
    argp.add_argument(
        'input_filename',
        nargs='+',
        metavar='input_sequence_filename',
        help='Input FAST[AQ] sequence filename',
    )
    add_threading_args(argp)
    argp.add_argument(
        '--cutoff', '-C',
        dest='cutoff',
        type=check_argument_range(0, 256, 'cutoff'),
        default=DEFAULT_CUTOFF,
        help="Trim at k-mers below this abundance.",
    )
    argp.add_argument(
        '--variable-coverage', '-V',
        dest='variable_coverage',
        action='store_true',
        default=False,
        help=('Only trim low-abundance k-mers from sequences '
              'that have high coverage.'),
    )
    argp.add_argument(
        '--normalize-to', '-Z',
        dest='normalize_to',
        type=int,
        default=DEFAULT_NORMALIZE_LIMIT,
        help=('Base the variable-coverage cutoff on this median'
              ' k-mer abundance.'),
    )
    argp.add_argument(
        '-o', '--output',
        dest='single_output_file',
        metavar="optional_output_filename",
        type=argparse.FileType('wb'),
        help=('Output the trimmed sequences into a single file '
              'with the given filename instead of creating a new '
              'file for each input file.'),
    )
    argp.add_argument(
        '--version',
        action=_VersionStdErrAction,
        version='khmer {v}'.format(v=__version__),
    )
    argp.add_argument(
        '-f', '--force',
        action='store_true',
        default=False,
        help='Overwrite output file if it exists',
    )
    argp.add_argument(
        '-q', '--quiet',
        dest='quiet',
        action='store_true',
        default=False,
    )
    add_output_compression_type(argp)
    return argp
Esempio n. 19
0
def get_parser():
    """Build and return the command-line argument parser.

    Starts from ``build_counting_args``, applies ``add_threading_args``,
    registers the trimming and output options, and finally applies
    ``add_output_compression_type``. The epilog example shows an
    invocation of ``filter-abund-single.py``.
    """
    usage_notes = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    arg_parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
              "(in memory version).",
        epilog=textwrap.dedent(usage_notes),
        citations=['counting', 'SeqAn'],
    )
    add_threading_args(arg_parser)

    # Short alias; options are registered in the same order as before so
    # the generated help text is unchanged.
    add_option = arg_parser.add_argument

    add_option('--cutoff', '-C',
               type=check_argument_range(0, 256, "cutoff"),
               default=DEFAULT_CUTOFF,
               help="Trim at k-mers below this abundance.")
    add_option('--variable-coverage', '-V',
               dest='variable_coverage',
               action='store_true',
               default=False,
               help='Only trim low-abundance k-mers '
                    'from sequences that have high coverage.')
    add_option('--normalize-to', '-Z',
               dest='normalize_to',
               type=int,
               default=DEFAULT_NORMALIZE_LIMIT,
               help='Base the variable-coverage cutoff '
                    'on this median k-mer abundance.')
    add_option('--savegraph',
               metavar="filename",
               default='',
               help="If present, the name of the file "
                    "to save the k-mer countgraph to")
    add_option('-o', '--outfile',
               metavar='optional_output_filename',
               default=None,
               help='Override default output filename and output trimmed '
                    'sequences into a file with the given filename.')
    add_option('datafile',
               metavar='input_sequence_filename',
               help="FAST[AQ] sequence file to trim")
    add_option('-f', '--force',
               action='store_true',
               default=False,
               help='Overwrite output file if it exists')
    add_option('-q', '--quiet',
               dest='quiet',
               action='store_true',
               default=False)

    add_output_compression_type(arg_parser)
    return arg_parser
Esempio n. 20
0
def get_parser():
    """Build and return the command-line argument parser.

    The base parser comes from ``build_counting_args``; after
    ``add_threading_args`` is applied, the script-specific options are
    registered from a table, and ``add_output_compression_type`` is applied
    last. The epilog example shows ``filter-abund-single.py``.
    """
    usage_notes = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    arg_parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
              "(in memory version).",
        epilog=textwrap.dedent(usage_notes),
        citations=['counting', 'SeqAn'],
    )
    add_threading_args(arg_parser)

    # (flags, keyword settings) pairs, listed in registration order; the
    # order determines how the options appear in --help.
    option_table = [
        (('-C', '--cutoff'),
         {'default': DEFAULT_CUTOFF,
          'type': check_argument_range(0, 256, "cutoff"),
          'help': "Trim at k-mers below this abundance."}),
        (('-V', '--variable-coverage'),
         {'action': 'store_true',
          'dest': 'variable_coverage',
          'default': False,
          'help': 'Only trim low-abundance k-mers '
                  'from sequences that have high coverage.'}),
        (('-Z', '--normalize-to'),
         {'type': int,
          'dest': 'normalize_to',
          'default': DEFAULT_NORMALIZE_LIMIT,
          'help': 'Base the variable-coverage cutoff '
                  'on this median k-mer abundance.'}),
        (('--savegraph',),
         {'metavar': "filename",
          'default': '',
          'help': "If present, the name of the file "
                  "to save the k-mer countgraph to"}),
        (('-o', '--outfile'),
         {'metavar': 'optional_output_filename',
          'default': None,
          'help': 'Override default output filename and output trimmed '
                  'sequences into a file with the given filename.'}),
        (('datafile',),
         {'metavar': 'input_sequence_filename',
          'help': "FAST[AQ] sequence file to trim"}),
        (('-f', '--force'),
         {'default': False,
          'action': 'store_true',
          'help': 'Overwrite output file if it exists'}),
        (('-q', '--quiet'),
         {'dest': 'quiet',
          'default': False,
          'action': 'store_true'}),
    ]
    for flags, settings in option_table:
        arg_parser.add_argument(*flags, **settings)

    add_output_compression_type(arg_parser)
    return arg_parser
Esempio n. 21
0
def get_parser():
    """Build and return the command-line argument parser.

    Creates a ``KhmerArgumentParser`` taking a countgraph filename plus one
    or more sequence filenames, then registers the trimming and output
    options. ``add_threading_args`` and ``add_output_compression_type``
    contribute their own options. The epilog example shows
    ``filter-abund.py``.
    """
    usage_notes = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt`` for each input sequence file. If
    the input sequences are from RNAseq or metagenome sequencing then
    :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 countgraph data/100k-filtered.fa
        filter-abund.py -C 2 countgraph data/100k-filtered.fa
    """
    arg_parser = KhmerArgumentParser(
        citations=['counting'],
        description='Trim sequences at a minimum k-mer abundance.',
        epilog=textwrap.dedent(usage_notes),
    )
    add_option = arg_parser.add_argument

    # Positional arguments: the countgraph first, then the sequence files.
    add_option('input_graph',
               metavar='input_count_graph_filename',
               help='The input k-mer countgraph filename')
    add_option('input_filename',
               nargs='+',
               metavar='input_sequence_filename',
               help='Input FAST[AQ] sequence filename')

    add_threading_args(arg_parser)

    add_option('--cutoff', '-C',
               dest='cutoff',
               type=check_argument_range(0, 256, 'cutoff'),
               default=DEFAULT_CUTOFF,
               help="Trim at k-mers below this abundance.")
    add_option('--variable-coverage', '-V',
               dest='variable_coverage',
               action='store_true',
               default=False,
               help='Only trim low-abundance k-mers '
                    'from sequences that have high coverage.')
    add_option('--normalize-to', '-Z',
               dest='normalize_to',
               type=int,
               default=DEFAULT_NORMALIZE_LIMIT,
               help='Base the variable-coverage cutoff '
                    'on this median k-mer abundance.')
    add_option('-o', '--output',
               dest='single_output_file',
               metavar="optional_output_filename",
               type=khFileType('wb'),
               help='Output the trimmed sequences into a single file with '
                    'the given filename instead of creating a new file for '
                    'each input file.')
    add_option('-f', '--force',
               action='store_true',
               default=False,
               help='Overwrite output file if it exists')
    add_option('-q', '--quiet',
               dest='quiet',
               action='store_true',
               default=False)

    add_output_compression_type(arg_parser)
    return arg_parser
Esempio n. 22
0
def get_parser():
    """Build and return the argument parser for FASTQ-to-FASTA conversion.

    Takes one positional input file, an optional ``-o/--output`` file
    (defaulting to ``sys.stdout``), the ``-n/--n_keep`` flag, and
    ``--version``; ``add_output_compression_type`` is applied last.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=ComboFormatter,
        description='Converts FASTQ format (.fq) files to FASTA format (.fa).',
    )
    add_option = arg_parser.add_argument

    add_option('input_sequence',
               help='The name of the input FASTQ sequence file.')
    add_option('-o', '--output',
               default=sys.stdout,
               type=argparse.FileType('wb'),
               metavar="filename",
               help='The name of the output FASTA sequence file.')
    add_option('-n', '--n_keep',
               action='store_true',
               default=False,
               help="Option to keep reads containing 'N's in "
                    "input_sequence file. Default is to drop reads")
    add_option('--version',
               action=_VersionStdErrAction,
               version='khmer {0}'.format(__version__))

    add_output_compression_type(arg_parser)
    return arg_parser
Esempio n. 23
0
def get_parser():
    """Build and return the argument parser for extract-partitions.py.

    The parser takes an output filename prefix and one or more partition
    files, plus options controlling group size, minimum partition size,
    and which outputs are written.
    """
    usage_notes = """
    Example (results will be in ``example.group0000.fa``)::

        load-graph.py -k 20 example tests/test-data/random-20-a.fa
        partition-graph.py example
        merge-partitions.py -k 20 example
        annotate-partitions.py -k 20 example tests/test-data/random-20-a.fa
        extract-partitions.py example random-20-a.fa.part

    (:program:`extract-partitions.py` will produce a partition size
    distribution in <base>.dist. The columns are: (1) number of reads,
    (2) count of partitions with n reads, (3) cumulative sum of partitions,
    (4) cumulative sum of reads.)
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=ComboFormatter,
        description="Separate sequences that are annotated with "
                    "partitions into grouped files.",
        epilog=textwrap.dedent(usage_notes),
    )
    add_option = arg_parser.add_argument

    # Positionals: output prefix, then the annotated partition files.
    add_option('prefix', metavar='output_filename_prefix')
    add_option('part_filenames',
               nargs='+',
               metavar='input_partition_filename')

    add_option('--max-size', '-X',
               dest='max_size',
               type=int,
               default=DEFAULT_MAX_SIZE,
               help='Max group size (n sequences)')
    add_option('--min-partition-size', '-m',
               dest='min_part_size',
               type=int,
               default=DEFAULT_THRESHOLD,
               help='Minimum partition size worth keeping')
    # Stored inverted: passing the flag sets output_groups to False.
    add_option('--no-output-groups', '-n',
               dest='output_groups',
               action='store_false',
               default=True,
               help='Do not actually output groups files.')
    add_option('--output-unassigned', '-U',
               action='store_true',
               default=False,
               help='Output unassigned sequences, too')
    add_option('--version',
               action=_VersionStdErrAction,
               version='khmer {0}'.format(__version__))
    add_option('-f', '--force',
               action='store_true',
               default=False,
               help='Overwrite output file if it exists')

    add_output_compression_type(arg_parser)
    return arg_parser
Esempio n. 24
0
def get_parser():
    """Build and return the argument parser for the subsampling script.

    The parser accepts one or more input filenames and options for the
    number of reads to sample (``-N``), the maximum number of reads to
    examine (``-M``), the number of samples (``-S``), a random seed
    (``-R``), and the output file (``-o``). Option defaults are shown in
    ``--help`` via ``argparse.ArgumentDefaultsHelpFormatter`` for options
    that carry help text.
    """
    epilog = ("""

    Take a list of files containing sequences, and subsample 100,000
    sequences (:option:`-N`/:option:`--num_reads`) uniformly, using
    reservoir sampling.  Stop after first 100m sequences
    (:option:`-M`/:option:`--max_reads`). By default take one subsample,
    but take :option:`-S`/:option:`--samples` samples if specified.

    The output is placed in :option:`-o`/:option:`--output` <file>
    (for a single sample) or in <file>.subset.0 to <file>.subset.S-1
    (for more than one sample).

    This script uses the `reservoir sampling
    <http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.
    """)   # noqa

    parser = argparse.ArgumentParser(
        description="Uniformly subsample sequences from a collection of files",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        epilog=textwrap.dedent(epilog))

    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-N', '--num_reads', type=int, dest='num_reads',
                        default=DEFAULT_NUM_READS)
    parser.add_argument('-M', '--max_reads', type=int, dest='max_reads',
                        default=DEFAULT_MAX_READS)
    parser.add_argument('-S', '--samples', type=int, dest='num_samples',
                        default=1)
    parser.add_argument('-R', '--random-seed', type=int, dest='random_seed')
    parser.add_argument('--force_single', default=False, action='store_true',
                        help='Ignore read pair information if present')
    parser.add_argument('-o', '--output', dest='output_file',
                        type=argparse.FileType('wb'),
                        metavar="filename", default=None)
    parser.add_argument('--version', action='version', version='%(prog)s ' +
                        khmer.__version__)
    # Help text previously read "if it exits"; corrected to "exists".
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Esempio n. 25
0
def get_parser():
    """Build and return the argument parser for extract-long-sequences.py.

    Accepts one or more input files, an output file (``-o``, defaulting to
    ``sys.stdout``), and a minimum length (``-l``, default 200).
    ``add_output_compression_type`` is applied last.
    """
    example_text = textwrap.dedent("""\
    Example::

        extract-long-sequences.py --length 10 tests/test-data/paired-mixed.fa
    """)
    arg_parser = KhmerArgumentParser(
        epilog=example_text,
        description='Extract FASTQ or FASTA sequences longer '
                    'than specified length (default: 200 bp).',
    )
    add_option = arg_parser.add_argument

    add_option('input_filenames',
               nargs='+',
               help='Input FAST[AQ] sequence filename.')
    add_option('-o', '--output',
               type=argparse.FileType('wb'),
               metavar='output',
               default=sys.stdout,
               help='The name of the output sequence file.')
    add_option('-l', '--length',
               type=int,
               default=200,
               help='The minimum length of the sequence file.')

    add_output_compression_type(arg_parser)
    return arg_parser
Esempio n. 26
0
def get_parser():
    """Build and return the command-line argument parser.

    The base parser comes from ``build_counting_args``; this function
    applies ``add_threading_args`` and registers ``--cutoff``,
    ``--savegraph``, the positional ``datafile``, and ``--force`` before
    applying ``add_output_compression_type``. The epilog example shows
    ``filter-abund-single.py``.
    """
    usage_notes = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    arg_parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
              "(in memory version).",
        epilog=textwrap.dedent(usage_notes),
    )
    add_threading_args(arg_parser)

    add_option = arg_parser.add_argument
    add_option('--cutoff', '-C', type=int, default=DEFAULT_CUTOFF,
               help="Trim at k-mers below this abundance.")
    add_option('--savegraph', default='', metavar="filename",
               help="If present, the name of the file "
                    "to save the k-mer countgraph to")
    add_option('datafile', metavar='input_sequence_filename',
               help="FAST[AQ] sequence file to trim")
    add_option('-f', '--force', action='store_true', default=False,
               help='Overwrite output file if it exists')

    add_output_compression_type(arg_parser)
    return arg_parser
Esempio n. 27
0
def get_parser():
    """Build and return the argument parser for the subsampling script.

    The parser accepts one or more input filenames and options for the
    number of reads to sample (``-N``), the maximum number of reads to
    examine (``-M``), the number of samples (``-S``), a random seed
    (``-R``), and the output file (``-o``). Option defaults are shown in
    ``--help`` via ``argparse.ArgumentDefaultsHelpFormatter`` for options
    that carry help text.
    """
    epilog = """

    Take a list of files containing sequences, and subsample 100,000
    sequences (:option:`-N`/:option:`--num_reads`) uniformly, using
    reservoir sampling.  Stop after first 100m sequences
    (:option:`-M`/:option:`--max_reads`). By default take one subsample,
    but take :option:`-S`/:option:`--samples` samples if specified.

    The output is placed in :option:`-o`/:option:`--output` <file>
    (for a single sample) or in `<file>.subset.0` to `<file>.subset.S-1`
    (for more than one sample).

    This script uses the `reservoir sampling
    <http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.
    """  # noqa

    parser = argparse.ArgumentParser(
        description="Uniformly subsample sequences from a collection of files",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        epilog=textwrap.dedent(epilog),
    )

    parser.add_argument("filenames", nargs="+")
    parser.add_argument("-N", "--num_reads", type=int, dest="num_reads", default=DEFAULT_NUM_READS)
    parser.add_argument("-M", "--max_reads", type=int, dest="max_reads", default=DEFAULT_MAX_READS)
    parser.add_argument("-S", "--samples", type=int, dest="num_samples", default=1)
    parser.add_argument(
        "-R", "--random-seed", type=int, dest="random_seed", help="Provide a random seed for the generator"
    )
    parser.add_argument(
        "--force_single", default=False, action="store_true", help="Ignore read pair information if present"
    )
    parser.add_argument(
        "-o", "--output", dest="output_file", type=argparse.FileType("wb"), metavar="filename", default=None
    )
    parser.add_argument("--version", action="version", version="%(prog)s " + khmer.__version__)
    # Help text previously read "if it exits"; corrected to "exists".
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    add_output_compression_type(parser)
    return parser
Esempio n. 28
0
def get_parser():
    """Build and return the argument parser for interleave-reads.py.

    Takes two positional inputs (``left`` and ``right``), an optional
    ``-o/--output`` file defaulting to ``sys.stdout``, and the
    ``--no-reformat`` and ``--force`` flags.
    ``add_output_compression_type`` is applied last.
    """
    epilog = """\
    The output is an interleaved set of reads, with each read in <R1> paired
    with a read in <R2>. By default, the output goes to stdout unless
    :option:`-o`/:option:`--output` is specified.

    As a "bonus", this file ensures that if read names are not already
    formatted properly, they are reformatted consistently, such that
    they look like the pre-1.8 Casava format (`@name/1`, `@name/2`).
    This reformatting can be switched off with the
    :option:`--no-reformat` flag.

    Example::

        interleave-reads.py tests/test-data/paired.fq.1 \\
                tests/test-data/paired.fq.2 -o paired.fq"""
    parser = argparse.ArgumentParser(
        description="Produce interleaved files from R1/R2 paired files",
        epilog=textwrap.dedent(epilog),
        formatter_class=ComboFormatter,
    )

    parser.add_argument("left")
    parser.add_argument("right")
    parser.add_argument("-o", "--output", metavar="filename", type=argparse.FileType("wb"), default=sys.stdout)
    parser.add_argument("--version", action=_VersionStdErrAction, version="khmer {v}".format(v=__version__))
    # The help text used a backslash continuation inside the literal, which
    # embedded the next line's source indentation in the string. Adjacent
    # literals give a single-spaced sentence instead.
    parser.add_argument(
        "--no-reformat",
        default=False,
        action="store_true",
        help="Do not reformat read names or enforce "
             "consistency",
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    add_output_compression_type(parser)
    return parser
def get_parser():
    """Build and return the argument parser for split-paired-reads.py.

    Takes an optional positional input file (default ``/dev/stdin``), an
    output directory (``-d``), and explicit output files for orphaned
    (``-0``), first (``-1``), and second (``-2``) reads.
    ``add_output_compression_type`` is applied last.
    """
    # The orphan-reads paragraph previously had a mismatched quote and
    # named "--output-orphans"; the registered flag is "--output-orphaned".
    epilog = """\
    Some programs want paired-end read input in the One True Format, which is
    interleaved; other programs want input in the Insanely Bad Format, with
    left- and right- reads separated. This reformats the former to the latter.

    The directory into which the left- and right- reads are output may be
    specified using :option:`-d`/:option:`--output-dir`. This directory will be
    created if it does not already exist.

    Alternatively, you can specify the filenames directly with
    :option:`-1`/:option:`--output-first` and
    :option:`-2`/:option:`--output-second`, which will override the
    :option:`-d`/:option:`--output-dir` setting on a file-specific basis.

    :option:`-0`/:option:`--output-orphaned` will allow broken-paired format,
    and orphaned reads will be saved separately, to the specified file.

    Example::

        split-paired-reads.py tests/test-data/paired.fq

    Example::

        split-paired-reads.py -0 reads-output-file tests/test-data/paired.fq

    Example::

        split-paired-reads.py -1 reads.1 -2 reads.2 tests/test-data/paired.fq
    """
    parser = argparse.ArgumentParser(
        description="Split interleaved reads into two files, left and right.",
        epilog=textwrap.dedent(epilog),
        formatter_class=ComboFormatter,
    )

    parser.add_argument("infile", nargs="?", default="/dev/stdin")

    parser.add_argument(
        "-d",
        "--output-dir",
        metavar="output_directory",
        dest="output_directory",
        default="",
        help="Output split reads to specified directory. "
             "Creates directory if necessary",
    )
    parser.add_argument(
        "-0",
        "--output-orphaned",
        metavar="output_orphaned",
        help='Allow "orphaned" reads and extract them to this file',
        type=argparse.FileType("wb"),
    )
    parser.add_argument(
        "-1",
        "--output-first",
        metavar="output_first",
        default=None,
        help='Output "left" reads to this file',
        type=argparse.FileType("wb"),
    )
    parser.add_argument(
        "-2",
        "--output-second",
        metavar="output_second",
        default=None,
        help='Output "right" reads to this file',
        type=argparse.FileType("wb"),
    )
    parser.add_argument("--version", action=_VersionStdErrAction, version="khmer {v}".format(v=__version__))
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    add_output_compression_type(parser)
    return parser
Esempio n. 30
0
def get_parser():
    """Build and return the argument parser for normalize-by-median.py.

    The base parser comes from ``build_counting_args``. This function adds
    the pairing, reporting, save-graph, and output options, the positional
    input filenames, and ``-z/--loadgraph2``; ``add_loadgraph_args`` and
    ``add_output_compression_type`` contribute their own options.
    """
    # Epilog fix: the save/load paragraph previously read "graphs are are".
    epilog = """\
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force_single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph
    will be saved to the specified file after all sequences have been
    processed. :option:`-l`/:option:`--loadgraph` will load the
    specified k-mer countgraph before processing the specified
    files.  Note that these graphs are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--output -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        normalize-by-median.py -p -k 17 \\
        tests/test-data/test-abund-read-paired.fa

    Example::

        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq \\
        >> appended-output.fq

    Example::

        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq \\
        tests/test-data/test-fastq-reads.fq

    Example::

        normalize-by-median.py -k 17 -s test.ct \\
        tests/test-data/test-abund-read-2.fa \\
        tests/test-data/test-fastq-reads.fq"""
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog),
        citations=['diginorm'])
    parser.add_argument('-q',
                        '--quiet',
                        dest='quiet',
                        default=False,
                        action='store_true')
    parser.add_argument('-p',
                        '--paired',
                        action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single',
                        dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u',
                        '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-s',
                        '--savegraph',
                        metavar="filename",
                        default=None,
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    parser.add_argument('-R',
                        '--report',
                        help='write progress report to report_filename',
                        metavar='report_filename',
                        type=argparse.FileType('w'))
    parser.add_argument('--report-frequency',
                        metavar='report_frequency',
                        type=int,
                        default=100000,
                        help='report progress every report_frequency reads')
    parser.add_argument('-f',
                        '--force',
                        dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    parser.add_argument('-o',
                        '--output',
                        metavar="filename",
                        type=khFileType('wb'),
                        default=None,
                        dest='single_output_file',
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.',
                        nargs='+')
    add_loadgraph_args(parser)
    parser.add_argument('-z',
                        '--loadgraph2',
                        metavar="filename",
                        default=None,
                        help='load a second k-mer graph')
    add_output_compression_type(parser)
    return parser
Esempio n. 31
0
def get_parser():
    """Build and return the argument parser for trim-low-abund.py.

    The base parser comes from ``build_counting_args``. This function adds
    the input filenames, the cutoff and trim-at-coverage thresholds, the
    single-file output option, graph saving, and the "expert" and
    digital-normalization flags; ``add_loadgraph_args`` and
    ``add_output_compression_type`` contribute their own options.
    """
    epilog = """\
    The output is one file for each input file, ``<input file>.abundtrim``,
    placed in the current directory.  This output contains the input sequences
    trimmed at low-abundance k-mers.

    The :option:`-V`/:option:`--variable-coverage` parameter will, if
    specified, prevent elimination of low-abundance reads by only trimming
    low-abundance k-mers from high-abundance reads; use this for
    non-genomic data sets that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use :program:`load-into-counting.py` and :program:`filter-abund.py`.
    However, read pairs will be kept together, in "broken-paired" format; you
    can use :program:`extract-paired-reads.py` to extract read pairs and
    orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff', '-C', type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    # dest is taken from the first long flag: trim_at_coverage.
    parser.add_argument('--trim-at-coverage', '-Z', '--normalize-to',
                        type=int,
                        help='trim reads when entire read above this coverage',
                        default=DEFAULT_TRIM_AT_COVERAGE)

    parser.add_argument('-o', '--output', metavar="output_filename",
                        type=argparse.FileType('wb'),
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadgraph_args(parser)
    # A separating space was missing between the two literals, so the help
    # rendered as "after allreads are loaded."
    parser.add_argument('-s', '--savegraph', metavar="filename", default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./',
                        help="Set location of temporary directory for "
                        "second pass")
    add_output_compression_type(parser)

    parser.add_argument('--diginorm', default=False, action='store_true',
                        help="Eliminate high-coverage reads altogether "
                        "(digital normalization).")
    parser.add_argument('--diginorm-coverage', type=int,
                        default=DEFAULT_DIGINORM_COVERAGE,
                        help="Coverage threshold for --diginorm")
    parser.add_argument('--single-pass', default=False, action='store_true',
                        help="Do not do a second pass across the low coverage "
                        "data")

    return parser
Esempio n. 32
0
def get_parser():
    """Build and return the argument parser for normalize-by-median.py.

    The base parser comes from ``build_counting_args``. This function adds
    the coverage cutoff, pairing, reporting, save-graph, and output options
    and the positional input filenames; ``add_loadgraph_args`` and
    ``add_output_compression_type`` contribute their own options.
    """
    # Epilog fix: the save/load paragraph previously read "graphs are are".
    epilog = """\
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force_single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph
    will be saved to the specified file after all sequences have been
    processed. :option:`-l`/:option:`--loadgraph` will load the
    specified k-mer countgraph before processing the specified
    files.  Note that these graphs are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--output -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        normalize-by-median.py -p -k 17 \\
        tests/test-data/test-abund-read-paired.fa

    Example::

        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq \\
        >> appended-output.fq

    Example::

        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq \\
        tests/test-data/test-fastq-reads.fq

    Example::

        normalize-by-median.py -k 17 -s test.ct \\
        tests/test-data/test-abund-read-2.fa \\
        tests/test-data/test-fastq-reads.fq"""
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog),
        citations=['diginorm'])
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    parser.add_argument('-C', '--cutoff', help="when the median "
                        "k-mer coverage level is above this number the "
                        "read is not kept.",
                        type=check_argument_range(0, 256, "cutoff"),
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-s', '--savegraph', metavar="filename", default=None,
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    parser.add_argument('-R', '--report',
                        help='write progress report to report_filename',
                        metavar='report_filename', type=argparse.FileType('w'))
    parser.add_argument('--report-frequency',
                        metavar='report_frequency', type=int, default=100000,
                        help='report progress every report_frequency reads')
    parser.add_argument('-f', '--force', dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=khFileType('wb'),
                        default=None, dest='single_output_file',
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    add_loadgraph_args(parser)
    add_output_compression_type(parser)
    return parser
Esempio n. 33
0
def get_parser():
    """Build the command-line argument parser for trim-low-abund.py.

    The base parser is obtained from ``build_counting_args``; this function
    then registers the script's own options (trimming thresholds, output
    selection, graph load/save, and the "expert" switches) on it.

    Returns:
        The configured parser object returned by ``build_counting_args``.
    """
    epilog = """\
    The output is one file for each input file, ``<input file>.abundtrim``,
    placed in the current directory.  This output contains the input sequences
    trimmed at low-abundance k-mers.

    The :option:`-V`/:option:`--variable-coverage` parameter will, if
    specified, prevent elimination of low-abundance reads by only trimming
    low-abundance k-mers from high-abundance reads; use this for
    non-genomic data sets that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use :program:`load-into-counting.py` and :program:`filter-abund.py`.
    However, read pairs will be kept together, in "broken-paired" format; you
    can use :program:`extract-paired-reads.py` to extract read pairs and
    orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    # The epilog is indented to match the code, so strip the common margin
    # before handing it to the parser.
    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff',
                        '-C',
                        type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    # argparse derives the destination from the first long option, so the
    # value is stored as ``trim_at_coverage``; ``--normalize-to`` is accepted
    # as an additional spelling of the same option.
    parser.add_argument('--trim-at-coverage',
                        '-Z',
                        '--normalize-to',
                        type=int,
                        help='trim reads when entire read above this coverage',
                        default=DEFAULT_TRIM_AT_COVERAGE)

    parser.add_argument('-o',
                        '--output',
                        metavar="output_filename",
                        type=argparse.FileType('wb'),
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage',
                        '-V',
                        action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadgraph_args(parser)
    # The help fragments are joined by implicit concatenation, so the first
    # fragment must end with a space to avoid printing "allreads".
    parser.add_argument('-s',
                        '--savegraph',
                        metavar="filename",
                        default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    parser.add_argument('-q',
                        '--quiet',
                        dest='quiet',
                        default=False,
                        action='store_true')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir',
                        '-T',
                        type=str,
                        default='./',
                        help="Set location of temporary directory for "
                        "second pass")
    add_output_compression_type(parser)

    parser.add_argument('--diginorm',
                        default=False,
                        action='store_true',
                        help="Eliminate high-coverage reads altogether "
                        "(digital normalization).")
    parser.add_argument('--diginorm-coverage',
                        type=int,
                        default=DEFAULT_DIGINORM_COVERAGE,
                        help="Coverage threshold for --diginorm")
    parser.add_argument('--single-pass',
                        default=False,
                        action='store_true',
                        help="Do not do a second pass across the low coverage "
                        "data")

    return parser
Esempio n. 34
0
def get_parser():
    """Assemble the command-line parser for filter-abund-single.py.

    The base parser comes from ``build_counting_args``.  Threading options,
    this script's own options, and the output-compression options are then
    registered on it, in that order.

    Returns:
        The fully configured parser.
    """
    usage_notes = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    arg_parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance (in memory version).",
        epilog=textwrap.dedent(usage_notes),
        citations=["counting", "SeqAn"],
    )
    add_threading_args(arg_parser)

    # One (flags, keyword settings) entry per argument, listed in the order
    # they are registered on the parser.
    option_table = (
        (
            ("--cutoff", "-C"),
            {
                "default": DEFAULT_CUTOFF,
                "type": check_argument_range(0, 256, "cutoff"),
                "help": "Trim at k-mers below this abundance.",
            },
        ),
        (
            ("--variable-coverage", "-V"),
            {
                "action": "store_true",
                "dest": "variable_coverage",
                "default": False,
                "help": "Only trim low-abundance k-mers from sequences that have high coverage.",
            },
        ),
        (
            ("--normalize-to", "-Z"),
            {
                "type": int,
                "dest": "normalize_to",
                "help": "Base the variable-coverage cutoff on this median k-mer abundance.",
                "default": DEFAULT_NORMALIZE_LIMIT,
            },
        ),
        (
            ("--savegraph",),
            {
                "metavar": "filename",
                "default": "",
                "help": "If present, the name of the file to save the k-mer countgraph to",
            },
        ),
        (
            ("-o", "--outfile"),
            {
                "metavar": "optional_output_filename",
                "default": None,
                "help": "Override default output filename and output trimmed "
                "sequences into a file with the given filename.",
            },
        ),
        (
            ("datafile",),
            {
                "metavar": "input_sequence_filename",
                "help": "FAST[AQ] sequence file to trim",
            },
        ),
        (
            ("-f", "--force"),
            {
                "default": False,
                "action": "store_true",
                "help": "Overwrite output file if it exists",
            },
        ),
        (
            ("-q", "--quiet"),
            {
                "dest": "quiet",
                "default": False,
                "action": "store_true",
            },
        ),
    )
    for flags, settings in option_table:
        arg_parser.add_argument(*flags, **settings)

    add_output_compression_type(arg_parser)
    return arg_parser
Esempio n. 35
0
def get_parser():
    """Build the command-line argument parser for normalize-by-median.py.

    The base parser is obtained from ``build_counting_args``; the script's
    own options, the graph-loading options, and the output-compression
    options are then registered on it.

    Returns:
        The configured parser object returned by ``build_counting_args``.
    """
    # Adjacent string literals below are concatenated into one epilog string.
    # The long example command lines are written as single-line literals and
    # carry ``# noqa`` markers for the line-length check.
    epilog = (
        """
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force-single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph
    will be saved to the specified file after all sequences have been
    processed. :option:`-l`/:option:`--loadgraph` will load the
    specified k-mer countgraph before processing the specified
    files.  Note that these graphs are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--output -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

"""
        "        normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq >> appended-output.fq"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads"  # noqa
    )
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)", epilog=textwrap.dedent(epilog)
    )
    parser.add_argument("-q", "--quiet", dest="quiet", default=False, action="store_true")
    parser.add_argument("-C", "--cutoff", type=int, default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument("-p", "--paired", action="store_true", help="require that all sequences be properly paired")
    parser.add_argument(
        "--force-single", dest="force_single", action="store_true", help="treat all sequences as single-ended/unpaired"
    )
    parser.add_argument(
        "-u",
        "--unpaired-reads",
        metavar="unpaired_reads_filename",
        help="include a file of unpaired reads to which -p/--paired does not apply.",
    )
    # The help fragments are joined by implicit concatenation, so the first
    # fragment must end with a space to avoid printing "allreads".
    parser.add_argument(
        "-s",
        "--savegraph",
        metavar="filename",
        default="",
        help="save the k-mer countgraph to disk after all " "reads are loaded.",
    )
    parser.add_argument(
        "-R",
        "--report",
        metavar="report_filename",
        type=argparse.FileType("w"),
        help="write progress report to report_filename",
    )
    parser.add_argument(
        "--report-frequency",
        metavar="report_frequency",
        type=int,
        default=100000,
        help="report progress every report_frequency reads",
    )
    parser.add_argument("-f", "--force", dest="force", help="continue past file reading errors", action="store_true")
    # The opened file object is stored under ``single_output_file`` rather
    # than the default ``output`` destination.
    parser.add_argument(
        "-o",
        "--output",
        metavar="filename",
        type=argparse.FileType("wb"),
        default=None,
        dest="single_output_file",
        help="only output a single file with "
        'the specified filename; use a single dash "-" to '
        "specify that output should go to STDOUT (the "
        "terminal)",
    )
    parser.add_argument(
        "input_filenames", metavar="input_sequence_filename", help="Input FAST[AQ] sequence filename.", nargs="+"
    )
    add_loadgraph_args(parser)
    add_output_compression_type(parser)
    return parser