Example #1
def get_parser():
    epilog = """\
    Take the ``${graphbase}.subset.#.pmap`` files and merge them all into a
    single ``${graphbase}.pmap.merged`` file for
    :program:`annotate-partitions.py` to use.
    """
    parser = KhmerArgumentParser(
        description="Merge partition map '.pmap' files.",
        epilog=textwrap.dedent(epilog),
        citations=['graph'])
    parser.add_argument('-k',
                        '--ksize',
                        type=int,
                        default=DEFAULT_K,
                        help="k-mer size (default: %d)" % DEFAULT_K)
    parser.add_argument('--keep-subsets',
                        dest='remove_subsets',
                        default=True,
                        action='store_false',
                        help='Keep individual subsets (default: False)')
    parser.add_argument('graphbase',
                        help='basename for input and output '
                        'files')
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    return parser
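
A note on the ``--keep-subsets`` option above: because it is wired with dest='remove_subsets' and action='store_false', passing the flag flips a True default off, so the default behaviour is to delete the per-subset ``.pmap`` files. A minimal, self-contained sketch of that pattern using plain argparse (khmer's KhmerArgumentParser builds on it):

import argparse

# store_false flips a True default off; the attribute name comes from dest.
parser = argparse.ArgumentParser()
parser.add_argument('--keep-subsets', dest='remove_subsets',
                    default=True, action='store_false')

print(parser.parse_args([]).remove_subsets)                  # True
print(parser.parse_args(['--keep-subsets']).remove_subsets)  # False
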
Example #2
def get_parser():
    epilog = """\
    Load in a partition map (generally produced by :program:`partition-graph.py`
    or :program:`merge-partitions.py`) and annotate the sequences in the given
    files with their partition IDs. Use :program:`extract-partitions.py` to
    extract sequences into separate group files.

    Example (results will be in ``random-20-a.fa.part``)::

        load-graph.py -k 20 example tests/test-data/random-20-a.fa
        partition-graph.py example
        merge-partitions.py -k 20 example
        annotate-partitions.py -k 20 example tests/test-data/random-20-a.fa
    """
    parser = KhmerArgumentParser(
        description="Annotate sequences with partition IDs.",
        epilog=textwrap.dedent(epilog))

    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K,
                        help="k-mer size (default: %d)" % DEFAULT_K)
    parser.add_argument('graphbase', help='basename for input and output '
                        'files')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequences to '
                        'annotate.')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #3
def get_parser():
    epilog = """\
    Count the median/avg k-mer abundance for each sequence in the input file,
    based on the k-mer counts in the given k-mer countgraph.  Can be used
    to estimate expression levels (mRNAseq) or coverage (genomic/metagenomic).

    The output file contains sequence id, median, average, stddev, and
    seq length, in comma-separated value (CSV) format.

    Example::

        load-into-counting.py counts tests/test-data/test-reads.fq.gz
        count-median.py counts tests/test-data/test-reads.fq.gz medians.txt

    NOTE: All 'N's in the input sequences are converted to 'A's.
    """
    parser = KhmerArgumentParser(
        description='Count k-mer summary stats for sequences',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('countgraph', metavar='input_count_graph_filename',
                        help='input k-mer countgraph filename')
    parser.add_argument('input', metavar='input_sequence_filename',
                        help='input FAST[AQ] sequence filename')
    parser.add_argument('output', metavar='output_summary_filename',
                        help='output summary filename',
                        type=argparse.FileType('w'))
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
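
The per-sequence statistics in the CSV output can be illustrated with Python's statistics module. A toy sketch (the counts and read name below are made up, and the exact stddev variant used by count-median.py is not shown here):

import statistics

counts = [4, 5, 5, 6, 9]           # hypothetical k-mer abundances for one read
median = statistics.median(counts)
average = statistics.fmean(counts)
stddev = statistics.stdev(counts)  # sample stddev; the script's variant may differ
length = 100                       # hypothetical sequence length

# CSV row: sequence id, median, average, stddev, sequence length
print(f'read_1,{median},{average:.2f},{stddev:.2f},{length}')
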
Example #4
def get_parser():
    epilog = """\
    Example::

        extract-long-sequences.py --length 10 tests/test-data/paired-mixed.fa
    """
    parser = KhmerArgumentParser(
        description='Extract FASTQ or FASTA sequences longer than'
        ' specified length (default: 200 bp).',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames',
                        help='Input FAST[AQ]'
                        ' sequence filename.',
                        nargs='+')
    parser.add_argument('-o',
                        '--output',
                        help='The name of the output'
                        ' sequence file.',
                        default=sys.stdout,
                        metavar='output',
                        type=argparse.FileType('wb'))
    parser.add_argument('-l',
                        '--length',
                        help='The minimum length of'
                        ' sequences to extract.',
                        type=int,
                        default=200)
    add_output_compression_type(parser)
    return parser
Example #5
def get_parser():
    epilog = """\

    """
    parser = KhmerArgumentParser(citations=['counting'])

    parser.add_argument('input_count_graph_filename', help='The name of the'
                        ' input k-mer countgraph file.')
    parser.add_argument('input_sequence_filenames', help='The name of the input'
                        ' FAST[AQ] sequence file.', nargs='+')
    parser.add_argument('-N', type=int, default=10000)
    parser.add_argument('-o', dest='output', type=argparse.FileType('w'), default=sys.stdout)
    
    return parser
Example #6
def get_parser():
    descr = "Display summary statistics for one or more FASTA/FASTQ files."
    epilog = """\
    Report number of bases, number of sequences, and average sequence length
    for one or more FASTA/FASTQ files; and report aggregate statistics at end.

    With :option:`-o`/:option:`--output`, the output will be saved to the
    specified file.

    Example::

        readstats.py tests/test-data/test-abund-read-2.fa
    """

    parser = KhmerArgumentParser(description=descr,
                                 epilog=textwrap.dedent(epilog))
    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-o',
                        '--output',
                        dest='outfp',
                        metavar="filename",
                        help="output file for statistics; defaults to stdout.",
                        type=argparse.FileType('w'),
                        default=sys.stdout)
    parser.add_argument('--csv',
                        default=False,
                        action='store_true',
                        help='Use the CSV format for the statistics, '
                        'including column headers.')
    return parser
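
The aggregate statistics described in the epilog reduce to three accumulators. A deliberately naive sketch (FASTA-only, unlike the real script, which also handles FASTQ):

def fasta_stats(lines):
    """Return (bases, sequences, average length) for FASTA-formatted lines."""
    n_seqs = n_bases = 0
    for line in lines:
        line = line.strip()
        if line.startswith('>'):
            n_seqs += 1
        elif line:
            n_bases += len(line)
    avg = n_bases / n_seqs if n_seqs else 0.0
    return n_bases, n_seqs, avg

print(fasta_stats(['>r1', 'ACGTACGT', '>r2', 'ACGT']))  # (12, 2, 6.0)
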
Example #7
def get_parser():
    descr = "Display summary statistics for one or more FASTA/FASTQ files."
    epilog = """\
    Report number of bases, number of sequences, and average sequence length
    for one or more FASTA/FASTQ files; and report aggregate statistics at end.

    With :option:`-o`/:option:`--output`, the output will be saved to the
    specified file.

    Example::

        readstats.py tests/test-data/test-abund-read-2.fa
    """

    parser = KhmerArgumentParser(
        description=descr,
        epilog=textwrap.dedent(epilog))
    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-o', '--output', dest='outfp', metavar="filename",
                        help="output file for statistics; defaults to stdout.",
                        type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('--csv', default=False, action='store_true',
                        help='Use the CSV format for the statistics, '
                        'including column headers.')
    return parser
Example #8
def get_parser():
    epilog = """\
    Load stoptags from the given `.stoptags` file and use them to trim
    or remove the sequences in `<file1-N>`.  Trimmed sequences will be placed
    in `<fileN>.stopfilt`.
    """
    parser = KhmerArgumentParser(
        description="Trim sequences at stoptags.",
        epilog=textwrap.dedent(epilog), citations=['graph'])
    parser.add_argument('-k', '--ksize', default=DEFAULT_K, type=int,
                        help='k-mer size')
    parser.add_argument('stoptags_file', metavar='input_stoptags_filename')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #9
def get_parser():
    parser = KhmerArgumentParser(
        description='Converts FASTQ format (.fq) files to FASTA format (.fa).')

    parser.add_argument('input_sequence', help='The name of the input'
                        ' FASTQ sequence file.')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=khFileType('wb'),
                        help='The name of the output'
                        ' FASTA sequence file.',
                        default=sys.stdout)
    parser.add_argument('-n', '--n_keep', default=False, action='store_true',
                        help='Keep reads containing \'N\'s in the '
                             'input_sequence file (default: drop such reads)')
    add_output_compression_type(parser)
    return parser
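
The conversion and the default drop-reads-with-N behaviour described above fit in a few lines. A sketch assuming plain 4-line FASTQ records (khmer's real parser is more tolerant):

def fastq_to_fasta(lines, n_keep=False):
    records = [line.strip() for line in lines]
    for i in range(0, len(records), 4):
        name, seq = records[i][1:], records[i + 1]
        if 'N' in seq and not n_keep:
            continue                      # default: drop reads containing N
        yield f'>{name}\n{seq}'

fastq = ['@read1', 'ACGT', '+', 'IIII', '@read2', 'ACNT', '+', 'IIII']
print('\n'.join(fastq_to_fasta(fastq)))   # only read1 survives
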
Example #10
def get_parser():
    epilog = """\
    Take the ``${graphbase}.subset.#.pmap`` files and merge them all into a
    single ``${graphbase}.pmap.merged`` file for
    :program:`annotate-partitions.py` to use.
    """
    parser = KhmerArgumentParser(
        description="Merge partition map '.pmap' files.",
        epilog=textwrap.dedent(epilog),
        citations=['graph'])
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K,
                        help="k-mer size (default: %d)" % DEFAULT_K)
    parser.add_argument('--keep-subsets', dest='remove_subsets',
                        default=True, action='store_false',
                        help='Keep individual subsets (default: False)')
    parser.add_argument('graphbase', help='basename for input and output '
                        'files')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #11
def get_parser():
    epilog = """\
    Load stoptags from the given `.stoptags` file and use them to trim
    or remove the sequences in `<file1-N>`.  Trimmed sequences will be placed
    in `<fileN>.stopfilt`.
    """
    parser = KhmerArgumentParser(description="Trim sequences at stoptags.",
                                 epilog=textwrap.dedent(epilog),
                                 citations=['graph'])
    parser.add_argument('-k',
                        '--ksize',
                        default=DEFAULT_K,
                        type=int,
                        help='k-mer size')
    parser.add_argument('stoptags_file', metavar='input_stoptags_filename')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        nargs='+')
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #12
def get_parser():
    epilog = """\
    Example::

        extract-long-sequences.py --length 10 tests/test-data/paired-mixed.fa
    """
    parser = KhmerArgumentParser(
        description='Extract FASTQ or FASTA sequences longer than'
        ' specified length (default: 200 bp).',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', help='Input FAST[AQ]'
                        ' sequence filename.', nargs='+')
    parser.add_argument('-o', '--output', help='The name of the output'
                        ' sequence file.', default=sys.stdout,
                        metavar='output', type=argparse.FileType('wb'))
    parser.add_argument('-l', '--length', help='The minimum length of'
                        ' sequences to extract.',
                        type=int, default=200)
    add_output_compression_type(parser)
    return parser
Example #13
def get_parser():
    epilog = """\
    Many read-handling programs (assemblers, mappers, etc.) require
    that you give them either perfectly interleaved files, or files
    containing only single reads. This script takes files that were
    originally interleaved but where reads may have been orphaned (via
    error filtering, application of abundance filtering, digital
    normalization in non-paired mode, or partitioning) and separates
    the interleaved reads from the orphaned reads.

    The default output is two files, `<input file>.pe` and `<input
    file>.se`, placed in the current directory. The .pe file contains
    interleaved and properly paired sequences, while the .se file
    contains orphan sequences.

    The directory into which the interleaved and orphaned reads are
    output may be specified using :option:`-d`/:option:`--output-dir`.
    This directory will be created if it does not already exist.

    Alternatively, you can specify the filenames directly with
    :option:`-p`/:option:`--output-paired` and
    :option:`-s`/:option:`--output-single`, which will override the
    :option:`-d`/:option:`--output-dir` option.

    Example::

        extract-paired-reads.py tests/test-data/paired.fq
    """
    parser = KhmerArgumentParser(
        description='Take a mixture of reads and split into pairs and '
        'orphans.', epilog=textwrap.dedent(epilog))
    parser.add_argument('infile', nargs='?', default='/dev/stdin')
    parser.add_argument('-d', '--output-dir', default='', help='Output '
                        'split reads to specified directory. Creates '
                        'directory if necessary')
    parser.add_argument('--output-paired', '-p', metavar="filename",
                        type=khFileType('wb'),
                        default=None, help='Output paired reads to this '
                        'file')
    parser.add_argument('--output-single', '-s', metavar="filename",
                        type=khFileType('wb'), default=None,
                        help='Output orphaned reads to this file')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
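
The pair/orphan separation described in the epilog hinges on recognising consecutive mates. A name-based sketch (khmer's real broken-paired parsing accepts more header formats than the /1 and /2 suffixes assumed here):

def split_pairs(names):
    pairs, orphans = [], []
    i = 0
    while i < len(names):
        if (i + 1 < len(names) and names[i].endswith('/1')
                and names[i + 1].endswith('/2')
                and names[i][:-2] == names[i + 1][:-2]):
            pairs.append((names[i], names[i + 1]))   # properly paired -> .pe
            i += 2
        else:
            orphans.append(names[i])                 # orphaned -> .se
            i += 1
    return pairs, orphans

print(split_pairs(['a/1', 'a/2', 'b/1', 'c/1', 'c/2']))
# ([('a/1', 'a/2'), ('c/1', 'c/2')], ['b/1'])
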
Example #14
def get_parser():
    epilog = """\
    Some programs want paired-end read input in the One True Format, which is
    interleaved; other programs want input in the Insanely Bad Format, with
    left- and right- reads separated. This reformats the former to the latter.

    The directory into which the left- and right- reads are output may be
    specified using :option:`-d`/:option:`--output-dir`. This directory will be
    created if it does not already exist.

    Alternatively, you can specify the filenames directly with
    :option:`-1`/:option:`--output-first` and
    :option:`-2`/:option:`--output-second`, which will override the
    :option:`-d`/:option:`--output-dir` setting on a file-specific basis.

    :option:`-0`/:option:`--output-orphaned` will allow broken-paired format,
    and orphaned reads will be saved separately, to the specified file.

    Example::

        split-paired-reads.py tests/test-data/paired.fq

    Example::

        split-paired-reads.py -0 reads-output-file tests/test-data/paired.fq

    Example::

        split-paired-reads.py -1 reads.1 -2 reads.2 tests/test-data/paired.fq
    """
    parser = KhmerArgumentParser(
        description='Split interleaved reads into two files, left and right.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('infile', nargs='?', default='/dev/stdin')

    parser.add_argument('-d', '--output-dir', metavar="output_directory",
                        dest='output_directory', default='', help='Output '
                        'split reads to specified directory. Creates '
                        'directory if necessary')
    parser.add_argument('-0', '--output-orphaned', metavar='output_orphaned',
                        help='Allow "orphaned" reads and extract them to ' +
                        'this file',
                        type=khFileType('wb'))
    parser.add_argument('-1', '--output-first', metavar='output_first',
                        default=None, help='Output "left" reads to this '
                        'file', type=khFileType('wb'))
    parser.add_argument('-2', '--output-second', metavar='output_second',
                        default=None, help='Output "right" reads to this '
                        'file', type=khFileType('wb'))
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Example #15
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt`` for each input sequence file. If
    the input sequences are from RNAseq or metagenome sequencing then
    :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 countgraph data/100k-filtered.fa
        filter-abund.py -C 2 countgraph data/100k-filtered.fa
    """
    parser = KhmerArgumentParser(
        description='Trim sequences at a minimum k-mer abundance.',
        epilog=textwrap.dedent(epilog),
        citations=['counting'])
    parser.add_argument('input_graph', metavar='input_count_graph_filename',
                        help='The input k-mer countgraph filename')
    parser.add_argument('input_filename', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename', nargs='+')
    add_threading_args(parser)
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=check_argument_range(0, 256, 'cutoff'),
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        dest='variable_coverage', default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='Base the variable-coverage cutoff on this median'
                        ' k-mer abundance.',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-o', '--output', dest='single_output_file',
                        type=khFileType('wb'),
                        metavar="optional_output_filename",
                        help='Output the trimmed sequences into a single file '
                        'with the given filename instead of creating a new '
                        'file for each input file.')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    add_output_compression_type(parser)
    return parser
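
Abundance trimming itself is simple once per-k-mer counts are in hand. A toy version, not khmer's implementation: truncate a read at the first k-mer whose countgraph abundance falls below the cutoff:

def trim_at_cutoff(seq, kmer_counts, k, cutoff):
    for i, count in enumerate(kmer_counts):
        if count < cutoff:
            return seq[:i + k - 1]   # keep bases covered by passing k-mers
    return seq

read = 'ACGTACGTAC'
counts = [7, 8, 9, 1, 1, 1]          # hypothetical counts for the six 5-mers
print(trim_at_cutoff(read, counts, k=5, cutoff=2))   # 'ACGTACG'
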
Example #16
def get_parser():
    descr = "Estimate number of unique k-mers, with precision <= ERROR_RATE."
    epilog = """\
    A HyperLogLog counter is used to do cardinality estimation. Since this
    counter is based on a tradeoff between precision and memory consumption,
    the :option:`-e`/:option:`--error-rate` can be used to control how much
    memory will be used. In practice the memory footprint is small even
    at low error rates (< 0.01).

    :option:`-k`/:option:`--ksize` should be set to the desired k-mer size.

    Informational output is sent to STDERR, but a report file can be generated
    with :option:`-R`/:option:`--report`.

    :option:`--stream-records` will write the input sequences to STDOUT.
    This is useful for workflows: count unique k-mers in a stream, then do
    digital normalization.

    :option:`--diagnostics` will print recommended tablesize and memory
    settings for various false positive rates, which is useful for
    configuring other khmer scripts. The diagnostics are written to STDERR.

    Example::

        unique-kmers.py -k 17 tests/test-data/test-abund-read{,-2,-3}.fa

    Example::

        unique-kmers.py -k 17 --diagnostics tests/test-data/test-abund-read.fa

    Example::

        unique-kmers.py --stream-records -k 17 tests/test-data/test-reads.fa | \\
        normalize-by-median.py -k 17 -o normalized /dev/stdin

    Example::

        unique-kmers.py -R unique_count -k 30 \\
        tests/test-data/test-abund-read-paired.fa"""  # noqa
    parser = KhmerArgumentParser(
        description=descr, epilog=textwrap.dedent(epilog),
        citations=['SeqAn', 'hll'])

    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)

    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')

    parser.add_argument('-k', '--ksize', type=int, default=env_ksize,
                        help='k-mer size to use')

    parser.add_argument('-e', '--error-rate', type=float, default=0.01,
                        help='Acceptable error rate')

    parser.add_argument('-R', '--report',
                        metavar='filename', type=argparse.FileType('w'),
                        help='generate informational report and write to'
                        ' filename')

    parser.add_argument('-S', '--stream-records', default=False,
                        action='store_true',
                        help='write input sequences to STDOUT')

    parser.add_argument('--diagnostics', default=False, action='store_true',
                        help='print out recommended tablesize arguments and '
                             'restrictions')

    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename(s).', nargs='+')

    return parser
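
The precision/memory tradeoff mentioned in the epilog follows from the standard HyperLogLog error bound, roughly 1.04/sqrt(m) for m registers. Khmer's actual register layout may differ; this back-of-the-envelope sketch only illustrates the scaling:

import math

for err in (0.05, 0.01, 0.005):
    m = math.ceil((1.04 / err) ** 2)   # registers needed for this error rate
    print(f'error rate {err}: ~{m:,} registers'
          f' (~{m / 1024:.1f} KiB at 1 byte each)')
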
Example #17
def get_parser():
    epilog = """\
    The resulting partition maps are saved as ``${basename}.subset.#.pmap``
    files.
    """
    parser = KhmerArgumentParser(
        description="Partition a sequence graph based upon waypoint "
        "connectivity", epilog=textwrap.dedent(epilog),
        citations=['graph'])

    parser.add_argument('basename', help="basename of the input k-mer "
                        "nodegraph  + tagset files")
    parser.add_argument('-S', '--stoptags', metavar='filename', default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('-s', '--subset-size', default=DEFAULT_SUBSET_SIZE,
                        type=float, help='Set subset size (usually 1e5-1e6 is '
                        'good)')
    parser.add_argument('--no-big-traverse', action='store_true',
                        default=False, help='Truncate graph joins at big '
                        'traversals')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_threading_args(parser)
    return parser
Example #18
def get_parser():
    epilog = """\
    Some programs want paired-end read input in the One True Format, which is
    interleaved; other programs want input in the Insanely Bad Format, with
    left- and right- reads separated. This reformats the former to the latter.

    The directory into which the left- and right- reads are output may be
    specified using :option:`-d`/:option:`--output-dir`. This directory will be
    created if it does not already exist.

    Alternatively, you can specify the filenames directly with
    :option:`-1`/:option:`--output-first` and
    :option:`-2`/:option:`--output-second`, which will override the
    :option:`-d`/:option:`--output-dir` setting on a file-specific basis.

    :option:`-0`/:option:`--output-orphaned` will allow broken-paired format,
    and orphaned reads will be saved separately, to the specified file.

    Example::

        split-paired-reads.py tests/test-data/paired.fq

    Example::

        split-paired-reads.py -0 reads-output-file tests/test-data/paired.fq

    Example::

        split-paired-reads.py -1 reads.1 -2 reads.2 tests/test-data/paired.fq
    """
    parser = KhmerArgumentParser(
        description='Split interleaved reads into two files, left and right.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('infile', nargs='?', default='/dev/stdin')

    parser.add_argument('-d',
                        '--output-dir',
                        metavar="output_directory",
                        dest='output_directory',
                        default='',
                        help='Output '
                        'split reads to specified directory. Creates '
                        'directory if necessary')
    parser.add_argument('-0',
                        '--output-orphaned',
                        metavar='output_orphaned',
                        help='Allow "orphaned" reads and extract them to ' +
                        'this file',
                        type=khFileType('wb'))
    parser.add_argument('-1',
                        '--output-first',
                        metavar='output_first',
                        default=None,
                        help='Output "left" reads to this '
                        'file',
                        type=khFileType('wb'))
    parser.add_argument('-2',
                        '--output-second',
                        metavar='output_second',
                        default=None,
                        help='Output "right" reads to this '
                        'file',
                        type=khFileType('wb'))
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Example #19
def get_parser():
    """Create parser for extract-partitions.py."""
    epilog = """
    Example (results will be in ``example.group0000.fa``)::

        load-graph.py -k 20 example tests/test-data/random-20-a.fa
        partition-graph.py example
        merge-partitions.py -k 20 example
        annotate-partitions.py -k 20 example tests/test-data/random-20-a.fa
        extract-partitions.py example random-20-a.fa.part

    (:program:`extract-partitions.py` will produce a partition size
    distribution in <base>.dist. The columns are: (1) number of reads,
    (2) count of partitions with n reads, (3) cumulative sum of partitions,
    (4) cumulative sum of reads.)
    """
    parser = KhmerArgumentParser(
        description="Separate sequences that are annotated with partitions " "into grouped files.",
        epilog=textwrap.dedent(epilog),
        citations=["graph"],
    )
    parser.add_argument("prefix", metavar="output_filename_prefix")
    parser.add_argument("part_filenames", metavar="input_partition_filename", nargs="+")
    parser.add_argument(
        "--max-size", "-X", dest="max_size", default=DEFAULT_MAX_SIZE, type=int, help="Max group size (n sequences)"
    )
    parser.add_argument(
        "--min-partition-size",
        "-m",
        dest="min_part_size",
        default=DEFAULT_THRESHOLD,
        type=int,
        help="Minimum partition size worth keeping",
    )
    parser.add_argument(
        "--no-output-groups",
        "-n",
        dest="output_groups",
        default=True,
        action="store_false",
        help="Do not actually output groups files.",
    )
    parser.add_argument(
        "--output-unassigned", "-U", default=False, action="store_true", help="Output unassigned sequences, too"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    add_output_compression_type(parser)
    return parser
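
The four .dist columns described above can be reproduced from a partition-size table. A sketch over hypothetical data:

from collections import Counter

part_sizes = {1: 5, 2: 5, 3: 2, 4: 9}     # hypothetical partition id -> reads
dist = Counter(part_sizes.values())        # reads-per-partition -> frequency
cum_parts = cum_reads = 0
for n_reads in sorted(dist):
    cum_parts += dist[n_reads]
    cum_reads += n_reads * dist[n_reads]
    print(n_reads, dist[n_reads], cum_parts, cum_reads)
# 2 1 1 2
# 5 2 3 12
# 9 1 4 21
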
Example #20
def get_parser():
    descr = "Estimate number of unique k-mers, with precision <= ERROR_RATE."
    epilog = """\
    A HyperLogLog counter is used to do cardinality estimation. Since this
    counter is based on a tradeoff between precision and memory consumption,
    the :option:`-e`/:option:`--error-rate` can be used to control how much
    memory will be used. In practice the memory footprint is small even
    at low error rates (< 0.01).

    :option:`-k`/:option:`--ksize` should be set to the desired k-mer size.

    Informational output is sent to STDERR, but a report file can be generated
    with :option:`-R`/:option:`--report`.

    :option:`--stream-records` will write the input sequences to STDOUT.
    This is useful for workflows: count unique k-mers in a stream, then do
    digital normalization.

    :option:`--diagnostics` will print recommended tablesize and memory
    settings for various false positive rates, which is useful for
    configuring other khmer scripts. The diagnostics are written to STDERR.

    Example::

        unique-kmers.py -k 17 tests/test-data/test-abund-read{,-2,-3}.fa

    Example::

        unique-kmers.py -k 17 --diagnostics tests/test-data/test-abund-read.fa

    Example::

        unique-kmers.py --stream-records -k 17 tests/test-data/test-reads.fa | \\
        normalize-by-median.py -k 17 -o normalized /dev/stdin

    Example::

        unique-kmers.py -R unique_count -k 30 \\
        tests/test-data/test-abund-read-paired.fa"""  # noqa
    parser = KhmerArgumentParser(description=descr,
                                 epilog=textwrap.dedent(epilog),
                                 citations=['SeqAn', 'hll'])

    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)

    parser.add_argument('-q',
                        '--quiet',
                        dest='quiet',
                        default=False,
                        action='store_true')

    parser.add_argument('--ksize',
                        '-k',
                        type=int,
                        default=env_ksize,
                        help='k-mer size to use')

    parser.add_argument('--error-rate',
                        '-e',
                        type=float,
                        default=0.01,
                        help='Acceptable error rate')

    parser.add_argument('--report',
                        '-R',
                        metavar='filename',
                        type=argparse.FileType('w'),
                        help='generate informational report and write to'
                        ' filename')

    parser.add_argument('--stream-records',
                        '-S',
                        default=False,
                        action='store_true',
                        help='write input sequences to STDOUT')

    parser.add_argument('--diagnostics',
                        default=False,
                        action='store_true',
                        help='print out recommended tablesize arguments and '
                        'restrictions')

    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename(s).',
                        nargs='+')

    return parser
Example #21
def get_parser():
    epilog = """\
    Example::

        load-into-counting.py -x 1e7 -N 2 -k 17 counts \\
                tests/test-data/test-abund-read-2.fa
        abundance-dist.py counts tests/test-data/test-abund-read-2.fa test-dist
    """
    parser = KhmerArgumentParser(
        description="Calculate abundance distribution of the k-mers in "
        "the sequence file using a pre-made k-mer countgraph.",
        epilog=textwrap.dedent(epilog),
        citations=['counting'])

    parser.add_argument('input_count_graph_filename',
                        help='The name of the'
                        ' input k-mer countgraph file.')
    parser.add_argument('input_sequence_filename',
                        help='The name of the input'
                        ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename',
                        help='The columns are: '
                        '(1) k-mer abundance, (2) k-mer count, (3) cumulative '
                        'count, (4) fraction of total distinct k-mers.')
    parser.add_argument('-z',
                        '--no-zero',
                        dest='output_zero',
                        default=True,
                        action='store_false',
                        help='Do not output zero-count bins')
    parser.add_argument('-s',
                        '--squash',
                        dest='squash_output',
                        default=False,
                        action='store_true',
                        help='Overwrite existing output_histogram_filename')
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Continue even if specified input files '
                        'do not exist or are empty.')
    parser.add_argument('-q',
                        '--quiet',
                        dest='quiet',
                        default=False,
                        action='store_true')
    return parser
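
The histogram columns named in the output_histogram_filename help string are easy to compute once per-k-mer abundances are known. A toy sketch with made-up counts:

from collections import Counter

abundances = [1, 1, 1, 2, 2, 5]   # hypothetical counts for six distinct k-mers
hist = Counter(abundances)
total = len(abundances)
cumulative = 0
for abundance in sorted(hist):
    cumulative += hist[abundance]
    print(abundance, hist[abundance], cumulative, cumulative / total)
# 1 3 3 0.5
# 2 2 5 0.8333333333333334
# 5 1 6 1.0
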
Example #22
def get_parser():
    """Create parser for extract-partitions.py."""
    epilog = """
    Example (results will be in ``example.group0000.fa``)::

        load-graph.py -k 20 example tests/test-data/random-20-a.fa
        partition-graph.py example
        merge-partitions.py -k 20 example
        annotate-partitions.py -k 20 example tests/test-data/random-20-a.fa
        extract-partitions.py example random-20-a.fa.part

    (:program:`extract-partitions.py` will produce a partition size
    distribution in <base>.dist. The columns are: (1) number of reads,
    (2) count of partitions with n reads, (3) cumulative sum of partitions,
    (4) cumulative sum of reads.)
    """
    parser = KhmerArgumentParser(
        description="Separate sequences that are annotated with partitions "
        "into grouped files.",
        epilog=textwrap.dedent(epilog),
        citations=['graph'])
    parser.add_argument('prefix', metavar='output_filename_prefix')
    parser.add_argument('part_filenames',
                        metavar='input_partition_filename',
                        nargs='+')
    parser.add_argument('-X',
                        '--max-size',
                        dest='max_size',
                        default=DEFAULT_MAX_SIZE,
                        type=int,
                        help='Max group size (n sequences)')
    parser.add_argument('-m',
                        '--min-partition-size',
                        dest='min_part_size',
                        default=DEFAULT_THRESHOLD,
                        type=int,
                        help='Minimum partition size worth keeping')
    parser.add_argument('-n',
                        '--no-output-groups',
                        dest='output_groups',
                        default=True,
                        action='store_false',
                        help='Do not actually output groups files.')
    parser.add_argument('-U',
                        '--output-unassigned',
                        default=False,
                        action='store_true',
                        help='Output unassigned sequences, too')
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Example #23
def get_parser():
    epilog = """\
    The output is an interleaved set of reads, with each read in <R1> paired
    with a read in <R2>. By default, the output goes to stdout unless
    :option:`-o`/:option:`--output` is specified.

    As a "bonus", this file ensures that if read names are not already
    formatted properly, they are reformatted consistently, such that
    they look like the pre-1.8 Casava format (`@name/1`, `@name/2`).
    This reformatting can be switched off with the
    :option:`--no-reformat` flag.

    Example::

        interleave-reads.py tests/test-data/paired.fq.1 \\
                tests/test-data/paired.fq.2 -o paired.fq"""
    parser = KhmerArgumentParser(
        description='Produce interleaved files from R1/R2 paired files',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('left')
    parser.add_argument('right')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=khFileType('wb'),
                        default=sys.stdout)
    parser.add_argument('--no-reformat', default=False, action='store_true',
                        help='Do not reformat read names or enforce '
                        'consistency')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
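
The name normalisation the epilog promises maps newer Casava-style headers onto the pre-1.8 form. A sketch covering only the common case (khmer's real check-and-reformat logic handles more corner cases):

def to_pre18(header):
    """Rewrite '@name 1:N:0:ACGT' as '@name/1'; leave other names alone."""
    name, _, rest = header.partition(' ')
    if rest[:2] in ('1:', '2:'):
        return f'{name}/{rest[0]}'
    return header

print(to_pre18('@read1 1:N:0:ACGT'))   # @read1/1
print(to_pre18('@read1/2'))            # already fine, unchanged
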
Example #24
def get_parser():
    """Create parser for extract-partitions.py."""
    epilog = """
    Example (results will be in ``example.group0000.fa``)::

        load-graph.py -k 20 example tests/test-data/random-20-a.fa
        partition-graph.py example
        merge-partitions.py -k 20 example
        annotate-partitions.py -k 20 example tests/test-data/random-20-a.fa
        extract-partitions.py example random-20-a.fa.part

    (:program:`extract-partitions.py` will produce a partition size
    distribution in <base>.dist. The columns are: (1) number of reads,
    (2) count of partitions with n reads, (3) cumulative sum of partitions,
    (4) cumulative sum of reads.)
    """
    parser = KhmerArgumentParser(
        description="Separate sequences that are annotated with partitions "
        "into grouped files.", epilog=textwrap.dedent(epilog),
        citations=['graph'])
    parser.add_argument('prefix', metavar='output_filename_prefix')
    parser.add_argument('part_filenames', metavar='input_partition_filename',
                        nargs='+')
    parser.add_argument('-X', '--max-size', dest='max_size',
                        default=DEFAULT_MAX_SIZE, type=int,
                        help='Max group size (n sequences)')
    parser.add_argument('-m', '--min-partition-size', dest='min_part_size',
                        default=DEFAULT_THRESHOLD, type=int,
                        help='Minimum partition size worth keeping')
    parser.add_argument('-n', '--no-output-groups', dest='output_groups',
                        default=True, action='store_false',
                        help='Do not actually output groups files.')
    parser.add_argument('-U', '--output-unassigned', default=False,
                        action='store_true',
                        help='Output unassigned sequences, too')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Example #25
def get_parser():
    epilog = """\
    Take a list of files containing sequences, and subsample 100,000
    sequences (:option:`-N`/:option:`--num_reads`) uniformly, using
    reservoir sampling.  Stop after first 100m sequences
    (:option:`-M`/:option:`--max_reads`). By default take one subsample,
    but take :option:`-S`/:option:`--samples` samples if specified.

    The output is placed in :option:`-o`/:option:`--output` <file>
    (for a single sample) or in ``<file>.subset.0`` to ``<file>.subset.S-1``
    (for more than one sample).

    This script uses the `reservoir sampling
    <http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.
    """

    parser = KhmerArgumentParser(
        description="Uniformly subsample sequences from a collection of files",
        epilog=textwrap.dedent(epilog))

    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-N',
                        '--num_reads',
                        type=int,
                        dest='num_reads',
                        default=DEFAULT_NUM_READS,
                        help='number of sequences or pairs to sample')
    parser.add_argument('-M',
                        '--max_reads',
                        type=int,
                        dest='max_reads',
                        default=DEFAULT_MAX_READS)
    parser.add_argument('-S',
                        '--samples',
                        type=int,
                        dest='num_samples',
                        default=1)
    parser.add_argument('-R',
                        '--random-seed',
                        type=int,
                        dest='random_seed',
                        help='Provide a random seed for the generator')
    parser.add_argument('--force_single',
                        default=False,
                        action='store_true',
                        help='Ignore read pair information if present')
    parser.add_argument('-o',
                        '--output',
                        dest='output_file',
                        type=argparse.FileType('wb'),
                        metavar="filename",
                        default=None)
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
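
The algorithm the epilog links to is worth spelling out. The simplest variant (Algorithm R) keeps every stream item in the sample with equal probability, without knowing the stream length up front:

import random

def reservoir_sample(stream, n, seed=None):
    rng = random.Random(seed)
    sample = []
    for i, item in enumerate(stream):
        if i < n:
            sample.append(item)        # fill the reservoir first
        else:
            j = rng.randint(0, i)      # inclusive on both ends
            if j < n:
                sample[j] = item       # replace with probability n/(i+1)
    return sample

print(reservoir_sample(range(1000), 5, seed=42))
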
Example #26
def get_parser():
    epilog = """\
    The output is an interleaved set of reads, with each read in <R1> paired
    with a read in <R2>. By default, the output goes to stdout unless
    :option:`-o`/:option:`--output` is specified.

    As a "bonus", this file ensures that if read names are not already
    formatted properly, they are reformatted consistently, such that
    they look like the pre-1.8 Casava format (`@name/1`, `@name/2`).
    This reformatting can be switched off with the
    :option:`--no-reformat` flag.

    Example::

        interleave-reads.py tests/test-data/paired.fq.1 \\
                tests/test-data/paired.fq.2 -o paired.fq"""
    parser = KhmerArgumentParser(
        description='Produce interleaved files from R1/R2 paired files',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('left')
    parser.add_argument('right')
    parser.add_argument('-o',
                        '--output',
                        metavar="filename",
                        type=khFileType('wb'),
                        default=sys.stdout)
    parser.add_argument('--no-reformat',
                        default=False,
                        action='store_true',
                        help='Do not reformat read names or enforce '
                        'consistency')
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Example #27
def get_parser():
    epilog = """\
    Example::

        load-into-counting.py -x 1e7 -N 2 -k 17 counts \\
                tests/test-data/test-abund-read-2.fa
        abundance-dist.py counts tests/test-data/test-abund-read-2.fa test-dist
    """
    parser = KhmerArgumentParser(
        description="Calculate abundance distribution of the k-mers in "
        "the sequence file using a pre-made k-mer countgraph.",
        epilog=textwrap.dedent(epilog), citations=['counting'])

    parser.add_argument('input_count_graph_filename', help='The name of the'
                        ' input k-mer countgraph file.')
    parser.add_argument('input_sequence_filename', help='The name of the input'
                        ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename', help='The columns are: '
                        '(1) k-mer abundance, (2) k-mer count, (3) cumulative '
                        'count, (4) fraction of total distinct k-mers.')
    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output zero-count bins')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite existing output_histogram_filename')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Continue even if specified input files '
                        'do not exist or are empty.')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    return parser
Example #28
def get_parser():
    epilog = """\
    Take a list of files containing sequences, and subsample 100,000
    sequences (:option:`-N`/:option:`--num_reads`) uniformly, using
    reservoir sampling.  Stop after first 100m sequences
    (:option:`-M`/:option:`--max_reads`). By default take one subsample,
    but take :option:`-S`/:option:`--samples` samples if specified.

    The output is placed in :option:`-o`/:option:`--output` <file>
    (for a single sample) or in ``<file>.subset.0`` to ``<file>.subset.S-1``
    (for more than one sample).

    This script uses the `reservoir sampling
    <http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.
    """

    parser = KhmerArgumentParser(
        description="Uniformly subsample sequences from a collection of files",
        epilog=textwrap.dedent(epilog))

    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-N', '--num_reads', type=int, dest='num_reads',
                        default=DEFAULT_NUM_READS,
                        help='number of sequences or pairs to sample')
    parser.add_argument('-M', '--max_reads', type=int, dest='max_reads',
                        default=DEFAULT_MAX_READS)
    parser.add_argument('-S', '--samples', type=int, dest='num_samples',
                        default=1)
    parser.add_argument('-R', '--random-seed', type=int, dest='random_seed',
                        help='Provide a random seed for the generator')
    parser.add_argument('--force_single', default=False, action='store_true',
                        help='Ignore read pair information if present')
    parser.add_argument('-o', '--output', dest='output_file',
                        type=argparse.FileType('wb'),
                        metavar="filename", default=None)
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser