Example 1
def get_parser():
    epilog = """\
    Note: with :option:`-b`/:option:`--no-bigcount` the output will be the
    exact size of the k-mer countgraph and this script will use a constant
    amount of memory. In exchange k-mer counts will stop at 255. The memory
    usage of this script with :option:`-b` will be about 1.15x the product of
    the :option:`-x` and :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out data/100k-filtered.fa
    """

    parser = build_counting_args(
        "Build a k-mer countgraph from the given"
        " sequences.",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('output_countgraph_filename',
                        help="The name of the"
                        " file to write the k-mer countgraph to.")
    parser.add_argument('input_sequence_filename',
                        nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help="The default behaviour is "
                        "to count past 255 using bigcount. This flag turns "
                        "bigcount off, limiting counts to 255.")
    parser.add_argument('--summary-info',
                        '-s',
                        type=str,
                        default=None,
                        metavar="FORMAT",
                        choices=['json', 'tsv'],
                        help="What format should the machine readable run "
                        "summary be in? (`json` or `tsv`, disabled by"
                        " default)")
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q',
                        '--quiet',
                        dest='quiet',
                        default=False,
                        action='store_true')
    return parser
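A quick check of the memory claim in the epilog: with -x 5e7 and the default four tables (-N 4), the -b footprint works out to about 1.15 * 5e7 * 4 = 2.3e8 bytes, roughly 230 MB. The sketch below is a hypothetical driver for this parser, not part of the original script; the attribute names max_tablesize and n_tables are assumptions based on the other snippets in this collection.

# Hypothetical driver for get_parser() above; a sketch, not the original main().
args = get_parser().parse_args(
    ['-k', '20', '-x', '5e7', 'out', 'data/100k-filtered.fa'])

# The epilog's estimate: ~1.15x the product of the -x and -N numbers.
# (attribute names assumed; khmer versions differ here)
expected_bytes = 1.15 * args.max_tablesize * args.n_tables
print('expected memory: ~%.0f MB' % (expected_bytes / 1e6))  # ~230 MB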
Example 2
def get_parser():
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the
    k-mer counting table and this script will use a constant amount of memory.
    In exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    Example::

        load_into_counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load_into_counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
    """

    parser = build_counting_args("Build a k-mer counting table from the given"
                                 " sequences.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('output_countingtable_filename', help="The name of the"
                        " file to write the k-mer counting table to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    return parser
Example 3
def get_parser():
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the
    k-mer counting table and this script will use a constant amount of memory.
    In exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
    """

    parser = build_counting_args("Build a k-mer counting table from the given"
                                 " sequences.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('output_countingtable_filename', help="The name of the"
                        " file to write the k-mer counting table to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    return parser
Example 4
def get_parser():
    epilog = """\
    Load a k-mer nodegraph/tagset pair created by
    :program:`load-graph.py`, and a set of pmap files created by
    :program:`partition-graph.py`. Go through each pmap file,
    select the largest partition in each, and do the same kind of traversal as
    in :program:`make-initial-stoptags.py` from each of the waypoints in that
    partition; this should identify all of the Highly Connected Kmers in that
    partition. These HCKs are output to ``<graphbase>.stoptags`` after each
    pmap file.

    Parameter choice is reasonably important. See the pipeline in
    :doc:`partitioning-big-data` for an example run.

    This script is not very scalable and may blow up memory and die horribly.
    You should be able to use the intermediate stoptags to restart the
    process, and if you eliminate the already-processed pmap files, you can
    continue where you left off.
    """
    parser = build_counting_args(
        descr="Find all highly connected k-mers.",
        epilog=textwrap.dedent(epilog),
        citations=['graph'])

    parser.add_argument('graphbase', help='Basename for the input and output '
                        'files.')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Continue past warnings')
    return parser
Example 5
def get_parser():
    epilog = """
    Loads a k-mer nodegraph/tagset pair created by load-graph.py, and does
    a small set of traversals from graph waypoints; on these traversals, looks
    for k-mers that are repeatedly traversed in high-density regions of the
    graph, i.e. are highly connected. Outputs those k-mers as an initial set of
    stoptags, which can be fed into partition-graph.py, find-knots.py, and
    filter-stoptags.py.

    The k-mer countgraph size parameters are for a k-mer countgraph used to
    keep track of repeatedly-traversed k-mers. The subset size option
    specifies the number of waypoints from which to traverse; for highly
    connected data sets, the default (1000) is probably ok.
    """
    parser = build_counting_args(
        descr="Find an initial set of highly connected k-mers.",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (default 1e4 is probably ok)')
    parser.add_argument('--stoptags', '-S', metavar='filename', default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('graphbase', help='basename for input and output '
                        'filenames')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example 6
def test_check_tablespace(graph_type, buckets_per_byte):
    oldstderr = sys.stderr
    sys.stderr = StringIO()

    outfile = utils.get_test_data('truncated.fq')
    parser = khmer_args.build_counting_args()
    args = parser.parse_args(['-M', '16G'])

    buckets_per_table = khmer_args.calculate_graphsize(args, graph_type)
    total_buckets = buckets_per_table * args.n_tables
    space_needed = total_buckets / buckets_per_byte

    # First, try with insufficient space
    with pytest.raises(SystemExit) as se:
        khmer.kfile.check_space_for_graph(outfile, space_needed, force=False,
                                          _testhook_free_space=10e9)
    assert 'ERROR: Not enough free space' in str(se)

    # Now, try with insufficient space, but in force mode
    khmer.kfile.check_space_for_graph(outfile, space_needed, force=True,
                                      _testhook_free_space=10e9)
    assert 'WARNING: Not enough free space' in sys.stderr.getvalue()

    # Finally, try with sufficient space
    sys.stderr = StringIO()
    khmer.kfile.check_space_for_graph(outfile, space_needed, force=False,
                                      _testhook_free_space=20e9)
    assert sys.stderr.getvalue() == ''
    sys.stderr = oldstderr
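The parametrize decorator feeding graph_type and buckets_per_byte is not shown in the snippet. A plausible reconstruction, assuming one 8-bit counter per countgraph bucket and eight 1-bit nodegraph buckets per byte (an inference, not a quote from the khmer test suite):

import pytest

# Assumed parametrization for the test above.
@pytest.mark.parametrize('graph_type,buckets_per_byte', [
    ('countgraph', 1),   # one byte per counter
    ('nodegraph', 8),    # Bloom-filter bits: eight buckets per byte
])
def test_check_tablespace(graph_type, buckets_per_byte):
    ...  # body as above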
Example 7
def get_parser():
    epilog = """
    Loads a k-mer presence table/tagset pair created by load-graph.py, and does
    a small set of traversals from graph waypoints; on these traversals, looks
    for k-mers that are repeatedly traversed in high-density regions of the
    graph, i.e. are highly connected. Outputs those k-mers as an initial set of
    stoptags, which can be fed into partition-graph.py, find-knots.py, and
    filter-stoptags.py.

    The k-mer counting table size parameters are for a k-mer counting table
    used to keep track of repeatedly-traversed k-mers. The subset size option
    specifies the number of waypoints from which to traverse; for highly
    connected data sets, the default (1000) is probably ok.
    """
    parser = build_counting_args(
        descr="Find an initial set of highly connected k-mers.",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('--subset-size',
                        '-s',
                        default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size',
                        type=float,
                        help='Set subset size (default 1e4 is probably ok)')
    parser.add_argument('--stoptags',
                        '-S',
                        metavar='filename',
                        default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('graphbase',
                        help='basename for input and output '
                        'filenames')
    return parser
Example 8
def get_parser():
    epilog = """
    Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savetable', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                        "k-mer counting table to")
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    return parser
Example 9
def get_parser():
    epilog = """
    The output is one file for each input file, <input file>.abundtrim, placed
    in the current directory.  This output contains the input sequences
    trimmed at low-abundance k-mers.

    The ``-V/--variable-coverage`` parameter will, if specified,
    prevent elimination of low-abundance reads by only trimming
    low-abundance k-mers from high-abundance reads; use this for
    non-genomic data sets that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use ``load-into-counting.py`` and ``filter-abund.py``.  However, read
    pairs will be kept together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff', '-C', type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--normalize-to', '-Z', type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('-o', '--output', metavar="output_filename",
                        type=argparse.FileType('wb'),
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadgraph_args(parser)
    parser.add_argument('-s', '--savegraph', metavar="filename", default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')
    add_output_compression_type(parser)

    return parser
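DEFAULT_CUTOFF and DEFAULT_NORMALIZE_LIMIT are module-level constants the snippet does not show. Placeholder definitions in the spirit of khmer's trim-low-abund.py; treat the exact values as assumptions:

# Assumed module-level defaults for the parser above.
DEFAULT_CUTOFF = 2            # trim k-mers seen fewer times than this
DEFAULT_NORMALIZE_LIMIT = 20  # median coverage treated as 'high abundance'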
Example 10
def get_parser():
    epilog = """\
    Load a k-mer nodegraph/tagset pair created by
    :program:`load-graph.py`, and a set of pmap files created by
    :program:`partition-graph.py`. Go through each pmap file,
    select the largest partition in each, and do the same kind of traversal as
    in :program:`make-initial-stoptags.py` from each of the waypoints in that
    partition; this should identify all of the Highly Connected Kmers in that
    partition. These HCKs are output to ``<graphbase>.stoptags`` after each
    pmap file.

    Parameter choice is reasonably important. See the pipeline in
    :doc:`partitioning-big-data` for an example run.

    This script is not very scalable and may blow up memory and die horribly.
    You should be able to use the intermediate stoptags to restart the
    process, and if you eliminate the already-processed pmap files, you can
    continue where you left off.
    """
    parser = build_counting_args(descr="Find all highly connected k-mers.",
                                 epilog=textwrap.dedent(epilog))

    parser.add_argument('graphbase',
                        help='Basename for the input and output '
                        'files.')
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Continue past warnings')
    return parser
Example 11
def get_parser():
    epilog = """
    Loads a k-mer presence table/tagset pair created by load-graph.py, and does
    a small set of traversals from graph waypoints; on these traversals, looks
    for k-mers that are repeatedly traversed in high-density regions of the
    graph, i.e. are highly connected. Outputs those k-mers as an initial set of
    stoptags, which can be fed into partition-graph.py, find-knots.py, and
    filter-stoptags.py.

    The k-mer counting table size parameters are for a k-mer counting table
    used to keep track of repeatedly-traversed k-mers. The subset size option
    specifies the number of waypoints from which to traverse; for highly
    connected data sets, the default (1000) is probably ok.
    """
    parser = build_counting_args(
        descr="Find an initial set of highly connected k-mers.", epilog=textwrap.dedent(epilog)
    )
    parser.add_argument(
        "--subset-size",
        "-s",
        default=DEFAULT_SUBSET_SIZE,
        dest="subset_size",
        type=float,
        help="Set subset size (default 1e4 is prob ok)",
    )
    parser.add_argument(
        "--stoptags", "-S", metavar="filename", default="", help="Use stoptags in this file during partitioning"
    )
    parser.add_argument("graphbase", help="basename for input and output " "filenames")
    return parser
Example 12
def get_parser():
    epilog = '''
    Note that with :option:`-b` this script is constant memory; in exchange,
    k-mer counts will stop at 255. The memory usage of this script with
    :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load_into_counting.py` and
    :program:`abundance_dist.py`.
    '''
    parser = build_counting_args(
        descr="Calculate the abundance distribution of k-mers from a "
        "single sequence file.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('input_sequence_filename', help='The name of the input'
                        ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename', help='The name of the '
                        'output histogram file. The columns are: (1) k-mer '
                        'abundance, (2) k-mer count, (3) cumulative count, '
                        '(4) fraction of total distinct k-mers.')
    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savetable', default='', metavar="filename",
                        help="Save the k-mer counting table to the specified "
                        "filename.")
    return parser
Example 13
def get_parser():
    epilog = """
    Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog)
    )
    add_threading_args(parser)

    parser.add_argument("--cutoff", "-C", default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.")
    parser.add_argument(
        "--savetable",
        metavar="filename",
        default="",
        help="If present, the name of the file to save the " "k-mer counting table to",
    )
    parser.add_argument("datafile", metavar="input_sequence_filename", help="FAST[AQ] sequence file to trim")

    return parser
Example 14
def get_parser():
    epilog = """
    The memory usage of this script with :option:`-b` will be about
    1.15x the product of the :option:`-x` and :option:`-N` numbers.

    Example::

        collect-reads.py -k 20 -x 5e7 out.ct data/100k-filtered.fa
    """

    parser = build_counting_args("Collect reads until a given avg coverage.",
                                 epilog=textwrap.dedent(epilog))
    parser.add_argument('output_countgraph_filename', help="The name of the"
                        " file to write the k-mer countgraph to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('-C', '--coverage', type=int, default=50,
                        help='Collect reads until this coverage, then exit.')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        help='Write collected reads into this file.')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    return parser
Example 15
def test_check_tablespace_nodegraph(graph_type, exp_buckets):
    parser = khmer_args.build_counting_args()
    args = parser.parse_args(['-M', '3G'])
    buckets_per_table = khmer_args.calculate_graphsize(args, graph_type)
    total_buckets = buckets_per_table * args.n_tables
    sizestr = '{:.1f} billion buckets'.format(float(total_buckets) / 1e9)
    assert sizestr == exp_buckets
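As with the other parametrized tests in this collection, the decorator is omitted. A skeleton of the assumed parametrization; the expected strings depend on calculate_graphsize, so they are left elided rather than guessed:

@pytest.mark.parametrize('graph_type,exp_buckets', [
    ('countgraph', '...'),  # expected size string for -M 3G
    ('nodegraph', '...'),
])
def test_check_tablespace_nodegraph(graph_type, exp_buckets):
    ...  # body as above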
Example 16
def get_parser():
    epilog = """
    Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savetable', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                        "k-mer counting table to")
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example 17
def get_parser():
    epilog = """
    The memory usage of this script with :option:`-b` will be about
    1.15x the product of the :option:`-x` and :option:`-N` numbers.

    Example::

        collect-reads.py -k 20 -x 5e7 out.ct data/100k-filtered.fa
    """

    parser = build_counting_args("Collect reads until a given avg coverage.", epilog=textwrap.dedent(epilog))
    parser.add_argument(
        "output_countingtable_filename", help="The name of the" " file to write the k-mer counting table to."
    )
    parser.add_argument(
        "input_sequence_filename", nargs="+", help="The names of one or more FAST[AQ] input " "sequence files."
    )
    parser.add_argument(
        "--report-total-kmers", "-t", action="store_true", help="Prints the total number of k-mers to stderr"
    )
    parser.add_argument("-C", "--coverage", type=int, default=50, help="Collect reads until this coverage, then exit.")
    parser.add_argument("-o", "--output", type=argparse.FileType("w"), help="Write collect reads into this file.")
    parser.add_argument(
        "-b", "--no-bigcount", dest="bigcount", default=True, action="store_false", help="Do not count k-mers past 255"
    )
    return parser
Example 18
def get_parser():
    epilog = '''\
    Note that with :option:`-b`/:option:`--no-bigcount` this script is constant
    memory; in exchange, k-mer counts will stop at 255. The memory usage of
    this script with :option:`-b` will be about 1.15x the product of the
    :option:`-x` and :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load_into_counting.py` and
    :program:`abundance_dist.py`.

    Example::

        abundance-dist-single.py -x 1e7 -N 2 -k 17 \\
                tests/test-data/test-abund-read-2.fa test-dist
    '''
    parser = build_counting_args(
        descr="Calculate the abundance distribution of k-mers from a "
        "single sequence file.",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('input_sequence_filename',
                        help='The name of the input'
                        ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename',
                        help='The name of the '
                        'output histogram file. The columns are: (1) k-mer '
                        'abundance, (2) k-mer count, (3) cumulative count, '
                        '(4) fraction of total distinct k-mers.')
    parser.add_argument('-z',
                        '--no-zero',
                        dest='output_zero',
                        default=True,
                        action='store_false',
                        help='Do not output zero-count bins')
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s',
                        '--squash',
                        dest='squash_output',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savegraph',
                        default='',
                        metavar="filename",
                        help="Save the k-mer countgraph to the specified "
                        "filename.")
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example 19
def get_parser():
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the
    k-mer counting table and this script will use a constant amount of memory.
    In exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.ct data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa
    """

    parser = build_counting_args(
        "Build a k-mer counting table from the given"
        " sequences.",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('output_countingtable_filename',
                        help="The name of the"
                        " file to write the k-mer counting table to.")
    parser.add_argument('input_sequence_filename',
                        nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('--summary-info',
                        '-s',
                        default=None,
                        metavar="FORMAT",
                        choices=['json', 'tsv'],
                        help="What format should the machine readable run "
                        "summary be in? (json or tsv, disabled by default)")
    parser.add_argument('--report-total-kmers',
                        '-t',
                        action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example 20
def get_parser():
    epilog = """
    Note that with :option:`-b` this script is constant memory; in exchange,
    k-mer counts will stop at 255. The memory usage of this script with
    :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load_into_counting.py` and
    :program:`abundance_dist.py`.
    """
    parser = build_counting_args(
        descr="Calculate the abundance distribution of k-mers from a " "single sequence file.",
        epilog=textwrap.dedent(epilog),
    )
    add_threading_args(parser)

    parser.add_argument("input_sequence_filename", help="The name of the input" " FAST[AQ] sequence file.")
    parser.add_argument(
        "output_histogram_filename",
        help="The name of the "
        "output histogram file. The columns are: (1) k-mer "
        "abundance, (2) k-mer count, (3) cumulative count, "
        "(4) fraction of total distinct k-mers.",
    )
    parser.add_argument(
        "-z", "--no-zero", dest="output_zero", default=True, action="store_false", help="Do not output 0-count bins"
    )
    parser.add_argument(
        "-b", "--no-bigcount", dest="bigcount", default=True, action="store_false", help="Do not count k-mers past 255"
    )
    parser.add_argument(
        "-s",
        "--squash",
        dest="squash_output",
        default=False,
        action="store_true",
        help="Overwrite output file if it exists",
    )
    parser.add_argument(
        "--csv",
        default=False,
        action="store_true",
        help="Use the CSV format for the histogram. " "Includes column headers.",
    )
    parser.add_argument(
        "--savetable",
        default="",
        metavar="filename",
        help="Save the k-mer counting table to the specified " "filename.",
    )
    parser.add_argument(
        "--report-total-kmers", "-t", action="store_true", help="Prints the total number of k-mers to stderr"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    return parser
Example 21
def get_parser():
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the
    k-mer counting table and this script will use a constant amount of memory.
    In exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.ct data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa
    """

    parser = build_counting_args(
        "Build a k-mer counting table from the given" " sequences.", epilog=textwrap.dedent(epilog)
    )
    add_threading_args(parser)
    parser.add_argument(
        "output_countingtable_filename", help="The name of the" " file to write the k-mer counting table to."
    )
    parser.add_argument(
        "input_sequence_filename", nargs="+", help="The names of one or more FAST[AQ] input " "sequence files."
    )
    parser.add_argument(
        "-b",
        "--no-bigcount",
        dest="bigcount",
        default=True,
        action="store_false",
        help="The default behaviour is "
        "to count past 255 using bigcount. This flag turns "
        "bigcount off, limiting counts to 255.",
    )
    parser.add_argument(
        "--summary-info",
        "-s",
        type=str,
        default=None,
        metavar="FORMAT",
        choices=[str("json"), str("tsv")],
        help="What format should the machine readable run " "summary be in? (json or tsv, disabled by default)",
    )
    parser.add_argument(
        "--report-total-kmers", "-t", action="store_true", help="Prints the total number of k-mers to stderr"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    return parser
Example 22
def get_parser():
    epilog = """
    The output is one file for each input file, <input file>.corr, placed
    in the current directory.  This output contains the input sequences,
    corrected at low-abundance k-mers.

    Note that the output reads will not necessarily be in the same
    order as the reads in the input files. However, read pairs will be
    kept together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        correct-reads.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Correct reads using a semi-streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff', '-C', type=int,
                        help='k-mers below this abundance are not trusted',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--normalize-to', '-Z', type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('-o', '--out', metavar="filename",
                        type=argparse.FileType('w'),
                        default=None, help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only correct sequences that have high coverage.')

    add_loadgraph_args(parser)
    parser.add_argument('-s', '--savegraph', metavar="filename", default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)

    return parser
Example 23
def main():
    parser = khmer_args.build_counting_args(
        "Correct reads against an already-computed table",
        citations=['counting', 'SeqAn'])

    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
    parser.add_argument('-o',
                        '--output',
                        dest='output_file',
                        help="output file for histogram; defaults to "
                        "<first filename>.corr in cwd.",
                        type=khFileType('w'),
                        default=None)

    parser.add_argument('counts_table')
    parser.add_argument('readfile')

    args = parser.parse_args()

    print('loading counts')
    ht = Countgraph.load(args.counts_table)

    aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    if not corrfp:
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')

    n_corrected = 0
    for n, read in enumerate(screed.open(args.readfile)):
        if n % 10000 == 0:
            print('...', n, n_corrected, file=sys.stderr)
        seq = read.sequence.replace('N', 'A')

        # build the alignment...
        score, graph_alignment, read_alignment, truncated = \
            aligner.align(seq)

        if not truncated:
            graph_seq = graph_alignment.replace("-", "")
            if graph_seq != seq:
                n_corrected += 1

            seq = graph_seq

        corrfp.write(output_single(read, seq))
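output_single is called above but defined elsewhere; in khmer's sandbox scripts it formats one (possibly corrected) read for writing. A minimal reconstruction under that assumption:

# Sketch of the helper assumed by the loop above: emit FASTQ when the read
# carries quality scores, FASTA otherwise.
def output_single(read, sequence):
    if hasattr(read, 'quality') and read.quality:
        # truncate the quality string to match a shortened sequence
        return '@{0}\n{1}\n+\n{2}\n'.format(
            read.name, sequence, read.quality[:len(sequence)])
    return '>{0}\n{1}\n'.format(read.name, sequence)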
Example 24
def get_parser():
    epilog = """
    Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt
    for each input sequence file. If the input sequences are from RNAseq or
    metagenome sequencing then :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 table.kh data/100k-filtered.fa
        filter-abund.py -C 2 table.kh data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr='Trim sequences at a minimum k-mer abundance.',
        epilog=textwrap.dedent(epilog))
    parser.add_argument('input_table',
                        metavar='input_presence_table_filename',
                        help='The input k-mer presence table filename')
    parser.add_argument('input_filename',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename',
                        nargs='+')
    add_threading_args(parser)
    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--variable-coverage',
                        '-V',
                        action='store_true',
                        dest='variable_coverage',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='Base the variable-coverage cutoff on this median'
                        ' k-mer abundance.',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-o',
                        '--out',
                        dest='single_output_filename',
                        default='',
                        metavar="optional_output_filename",
                        help='Output the trimmed sequences into a single file '
                        'with the given filename instead of creating a new '
                        'file for each input file.')
    return parser
Example 25
def get_parser():
    parser = build_counting_args(
        descr="Output abundances of the k-mers in the sequence file.")
    add_threading_args(parser)

    parser.add_argument('input_sequence_filename', help='The input'
                        ' FAST[AQ] sequence file.')

    parser.add_argument('-o', '--out', metavar="output_file",
                        dest='output_file',
                        type=argparse.FileType('w'),
                        default=None, help='output counts to this file')

    return parser
Example 26
def test_check_tablespace_force():
    save_stderr, sys.stderr = sys.stderr, io.StringIO()

    parser = khmer_args.build_counting_args()
    args = parser.parse_args(['-M', '1e9'])

    try:
        khmer.kfile.check_space_for_hashtable(args, 'countgraph', True,
                                              _testhook_free_space=0)
        assert True, "this should pass"
    except SystemExit as e:
        print(str(e))
    finally:
        sys.stderr = save_stderr
Example 27
def main():
    parser = khmer_args.build_counting_args(
        "Correct reads against an already-computed table",
        citations=['counting', 'SeqAn'])

    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
    parser.add_argument('-o', '--output', dest='output_file',
                        help="output file for histogram; defaults to "
                             "<first filename>.corr in cwd.",
                        type=khFileType('w'), default=None)

    parser.add_argument('counts_table')
    parser.add_argument('readfile')

    args = parser.parse_args()

    print('loading counts')
    ht = Countgraph.load(args.counts_table)

    aligner = khmer.ReadAligner(ht,
                                args.trusted_cov,
                                args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    if not corrfp:
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')

    n_corrected = 0
    for n, read in enumerate(screed.open(args.readfile)):
        if n % 10000 == 0:
            print('...', n, n_corrected, file=sys.stderr)
        seq = read.sequence.replace('N', 'A')

        # build the alignment...
        score, graph_alignment, read_alignment, truncated = \
            aligner.align(seq)

        if not truncated:
            graph_seq = graph_alignment.replace("-", "")
            if graph_seq != seq:
                n_corrected += 1

            seq = graph_seq

        corrfp.write(output_single(read, seq))
Example 28
def test_check_tablespace():
    outfile = utils.get_test_data('truncated.fq')
    save_stderr, sys.stderr = sys.stderr, io.StringIO()

    parser = khmer_args.build_counting_args()
    args = parser.parse_args(['-M', '1e9'])

    try:
        tablesize = khmer_args.calculate_graphsize(args, 'countgraph')
        khmer.kfile.check_space_for_graph(outfile, tablesize,
                                          False, _testhook_free_space=0)
        assert 0, "this should fail"
    except SystemExit as e:
        print(str(e))
    finally:
        sys.stderr = save_stderr
Example 29
def test_check_tablespace_force():
    save_stderr, sys.stderr = sys.stderr, io.StringIO()

    parser = khmer_args.build_counting_args()
    args = parser.parse_args(['-M', '1e9'])

    try:
        khmer.kfile.check_space_for_hashtable(args,
                                              'countgraph',
                                              True,
                                              _testhook_free_space=0)
        assert True, "this should pass"
    except SystemExit as e:
        print(str(e))
    finally:
        sys.stderr = save_stderr
Example 30
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).", epilog=textwrap.dedent(epilog),
        citations=['counting', 'SeqAn'])
    add_threading_args(parser)

    parser.add_argument('-C', '--cutoff', default=DEFAULT_CUTOFF,
                        type=check_argument_range(0, 256, "cutoff"),
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('-V', '--variable-coverage', action='store_true',
                        dest='variable_coverage', default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('-Z', '--normalize-to', type=int, dest='normalize_to',
                        help='Base the variable-coverage cutoff on this median'
                        ' k-mer abundance.',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('--savegraph', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                        "k-mer countgraph to")
    parser.add_argument('-o', '--outfile', metavar='optional_output_filename',
                        default=None, help='Override default output filename '
                        'and output trimmed sequences into a file with the '
                        'given filename.')
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    add_output_compression_type(parser)
    return parser
Example 31
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).", epilog=textwrap.dedent(epilog),
        citations=['counting', 'SeqAn'])
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF,
                        type=check_argument_range(0, 256, "cutoff"),
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        dest='variable_coverage', default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='Base the variable-coverage cutoff on this median'
                        ' k-mer abundance.',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('--savegraph', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                        "k-mer countgraph to")
    parser.add_argument('-o', '--outfile', metavar='optional_output_filename',
                        default=None, help='Override default output filename '
                        'and output trimmed sequences into a file with the '
                        'given filename.')
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    add_output_compression_type(parser)
    return parser
Example 32
def get_parser():
    epilog = """\
    Note: with :option:`-b`/:option:`--no-bigcount` the output will be the
    exact size of the k-mer countgraph and this script will use a constant
    amount of memory. In exchange k-mer counts will stop at 255. The memory
    usage of this script with :option:`-b` will be about 1.15x the product of
    the :option:`-x` and :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out data/100k-filtered.fa
    """

    parser = build_counting_args("Build a k-mer countgraph from the given"
                                 " sequences.", epilog=textwrap.dedent(epilog),
                                 citations=['counting', 'SeqAn'])
    add_threading_args(parser)
    parser.add_argument('output_countgraph_filename', help="The name of the"
                        " file to write the k-mer countgraph to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false', help="The default behaviour is "
                        "to count past 255 using bigcount. This flag turns "
                        "bigcount off, limiting counts to 255.")
    parser.add_argument('-s', '--summary-info', type=str, default=None,
                        metavar="FORMAT", choices=[str('json'), str('tsv')],
                        help="What format should the machine readable run "
                        "summary be in? (`json` or `tsv`, disabled by"
                        " default)")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    return parser
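The memory estimate quoted in the epilog is simple arithmetic. A quick back-of-the-envelope check for the example command (assuming :option:`-N` defaults to 4 tables; adjust for your actual settings)::

    # Rough estimate from the epilog: ~1.15x the product of -x and -N.
    x = 5e7   # -x: bytes per table, from the example command
    N = 4     # -N: number of tables (assumed default, for illustration)
    print('approx. memory: %.2f GB' % (1.15 * x * N / 1e9))  # ~0.23 GB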
Example No. 38
def main():
    parser = build_counting_args()
    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht, file=sys.stderr)

    print('loading hashtable', file=sys.stderr)
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    # counting hash, trusted k-mer coverage cutoff, and bits theta (threshold
    # for terminating unproductive alignments)
    aligner = khmer.new_readaligner(ht, args.trusted_cov, args.theta)

    # the filtering loop
    for infile in infiles:
        print('aligning', infile, file=sys.stderr)
        for n, record in enumerate(screed.open(infile)):

            name = record['name']
            seq = record['sequence'].upper()
            print(name, file=sys.stderr)
            print(seq, file=sys.stderr)

            score, graph_alignment, read_alignment, truncated = \
                aligner.align(seq)
            print(score, file=sys.stderr)
            print(graph_alignment, file=sys.stderr)
            print(read_alignment, file=sys.stderr)
            print(truncated, file=sys.stderr)
            print(">{0}\n{1}".format(name, graph_alignment))
Example No. 39
def main():
    parser = build_counting_args()
    parser.add_argument('--coverage',
                        '-C',
                        dest='coverage',
                        default=DEFAULT_COVERAGE,
                        type=int)
    # positional arguments consumed below: the saved counting table and the
    # sequence files to filter
    parser.add_argument('input_table')
    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)

    print('loading hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile)
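The keep/discard rule in process_fn above is probabilistic: random.randint(1, med) is uniform on [1, med], so a read whose median k-mer count is med survives with probability min(1, coverage / med), downsampling high-coverage reads roughly in proportion to their excess coverage. A minimal sketch of that rule in isolation (with a guard for med == 0, which the excerpt does not handle)::

    import random

    def keep_read(med, coverage):
        # Kept with probability min(1, coverage / med); the max() guard
        # avoids random.randint(1, 0) raising ValueError on zero-count reads.
        return random.randint(1, max(med, 1)) <= coverage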
Example No. 40
def main():

    parser = build_counting_args()
    parser.add_argument('--min-abundance', default=50, type=int)
    parser.add_argument('input_files', nargs='+')
    parser.add_argument('-o', '--out', type=argparse.FileType('w'),
                        default=sys.stdout)
    args = parser.parse_args()

    countgraph = khmer_args.create_countgraph(args, multiplier=1.1)

    count = 0
    for fn in args.input_files:
        short = os.path.basename(fn)
        for n, record in enumerate(screed.open(fn)):
            if n % 100000 == 0:
                print('Processed {n} reads...'.format(n=n), file=sys.stderr)
            countgraph.consume(record.sequence)
            if countgraph.median_at_least(record.sequence, args.min_abundance):
                args.out.write('>{fn}:{name}:{c}\n{seq}\n'.format(
                    fn=short, c=count, name=record.name, seq=record.sequence))
                count += 1
Example No. 41
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff',
                        '-C',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savegraph',
                        metavar="filename",
                        default='',
                        help="If present, the name of the file to save the "
                        "k-mer countgraph to")
    parser.add_argument('datafile',
                        metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Example No. 42
def get_parser():
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the
    k-mer counting table and this script will use a constant amount of memory.
    In exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
    """

    parser = build_counting_args("Build a k-mer counting table from the given"
                                 " sequences.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('output_countingtable_filename', help="The name of the"
                        " file to write the k-mer counting table to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('--summary-info', '-s', default=None, metavar="FORMAT",
                        choices=['json', 'tsv'],
                        help="What format should the machine readable run "
                        "summary be in? (json or tsv, disabled by default)")
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
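The -b/--no-bigcount option above relies on argparse's store_false pattern: dest='bigcount' defaults to True and passing the flag flips it to False. A minimal standalone demonstration of the pattern::

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--no-bigcount', dest='bigcount',
                        default=True, action='store_false')

    print(parser.parse_args([]).bigcount)      # True: bigcount on by default
    print(parser.parse_args(['-b']).bigcount)  # False: the flag turns it off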
Example No. 43
def main():
    parser = build_counting_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    # positional arguments consumed below: the saved counting table and the
    # sequence files to filter
    parser.add_argument('input_table')
    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)

    print('loading hashtable')
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile)
Example No. 44
def main():
    parser = build_counting_args()
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int, default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht, file=sys.stderr)

    print('loading hashtable', file=sys.stderr)
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    # counting hash, trusted k-mer coverage cutoff, and bits theta (threshold
    # for terminating unproductive alignments)
    aligner = khmer.new_readaligner(ht, args.trusted_cov, args.theta)

    # the filtering loop
    for infile in infiles:
        print('aligning', infile, file=sys.stderr)
        for n, record in enumerate(screed.open(infile)):

            name = record['name']
            seq = record['sequence'].upper()
            print(name, file=sys.stderr)
            print(seq, file=sys.stderr)

            score, graph_alignment, read_alignment, truncated = \
                aligner.align(seq)
            print(score, file=sys.stderr)
            print(graph_alignment, file=sys.stderr)
            print(read_alignment, file=sys.stderr)
            print(truncated, file=sys.stderr)
            print(">{0}\n{1}".format(name, graph_alignment))
Example No. 45
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savegraph', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                        "k-mer countgraph to")
    parser.add_argument('-o', '--outfile', metavar='optional_output_filename',
                        default=None, help='Override default output filename '
                        'and output trimmed sequences into a file with the '
                        'given filename.')
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    add_output_compression_type(parser)
    return parser
Example No. 46
def get_parser():
    epilog = """
    Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt
    for each input sequence file. If the input sequences are from RNAseq or
    metagenome sequencing then :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 table.kh data/100k-filtered.fa
        filter-abund.py -C 2 table.kh data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr='Trim sequences at a minimum k-mer abundance.',
        epilog=textwrap.dedent(epilog))
    parser.add_argument('input_table', metavar='input_presence_table_filename',
                        help='The input k-mer presence table filename')
    parser.add_argument('input_filename', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename', nargs='+')
    add_threading_args(parser)
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        dest='variable_coverage', default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='Base the variable-coverage cutoff on this median'
                        ' k-mer abundance.',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-o', '--out', dest='single_output_filename',
                        default='', metavar="optional_output_filename",
                        help='Output the trimmed sequences into a single file '
                        'with the given filename instead of creating a new '
                        'file for each input file.')
    return parser
Example No. 47
def main():
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('-o',
                   type=argparse.FileType('w'),
                   default='assembly-stats.csv')
    args = p.parse_args()

    cg = create_countgraph(args)

    kept = 0
    hdn = khmer.HashSet(args.ksize)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()
    statswriter = csv.DictWriter(args.o,
                                 delimiter=',',
                                 fieldnames=[
                                     'read_n', 'action', 'cov', 'n_hdn',
                                     'contig_n', 'orf_n', 'new'
                                 ])

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < args.ksize:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
            if cov < 20:
                kept += 1
                cg.consume(record.sequence)
                statswriter.writerow({
                    'read_n': n,
                    'action': 'c',
                    'cov': cov,
                    'n_hdn': None,
                    'contig_n': None,
                    'orf_n': None,
                    'new': None
                })
            elif cov < 30:
                #print('intermediate', next_label, file=sys.stderr)
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < args.ksize:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
                statswriter.writerow({
                    'read_n': n,
                    'action': 'l',
                    'cov': cov,
                    'n_hdn': len(hdn),
                    'contig_n': None,
                    'orf_n': None,
                    'new': None
                })
            elif cov == 30:
                contigs = lh.assemble_labeled_path(
                    record.sequence[:args.ksize])
                for contig_n, contig in enumerate(contigs):
                    statswriter.writerow({
                        'read_n': n,
                        'action': 'a',
                        'cov': cov,
                        'n_hdn': None,
                        'contig_n': contig_n,
                        'orf_n': None,
                        'new': None
                    })
                    for t in translate(contig):
                        for orf_n, o in enumerate(extract_orfs(t)):
                            if hash(o) not in output:
                                new = True
                                output.add(hash(o))
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1
                            else:
                                new = False
                            statswriter.writerow({
                                'read_n': n,
                                'action': 'a',
                                'cov': cov,
                                'n_hdn': None,
                                'contig_n': contig_n,
                                'orf_n': orf_n,
                                'new': new
                            })
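The main loop above is a three-band coverage gate: low-coverage reads (below 20) only feed the countgraph, mid-coverage reads (below 30) are trimmed, consumed, and labeled across high-degree nodes, and a read arriving exactly at coverage 30 triggers assembly of the labeled paths it touches. Distilled to its control flow (the thresholds 20 and 30 are hard-coded in the example, not parameters)::

    def dispatch(cov, consume, label, assemble):
        if cov < 20:
            consume()     # still accumulating counts
        elif cov < 30:
            label()       # trim, consume, and label across high-degree nodes
        elif cov == 30:
            assemble()    # first read to reach saturation triggers assembly
        # cov > 30: the read is ignored; its region was already assembled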
Example No. 48
def get_parser():
    epilog = """\
    The output is one file for each input file, ``<input file>.abundtrim``,
    placed in the current directory.  This output contains the input sequences
    trimmed at low-abundance k-mers.

    The :option:`-V`/:option:`--variable-coverage` parameter will, if
    specified, prevent elimination of low-abundance reads by only trimming
    low-abundance k-mers from high-abundance reads; use this for
    non-genomic data sets that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use :program:`load-into-counting.py` and :program:`filter-abund.py`.
    However, read pairs will be kept together, in "broken-paired" format; you
    can use :program:`extract-paired-reads.py` to extract read pairs and
    orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff',
                        '-C',
                        type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--trim-at-coverage',
                        '-Z',
                        '--normalize-to',
                        type=int,
                        help='trim reads when entire read above this coverage',
                        default=DEFAULT_TRIM_AT_COVERAGE)

    parser.add_argument('-o',
                        '--output',
                        metavar="output_filename",
                        type=argparse.FileType('wb'),
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage',
                        '-V',
                        action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadgraph_args(parser)
    parser.add_argument('-s',
                        '--savegraph',
                        metavar="filename",
                        default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    parser.add_argument('-q',
                        '--quiet',
                        dest='quiet',
                        default=False,
                        action='store_true')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir',
                        '-T',
                        type=str,
                        default='./',
                        help="Set location of temporary directory for "
                        "second pass")
    add_output_compression_type(parser)

    parser.add_argument('--diginorm',
                        default=False,
                        action='store_true',
                        help="Eliminate high-coverage reads altogether "
                        "(digital normalization).")
    parser.add_argument('--diginorm-coverage',
                        type=int,
                        default=DEFAULT_DIGINORM_COVERAGE,
                        help="Coverage threshold for --diginorm")
    parser.add_argument('--single-pass',
                        default=False,
                        action='store_true',
                        help="Do not do a second pass across the low coverage "
                        "data")

    return parser
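Per the epilog, -V/--variable-coverage prevents the wholesale elimination of low-abundance reads by only trimming reads that are themselves high coverage. The decision it describes reduces to something like the following sketch (an illustration of the documented rule, not the script's actual internals)::

    def should_trim(med, args):
        # Without -V every read is eligible for trimming; with -V, only
        # reads whose median k-mer abundance reaches the
        # -Z/--trim-at-coverage level are touched.
        if not args.variable_coverage:
            return True
        return med >= args.trim_at_coverage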
Example No. 49
def main():
    parser = build_counting_args()
    parser.add_argument("-t",
                        "--trusted-cutoff",
                        dest="trusted_cutoff",
                        type=int,
                        default=3)
    parser.add_argument("--bits-theta", dest="bits_theta", type=float,
                        default=1.0,
                        help="Tuning parameter controlling trade-off of "
                        "speed vs alignment sensitivity")
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_hashes,
              file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize,
              file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize)' %
              (args.n_hashes * args.min_hashsize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print('... kept', total - discarded, 'of', total, ', or',
                      int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            # align the read against the countgraph
            score, graph_alignment, read_alignment, truncated = aligner.align(
                record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                # disabled alternative: simply strip gap characters from the
                # graph alignment
                if False:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                #if mincount < DESIRED_COVERAGE:
                #    keep = True
                #    seq = graph_seq
                #else:
                #    assert not keep

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread:      {6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n"
                    .format(score, graph_alignment, read_alignment, truncated,
                            keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        print('DONE with', input_filename, '; kept', total - discarded, 'of',
              total, 'or', int(100. - discarded / float(total) * 100.), '%')
        print('output in', output_name)

    if args.savehash:
        print('Saving hashfile through', input_filename)
        print('...saving to', args.savehash)
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print('fp rate estimated to be %1.3f' % fp_rate)

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
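The closing sanity check compares the structure's estimated false-positive rate against a hard 20% ceiling. For a counting structure with N independent tables, one common approximation multiplies the per-table occupancies together; a hedged sketch (khmer's own calc_expected_collisions may differ in detail)::

    def approx_fp_rate(n_unique_kmers, table_size, n_tables):
        # A k-mer "collides" only if it collides in every table, so a rough
        # estimate is the per-table occupancy raised to the number of
        # tables. An approximation, not khmer's exact formula.
        occupancy = min(n_unique_kmers / float(table_size), 1.0)
        return occupancy ** n_tables

    print(approx_fp_rate(4e7, 5e7, 4))  # ~0.41: table too small, bail out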
Example No. 50
def main():
    parser = build_counting_args()
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('-R',
                        '--report-to-file',
                        dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE and not args.loadhash:
            print("** WARNING: hashsize is default!  You definitely want to "
                  "increase this!\n** Please read the docs!", file=sys.stderr)

        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_hashes,
              file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize,
              file=sys.stderr)
        print(' - paired =       %s \t\t(-p)' % args.paired, file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize)' %
              (args.n_hashes * args.min_hashsize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff
    report_fp = args.report_file
    filenames = args.input_filenames

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepmedpct'
        outfp = open(output_name, 'w')

        n = -1
        for n, batch in enumerate(
                batchwise(screed.open(input_filename), batch_size)):
            if n > 0 and n % 100000 == 0:
                print('... kept', total - discarded, 'of', total, ', or',
                      int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

                if report_fp:
                    print(total, total - discarded,
                          1. - (discarded / float(total)), file=report_fp)
                    report_fp.flush()

            total += batch_size

            # If in paired mode, check that the reads are properly interleaved
            if args.paired:
                if not validpair(batch[0], batch[1]):
                    print('Error: Improperly interleaved pairs %s %s' %
                          (batch[0].name, batch[1].name), file=sys.stderr)
                    sys.exit(-1)

            # Emit the batch of reads if any read passes the filter
            # and all reads are longer than K
            passed_filter = False
            passed_length = True
            for record in batch:
                if len(record.sequence) < K:
                    passed_length = False
                    continue

                seq = record.sequence.replace('N', 'A')
                med, avg, dev = ht.get_median_count(seq)

                pct = 0.
                if avg:
                    pct = dev / avg * 100

                if med < DESIRED_COVERAGE and pct < 100:
                    ht.consume(seq)
                    passed_filter = True

            # Emit records if any passed
            if passed_length and passed_filter:
                for record in batch:
                    if hasattr(record, 'quality'):
                        outfp.write(
                            '@%s\n%s\n+\n%s\n' %
                            (record.name, record.sequence, record.quality))
                    else:
                        outfp.write('>%s\n%s\n' %
                                    (record.name, record.sequence))
            else:
                discarded += batch_size

        if -1 < n:
            print('DONE with', input_filename, '; kept', total - discarded,
                  'of', total, 'or',
                  int(100. - discarded / float(total) * 100.), '%')
            print('output in', output_name)
        else:
            print('SKIPPED empty file', input_filename)

    if args.savehash:
        print('Saving hashfile through', input_filename)
        print('...saving to', args.savehash)
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print('fp rate estimated to be %1.3f' % fp_rate)

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
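batchwise and validpair are helpers defined elsewhere in the original script. Assuming batchwise simply yields fixed-size tuples of successive records (with size=2 pairing up interleaved paired-end reads), a plausible implementation is::

    from itertools import zip_longest

    def batchwise(iterable, size):
        # Yield successive `size`-tuples; trailing orphans are padded with
        # None. An assumed implementation of the undefined helper above.
        args = [iter(iterable)] * size
        return zip_longest(*args)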
Example No. 51
def main():
    parser = build_counting_args()
    parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff", type=int, default=3)
    parser.add_argument("--bits-theta", help="Tuning parameter controlling trade off of speed vs alignment sensitivity", default=1.0, type=float, dest="bits_theta")
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_hashes,
              file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize,
              file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize)' %
              (args.n_hashes * args.min_hashsize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print('... kept', total - discarded, 'of', total, ', or',
                      int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            # align the read against the countgraph
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                # disabled alternative: simply strip gap characters from the
                # graph alignment
                if False:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                #if mincount < DESIRED_COVERAGE:
                #    keep = True
                #    seq = graph_seq
                #else:
                #    assert not keep

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread:      {6}\n"
                    "graph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n".format(
                        score, graph_alignment, read_alignment, truncated,
                        keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        print('DONE with', input_filename, '; kept', total - discarded, 'of',
              total, 'or', int(100. - discarded / float(total) * 100.), '%')
        print('output in', output_name)

    if args.savehash:
        print('Saving hashfile through', input_filename)
        print('...saving to', args.savehash)
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print('fp rate estimated to be %1.3f' % fp_rate)

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example No. 52
def get_parser():
    epilog = ("""
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    Paired end reads will be considered together if :option:`-p` is set. If
    either read will be kept, then both will be kept. This should result in
    keeping (or discarding) each sequencing fragment. This helps with retention
    of repeats, especially.

    With :option:`-s`/:option:`--savetable`, the k-mer counting table
    will be saved to the specified file after all sequences have been
    processed. With :option:`-d`, the k-mer counting table will be
    saved every d files for multifile runs; if :option:`-s` is set,
    the specified name will be used, and if not, the name `backup.ct`
    will be used.  :option:`-l`/:option:`--loadtable` will load the
    specified k-mer counting table before processing the specified
    files.  Note that these tables are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    :option:`-f`/:option:`--fault-tolerant` will force the program to continue
    upon encountering a formatting error in a sequence file; the k-mer counting
    table up to that point will be dumped, and processing will continue on the
    next file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

""" "        normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
    """

    Example::

""" "        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
    """

    Example::

""" "        normalize-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads")   # noqa
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('-C', '--cutoff', type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savetable', metavar="filename", default='')
    parser.add_argument('-R', '--report',
                        metavar='filename', type=argparse.FileType('w'))
    parser.add_argument('-f', '--fault-tolerant', dest='force',
                        help='continue on next file if read errors are \
                         encountered', action='store_true')
    parser.add_argument('--save-on-failure', dest='fail_save',
                        action='store_false', default=True,
                        help='Save k-mer counting table when an error occurs')
    parser.add_argument('-d', '--dump-frequency', dest='dump_frequency',
                        type=int, help='dump k-mer counting table every d '
                        'files', default=-1)
    parser.add_argument('-o', '--out', metavar="filename",
                        dest='single_output_filename',
                        default='', help='only output a single'
                        ' file with the specified filename')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers"
                        " post-normalization to stderr")
    parser.add_argument('--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_loadhash_args(parser)
    return parser
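At its core, the digital normalization described in the epilog is a deterministic median test: a read is kept, and its k-mers are counted, only while its median k-mer abundance is still below the cutoff. A minimal sketch of that rule, assuming a loaded counting table ht::

    def diginorm_keep(ht, sequence, cutoff):
        # Keep (and count) a read only while its median k-mer abundance is
        # still below the desired coverage.
        med, _, _ = ht.get_median_count(sequence)
        if med < cutoff:
            ht.consume(sequence)
            return True
        return False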
Example No. 53
def get_parser():
    epilog = (
        """
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    Paired end reads will be considered together if :option:`-p` is set. If
    either read will be kept, then both will be kept. This should result in
    keeping (or discarding) each sequencing fragment. This helps with retention
    of repeats, especially.

    With :option:`-s`/:option:`--savetable`, the k-mer counting table
    will be saved to the specified file after all sequences have been
    processed. With :option:`-d`, the k-mer counting table will be
    saved every d files for multifile runs; if :option:`-s` is set,
    the specified name will be used, and if not, the name `backup.ct`
    will be used.  :option:`-l`/:option:`--loadtable` will load the
    specified k-mer counting table before processing the specified
    files.  Note that these tables are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    :option:`-f`/:option:`--fault-tolerant` will force the program to continue
    upon encountering a formatting error in a sequence file; the k-mer counting
    table up to that point will be dumped, and processing will continue on the
    next file.

    Example::

        saturate-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

"""
        "        saturate-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
        """

    Example::

"""
        "        saturate-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
        """

    Example::

"""
        "        saturate-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads"
    )  # noqa
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savetable', metavar="filename", default='')
    parser.add_argument('-R',
                        '--report',
                        metavar='filename',
                        type=argparse.FileType('w'))
    parser.add_argument('--report-frequency',
                        metavar='report_frequency',
                        default=100000,
                        type=int)
    parser.add_argument('-f',
                        '--fault-tolerant',
                        dest='force',
                        help='continue on next file if read errors are \
                         encountered',
                        action='store_true')
    parser.add_argument('-o',
                        '--out',
                        metavar="filename",
                        dest='single_output_filename',
                        default='',
                        help='only output a single'
                        ' file with the specified filename')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.',
                        nargs='+')
    add_loadhash_args(parser)
    return parser
Example No. 54
def main():
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('--prefix', default='transcriptome')
    args = p.parse_args()

    cg = create_countgraph(args)
    asm = khmer.JunctionCountAssembler(cg)

    tr_fn = '{0}.transcripts.fa'.format(args.prefix)
    orf_fn = '{0}.orfs.fa'.format(args.prefix)
    stats_fn = '{0}.stats.fa'.format(args.prefix)

    with open(tr_fn, 'w') as tr_fp,\
         open(orf_fn, 'w') as orf_fp,\
         open(stats_fn, 'w') as stats_fp:

        kept = 0
        next_contig = 1
        next_orf = 1
        output = set()
        statswriter = csv.DictWriter(
            stats_fp,
            delimiter=',',
            fieldnames=['read_n', 'action', 'cov', 'n_junctions', 'contig_n'])

        for filename in args.fastq_files:
            for n, record in enumerate(screed.open(filename)):
                if n and n % 10000 == 0:
                    print('...', n, file=sys.stderr)

                if len(record.sequence) < args.ksize:
                    continue

                cov, _, _ = cg.get_median_count(record.sequence)
                if cov < 20:
                    kept += 1
                    cg.consume(record.sequence)
                    statswriter.writerow({
                        'read_n': n,
                        'action': 'c',
                        'cov': cov,
                        'n_junctions': None,
                        'contig_n': None
                    })
                elif cov < 30:
                    seq, pos = cg.trim_on_abundance(record.sequence, 3)
                    if len(seq) < args.ksize:
                        continue

                    n_junctions = asm.consume(seq)
                    statswriter.writerow({
                        'read_n': n,
                        'action': 't',
                        'cov': cov,
                        'n_junctions': n_junctions,
                        'contig_n': None
                    })
                elif cov == 30:
                    contigs = asm.assemble(record.sequence[:args.ksize])
                    for contig_n, contig in enumerate(contigs):
                        statswriter.writerow({
                            'read_n': n,
                            'action': 'a',
                            'cov': cov,
                            'n_junctions': None,
                            'contig_n': (next_contig, contig_n)
                        })
                        tr_fp.write('>contig%d\n%s\n' % (next_contig, contig))
                        next_contig += 1

                        for t in translate(contig):
                            for orf_n, o in enumerate(extract_orfs(t)):
                                if hash(o) not in output:
                                    new = True
                                    output.add(hash(o))
                                    orf_fp.write('>orf%d\n%s\n' %
                                                 (next_orf, o))
                                    next_orf += 1
                                else:
                                    new = False
                else:
                    statswriter.writerow({
                        'read_n': n,
                        'action': 's',
                        'cov': cov,
                        'n_junctions': None,
                        'contig_n': None
                    })
Example No. 55
def get_parser():
    epilog = (
        """
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force-single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savetable`, the k-mer counting table
    will be saved to the specified file after all sequences have been
    processed. With :option:`-d`, the k-mer counting table will be
    saved every d files for multifile runs; if :option:`-s` is set,
    the specified name will be used, and if not, the name `backup.ct`
    will be used.  :option:`-l`/:option:`--loadtable` will load the
    specified k-mer counting table before processing the specified
    files.  Note that these tables are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--out -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

"""
        "        normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq >> appended-output.fq"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq"  # noqa
        """

    Example::

"""
        "        normalize-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads"
    )  # noqa
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p',
                        '--paired',
                        action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force-single',
                        dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u',
                        '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-s',
                        '--savetable',
                        metavar="filename",
                        default='',
                        help='save the k-mer counting table to disk after '
                        'all reads are loaded.')
    parser.add_argument('-R',
                        '--report',
                        metavar='filename',
                        type=argparse.FileType('w'))
    parser.add_argument('-f',
                        '--force',
                        dest='force',
                        help='continue on next file if read errors are \
                         encountered',
                        action='store_true')
    parser.add_argument('-o',
                        '--out',
                        metavar="filename",
                        dest='single_output_file',
                        type=argparse.FileType('w'),
                        default=None,
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.',
                        nargs='+')
    add_loadhash_args(parser)
    return parser
Example No. 56
def get_parser():
    epilog = """\
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force_single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph
    will be saved to the specified file after all sequences have been
    processed. :option:`-l`/:option:`--loadgraph` will load the
    specified k-mer countgraph before processing the specified
    files.  Note that these graphs are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--output -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        normalize-by-median.py -p -k 17 \\
        tests/test-data/test-abund-read-paired.fa

    Example::

        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq \\
        >> appended-output.fq

    Example::

        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq \\
        tests/test-data/test-fastq-reads.fq

    Example::

        normalize-by-median.py -k 17 -s test.ct \\
        tests/test-data/test-abund-read-2.fa \\
        tests/test-data/test-fastq-reads.fq"""
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog),
        citations=['diginorm'])
    parser.add_argument('-q',
                        '--quiet',
                        dest='quiet',
                        default=False,
                        action='store_true')
    parser.add_argument('-p',
                        '--paired',
                        action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single',
                        dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u',
                        '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-s',
                        '--savegraph',
                        metavar="filename",
                        default=None,
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    parser.add_argument('-R',
                        '--report',
                        help='write progress report to report_filename',
                        metavar='report_filename',
                        type=argparse.FileType('w'))
    parser.add_argument('--report-frequency',
                        metavar='report_frequency',
                        type=int,
                        default=100000,
                        help='report progress every report_frequency reads')
    parser.add_argument('-f',
                        '--force',
                        dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    parser.add_argument('-o',
                        '--output',
                        metavar="filename",
                        type=khFileType('wb'),
                        default=None,
                        dest='single_output_file',
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.',
                        nargs='+')
    add_loadgraph_args(parser)
    parser.add_argument('-z',
                        '--loadgraph2',
                        metavar="filename",
                        default=None,
                        help='load a second k-mer graph')
    add_output_compression_type(parser)
    return parser
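check_argument_range, used for -C in the next example, is not shown in these excerpts; it evidently returns an argparse type callable that rejects out-of-range values (here keeping the cutoff within a one-byte counter). One way such a factory could look (an assumption, not khmer's actual implementation)::

    import argparse

    def check_argument_range(low, high, name):
        # Return an argparse `type` that parses an int and rejects values
        # outside [low, high). Assumed behaviour of the undefined helper.
        def check(value):
            ivalue = int(value)
            if not low <= ivalue < high:
                raise argparse.ArgumentTypeError(
                    '%s must be in the range [%d, %d)' % (name, low, high))
            return ivalue
        return check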
Example No. 57
def get_parser():
    epilog = """\
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force_single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph
    will be saved to the specified file after all sequences have been
    processed. :option:`-l`/:option:`--loadgraph` will load the
    specified k-mer countgraph before processing the specified
    files.  Note that these graphs are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--output -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        normalize-by-median.py -p -k 17 \\
        tests/test-data/test-abund-read-paired.fa

    Example::

        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq \\
        >> appended-output.fq

    Example::

        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq \\
        tests/test-data/test-fastq-reads.fq

    Example::

        normalize-by-median.py -k 17 -s test.ct \\
        tests/test-data/test-abund-read-2.fa \\
        tests/test-data/test-fastq-reads.fq"""
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog),
        citations=['diginorm'])
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    parser.add_argument('-C', '--cutoff', help="when the median "
                        "k-mer coverage level is above this number the "
                        "read is not kept.",
                        type=check_argument_range(0, 256, "cutoff"),
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-s', '--savegraph', metavar="filename", default=None,
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    parser.add_argument('-R', '--report',
                        help='write progress report to report_filename',
                        metavar='report_filename', type=argparse.FileType('w'))
    parser.add_argument('--report-frequency',
                        metavar='report_frequency', type=int, default=100000,
                        help='report progress every report_frequency reads')
    parser.add_argument('-f', '--force', dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=khFileType('wb'),
                        default=None, dest='single_output_file',
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    add_loadgraph_args(parser)
    add_output_compression_type(parser)
    return parser
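
The ``-C``/``--cutoff`` option above validates its value through
``check_argument_range(0, 256, "cutoff")``. A plausible sketch of such a
range-checking argparse ``type`` factory (an illustration under assumed
semantics, not khmer's actual implementation):

import argparse

def check_argument_range(low, high, name):
    """Return an argparse ``type`` callable accepting ints in [low, high)."""
    def check(value):
        value = int(value)
        if not low <= value < high:
            raise argparse.ArgumentTypeError(
                '%s must be in the range [%d, %d)' % (name, low, high))
        return value
    return check
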
Example No. 58
def main():
    parser = build_counting_args()
    # `args.quiet` and `args.force` are consulted later in this function but
    # no such flags are defined in this snippet; add them here, mirroring
    # the other get_parser examples in this collection.
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    parser.add_argument('-f', '--force', dest='force', default=False,
                        action='store_true')
    parser.add_argument("-t",
                        "--trusted-cutoff",
                        dest="trusted_cutoff",
                        type=int,
                        default=3)
    parser.add_argument("--bits-theta",
                        help="Tuning parameter controlling"
                        "trade off of speed vs alignment sensitivity",
                        default=1.0,
                        type=float,
                        dest="bits_theta")
    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base cutoff on abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_tables, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.max_tablesize,
              file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize)' %
              (args.n_tables * args.max_tablesize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    filenames = args.input_filenames

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_countgraph(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.Countgraph(K, HT_SIZE, N_HT)

    aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepvar'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print('... kept', total - discarded, 'of', total, ', or',
                      int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            ##

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(record.sequence)
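            # `aligner.align` is assumed to return the alignment score, the
            # graph-side and read-side aligned strings (with "-" marking
            # gaps, as the .replace("-", "") below relies on), and a flag
            # for truncated alignments.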

            # next, decide whether or not to keep it.
            keep = False
            if truncated:
                keep = True  # keep all truncated alignments - why?
            else:

                # build a better sequence -- this is the corrected one.
                graph_seq = graph_alignment.replace("-", "")
                # Alternative (left commented out): fill gaps in the graph
                # alignment with the corresponding read bases instead:
                #graph_seq = ""
                #for i in range(len(graph_alignment)):
                #    if graph_alignment[i] == "-":
                #        graph_seq += read_alignment[i]
                #    else:
                #        graph_seq += graph_alignment[i]

                # get the minimum count for this new sequence
                mincount = ht.get_min_count(graph_seq)
                if mincount < args.normalize_to:
                    keep = True

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread:      "
                    "{6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n"
                    "".format(score, graph_alignment, read_alignment,
                              truncated, keep, seq, record.sequence,
                              record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        if total:
            print('DONE with', input_filename,
                  '; kept', total - discarded, 'of', total, 'or',
                  int(100. - discarded / float(total) * 100.), '%')
        print('output in', output_name)

    if args.savehash:
        print('Saving hashfile through', input_filename)
        print('...saving to', args.savehash)
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht, args.force, max_false_pos=.2)
    print('fp rate estimated to be %1.3f' % fp_rate)
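
The keep/discard decision in the loop above reduces to a small pure
function; this refactoring sketch is for clarity only and is not part of
the original script:

def should_keep(truncated, mincount, normalize_to):
    # Truncated alignments are always kept; otherwise keep the read only
    # while the corrected sequence's minimum k-mer count is below the
    # -Z/--normalize-to cutoff.
    return truncated or mincount < normalize_to

# e.g. keep = should_keep(truncated, ht.get_min_count(graph_seq),
#                         args.normalize_to)
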
Example No. 59
def get_parser():
    epilog = """
    The output is one file for each input file, <input file>.abundtrim, placed
    in the current directory.  This output contains the input sequences
    trimmed at low-abundance k-mers.

    The ``-V/--variable-coverage`` parameter will, if specified,
    prevent elimination of low-abundance reads by only trimming
    low-abundance k-mers from high-abundance reads; use this for
    non-genomic data sets that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use ``load-into-counting.py`` and ``filter-abund.py``.  However, read
    pairs will be kept together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """

    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff',
                        '-C',
                        type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('-o',
                        '--out',
                        metavar="filename",
                        type=argparse.FileType('w'),
                        default=None,
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')

    parser.add_argument('--variable-coverage',
                        '-V',
                        action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadhash_args(parser)
    parser.add_argument('-s',
                        '--savetable',
                        metavar="filename",
                        default='',
                        help='save the k-mer counting table to disk after '
                        'all reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')

    return parser
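
The epilog above states that each input file yields ``<input file>.abundtrim``
in the current directory; a small illustrative helper (hypothetical, not
taken from the script) for that naming convention:

import os

def abundtrim_name(input_filename):
    # Mirror the documented convention: <basename>.abundtrim in the
    # current working directory, whatever directory the input lives in.
    return os.path.basename(input_filename) + '.abundtrim'

# abundtrim_name('data/100k-filtered.fa') -> '100k-filtered.fa.abundtrim'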