Example #1
def get_parser():
    """
    Return the parser object for the oxli subcommand handler.
    """

    parser = argparse.ArgumentParser(
        description="Single entry point script for khmer", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    subparsers = parser.add_subparsers()

    # build-graph (formerly load-graph) parsers here
    parser_build_graph = subparsers.add_parser(
        "build-graph",
        help="Load sequences into the compressible graph" "format plus optional tagset",
        description="Load sequences into the " "compressible graph format plus optional tagset",
    )

    khmer_args.build_hashbits_args(
        "Load sequences into the compressible" "graph format plus optional tagset.", None, parser=parser_build_graph
    )
    build_graph.build_parser(parser_build_graph)
    parser_build_graph.set_defaults(func=build_graph.main)

    return parser
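
A quick aside on the pattern in Example #1: each subparser registers its handler via set_defaults(func=...), so the top-level entry point can dispatch on args.func. A minimal self-contained sketch (demo_handler and the argument names are illustrative stand-ins, not khmer's actual code):

import argparse

def demo_handler(args):
    # placeholder standing in for build_graph.main
    print("would build a graph from", args.input)

def main():
    parser = argparse.ArgumentParser(description="dispatch sketch")
    subparsers = parser.add_subparsers(dest="command")
    sub = subparsers.add_parser("build-graph")
    sub.add_argument("input")
    sub.set_defaults(func=demo_handler)  # same trick as Example #1

    args = parser.parse_args(["build-graph", "reads.fa"])
    args.func(args)  # dispatches to demo_handler

main()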
Example #2
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset',
                        '-n',
                        default=False,
                        action='store_true',
                        dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename',
                        metavar='output_presence_table_filename',
                        help='output'
                        ' k-mer presence table filename.')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        nargs='+',
                        help='input FAST[AQ] sequence filename')
    parser.add_argument('--report-total-kmers',
                        '-t',
                        action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('--write-fp-rate',
                        '-w',
                        action='store_true',
                        help="Write false positive rate into .info file")
    return parser
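
To see what the flags in Example #2 produce, here is a self-contained sketch of the same argparse pattern, with a plain ArgumentParser standing in for build_hashbits_args() (which would also add khmer's k-mer table options); the filenames are hypothetical:

import argparse

# plain ArgumentParser standing in for build_hashbits_args()
parser = argparse.ArgumentParser()
parser.add_argument('--no-build-tagset', '-n', default=False,
                    action='store_true', dest='no_build_tagset')
parser.add_argument('output_filename')
parser.add_argument('input_filenames', nargs='+')

# hypothetical filenames, for illustration only
args = parser.parse_args(['-n', 'out.pt', 'reads1.fa', 'reads2.fa'])
assert args.no_build_tagset is True
assert args.output_filename == 'out.pt'
assert args.input_filenames == ['reads1.fa', 'reads2.fa']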
Example #3
def get_parser():
    epilog = """
    An additional report will be written to ${output_report_filename}.curve
    containing the increase of overlap k-mers as the number of sequences in the
    second database increases.
    """
    parser = build_hashbits_args(
        descr='Count the overlap k-mers which are the k-mers appearing in two '
        'sequence datasets.',
        epilog=textwrap.dedent(epilog))
    parser.add_argument('ptfile',
                        metavar='input_presence_table_filename',
                        help="input k-mer presence table filename")
    parser.add_argument('fafile',
                        metavar='input_sequence_filename',
                        help="input sequence filename")
    parser.add_argument('report_filename',
                        metavar='output_report_filename',
                        help='output report filename')
    parser.add_argument('--csv',
                        default=False,
                        action='store_true',
                        help='Use the CSV format for the curve output. '
                        'Includes column headers.')
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    return parser
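
The epilog handling in Example #3 relies on textwrap.dedent to strip the indentation that the triple-quoted string picks up inside the function body; a minimal demonstration:

import textwrap

epilog = """
    An additional report will be written to ${output_report_filename}.curve
    containing the increase of overlap k-mers as the number of sequences in the
    second database increases.
    """
# dedent() strips the common leading whitespace contributed by the
# function body's indentation (whitespace-only lines are ignored when
# computing the margin), so the help text comes out flush left.
print(textwrap.dedent(epilog))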
Example #4
def get_parser():
    epilog = """
    Load in a set of sequences, partition them, merge the partitions, and
    annotate the original sequences files with the partition information.

    This script combines the functionality of :program:`load-graph.py`,
    :program:`partition-graph.py`, :program:`merge-partitions.py`, and
    :program:`annotate-partitions.py` into one script. This is convenient
    but should probably not be used for large data sets, because
    :program:`do-partition.py` doesn't provide save/resume functionality.
    """
    parser = build_hashbits_args(
        descr='Load, partition, and annotate FAST[AQ] sequences',
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (usually 1e5-1e6 is good)')
    parser.add_argument('--no-big-traverse', dest='no_big_traverse',
                        action='store_true', default=False,
                        help='Truncate graph joins at big traversals')
    parser.add_argument('--keep-subsets', dest='remove_subsets',
                        default=True, action='store_false',
                        help='Keep individual subsets (default: False)')
    parser.add_argument('graphbase', help="base name for output files")
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filenames')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
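
Note the inverted flag in Example #4: --keep-subsets stores False into args.remove_subsets, whose default is True, so downstream code can test the positive condition directly. A self-contained sketch:

import argparse

parser = argparse.ArgumentParser()
# --keep-subsets *clears* remove_subsets rather than setting a
# keep_subsets flag of its own
parser.add_argument('--keep-subsets', dest='remove_subsets',
                    default=True, action='store_false')

print(parser.parse_args([]).remove_subsets)                  # True
print(parser.parse_args(['--keep-subsets']).remove_subsets)  # False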
Example #5
def get_parser():
    epilog = """
    Load in a set of sequences, partition them, merge the partitions, and
    annotate the original sequences files with the partition information.

    This script combines the functionality of :program:`load-graph.py`,
    :program:`partition-graph.py`, :program:`merge-partitions.py`, and
    :program:`annotate-partitions.py` into one script. This is convenient
    but should probably not be used for large data sets, because
    :program:`do-partition.py` doesn't provide save/resume functionality.
    """
    parser = build_hashbits_args(
        descr='Load, partition, and annotate FAST[AQ] sequences',
        epilog=textwrap.dedent(epilog))
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (usually 1e5-1e6 is good)')
    parser.add_argument('--no-big-traverse', dest='no_big_traverse',
                        action='store_true', default=False,
                        help='Truncate graph joins at big traversals')
    parser.add_argument('--threads', '-T', dest='n_threads',
                        default=DEFAULT_N_THREADS,
                        help='Number of simultaneous threads to execute')
    parser.add_argument('--keep-subsets', dest='remove_subsets',
                        default=True, action='store_false',
                        help='Keep individual subsets (default: False)')
    parser.add_argument('graphbase', help="base name for output files")
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filenames')
    return parser
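
Unlike Example #4, Example #5 declares --threads by hand instead of calling add_threading_args, and it omits type=int, so a value given on the command line is parsed as a string (assuming DEFAULT_N_THREADS is an integer, the default and a user-supplied value end up with different types). A minimal demonstration of the pitfall:

import argparse

parser = argparse.ArgumentParser()
# no type= given, so command-line values stay strings even though
# the default is an int
parser.add_argument('--threads', '-T', dest='n_threads', default=1)

print(type(parser.parse_args([]).n_threads))           # <class 'int'>
print(type(parser.parse_args(['-T', '4']).n_threads))  # <class 'str'>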
Example #6
def get_parser():
    parser = build_hashbits_args(
        "Takes a partitioned reference file \
                                  and a list of reads, and sorts reads \
                                  by which partition they connect to"
    )
    parser.epilog = EPILOG
    parser.add_argument(
        "-r",
        "--traversal_range",
        type=int,
        dest="traversal_range",
        default=DEFAULT_RANGE,
        help="depth of breadth-first search to perform\
                                    from each read",
    )
    parser.add_argument("--max_queue_size", type=int, default=1000)
    parser.add_argument("--prefix", dest="output_prefix", default=DEFAULT_OUT_PREF, help="Prefix for sorted read files")
    parser.add_argument(
        "--outdir",
        dest="outdir",
        default="",
        help="output directory; default is location of \
                              fastp file",
    )
    parser.add_argument("--query", dest="query", nargs="+", help="Reads to be swept and sorted")
    parser.add_argument("--db", dest="db", nargs="+", help="Database reads for sweep", required=True)

    return parser
Example #7
def get_parser():
    parser = build_hashbits_args('Takes a partitioned reference file \
                                  and a list of reads, and sorts reads \
                                  by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument('-r',
                        '--traversal_range',
                        type=int,
                        dest='traversal_range',
                        default=DEFAULT_RANGE,
                        help='depth of breadth-first search to perform\
                                    from each read')
    parser.add_argument('--max_queue_size', type=int, default=1000)
    parser.add_argument('--prefix',
                        dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir',
                        dest='outdir',
                        default='',
                        help='output directory; default is location of \
                              fastp file')
    parser.add_argument('--query',
                        dest='query',
                        nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('--db',
                        dest='db',
                        nargs='+',
                        help='Database reads for sweep',
                        required=True)

    return parser
Example #8
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filename')
    return parser
Example #9
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        nargs='+',
                        help='input FAST[AQ] sequence filename')
    return parser
Example #10
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename',
                        metavar='output_presence_table_filename', help='output'
                        ' k-mer presence table filename.')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filename')
    return parser
Example #11
def get_parser():
    epilog = """
    An additional report will be written to ${output_report_filename}.curve
    containing the increase of overlap k-mers as the number of sequences in the
    second database increases.
    """
    parser = build_hashbits_args(
        descr='Count the overlap k-mers which are the k-mers appearing in two '
        'sequence datasets.', epilog=textwrap.dedent(epilog))
    parser.add_argument('ptfile', metavar='input_presence_table_filename',
                        help="input k-mer presence table filename")
    parser.add_argument('fafile', metavar='input_sequence_filename',
                        help="input sequence filename")
    parser.add_argument('report_filename', metavar='output_report_filename',
                        help='output report filename')

    return parser
Example #12
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename',
                        metavar='output_presence_table_filename', help='output'
                        ' k-mer presence table filename.')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filename')
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('--write-fp-rate', '-w', action='store_true',
                        help="Write false positive rate into .info file")
    return parser
Example #13
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset',
                        '-n',
                        default=False,
                        action='store_true',
                        dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename',
                        metavar='output_presence_table_filename',
                        help='output'
                        ' k-mer presence table filename.')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        nargs='+',
                        help='input FAST[AQ] sequence filename')
    return parser
Example #14
def get_parser():
    epilog = """
    Load in a set of sequences, partition them, merge the partitions, and
    annotate the original sequences files with the partition information.

    This script combines the functionality of :program:`load-graph.py`,
    :program:`partition-graph.py`, :program:`merge-partitions.py`, and
    :program:`annotate-partitions.py` into one script. This is convenient
    but should probably not be used for large data sets, because
    :program:`do-partition.py` doesn't provide save/resume functionality.
    """
    parser = build_hashbits_args(
        descr="Load, partition, and annotate FAST[AQ] sequences", epilog=textwrap.dedent(epilog)
    )
    add_threading_args(parser)
    parser.add_argument(
        "--subset-size",
        "-s",
        default=DEFAULT_SUBSET_SIZE,
        dest="subset_size",
        type=float,
        help="Set subset size (usually 1e5-1e6 is good)",
    )
    parser.add_argument(
        "--no-big-traverse",
        dest="no_big_traverse",
        action="store_true",
        default=False,
        help="Truncate graph joins at big traversals",
    )
    parser.add_argument(
        "--keep-subsets",
        dest="remove_subsets",
        default=True,
        action="store_false",
        help="Keep individual subsets (default: False)",
    )
    parser.add_argument("graphbase", help="base name for output files")
    parser.add_argument(
        "input_filenames", metavar="input_sequence_filename", nargs="+", help="input FAST[AQ] sequence filenames"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    return parser
Example #15
def get_parser():
    parser = build_hashbits_args('Takes a partitioned reference file \
                                  and a list of reads, and sorts reads \
                                  by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument(
        '-r', '--traversal_range', type=int, dest='traversal_range',
        default=DEFAULT_RANGE, help='depth of breadth-first search to perform\
                                    from each read')
    parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int,
                        default=DEFAULT_MAX_READS,
                        help='Max total reads to buffer before flushing')
    parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int,
                        default=DEFAULT_BUFFER_SIZE,
                        help='Max length of an individual label buffer \
                              before flushing')
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir',
                        help='output directory; default is location of \
                              fastp file')
    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int,
                        default=DEFAULT_NUM_BUFFERS,
                        help='Max individual label buffers before flushing')
    labeling = parser.add_mutually_exclusive_group(required=True)
    labeling.add_argument('--label-by-pid', dest='label_by_pid',
                          action='store_true', help='separate reads by\
                        reference partition id')
    labeling.add_argument('--label-by-seq', dest='label_by_seq',
                          action='store_true', help='separate reads by\
                        reference sequence')
    labeling.add_argument('--label-by-group', dest='group_size', type=int,
                          help='separate reads by arbitrary sized groups\
                        of reference sequences')
    parser.add_argument(dest='input_fastp', help='Reference fasta or fastp')
    parser.add_argument('input_files', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
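
The labeling options in Example #15 sit in add_mutually_exclusive_group(required=True), so exactly one of --label-by-pid, --label-by-seq, or --label-by-group must be given. A self-contained sketch of that behavior:

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--label-by-pid', action='store_true')
group.add_argument('--label-by-seq', action='store_true')

print(parser.parse_args(['--label-by-pid']).label_by_pid)  # True
# parser.parse_args([]) exits: one of the group's arguments is required
# parser.parse_args(['--label-by-pid', '--label-by-seq']) exits:
#   "--label-by-seq: not allowed with argument --label-by-pid"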
Example #16
def get_parser():
    parser = build_hashbits_args('Takes a partitioned reference file \
                                  and a list of reads, and sorts reads \
                                  by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument(
        '-r', '--traversal_range', type=int, dest='traversal_range',
        default=DEFAULT_RANGE, help='depth of breadth-first search to perform\
                                    from each read')
    parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int,
                        default=DEFAULT_MAX_READS,
                        help='Max total reads to buffer before flushing')
    parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int,
                        default=DEFAULT_BUFFER_SIZE,
                        help='Max length of an individual label buffer \
                              before flushing')
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir',
                        help='output directory; default is location of \
                              fastp file')
    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int,
                        default=DEFAULT_NUM_BUFFERS,
                        help='Max individual label buffers before flushing')
    labeling = parser.add_mutually_exclusive_group(required=True)
    labeling.add_argument('--label-by-pid', dest='label_by_pid',
                          action='store_true', help='separate reads by\
                        reference partition id')
    labeling.add_argument('--label-by-seq', dest='label_by_seq',
                          action='store_true', help='separate reads by\
                        reference sequence')
    labeling.add_argument('--label-by-group', dest='group_size', type=int,
                          help='separate reads by arbitrary sized groups\
                        of reference sequences')
    parser.add_argument(dest='input_fastp', help='Reference fasta or fastp')
    parser.add_argument('input_files', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #17
def get_parser():
    epilog = """
    An additional report will be written to ${output_report_filename}.curve
    containing the increase of overlap k-mers as the number of sequences in the
    second database increases.
    """
    parser = build_hashbits_args(
        descr='Count the overlap k-mers which are the k-mers appearing in two '
        'sequence datasets.', epilog=textwrap.dedent(epilog))
    parser.add_argument('ptfile', metavar='input_presence_table_filename',
                        help="input k-mer presence table filename")
    parser.add_argument('fafile', metavar='input_sequence_filename',
                        help="input sequence filename")
    parser.add_argument('report_filename', metavar='output_report_filename',
                        help='output report filename')
    parser.add_argument('--csv', default=False, action='store_true',
                        help='Use the CSV format for the curve output. '
                        'Includes column headers.')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #18
def get_parser():
    epilog = """
    An additional report will be written to ${output_report_filename}.curve
    containing the increase of overlap k-mers as the number of sequences in the
    second database increases.
    """
    parser = build_hashbits_args(
        descr='Count the overlap k-mers which are the k-mers appearing in two '
        'sequence datasets.',
        epilog=textwrap.dedent(epilog))
    parser.add_argument('ptfile',
                        metavar='input_presence_table_filename',
                        help="input k-mer presence table filename")
    parser.add_argument('fafile',
                        metavar='input_sequence_filename',
                        help="input sequence filename")
    parser.add_argument('report_filename',
                        metavar='output_report_filename',
                        help='output report filename')

    return parser
Example #19
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible " "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument(
        "--no-build-tagset",
        "-n",
        default=False,
        action="store_true",
        dest="no_build_tagset",
        help="Do NOT construct tagset while loading sequences",
    )
    parser.add_argument(
        "output_filename", metavar="output_presence_table_filename", help="output" " k-mer presence table filename."
    )
    parser.add_argument(
        "input_filenames", metavar="input_sequence_filename", nargs="+", help="input FAST[AQ] sequence filename"
    )
    parser.add_argument(
        "--report-total-kmers", "-t", action="store_true", help="Prints the total number of k-mers to stderr"
    )
    parser.add_argument("--write-fp-rate", "-w", action="store_true", help="Write false positive rate into .info file")
    return parser
Example #20
def get_parser():
    parser = build_hashbits_args('Takes a partitioned reference file \
                                  and a list of reads, and sorts reads \
                                  by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument(
        '-r', '--traversal_range', type=int, dest='traversal_range',
        default=DEFAULT_RANGE, help='depth of breadth-first search to perform\
                                    from each read')
    parser.add_argument('--max_queue_size', type=int, default=1000)
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir', default='',
                        help='output directory; default is location of \
                              fastp file')
    parser.add_argument('--query', dest='query', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('--db', dest='db', nargs='+',
                        help='Database reads for sweep', required=True)

    return parser
Example #21
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                       "graph format plus optional tagset.")

    parser = build_graph.build_parser(parser)
    return parser
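
Example #21 composes parsers: build_hashbits_args creates the parser with shared options, and build_graph.build_parser adds the script-specific arguments and returns the same object. A hedged, self-contained sketch of the pattern (both builder functions below are stand-ins, not khmer's implementations):

import argparse

def build_base_parser(descr):
    # stand-in for build_hashbits_args(): create the parser and add
    # the options shared by every script
    parser = argparse.ArgumentParser(description=descr)
    parser.add_argument('--ksize', '-k', type=int, default=32)
    return parser

def build_parser(parser):
    # stand-in for build_graph.build_parser(): add script-specific
    # arguments onto an existing parser and return it
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    return parser

parser = build_parser(build_base_parser("composition sketch"))
args = parser.parse_args(['-k', '20', 'out.pt', 'reads.fa'])
print(args.ksize, args.output_filename, args.input_filenames)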
Example #22
def get_parser():
    parser = build_hashbits_args(
        "Takes a partitioned reference file \
                                  and a list of reads, and sorts reads \
                                  by which partition they connect to"
    )
    parser.epilog = EPILOG
    parser.add_argument(
        "-r",
        "--traversal_range",
        type=int,
        dest="traversal_range",
        default=DEFAULT_RANGE,
        help="depth of breadth-first search to perform\
                                    from each read",
    )
    parser.add_argument(
        "-b",
        "--buffer_size",
        dest="max_reads",
        type=int,
        default=DEFAULT_MAX_READS,
        help="Max total reads to buffer before flushing",
    )
    parser.add_argument(
        "-l",
        "--buffer_length",
        dest="buffer_size",
        type=int,
        default=DEFAULT_BUFFER_SIZE,
        help="Max length of an individual label buffer \
                              before flushing",
    )
    parser.add_argument("--prefix", dest="output_prefix", default=DEFAULT_OUT_PREF, help="Prefix for sorted read files")
    parser.add_argument(
        "--outdir",
        dest="outdir",
        help="output directory; default is location of \
                              fastp file",
    )
    parser.add_argument(
        "-m",
        "--max_buffers",
        dest="max_buffers",
        type=int,
        default=DEFAULT_NUM_BUFFERS,
        help="Max individual label buffers before flushing",
    )
    labeling = parser.add_mutually_exclusive_group(required=True)
    labeling.add_argument(
        "--label-by-pid",
        dest="label_by_pid",
        action="store_true",
        help="separate reads by\
                        reference partition id",
    )
    labeling.add_argument(
        "--label-by-seq",
        dest="label_by_seq",
        action="store_true",
        help="separate reads by\
                        reference sequence",
    )
    labeling.add_argument(
        "--label-by-group",
        dest="group_size",
        type=int,
        help="separate reads by arbitrary sized groups\
                        of reference sequences",
    )
    parser.add_argument(dest="input_fastp", help="Reference fasta or fastp")
    parser.add_argument("input_files", nargs="+", help="Reads to be swept and sorted")

    return parser
Example #23
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")

    parser = build_graph.build_parser(parser)
    return parser
Example #24
def main():
    parser = build_hashbits_args()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_tablesize == DEFAULT_MIN_TABLESIZE:
            print >>sys.stderr, "** WARNING: tablesize is default!  " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_tables
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_tablesize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_tables x min_tablesize / 8)' % (
                args.n_tables * args.min_tablesize * len(args.input_filenames) / 8.)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    inputlist = args.input_filenames
    readsfile = args.read_filename

    query_list = []
    for n, inp_name in enumerate(inputlist):
        # create a hashbits data structure
        ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

        outfile = os.path.basename(inp_name) + '.sweep3'
        outfp = open(outfile, 'w')
        query_list.append((ht, outfp))

    for n, inp_name in enumerate(inputlist):
        ht = query_list[n][0]

        # load contigs, connect into N partitions
        print 'loading input reads from', inp_name
        ht.consume_fasta(inp_name)

    print 'starting sweep.'

    n = 0
    m = 0
    for n, record in enumerate(screed.open(readsfile)):
        if len(record.sequence) < K:
            continue

        if n % 10000 == 0:
            print '...', n, m

        for ht, outfp in query_list:
            count = ht.get_median_count(record.sequence)[0]
            if count:
                outfp.write(output_single(record))