def get_parser(): """ returns the parser object for the oxli subcommand handler """ parser = argparse.ArgumentParser( description="Single entry point script for khmer", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) subparsers = parser.add_subparsers() # build-graph (formerly load-graph) parsers here parser_build_graph = subparsers.add_parser( "build-graph", help="Load sequences into the compressible graph" "format plus optional tagset", description="Load sequences into the " "compressible graph format plus optional tagset", ) khmer_args.build_hashbits_args( "Load sequences into the compressible" "graph format plus optional tagset.", None, parser=parser_build_graph ) build_graph.build_parser(parser_build_graph) parser_build_graph.set_defaults(func=build_graph.main) return parser
def get_parser(): parser = build_hashbits_args(descr="Load sequences into the compressible " "graph format plus optional tagset.") add_threading_args(parser) parser.add_argument('--no-build-tagset', '-n', default=False, action='store_true', dest='no_build_tagset', help='Do NOT construct tagset while loading sequences') parser.add_argument('output_filename', metavar='output_presence_table_filename', help='output' ' k-mer presence table filename.') parser.add_argument('input_filenames', metavar='input_sequence_filename', nargs='+', help='input FAST[AQ] sequence filename') parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers to stderr") parser.add_argument('--write-fp-rate', '-w', action='store_true', help="Write false positive rate into .info file") return parser
def get_parser(): epilog = """ An additional report will be written to ${output_report_filename}.curve containing the increase of overlap k-mers as the number of sequences in the second database increases. """ parser = build_hashbits_args( descr='Count the overlap k-mers which are the k-mers appearing in two ' 'sequence datasets.', epilog=textwrap.dedent(epilog)) parser.add_argument('ptfile', metavar='input_presence_table_filename', help="input k-mer presence table filename") parser.add_argument('fafile', metavar='input_sequence_filename', help="input sequence filename") parser.add_argument('report_filename', metavar='output_report_filename', help='output report filename') parser.add_argument('--csv', default=False, action='store_true', help='Use the CSV format for the curve output. ' 'Includes column headers.') parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser
def get_parser(): epilog = """ Load in a set of sequences, partition them, merge the partitions, and annotate the original sequences files with the partition information. This script combines the functionality of :program:`load-graph.py`, :program:`partition-graph.py`, :program:`merge-partitions.py`, and :program:`annotate-partitions.py` into one script. This is convenient but should probably not be used for large data sets, because :program:`do-partition.py` doesn't provide save/resume functionality. """ parser = build_hashbits_args( descr='Load, partition, and annotate FAST[AQ] sequences', epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE, dest='subset_size', type=float, help='Set subset size (usually 1e5-1e6 is good)') parser.add_argument('--no-big-traverse', dest='no_big_traverse', action='store_true', default=False, help='Truncate graph joins at big traversals') parser.add_argument('--keep-subsets', dest='remove_subsets', default=True, action='store_false', help='Keep individual subsets (default: False)') parser.add_argument('graphbase', help="base name for output files") parser.add_argument('input_filenames', metavar='input_sequence_filename', nargs='+', help='input FAST[AQ] sequence filenames') parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser
def get_parser(): epilog = """ Load in a set of sequences, partition them, merge the partitions, and annotate the original sequences files with the partition information. This script combines the functionality of :program:`load-graph.py`, :program:`partition-graph.py`, :program:`merge-partitions.py`, and :program:`annotate-partitions.py` into one script. This is convenient but should probably not be used for large data sets, because :program:`do-partition.py` doesn't provide save/resume functionality. """ parser = build_hashbits_args( descr='Load, partition, and annotate FAST[AQ] sequences', epilog=textwrap.dedent(epilog)) parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE, dest='subset_size', type=float, help='Set subset size (usually 1e5-1e6 is good)') parser.add_argument('--no-big-traverse', dest='no_big_traverse', action='store_true', default=False, help='Truncate graph joins at big traversals') parser.add_argument('--threads', '-T', dest='n_threads', default=DEFAULT_N_THREADS, help='Number of simultaneous threads to execute') parser.add_argument('--keep-subsets', dest='remove_subsets', default=True, action='store_false', help='Keep individual subsets (default: False)') parser.add_argument('graphbase', help="base name for output files") parser.add_argument('input_filenames', metavar='input_sequence_filename', nargs='+', help='input FAST[AQ] sequence filenames') return parser
def get_parser(): parser = build_hashbits_args( "Takes a partitioned reference file \ and a list of reads, and sorts reads \ by which partition they connect to" ) parser.epilog = EPILOG parser.add_argument( "-r", "--traversal_range", type=int, dest="traversal_range", default=DEFAULT_RANGE, help="depth of breadth-first search to perform\ from each read", ) parser.add_argument("--max_queue_size", type=int, default=1000) parser.add_argument("--prefix", dest="output_prefix", default=DEFAULT_OUT_PREF, help="Prefix for sorted read files") parser.add_argument( "--outdir", dest="outdir", default="", help="output directory; default is location of \ fastp file", ) parser.add_argument("--query", dest="query", nargs="+", help="Reads to be swept and sorted") parser.add_argument("--db", dest="db", nargs="+", help="Database reads for sweep", required=True) return parser
def get_parser():
    parser = build_hashbits_args('Takes a partitioned reference file '
                                 'and a list of reads, and sorts reads '
                                 'by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument('-r', '--traversal_range', type=int,
                        dest='traversal_range', default=DEFAULT_RANGE,
                        help='depth of breadth-first search to perform '
                             'from each read')
    parser.add_argument('--max_queue_size', type=int, default=1000)
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir', default='',
                        help='output directory; default is location of '
                             'fastp file')
    parser.add_argument('--query', dest='query', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('--db', dest='db', nargs='+',
                        help='Database reads for sweep', required=True)
    return parser
def get_parser(): parser = build_hashbits_args(descr="Load sequences into the compressible " "graph format plus optional tagset.") add_threading_args(parser) parser.add_argument('input_filenames', metavar='input_sequence_filename', nargs='+', help='input FAST[AQ] sequence filename') return parser
def get_parser(): parser = build_hashbits_args(descr="Load sequences into the compressible " "graph format plus optional tagset.") add_threading_args(parser) parser.add_argument('--no-build-tagset', '-n', default=False, action='store_true', dest='no_build_tagset', help='Do NOT construct tagset while loading sequences') parser.add_argument('output_filename', metavar='output_presence_table_filename', help='output' ' k-mer presence table filename.') parser.add_argument('input_filenames', metavar='input_sequence_filename', nargs='+', help='input FAST[AQ] sequence filename') return parser
def get_parser(): epilog = """ An additional report will be written to ${output_report_filename}.curve containing the increase of overlap k-mers as the number of sequences in the second database increases. """ parser = build_hashbits_args( descr='Count the overlap k-mers which are the k-mers appearing in two ' 'sequence datasets.', epilog=textwrap.dedent(epilog)) parser.add_argument('ptfile', metavar='input_presence_table_filename', help="input k-mer presence table filename") parser.add_argument('fafile', metavar='input_sequence_filename', help="input sequence filename") parser.add_argument('report_filename', metavar='output_report_filename', help='output report filename') return parser
def get_parser(): epilog = """ Load in a set of sequences, partition them, merge the partitions, and annotate the original sequences files with the partition information. This script combines the functionality of :program:`load-graph.py`, :program:`partition-graph.py`, :program:`merge-partitions.py`, and :program:`annotate-partitions.py` into one script. This is convenient but should probably not be used for large data sets, because :program:`do-partition.py` doesn't provide save/resume functionality. """ parser = build_hashbits_args( descr="Load, partition, and annotate FAST[AQ] sequences", epilog=textwrap.dedent(epilog) ) add_threading_args(parser) parser.add_argument( "--subset-size", "-s", default=DEFAULT_SUBSET_SIZE, dest="subset_size", type=float, help="Set subset size (usually 1e5-1e6 is good)", ) parser.add_argument( "--no-big-traverse", dest="no_big_traverse", action="store_true", default=False, help="Truncate graph joins at big traversals", ) parser.add_argument( "--keep-subsets", dest="remove_subsets", default=True, action="store_false", help="Keep individual subsets (default: False)", ) parser.add_argument("graphbase", help="base name for output files") parser.add_argument( "input_filenames", metavar="input_sequence_filename", nargs="+", help="input FAST[AQ] sequence filenames" ) parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists") return parser
def get_parser():
    parser = build_hashbits_args('Takes a partitioned reference file '
                                 'and a list of reads, and sorts reads '
                                 'by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument('-r', '--traversal_range', type=int,
                        dest='traversal_range', default=DEFAULT_RANGE,
                        help='depth of breadth-first search to perform '
                             'from each read')
    parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int,
                        default=DEFAULT_MAX_READS,
                        help='Max total reads to buffer before flushing')
    parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int,
                        default=DEFAULT_BUFFER_SIZE,
                        help='Max length of an individual label buffer '
                             'before flushing')
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir',
                        help='output directory; default is location of '
                             'fastp file')
    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int,
                        default=DEFAULT_NUM_BUFFERS,
                        help='Max individual label buffers before flushing')
    labeling = parser.add_mutually_exclusive_group(required=True)
    labeling.add_argument('--label-by-pid', dest='label_by_pid',
                          action='store_true',
                          help='separate reads by reference partition id')
    labeling.add_argument('--label-by-seq', dest='label_by_seq',
                          action='store_true',
                          help='separate reads by reference sequence')
    labeling.add_argument('--label-by-group', dest='group_size', type=int,
                          help='separate reads by arbitrary sized groups '
                               'of reference sequences')
    parser.add_argument(dest='input_fastp', help='Reference fasta or fastp')
    parser.add_argument('input_files', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
def get_parser():
    parser = build_hashbits_args('Takes a partitioned reference file '
                                 'and a list of reads, and sorts reads '
                                 'by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument('-r', '--traversal_range', type=int,
                        dest='traversal_range', default=DEFAULT_RANGE,
                        help='depth of breadth-first search to perform '
                             'from each read')
    parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int,
                        default=DEFAULT_MAX_READS,
                        help='Max total reads to buffer before flushing')
    parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int,
                        default=DEFAULT_BUFFER_SIZE,
                        help='Max length of an individual label buffer '
                             'before flushing')
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir',
                        help='output directory; default is location of '
                             'fastp file')
    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int,
                        default=DEFAULT_NUM_BUFFERS,
                        help='Max individual label buffers before flushing')
    labeling = parser.add_mutually_exclusive_group(required=True)
    labeling.add_argument('--label-by-pid', dest='label_by_pid',
                          action='store_true',
                          help='separate reads by reference partition id')
    labeling.add_argument('--label-by-seq', dest='label_by_seq',
                          action='store_true',
                          help='separate reads by reference sequence')
    labeling.add_argument('--label-by-group', dest='group_size', type=int,
                          help='separate reads by arbitrary sized groups '
                               'of reference sequences')
    parser.add_argument(dest='input_fastp', help='Reference fasta or fastp')
    parser.add_argument('input_files', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
def get_parser(): parser = build_hashbits_args(descr="Load sequences into the compressible " "graph format plus optional tagset.") add_threading_args(parser) parser.add_argument( "--no-build-tagset", "-n", default=False, action="store_true", dest="no_build_tagset", help="Do NOT construct tagset while loading sequences", ) parser.add_argument( "output_filename", metavar="output_presence_table_filename", help="output" " k-mer presence table filename." ) parser.add_argument( "input_filenames", metavar="input_sequence_filename", nargs="+", help="input FAST[AQ] sequence filename" ) parser.add_argument( "--report-total-kmers", "-t", action="store_true", help="Prints the total number of k-mers to stderr" ) parser.add_argument("--write-fp-rate", "-w", action="store_true", help="Write false positive rate into .info file") return parser
def get_parser():
    parser = build_hashbits_args('Takes a partitioned reference file '
                                 'and a list of reads, and sorts reads '
                                 'by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument('-r', '--traversal_range', type=int,
                        dest='traversal_range', default=DEFAULT_RANGE,
                        help='depth of breadth-first search to perform '
                             'from each read')
    parser.add_argument('--max_queue_size', type=int, default=1000)
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir', default='',
                        help='output directory; default is location of '
                             'fastp file')
    parser.add_argument('--query', dest='query', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('--db', dest='db', nargs='+',
                        help='Database reads for sweep', required=True)
    return parser
def get_parser(): parser = build_hashbits_args(descr="Load sequences into the compressible " "graph format plus optional tagset.") parser = build_graph.build_parser(parser) return parser
def get_parser(): parser = build_hashbits_args( "Takes a partitioned reference file \ and a list of reads, and sorts reads \ by which partition they connect to" ) parser.epilog = EPILOG parser.add_argument( "-r", "--traversal_range", type=int, dest="traversal_range", default=DEFAULT_RANGE, help="depth of breadth-first search to perform\ from each read", ) parser.add_argument( "-b", "--buffer_size", dest="max_reads", type=int, default=DEFAULT_MAX_READS, help="Max total reads to buffer before flushing", ) parser.add_argument( "-l", "--buffer_length", dest="buffer_size", type=int, default=DEFAULT_BUFFER_SIZE, help="Max length of an individual label buffer \ before flushing", ) parser.add_argument("--prefix", dest="output_prefix", default=DEFAULT_OUT_PREF, help="Prefix for sorted read files") parser.add_argument( "--outdir", dest="outdir", help="output directory; default is location of \ fastp file", ) parser.add_argument( "-m", "--max_buffers", dest="max_buffers", type=int, default=DEFAULT_NUM_BUFFERS, help="Max individual label buffers before flushing", ) labeling = parser.add_mutually_exclusive_group(required=True) labeling.add_argument( "--label-by-pid", dest="label_by_pid", action="store_true", help="separate reads by\ referece partition id", ) labeling.add_argument( "--label-by-seq", dest="label_by_seq", action="store_true", help="separate reads by\ reference sequence", ) labeling.add_argument( "--label-by-group", dest="group_size", type=int, help="separate reads by arbitrary sized groups\ of reference sequences", ) parser.add_argument(dest="input_fastp", help="Reference fasta or fastp") parser.add_argument("input_files", nargs="+", help="Reads to be swept and sorted") return parser
def main():
    parser = build_hashbits_args()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_tablesize == DEFAULT_MIN_TABLESIZE:
            print >>sys.stderr, "** WARNING: tablesize is default! " \
                "You absolutely want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_tables
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_tablesize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_tables x min_tablesize x n_inputs / 8)' % (
                args.n_tables * args.min_tablesize *
                len(args.input_filenames) / 8.)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    inputlist = args.input_filenames
    readsfile = args.read_filename

    query_list = []
    for n, inp_name in enumerate(inputlist):
        # create a hashbits data structure and output file for each input
        ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
        outfile = os.path.basename(inp_name) + '.sweep3'
        outfp = open(outfile, 'w')
        query_list.append((ht, outfp))

    for n, inp_name in enumerate(inputlist):
        ht = query_list[n][0]

        # load contigs, connect into N partitions
        print 'loading input reads from', inp_name
        ht.consume_fasta(inp_name)

    print 'starting sweep.'

    n = 0
    m = 0  # progress counter: records written, counting each output file separately
    for n, record in enumerate(screed.open(readsfile)):
        if len(record.sequence) < K:
            continue

        if n % 10000 == 0:
            print '...', n, m

        for ht, outfp in query_list:
            count = ht.get_median_count(record.sequence)[0]
            if count:
                outfp.write(output_single(record))
                m += 1