def test_load_graph_1():
    in1 = utils.get_test_data('test-abund-read-2.fa')
    out1 = utils.get_temp_filename('out.ct')

    cmd = """
       cat {in1} | {scripts}/load-graph.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """
    cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
    print(cmd)

    run_shell_cmd(cmd)
    assert os.path.exists(out1)
    Nodegraph.load(out1)
def test_hashbits_file_version_check():
    inpath = utils.get_test_data('badversion-k12.htable')

    try:
        nodegraph = Nodegraph.load(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
def test_nodegraph_file_type_check():
    kh = khmer.Countgraph(12, 1, 1)
    savepath = utils.get_temp_filename('tempcountingsave0.ct')
    kh.save(savepath)

    try:
        nodegraph = Nodegraph.load(savepath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
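# The main() functions below reference EXCURSION_DISTANCE,
# EXCURSION_KMER_THRESHOLD, and EXCURSION_KMER_COUNT_THRESHOLD without
# defining them. A plausible set of module-level defaults is sketched here;
# the exact values are an assumption (they match the defaults used in
# khmer's partitioning scripts, but are not confirmed by this listing).
EXCURSION_DISTANCE = 40
EXCURSION_KMER_THRESHOLD = 200
EXCURSION_KMER_COUNT_THRESHOLD = 5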
def main():
    args = sanitize_help(get_parser()).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    print('loading nodegraph %s' % graphbase, file=sys.stderr)
    nodegraph = Nodegraph.load(graphbase)

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    nodegraph.load_tagset(graphbase + '.tagset')

    counting = khmer_args.create_countgraph(args)

    # divide up into SUBSET_SIZE fragments
    divvy = nodegraph.divide_tags_into_subsets(args.subset_size)
    divvy = list(divvy)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
    subset = nodegraph.do_subset_partition(start, end)

    # now, repartition...
    print('repartitioning to find HCKs.', file=sys.stderr)
    nodegraph.repartition_largest_partition(counting, EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD,
                                            subs=subset)

    print('saving stop tags', file=sys.stderr)
    nodegraph.save_stop_tags(graphbase + '.stoptags')
    print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
def test_n_occupied_save_load():
    filename = utils.get_test_data('random-20-a.fa')

    nodegraph = khmer.Nodegraph(20, 100000, 3)

    for record in screed.open(filename):
        nodegraph.consume(record.sequence)

    assert nodegraph.n_occupied() == 3884
    assert nodegraph.n_unique_kmers() == 3960

    savefile = utils.get_temp_filename('out')
    nodegraph.save(savefile)

    ng2 = Nodegraph.load(savefile)
    assert ng2.n_occupied() == 3884, ng2.n_occupied()
    assert ng2.n_unique_kmers() == 0  # this is intended behavior, sigh.
def main():
    args = sanitize_help(get_parser()).parse_args()
    basename = args.basename

    filenames = [basename, basename + '.tagset']
    for _ in filenames:
        check_input_files(_, args.force)

    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    if args.stoptags:
        print('stoptag file:', args.stoptags, file=sys.stderr)
    print('--', file=sys.stderr)

    print('loading nodegraph %s' % basename, file=sys.stderr)
    nodegraph = Nodegraph.load(basename)
    nodegraph.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps:',
              ' stop_big_traversals is true.', file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.', file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    with open('%s.info' % basename, 'w') as info_fp:
        info_fp.write('%d subsets total\n' % n_subsets)

    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print('starting %d threads' % n_threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker, args=(worker_q, basename,
                                                         stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' % (basename,),
          file=sys.stderr)
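# The thread setup above targets a `worker` function that is not shown in
# this listing. A minimal sketch follows, assuming the khmer Nodegraph API's
# do_subset_partition(start, end, break_on_stop_tags, stop_big_traversals)
# and save_subset_partitionmap(subset, filename) methods; treat the exact
# signatures and the skip-if-exists behavior as assumptions rather than a
# verified implementation.
def worker(worker_q, basename, stop_big_traversals):
    while True:
        # pull the next (graph, index, start, end) task; exit when the
        # queue is drained
        try:
            (nodegraph, index, start, end) = worker_q.get(False)
        except queue.Empty:
            print('exiting', file=sys.stderr)
            return

        outfile = basename + '.subset.%d.pmap' % (index,)
        if os.path.exists(outfile):
            print('SKIPPING', outfile, '-- already exists', file=sys.stderr)
            continue

        print('starting:', basename, index, file=sys.stderr)

        # partition the tags between start and end, honoring stop tags and
        # the command-line direction on exhaustive traversal
        subset = nodegraph.do_subset_partition(start, end, True,
                                               stop_big_traversals)

        print('saving:', basename, index, file=sys.stderr)
        nodegraph.save_subset_partitionmap(subset, outfile)
        del subset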
def main():
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        ":doc:`partitioning-big-data`",
        "http://khmer.readthedocs.io/en/stable/user/"
        "partitioning-big-data.html")
    args = sanitize_help(parser).parse_args()
    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
    graph = Nodegraph.load(graphbase)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    graph.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
        graph.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)
    print('---', file=sys.stderr)
    print('output stoptags will be in', graphbase + '.stoptags',
          file=sys.stderr)
    if initial_stoptags:
        print('(these output stoptags will include the already-loaded set)',
              file=sys.stderr)
    print('---', file=sys.stderr)

    # create countgraph
    ksize = graph.ksize()
    counting = khmer_args.create_countgraph(args, ksize=ksize)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print('<-', subset_file, file=sys.stderr)
        subset = SubsetPartition.load(subset_file, graph)

        print('** repartitioning subset... %s' % subset_file,
              file=sys.stderr)
        graph.repartition_largest_partition(counting, EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD,
                                            subs=subset)

        print('** merging subset... %s' % subset_file, file=sys.stderr)
        graph.merge_subset(subset)

        print('** repartitioning, round 2... %s' % subset_file,
              file=sys.stderr)
        size = graph.repartition_largest_partition(
            counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size:', size, file=sys.stderr)

        print('saving stoptags binary', file=sys.stderr)
        graph.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print('(%d of %d)\n' % (index + 1, len(pmap_files)), file=sys.stderr)

    print('done!', file=sys.stderr)