import khmer
import khmer_tst_utils as utils   # khmer's test-support module (assumed name)


def do_test(ctfile):
    # Build a counting hash from a small test data set, save it to disk,
    # reload it, and check that the occupied-bin count survives the
    # save/load round trip.
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    # one prime table size just above 2**31
    sizes = khmer.get_n_primes_above_x(1, 2**31)

    orig = khmer.CountingHash(12, sizes)
    orig.consume_fasta(inpath)
    orig.save(savepath)

    loaded = khmer.load_counting_hash(savepath)

    orig_count = orig.n_occupied()
    loaded_count = loaded.n_occupied()
    assert orig_count == 3966, orig_count
    assert loaded_count == orig_count, loaded_count
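# Both this test and the scripts below size their tables with
# khmer.get_n_primes_above_x(n, x). As a point of reference, here is a
# minimal pure-Python sketch of what such a helper computes: the first
# n primes strictly greater than x. This is an illustration only, not
# khmer's actual implementation.
def _sketch_get_n_primes_above_x(n, x):
    def _is_prime(m):
        if m < 2:
            return False
        i = 2
        while i * i <= m:
            if m % i == 0:
                return False
            i += 1
        return True

    primes = []
    candidate = x + 1
    while len(primes) < n:
        if _is_prime(candidate):
            primes.append(candidate)
        candidate += 1
    return primes

# e.g. _sketch_get_n_primes_above_x(1, 2**31) yields a one-element list
# holding the first prime above 2**31, a suitable counting-table size.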
#! /usr/bin/env python
import sys
import khmer

K = 10

###

output = sys.argv[1]
fa_files = sys.argv[2:]

# pick a prime hash table size at or above 4**6 entries
HT_SIZE = int(4**6)
HT_SIZE = khmer.get_n_primes_above_x(1, HT_SIZE)[0]
print HT_SIZE

ht = khmer.new_hashtable(K, HT_SIZE)

# count k-mers across all of the input FASTA files
for filename in fa_files:
    ht.consume_fasta(filename)

print 'preparing hist...'
z = ht.abundance_distribution()

# write the histogram, skipping the zero-abundance bin:
# column 1 is the abundance, column 2 is the number of k-mers
# with that abundance.
fp = open(output, 'w')
for n, i in enumerate(z[1:]):
    print >>fp, n + 1, i
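# For intuition, the abundance distribution that khmer computes in C++
# above can be sketched in pure Python with collections.Counter. The
# function below is illustrative only; it returns the equivalent of
# z[1:] from the script above (bin i-1 holds the number of distinct
# k-mers seen exactly i times) and ignores khmer's fixed-size table
# and reverse-complement handling.
import collections

def _sketch_abundance_distribution(sequences, k):
    counts = collections.Counter()
    for seq in sequences:
        for pos in range(len(seq) - k + 1):
            counts[seq[pos:pos + k]] += 1
    hist = collections.Counter(counts.values())
    if not hist:
        return []
    return [hist.get(i, 0) for i in range(1, max(hist) + 1)]

# e.g. _sketch_abundance_distribution(['ATATAT'], 2) -> [0, 1, 1]:
# no 2-mer occurs once, TA occurs twice, AT occurs three times.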
# main() expects the usual module-level configuration from the
# enclosing script: K, N_HT, HASHTABLE_SIZE, SUBSET_SIZE, N_THREADS,
# load_ht, save_ht, stop_after_n_subsets, save_merged_pmap,
# remove_orig_pmap, and a worker() function.
import gc
import glob
import os
import random
import sys
import threading
import Queue

import khmer


def main(filename):
    global ht

    n = 5
    basename = os.path.basename(filename)
    fd = open("log.txt", "w")

    # collect candidate prime table sizes on both sides of
    # HASHTABLE_SIZE and shuffle, so each run draws a different set
    below = khmer.get_n_primes_near_x(N_HT * n, HASHTABLE_SIZE)
    above = khmer.get_n_primes_above_x(N_HT * n, HASHTABLE_SIZE)
    primes = below + above
    random.shuffle(primes)

    for run in range(n):
        print primes[run * N_HT:run * N_HT + N_HT]
        ht = khmer._new_hashbits(K, primes[run * N_HT:run * N_HT + N_HT])
        #ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

        # populate the hash table and tag set
        if not load_ht:
            ht.consume_fasta_and_tag(filename)

            # save to a file (optional)
            if save_ht:
                ht.save(basename + '.ht')
                ht.save_tagset(basename + '.tagset')

            # calculate the hashtable occupancy
            print '---'
            print 'hashtable occupancy:', ht.n_occupied() / float(HASHTABLE_SIZE)
            print '---'
        else:
            ht.load(basename + '.ht')
            ht.load_tagset(basename + '.tagset')

            # did we just want to load the ht/tagset?
            if stop_after_n_subsets == 0:
                sys.exit(0)

        #stop_tags = pickle.load(open(sys.argv[2]))
        #for stop_tag in stop_tags:
        #    ht.add_stop_tag(stop_tag)

        # divide the tags up into subsets
        divvy = ht.divide_tags_into_subsets(SUBSET_SIZE)
        n_subsets = len(divvy)
        divvy.append(0)

        # build a queue of tasks:
        worker_q = Queue.Queue()

        for i in range(0, n_subsets):
            if stop_after_n_subsets is not None and i >= stop_after_n_subsets:
                break

            start = divvy[i]
            end = divvy[i + 1]
            worker_q.put((ht, i, start, end))

        open('%s.info' % basename, 'w').write('%d subsets total\n' % n_subsets)

        threads = []
        for th in range(N_THREADS):
            t = threading.Thread(target=worker, args=(worker_q, basename))
            threads.append(t)
            t.start()

        # wait for threads
        for t in threads:
            t.join()

        ###

        del ht
        gc.collect()

        # create a new, empty ht object for merging; K matters, but not
        # hashtable size.
        ht = khmer.new_hashbits(K, 1, 1)

        # load & merge all pmap files
        for i in range(0, n_subsets):
            pmap_file = basename + '.subset.%d.pmap' % (i,)
            ht.merge_subset_from_disk(pmap_file)

        # save merged partitionmap
        if save_merged_pmap:
            ht.save_partitionmap(basename + '.pmap.merged')

        if remove_orig_pmap:
            for i in range(0, n_subsets):
                pmap_file = basename + '.subset.%d.pmap' % (i,)
                os.unlink(pmap_file)

        # output partitions!
        n_partitions = ht.output_partitions(filename, basename + '.part')
        (n_partitions, n_singletons) = ht.count_partitions()
        print n_partitions
        fd.write(str(n_partitions) + "\n")

        # clean up per-run output so the next run starts fresh
        #print os.listdir(os.getcwd())
        for path in glob.glob(os.getcwd() + "/*pmap*"):
            os.remove(path)
        for path in glob.glob(os.getcwd() + "/*.info"):
            os.remove(path)
        for path in glob.glob(os.getcwd() + "/*.part"):
            os.remove(path)

    fd.close()
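# main() dispatches (ht, index, start, end) tuples to worker threads,
# but worker() itself is defined elsewhere in the script. Below is a
# sketch of what it plausibly looks like, assuming the
# do_subset_partition() / save_subset_partitionmap() calls used by
# khmer's old partitioning scripts; treat the exact API as an
# assumption, not a confirmed interface.
def worker(q, basename):
    while True:
        # drain the shared queue; exit once it is empty
        try:
            (ht, n, start, end) = q.get(False)
        except Queue.Empty:
            return

        outfile = basename + '.subset.%d.pmap' % (n,)
        if os.path.exists(outfile):
            continue  # leftover from an earlier run; skip it

        # partition only the tags in [start, end) and save the
        # resulting partial partitionmap to disk
        subset = ht.do_subset_partition(start, end)
        ht.save_subset_partitionmap(subset, outfile)

        del subset
        gc.collect()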