Example 1
import khmer
import khmer_tst_utils as utils  # test helper module from khmer's test suite (assumed import)


def do_test(ctfile):
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    # a single prime table size just above 2**31
    sizes = khmer.get_n_primes_above_x(1, 2**31)

    # build a counting hash with k=12, count k-mers from the test file,
    # and write the table to disk
    orig = khmer.CountingHash(12, sizes)
    orig.consume_fasta(inpath)
    orig.save(savepath)

    # reload the saved table
    loaded = khmer.load_counting_hash(savepath)

    # the occupied-bin count must survive the save/load round trip
    orig_count = orig.n_occupied()
    loaded_count = loaded.n_occupied()
    assert orig_count == 3966, orig_count
    assert loaded_count == orig_count, loaded_count
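
For context, a minimal standalone sketch of the same save/load round trip outside the test harness; the file names and k value here are placeholders, not from the original test:

import khmer

K = 12  # placeholder k-mer size
sizes = khmer.get_n_primes_above_x(1, 2**31)  # one prime table size

counts = khmer.CountingHash(K, sizes)
counts.consume_fasta('reads.fa')  # hypothetical input FASTA
counts.save('reads.ct')           # hypothetical output path

# reloading should reproduce the table's occupancy exactly
reloaded = khmer.load_counting_hash('reads.ct')
assert reloaded.n_occupied() == counts.n_occupied()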
Example 2
#! /usr/bin/env python
# build a k-mer abundance histogram from one or more FASTA files
# (Python 2, early khmer API)
import sys

import khmer

K = 10

###

output = sys.argv[1]
fa_files = sys.argv[2:]

# round the target table size up to the next prime
HT_SIZE = 4**6
HT_SIZE = khmer.get_n_primes_above_x(1, HT_SIZE)[0]
print HT_SIZE

ht = khmer.new_hashtable(K, HT_SIZE)

# count k-mers across all of the input files
for filename in fa_files:
    ht.consume_fasta(filename)

print 'preparing hist...'
z = ht.abundance_distribution()

# write "abundance count" pairs, skipping abundance zero
fp = open(output, 'w')
for n, i in enumerate(z[1:]):
    print >> fp, n + 1, i
fp.close()
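
The script would be run as, for example, python script.py out.hist reads.fa (file names hypothetical). As a quick follow-up, a sketch that reads the histogram back and reports the abundance with the largest k-mer count:

# read the "abundance count" pairs written above
# ('out.hist' is a hypothetical output name)
pairs = []
for line in open('out.hist'):
    abundance, count = line.split()
    pairs.append((int(abundance), int(count)))

# report the abundance with the largest k-mer count
mode = max(pairs, key=lambda p: p[1])[0]
print 'most common abundance:', mode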
Example 3
import gc
import glob
import os
import random
import sys
import threading
import Queue

import khmer

# NOTE: K, N_HT, HASHTABLE_SIZE, SUBSET_SIZE, N_THREADS, the flags load_ht,
# save_ht, stop_after_n_subsets, save_merged_pmap, remove_orig_pmap, and the
# worker() function are defined elsewhere in the full script this excerpt
# comes from.


def main(filename):
    global ht

    n = 5

    basename = os.path.basename(filename)

    fd = open("log.txt", "w")

    # collect 2*n batches of N_HT primes (one batch near HASHTABLE_SIZE,
    # one above it), then shuffle so each run draws a different set of sizes
    below = khmer.get_n_primes_near_x(N_HT * n, HASHTABLE_SIZE)
    above = khmer.get_n_primes_above_x(N_HT * n, HASHTABLE_SIZE)

    primes = below + above
    random.shuffle(primes)

    for run in range(n):

        print primes[run * N_HT:run * N_HT + N_HT]

        ht = khmer._new_hashbits(K, primes[run * N_HT:run * N_HT + N_HT])
        #ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

        # populate the hash table and tag set
        if not load_ht:
            ht.consume_fasta_and_tag(filename)

            # save to a file (optional)
            if save_ht:
                ht.save(basename + '.ht')
                ht.save_tagset(basename + '.tagset')

            # calculate the hashtable occupancy
            print '---'
            print 'hashtable occupancy:', ht.n_occupied() / float(
                HASHTABLE_SIZE)
            print '---'
        else:
            ht.load(basename + '.ht')
            ht.load_tagset(basename + '.tagset')

        # did we just want to load the ht/tagset?
        if stop_after_n_subsets == 0:
            sys.exit(0)

        #stop_tags = pickle.load(open(sys.argv[2]))

        #for stop_tag in stop_tags:
        #    ht.add_stop_tag(stop_tag)

        # divide the tags up into subsets; the trailing 0 serves as the stop
        # coordinate for the final subset (meaning "through the end")
        divvy = ht.divide_tags_into_subsets(SUBSET_SIZE)
        n_subsets = len(divvy)
        divvy.append(0)

        # build a queue of tasks:
        worker_q = Queue.Queue()

        for i in range(0, n_subsets):
            if stop_after_n_subsets is not None and i >= stop_after_n_subsets:
                break

            start = divvy[i]
            end = divvy[i + 1]
            worker_q.put((ht, i, start, end))

        info_fp = open('%s.info' % basename, 'w')
        info_fp.write('%d subsets total\n' % n_subsets)
        info_fp.close()

        threads = []
        for th in range(N_THREADS):
            t = threading.Thread(target=worker, args=(worker_q, basename))
            threads.append(t)
            t.start()

        # wait for threads
        for t in threads:
            t.join()

        ###

        del ht
        gc.collect()

        # create a new, empty ht object for merging; K matters, but not
        # hashtable size.
        ht = khmer.new_hashbits(K, 1, 1)

        # load & merge all pmap files
        for i in range(0, n_subsets):
            pmap_file = basename + '.subset.%d.pmap' % (i, )
            ht.merge_subset_from_disk(pmap_file)

        # save merged partitionmap
        if save_merged_pmap:
            ht.save_partitionmap(basename + '.pmap.merged')

        if remove_orig_pmap:
            for i in range(0, n_subsets):
                pmap_file = basename + '.subset.%d.pmap' % (i, )
                os.unlink(pmap_file)

        # output partitions, then count them; count_partitions() supersedes
        # the value returned by output_partitions()
        ht.output_partitions(filename, basename + '.part')
        (n_partitions, n_singletons) = ht.count_partitions()
        print n_partitions

        fd.write(str(n_partitions) + "\n")
        #print os.listdir(os.getcwd())

        # clean up this run's intermediate files
        # (loop variable renamed so it no longer shadows the builtin 'file')
        for path in glob.glob(os.getcwd() + "/*pmap*"):
            os.remove(path)
        for path in glob.glob(os.getcwd() + "/*.info"):
            os.remove(path)
        for path in glob.glob(os.getcwd() + "/*.part"):
            os.remove(path)

    fd.close()
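
Example 3 calls a worker() function that is not shown. Below is a plausible sketch inferred from the (ht, i, start, end) queue items and from the basename + '.subset.%d.pmap' files the merge loop later reads; do_subset_partition() and save_subset_partitionmap() are the matching calls in the same era of the khmer hashbits API, but treat the whole body as an assumption rather than the original code:

def worker(q, basename):
    # drain (ht, index, start, end) tasks until the queue is empty
    while True:
        try:
            (ht, index, start, end) = q.get(False)
        except Queue.Empty:
            return

        outfile = basename + '.subset.%d.pmap' % (index,)
        print 'starting:', basename, index

        # partition only this slice of the tag set, then save the
        # resulting partial partitionmap for the later merge step
        subset = ht.do_subset_partition(start, end)
        ht.save_subset_partitionmap(subset, outfile)

        del subset
        gc.collect()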