Example #1
0
def create_kmer_hash(f,
                     s,
                     w=1,
                     wheel_path='/mnt/Wheels.txt',
                     block_size=10000,
                     out_path='/mnt/Kmer_Hash.txt',
                     reverse_compliments=True):
    """Build a bit array flagging every bin index produced for the reads in ``f``.

    ``f`` is rewound and consumed in blocks of up to ``block_size`` reads via
    ``read_generator``.  Each block is binned with ``generator_to_bins`` and
    every returned bin index sets the matching bit of a ``2**s``-bit array.
    The finished array is written to ``out_path`` and returned.

    Args:
        f: Seekable file object to read from.
        s: Spoke limit handed to ``get_wheels``; the array holds ``2**s`` bits.
        w: Wheel limit handed to ``get_wheels``.
        wheel_path: Path of the wheels file loaded by ``get_wheels``.
        block_size: ``max_reads`` passed to ``read_generator`` for each block.
        out_path: Path of the binary file the bit array is written to.
        reverse_compliments: Forwarded to ``generator_to_bins`` as ``rc``.

    Returns:
        The populated ``bitarray``.
    """
    W = get_wheels(wheel_path, spoke_limit=s, wheel_limit=w)
    k = len(W[0]['p'])
    f.seek(0)
    H = bitarray(2**s)
    # bitarray() is not guaranteed to start out zeroed, so clear it explicitly.
    H.setall(False)
    last = None
    # Keep going until an iteration no longer advances the file position.
    while last != f.tell():
        last = f.tell()
        try:
            A, B = generator_to_bins(read_generator(f,
                                                    max_reads=block_size,
                                                    kmer_size=k),
                                     W,
                                     rc=reverse_compliments)
            entries = len(A)
            for b in range(len(B)):
                bins = B[b]
                for a in range(entries):
                    H[bins[a]] = True
        except Exception as err:
            # Still best-effort per block, but report the failure instead of
            # hiding it, and let KeyboardInterrupt/SystemExit propagate.
            print(str(err))
        # Progress indicator: current offset in the input file.
        print(f.tell())
    with open(out_path, 'wb') as fo:
        H.tofile(fo)
    return H
Example #2
0
def write_hashed_reads(read_file,
                       out_file,
                       s,
                       w=1,
                       wheel_path='/mnt/Wheels.txt',
                       block_size=10000):
    """Write one line per read listing the first wheel's bins for that read.

    ``read_file`` is rewound and consumed in blocks of up to ``block_size``
    reads.  Consecutive entries of ``A`` (the ids returned by
    ``generator_to_bins``) that compare equal are grouped, and for each group
    a line ``<id>k, bins: [k, bin, bin, ...]`` is written to ``out_file``.
    Only ``B[0]`` (the first wheel) is written, and entries are assumed to
    arrive grouped by read.

    Args:
        read_file: Seekable file object to read from.
        out_file: Writable file object receiving the output lines.
        s: Spoke limit handed to ``get_wheels``.
        w: Wheel limit handed to ``get_wheels``.
        wheel_path: Path of the wheels file loaded by ``get_wheels``.
        block_size: ``max_reads`` passed to ``read_generator`` for each block.
    """
    W = get_wheels(wheel_path, spoke_limit=s, wheel_limit=w)
    k = len(W[0]['p'])
    hash_prefix = 'k, bins: '

    def emit(read_id, bins):
        # The id is concatenated directly with the prefix, so it must be a str.
        out_file.write(read_id + hash_prefix + str([k] + bins) + '\n')

    read_file.seek(0)
    last = None
    # Keep going until an iteration no longer advances the file position.
    while last != read_file.tell():
        last = read_file.tell()
        try:
            A, B = generator_to_bins(
                read_generator(read_file,
                               max_reads=block_size,
                               verbose_ids=True,
                               kmer_size=k), W)
            # Writing just one wheel here, assuming entries are sorted by read.
            B0 = []
            last_a = None
            for a in range(len(A)):
                if A[a] != last_a:
                    if B0:
                        emit(last_a, B0)
                        B0 = []
                    last_a = A[a]
                B0.append(B[0][a])
            # Flush the final group: without this the last read of every
            # block was collected but never written.
            if B0:
                emit(last_a, B0)
        except Exception as err:
            print('%s %s' % (Exception, str(err)))
        # Progress indicator: current offset in the input file.
        print(read_file.tell())
Example #3
0
def do_wheels(s, w=1):
    """Generate wheels for the ``test_genome`` realm and load them back.

    The dimension passed to ``set_wheels`` is the length of the ``'s'`` field
    of one document from ``conn['test_genome'].kmers``.  The wheels are then
    read from ``/mnt/test_Wheels.txt`` (presumably the file ``set_wheels``
    produces from the ``/mnt/test_`` prefix -- confirm against ``set_wheels``).

    Args:
        s: Number of spokes, passed to ``set_wheels`` as ``spokes``.
        w: Number of wheels, passed to ``set_wheels`` as ``wheels``.

    Returns:
        Whatever ``get_wheels`` returns for the generated file.
    """
    realm = 'test_genome'
    sample = conn[realm].kmers.find_one()
    set_wheels(len(sample['s']),
               realm=realm,
               spokes=s,
               wheels=w,
               out_path='/mnt/test_')
    return get_wheels('/mnt/test_Wheels.txt')
def create_kmer_hash_counts_fasta(f,
                                  s,
                                  w=1,
                                  wheel_path='/mnt/Wheels.txt',
                                  block_size=1,
                                  out_path='/mnt/Kmer_Hash_Counts.txt'):
    """Count, per bin index, how often it is produced for the reads in ``f``.

    ``f`` is rewound and consumed in blocks of up to ``block_size`` reads via
    ``read_generator``.  Each block is binned with ``generator_to_bins`` and
    every returned bin index increments its slot in a table of ``2**s``
    unsigned bytes, saturating at 255.  The table is written to ``out_path``
    and returned.

    Previously the table was computed and then dropped: ``out_path`` was never
    used and nothing was returned.

    Args:
        f: Seekable file object to read from.
        s: Spoke limit handed to ``get_wheels``; the table has ``2**s`` slots.
        w: Wheel limit handed to ``get_wheels``.
        wheel_path: Path of the wheels file loaded by ``get_wheels``.
        block_size: ``max_reads`` passed to ``read_generator`` for each block.
        out_path: Path of the binary file the count table is written to.

    Returns:
        The ``c_uint8`` array of saturating counts.
    """
    W = get_wheels(wheel_path, spoke_limit=s, wheel_limit=w)
    k = len(W[0]['p'])
    H = (c_uint8 * 2**s)()
    last = None
    f.seek(0)
    # Keep going until an iteration no longer advances the file position.
    while last != f.tell():
        last = f.tell()
        try:
            A, B = generator_to_bins(
                read_generator(f, max_reads=block_size, kmer_size=k), W)
            entries = len(A)
            for b in range(len(B)):
                bins = B[b]
                for a in range(entries):
                    # Clamp at 255, the largest value a c_uint8 can hold.
                    H[bins[a]] = min(255, H[bins[a]] + 1)
        except Exception as err:
            # Best-effort per block: report the failure and carry on.
            print(str(err))
    with open(out_path, 'wb') as fo:
        fo.write(H)
    return H
Example #5
0
def _merge_hash_counts(H, block):
    """Run ``hash_count_part`` over ``block`` in a worker pool and add the
    resulting counts into ``H``, saturating each slot at 255."""
    pool = Pool()
    try:
        # Hand the work out in roughly eight chunks; ``//`` keeps the chunk
        # size an integer.
        results = pool.map(hash_count_part, block, max(1, len(block) // 8))
    finally:
        # Always release the worker processes, even when a worker raised.
        pool.close()
        pool.join()
    for result in results:
        for k, v in result.items():
            # Clamp at 255, the largest value a c_uint8 can hold.
            H[k] = min(255, H[k] + v)


def create_kmer_hash_counts(f,
                            s,
                            w=1,
                            wheel_path='/mnt/Wheels.txt',
                            out_path='/mnt/Kmer_Hash_Counts.txt',
                            temp_file_size=5 * 10**5):
    """Count bin hits for the contents of ``f`` using a multiprocessing pool.

    ``f`` is rewound and split into chunks of lines.  Each chunk is paired
    with the k-mer size and the wheels and queued; whenever more than 500
    chunks are queued (and once more at the end) they are processed by
    ``hash_count_part`` in a ``Pool`` and the returned ``{index: count}``
    mappings are added into a table of ``2**s`` unsigned bytes, saturating at
    255.  The table is written to ``out_path`` and returned.

    Args:
        f: Seekable file object to read from.
        s: Spoke limit handed to ``get_wheels``; the table has ``2**s`` slots.
        w: Wheel limit handed to ``get_wheels``.
        wheel_path: Path of the wheels file loaded by ``get_wheels``.
        out_path: Path of the binary file the count table is written to.
        temp_file_size: Size hint passed to ``f.readlines`` for each chunk.

    Returns:
        The ``c_uint8`` array of saturating counts.
    """
    W = get_wheels(wheel_path, spoke_limit=s, wheel_limit=w)
    kmer_size = len(W[0]['p'])
    H = (c_uint8 * 2**s)()
    block = []
    last = None
    f.seek(0)
    # Keep going until an iteration no longer advances the file position.
    while last != f.tell():
        last = f.tell()
        chunk = []
        chunk += f.readlines(temp_file_size)
        # presumably extends the chunk to the next record boundary so that no
        # read is split across chunks -- confirm against read_until_new
        chunk += read_until_new(f)
        block.append((chunk, kmer_size, W))
        if len(block) > 500:
            _merge_hash_counts(H, block)
            block = []
    if block:
        _merge_hash_counts(H, block)
    with open(out_path, 'wb') as fo:
        fo.write(H)
    return H
Example #6
0
def create_kmer_hash_counts_fasta(f,
                                  s,
                                  w=1,
                                  wheel_path='/mnt/Wheels.txt',
                                  block_size=1,
                                  out_path='/mnt/Kmer_Hash_Counts.txt'):
    """Count, per bin index, how often it is produced for the reads in ``f``.

    ``f`` is rewound and consumed in blocks of up to ``block_size`` reads via
    ``read_generator``.  Each block is binned with ``generator_to_bins`` and
    every returned bin index increments its slot in a table of ``2**s``
    unsigned bytes, saturating at 255.  The table is written to ``out_path``
    and returned.

    Previously the table was computed and then dropped: ``out_path`` was never
    used and nothing was returned.

    Args:
        f: Seekable file object to read from.
        s: Spoke limit handed to ``get_wheels``; the table has ``2**s`` slots.
        w: Wheel limit handed to ``get_wheels``.
        wheel_path: Path of the wheels file loaded by ``get_wheels``.
        block_size: ``max_reads`` passed to ``read_generator`` for each block.
        out_path: Path of the binary file the count table is written to.

    Returns:
        The ``c_uint8`` array of saturating counts.
    """
    W = get_wheels(wheel_path, spoke_limit=s, wheel_limit=w)
    k = len(W[0]['p'])
    H = (c_uint8 * 2**s)()
    last = None
    f.seek(0)
    # Keep going until an iteration no longer advances the file position.
    while last != f.tell():
        last = f.tell()
        try:
            A, B = generator_to_bins(
                read_generator(f, max_reads=block_size, kmer_size=k), W)
            entries = len(A)
            for b in range(len(B)):
                bins = B[b]
                for a in range(entries):
                    # Clamp at 255, the largest value a c_uint8 can hold.
                    H[bins[a]] = min(255, H[bins[a]] + 1)
        except Exception as err:
            # Best-effort per block: report the failure and carry on.
            print(str(err))
    with open(out_path, 'wb') as fo:
        fo.write(H)
    return H
def create_kmer_hash(f,
                     s,
                     w=1,
                     wheel_path='/mnt/Wheels.txt',
                     block_size=10000,
                     out_path='/mnt/Kmer_Hash.txt',
                     reverse_compliments=True):
    """Build a bit array flagging every bin index produced for the reads in ``f``.

    ``f`` is rewound and consumed in blocks of up to ``block_size`` reads via
    ``read_generator``.  Each block is binned with ``generator_to_bins`` and
    every returned bin index sets the matching bit of a ``2**s``-bit array.
    The finished array is written to ``out_path`` and returned.

    Args:
        f: Seekable file object to read from.
        s: Spoke limit handed to ``get_wheels``; the array holds ``2**s`` bits.
        w: Wheel limit handed to ``get_wheels``.
        wheel_path: Path of the wheels file loaded by ``get_wheels``.
        block_size: ``max_reads`` passed to ``read_generator`` for each block.
        out_path: Path of the binary file the bit array is written to.
        reverse_compliments: Forwarded to ``generator_to_bins`` as ``rc``.

    Returns:
        The populated ``bitarray``.
    """
    W = get_wheels(wheel_path, spoke_limit=s, wheel_limit=w)
    k = len(W[0]['p'])
    f.seek(0)
    H = bitarray(2**s)
    # bitarray() is not guaranteed to start out zeroed, so clear it explicitly.
    H.setall(False)
    last = None
    # Keep going until an iteration no longer advances the file position.
    while last != f.tell():
        last = f.tell()
        try:
            A, B = generator_to_bins(read_generator(f,
                                                    max_reads=block_size,
                                                    kmer_size=k),
                                     W,
                                     rc=reverse_compliments)
            entries = len(A)
            for b in range(len(B)):
                bins = B[b]
                for a in range(entries):
                    H[bins[a]] = True
        except Exception as err:
            # Still best-effort per block, but report the failure instead of
            # hiding it, and let KeyboardInterrupt/SystemExit propagate.
            print(str(err))
        # Progress indicator: current offset in the input file.
        print(f.tell())
    with open(out_path, 'wb') as fo:
        H.tofile(fo)
    return H
def write_hashed_reads(read_file,
                       out_file,
                       s,
                       w=1,
                       wheel_path='/mnt/Wheels.txt',
                       block_size=10000):
    """Write one line per read listing the first wheel's bins for that read.

    ``read_file`` is rewound and consumed in blocks of up to ``block_size``
    reads.  Consecutive entries of ``A`` (the ids returned by
    ``generator_to_bins``) that compare equal are grouped, and for each group
    a line ``<id>k, bins: [k, bin, bin, ...]`` is written to ``out_file``.
    Only ``B[0]`` (the first wheel) is written, and entries are assumed to
    arrive grouped by read.

    Args:
        read_file: Seekable file object to read from.
        out_file: Writable file object receiving the output lines.
        s: Spoke limit handed to ``get_wheels``.
        w: Wheel limit handed to ``get_wheels``.
        wheel_path: Path of the wheels file loaded by ``get_wheels``.
        block_size: ``max_reads`` passed to ``read_generator`` for each block.
    """
    W = get_wheels(wheel_path, spoke_limit=s, wheel_limit=w)
    k = len(W[0]['p'])
    hash_prefix = 'k, bins: '

    def emit(read_id, bins):
        # The id is concatenated directly with the prefix, so it must be a str.
        out_file.write(read_id + hash_prefix + str([k] + bins) + '\n')

    read_file.seek(0)
    last = None
    # Keep going until an iteration no longer advances the file position.
    while last != read_file.tell():
        last = read_file.tell()
        try:
            A, B = generator_to_bins(
                read_generator(read_file,
                               max_reads=block_size,
                               verbose_ids=True,
                               kmer_size=k), W)
            # Writing just one wheel here, assuming entries are sorted by read.
            B0 = []
            last_a = None
            for a in range(len(A)):
                if A[a] != last_a:
                    if B0:
                        emit(last_a, B0)
                        B0 = []
                    last_a = A[a]
                B0.append(B[0][a])
            # Flush the final group: without this the last read of every
            # block was collected but never written.
            if B0:
                emit(last_a, B0)
        except Exception as err:
            print('%s %s' % (Exception, str(err)))
        # Progress indicator: current offset in the input file.
        print(read_file.tell())
def _merge_hash_counts(H, block):
    """Run ``hash_count_part`` over ``block`` in a worker pool and add the
    resulting counts into ``H``, saturating each slot at 255."""
    pool = Pool()
    try:
        # Hand the work out in roughly eight chunks; ``//`` keeps the chunk
        # size an integer.
        results = pool.map(hash_count_part, block, max(1, len(block) // 8))
    finally:
        # Always release the worker processes, even when a worker raised.
        pool.close()
        pool.join()
    for result in results:
        for k, v in result.items():
            # Clamp at 255, the largest value a c_uint8 can hold.
            H[k] = min(255, H[k] + v)


def create_kmer_hash_counts(f,
                            s,
                            w=1,
                            wheel_path='/mnt/Wheels.txt',
                            out_path='/mnt/Kmer_Hash_Counts.txt',
                            temp_file_size=5 * 10**5):
    """Count bin hits for the contents of ``f`` using a multiprocessing pool.

    ``f`` is rewound and split into chunks of lines.  Each chunk is paired
    with the k-mer size and the wheels and queued; whenever more than 500
    chunks are queued (and once more at the end) they are processed by
    ``hash_count_part`` in a ``Pool`` and the returned ``{index: count}``
    mappings are added into a table of ``2**s`` unsigned bytes, saturating at
    255.  The table is written to ``out_path`` and returned.

    Args:
        f: Seekable file object to read from.
        s: Spoke limit handed to ``get_wheels``; the table has ``2**s`` slots.
        w: Wheel limit handed to ``get_wheels``.
        wheel_path: Path of the wheels file loaded by ``get_wheels``.
        out_path: Path of the binary file the count table is written to.
        temp_file_size: Size hint passed to ``f.readlines`` for each chunk.

    Returns:
        The ``c_uint8`` array of saturating counts.
    """
    W = get_wheels(wheel_path, spoke_limit=s, wheel_limit=w)
    kmer_size = len(W[0]['p'])
    H = (c_uint8 * 2**s)()
    block = []
    last = None
    f.seek(0)
    # Keep going until an iteration no longer advances the file position.
    while last != f.tell():
        last = f.tell()
        chunk = []
        chunk += f.readlines(temp_file_size)
        # presumably extends the chunk to the next record boundary so that no
        # read is split across chunks -- confirm against read_until_new
        chunk += read_until_new(f)
        block.append((chunk, kmer_size, W))
        if len(block) > 500:
            _merge_hash_counts(H, block)
            block = []
    if block:
        _merge_hash_counts(H, block)
    with open(out_path, 'wb') as fo:
        fo.write(H)
    return H
Example #10
0
def do_wheels(s, w=1):
    """Generate wheels for the ``test_genome`` realm and load them back.

    The dimension passed to ``set_wheels`` is the length of the ``'s'`` field
    of one document from ``conn['test_genome'].kmers``.  The wheels are then
    read from ``/mnt/test_Wheels.txt`` (presumably the file ``set_wheels``
    produces from the ``/mnt/test_`` prefix -- confirm against ``set_wheels``).

    Args:
        s: Number of spokes, passed to ``set_wheels`` as ``spokes``.
        w: Number of wheels, passed to ``set_wheels`` as ``wheels``.

    Returns:
        Whatever ``get_wheels`` returns for the generated file.
    """
    realm = 'test_genome'
    sample = conn[realm].kmers.find_one()
    set_wheels(len(sample['s']),
               realm=realm,
               spokes=s,
               wheels=w,
               out_path='/mnt/test_')
    return get_wheels('/mnt/test_Wheels.txt')