Example #1
0
    def __init__(self):
        """Register every supported hash/checksum under its algorithm name.

        Each registered value is a one-argument callable taking the data to
        hash.  The hashlib/sha3/pyblake2 entries return ``hexdigest()``
        output; the mmhash/mmh3/zlib entries return ``str()`` of the numeric
        result.  Algorithms without a direct constructor here are routed
        through the ``_get_md2_hash`` / ``_get_hashlib_hash`` helpers.
        """
        # Ordered (name, callable) pairs; they are inserted one by one below
        # so the table is filled in exactly this sequence.
        registry = (
            ('md2', lambda data: self._get_md2_hash(data)),
            ('md4', lambda data: self._get_hashlib_hash('md4', data)),
            ('md5', lambda data: hashlib.md5(data).hexdigest()),
            ('sha', lambda data: self._get_hashlib_hash('sha', data)),
            ('sha1', lambda data: hashlib.sha1(data).hexdigest()),
            ('sha256', lambda data: hashlib.sha256(data).hexdigest()),
            ('sha224', lambda data: hashlib.sha224(data).hexdigest()),
            ('sha384', lambda data: hashlib.sha384(data).hexdigest()),
            ('sha512', lambda data: hashlib.sha512(data).hexdigest()),
            ('sha3_224', lambda data: sha3.sha3_224(data).hexdigest()),
            ('sha3_256', lambda data: sha3.sha3_256(data).hexdigest()),
            ('sha3_384', lambda data: sha3.sha3_384(data).hexdigest()),
            ('sha3_512', lambda data: sha3.sha3_512(data).hexdigest()),
            ('mmh2', lambda data: str(mmhash.get_hash(data))),
            ('mmh2_unsigned',
             lambda data: str(mmhash.get_unsigned_hash(data))),
            ('mmh3_32', lambda data: str(mmh3.hash(data))),
            ('mmh3_64_1', lambda data: str(mmh3.hash64(data)[0])),
            ('mmh3_64_2', lambda data: str(mmh3.hash64(data)[1])),
            ('mmh3_128', lambda data: str(mmh3.hash128(data))),
            ('ripemd160',
             lambda data: self._get_hashlib_hash('ripemd160', data)),
            ('whirlpool',
             lambda data: self._get_hashlib_hash('whirlpool', data)),
            ('blake2b', lambda data: pyblake2.blake2b(data).hexdigest()),
            ('blake2s', lambda data: pyblake2.blake2s(data).hexdigest()),
            ('crc32', lambda data: str(zlib.crc32(data))),
            ('adler32', lambda data: str(zlib.adler32(data))),
        )

        table = {}
        for algorithm, digest in registry:
            table[algorithm] = digest

        self._hashes = table
        # Names of everything registered above.
        self.hashes_and_checksums = self._hashes.keys()
        # HASHES is a module-level name defined outside this method.
        self.supported_hashes = HASHES
Example #2
0
def term_to_list_varbyte(q, dict, index_file):
    """Resolve every term node of a query tree to its decoded list, in place.

    For a term node, ``q.value`` is lower-cased, hashed with
    ``mmhash.get_unsigned_hash`` and looked up via ``dict.find_term``.  The
    lookup result is indexed as ``[0]`` (position passed to
    ``index_file.seek``) and ``[1]`` (size passed to ``index_file.read``);
    a negative ``[0]`` is treated as "no entry" and yields an empty list.
    The bytes read are decoded with ``bitstream.decompress_varbyte`` and the
    result replaces ``q.value``.

    For any other node the function recurses into ``q.left`` and then
    ``q.right`` when they are not ``None``.  Nothing is returned.
    """
    if not q.is_term:
        # Operator node: descend into whichever children are present.
        if q.left is not None:
            term_to_list_varbyte(q.left, dict, index_file)
        if q.right is not None:
            term_to_list_varbyte(q.right, dict, index_file)
        return

    term_hash = mmhash.get_unsigned_hash(q.value.lower())
    location = dict.find_term(term_hash)

    if location[0] < 0:
        # Negative position: the term has no entry.
        q.value = []
        return

    index_file.seek(location[0])
    packed = index_file.read(location[1])
    q.value = bitstream.decompress_varbyte(packed)
Example #3
0
def murmur(x):
    """Return ``mmhash.get_unsigned_hash(x)`` wrapped as a ``np.uint64``."""
    unsigned_hash = mmhash.get_unsigned_hash(x)
    return np.uint64(unsigned_hash)
Example #4
0
import docreader
from docreader import DocumentStreamReader
import index_creation
import bitstream
import cPickle
import mmhash
import dict_hash

if __name__ == '__main__':
    # Scan every document named on the command line into the index.
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    index = index_creation.Url_Index()
    for doc in reader:
        index.scan_text(doc)

    # Concatenate the varbyte-compressed entry of every term into one blob
    # and remember, per hashed term, where its entry starts in the blob and
    # how long it is.
    blob = []
    term = dict()
    for k, v in index.terms.iteritems():
        prev_len = len(blob)
        compr = bitstream.compress_varbyte(v)
        blob.extend(compr)
        term[mmhash.get_unsigned_hash(
            k.encode('utf8'))] = [prev_len, len(compr)]

    # Use context managers so both output files are flushed and closed
    # deterministically instead of being left open until interpreter exit.
    with open("index.txt", "wb") as index_file:
        index_file.write(bytearray(blob))

    with open("url_file.txt", "wb") as url_file:
        cPickle.dump(index.url, url_file)

    dict_hash.store_dict(term)
Example #5
0
# Read up to `limits` lines of genres.list, merge consecutive lines that share
# a name into one genre bitmask, and write one formatted row per name to the
# sqlwrite entry selected by the name's hash modulo `prime`.
with open('genres.list', 'r') as fin:
	lastname = ""
	lastyear = ""
	lasttableid = 0
	mask = 0
	row = 1
	limits = 10000
	while row <= limits:
		line = fin.readline()
		if not line:
			# readline() returns an empty string only at end of file.
			break

		# Parenthesised form prints the same text under Python 2.
		print('%d processing..' % row)
		row += 1

		name, year, genre = split(line)
		tableid = mmhash.get_unsigned_hash(name) % prime

		if lastname == "":
			# Nothing accumulated yet: start the first group with an
			# empty mask; the genre bit is added just below.
			lastname, lasttableid, lastyear, mask = name, tableid, year, 0

		if name == lastname:
			# Same name as the previous line: add this genre's bit.
			mask |= (1 << genre)
		else:
			# Name changed: emit the finished group, then start a new one.
			sqlwrite[lasttableid].write(template % (lasttableid, lastname, lastyear, mask))
			lastname, lasttableid, lastyear, mask = name, tableid, year, (1 << genre)

	# Flush the group still being accumulated.  This runs whether the loop
	# stopped at end of file or at the `limits` cap, so the last group is not
	# lost when the cap is hit, and it is skipped entirely when no line was
	# read so an empty input does not produce a row with an empty name.
	if row > 1:
		sqlwrite[lasttableid].write(template % (lasttableid, lastname, lastyear, mask))

for i in xrange(prime):