def __init__(self):
    """Build the registry of supported hash and checksum functions.

    Each entry maps an algorithm name to a one-argument callable that
    takes raw input and returns its digest as a string (hex digest for
    cryptographic hashes, decimal string for the murmur/CRC families).
    """
    self._hashes = {
        # Legacy digests routed through project helpers.
        'md2': lambda data: self._get_md2_hash(data),
        'md4': lambda data: self._get_hashlib_hash('md4', data),
        'sha': lambda data: self._get_hashlib_hash('sha', data),
        'ripemd160': lambda data: self._get_hashlib_hash('ripemd160', data),
        'whirlpool': lambda data: self._get_hashlib_hash('whirlpool', data),
        # SHA-1/SHA-2 family via hashlib.
        'md5': lambda data: hashlib.md5(data).hexdigest(),
        'sha1': lambda data: hashlib.sha1(data).hexdigest(),
        'sha224': lambda data: hashlib.sha224(data).hexdigest(),
        'sha256': lambda data: hashlib.sha256(data).hexdigest(),
        'sha384': lambda data: hashlib.sha384(data).hexdigest(),
        'sha512': lambda data: hashlib.sha512(data).hexdigest(),
        # SHA-3 family via the third-party sha3 module.
        'sha3_224': lambda data: sha3.sha3_224(data).hexdigest(),
        'sha3_256': lambda data: sha3.sha3_256(data).hexdigest(),
        'sha3_384': lambda data: sha3.sha3_384(data).hexdigest(),
        'sha3_512': lambda data: sha3.sha3_512(data).hexdigest(),
        # MurmurHash 2 and 3 variants, rendered as decimal strings.
        'mmh2': lambda data: str(mmhash.get_hash(data)),
        'mmh2_unsigned': lambda data: str(mmhash.get_unsigned_hash(data)),
        'mmh3_32': lambda data: str(mmh3.hash(data)),
        'mmh3_64_1': lambda data: str(mmh3.hash64(data)[0]),
        'mmh3_64_2': lambda data: str(mmh3.hash64(data)[1]),
        'mmh3_128': lambda data: str(mmh3.hash128(data)),
        # BLAKE2 via pyblake2.
        'blake2b': lambda data: pyblake2.blake2b(data).hexdigest(),
        'blake2s': lambda data: pyblake2.blake2s(data).hexdigest(),
        # Non-cryptographic checksums from zlib, as decimal strings.
        'crc32': lambda data: str(zlib.crc32(data)),
        'adler32': lambda data: str(zlib.adler32(data)),
    }
    self.hashes_and_checksums = self._hashes.keys()
    # HASHES is a module-level constant defined elsewhere in the project.
    self.supported_hashes = HASHES
def term_to_list_varbyte(q, dict, index_file):
    """Resolve every term leaf of the query tree *q* into a posting list.

    For a term node, the term text is hashed, looked up in *dict* to get
    an (offset, length) slot in *index_file*, and the varbyte-compressed
    blob at that slot is decoded in place of the node's value (an empty
    list when the term is absent).  Operator nodes recurse into children.
    NOTE: the parameter named ``dict`` shadows the builtin; kept for
    interface compatibility with existing callers.
    """
    if not q.is_term:
        # Internal (operator) node: resolve each existing child subtree.
        for child in (q.left, q.right):
            if child is not None:
                term_to_list_varbyte(child, dict, index_file)
        return
    term_hash = mmhash.get_unsigned_hash(q.value.lower())
    place = dict.find_term(term_hash)
    if place[0] < 0:
        # Negative offset is the "term not found" sentinel.
        postings = []
    else:
        index_file.seek(place[0])
        compressed = index_file.read(place[1])
        postings = bitstream.decompress_varbyte(compressed)
    q.value = postings
def murmur(x):
    """Return the unsigned MurmurHash2 of *x*, widened to a NumPy uint64."""
    digest = mmhash.get_unsigned_hash(x)
    return np.uint64(digest)
import docreader
from docreader import DocumentStreamReader
import index_creation
import bitstream
import cPickle
import mmhash
import dict_hash

if __name__ == '__main__':
    # Scan every input document and accumulate term -> postings in memory.
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    index = index_creation.Url_Index()
    for doc in reader:
        index.scan_text(doc)

    # Serialize each posting list with varbyte compression, recording the
    # (offset, length) slot of every term keyed by its unsigned murmur hash.
    blob = []
    term = dict()
    for k, v in index.terms.iteritems():
        prev_len = len(blob)
        compr = bitstream.compress_varbyte(v)
        blob.extend(compr)
        term[mmhash.get_unsigned_hash(k.encode('utf8'))] = [prev_len, len(compr)]

    # Fix: the original opened these files without ever closing them, so
    # buffered bytes (especially the pickle stream) could be lost on an
    # early exit.  Context managers guarantee flush + close.
    with open("index.txt", "wb") as index_file:
        index_file.write(bytearray(blob))
    with open("url_file.txt", "wb") as url_file:
        cPickle.dump(index.url, url_file)
    dict_hash.store_dict(term)
# Stream up to `limits` rows of the (sorted-by-name) genre listing and emit
# one SQL insert per distinct (name, year) pair, OR-ing the genre bits of
# consecutive rows for the same title into a single bitmask.
# Relies on names defined elsewhere in this file: split(), prime, sqlwrite
# (per-shard writers), and template (SQL format string) — TODO confirm.
with open('genres.list', 'r') as fin:
    lastname = ""      # title currently being accumulated ("" = none yet)
    lastyear = ""
    lasttableid = 0    # shard index = murmur(name) % prime
    mask = 0           # OR of (1 << genre) for the current title
    row = 1
    limits = 10000     # hard cap on rows processed in this pass
    while row <= limits:
        line = fin.readline()
        if line:
            print '%d processing..' % row
            row += 1
            name, year, genre = split(line)
            tableid = mmhash.get_unsigned_hash(name) % prime
            if lastname == "":
                # First record: start an accumulation run with an empty mask
                # (the genre bit is added by the branch below).
                lastname, lasttableid, lastyear, mask = name, tableid, year, 0
            if name == lastname:
                # Same title as the previous row: fold in its genre bit.
                mask |= (1 << genre)
            else:
                # Title changed: flush the finished run, then start a new one
                # seeded with this row's genre bit.
                sqlwrite[lasttableid].write(template % (lasttableid, lastname, lastyear, mask));
                lastname, lasttableid, lastyear, mask = name, tableid, year, (1 << genre)
        else:
            # EOF: flush the final accumulated run before leaving the loop.
            sqlwrite[lasttableid].write(template % (lasttableid, lastname, lastyear, mask));
            break
# Loop body continues beyond this chunk of the file.
for i in xrange(prime):