def __init__(self, capacity, error_rate=0.0001, fname=None,
             h1=pyhash.murmur3_x64_128(), h2=pyhash.spooky_128()):
    """Record the sizing inputs, allocate the bit store and bind the hashers.

    :param capacity: size of possible input elements
    :param error_rate: error rate forwarded to ``_adjust_param``
        (presumably the target false-positive probability -- confirm
        against ``_adjust_param``)
    :param fname: kept on ``self._fname``; not otherwise used in this method
    :param h1: first hasher, forwarded to ``hashes`` as ``h1``
    :param h2: second hasher, forwarded to ``hashes`` as ``h2``
    """
    # Stored before calling _adjust_param, exactly as the statement order
    # has always been, in case that helper reads them from the instance.
    self.capacity, self.error_rate = capacity, error_rate

    # calculate m & k (kept as num_of_bits / num_of_hashes)
    bit_count, hash_count = self._adjust_param(4096 * 8, error_rate)
    self.num_of_bits = bit_count
    self.num_of_hashes = hash_count

    self._fname = fname

    store = MmapBitSet(bit_count)
    self._data_store = store
    self._size = len(store)

    # Pre-bind both hashers and the hash count so later calls only need to
    # supply the remaining argument(s) of ``hashes``.
    self._hashes = functools.partial(
        hashes, h1=h1, h2=h2, number=hash_count)
def __init__(self, size=65536, k=7, name='bf', load=False):
    """Build a fresh filter, or restore one through ``self.load(name)``.

    :param size: length of the bit array and number of set slots in each
        table (unused when ``load`` is true)
    :param k: number of tables to allocate; must satisfy ``0 < k <= 18``
        (unused when ``load`` is true)
    :param name: stored as ``self.name``, or handed to ``self.load`` when
        ``load`` is true
    :param load: when true, call ``self.load(name)`` instead of allocating
        fresh state
    :raises ValueError: if ``load`` is false and ``k`` is not in ``1..18``
    """
    def digest_hasher(constructor):
        """Wrap a hashlib constructor as a ``text -> int`` callable."""
        def hasher(text):
            # Hash the UTF-8 bytes and read the hex digest as one integer.
            return int(constructor(text.encode('utf-8')).hexdigest(), 16)
        return hasher

    if load:
        self.load(name)
    else:
        # Raise instead of printing and returning: a bare return from
        # __init__ still hands the caller an instance, but one without
        # k, bitarray, tables or hashes.  The upper bound equals the
        # number of entries in self.hashes below.
        if k > 18 or k <= 0:
            raise ValueError('k should be > 0 & <= 18')
        self.size = size
        self.k = k
        self.name = name
        self.bitarray = bitarray.bitarray('0' * self.size)
        # k tables, each holding `size` initially empty sets.
        self.tables = [[set() for _ in range(self.size)]
                       for _ in range(self.k)]

    # NOTE(review): assigned on both the load and the fresh-construction
    # path -- confirm this matches the intended nesting.
    self.hashes = [
        pyhash.fnv1_64(),
        pyhash.murmur2_x64_64a(),
        pyhash.murmur3_x64_128(),
        pyhash.lookup3(),
        pyhash.super_fast_hash(),
        pyhash.city_128(),
        pyhash.spooky_128(),
        pyhash.farm_128(),
        pyhash.metro_128(),
        pyhash.mum_64(),
        pyhash.t1_64(),
        pyhash.xx_64(),
        digest_hasher(hashlib.md5),
        digest_hasher(hashlib.sha1),
        digest_hasher(hashlib.sha224),
        digest_hasher(hashlib.sha256),
        digest_hasher(hashlib.sha384),
        digest_hasher(hashlib.sha512),
    ]
#! /usr/bin/python2 # vim: set fileencoding=utf-8 from operator import itemgetter from datetime import datetime import CommonMongo as cm import csv import persistent import pyhash TO_BE_INSERTED = [] HASHER = pyhash.spooky_128() VENUE_LOC = persistent.load_var('venue_loc.my') TRAD = {} with open('trad.dat', 'r') as f: for line in f: old, new = line.strip().split(';') TRAD[old] = new def reformat(line_dict): vid = line_dict['vid'] if vid in TRAD: vid = TRAD[vid] if vid not in VENUE_LOC: return None if line_dict['_id'] == 'ICWSM': txt = ''.join(itemgetter('uid', 'vid', 'time')(line_dict)) line_dict['_id'] = hex(HASHER(txt))[2:-1] line_dict['uid'] = int(line_dict['uid']) line_dict['loc'], line_dict['city'] = VENUE_LOC[vid] line_dict['time'] = datetime.strptime(line_dict['time'], '%Y-%m-%dT%H:%M:%SZ')