Esempio n. 1
0
    def __init__(self,
                 capacity,
                 error_rate=0.0001,
                 fname=None,
                 h1=pyhash.murmur3_x64_128(),
                 h2=pyhash.spooky_128()):
        """Record the sizing inputs, allocate the bit store and bind the hashes.

        :param capacity: size of possible input elements; stored unchanged on
            ``self.capacity``
        :param error_rate: stored on ``self.error_rate`` and forwarded to
            ``_adjust_param`` (presumably the target false-positive rate --
            TODO confirm)
        :param fname: stored on ``self._fname``; not used for anything else in
            this method
        :param h1: first hash callable bound into ``self._hashes``
        :param h2: second hash callable bound into ``self._hashes``
        """
        self.capacity, self.error_rate = capacity, error_rate

        # calculate m & k
        # NOTE(review): the first argument is the constant 4096 * 8, not
        # ``capacity`` -- confirm this is intentional.
        sizing = self._adjust_param(4096 * 8, error_rate)
        self.num_of_bits, self.num_of_hashes = sizing

        self._fname = fname

        bit_store = MmapBitSet(self.num_of_bits)
        self._data_store = bit_store
        self._size = len(bit_store)

        # Pre-bind both hash callables and the hash count onto ``hashes``.
        self._hashes = functools.partial(
            hashes, h1=h1, h2=h2, number=self.num_of_hashes)
Esempio n. 2
0
    def __init__(self, size=65536, k=7, name='bf', load=False):
        """Allocate a fresh filter, or restore one through ``self.load(name)``.

        :param size: number of bits in ``self.bitarray`` and number of sets in
            each entry of ``self.tables``.
        :param k: number of entries allocated in ``self.tables``; must satisfy
            ``0 < k <= 18`` (18 hash callables are registered in
            ``self.hashes``).
        :param name: stored on ``self.name`` for a fresh filter, or passed to
            ``self.load`` when ``load`` is true.
        :param load: when true, nothing is allocated here and
            ``self.load(name)`` is called instead.
        :raises ValueError: if a fresh filter is requested with ``k`` outside
            ``1..18``.
        """
        if load:
            self.load(name)
        else:
            # Fail fast: printing a message and returning from __init__ left
            # an instance without k, bitarray, tables or hashes, which only
            # failed later with an unrelated AttributeError.
            if k > 18 or k <= 0:
                raise ValueError('k should be > 0 & <= 18')
            self.size = size
            self.k = k
            self.name = name
            self.bitarray = bitarray.bitarray('0' * self.size)
            # One list of ``size`` empty sets per table, ``k`` tables in all.
            self.tables = [[set() for _slot in range(self.size)]
                           for _table in range(self.k)]

        def digest_as_int(algorithm):
            # Adapt a hashlib constructor into a text -> int callable that
            # hashes the UTF-8 bytes and reads the hex digest as an integer.
            return lambda text: int(
                algorithm(text.encode('utf-8')).hexdigest(), 16)

        # Registered for both the fresh and the loaded case.
        self.hashes = [
            pyhash.fnv1_64(),
            pyhash.murmur2_x64_64a(),
            pyhash.murmur3_x64_128(),
            pyhash.lookup3(),
            pyhash.super_fast_hash(),
            pyhash.city_128(),
            pyhash.spooky_128(),
            pyhash.farm_128(),
            pyhash.metro_128(),
            pyhash.mum_64(),
            pyhash.t1_64(),
            pyhash.xx_64(),
            digest_as_int(hashlib.md5),
            digest_as_int(hashlib.sha1),
            digest_as_int(hashlib.sha224),
            digest_as_int(hashlib.sha256),
            digest_as_int(hashlib.sha384),
            digest_as_int(hashlib.sha512),
        ]
Esempio n. 3
0
#! /usr/bin/python2
# vim: set fileencoding=utf-8
from operator import itemgetter
from datetime import datetime
import CommonMongo as cm
import csv
import persistent
import pyhash
# Module-level list; nothing in the visible code appends to it.
# NOTE(review): presumably collects records awaiting insertion -- confirm.
TO_BE_INSERTED = []
# Hash callable used by reformat() to build an '_id' for 'ICWSM' records.
HASHER = pyhash.spooky_128()
# Looked up by venue id in reformat(); each entry unpacks into (loc, city).
VENUE_LOC = persistent.load_var('venue_loc.my')
# Venue id translation table (old id -> new id), filled from trad.dat below.
TRAD = {}
with open('trad.dat', 'r') as f:
    for line in f:
        # Every line must hold exactly two ';'-separated fields, otherwise
        # the tuple unpacking raises ValueError.
        old, new = line.strip().split(';')
        TRAD[old] = new


def reformat(line_dict):
    """Convert the fields of one record dict in place.

    The venue id is translated through TRAD when it has an entry there. If
    the resulting id is not in VENUE_LOC the record is rejected. Otherwise
    the visible code:

    * replaces an '_id' equal to 'ICWSM' with a hex string derived from the
      hash of uid + vid + time,
    * converts 'uid' to int,
    * sets 'loc' and 'city' from the VENUE_LOC entry,
    * parses 'time' with the format '%Y-%m-%dT%H:%M:%SZ' into a datetime.

    :param line_dict: dict with at least the keys 'vid', '_id', 'uid' and
        'time'; 'uid', 'vid' and 'time' are joined as strings when '_id' is
        'ICWSM'.
    :return: None when the venue id is not in VENUE_LOC.
        NOTE(review): no value is returned on the success path in the lines
        visible here -- the function may continue past this excerpt; confirm.
    """
    vid = line_dict['vid']
    if vid in TRAD:
        vid = TRAD[vid]
    if vid not in VENUE_LOC:
        return None
    if line_dict['_id'] == 'ICWSM':
        # The hash input reads line_dict['vid'], i.e. the id as it was before
        # the TRAD translation above.
        txt = ''.join(itemgetter('uid', 'vid', 'time')(line_dict))
        # [2:-1] drops the '0x' prefix and the final character. Under
        # Python 2 (see the shebang) hex() of a long ends in 'L'.
        # NOTE(review): assumes HASHER returns a long -- confirm.
        line_dict['_id'] = hex(HASHER(txt))[2:-1]
    line_dict['uid'] = int(line_dict['uid'])
    line_dict['loc'], line_dict['city'] = VENUE_LOC[vid]
    line_dict['time'] = datetime.strptime(line_dict['time'],
                                          '%Y-%m-%dT%H:%M:%SZ')
Esempio n. 4
0
#! /usr/bin/python2
# vim: set fileencoding=utf-8
from operator import itemgetter
from datetime import datetime
import CommonMongo as cm
import csv
import persistent
import pyhash
# Module-level list; nothing in the visible code appends to it.
# NOTE(review): presumably collects records awaiting insertion -- confirm.
TO_BE_INSERTED = []
# Hash callable used by reformat() to build an '_id' for 'ICWSM' records.
HASHER = pyhash.spooky_128()
# Looked up by venue id in reformat(); each entry unpacks into (loc, city).
VENUE_LOC = persistent.load_var('venue_loc.my')
# Venue id translation table (old id -> new id), filled from trad.dat below.
TRAD = {}
with open('trad.dat', 'r') as f:
    for line in f:
        # Every line must hold exactly two ';'-separated fields, otherwise
        # the tuple unpacking raises ValueError.
        old, new = line.strip().split(';')
        TRAD[old] = new


def reformat(line_dict):
    """Convert the fields of one record dict in place.

    The venue id is translated through TRAD when it has an entry there. If
    the resulting id is not in VENUE_LOC the record is rejected. Otherwise
    the visible code:

    * replaces an '_id' equal to 'ICWSM' with a hex string derived from the
      hash of uid + vid + time,
    * converts 'uid' to int,
    * sets 'loc' and 'city' from the VENUE_LOC entry,
    * parses 'time' with the format '%Y-%m-%dT%H:%M:%SZ' into a datetime.

    :param line_dict: dict with at least the keys 'vid', '_id', 'uid' and
        'time'; 'uid', 'vid' and 'time' are joined as strings when '_id' is
        'ICWSM'.
    :return: None when the venue id is not in VENUE_LOC.
        NOTE(review): no value is returned on the success path in the lines
        visible here -- the function may continue past this excerpt; confirm.
    """
    vid = line_dict['vid']
    if vid in TRAD:
        vid = TRAD[vid]
    if vid not in VENUE_LOC:
        return None
    if line_dict['_id'] == 'ICWSM':
        # The hash input reads line_dict['vid'], i.e. the id as it was before
        # the TRAD translation above.
        txt = ''.join(itemgetter('uid', 'vid', 'time')(line_dict))
        # [2:-1] drops the '0x' prefix and the final character. Under
        # Python 2 (see the shebang) hex() of a long ends in 'L'.
        # NOTE(review): assumes HASHER returns a long -- confirm.
        line_dict['_id'] = hex(HASHER(txt))[2:-1]
    line_dict['uid'] = int(line_dict['uid'])
    line_dict['loc'], line_dict['city'] = VENUE_LOC[vid]
    line_dict['time'] = datetime.strptime(line_dict['time'],
                                          '%Y-%m-%dT%H:%M:%SZ')