Beispiel #1
0
def ft_process(text):
	"""Update the global max_0 sketch from quote lines in *text*.

	Lines starting with 'Q' carry a quote after a two-character prefix.
	For each quote, 30 seeded hash values are computed — buckets 0-9 use
	super_fast_hash, 10-19 murmur3_32, 20-29 xx_32 — and max_0[i] keeps
	the maximum number of trailing zero bits observed for bucket i
	(appears to be a Flajolet-Martin style distinct-count sketch).
	"""
	global max_0

	def _trailing_zeros(value):
		# Number of trailing zero bits of value; 0 when no '1' bit exists
		# (e.g. value == 0, whose bin string contains no '1').
		bits = bin(value)[2:]
		last_one = bits.rfind('1')
		return 0 if last_one == -1 else len(bits) - (last_one + 1)

	# (seed range, hash family) pairs — was three copy-pasted loops.
	families = (
		(range(0, 10), pyhash.super_fast_hash),
		(range(10, 20), pyhash.murmur3_32),
		(range(20, 30), pyhash.xx_32),
	)
	for line in text.split('\n'):
		# skip blank entries and lines that don't begin with 'Q'
		if not line or line[0] != "Q":
			continue
		quote = line[2:]
		for seeds, family in families:
			for i in seeds:
				tail_0 = _trailing_zeros(family(seed=i)(quote))
				if tail_0 > max_0[i]:
					max_0[i] = tail_0
	return
Beispiel #2
0
class GenericObject(ABC):

    # Shared SuperFastHash instance used to derive numeric ids from
    # class names.
    _hasher = pyhash.super_fast_hash()

    def _get_name(self):
        """Return the name of this instance's concrete class."""
        return type(self).__name__

    def _get_id(self):
        """Return the SuperFastHash of the class name as a numeric id."""
        name = self._get_name()
        return self._hasher(name)

    def __str__(self):
        return "GenericObject"
Beispiel #3
0
    def __init__(self, size=65536, k=7, name='bf', load=False):
        """Create a Bloom filter, or restore one from disk.

        size: number of bits in the bit vector.
        k: number of hash functions used per key (must satisfy 0 < k <= 18).
        name: base name used when persisting/loading the filter.
        load: when True, restore previously saved state via self.load(name)
              instead of building a fresh filter.
        """
        if load:
            self.load(name)
        else:
            self.size = size
            if k > 18 or k <= 0:
                # NOTE(review): this silently leaves the object partially
                # initialized (self.k / self.bitarray / self.hashes are never
                # set); raising ValueError would be safer, but the original
                # print-and-return contract is kept for backward compatibility.
                print('k should be > 0 & <= 18')
                return None
            self.k = k
            self.name = name
            self.bitarray = bitarray.bitarray('0' * self.size)
            # One auxiliary table of per-bit sets for each hash function.
            self.tables = [[set() for _ in range(self.size)]
                           for _ in range(self.k)]

        def _digest_hash(algorithm):
            # Build an int-valued hash from a hashlib digest. The original
            # lambdas shadowed the builtin ``str`` with their parameter
            # name; a neutral name is used instead.
            return lambda text: int(
                hashlib.new(algorithm, text.encode('utf-8')).hexdigest(), 16)

        # 12 fast non-cryptographic hashes plus 6 cryptographic digests,
        # all callable as f(str) -> int.
        self.hashes = [
            pyhash.fnv1_64(),
            pyhash.murmur2_x64_64a(),
            pyhash.murmur3_x64_128(),
            pyhash.lookup3(),
            pyhash.super_fast_hash(),
            pyhash.city_128(),
            pyhash.spooky_128(),
            pyhash.farm_128(),
            pyhash.metro_128(),
            pyhash.mum_64(),
            pyhash.t1_64(),
            pyhash.xx_64(),
            _digest_hash('md5'),
            _digest_hash('sha1'),
            _digest_hash('sha224'),
            _digest_hash('sha256'),
            _digest_hash('sha384'),
            _digest_hash('sha512'),
        ]
Beispiel #4
0
    def __init__(self, host, port, servers):
        """Accept `servers` back-end connections and wrap each one in a
        PDU socket plus a (read, write) lock pair."""
        self.host = host
        self.port = port
        self.servers = servers
        self.logger = logging.getLogger("FE-Bridge")
        self.socket = ServerSocket(host, port)
        self.be_conn = []
        self.be_conn_locks = []
        self.hasher = pyhash.super_fast_hash()

        self.logger.info("Connecting with BE servers")
        accepted = []
        for _ in range(self.servers):
            conn, addr = self.socket.accept_client()
            self.logger.info("Connection accepted %r" % (addr, ))
            accepted.append((addr, conn))

        # Deterministic back-end ordering: sort by address.
        for _addr, conn in sorted(accepted):
            self.be_conn.append(ServersClientBridgePDUSocket(conn))
            # First lock coordinates reading, second coordinates writing.
            self.be_conn_locks.append((Lock(), Lock()))
Beispiel #5
0
 def super_fast_hash(self):
     """Return the SuperFastHash of this object's raw data."""
     hasher = pyhash.super_fast_hash()
     return hasher(self.data)
import pyhash

# Shared fast, non-cryptographic string hasher (SuperFastHash), used for
# bucketing keys into index shards.
hasher = pyhash.super_fast_hash()

# constants controlling amount of subindices
NUM_OF_INVERTED_INDEX_SHARDS = 10000

# paths to data
DATA_BASE_PATH = './data/'
DATA_WIKI_PATH = DATA_BASE_PATH + 'wiki-pages/'
DATA_TRAINING_PATH = DATA_BASE_PATH + 'train.jsonl'
DATA_DEV_LABELED_PATH = DATA_BASE_PATH + 'shared_task_dev.jsonl'
DATA_TEST_UNLABELED_PATH = DATA_BASE_PATH + 'shared_task_test.jsonl'
# alternative word2vec embeddings, kept for reference; GloVe is used below
#DATA_PRETRAINED_EMBEDDINGS_PATH = DATA_BASE_PATH + 'GoogleNews-vectors-negative300.bin'
DATA_PRETRAINED_EMBEDDINGS_PATH = DATA_BASE_PATH + 'glove.840B.300d.txt'

# paths to generated auxiliary data and output
GENERATED_BASE_PATH = './generated/'
GENERATED_FIGURES_BASE_PATH = GENERATED_BASE_PATH + 'figures/'
GENERATED_COUNTS_PATH = GENERATED_BASE_PATH + 'accumulated_word_count.jsonl'
GENERATED_IDF_PATH = GENERATED_BASE_PATH + 'words_with_idf.jsonl'
GENERATED_WIKI_PAGE_MAPPINGS_PATH = GENERATED_BASE_PATH + 'wiki_page_batch_mappings.p'
GENERATED_DOCUMENT_NORMS_MAPPING = GENERATED_BASE_PATH + 'docs_to_norms_mapping.jsonl'
GENERATED_DOCUMENT_LENGTH_MAPPING = GENERATED_BASE_PATH + 'docs_to_lengths_mapping.jsonl'
GENERATED_INVERTED_INDEX_DIRECTORY = GENERATED_BASE_PATH + 'inverted_index/'
GENERATED_LR_PREPROCESSED_TRAINING_DATA = GENERATED_BASE_PATH + 'LR_preprocessed_training_data.p'
GENERATED_LR_PREPROCESSED_DEV_DATA = GENERATED_BASE_PATH + 'LR_preprocessed_dev_data.p'
GENERATED_LOGISTIC_REGRESSION_MODEL = GENERATED_BASE_PATH + 'LR_model.p'
GENERATED_LOGISTIC_REGRESSION_LOSS_HISTORY = GENERATED_BASE_PATH + 'LR_loss_history.p'
# NN paths are templates: format with a model/variant identifier.
GENERATED_NEURAL_NETWORK_MODEL = GENERATED_BASE_PATH + 'NN_model_{}.p'
GENERATED_NEURAL_NETWORK_LOSS_HISTORY = GENERATED_BASE_PATH + 'NN_loss_history_{}.p'


import pyhash
#https://code.google.com/p/pyfasthash/

h_fnv1_32 = pyhash.fnv1_32()


def fnv1_32(req):
    """Return the 32-bit FNV-1 hash of the string form of *req*."""
    key = str(req)
    return h_fnv1_32(key)


h_lookup3 = pyhash.lookup3_big()


def lookup3(req):
    """Return the big-endian lookup3 hash of the string form of *req*."""
    key = str(req)
    return h_lookup3(key)

h_super_fast_hash = pyhash.super_fast_hash()


def super_fast_hash(req):
    """Return the SuperFastHash of the string form of *req*."""
    key = str(req)
    return h_super_fast_hash(key)


h_murmur2_x64_64a = pyhash.murmur2_x64_64a()


def murmur2_x64_64a(req):
    """Return the 64-bit MurmurHash2 (x64, variant A) of str(req)."""
    key = str(req)
    return h_murmur2_x64_64a(key)


h_murmur3_32 = pyhash.murmur3_32()


def murmur3_32(req):
    """Return the 32-bit MurmurHash3 of the string form of *req*."""
    key = str(req)
    return h_murmur3_32(key)

h_fnv1a_64 = pyhash.fnv1a_64()
def fnv1a_64(req):
import pyhash
import sys
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Bloom filter parameters. bit_vector is presumably sized and zeroed
# elsewhere before insertion — TODO confirm against the rest of the file.
bloomFilterSize = 10
bit_vector = []

#hashFunctions — ten independent non-cryptographic hashers from pyhash
fnv = pyhash.fnv1a_32()
mur = pyhash.murmur3_32()
lookup = pyhash.lookup3()
super1 = pyhash.super_fast_hash()
city = pyhash.city_64()
spooky = pyhash.spooky_32()
farm = pyhash.farm_32()
metro = pyhash.metro_64()
mum = pyhash.mum_64()
xx = pyhash.xx_32()
#10 hash functions, consumed in list order (the first hashFuncCount+1 are used)
hashfuncs = [fnv, mur, lookup, super1, city, spooky, farm, metro, mum, xx]
#hash


def insertBloom(kmer, hashFuncCount):
    global bloomFilterSize
    global bit_vector
    index = 0
    for hf in hashfuncs:
        if (index <= hashFuncCount):
            if (bit_vector[hf(kmer) % bloomFilterSize] == 0):
Beispiel #9
0
# TODO: add javascript escape code here so it's available in the template engine

from datetime import datetime
from routes import url_for
from mako import filters
from six.moves.urllib.parse import urlparse, ParseResult
import re
from routes import request_config
from pybald import context
import logging
log = logging.getLogger(__name__)

try:
    # Prefer pyhash's fast, implementation-independent hash for asset URLs.
    import pyhash
    hashfunc = pyhash.super_fast_hash()
except ImportError:
    log.warning("-" * 80 + '''
    Warning
    -------
    Using python built-in hash() for asset URL generation. This is system
    implementation specific and may result in different hosts mapping static
    assets to different static hosts. That may cause inefficient use of browser
    caches. Optionally you can install pyhash to install additional fast,
    non-cryptographic, hashes that are not system dependent.

    pip install pyhash
''' + "-" * 80)
    # Fall back to the builtin hash(); its values vary across interpreter
    # implementations (and across runs when hash randomization is on).
    hashfunc = hash

try:
 def __init__(self, locks_pool_size):
     """Set up the string hasher and a fixed pool of read/write locks."""
     self.hasher = pyhash.super_fast_hash()
     self.file_locks_len = locks_pool_size
     # Build the whole lock pool up front.
     self.file_locks = [ReadWriteLock() for _ in range(self.file_locks_len)]
Beispiel #11
0
# encoding: utf-8
'''HTML page helper functions as well as simple asset tag handling.'''

import os
import project

# from urlparse import urlparse, ParseResult
# global request_config... how can we eliminate?
# from routes import request_config
from pybald.core.helpers import HTMLLiteral, AssetUrl

import logging
console = logging.getLogger(__name__)
try:
    # Prefer pyhash's fast, implementation-independent hash for asset URLs.
    import pyhash
    hashfunc = pyhash.super_fast_hash()
except ImportError:
    # Logger.warn is a deprecated alias since Python 3.3; use warning().
    console.warning("!"*10 + '''  Using python built-in hash() for asset URL
generation. This is system implementation specific and may result in different
hosts mapping static assets to different static hosts. That may cause
inefficient use of browser caches. Optionally you can install pyhash to
install additional fast, non-cryptographic, hashes that are not system
dependent.

pip install pyhash
''')
    # Fall back to the builtin hash(); values are implementation specific.
    hashfunc = hash


# Cache of computed asset tags, keyed per asset.
asset_tag_cache = {}
Beispiel #12
0
 def __hash__(self):
     # Hash the instance with SuperFastHash. NOTE(review): pyhash hashers
     # take str/bytes input, so this assumes the instance is
     # string/bytes-like — confirm against the enclosing class.
     return pyhash.super_fast_hash()(self)
Beispiel #13
0
def ft_superfasthash(text):
	"""Map *text* into [0, m) using pyhash's SuperFastHash."""
	global m
	hasher = pyhash.super_fast_hash()
	return hasher(text) % m