def get_signature(A, k=200, hash_func=None):
    """Return the bottom-k MinHash signature of the iterable *A*.

    Hashes every element with *hash_func* and keeps the ``k`` smallest
    *distinct* hash values, forming a bottom-k sketch of the set.

    Parameters
    ----------
    A : iterable
        Elements to hash (anything *hash_func* accepts).
    k : int, optional
        Maximum signature size; at most ``k`` hashes are returned.
        ``k <= 0`` yields an empty signature (the previous version
        raised IndexError in that case).
    hash_func : callable, optional
        Maps an element to an integer hash. Defaults to
        ``pyhash.city_64()``, created lazily so the module can be
        imported (and this function called with an explicit hash
        function) without touching pyhash.

    Returns
    -------
    list of int
        The kept hash values in ascending order (length <= k).
    """
    import heapq

    if hash_func is None:
        # Lazy default: evaluating pyhash.city_64() in the def line
        # runs at import time and couples module import to pyhash.
        hash_func = pyhash.city_64()

    heap = []        # max-heap of the k smallest hashes (stored negated)
    members = set()  # O(1) duplicate test; mirrors heap contents
    for elm in A:
        h = hash_func(elm)
        if h in members:
            continue
        if len(heap) < k:
            heapq.heappush(heap, -h)
            members.add(h)
        elif heap and h < -heap[0]:
            # Replace the largest kept hash with the new, smaller one
            # in a single O(log k) operation.
            evicted = -heapq.heapreplace(heap, -h)
            members.discard(evicted)
            members.add(h)
    return sorted(members)
def get_signature_faster(A, k=200, hash_func=None):
    """Return the k smallest distinct hash values of the elements of *A*.

    Hashes every element up front and then selects the ``k`` lowest
    distinct hashes. Faster than maintaining a sorted structure per
    insert, at the cost of holding every distinct hash in memory.

    Parameters
    ----------
    A : iterable
        Elements to hash.
    k : int, optional
        Maximum signature size.
    hash_func : callable, optional
        Maps an element to an integer hash. Defaults to
        ``pyhash.city_64()``, created lazily so importing this module
        does not require pyhash when a hash function is supplied.

    Returns
    -------
    list of int
        The k smallest distinct hashes in ascending order.
    """
    import heapq

    if hash_func is None:
        hash_func = pyhash.city_64()
    # nsmallest is O(n log k) and returns a sorted list -- same result
    # as sorted(set(...))[:k] without sorting the whole hash set.
    return heapq.nsmallest(k, {hash_func(elm) for elm in A})
def srp_matrix(cls, words, ndims, _hashfunc=None):
    """Generate a random +/-1 matrix using a hash function.

    It will have `ndims` columns, and a row for every word, for a total
    of `len(words)` rows. The values are determined by a hash function:
    to create a row, we hash the corresponding word with a tag appended,
    change the tag and hash a second time, and continue until we have
    more than `ndims` random bits available. The hash values are then
    unpacked into a matrix of bits, and the 0s are changed to -1s (so
    that the matrix approximately preserves length when used for
    projection). Because we use a hash function pre-seeded with a fixed
    value, a given word will always generate the same row of numbers.

    This is a hasty implementation of Ben Schmidt's Stable Random
    Projection (https://culturalanalytics.org/article/11033).
    Errors are mine alone.

    Parameters
    ----------
    words : sequence of str
        Vocabulary; one output row per word.
    ndims : int
        Number of columns in the result.
    _hashfunc : callable, optional
        str -> 64-bit unsigned int. Defaults to ``city_64(0)``, created
        lazily so that merely defining/importing this code does not
        require pyhash (the old def-time default did).

    Returns
    -------
    numpy.ndarray of float64, shape (len(words), ndims)
        Every entry is exactly -1.0 or 1.0.
    """
    if _hashfunc is None:
        _hashfunc = city_64(0)
    # Number of 64-bit hashes needed per word to cover ndims bits.
    multiplier = (ndims - 1) // 64 + 1
    hashes = [
        list(map(_hashfunc,
                 ['{}_{}'.format(w, i) for i in range(multiplier)]))
        for w in words
    ]

    # Given a `multiplier` value of 5, `hashes` is really a V x 5
    # array of 64-bit integers, where V is the vocabulary size...
    hash_arr = numpy.array(hashes, dtype=numpy.uint64)

    # ...but we could also think of it as a V x 40 array of bytes...
    # NOTE(review): the uint8 view depends on platform byte order, so
    # rows differ between little- and big-endian machines -- confirm
    # cross-platform reproducibility is not required.
    hash_arr = hash_arr.view(dtype=numpy.uint8)

    # ...or even as an array of bits, where every word is represented
    # by 320 bits...
    hash_arr = numpy.unpackbits(hash_arr.ravel()).reshape(-1, 64 * multiplier)

    # ...or as an array of floating point values, all equal to either
    # 1.0 or 0.0, truncated to the final V x ndims shape and mapped
    # {0, 1} -> {-1, +1}.
    return (hash_arr.astype(numpy.float64) * 2 - 1)[:, :ndims]
from falcon_cors import CORS
import sys
import json
import csv
import io
from collections import defaultdict
from hashlib import sha512
from array import array
import secrets
import crypt
#import dateparser #or see https://opensource.com/article/18/4/python-datetime-libraries
import datetime
from dateutil.parser import parse
from ciso8601 import parse_datetime
import pyhash

# Single module-wide hash instance: 64-bit CityHash via pyhash.
hasher=pyhash.city_64() #consider xxhash? https://pypi.org/project/xxhash/
#Cityhash? Others? https://www.reddit.com/r/programming/comments/700xiv/xxhash_extremely_fast_noncryptographic_hash/
import statistics #or numpy

###### The peewee ORM
from peewee import *
from playhouse.postgres_ext import *

# Module-level database handle shared by every model below.
# NOTE(review): user='******' looks like a placeholder credential --
# move real credentials to configuration/environment, never the source.
# NOTE(review): server_side_cursors is passed as a constructor kwarg;
# newer peewee releases moved server-side-cursor support to the
# ServerSide() helper -- confirm against the pinned peewee version.
db = PostgresqlExtDatabase('movilidad', user='******',server_side_cursors=True) # password='', host='127.0.0.1')

# Abstract base model: concrete models inherit the shared `db`
# connection through this Meta.
class BaseModel(Model):
    class Meta:
        database = db
import pyhash import sys import matplotlib.mlab as mlab import matplotlib.pyplot as plt bloomFilterSize = 10 bit_vector = [] #hashFunctions fnv = pyhash.fnv1a_32() mur = pyhash.murmur3_32() lookup = pyhash.lookup3() super1 = pyhash.super_fast_hash() city = pyhash.city_64() spooky = pyhash.spooky_32() farm = pyhash.farm_32() metro = pyhash.metro_64() mum = pyhash.mum_64() xx = pyhash.xx_32() #10 hash functions hashfuncs = [fnv, mur, lookup, super1, city, spooky, farm, metro, mum, xx] #hash def insertBloom(kmer, hashFuncCount): global bloomFilterSize global bit_vector index = 0 for hf in hashfuncs: if (index <= hashFuncCount): if (bit_vector[hf(kmer) % bloomFilterSize] == 0):