# Beispiel #1
def get_signature(A, k=200, hash_func=pyhash.city_64()):
    """Return the bottom-k MinHash signature of iterable ``A``.

    Computes the ``k`` smallest *distinct* hash values of the elements of
    ``A``, maintained incrementally in a SortedDict so only ``k`` hashes are
    held in memory at any time.

    Args:
        A: iterable of hashable elements (whatever ``hash_func`` accepts).
        k: signature size — number of smallest distinct hashes to keep.
        hash_func: callable mapping an element to an integer hash.
            NOTE: the default is evaluated once at import time; all calls
            share the same hasher object (stateless, so this is safe).

    Returns:
        A sorted view (SortedKeysView) of the k smallest distinct hashes.
    """
    signature = sortedcontainers.SortedDict()
    for elm in A:
        elm_hash = hash_func(elm)
        # A duplicate hash can never change the signature — skip early so
        # the membership test is done exactly once per element.
        if elm_hash in signature:
            continue
        if len(signature) < k:
            signature[elm_hash] = 1
        elif elm_hash < signature.peekitem(-1)[0]:
            # peekitem(-1) is the supported replacement for the removed
            # SortedDict.iloc indexer: evict the current largest hash,
            # then admit the smaller one.
            signature.popitem()
            signature[elm_hash] = 1
    return signature.keys()
# Beispiel #2
def get_signature_faster(A, k=200, hash_func=pyhash.city_64()):
    """ Load all hashes in memory, then return the k lowest.

        Is faster because the hash list must not be sorted
        for every hash insert, but needs more memory.
    """
    # Hash every element, deduplicate via a set, then sort once and
    # truncate to the k smallest values.
    distinct_hashes = {hash_func(elm) for elm in A}
    return sorted(distinct_hashes)[:k]
    def srp_matrix(cls, words, ndims, _hashfunc=city_64(0)):
        """
        Generate a random matrix using a hash function. It will have
        `ndims` columns, and a row for every word, for a total of
        `len(words)` rows. The values will be determined by a hash
        function. To create a row, we hash the corresponding word
        with a tag appended, change the tag and hash it a second time,
        and continue until we have more than `ndims` random bits
        available. The hash values are then unpacked into a matrix
        of bits, and the 0s are changed to -1s (so that the matrix
        approximately preserves length when used for projection).

        Because we use a hash function pre-seeded with a fixed value,
        a given word will always generate the same row of numbers.

        This is a hasty implementation of Ben Schmidt's Stable Random
        Projection (https://culturalanalytics.org/article/11033).
        Errors are mine alone.

        NOTE(review): `_hashfunc` defaults to a seeded pyhash city_64
        hasher created at import time; it must return a 64-bit unsigned
        integer for the uint64 view below to be correct.
        """
        # Number of 64-bit hashes needed to cover ndims bits (ceil division).
        multiplier = (ndims - 1) // 64 + 1
        # One list of `multiplier` hashes per word; tag each hash input
        # with its index so the same word yields `multiplier` distinct hashes.
        hashes = [
            list(map(_hashfunc, ['{}_{}'.format(w, i)
                                 for i in range(multiplier)]))
            for w in words
        ]

        # Given a `multiplier` value of 5, `hashes` is really a V x 5
        # array of 64-bit integers, where V is the vocabulary size...

        hash_arr = numpy.array(hashes, dtype=numpy.uint64)

        # ...but we could also think of it as a V x 40 array of bytes...

        hash_arr = hash_arr.view(dtype=numpy.uint8)

        # ...or even as an array of bits, where every word is represented
        # by 320 bits...

        hash_arr = numpy.unpackbits(hash_arr.ravel()).reshape(-1,
                                                              64 * multiplier)

        # ...or as an array of floating point values, all equal to either
        # 1.0 or 0.0, and truncated to give a final array of V x ndims.

        return (hash_arr.astype(numpy.float64) * 2 - 1)[:, :ndims]
# Beispiel #4
from falcon_cors import CORS
import sys
import json
import csv
import io
from collections import defaultdict
from hashlib import sha512
from array import array
import secrets
import crypt
#import dateparser #or see https://opensource.com/article/18/4/python-datetime-libraries
import datetime
from dateutil.parser import parse
from ciso8601 import parse_datetime
import pyhash
hasher=pyhash.city_64()
#consider xxhash? https://pypi.org/project/xxhash/  
#Cityhash? Others? https://www.reddit.com/r/programming/comments/700xiv/xxhash_extremely_fast_noncryptographic_hash/
import statistics #or numpy


###### The peewee ORM
from peewee import *
from playhouse.postgres_ext import *

# Peewee connection to the 'movilidad' Postgres database (credentials redacted).
# server_side_cursors=True streams large result sets instead of loading them
# fully into memory.
db = PostgresqlExtDatabase('movilidad', user='******',server_side_cursors=True) # password='', host='127.0.0.1')

class BaseModel(Model):
    """Common peewee base model binding all subclasses to the `db` connection."""
    class Meta:
        # Peewee convention: the inner Meta class tells every model that
        # inherits from BaseModel which database to use.
        database = db
import pyhash
import sys
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Bloom-filter configuration: number of bits in the filter. NOTE(review):
# 10 bits looks like a toy/demo size — presumably resized elsewhere before use.
bloomFilterSize = 10
# The filter's bit array; expected to be initialized (e.g. to zeros) before
# insertBloom is called — it is empty here.
bit_vector = []

#hashFunctions
# One instance of each pyhash family; each is a callable str/bytes -> int.
fnv = pyhash.fnv1a_32()
mur = pyhash.murmur3_32()
lookup = pyhash.lookup3()
super1 = pyhash.super_fast_hash()
city = pyhash.city_64()
spooky = pyhash.spooky_32()
farm = pyhash.farm_32()
metro = pyhash.metro_64()
mum = pyhash.mum_64()
xx = pyhash.xx_32()
#10 hash functions
# Ordered pool of hash functions; insertBloom uses the first
# hashFuncCount+1 of them to set filter bits.
hashfuncs = [fnv, mur, lookup, super1, city, spooky, farm, metro, mum, xx]
#hash
#hash


def insertBloom(kmer, hashFuncCount):
    global bloomFilterSize
    global bit_vector
    index = 0
    for hf in hashfuncs:
        if (index <= hashFuncCount):
            if (bit_vector[hf(kmer) % bloomFilterSize] == 0):