Example 1
 def _hash_bits(self, key):
     # http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
     hash1 = mmh3.hash(key, 0)
     hash2 = mmh3.hash(key, hash1)
     for i in range(self._hash_funcs):
         yield abs((hash1 + i * hash2) % self._bits_per_slice)
def data(path, label_path=None):
    fd = open(path)
    fd.readline() # skip headers
    hash_cols = [3,4,34,35,61,64,65,91,94,95]
    npairs = len(hash_cols)
    x = [0] * (146 + npairs*(npairs-1)/2)
    if label_path:
        label = open(label_path)
        label.readline() # skip headers
    for t, line in enumerate(fd):
        # parse x
        row = line.rstrip().split(',')
        for m, feat in enumerate(row):
            if m == 0:
                ID = int(feat)
            else:
                # one-hot encode everything with hash trick
                # categorical: one-hotted
                # boolean: ONE-HOTTED
                # numerical: ONE-HOTTED!
                # note: the built-in hash(), although fast, is not stable,
                #       i.e., same value won't always have the same hash
                #       on different machines
                x[m] = abs(mmh3.hash(str(m) + '_' + feat)) % D
        for i in xrange(10):
            for j in xrange(i+1,10):
                m += 1
                x[m] = abs(mmh3.hash(str(m)+'_'+row[hash_cols[i]]+"_x_"+row[hash_cols[j]])) % D
        # parse y, if provided
        if label_path:
            # use float() to prevent future type casting, [1:] to ignore id
            y = [float(y) for y in label.readline().split(',')[1:]]
        yield (ID, x, y) if label_path else (ID, x)
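For reference, the core of the hashing trick used above can be isolated into a small helper; the helper name and the feature-space size D below are illustrative, not part of the original script:

import mmh3

D = 2 ** 20  # hypothetical size of the hashed feature space

def hash_row(row):
    # one-hot encode every column with the hashing trick: the column
    # position is prepended so equal values in different columns map
    # to different feature indices
    return [abs(mmh3.hash(str(m) + '_' + feat)) % D for m, feat in enumerate(row)]

# hash_row(['a', 'b', '42']) -> three indices in [0, D)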
Example 3
File: m_api.py Project: J4LP/mumble
 def authenticate(self, name, password, certificates, certhash, certstrong, current=None):
     with self.app.app_context():
         if name == 'SuperUser':
             return RET_FALLTHROUGH
         user = User.query.filter_by(user_id=name).first()
         if not user:
             try:
                 uuid.UUID(name, version=4)
             except ValueError:
                 return RET_DENIED
             guest_user = GuestUser.query.get(name)
             if guest_user:
                 if not guest_user.password == password or guest_user.banned:
                     return RET_DENIED
                 if guest_user.corporation:
                     self.app.logger.debug('Authenticating guest with: {} {} {}'.format(abs(mmh3.hash(guest_user.id.hex)), '[{}][GUEST] {}'.format(self.get_ticker(guest_user.corporation), guest_user.name), [u'Guest']))
                     return abs(mmh3.hash(guest_user.id.hex)), '[{}][GUEST] {}'.format(self.get_ticker(guest_user.corporation), guest_user.name), [u'Guest']
                 else:
                     self.app.logger.debug('Authenticating guest with: {} {} {}'.format(abs(mmh3.hash(guest_user.id.hex)), '[GUEST] {}'.format(guest_user.name), [u'Guest']))
                     return abs(mmh3.hash(guest_user.id.hex)), '[GUEST] {}'.format(guest_user.name), ['Guest']
             else:
                 return RET_DENIED
         if not user.mumble_password == password:
             return RET_DENIED
         self.app.logger.debug('Authenticating user with: {} {} {}'.format(mmh3.hash(user.user_id), '[{}] {}'.format(self.get_ticker(user.corporation_name), user.main_character), user.groups))
         return mmh3.hash(user.user_id), '[{}] {}'.format(self.get_ticker(user.corporation_name), user.main_character), user.groups
Example 4
    def hash(self, string):
        hash_arr = []
        hash1 = mmh3.hash(string, 0)
        hash2 = mmh3.hash(string, hash1)
        for i in range(self.k):
            hash_arr.append(abs((hash1 + i * hash2) % self.m))

        return hash_arr
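The two MurmurHash3 calls above implement the Kirsch-Mitzenmacher double-hashing scheme (index_i = h1 + i*h2), which derives k Bloom-filter positions from just two hash computations. A minimal filter built on the same idea might look like the sketch below; the class name, the fixed seeds, and the parameters m and k are illustrative:

import mmh3
from bitarray import bitarray

class TinyBloom(object):
    def __init__(self, m, k):
        self.m, self.k = m, k
        self.bits = bitarray(m)
        self.bits.setall(False)

    def _indices(self, item):
        # two base hashes with fixed seeds, combined k times
        h1 = mmh3.hash(item, 0)
        h2 = mmh3.hash(item, 1)
        return [(h1 + i * h2) % self.m for i in range(self.k)]

    def add(self, item):
        for idx in self._indices(item):
            self.bits[idx] = True

    def __contains__(self, item):
        return all(self.bits[idx] for idx in self._indices(item))

# bf = TinyBloom(m=10000, k=7); bf.add("rajiv"); "rajiv" in bf  -> True (probably)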
Example 5
 def Hashmap_WordVector(self,nbits):
     length=len(self.Words_Vector)
     self.bl_bits=nbits
     self.bloom_vector=self.bl_bits*bitarray('0')
     for i in range(length):
         self.hashmap1.append(mmh3.hash(self.Words_Vector[i]) % self.bl_bits )
         self.hashmap2.append(mmh3.hash(self.Words_Vector[i],self.hashmap1[i]) % self.bl_bits )
         self.hashmap3.append(mmh3.hash(self.Words_Vector[i],self.hashmap2[i]) % self.bl_bits )
         self.bloom_vector[self.hashmap1[i]]=1
         self.bloom_vector[self.hashmap2[i]]=1
         self.bloom_vector[self.hashmap3[i]]=1
Example 6
def get_hash(label,namespace,feature,stride,mask):
    if namespace:
        namespace_hash = mmh3.hash(namespace,0)
    else:
        namespace_hash = 0
    if is_number(feature):
        feature_hash = int(feature) + namespace_hash
    else:
        feature_hash = mmh3.hash(feature,namespace_hash)
    feature_hash_oaa = feature_hash * stride
    return (feature_hash_oaa + label - 1) & mask
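A hedged usage sketch for get_hash: the is_number helper is assumed to be a plain integer check (it is not shown in the snippet), and the stride/mask values are hypothetical Vowpal-Wabbit-style parameters (a weight table of 2**b slots, one-against-all over several labels):

import mmh3

def is_number(feature):
    # assumed helper: True if the feature token is an integer literal
    try:
        int(feature)
        return True
    except ValueError:
        return False

b = 18
mask = (1 << b) - 1   # hashed indices wrap into a 2**18-slot weight table
stride = 3            # e.g. three one-against-all labels

# namespace left out to keep the sketch minimal; with a namespace string,
# its hash seeds the feature hash exactly as in the function above
slot = get_hash(label=2, namespace=None, feature='age', stride=stride, mask=mask)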
Example 7
 def parse_block(block):
     index_block = []
     for file_path in block:
         file_path_hash = mmh3.hash(file_path)
         with open(file_path, 'r') as input_file:
             for line in input_file:
                 items = line.strip().split(' ')
                 index_block.append(
                     (mmh3.hash(items[0]),
                      [file_path_hash,
                      items[1]])
                 )
     return index_block
Example 8
def select_terms_meta(query_terms, term_dict_stream):
    """
    reads term dictionary generator and selects query terms meta info
    """
    terms_meta_dict = {}

    for term in query_terms:
        term_hash = mmh3.hash(term.encode("utf-8"))
        terms_meta_dict[term_hash] = {
            "term": term,
            "seek_offset": None,
            "size": None
        }

    seek_offset = 0
    unseen_terms = set(terms_meta_dict)
    for dict_term_hash, dict_term_size in term_dict_stream:
        if dict_term_hash in unseen_terms:
            terms_meta_dict[dict_term_hash]["seek_offset"] = seek_offset
            terms_meta_dict[dict_term_hash]["size"] = dict_term_size

            unseen_terms.remove(dict_term_hash)
            if len(unseen_terms) == 0:
                break

        seek_offset += dict_term_size

    query_terms_dict = {}
    for _, term_meta in terms_meta_dict.items():
        query_terms_dict[term_meta["term"]] = {
            "seek_offset": term_meta["seek_offset"],
            "size": term_meta["size"]
        }

    return query_terms_dict
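The term_dict_stream argument is assumed to yield (term_hash, size) pairs in the order the posting lists are laid out on disk, so that summing the sizes of the preceding entries gives each term's seek offset. A tiny illustrative call (all values made up):

import mmh3

term_dict_stream = iter([
    (mmh3.hash(u"foo".encode("utf-8")), 120),
    (mmh3.hash(u"bar".encode("utf-8")), 64),
])

meta = select_terms_meta([u"bar"], term_dict_stream)
# meta == {u"bar": {"seek_offset": 120, "size": 64}}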
Example 9
    def _indices(self, x):
        ''' A helper generator that yields the indices in x

            The purpose of this generator is to make the following
            code a bit cleaner when doing feature interaction.
        '''

        # first yield index of the bias term
        yield 0, 1.

        # then yield the linear indices
        if self.interaction != 2:
            for i,val in x:
                yield i,val

        # now yield interactions (if applicable)
        if self.interaction:
            D = self.D
            L = len(x)

            x = sorted(x)
            for i in xrange(L):
                for j in xrange(i+1, L):
                    # one-hot encode interactions with hash trick
                    yield abs(hash(str(x[i][0]) + '_' + str(x[j][0]))) % D, x[i][1]*x[j][1]
Example 10
 def process(self):
     # load data
     data = self.load()
     # index to elastic search
     print "\nStart processing"
     cursor = Cursor(self.es, self.data_from)
     cursor_num = cursor.get_new_cursor()
     for each_data in data:
         key_string = ''
         for each_key_string in key_value:
             key_string += each_data[each_key_string]
         hashkey = mmh3.hash(key_string)
         print "parsing id: ", hashkey
         # try to read record
         try:
             res = self.es.get(   index="deltadb", 
                             doc_type="data", 
                             id=hashkey)
             if res["found"]:
                 node = self.update_node(res["_source"], each_data, cursor_num)
             else:
                 node = self.create_node(each_data, cursor_num)
         except:
             node = self.create_node(each_data, cursor_num)
         # insert back to es
         try:
             res = self.es.index(index="deltadb", 
                                 doc_type="data", 
                                 id=hashkey, 
                                 body=node)
         except:
             continue
     print "\nProcess finish."
Example 11
def getHash(word):
	'''
		Return the hash value ANDed with 0xffffffff to force an unsigned 32-bit result
	'''
	curHash = mmh3.hash(word)
	curHash = curHash & 0xffffffff
	return curHash
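Newer mmh3 releases can return the unsigned 32-bit value directly through the signed keyword, which gives the same result as the mask above; a minimal equivalent (function name is illustrative):

import mmh3

def getHashUnsigned(word):
    # same value as mmh3.hash(word) & 0xffffffff
    return mmh3.hash(word, signed=False)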
Example 12
def shingles2sketch(shingles, m_baskets=20):
    baskets = defaultdict(lambda: -float("inf"))
    for shingle in shingles:
        h = mmh3.hash(shingle.encode('utf8'))
        if baskets[h % m_baskets] < h:
            baskets[h % m_baskets] = h
    return sorted(baskets.values())
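One way to compare two sketches produced this way is the fraction of hash values they share, as a rough stand-in for the Jaccard similarity of the underlying shingle sets; a small helper (name illustrative):

def sketch_similarity(sketch_a, sketch_b):
    # fraction of hash values common to both sketches
    a, b = set(sketch_a), set(sketch_b)
    return len(a & b) / float(len(a | b))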
Example 13
 def save_cursor(self, cursor_data):
     cursor_id = mmh3.hash(self.data_from)
     res = self.es.index(index="lookup", 
                         doc_type="data", 
                         id=cursor_id, 
                         body=cursor_data)
     return
Example 14
def saveSuccessCrawlDoc(crawldoc):
    '''step1: save crawl success crawldoc to crawl_result, make sure docid is unique
       step2: save outlinks(found new url) to crawl_pending
       step3: update crawl url status which at crawl_pending to crawled'''
    values = crawldoc.convert
    # only string can save to db, change dict or list to string.
    # reference: cccrawler.proto.db.models.CrawlResult
    values['reservation_dict'] = str(crawldoc.reservation_dict)
    values['history'] = str(values['history'])
    values['header'] = str(values['header'])
    values['created_at'] = timeutils.utcnow_ts()
    utils.convert_datetimes(values, 'created_at', 'deleted_at', 'updated_at')
    crawldoc_ref = models.CrawlResult()
    crawldoc_ref.update(values)
    crawldoc_ref.save()

    _updateCrawlStatus(crawldoc.pending_id,'crawled',crawlfail=False)

    cl = deweight.get_client()
    fresh_docs = []
    for doc in crawldoc.outlinks:
        real_url = urlutils.normalize(doc.url)
        docid = mmh3.hash(real_url)
        if not cl.has(docid):
            fresh_doc = addPendingCrawlDocDict(doc.url, int(crawldoc.level),
                        crawldoc.docid, crawldoc.reservation_dict,doc.text,
                        real_url, docid)
            print '@'*60
            print fresh_doc
            print '@'*60
            fresh_docs.append(fresh_doc)
    rushPendingCrawlDoc(fresh_docs)
Example 15
 def lookup(self, string):
     for seed in xrange(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         if self.bit_array[result] == 0:
             return "Nope"
     return "Probably"
Example 16
    def lookup(self, string):

        hashlist = [mmh3.hash(string, seed=x) % 1000000 for x in xrange(Bloom.numberofhash)]
        for x in hashlist:
            if not Bloom.bit[x]:
                return False
        return True
Example 17
    def add(self, string):

        # Hash the string
        hashlist = [mmh3.hash(string, seed=x) % 1000000 for x in xrange(Bloom.numberofhash)]

        for x in hashlist:
            Bloom.bit[x] = 1
Example 18
    def _bit_offsets(self, value):
        '''The bit offsets to set/check in this Bloom filter for a given value.

        Instantiate a Bloom filter:

            >>> dilberts = BloomFilter(
            ...     num_values=100,
            ...     false_positives=0.01,
            ...     key='dilberts',
            ... )

        Now let's look at a few examples:

            >>> tuple(dilberts._bit_offsets('rajiv'))
            (183, 319, 787, 585, 8, 471, 711)
            >>> tuple(dilberts._bit_offsets('raj'))
            (482, 875, 725, 667, 109, 714, 595)
            >>> tuple(dilberts._bit_offsets('dan'))
            (687, 925, 954, 707, 615, 914, 620)

        Thus, if we want to insert the value 'rajiv' into our Bloom filter,
        then we must set bits 183, 319, 787, 585, 8, 471, and 711 all to 1.  If
        any/all of them are already 1, no problems.

        Similarly, if we want to check to see if the value 'rajiv' is in our
        Bloom filter, then we must check to see if the bits 183, 319, 787, 585,
        8, 471, and 711 are all set to 1.  If even one of those bits is set to
        0, then the value 'rajiv' must never have been inserted into our Bloom
        filter.  But if all of those bits are set to 1, then the value 'rajiv'
        was *probably* inserted into our Bloom filter.
        '''
        encoded_value = self._encode(value)
        for seed in range(self.num_hashes()):
            yield mmh3.hash(encoded_value, seed=seed) % self.size()
Example 19
def makeHashFuncs(key, size, numHashes):
    hashValue = []
    for i in range(1, (numHashes+1)):
        value = mmh3.hash(key,i) % size
        #print value
        hashValue.append(value)
    return hashValue
Example 20
def get_image_cache_name(url):
    last_segment = url.split('/')[-1]
    if last_segment.count('.') == 1:
        extension = '.' + url.split('.')[-1]
    else:
        extension = ""
    return 'img' + str(mmh3.hash(url.encode('utf-8'))) + extension.lower()
Example 21
	def in_bf(self, elem):
		for x in xrange(self.hash_count):
			index = mmh3.hash(elem, x) % self.size
			if (self.bit_arr[index] == 0):
				return False
		return True
			
Example 22
    def add_document_indexes(self, text, url, is_print=False):
        # TODO: Maybe, it is good idea to change key from string to hash
        self.documents.append(url)
        doc_id = len(self.documents)-1
        word_list = self._split_text(text.lower())

        for word in word_list:
            #"""
            try:
                word = word.encode('utf-8')

                w_hash = mmh3.hash(word) % self.count_of_files
                if is_print:
                    print word, w_hash
                r_index = self.full_index[w_hash]

                if r_index.has_key(word):
                    r_index[word]["docs"].append(doc_id)
                else:
                    r_index[word] = {}
                    r_index[word]["docs"] = [doc_id]

                if not r_index.has_key('encoding'):
                    r_index['encoding'] = self._encoding

            except Exception as e:
                print "EXCEPTION", word
                traceback.print_exc()
Example 23
def select_hash(hashkind, line):
    """Select the kind of hashing for the line.

    :param hashkind: -- (str) The name of the hash
    :param line: -- (str) The string to hash.

    This function is a kind of hash selector which will use the hash passed
    in argument to hash the string also passed in argument.

    """
    if hashkind == "md5":
        hashline = hashlib.md5(line).hexdigest()

    elif hashkind == "sha1":
        hashline = hashlib.sha1(line).hexdigest()

    elif hashkind == "crc":
        crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
        crc32.update(line)
        hashline = crc32.hexdigest()

    elif hashkind == "murmur":
        hashline = mmh3.hash(line)

    return str(hashline)
 def count(self, item):
     counts = []
     for k, v in zip(self.sketch, range(self.hashes)):
         search_key = mmh3.hash(item, v) % self.size
         counts.append(k[search_key])
     return min(counts)
 def last_seen(self, item):
     timestamps = []
     for k, v in zip(self.sketch, range(self.hashes)):
         search_key = mmh3.hash(item, v) % self.size
         timestamps.append(k[search_key])
     return max(timestamps)
Example 26
    def contingentParitiesFunction(pop, verbose=False):
        assert(pop.shape[1] == order * height)
        popMissteps = []
        traceAndFitness = []
        for c in xrange(pop.shape[0]):
            output = 0
            ctr = 0
            length = pop.shape[1]
            loci = np.arange(length)
            missteps = []
            trace = ""
            while ctr < height:
                rng.seed(abs(mmh3.hash(trace)))
                acc = 0
                trace += "|"
                for i in xrange(order):
                    idx = rng.randint(length - (ctr * order + i)) + 1
                    swap = loci[-idx]
                    loci[-idx] = loci[ctr * order + i]
                    loci[ctr * order + i] = swap
                    trace += "%2d:%s|" % (swap + 1, int(pop[c, swap]))
                    acc += pop[c, swap]
                output += acc % 2

                if acc % 2 == 0:
                    missteps.append(ctr + 1)

                ctr +=1
            popMissteps.append(missteps)
            traceAndFitness.append((trace, height - len(missteps)))
        if verbose:
            for t in sorted(traceAndFitness):
                print "%s   %s " % t
        return np.array([height - len(missteps) for missteps in popMissteps]), popMissteps
Example 27
def alert_factory(location=None,
                bssid=None,
                channel=None,
                essid=None,
                tx=None,
                intent=None):

    # all arguments are required
    assert not any([
                location is None,
                bssid is None,
                channel is None,
                essid is None,
                tx is None,
                intent is None,
            ])

    # return dict from arguments
    _id = str(mmh3.hash(''.join([ bssid, str(channel), intent])))

    return {
        
        'id' : _id,
        'location' : location,
        'bssid' : bssid,
        'channel' : channel,
        'tx' : tx,
        'essid' : essid,
        'intent' : intent,
        'timestamp' : time.time(),
    }
Example 28
def get_scatter_prop(element_list):
  """ Gets the scatter property for an entity's key path.

  This will return a property for only a small percentage of entities.

  Args:
    element_list: A list of entity_pb.Path_Element objects.
  Returns:
    An entity_pb.Property object or None.
  """
  def id_from_element(element):
    if element.has_name():
      return element.name()
    elif element.has_id():
      return str(element.id())
    else:
      return ''

  to_hash = ''.join([id_from_element(element) for element in element_list])
  full_hash = mmh3.hash(to_hash)
  hash_bytes = struct.pack('i', full_hash)[0:2]
  hash_int = struct.unpack('H', hash_bytes)[0]
  if hash_int >= dbconstants.SCATTER_PROPORTION:
    return None

  scatter_property = entity_pb.Property()
  scatter_property.set_name('__scatter__')
  scatter_property.set_meaning(entity_pb.Property.BYTESTRING)
  scatter_property.set_multiple(False)
  property_value = scatter_property.mutable_value()
  property_value.set_stringvalue(hash_bytes)

  return scatter_property
Example 29
	def readHash(self):
		hll = Hll(self.p)
		x = sys.stdin.readline().rstrip('\n')
		while x:
			hll.AddItem(mmh3.hash(x))
			x = sys.stdin.readline().rstrip('\n')
		print hll.Count()
Example 30
 def lookup(self, string):
     for seed in range(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         if self.bit_array[result] == 0:
             #return "Nope"
             return False
     return True
 def lookup(self, element):
     for seed in self.seeds:
         result = mmh3.hash(element, seed) % self.size
         if self.hash_values[result] == 0:
             return False
     return True
Example 32
 def exists(self, item):
     for i in range(self.hash_count):
         hashed_index = mmh3.hash(item, i) % self.size
         if self.bit_array[hashed_index] == 0:
             return False
     return True
Example 33
from ll import LL
import math


class HyperLogLog(LL):
    def __len__(self):
        indicator = sum(2**-m.counter for m in self.registers)
        E = self.alpha * (self.num_registers**2) / float(indicator)

        if E <= 5.0 / 2.0 * self.num_registers:
            V = sum(1 for m in self.registers if m.counter == 0)
            if V != 0:
                Estar = self.num_registers * \
                    math.log(self.num_registers / (1.0 * V), 2)
            else:
                Estar = E
        else:
            if E <= 2**32 / 30.0:
                Estar = E
            else:
                Estar = -2**32 * math.log(1 - E / 2**32, 2)
        return Estar


if __name__ == "__main__":
    import mmh3
    hll = HyperLogLog(8)
    for i in xrange(100000):
        hll.add(mmh3.hash(str(i)))
    print len(hll)
Example 34
 def add(self, s):
     for seed in range(self.hash_num):
         result = mmh3.hash(s, seed) % self.size
         self.bit_array[result] = 1
Example 35
    def get_machoc_hash(self):
        # Get Machoc Hash adapted from https://github.com/conix-security/machoke
        binary = self.r2p
        binary.cmd("aaa")
        mmh3_line = ""
        machoke_line = ""

        funcs = json.loads(binary.cmd("aflj"))
        if funcs is None:
            print("r2 could not retrieve functions list")

        def get_machoke_from_function(r2p, function):
            """Return machoke from specific
            :rtype: object
            """
            r2p.cmd("s {}".format(function["offset"]))
            agj_error = 0
            while True:
                try:
                    fcode = json.loads(r2p.cmd("agj"))
                    break
                except:
                    print >> sys.stderr, "Fail agj: %s" % hex(
                        function["offset"])
                if agj_error == 5:
                    break
                agj_error += 1
            blocks = []
            id_block = 1
            try:
                for block in fcode[0]["blocks"]:
                    blocks.append({
                        "id_block": id_block,
                        "offset": hex(block["offset"])
                    })
                    id_block += 1
            except:
                return ""
            line = ""
            id_block = 1
            for block in fcode[0]["blocks"]:
                word = "{}:".format(id_block)
                for instruction in block["ops"]:
                    # Check if call
                    if instruction["type"] == "call":
                        word = "{}c,".format(word)
                        for ublock in blocks:
                            if hex(instruction["offset"] +
                                   2) == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])

                    # Check if jmp
                    if instruction["type"] == "jmp":
                        for ublock in blocks:
                            if instruction["esil"] == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])

                    # Check if conditional jmp
                    elif instruction["type"] == "cjmp":
                        for ublock in blocks:
                            if hex(instruction["jump"]) == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])
                            if hex(instruction["offset"] +
                                   2) == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])
                    else:
                        pass
                if word[-2] == "c":
                    for ublock in blocks:
                        if hex(instruction["offset"] + 4) == ublock["offset"]:
                            word = "{}{},".format(word, ublock["id_block"])

                    if word[-2] == "c":
                        word = "{}{},".format(word, id_block + 1)

                if word[-1] == ":" and id_block != len(fcode[0]["blocks"]):
                    word = "{}{},".format(word, id_block + 1)
                # Clean word
                if word[-1] == ",":
                    word = "{};".format(word[:-1])
                elif word[-1] == ":":
                    word = "{};".format(word)
                line = "{}{}".format(line, word)
                id_block += 1
            return line

        for function in funcs:
            machoke = get_machoke_from_function(binary, function)
            machoke_line = "{}{}".format(machoke_line, machoke)
            mmh3_line = "{}{}".format(
                mmh3_line,
                hex(mmh3.hash(machoke) & 0xffffffff).replace("0x", "").replace(
                    "L", ""),
            )
        binary.quit()

        return mmh3_line
Example 36
  def test_hash_values(self):
    """ Test that on randomized data, values computed from mmh3 and pymmh3 match. """

    for i in range(10):
      random_value = str(random.random())
      self.assertEqual(mmh3.hash(random_value), pymmh3.hash(random_value))
Example 37
def normalized_hash(identifier: str, activation_group: str) -> int:
    return mmh3.hash("{}:{}".format(identifier, activation_group)) % 100 + 1
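The 1-100 bucket returned above is typically compared against a rollout percentage; a hypothetical feature gate built on the function:

def is_enabled(identifier: str, activation_group: str, rollout_percent: int) -> bool:
    # identifiers whose bucket falls in [1, rollout_percent] get the feature
    return normalized_hash(identifier, activation_group) <= rollout_percent

# is_enabled("user-42", "new-checkout", 20) turns the feature on for roughly 20% of identifiers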
Example 38
 def lookup(self, string):
     for seed in range(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         if self.bit_array[result] == 0:
             return "Nope"
     return "Probably"
Example 39
 def add(self, string):
     for seed in range(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         self.bit_array[result] = 1
def lookup(string, bit_array, hash_count, size):
    for seed in range(hash_count):
        result = mmh3.hash(string, seed) % size
        if bit_array[result] == 0:
            return False
    return True
	def add(self, item):
		digests = []
		for i in range (self.hash_counts):
			digest = mmh3.hash(item, i) % self.bit_array_use
			self.bit_array_size[digest] = True
Example 42
def get_feature(feat_str, model):
    # The feature string may be unicode, but MurmurHash3 expects ASCII encoded strings.
    return mmh3.hash(feat_str.encode('ascii', 'xmlcharrefreplace')) % model.num_features
Example 43
def schingling(doc):
    return [mmh3.hash(doc[i:i + 9], signed=False) for i in range(len(doc) - 8)]
Example 44
def normalized_hash(identifier, activation_group):
    return mmh3.hash("{}:{}".format(activation_group, identifier),
                     signed=False) % 100 + 1
                "mimeFile": mimeFile,
                "normHtmlFile": normHtmlFile,
                "plainTextFile": plainTextFile
            }

    # If enabled, remove boilerplate HTML
    if options.boilerpipe:
        logging.info(url + ": deboiling html")
        extractor = ExtrB(extractor='ArticleExtractor', html=text)
        deboiled = str(extractor.getHTML())
    else:
        deboiled = text

    # We compute a hash on the HTML (either normalized one or after boilerpipe if enabled):
    # if we get duplicate files we discard them
    html_hash = mmh3.hash(deboiled, signed=False)
    # checking for duplicate content (duplicates are discarded)
    if html_hash in seen_html:
        logging.info("Repeated file:\t" + url)
        continue

    # get text with Alcazar library
    if options.parser == "alcazar":
        logging.info(url + ": Getting text with Alcazar")
        btext = alcazar.bodytext.parse_article(deboiled)
        if btext.body_text:
            plaintext = btext.body_text
        else:
            plaintext = ""

    # or get text with beautifulsoup
Example 46
def hash32(data: bytes) -> bytes:
    return struct.pack('i', mmh3.hash(data))
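The packed bytes can be turned back into the signed 32-bit integer with the matching unpack format code:

import struct
import mmh3

packed = hash32(b"example")           # 4 bytes in native byte order
value, = struct.unpack('i', packed)   # value == mmh3.hash(b"example")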
Example 47
 def _hashes(self, key):
     for i in xrange(self.k):
         yield mmh3.hash(key, i)
Example 48
 def add(self, item):
     for i in range(self.hash_count):
         hashed_index = mmh3.hash(item, i) % self.size
         self.bit_array[hashed_index] = 1
Example 49
 def _hash(self, element):
     return [
         b % self.size
         for b in [mmh3.hash(element, i) for i in range(self.hash_count)]
     ]
Example 50
 def _hashes_opt(self, key):
     # Kirsch-Mitzenmacher optimization
     h0 = mmh3.hash(key, 1)
     h1 = mmh3.hash(key, 10)
     for i in xrange(self.k):
         yield h0 + i * h1
Example 51
def murmur3_32(text):
    val = mmh3.hash(text)
    return val if val >= 0 else val + 2**32
def runSim(args):

    ## avoid one processes starting multiple threads
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"

    dataset = args[0]
    if dataset[-1] != '/':
        dataset += '/'
    ground_truth, theta = args[1]

    ## load dna reads into reads_lst
    reads_lst = []
    fastaFile = dataset + "/reads.fasta"
    with open(fastaFile) as handle:
        for values in SimpleFastaParser(handle):
            reads_lst.append(values[1])
    n = len(reads_lst)

    ## load precomputed Jaccard Similarities
    JSims = np.loadtxt(dataset + "/minHashes/JSims.txt")

    ## load alignments
    gt_file = "{}/{}_ground_truth.txt".format(dataset, ground_truth)

    with open(gt_file) as f:
        lines = [[float(x) for x in line.rstrip('\n').split('\t')]
                 for line in f]

    refDict = {}
    for i in range(n):
        refDict[i] = {}
    for line in lines:
        refDict[int(line[0]) - 1][int(line[1]) -
                                  1] = line[2] / (line[3] + line[4] - line[2])
        refDict[int(line[1]) - 1][int(line[0]) -
                                  1] = line[2] / (line[3] + line[4] - line[2])

    ## convert each read into it's k-mers
    symLength = 7  # k in k-mer

    def generateSymSets(reads_lst, symLength):
        symSets = {}
        for i, read in enumerate(reads_lst):
            lst = set()
            for j in range(len(read) - symLength):
                lst.add(read[j:j + symLength])
            symSets[i] = lst
        return symSets

    symSets = generateSymSets(reads_lst, symLength)

    ## load precomputed minHashes
    minHashArr = np.zeros((n, 1000))
    for i in range(n):
        minHashArr[i] = np.load(dataset +
                                "minHashes/minHashes_{}.txt".format(i),
                                allow_pickle=True)

    ## load precomputed iid sequences and their minhashes
    numRandReads = 5
    randMinHashArr = np.zeros((numRandReads, 1000))
    for i in range(numRandReads):
        randMinHashArr[i] = np.load(dataset +
                                    "randReads/randMinHashes_{}.txt".format(i),
                                    allow_pickle=True)

    minHashArrExtended = np.vstack((minHashArr[:, :1000], randMinHashArr))

    ## checking to see precomputed minHashes works
    i = np.random.randint(0, n, size=100)
    j = np.random.randint(0, 1000, size=100)
    lst = []
    for iter_round in range(100):

        iterLst = list(symSets[i[iter_round]])

        lst.append(
            min([
                mmh3.hash(sym, j[iter_round], signed=False) for sym in iterLst
            ]))

    assert (np.alltrue(minHashArrExtended[i, j] == lst))

    ## checking to see precomputed JSim works
    i = np.random.randint(0, n, size=100)
    j = np.random.randint(0, n, size=100)
    lst = []
    for iter_round in range(100):
        i1 = i[iter_round]
        j1 = j[iter_round]

        lst.append(JSims[i1, j1] == 1.0 *
                   len(symSets[i1].intersection(symSets[j1])) /
                   (len(symSets[i1].union(symSets[j1]))))

    assert (np.alltrue(lst) and np.allclose(JSims, JSims.T))

    ## Testing SVD, JSimEmp, JSim Exact, reference vs all
    storageArrGround = []
    storageArrpHatSVD = []
    storageArrJsimExact = []
    storageArrJsimEmp = []
    storageArrNumOnesCol = []
    storageArrQjs = []
    storageArrwSJS = []

    h = 1000

    for ref_read in trange(n):

        groundTruthLocs = np.array(list(refDict[ref_read].keys()))
        if len(groundTruthLocs) == 0:  ## read has no alignments in dataset
            continue
        refReadMatches = refDict[ref_read]
        groundTruthVals = [refReadMatches[i] for i in groundTruthLocs]

        lst = set(list(groundTruthLocs))
        rangeN = set(range(n))
        rangeN.discard(ref_read)

        toAppend = list(rangeN - lst)
        groundTruthLocs = np.hstack((groundTruthLocs, np.array(toAppend)))
        groundTruthVals += [0] * len(toAppend)

        empiricalMatrix = (minHashArrExtended == minHashArrExtended[ref_read])
        empiricalMatrix = np.delete(empiricalMatrix, ref_read, axis=0)[:, :h]

        updatedGroundTruthLocs = groundTruthLocs - 1 * (groundTruthLocs >=
                                                        ref_read)
        updatedGroundTruthLocs = updatedGroundTruthLocs.astype(int)

        jSimEmpirical = np.mean(empiricalMatrix, axis=1)
        jSimExact = np.delete(JSims[ref_read], ref_read)

        ## here we can modify what normalization is used without having to rerun SVDs
        # u = np.loadtxt(dataset+"/SVD/raw_pi_refread_{}.txt".format(ref_read))
        # pHatSVD = 1-np.abs(u[:n-1])/np.abs(np.median(u[:n-1])) ## normalize median of p_i
        # pHatSVD = 1-np.abs(u[:n-1])/np.abs(np.median(u[n-1:])) ## random read normalization
        # pHatSVD = 1-np.abs(u[:n-1])/np.max(np.abs(u[n-1:])) ## naive max normalziation

        pHatSVD = np.loadtxt(dataset +
                             "/SVD/pi_refread_{}.txt".format(ref_read))
        qSVD = np.loadtxt(dataset + "/SVD/qj_refread_{}.txt".format(ref_read))

        ## for approximation
        empQ = empiricalMatrix.sum(axis=0)
        x = np.matmul(empiricalMatrix - np.ones(empiricalMatrix.shape),
                      1 - np.array(empQ / np.max(empQ)))[:n - 1]
        x = np.abs(x - np.min(x))
        x /= np.max(x)
        storageArrwSJS.extend(x[updatedGroundTruthLocs])

        storageArrGround.extend(groundTruthVals)
        storageArrpHatSVD.extend(pHatSVD[updatedGroundTruthLocs])
        storageArrJsimEmp.extend(jSimEmpirical[:n - 1][updatedGroundTruthLocs])
        storageArrJsimExact.extend(jSimExact[updatedGroundTruthLocs])
        storageArrNumOnesCol.extend(np.mean(empiricalMatrix, axis=0))
        storageArrQjs.extend(qSVD)

    fpr, tpr, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrpHatSVD)
    fpr_jsim, tpr_jsim, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrJsimExact)
    fpr_js_emp, tpr_js_emp, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrJsimEmp)
    fpr_wsjs, tpr_wsjs, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrwSJS)

    pickle.dump([
        auc(fpr, tpr),
        auc(fpr_jsim, tpr_jsim),
        auc(fpr_js_emp, tpr_js_emp),
        auc(fpr_wsjs, tpr_wsjs), storageArrNumOnesCol,
        np.corrcoef(storageArrpHatSVD, storageArrGround)[0, 1],
        np.corrcoef(storageArrJsimExact, storageArrGround)[0, 1],
        np.corrcoef(storageArrJsimEmp, storageArrGround)[0, 1], storageArrQjs,
        'SJS AUC,JS AUC, JS emp AUC, wSJS AUC,numOnes per col,SJS r^2,JS r^2,JS emp r^2,storageArrQjs'
    ],
                open(
                    "AUCs/{}_{}_{}.pkl".format(dataset[:-1], ground_truth,
                                               str(theta % 1).split('.')[1]),
                    "wb"))
Example 53
 def insert(self, item):
   for i in range(self.qty_hash):
     t = mmh3.hash(bytes(item), i) % self.size
     self.bitarray[t] = True
Example 54
 def faviconHash(self, data, web_source=None):
     if web_source:
         b64data = base64.encodebytes(data).decode()
     else:
         b64data = base64.encodebytes(data)
     return mmh3.hash(b64data)
Example 55
def hash(flowkey):
    global width
    flowkey_bytes = struct.pack("L", flowkey)
    r = mmh3.hash(flowkey_bytes, signed=False)
    return r % width
 def add(self, element):
     for seed in self.seeds:
         result = mmh3.hash(element, seed) % self.size
         self.hash_values[result] = 1
     return self.hash_values
Example 57
def string_digest(item, index):
    return mmh3.hash(bytes(item, 'utf-8'), index)
	def is_member(self, item):
		for i in range (self.hash_counts):
			digest = mmh3.hash(item, i) % self.bit_array_use
			if self.bit_array_size[digest] == False:
				return False
		return True
Example 59
def murmur(key):
    return mmh3.hash(key)
Example 60
    #np.dot(m.transpose(),m)
    #Jaccard(m[0],m[10])

    #s=signature(m,10000)
    #s.shape

    #m_new = firma(m)

    m_new2 = m.dot(rndVecs) # projected matrix
    # Indexing text collection
    for doc_id in range(m_new2.shape[0]):
        docSgt = np.array(m_new2[doc_id, :] >= 0, dtype=int)
        for blk in range(NRBLK):
            # (blk*BLKSZ):((blk+1)*BLKSZ)
            blkData = docSgt[(blk*BLKSZ):((blk+1)*BLKSZ)]
            docHashVal = mmh3.hash(''.join(map(str, blkData))) % MAXBKTS
            hshTbl_blk = HshTabls[blk]
            if docHashVal not in hshTbl_blk:
                hshTbl_blk[docHashVal] = set()
            hshTbl_blk[docHashVal].add(doc_id + 1)
    collision = np.zeros((m.shape[0], m.shape[0]), dtype=int)
    for hshTbl_blk in HshTabls:
        for e in hshTbl_blk:
            for i in hshTbl_blk[e]:
                for o in hshTbl_blk[e]:
                    collision[i - 1][o - 1] += 1
    pldHaming=penalizedHcc(m_new2)
    simcos=(np.pi / 2) * (1 - hmmg(m_new2))
    simcospenalized =(np.pi / 2) * (1 - pldHaming)
    print("End!")