Example #1
def one_hot_hash(loader, output, d):
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = bfs.shape[1] + sfs.shape[1]
    X = np.hstack((bfs, sfs, ifs))
    del bfs, ifs, sfs
    X2 = sparse.csr_matrix(loader['ffeatures'])

    nrows = X.shape[0]
    ncols = X.shape[1] + npaircols * (npaircols - 1) // 2
    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 50000 == 0: logging.debug(i)
        start = i * ncols
        end = (i + 1) * ncols
        ij[0, start:end] = i
        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32('%d_%s' % (j, x),
                                              seed=42, positive=True) % d
        j += start
        for j1 in range(npaircols):
            for j2 in range(j1):
                j += 1
                ij[1, j] = murmurhash3_32('%d_%s_x_%d_%s' %
                                          (j1, row[j1], j2, row[j2]),
                                          seed=42,
                                          positive=True) % d
    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))
    X = sparse.hstack((X_hot, X2))
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X)
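The example above applies the hashing trick to single features and to every unordered feature pair. A minimal, self-contained sketch of the same column layout on toy data (hypothetical names; assumes numpy, scipy and scikit-learn are installed):

import numpy as np
from scipy import sparse
from sklearn.utils import murmurhash3_32

d = 2 ** 10                                # hashed feature space
X = np.array([["a", "1"], ["b", "2"]], dtype=object)
nrows, base = X.shape
ncols = base + base * (base - 1) // 2      # singles plus unordered pairs

ij = np.zeros((2, nrows * ncols), dtype=int)
for i, row in enumerate(X):
    start = i * ncols
    ij[0, start:start + ncols] = i
    for j, x in enumerate(row):
        ij[1, start + j] = murmurhash3_32('%d_%s' % (j, x), seed=42, positive=True) % d
    k = start + base
    for j1 in range(base):
        for j2 in range(j1):
            ij[1, k] = murmurhash3_32('%d_%s_x_%d_%s' % (j1, row[j1], j2, row[j2]),
                                      seed=42, positive=True) % d
            k += 1

X_hot = sparse.csr_matrix((np.ones(ij.shape[1]), ij), shape=(nrows, d))
print(X_hot.shape)                         # (2, 1024); up to 3 hashed columns per row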
Example #2
    def tagRawSentenceHash(self, rawLine, DICT, word_dict):
        line = initializeSentence(DICT, rawLine)

        sen = []
        wordTags = line.split()

        for i in range(len(wordTags)):
            fwObject = FWObject.getFWObject(wordTags, i)
            word, tag = getWordTag(wordTags[i])
            node = self.findFiredNode(fwObject)

            #Only hash word once and block out-of-lexicon words
            word_hash = murmurhash3_32(word, seed=0)
            try:
                word_cat = ct.get(word_hash, word_dict)
            except Exception:
                word_cat = 0
                word_hash = 0

            #Format and return
            if node.depth > 0:
                sen.append((word_hash, murmurhash3_32(node.conclusion,
                                                      seed=0), word_cat))
            else:  # Fired at root, return initialized tag
                sen.append((word_hash, murmurhash3_32(tag, seed=0), word_cat))

        return sen
Example #3
    def tagRawSentenceHash(self, rawLine, DICT, word_dict):
        line = initializeSentence(DICT, rawLine)

        sen = []
        wordTags = line.split()

        for i in range(len(wordTags)):
            fwObject = FWObject.getFWObject(wordTags, i)
            word, tag = getWordTag(wordTags[i])
            node = self.findFiredNode(fwObject)

            #Format and return tagged word
            if node.depth > 0:
                tag = node.conclusion

            #Hash word / tag
            word = word + "/" + tag
            tag_hash = murmurhash3_32(tag, seed=0)
            word_hash = murmurhash3_32(word, seed=0)

            #Get semantic category
            try:
                word_cat = word_dict[word_hash]

            except KeyError:
                word_cat = 0
                word_hash = 0

            #Add to list
            sen.append((word_hash, tag_hash, word_cat))

        return sen
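These taggers store murmur-hashed words and tags rather than strings; decoding relies on a reverse dictionary built with the same seed. A small sketch of that round trip (standalone; assumes only scikit-learn):

from sklearn.utils import murmurhash3_32

tags = ["NOUN", "VERB", "ADJ"]
decoder = {murmurhash3_32(t, seed=0): t for t in tags}

encoded = murmurhash3_32("NOUN", seed=0)
print(decoder[encoded])   # -> "NOUN"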
Example #4
    def tagRawSentenceHash(self, rawLine, DICT, word_dict):
        line = initializeSentence(DICT, rawLine)

        sen = []
        wordTags = line.split()

        for i in range(len(wordTags)):
            fwObject = FWObject.getFWObject(wordTags, i)
            word, tag = getWordTag(wordTags[i])
            node = self.findFiredNode(fwObject)

            #Format and return tagged word
            if node.depth > 0:
                tag = node.conclusion

            #Hash word / tag
            word = word + "/" + tag
            tag_hash = murmurhash3_32(tag, seed=0)
            word_hash = murmurhash3_32(word, seed=0)

            #Get semantic category
            try:
                word_cat = word_dict[word_hash]
            except KeyError:
                word_cat = 0
                word_hash = 0

            #Add to list
            sen.append((word_hash, tag_hash, word_cat))

        return sen
Example #5
    def add_element(self, vid):
        start_mult = time.time()
        h_val = murmurhash3_32(vid) % self.hash_size
        cnt = 0
        # not sure how to deal with cycle
        while True:
            if cnt > 80:
                cnt = 0
                h_val = (h_val + random.randint(1, 40)) % self.hash_size
            if self.hash_space[h_val] != -1:
                h_val = murmurhash3_32(h_val) % self.hash_size
                cnt += 1
            else:
                server_id = self.search_server(h_val)
                self.servers[server_id].append(vid)
                break
        end_mult = time.time()
        self.total_time += end_mult - start_mult
        return
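The loop above is open addressing on a hash ring: rehash until an empty slot appears, with a random jump every 80 failed probes to escape short rehash cycles. A toy, self-contained version of just the probe (hypothetical table and names):

import random
from sklearn.utils import murmurhash3_32

hash_size = 8
hash_space = [-1] * hash_size
hash_space[3] = 0                  # pretend server 0 owns slot 3

def probe_empty(vid, jump_after=80):
    h = murmurhash3_32(vid) % hash_size
    cnt = 0
    while hash_space[h] != -1:
        cnt += 1
        if cnt > jump_after:       # break out of a rehash cycle
            cnt = 0
            h = (h + random.randint(1, hash_size - 1)) % hash_size
        else:
            h = murmurhash3_32(h) % hash_size
    return h

print(probe_empty("video-42"))     # index of a free slot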
Example #6
def one_hot_hash(loader, output, d):
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = bfs.shape[1] + sfs.shape[1]
    X = np.hstack((bfs, sfs, ifs))
    del bfs, ifs, sfs
    X2 = sparse.csr_matrix(loader['ffeatures'])
    
    nrows = X.shape[0]
    ncols = X.shape[1] + npaircols*(npaircols-1)//2
    ij = np.zeros((2, nrows*ncols), dtype=int) # row, col indices
    for i, row in enumerate(X):
        if i % 50000 == 0: logging.debug(i)
        start = i * ncols
        end = (i+1) * ncols
        ij[0,start:end] = i
        for j, x in enumerate(row):
            ij[1,start+j] = murmurhash3_32('%d_%s' % (j,x), seed=42, positive=True) % d
        j += start
        for j1 in range(npaircols):
            for j2 in range(j1):
                j += 1
                ij[1,j] = murmurhash3_32('%d_%s_x_%d_%s' % (j1,row[j1],j2,row[j2]), 
                                         seed=42, positive=True) % d
    data = np.ones(ij.shape[1]) # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))
    X = sparse.hstack((X_hot, X2))
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X)
Example #7
    def add_element(self, vid):
        start_bound = time.time()
        h_val = murmurhash3_32(vid) % self.hash_size
        cnt = 0
        # not sure how to deal with cycle
        # overhead counts up from -size, so >= 0 everywhere means all servers full
        flag = all(load >= 0 for load in self.overhead)
        if flag:
            least = min(self.overhead)
            server_id = self.overhead.index(least)
            self.overhead[server_id] += 1
            self.servers[server_id + 1].append(vid)

        else:
            while True:
                if cnt > 50:
                    cnt = 0
                    h_val = (h_val + random.randint(1, 30)) % self.hash_size
                if self.hash_space[h_val] != -1:
                    h_val = murmurhash3_32(h_val) % self.hash_size
                    cnt += 1
                else:
                    ptr = (h_val - 1) % self.hash_size
                    while True:
                        if ptr == h_val:
                            print("There is no server in the hash space")
                            server_id = -1
                            break
                        elif self.hash_space[ptr] != -1:
                            server_id = self.hash_space[ptr]
                            break
                        else:
                            ptr = (ptr - 1) % self.hash_size
                    # server_id = self.search_server(h_val)
                    if server_id == -1:
                        print("no server in the hash space")
                        break
                    if self.overhead[server_id - 1] < 0:
                        self.overhead[server_id - 1] += 1
                        self.servers[server_id].append(vid)
                        break
                    else:
                        h_val = murmurhash3_32(h_val) % self.hash_size
                        cnt += 1
        end_bound = time.time()
        self.total_time += end_bound - start_bound
        return
Example #8
    def __init__(self, Loader, word_classes=False, zho_split=False):

        self.language = Loader.language
        self.zho_split = zho_split
        self.Loader = Loader

        MODEL_STRING = Path(__file__).parent / os.path.join(
            "..", "data", "pos_rdr", self.language + ".RDR")
        DICT_STRING = Path(__file__).parent / os.path.join(
            "..", "data", "pos_rdr", self.language + ".DICT")
        DICTIONARY_FILE = Path(__file__).parent / os.path.join(
            "..", "data", "dictionaries", self.language + DICT_CONSTANT)

        #zho needs an additional tokenizer
        if self.language == "zho":

            try:
                import modules.jieba.jeiba as jb
            except ImportError:
                import c2xg.modules.jieba.jeiba as jb

            self.tk = jb.Tokenizer()
            self.tk.initialize()
            self.tk.lock = True

        #Universal POS Tags are fixed across languages
        pos_list = [
            "PROPN", "SYM", "VERB", "DET", "CCONJ", "AUX", "ADJ", "INTJ",
            "SCONJ", "PRON", "NUM", "PUNCT", "ADV", "ADP", "X", "NOUN", "PART"
        ]
        self.pos_dict = {murmurhash3_32(pos, seed=0): pos for pos in pos_list}

        #Get semantic dict
        self.word_dict = pd.read_csv(DICTIONARY_FILE,
                                     index_col=0).to_dict()["Cluster"]
        self.domain_dict = {
            murmurhash3_32(str(key), seed=0): self.word_dict[key]
            for key in self.word_dict.keys()
        }
        self.word_dict = {
            murmurhash3_32(str(key), seed=0): key
            for key in self.word_dict.keys()
        }

        #Build decoder
        self.build_decoder()

        #Initialize tagger
        self.DICT = readDictionary(DICT_STRING)
        self.r = RDRPOSTagger(word_dict=self.domain_dict, DICT=self.DICT)
        self.r.constructSCRDRtreeFromRDRfile(MODEL_STRING)
Example #9
    def minhash(self, string, n_components, ngram_range):
        """ Encode a string using murmur hashing function.
        Parameters
        ----------
        string : str
            The string to encode.
        n_components : int
            The number of dimension of encoded string.
        ngram_range : tuple (min_n, max_n)
            The lower and upper boundary of the range of n-values for
            different n-grams to be extracted. All values of n such that
            min_n <= n <= max_n.
        Returns
        -------
        array, shape (n_components, )
            The encoded string.
        """
        min_hashes = np.ones(n_components) * np.inf
        grams = self.get_unique_ngrams(string, ngram_range)
        if len(grams) == 0:
            grams = self.get_unique_ngrams(' Na ', ngram_range)
        for gram in grams:
            hash_array = np.array([
                murmurhash3_32(''.join(gram), seed=d, positive=True)
                for d in range(n_components)])
            min_hashes = np.minimum(min_hashes, hash_array)
        return min_hashes / (2 ** 32 - 1)
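Because each component keeps the minimum hash over the string's n-grams, two signatures match in a given component with probability equal to the Jaccard similarity of the n-gram sets. A rough standalone check (hypothetical helper taking a set instead of a string):

import numpy as np
from sklearn.utils import murmurhash3_32

def minhash_sig(tokens, n_components=128):
    mins = np.full(n_components, np.inf)
    for tok in tokens:
        h = np.array([murmurhash3_32(tok, seed=d, positive=True)
                      for d in range(n_components)])
        mins = np.minimum(mins, h)
    return mins

a = minhash_sig({"lon", "ond", "ndo", "don"})
b = minhash_sig({"lon", "ond", "ndr", "dre"})
print(np.mean(a == b))   # close to Jaccard = 2/6 for these two n-gram sets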
Example #10
    def prepare(self, features, indices=None, dtype=float):
        X1 = super(TSOneHotHashingPairsEncoder, self).prepare(features, indices)
        logging.info("One-hot hashing pairs of string and boolean features")
        sfs = features['sfeatures']
        bfs = features['bfeatures']
        if indices is not None:
            sfs = sfs[indices]
            bfs = bfs[indices]
        X = np.hstack((sfs, bfs))
        del sfs, bfs
        nrows = X.shape[0]
        ncols = X.shape[1] * (X.shape[1] - 1) // 2
        ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
        for i, row in enumerate(X):
            if i % 100000 == 0: logging.debug(i)
            start = i * ncols
            end = (i + 1) * ncols
            ij[0, start:end] = i
            ij[1, start:end] = [murmurhash3_32('%d_%s_x_%d_%s' % (j1, x1, j2, row[j2]),
                                               seed=42, positive=True) % self.D
                                for j1, x1 in enumerate(row)
                                for j2 in range(j1)]
        data = np.ones(ij.shape[1], dtype=dtype)  # all ones
        X2 = sparse.csr_matrix((data, ij), shape=(nrows, self.D), dtype=dtype)
        X = X1 + X2
        X.data[X.data > 1] = 1
        return X
Example #11
    def prepare(self, features, indices=None, dtype=float):
        logging.info("One-hot hashing all features")
        bfs = features['bfeatures']
        ffs = features['ffeatures']
        ifs = features['ifeatures']
        sfs = features['sfeatures']
        if indices is not None:
            bfs = bfs[indices]
            ffs = ffs[indices]
            ifs = ifs[indices]
            sfs = sfs[indices]
        X = np.hstack((bfs, ffs, ifs, sfs))
        del bfs, ffs, ifs, sfs
        nrows = X.shape[0]
        ncols = X.shape[1]
        ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
        for i, row in enumerate(X):
            if i % 100000 == 0: logging.debug(i)
            start = i * ncols
            end = (i + 1) * ncols
            ij[0, start:end] = i
            for j, x in enumerate(row):
                ij[1, start + j] = murmurhash3_32('%d_%s' % (j, x), seed=42, positive=True) % self.D
        data = np.ones(ij.shape[1], dtype=dtype)  # all ones
        X = sparse.csr_matrix((data, ij), shape=(nrows, self.D), dtype=dtype)
        return X
Example #12
        def _hash(x, seed):

            # TODO: integrate with padding index
            result = murmurhash3_32(x, seed=seed)
            result[self.padding_idx] = 0

            return result % self.compressed_num_embeddings
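scikit-learn's murmurhash3_32 also hashes a whole int32 numpy array elementwise, which is what lets this remap a full index tensor into a smaller table in one call. A standalone sketch of the compression step (hypothetical sizes):

import numpy as np
from sklearn.utils import murmurhash3_32

num_embeddings, compressed = 10000, 128
indices = np.arange(num_embeddings, dtype=np.int32)
rows = murmurhash3_32(indices, seed=0, positive=True) % compressed
print(rows.min(), rows.max())   # every index lands inside the smaller table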
Example #13
def get_x(row, D):
    x = [0]  # 0 is the index of the bias term
    for i, value in enumerate(row):
        #index = int(value + str(i), 16) % D  # weakest hash ever ;)
        index = murmurhash3_32(value + str(i), seed=0) % D
        x.append(index)
    return x  # x contains indices of features that have a value of 1
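Since every hashed feature has value 1, a model only needs the returned indices; a dot product with a weight vector reduces to a sum over those positions. Usage sketch (hypothetical row; assumes get_x as defined above):

D = 2 ** 20
row = ["red", "small", "metal"]
x = get_x(row, D)                  # bias index 0 plus one index per feature
w = [0.0] * D                      # weights of an (untrained) linear model
wTx = sum(w[i] for i in x)         # sparse dot product
print(len(x), wTx)                 # 4 0.0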
Example #14
def one_hot_hash(loader, output, d):
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ffs = loader['ffeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    X = np.hstack((bfs, ffs, ifs, sfs))
    del bfs, ffs, ifs, sfs

    nrows = X.shape[0]
    ncols = X.shape[1]
    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 100000 == 0: logging.debug(i)
        start = i * ncols
        end = (i + 1) * ncols
        ij[0, start:end] = i
        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32('%d_%s' % (j, x),
                                              seed=42, positive=True) % d
    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)
Example #15
def get_x(row, D):
    x = [0]  # 0 is the index of the bias term
    for i, value in enumerate(row):
        #index = int(value + str(i), 16) % D  # weakest hash ever ;)
        index = murmurhash3_32(value + str(i), seed=0) % D
        x.append(index)
    return x  # x contains indices of features that have a value of 1
Example #16
from typing import Union

from sklearn.utils import murmurhash3_32


def hash_next(key: int, subword: Union[str, int]):
    nxt = murmurhash3_32(subword) if isinstance(subword, str) else subword
    if key is None:
        key = nxt
    else:
        key = (key * 8978948897894561157) ^ ((1 + nxt) * 17894857484156487943)
    return key
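Usage sketch (assumes the hash_next definition above is in scope): feeding the previous key back in makes the composite hash order-sensitive.

key = None
for piece in ["un", "break", "able"]:
    key = hash_next(key, piece)
print(key)   # differs if the pieces are fed in another order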
Example #17
def get_x(csv_row, D):
    x = [0]  # 0 is the index of the bias term
    for key, value in [kvp for kvp in csv_row.items() if kvp[0] != 'bgctr']:
        index = murmurhash3_32(value + key[1:],
                               positive=True) % D  # weakest hash ever ;)
        x.append(index)
    return x  # x contains indices of features that have a value of 1
Example #18
def user_based_train_test_split(interactions,
                                test_percentage=0.2,
                                random_state=None):
    """
    Split interactions between a train and a test set based on
    user ids, so that a given user's entire interaction history
    is either in the train, or the test set.

    Parameters
    ----------

    interactions: :class:`spotlight.interactions.Interactions`
        The interactions to shuffle.
    test_percentage: float, optional
        The fraction of users to place in the test set.
    random_state: np.random.RandomState, optional
        The random state used for the shuffle.

    Returns
    -------

    (train, test): (:class:`spotlight.interactions.Interactions`,
                    :class:`spotlight.interactions.Interactions`)
         A tuple of (train data, test data)
    """

    if random_state is None:
        random_state = np.random.RandomState()

    minint = np.iinfo(np.uint32).min
    maxint = np.iinfo(np.uint32).max

    seed = random_state.randint(minint, maxint, dtype=np.int64)

    in_test = (
        (murmurhash3_32(interactions.user_ids, seed=seed, positive=True) %
         100 / 100.0) < test_percentage)
    in_train = np.logical_not(in_test)

    train = Interactions(interactions.user_ids[in_train],
                         interactions.item_ids[in_train],
                         ratings=_index_or_none(interactions.ratings,
                                                in_train),
                         timestamps=_index_or_none(interactions.timestamps,
                                                   in_train),
                         weights=_index_or_none(interactions.weights,
                                                in_train),
                         num_users=interactions.num_users,
                         num_items=interactions.num_items)
    test = Interactions(interactions.user_ids[in_test],
                        interactions.item_ids[in_test],
                        ratings=_index_or_none(interactions.ratings, in_test),
                        timestamps=_index_or_none(interactions.timestamps,
                                                  in_test),
                        weights=_index_or_none(interactions.weights, in_test),
                        num_users=interactions.num_users,
                        num_items=interactions.num_items)

    return train, test
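The split is deterministic given the seed: a user's id always hashes to the same bucket in [0, 100), so all of that user's interactions land on the same side. A standalone illustration (made-up ids):

import numpy as np
from sklearn.utils import murmurhash3_32

user_ids = np.array([7, 7, 7, 42, 42, 99], dtype=np.int32)
in_test = (murmurhash3_32(user_ids, seed=123, positive=True) % 100 / 100.0) < 0.2
print(in_test)   # equal ids always receive equal assignments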
Example #19
    def add_server(self, server_id, size):
        hash_value = murmurhash3_32(server_id) % self.hash_size
        # not sure how to deal with cycle
        while True:
            flag = True
            for p in range(size):
                # wrap around so the probe stays inside the table
                if self.hash_space[(hash_value + p) % self.hash_size] != -1:
                    flag = False
            if flag:
                for q in range(size):
                    self.hash_space[(hash_value + q) % self.hash_size] = server_id
                self.status.insert(server_id, 1)
                self.servers[server_id] = []
                break
            else:
                hash_value = murmurhash3_32(hash_value) % self.hash_size
        return
Example #20
    def increase(self, atmId):
        """Given an id, increases occurrence frequency of that item by one."""
        data = np.load(self.fname)
        for row in range(self.d):
            col = murmurhash3_32(atmId, seed=row, positive=True) % self.w
            data[row][col] += 1
        np.save(self.fname, data)
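Together with the calculate method in Example #30 below, this forms a count-min sketch: d hash rows of w counters, one increment per row on update, and the row-wise minimum on query, so estimates only err upward under collisions. An in-memory sketch of the same pair (hypothetical; the originals persist the table with np.save/np.load):

import numpy as np
from sklearn.utils import murmurhash3_32

d, w = 4, 1000
table = np.zeros((d, w), dtype=np.int64)

def increase(key):
    for row in range(d):
        table[row, murmurhash3_32(key, seed=row, positive=True) % w] += 1

def estimate(key):
    return min(table[row, murmurhash3_32(key, seed=row, positive=True) % w]
               for row in range(d))

increase("atm-17")
increase("atm-17")
print(estimate("atm-17"))   # 2, unless every row collided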
Example #21
def one_hot_hash(loader, output, d):
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ffs = loader['ffeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = sfs.shape[1] + bfs.shape[1]
    X = np.hstack((sfs, bfs, ifs, ffs))
    del bfs, ffs, ifs, sfs

    nrows = X.shape[0]
    ncols = X.shape[1] + npaircols * (npaircols - 1) // 2
    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    #hasher = pyhash.murmur3_32()
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
            gc.collect()
        start = i * ncols
        end = (i + 1) * ncols
        ij[0, start:end] = i
        for j, x in enumerate(row):
            #ij[1,start+j] = abs(mmh3.hash(str((j,x)), 42)) % d
            ij[1, start + j] = murmurhash3_32(str((j, x)), seed=42, positive=True) % d
            #ij[1,start+j] = abs(hasher(str(j), str(x), seed=42)) % d
            #ij[1,start+j] = abs(hash((j,x))) % d
        j += start
        for j1 in range(npaircols):
            for j2 in range(j1):
                j += 1
                #ij[1,j] = abs(mmh3.hash(str((j1,row[j1],j2,row[j2])), seed=42)) % d
                ij[1, j] = murmurhash3_32(str((j1, row[j1], j2, row[j2])),
                                          seed=42, positive=True) % d
                #ij[1,j] = abs(hasher(str(j1), str(row[j1]), str(j2), str(row[j2]), seed=42)) % d
                #ij[1,j] = abs(hash((j1,row[j1],j2,row[j2]))) % d
    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)
Example #22
    def add_server(self, server_id, size):
        for p in range(size):
            h_val = murmurhash3_32(server_id, p) % self.hash_size
            cnt = 0
            # not sure how to deal with cycle
            while True:
                if cnt > 50:
                    cnt = 0
                    h_val = (h_val + random.randint(1, 30)) % self.hash_size
                if self.hash_space[h_val] != -1:
                    h_val = murmurhash3_32(h_val, p) % self.hash_size
                    cnt += 1
                else:
                    self.hash_space[h_val] = server_id
                    break
        self.status.insert(server_id, 1)
        self.servers[server_id] = []
        return
Example #23
    def add_element(self, vid):
        start_spoca = time.time()
        h_val = murmurhash3_32(vid) % self.hash_size
        # not sure how to deal with cycle
        cnt = 0
        while True:
            if cnt > 50:
                cnt = 0
                h_val = (h_val + random.randint(1, 20)) % self.hash_size
            if self.hash_space[h_val] == -1:
                h_val = murmurhash3_32(h_val) % self.hash_size
                cnt += 1
            else:
                server_id = self.hash_space[h_val]
                self.servers[server_id].append(vid)
                break
        end_spoca = time.time()
        self.total_time += end_spoca - start_spoca
        return
Example #24
def lentokens(data, hashbins):
    feats = re.findall(r"([^\x00-\x7F]+|\w+)", data)
    rv = np.zeros(8 * hashbins)
    for feat in feats:
        loglength = int(min(8, max(1, math.log(len(feat), 1.4)))) - 1  # 0-7
        shash = murmurhash3_32(feat) % hashbins
        rv[loglength * hashbins + shash] += 1

    return rv
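Usage sketch (assumes the definition above plus its re, math and numpy imports): tokens are bucketed by log-length into 8 bands and hashed into hashbins columns per band, giving a fixed-length count vector.

vec = lentokens("the quick brown fox jumps", hashbins=16)
print(vec.shape, int(vec.sum()))   # (128,) 5 -- one count per token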
Example #25
    def get_embedding(self, token, seed=6):
        max_length = 5
        if self.matrix is None:
            self.create(seed)

        if len(token) <= max_length and token.isdigit():
            hash_index = murmurhash3_32(token, positive=True) % self.size
            return self.matrix[hash_index]
        else:
            return np.zeros(self.dim)
Example #26
def one_hot_hash(loader, output, d):
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ffs = loader['ffeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = sfs.shape[1] + bfs.shape[1]
    X = np.hstack((sfs, bfs, ifs, ffs))
    del bfs, ffs, ifs, sfs

    nrows = X.shape[0]
    ncols = X.shape[1] + npaircols*(npaircols-1)//2
    ij = np.zeros((2, nrows*ncols), dtype=int) # row, col indices
    #hasher = pyhash.murmur3_32()
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
            gc.collect()
        start = i * ncols
        end = (i+1) * ncols
        ij[0,start:end] = i
        for j, x in enumerate(row):
            #ij[1,start+j] = abs(mmh3.hash(str((j,x)), 42)) % d
            ij[1,start+j] = murmurhash3_32(str((j,x)), seed=42, positive=True) % d
            #ij[1,start+j] = abs(hasher(str(j), str(x), seed=42)) % d
            #ij[1,start+j] = abs(hash((j,x))) % d
        j += start
        for j1 in range(npaircols):
            for j2 in range(j1):
                j += 1
                #ij[1,j] = abs(mmh3.hash(str((j1,row[j1],j2,row[j2])), seed=42)) % d
                ij[1,j] = murmurhash3_32(str((j1,row[j1],j2,row[j2])),
                                         seed=42, positive=True) % d
                #ij[1,j] = abs(hasher(str(j1), str(row[j1]), str(j2), str(row[j2]), seed=42)) % d
                #ij[1,j] = abs(hash((j1,row[j1],j2,row[j2]))) % d
    data = np.ones(ij.shape[1]) # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)
Example #27
def hash_(token: str, hash_size: int) -> int:
    """Convert a token to a hash of given size.
    Args:
        token: a word
        hash_size: hash size

    Returns:
        int, hashed token

    """
    return murmurhash3_32(token, positive=True) % hash_size
Example #28
def hash_(token: str, hash_size: int) -> int:
    """Convert a token to a hash of given size.
    Args:
        token: a word
        hash_size: hash size

    Returns:
        int, hashed token

    """
    return murmurhash3_32(token, positive=True) % hash_size
Example #29
    def minhash(self, string, n_components, ngram_range):
        min_hashes = np.ones(n_components) * np.inf
        grams = self.get_unique_ngrams(string, ngram_range)
        if len(grams) == 0:
            grams = self.get_unique_ngrams(' Na ', ngram_range)
        for gram in grams:
            hash_array = np.array([
                murmurhash3_32(''.join(gram), seed=d, positive=True)
                for d in range(n_components)])
            min_hashes = np.minimum(min_hashes, hash_array)
        return min_hashes / (2 ** 32 - 1)
Example #30
    def calculate(self, atmId):
        """Given an id, returns the approximate historic frequency for that
        event.
        """
        freq = -1
        try:
            data = np.load(self.fname)
            for row in range(self.d):
                col = murmurhash3_32(atmId, seed=row, positive=True) % self.w
                freq = data[row][col] if (data[row][col] < freq or freq < 0) else freq
        except Exception:
            return 0
        return freq
Example #31
def hashword(word, hashsize=16777216):
    '''
        hash the word using murmurhash3_32 to a positive int value
        
        input:
            - word: string format word
            - hashsize: maximum number, default is 16777216
        
        output:
            - int
    '''
    return murmurhash3_32(word, positive=True) % (hashsize)
Example #32
    def _make_hashfuncs(key):
        if isinstance(key, str):
            key = key.encode('utf-8')
        else:
            key = str(key).encode('utf-8')
        rval = []
        current_hash = None
        for i in range(nbr_slices):
            seed = current_hash or 0
            current_hash = utils.murmurhash3_32(key, seed, True)
            rval.append(current_hash % nbr_bits)
        return rval
Example #33
    def tagRawSentenceHash(self, rawLine):
        line = initializeSentence(self.DICT, rawLine)

        sen = []
        wordTags = line.split()

        for i in range(len(wordTags)):
            fwObject = FWObject.getFWObject(wordTags, i)
            word, tag = getWordTag(wordTags[i])
            node = self.findFiredNode(fwObject)

            #Format and return tagged word
            if node.depth > 0:
                tag = node.conclusion

            #Special units
            if "<" in word:
                if word in ["<url>", "<email>", "<phone>", "<cur>"]:
                    tag = "NOUN"
                elif word == "<number>":
                    tag = "NUM"

            #Hash word / tag
            tag_hash = murmurhash3_32(tag, seed=0)
            word_hash = murmurhash3_32(word, seed=0)

            #Get semantic category, if it is an open-class word
            if tag in ["ADJ", "ADV", "INTJ", "NOUN", "PROPN", "VERB"]:
                word_cat = self.word_dict.get(word_hash, -1)

            #Closed class words don't have a semantic category
            else:
                word_cat = -1

            #Add to list
            sen.append((word_hash, tag_hash, word_cat))

        return sen
Example #34
def MinHash(A, m):
    """
    :param A:  the input string
    :param m:  the number of hash functions
    :return:   m-length list of min-hash codes
    """
    min_hash = []
    for i in range(m):
        temp = []
        for j in range(len(A)):
            temp.append(murmurhash3_32(A[j], seed=i))
        min_hash.append(min(temp))
    return min_hash
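Usage sketch (assumes the MinHash definition above): the fraction of agreeing components approximates the Jaccard similarity of the two character sets.

sig_a = MinHash("abcde", 200)
sig_b = MinHash("abcdf", 200)
match = sum(x == y for x, y in zip(sig_a, sig_b)) / 200
print(match)   # close to |{a,b,c,d}| / |{a,b,c,d,e,f}| = 4/6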
Example #35
def hash(token, num_buckets):
    """Unsigned 32 bit murmurhash for feature hashing."""
    ret = murmurhash3_32(token, positive=True)

    if ret >= num_buckets:
        ret = ret % num_buckets

    return ret
Example #36
    def add_server(self, server_id, size):
        h_val = murmurhash3_32(server_id) % self.hash_size
        cnt = 0
        # not sure how to deal with cycle
        while True:
            flag = True
            if cnt > 50:
                cnt = 0
                h_val = (h_val + random.randint(1, 30)) % self.hash_size
            for p in range(size):
                # wrap around so the probe stays inside the table
                if self.hash_space[(h_val + p) % self.hash_size] != -1:
                    flag = False
            if flag:
                for q in range(size):
                    self.hash_space[(h_val + q) % self.hash_size] = server_id
                break
            else:
                h_val = murmurhash3_32(h_val) % self.hash_size
                cnt += 1
        self.status.insert(server_id, 1)
        self.overhead.append(-size)
        self.servers[server_id] = []
        return
Example #37
def one_hot_hash(loader, output, d):
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ffs = loader['ffeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    X = np.hstack((bfs, ffs, ifs, sfs))
    del bfs, ffs, ifs, sfs
    
    nrows = X.shape[0]
    ncols = X.shape[1]
    ij = np.zeros((2, nrows*ncols), dtype=int) # row, col indices
    for i, row in enumerate(X):
        if i % 100000 == 0: logging.debug(i)
        start = i * ncols
        end = (i+1) * ncols
        ij[0,start:end] = i
        for j, x in enumerate(row):
            ij[1,start+j] = murmurhash3_32('%d_%s' % (j,x), seed=42, positive=True) % d
    data = np.ones(ij.shape[1]) # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))
    
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)
Example #38
def user_based_train_test_split(interactions,
                                test_percentage=0.2,
                                random_state=None):
    """
    Split interactions between a train and a test set based on
    user ids, so that a given user's entire interaction history
    is either in the train, or the test set.

    Parameters
    ----------

    interactions: :class:`spotlight.interactions.Interactions`
        The interactions to shuffle.
    test_percentage: float, optional
        The fraction of users to place in the test set.
    random_state: np.random.RandomState, optional
        The random state used for the shuffle.

    Returns
    -------

    (train, test): (:class:`spotlight.interactions.Interactions`,
                    :class:`spotlight.interactions.Interactions`)
         A tuple of (train data, test data)
    """

    if random_state is None:
        random_state = np.random.RandomState()

    minint = np.iinfo(np.uint32).min
    maxint = np.iinfo(np.uint32).max

    seed = random_state.randint(minint, maxint)

    in_test = ((murmurhash3_32(interactions.user_ids, seed=seed, positive=True)
                % 100 / 100.0) < test_percentage)
    in_train = np.logical_not(in_test)

    train = Interactions(interactions.user_ids[in_train],
                         interactions.item_ids[in_train],
                         ratings=_index_or_none(interactions.ratings,
                                                in_train),
                         timestamps=_index_or_none(interactions.timestamps,
                                                   in_train),
                         weights=_index_or_none(interactions.weights,
                                                in_train),
                         num_users=interactions.num_users,
                         num_items=interactions.num_items)
    test = Interactions(interactions.user_ids[in_test],
                        interactions.item_ids[in_test],
                        ratings=_index_or_none(interactions.ratings,
                                               in_test),
                        timestamps=_index_or_none(interactions.timestamps,
                                                  in_test),
                        weights=_index_or_none(interactions.weights,
                                               in_test),
                        num_users=interactions.num_users,
                        num_items=interactions.num_items)

    return train, test
Example #39
                if timestamp < int(valoldesttimestamp):
                    r.set(keyoldesttimestamp, timestamp)
                    r.set(keytimestampdiff, int(vallatesttimestamp) - timestamp)
                elif timestamp > int(vallatesttimestamp):
                    r.set(keylatesttimestamp, timestamp)
                    r.set(keytimestampdiff, timestamp - int(valoldesttimestamp))

            # Localbox: frequency feature
            eidFeatures[eid][3] = eidFeatures[eid][3] + 1

            # Redis: frequency feature
            keyfrequency = eid + '_3'
            r.incr(keyfrequency, 1)

            # Localbox: monetary features (from hashing)
            eidFeatures[eid][4][murmurhash3_32(cid) % N] += 1
            eidFeatures[eid][5][murmurhash3_32(src_evt) % N] += 1
            eidFeatures[eid][6][murmurhash3_32(cat) % N] += 1
            eidFeatures[eid][7][murmurhash3_32(obj) % N] += 1

            # Redis: monetary features (from hashing)
            keycid      = eid + '_4' + '_' + str(murmurhash3_32(cid) % N)
            keysrcevt   = eid + '_5' + '_' + str(murmurhash3_32(src_evt) % N)
            keycat      = eid + '_6' + '_' + str(murmurhash3_32(cat) % N)
            keyobj      = eid + '_7' + '_' + str(murmurhash3_32(obj) % N)

            r.incr(keycid, 1)
            r.incr(keysrcevt, 1)
            r.incr(keycat, 1)
            r.incr(keyobj, 1)
Example #40
    def __init__(self, Loader, word_classes=False, zho_split=False):

        self.language = Loader.language
        self.zho_split = zho_split
        self.Loader = Loader

        MODEL_STRING = os.path.join(".", "data", "pos_rdr", self.language + ".RDR")
        DICT_STRING = os.path.join(".", "data", "pos_rdr", self.language + ".DICT")
        DICTIONARY_FILE = os.path.join(".", "data", "dictionaries", self.language + DICT_CONSTANT)

        #zho needs an additional tokenizer
        if self.language == "zho":

            try:
                import modules.jieba.jeiba as jb
            except ImportError:
                import c2xg.modules.jieba.jeiba as jb

            self.tk = jb.Tokenizer()
            self.tk.initialize()
            self.tk.lock = True

        #Initialize tagger
        self.r = RDRPOSTagger()
        self.r.constructSCRDRtreeFromRDRfile(MODEL_STRING)
        self.DICT = readDictionary(DICT_STRING)

        #Initialize emoji remover
        try:
            # Wide UCS-4 build
            self.myre = re.compile(u'['
                u'\U0001F300-\U0001F64F'
                u'\U0001F680-\U0001F6FF'
                u'\u2600-\u26FF\u2700-\u27BF]+',
                re.UNICODE)
        except re.error:
            # Narrow UCS-2 build
            self.myre = re.compile(u'('
                u'\ud83c[\udf00-\udfff]|'
                u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
                u'[\u2600-\u26FF\u2700-\u27BF])+',
                re.UNICODE)

        #Universal POS Tags are fixed across languages
        pos_list = ["PROPN", "SYM", "VERB", "DET", "CCONJ", "AUX", "ADJ", "INTJ",
                    "SCONJ", "PRON", "NUM", "PUNCT", "ADV", "ADP", "X", "NOUN", "PART"]
        self.pos_dict = {murmurhash3_32(pos, seed=0): pos for pos in pos_list}

        #Get semantic dict, unless currently training those dicts
        if not word_classes:

            try:
                with open(DICTIONARY_FILE, "rb") as fo:
                    self.word_dict = pickle.load(fo)
            except FileNotFoundError:
                with open(os.path.join("..", "c2xg", "c2xg", DICTIONARY_FILE), "rb") as fo:
                    self.word_dict = pickle.load(fo)

            self.domain_dict = {murmurhash3_32(key, seed=0): self.word_dict[key] for key in self.word_dict.keys()}
            self.word_dict = {murmurhash3_32(key, seed=0): key for key in self.word_dict.keys()}

            #Build decoder
            self.build_decoder()
Example #41
# Daniel Rodgers-Pryor, 16/8/2013

###
# I had to build my own bloom filter because the ones I could find used C code
# that wouldn't compile on Windows.
###

from bitarray import bitarray
from sklearn.utils import murmurhash3_32
from math import log, ceil, floor

base_hash = lambda x: murmurhash3_32(x, seed=0)  # Note: might return a negative int
# If you don't have scikit-learn feel free to comment this out and add your
# own hash function here; just make sure that it has appropriate uniformity and speed.
# The fnv hash would be another good choice.

class BloomFilter:
    def __init__(self, iterable=(), max_entries=None, false_positive_rate=0.01):
        '''
            If max_entries is undefined, then iterable must be amenable to len()
            (and must be reconsumable). If this approach is taken, no objects
            should be added later (if they are, the false_positive_rate will no
            longer apply). If you wish to add objects later, specify a suitable
            max_entries value.
        '''
        if not max_entries: max_entries = len(iterable)
        max_entries = max(max_entries, 2)  # Length 0 or 1 filters are pointless (length 0 filters break some of the maths)

        lg_2 = log(2)
        self.n = max_entries
        self.p = false_positive_rate
        optimal_size = -self.n * log(self.p) / (lg_2 ** 2)  # Formula from Wikipedia
        self.index_bits = int(ceil(log(optimal_size, 2)))  # Number of bits needed to address an array of this size
        self.m = 2 ** self.index_bits  # Round up array size to a power of 2
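The excerpt stops mid-constructor; the missing pieces are the standard add/contains pair over k seeded hashes. A minimal standalone sketch of those two operations (hypothetical completion, not the original class):

from bitarray import bitarray
from sklearn.utils import murmurhash3_32

m, k = 1 << 14, 7                  # bit-array size and number of hashes
bits = bitarray(m)
bits.setall(False)

def add(x):
    for i in range(k):
        bits[murmurhash3_32(x, seed=i, positive=True) % m] = True

def contains(x):                   # no false negatives, rare false positives
    return all(bits[murmurhash3_32(x, seed=i, positive=True) % m]
               for i in range(k))

add("spam")
print(contains("spam"), contains("ham"))   # True False (with high probability)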
Example #42
def test_murmurhash_sklearn(iterable):
    for item in iterable:
        murmurhash3_32(item)
Example #43
def tidy_data(data_list, label_list=()):

    targets = len(label_list) > 0

    if targets and len(data_list) != len(label_list):
        print("Error; Not enough labels for data")
        return

    labels = []
    item_ids = []

    col = []
    row = []
    cur_row = 0
    data = []

    max_size = 999331  # 39916801

    for i in range(len(data_list)):

        print('Processing Samplelist no ' + str(i))
        samples = data_list[i]

        if targets and len(samples) != len(label_list[i]):
            print("Error; Not enough labels for data")
            return
        if targets:
            labels.extend(label_list[i])

        for sample in samples:

            item_ids.append(int(sample['item_id']))

            data.append(sample['price'])
            row.append(cur_row)
            col.append(murmurhash3_32('price') % max_size)

            data.append(sample['phone'])
            row.append(cur_row)
            col.append(murmurhash3_32('phone') % max_size)

            data.append(sample['email'])
            row.append(cur_row)
            col.append(murmurhash3_32('email') % max_size)

            data.append(sample['urls'])
            row.append(cur_row)
            col.append(murmurhash3_32('urls') % max_size)

            for word, count in sample['words'].items():
                data.append(count)
                row.append(cur_row)
                col.append(murmurhash3_32(word) % max_size)

            cur_row += 1
    features = sp.csr_matrix((data, (row, col)), shape=(max(cur_row, 1), max_size), dtype=np.float64)

    if targets:
        return item_ids, features, labels
    else:
        return item_ids, features
Example #44
def hash(token, num_buckets):
    """Unsigned 32 bit murmurhash for feature hashing."""
    return murmurhash3_32(token, positive=True) % num_buckets
Example #45
def get_x(csv_row, D):
    x = [0]  # 0 is the index of the bias term
    for key, value in [kvp for kvp in csv_row.items() if kvp[0] != 'bgctr']:
        index = murmurhash3_32(value + key[1:], positive=True) % D  # weakest hash ever ;)
        x.append(index)
    return x  # x contains indices of features that have a value of 1