def one_hot_hash(loader, output, d):
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = bfs.shape[1] + sfs.shape[1]
    X = np.hstack((bfs, sfs, ifs))
    del bfs, ifs, sfs
    X2 = sparse.csr_matrix(loader['ffeatures'])
    nrows = X.shape[0]
    ncols = X.shape[1] + npaircols * (npaircols - 1) / 2
    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
        start = i * ncols
        end = (i + 1) * ncols
        ij[0, start:end] = i
        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32('%d_%s' % (j, x), seed=42,
                                              positive=True) % d
        j += start
        for j1 in xrange(npaircols):
            for j2 in xrange(j1):
                j += 1
                ij[1, j] = murmurhash3_32('%d_%s_x_%d_%s' % (j1, row[j1], j2, row[j2]),
                                          seed=42, positive=True) % d
    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))
    X = sparse.hstack((X_hot, X2))
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X)

def tagRawSentenceHash(self, rawLine, DICT, word_dict):
    line = initializeSentence(DICT, rawLine)
    sen = []
    wordTags = line.split()
    for i in range(len(wordTags)):
        fwObject = FWObject.getFWObject(wordTags, i)
        word, tag = getWordTag(wordTags[i])
        node = self.findFiredNode(fwObject)
        # Only hash word once and block out-of-lexicon words
        word_hash = murmurhash3_32(word, seed=0)
        try:
            word_cat = ct.get(word_hash, word_dict)
        except:
            word_cat = 0
            word_hash = 0
        # Format and return
        if node.depth > 0:
            sen.append((word_hash, murmurhash3_32(node.conclusion, seed=0), word_cat))
        else:
            # Fired at root, return initialized tag
            sen.append((word_hash, murmurhash3_32(tag, seed=0), word_cat))
    return sen

def tagRawSentenceHash(self, rawLine, DICT, word_dict):
    line = initializeSentence(DICT, rawLine)
    sen = []
    wordTags = line.split()
    for i in range(len(wordTags)):
        fwObject = FWObject.getFWObject(wordTags, i)
        word, tag = getWordTag(wordTags[i])
        node = self.findFiredNode(fwObject)
        # Format and return tagged word
        if node.depth > 0:
            tag = node.conclusion
        # Hash word / tag
        word = word + "/" + tag
        tag_hash = murmurhash3_32(tag, seed=0)
        word_hash = murmurhash3_32(word, seed=0)
        # Get semantic category
        try:
            word_cat = word_dict[word_hash]
        except:
            word_cat = 0
            word_hash = 0
        # Add to list
        sen.append((word_hash, tag_hash, word_cat))
    return sen

def add_element(self, vid):
    start_mult = time.time()
    h_val = murmurhash3_32(vid) % self.hash_size
    cnt = 0
    # not sure how to deal with cycle
    while True:
        if cnt > 80:
            cnt = 0
            h_val = (h_val + random.randint(1, 40)) % self.hash_size
        if self.hash_space[h_val] != -1:
            h_val = murmurhash3_32(h_val) % self.hash_size
            cnt += 1
        else:
            server_id = self.search_server(h_val)
            self.servers[server_id].append(vid)
            break
    end_mult = time.time()
    self.total_time += end_mult - start_mult
    return

def add_element(self, vid):
    start_bound = time.time()
    h_val = murmurhash3_32(vid) % self.hash_size
    cnt = 0
    # not sure how to deal with cycle
    flag = True
    for _ in self.overhead:
        if _ < 0:
            flag = False
            break
    if flag:
        least = min(self.overhead)
        server_id = self.overhead.index(least)
        self.overhead[server_id] += 1
        self.servers[server_id + 1].append(vid)
    else:
        while True:
            if cnt > 50:
                cnt = 0
                h_val = (h_val + random.randint(1, 30)) % self.hash_size
            if self.hash_space[h_val] != -1:
                h_val = murmurhash3_32(h_val) % self.hash_size
                cnt += 1
            else:
                ptr = (h_val - 1) % self.hash_size
                while True:
                    if ptr == h_val:
                        print("There is no server in the hash space")
                        server_id = -1
                        break
                    elif self.hash_space[ptr] != -1:
                        server_id = self.hash_space[ptr]
                        break
                    else:
                        ptr = (ptr - 1) % self.hash_size
                if server_id == -1:
                    print("no server in the hash space")
                    break
                if self.overhead[server_id - 1] < 0:
                    self.overhead[server_id - 1] += 1
                    self.servers[server_id].append(vid)
                    break
                else:
                    h_val = murmurhash3_32(h_val) % self.hash_size
                    cnt += 1
    end_bound = time.time()
    self.total_time += end_bound - start_bound
    return

def __init__(self, Loader, word_classes=False, zho_split=False):
    self.language = Loader.language
    self.zho_split = zho_split
    self.Loader = Loader

    MODEL_STRING = Path(__file__).parent / os.path.join(
        "..", "data", "pos_rdr", self.language + ".RDR")
    DICT_STRING = Path(__file__).parent / os.path.join(
        "..", "data", "pos_rdr", self.language + ".DICT")
    DICTIONARY_FILE = Path(__file__).parent / os.path.join(
        "..", "data", "dictionaries", self.language + DICT_CONSTANT)

    # zho needs an additional tokenizer
    if self.language == "zho":
        try:
            import modules.jieba.jeiba as jb
        except:
            import c2xg.modules.jieba.jeiba as jb
        self.tk = jb.Tokenizer()
        self.tk.initialize()
        self.tk.lock = True

    # Universal POS tags are fixed across languages
    pos_list = ["PROPN", "SYM", "VERB", "DET", "CCONJ", "AUX", "ADJ", "INTJ",
                "SCONJ", "PRON", "NUM", "PUNCT", "ADV", "ADP", "X", "NOUN", "PART"]
    self.pos_dict = {murmurhash3_32(pos, seed=0): pos for pos in pos_list}

    # Get semantic dict
    self.word_dict = pd.read_csv(DICTIONARY_FILE, index_col=0).to_dict()["Cluster"]
    self.domain_dict = {murmurhash3_32(str(key), seed=0): self.word_dict[key]
                        for key in self.word_dict.keys()}
    self.word_dict = {murmurhash3_32(str(key), seed=0): key
                      for key in self.word_dict.keys()}

    # Build decoder
    self.build_decoder()

    # Initialize tagger
    self.DICT = readDictionary(DICT_STRING)
    self.r = RDRPOSTagger(word_dict=self.domain_dict, DICT=self.DICT)
    self.r.constructSCRDRtreeFromRDRfile(MODEL_STRING)

def minhash(self, string, n_components, ngram_range):
    """Encode a string using the murmur hashing function.

    Parameters
    ----------
    string : str
        The string to encode.
    n_components : int
        The number of dimensions of the encoded string.
    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for
        different n-grams to be extracted. All values of n such that
        min_n <= n <= max_n are used.

    Returns
    -------
    array, shape (n_components, )
        The encoded string.
    """
    min_hashes = np.ones(n_components) * np.infty
    grams = self.get_unique_ngrams(string, self.ngram_range)
    if len(grams) == 0:
        grams = self.get_unique_ngrams(' Na ', self.ngram_range)
    for gram in grams:
        hash_array = np.array([
            murmurhash3_32(''.join(gram), seed=d, positive=True)
            for d in range(n_components)])
        min_hashes = np.minimum(min_hashes, hash_array)
    return min_hashes / (2 ** 32 - 1)

def prepare(self, features, indices=None, dtype=float):
    X1 = super(TSOneHotHashingPairsEncoder, self).prepare(features, indices)
    logging.info("One-hot hashing pairs of string and boolean features")
    sfs = features['sfeatures']
    bfs = features['bfeatures']
    if indices is not None:
        sfs = sfs[indices]
        bfs = bfs[indices]
    X = np.hstack((sfs, bfs))
    del sfs, bfs
    nrows = X.shape[0]
    ncols = X.shape[1] * (X.shape[1] - 1) / 2
    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 100000 == 0:
            logging.debug(i)
        start = i * ncols
        end = (i + 1) * ncols
        ij[0, start:end] = i
        ij[1, start:end] = [murmurhash3_32('%d_%s_x_%d_%s' % (j1, x1, j2, row[j2]),
                                           seed=42, positive=True) % self.D
                            for j1, x1 in enumerate(row)
                            for j2 in xrange(j1)]
    data = np.ones(ij.shape[1], dtype=dtype)  # all ones
    X2 = sparse.csr_matrix((data, ij), shape=(nrows, self.D), dtype=dtype)
    X = X1 + X2
    X.data[X.data > 1] = 1
    return X

def prepare(self, features, indices=None, dtype=float):
    logging.info("One-hot hashing all features")
    bfs = features['bfeatures']
    ffs = features['ffeatures']
    ifs = features['ifeatures']
    sfs = features['sfeatures']
    if indices is not None:
        bfs = bfs[indices]
        ffs = ffs[indices]
        ifs = ifs[indices]
        sfs = sfs[indices]
    X = np.hstack((bfs, ffs, ifs, sfs))
    del bfs, ffs, ifs, sfs
    nrows = X.shape[0]
    ncols = X.shape[1]
    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 100000 == 0:
            logging.debug(i)
        start = i * ncols
        end = (i + 1) * ncols
        ij[0, start:end] = i
        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32('%d_%s' % (j, x), seed=42,
                                              positive=True) % self.D
    data = np.ones(ij.shape[1], dtype=dtype)  # all ones
    X = sparse.csr_matrix((data, ij), shape=(nrows, self.D), dtype=dtype)
    return X

def _hash(x, seed):
    # TODO: integrate with padding index
    result = murmurhash3_32(x, seed=seed)
    result[self.padding_idx] = 0
    return result % self.compressed_num_embeddings

def get_x(row, D):
    x = [0]  # 0 is the index of the bias term
    for i, value in enumerate(row):
        # index = int(value + str(i), 16) % D  # weakest hash ever ;)
        index = murmurhash3_32(value + str(i), seed=0) % D
        x.append(index)
    return x  # x contains indices of features that have a value of 1

def one_hot_hash(loader, output, d):
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ffs = loader['ffeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    X = np.hstack((bfs, ffs, ifs, sfs))
    del bfs, ffs, ifs, sfs
    nrows = X.shape[0]
    ncols = X.shape[1]
    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 100000 == 0:
            logging.debug(i)
        start = i * ncols
        end = (i + 1) * ncols
        ij[0, start:end] = i
        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32('%d_%s' % (j, x), seed=42,
                                              positive=True) % d
    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)

def hash_next(key: int, subword: Union[str, int]):
    next = murmurhash3_32(subword) if isinstance(subword, str) else subword
    if key is None:
        key = next
    else:
        key = (key * 8978948897894561157) ^ ((1 + next) * 17894857484156487943)
    return key

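# Usage sketch for hash_next above: it folds a sequence of subwords into a single
# running key, hashing string pieces with murmurhash3_32 and mixing integer pieces
# directly. The subword list below is an illustrative assumption, not part of the
# original snippet.
key = None
for piece in ["un", "break", "able"]:
    key = hash_next(key, piece)
print(key)  # the same subword sequence always yields the same key
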
def get_x(csv_row, D):
    x = [0]  # 0 is the index of the bias term
    for key, value in [kvp for kvp in csv_row.items() if kvp[0] != 'bgctr']:
        index = murmurhash3_32(value + key[1:], positive=True) % D  # weakest hash ever ;)
        x.append(index)
    return x  # x contains indices of features that have a value of 1

def user_based_train_test_split(interactions, test_percentage=0.2, random_state=None):
    """
    Split interactions between a train and a test set based on
    user ids, so that a given user's entire interaction history
    is either in the train, or the test set.

    Parameters
    ----------
    interactions: :class:`spotlight.interactions.Interactions`
        The interactions to shuffle.
    test_percentage: float, optional
        The fraction of users to place in the test set.
    random_state: np.random.RandomState, optional
        The random state used for the shuffle.

    Returns
    -------
    (train, test): (:class:`spotlight.interactions.Interactions`,
                    :class:`spotlight.interactions.Interactions`)
        A tuple of (train data, test data)
    """
    if random_state is None:
        random_state = np.random.RandomState()

    minint = np.iinfo(np.uint32).min
    maxint = np.iinfo(np.uint32).max

    seed = random_state.randint(minint, maxint, dtype=np.int64)

    in_test = ((murmurhash3_32(interactions.user_ids,
                               seed=seed,
                               positive=True) % 100 / 100.0) < test_percentage)
    in_train = np.logical_not(in_test)

    train = Interactions(interactions.user_ids[in_train],
                         interactions.item_ids[in_train],
                         ratings=_index_or_none(interactions.ratings, in_train),
                         timestamps=_index_or_none(interactions.timestamps, in_train),
                         weights=_index_or_none(interactions.weights, in_train),
                         num_users=interactions.num_users,
                         num_items=interactions.num_items)
    test = Interactions(interactions.user_ids[in_test],
                        interactions.item_ids[in_test],
                        ratings=_index_or_none(interactions.ratings, in_test),
                        timestamps=_index_or_none(interactions.timestamps, in_test),
                        weights=_index_or_none(interactions.weights, in_test),
                        num_users=interactions.num_users,
                        num_items=interactions.num_items)

    return train, test

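# Minimal sketch of the hash-bucket split idea used above, assuming a plain array of
# integer user ids: hashing each id into one of 100 buckets gives a deterministic,
# repeatable train/test assignment for a fixed seed. The ids, seed=7, and the 20%
# threshold below are illustrative assumptions, not the original Interactions API.
import numpy as np
from sklearn.utils import murmurhash3_32

user_ids = np.arange(1, 11)
buckets = np.array([murmurhash3_32(int(uid), seed=7, positive=True) % 100
                    for uid in user_ids])
in_test = buckets / 100.0 < 0.2  # roughly 20% of users, always the same ones
print(user_ids[in_test])
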
def add_server(self, server_id, size):
    hash_value = murmurhash3_32(server_id) % self.hash_size
    # not sure how to deal with cycle
    while True:
        flag = True
        for p in range(size):
            if self.hash_space[hash_value + p] != -1:
                flag = False
        if flag:
            for q in range(size):
                self.hash_space[hash_value + q] = server_id
            self.status.insert(server_id, 1)
            self.servers[server_id] = []
            break
        else:
            hash_value = murmurhash3_32(hash_value) % self.hash_size
    return

def increase(self, atmId):
    """Given an id, increases occurrence frequency of that item by one."""
    data = np.load(self.fname)
    for row in range(self.d):
        col = murmurhash3_32(atmId, seed=row, positive=True) % self.w
        data[row][col] += 1
    np.save(self.fname, data)

def one_hot_hash(loader, output, d):
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ffs = loader['ffeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = sfs.shape[1] + bfs.shape[1]
    X = np.hstack((sfs, bfs, ifs, ffs))
    del bfs, ffs, ifs, sfs
    nrows = X.shape[0]
    ncols = X.shape[1] + npaircols * (npaircols - 1) / 2
    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
            gc.collect()
        start = i * ncols
        end = (i + 1) * ncols
        ij[0, start:end] = i
        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32(str((j, x)), seed=42,
                                              positive=True) % d
        j += start
        for j1 in xrange(npaircols):
            for j2 in xrange(j1):
                j += 1
                ij[1, j] = murmurhash3_32(str((j1, row[j1], j2, row[j2])),
                                          seed=42, positive=True) % d
    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)

def add_server(self, server_id, size):
    for p in range(size):
        h_val = murmurhash3_32(server_id, p) % self.hash_size
        cnt = 0
        # not sure how to deal with cycle
        while True:
            if cnt > 50:
                cnt = 0
                h_val = (h_val + random.randint(1, 30)) % self.hash_size
            if self.hash_space[h_val] != -1:
                h_val = murmurhash3_32(h_val, p) % self.hash_size
                cnt += 1
            else:
                self.hash_space[h_val] = server_id
                break
    self.status.insert(server_id, 1)
    self.servers[server_id] = []
    return

def add_element(self, vid):
    start_spoca = time.time()
    h_val = murmurhash3_32(vid) % self.hash_size
    # not sure how to deal with cycle
    cnt = 0
    while True:
        if cnt > 50:
            cnt = 0
            h_val = (h_val + random.randint(1, 20)) % self.hash_size
        if self.hash_space[h_val] == -1:
            h_val = murmurhash3_32(h_val) % self.hash_size
            cnt += 1
        else:
            server_id = self.hash_space[h_val]
            self.servers[server_id].append(vid)
            break
    end_spoca = time.time()
    self.total_time += end_spoca - start_spoca
    return

def lentokens(data, hashbins):
    feats = re.findall(r"([^\x00-\x7F]+|\w+)", data)
    final_feats = []
    rv = np.zeros(8 * hashbins)
    for feat in feats:
        loglength = int(min(8, max(1, math.log(len(feat), 1.4)))) - 1  # 0-7
        shash = murmurhash3_32(feat) % (hashbins)
        rv[loglength * (hashbins) + shash] += 1
    return rv

def get_embedding(self, token, seed=6):
    max_length = 5
    if self.matrix is None:
        self.create(seed)
    if len(token) <= max_length and token.isdigit():
        hash_index = murmurhash3_32(token, positive=True) % self.size
        return self.matrix[hash_index]
    else:
        return np.zeros(self.dim)

def hash_(token: str, hash_size: int) -> int:
    """Convert a token to a hash of given size.

    Args:
        token: a word
        hash_size: hash size

    Returns:
        int, hashed token
    """
    return murmurhash3_32(token, positive=True) % hash_size

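# Usage sketch of the hashing trick that hash_ above implements: identical tokens
# always land in the same bucket, so no vocabulary needs to be stored. The tokens
# and the 2 ** 18 bucket count below are illustrative assumptions.
from sklearn.utils import murmurhash3_32

for tok in ["cat", "dog", "cat"]:
    print(tok, murmurhash3_32(tok, positive=True) % (2 ** 18))
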
def minhash(self, string, n_components, ngram_range):
    min_hashes = np.ones(n_components) * np.infty
    grams = self.get_unique_ngrams(string, self.ngram_range)
    if len(grams) == 0:
        grams = self.get_unique_ngrams(' Na ', self.ngram_range)
    for gram in grams:
        hash_array = np.array([
            murmurhash3_32(''.join(gram), seed=d, positive=True)
            for d in range(n_components)])
        min_hashes = np.minimum(min_hashes, hash_array)
    return min_hashes / (2 ** 32 - 1)

def calculate(self, atmId):
    """Given an id, returns the approximate historic frequency for that event."""
    freq = -1
    try:
        data = np.load(self.fname)
        for row in range(self.d):
            col = murmurhash3_32(atmId, seed=row, positive=True) % self.w
            freq = data[row][col] if (data[row][col] < freq or freq < 0) else freq
    except Exception as e:
        return 0
    return freq

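# Minimal in-memory sketch of the count-min pattern used by increase()/calculate()
# above: each of d rows hashes the id with a different seed, updates add 1 in every
# row, and queries take the minimum across rows. The in-memory table, its shape,
# and the example id are illustrative assumptions, not the original on-disk class.
import numpy as np
from sklearn.utils import murmurhash3_32

d, w = 4, 1000
table = np.zeros((d, w), dtype=np.int64)

def cms_increase(item):
    for row in range(d):
        table[row, murmurhash3_32(item, seed=row, positive=True) % w] += 1

def cms_estimate(item):
    return min(table[row, murmurhash3_32(item, seed=row, positive=True) % w]
               for row in range(d))

for _ in range(3):
    cms_increase("atm_42")
print(cms_estimate("atm_42"))  # 3 (may overestimate under collisions, never underestimates)
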
def hashword(word, hashsize=16777216):
    '''
    Hash the word using murmurhash3_32 to a positive int value.

    input:
        - word: string format word
        - hashsize: maximum number, default is 16777216
    output:
        - int
    '''
    return murmurhash3_32(word, positive=True) % (hashsize)

def _make_hashfuncs(key):
    if isinstance(key, unicode):
        key = key.encode('utf-8')
    else:
        key = str(key)
    rval = []
    current_hash = None
    for i in range(nbr_slices):
        seed = current_hash or 0
        current_hash = utils.murmurhash3_32(key, seed, True)
        rval.append(current_hash % nbr_bits)
    return rval

def tagRawSentenceHash(self, rawLine):
    line = initializeSentence(self.DICT, rawLine)
    sen = []
    wordTags = line.split()
    for i in range(len(wordTags)):
        fwObject = FWObject.getFWObject(wordTags, i)
        word, tag = getWordTag(wordTags[i])
        node = self.findFiredNode(fwObject)
        # Format and return tagged word
        if node.depth > 0:
            tag = node.conclusion
        # Special units
        if "<" in word:
            if word in ["<url>", "<email>", "<phone>", "<cur>"]:
                tag = "NOUN"
            elif word == "<number>":
                tag = "NUM"
        # Hash word / tag
        tag_hash = murmurhash3_32(tag, seed=0)
        word_hash = murmurhash3_32(word, seed=0)
        # Get semantic category, if it is an open-class word
        if tag in ["ADJ", "ADV", "INTJ", "NOUN", "PROPN", "VERB"]:
            word_cat = self.word_dict.get(word_hash, -1)
        # Closed-class words don't have a semantic category
        else:
            word_cat = -1
        # Add to list
        sen.append((word_hash, tag_hash, word_cat))
    return sen

def MinHash(A, m):
    """
    :param A: the input sequence of tokens
    :param m: the number of hash functions
    :return: the m-length MinHash signature of A
    """
    min_hash = []
    for i in range(m):
        temp = []
        for j in range(len(A)):
            temp.append(murmurhash3_32(A[j], seed=i))
        min_hash.append(min(temp))
    # return m-length hashcodes
    return min_hash

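# Usage sketch for MinHash above: the fraction of positions where two signatures
# agree approximates the Jaccard similarity of the underlying token sets. The two
# example token lists and m=128 are illustrative assumptions.
a = ["the", "quick", "brown", "fox"]
b = ["the", "quick", "red", "fox"]
m = 128
sig_a, sig_b = MinHash(a, m), MinHash(b, m)
jaccard_estimate = sum(x == y for x, y in zip(sig_a, sig_b)) / float(m)
print(jaccard_estimate)  # close to the true Jaccard similarity of 3/5
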
def hash(token, num_buckets):
    """Unsigned 32 bit murmurhash for feature hashing."""
    ret = murmurhash3_32(token, positive=True)
    if ret >= num_buckets:
        ret = ret % num_buckets
    return ret

def add_server(self, server_id, size):
    h_val = murmurhash3_32(server_id) % self.hash_size
    cnt = 0
    # not sure how to deal with cycle
    while True:
        flag = True
        if cnt > 50:
            cnt = 0
            h_val = (h_val + random.randint(1, 30)) % self.hash_size
        for p in range(size):
            if self.hash_space[h_val + p] != -1:
                flag = False
        if flag:
            for q in range(size):
                self.hash_space[h_val + q] = server_id
            break
        else:
            h_val = murmurhash3_32(h_val) % self.hash_size
            cnt += 1
    self.status.insert(server_id, 1)
    self.overhead.append(-size)
    self.servers[server_id] = []
    return

def user_based_train_test_split(interactions, test_percentage=0.2, random_state=None):
    """
    Split interactions between a train and a test set based on
    user ids, so that a given user's entire interaction history
    is either in the train, or the test set.

    Parameters
    ----------
    interactions: :class:`spotlight.interactions.Interactions`
        The interactions to shuffle.
    test_percentage: float, optional
        The fraction of users to place in the test set.
    random_state: np.random.RandomState, optional
        The random state used for the shuffle.

    Returns
    -------
    (train, test): (:class:`spotlight.interactions.Interactions`,
                    :class:`spotlight.interactions.Interactions`)
        A tuple of (train data, test data)
    """
    if random_state is None:
        random_state = np.random.RandomState()

    minint = np.iinfo(np.uint32).min
    maxint = np.iinfo(np.uint32).max

    seed = random_state.randint(minint, maxint)

    in_test = ((murmurhash3_32(interactions.user_ids,
                               seed=seed,
                               positive=True) % 100 / 100.0) < test_percentage)
    in_train = np.logical_not(in_test)

    train = Interactions(interactions.user_ids[in_train],
                         interactions.item_ids[in_train],
                         ratings=_index_or_none(interactions.ratings, in_train),
                         timestamps=_index_or_none(interactions.timestamps, in_train),
                         weights=_index_or_none(interactions.weights, in_train),
                         num_users=interactions.num_users,
                         num_items=interactions.num_items)
    test = Interactions(interactions.user_ids[in_test],
                        interactions.item_ids[in_test],
                        ratings=_index_or_none(interactions.ratings, in_test),
                        timestamps=_index_or_none(interactions.timestamps, in_test),
                        weights=_index_or_none(interactions.weights, in_test),
                        num_users=interactions.num_users,
                        num_items=interactions.num_items)

    return train, test

if timestamp < long(valoldesttimestamp):
    r.set(keyoldesttimestamp, timestamp)
    r.set(keytimestampdiff, long(vallatesttimestamp) - timestamp)
elif timestamp > long(vallatesttimestamp):
    r.set(keylatesttimestamp, timestamp)
    r.set(keytimestampdiff, timestamp - long(valoldesttimestamp))

# Localbox. frequency feature
eidFeatures[eid][3] = eidFeatures[eid][3] + 1
# Redis. frequency feature
keyfrequency = eid + '_3'
r.incr(keyfrequency, 1)

# Localbox. monetary features (from hashing)
eidFeatures[eid][4][murmurhash3_32(cid) % N] += 1
eidFeatures[eid][5][murmurhash3_32(src_evt) % N] += 1
eidFeatures[eid][6][murmurhash3_32(cat) % N] += 1
eidFeatures[eid][7][murmurhash3_32(obj) % N] += 1

# Redis. monetary features (from hashing)
keycid = eid + '_4' + '_' + str(murmurhash3_32(cid) % N)
keysrcevt = eid + '_5' + '_' + str(murmurhash3_32(src_evt) % N)
keycat = eid + '_6' + '_' + str(murmurhash3_32(cat) % N)
keyobj = eid + '_7' + '_' + str(murmurhash3_32(obj) % N)
r.incr(keycid, 1)
r.incr(keysrcevt, 1)
r.incr(keycat, 1)
r.incr(keyobj, 1)

def __init__(self, Loader, word_classes=False, zho_split=False):
    self.language = Loader.language
    self.zho_split = zho_split
    self.Loader = Loader

    MODEL_STRING = os.path.join(".", "data", "pos_rdr", self.language + ".RDR")
    DICT_STRING = os.path.join(".", "data", "pos_rdr", self.language + ".DICT")
    DICTIONARY_FILE = os.path.join(".", "data", "dictionaries", self.language + DICT_CONSTANT)

    # zho needs an additional tokenizer
    if self.language == "zho":
        try:
            import modules.jieba.jeiba as jb
        except:
            import c2xg.modules.jieba.jeiba as jb
        self.tk = jb.Tokenizer()
        self.tk.initialize()
        self.tk.lock = True

    # Initialize tagger
    self.r = RDRPOSTagger()
    self.r.constructSCRDRtreeFromRDRfile(MODEL_STRING)
    self.DICT = readDictionary(DICT_STRING)

    # Initialize emoji remover
    try:
        # Wide UCS-4 build
        self.myre = re.compile(u'['
                               u'\U0001F300-\U0001F64F'
                               u'\U0001F680-\U0001F6FF'
                               u'\u2600-\u26FF\u2700-\u27BF]+',
                               re.UNICODE)
    except re.error:
        # Narrow UCS-2 build
        self.myre = re.compile(u'('
                               u'\ud83c[\udf00-\udfff]|'
                               u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
                               u'[\u2600-\u26FF\u2700-\u27BF])+',
                               re.UNICODE)

    # Universal POS tags are fixed across languages
    pos_list = ["PROPN", "SYM", "VERB", "DET", "CCONJ", "AUX", "ADJ", "INTJ",
                "SCONJ", "PRON", "NUM", "PUNCT", "ADV", "ADP", "X", "NOUN", "PART"]
    self.pos_dict = {murmurhash3_32(pos, seed=0): pos for pos in pos_list}

    # Get semantic dict, unless currently training those dicts
    if word_classes == False:
        try:
            with open(DICTIONARY_FILE, "rb") as fo:
                self.word_dict = pickle.load(fo)
        except:
            with open(os.path.join("..", "c2xg", "c2xg", DICTIONARY_FILE), "rb") as fo:
                self.word_dict = pickle.load(fo)
        self.domain_dict = {murmurhash3_32(key, seed=0): self.word_dict[key]
                            for key in self.word_dict.keys()}
        self.word_dict = {murmurhash3_32(key, seed=0): key
                          for key in self.word_dict.keys()}

    # Build decoder
    self.build_decoder()

# Daniel Rodgers-Pryor, 16/8/2013
###
# I had to build my own bloom filter because the ones I could find used C code
# that wouldn't compile in windows.
###

from bitarray import bitarray
from sklearn.utils import murmurhash3_32
from math import log, ceil, floor

base_hash = lambda x: murmurhash3_32(x, seed=0)  # Note: might return a negative int
# If you don't have scikit.learn feel free to comment this out and add your
# own hash function here; just make sure that it has appropriate uniformity and speed.
# The fnv hash would be another good choice


class BloomFilter:
    def __init__(self, iterable=(), max_entries=None, false_positive_rate=0.01):
        '''
        If max_entries is undefined, then iterable must be amenable to len()
        (and must be reconsumable). If this approach is taken, no objects should
        be added later (if they are, the false_positive_rate will no longer apply).
        If you wish to add objects later, specify a suitable max_entries value.
        '''
        if not max_entries:
            max_entries = len(iterable)
        # Length 0 or 1 filters are pointless (length 0 filters break some of the maths)
        max_entries = max(max_entries, 2)
        lg_2 = log(2)

        self.n = max_entries
        self.p = false_positive_rate
        optimal_size = -self.n * log(self.p) / (lg_2 ** 2)  # Formula from wikipedia
        # Number of bits needed to address an array of this size
        self.index_bits = int(ceil(log(optimal_size, 2)))
        self.m = 2 ** self.index_bits  # Round up array size to a power of 2

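# Minimal sketch of how k Bloom-filter bit positions can be derived from
# murmurhash3_32 with different seeds. The BloomFilter class above is truncated
# before its add/lookup logic, so this is a generic illustration of the technique,
# not its implementation; the key, k=5, and m=1024 are illustrative assumptions.
from sklearn.utils import murmurhash3_32

def bloom_positions(key, k=5, m=1024):
    # One hash per slice, each with its own seed, reduced to a bit index.
    return [murmurhash3_32(key, seed=i, positive=True) % m for i in range(k)]

print(bloom_positions("example-key"))  # k indices into an m-bit array
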
def test_murmurhash_sklearn(iterable):
    [murmurhash3_32(item) for item in iterable]

def tidy_data(data_list, label_list=[]):
    targets = len(label_list) > 0
    if (targets and len(data_list) != len(label_list)):
        print "Error; Not enough labels for data"
        return
    labels = []
    item_ids = []
    col = []
    row = []
    cur_row = 0
    data = []
    max_size = 999331  # 39916801
    for i in range(len(data_list)):
        print 'Processing Samplelist no ' + str(i)
        samples = data_list[i]
        if (targets and len(samples) != len(label_list[i])):
            print "Error; Not enough labels for data"
            return
        if targets:
            labels.extend(label_list[i])
        for sample in samples:
            item_ids.append(int(sample['item_id']))

            data.append(sample['price'])
            row.append(cur_row)
            col.append(murmurhash3_32('price') % max_size)

            data.append(sample['phone'])
            row.append(cur_row)
            col.append(murmurhash3_32('phone') % max_size)

            data.append(sample['email'])
            row.append(cur_row)
            col.append(murmurhash3_32('email') % max_size)

            data.append(sample['urls'])
            row.append(cur_row)
            col.append(murmurhash3_32('urls') % max_size)

            for word, count in sample['words'].iteritems():
                data.append(count)
                row.append(cur_row)
                col.append(murmurhash3_32(word) % max_size)
            cur_row += 1
    features = sp.csr_matrix((data, (row, col)),
                             shape=(max(cur_row, 1), max_size),
                             dtype=np.float64)
    if targets:
        return item_ids, features, labels
    else:
        return item_ids, features

def hash(token, num_buckets):
    """Unsigned 32 bit murmurhash for feature hashing."""
    return murmurhash3_32(token, positive=True) % num_buckets
