class KLSH(KLSHBase): """ >>>from klsh import KLSH >>>klsh = KLSH( ... b=b, #number of hash bits ... t=t, #number of ...) """ def __init__(self, b, t, **kw): super(KLSH, self).__init__(b=b, t=t, **kw) # self._check_parameters() self.hash = Hashing(b=b, t=t) self.bucket = KlshBucket() self.storage = PickleStorage() def loadDataSet(self, filename, delim=","): self.dataMat = np.loadtxt(filename, delimiter=delim) def preprocessing(self): dataMat = self.dataMat numOfData = dataMat[:, 0].size self.insert_matrix(dataMat, numOfData) self.store_Wmat() self.bucket.store_buckets() def insert_matrix(self, matrix, num): KerMat = self.hash.kernelMatrix(matrix) CenterMat = self.hash.center(KerMat) (HashTable, self.W) = self.hash.creatHashTable(CenterMat, self.b, self.t) for i in xrange(num): self.bucket.insert_buckets(matrix[i, :], HashTable[i, :]) def knn(self, vector, knum, stored=False): if stored == False: hashed_array = self.hash.do_hashing(vector, self.dataMat, self.W) # vector must be the first argument knn_vectors = self.bucket.select_knn(knum, hashed_array) else: self.W = self.load_Wmat() self.bucket.load_buckets() hashed_array = self.hash.do_hashing(vector, self.dataMat, self.W) # hashed_array = self.hash.do_hashing(self.dataMat, vector, self.W) knn_vectors = self.bucket.select_knn(knum, hashed_array) return knn_vectors def store_Wmat(self): fw = open("Wmat.data", "wb") self.storage.save(self.W, fw) fw.close() def load_Wmat(self): fr = open("Wmat.data") W = self.storage.load(fr) fr.close() return W
# PickleStorage is a project-local helper; its module path is not shown here.


class LshBucket(object):
    """Data bucket class

    >>> from buckets import LshBucket
    >>> bucket = LshBucket()
    >>> bucket.insert(vector, hashed_array)
    >>> bucket.select(vector, hashed_array)
    """

    def __init__(self, **kw):
        self.L = 0
        for key, value in kw.iteritems():
            setattr(self, key, value)
        # L hash tables, each mapping a hash value to the vectors stored under it
        self.data = [{} for i in xrange(self.L)]
        # L lists, each holding the hash values present in the matching table
        self.index = [[] for i in xrange(self.L)]
        self.storage = PickleStorage()

    def insert(self, vector, hashed_array):
        # assert <test>, <data>: raises AssertionError if <test> is false
        assert len(self.data) == self.L
        assert len(self.index) == self.L
        assert len(hashed_array) == self.L
        L = self.L
        for i in xrange(L):
            self._putInBucket(i, vector, hashed_array[i])

    def store_buckets(self):
        fw1 = open("buckets.data", 'wb')
        self.storage.save(self.data, fw1)
        fw1.close()
        fw2 = open("buckets_index.data", 'wb')
        self.storage.save(self.index, fw2)
        fw2.close()

    def load_buckets(self):
        fr1 = open('buckets.data', 'rb')
        self.data = self.storage.load(fr1)
        fr1.close()
        fr2 = open('buckets_index.data', 'rb')
        self.index = self.storage.load(fr2)
        fr2.close()

    def select(self, query_vector, hashed_array, without_itself=False):
        assert len(self.data) == self.L
        assert len(hashed_array) == self.L
        query_vector_tuple = tuple(query_vector)
        L = self.L
        result = []
        seen = {}
        for i in xrange(L):
            hashed = "".join(map(str, hashed_array[i]))
            data = self.data[i]
            vectors = data.get(hashed, [])  # vectors stored under this hash value
            for vector in vectors:
                key = tuple(vector)
                if key in seen:
                    continue
                if key == query_vector_tuple and without_itself:
                    # optionally skip an exact match of the query itself
                    continue
                seen[key] = True
                result.append(vector)
            if len(result) >= L * 2:
                break
        return result  # a list of candidate vectors

    def _putInBucket(self, i, vector, hashed_array):
        """Record data[hashed] -> vector for table i and keep that table's
        index of hash values sorted and unique."""
        index = self.index[i]
        # "".join(map(str, bits)) turns the bit list into a string key
        hashed_array = "".join(map(str, hashed_array))
        self.index[i] = sorted(set(index + [hashed_array]))
        data = self.data[i]
        if hashed_array not in data:
            data[hashed_array] = []
        data[hashed_array].append(vector)
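# A small usage sketch for LshBucket, assuming hash values arrive as one
# list of 0/1 bits per hash table (L lists in total). The vectors and bit
# patterns below are made up for illustration.
if __name__ == '__main__':
    bucket = LshBucket(L=2)
    bucket.insert([1.0, 2.0], [[0, 1, 1], [1, 0, 0]])
    bucket.insert([1.5, 2.5], [[0, 1, 1], [0, 0, 1]])
    # select() returns stored vectors whose hash collides with the query's
    # in at least one of the L tables
    print(bucket.select([1.0, 2.0], [[0, 1, 1], [1, 0, 0]]))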
import random
# BaseHash, PickleStorage and the SCALE constant are defined elsewhere
# in the project and are not shown in this excerpt.


class HammingHash(BaseHash):
    """Hash function class (Hamming distance)

    >>> from LSH.hash import HammingHash
    >>> hash = HammingHash()
    >>> hashed_array = hash.do_hashing([123, 456, 789])
    """

    def __init__(self, L, k, d, **kw):
        super(HammingHash, self).__init__(L=L, k=k, d=d, **kw)
        self.storage = PickleStorage()

    def do_hashing(self, vector):
        if not isinstance(vector, (list, tuple)):
            raise TypeError("args should be an array_ref")
        if getattr(self, "d", None) is None:
            self.d = len(vector)
        if self.d != len(vector):
            raise ValueError("invalid dimension number")
        unary_code = self._unarize(vector)
        hash = []
        for i in xrange(self.L):  # L is the number of hash functions
            sampling_bits = self.indexes[i]
            hash.append([unary_code[bit] for bit in sampling_bits])
        return hash

    def do_hashing_index(self, vector):
        # hash with the sampling indexes previously stored on disk
        fr = open('indexes.data', 'rb')
        indexes_stored = self.storage.load(fr)
        fr.close()
        unary_code = self._unarize(vector)
        hash = []
        for i in xrange(self.L):
            sampling_bits = indexes_stored[i]
            hash.append([unary_code[bit] for bit in sampling_bits])
        return hash

    @property  # read-only
    def indexes(self):
        if getattr(self, "_indexes", None) is None:
            self._indexes = self._create_indexes()
            # store (pickle) the freshly created indexes
            fw = open('indexes.data', 'wb')
            self.storage.save(self._indexes, fw)
            fw.close()
        return self._indexes

    def _create_indexes(self):
        indexes = []
        for i in xrange(self.L):
            # a set keeps the sampled bit positions distinct
            sampling_bits = set()
            while True:
                bit = random.randint(0, self.d * SCALE - 1)
                if bit not in sampling_bits:
                    sampling_bits.add(bit)
                if len(sampling_bits) == self.k * SCALE:
                    break
            indexes.append(sorted(sampling_bits))
        return indexes

    def _unarize(self, vector):
        # scale each coordinate to [0, SCALE], then write it in unary:
        # i ones followed by SCALE - i zeros
        n = float(SCALE) / max(vector)
        unary = []
        for x in vector:
            i = int(x * n)
            j = SCALE - i
            unary += [1] * i
            unary += [0] * j
        return unary
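# A quick sketch of the unary-embedding idea above: each coordinate is
# scaled into [0, SCALE] and written as a run of ones followed by zeros,
# so sampling bit positions from the concatenated code yields one
# Hamming-space hash per table. L, k, d and the input vector below are
# illustrative; SCALE must already be defined in the surrounding module.
if __name__ == '__main__':
    hash = HammingHash(L=3, k=2, d=3)
    hashed_array = hash.do_hashing([123, 456, 789])
    print(len(hashed_array))     # L hash values, one per table
    print(len(hashed_array[0]))  # each made of k * SCALE sampled bits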
import numpy as np
from bitarray import bitarray
# PickleStorage is a project-local helper; its module path is not shown here.


class KlshBucket(object):

    def __init__(self, **kw):
        for key, value in kw.iteritems():
            setattr(self, key, value)
        self.buckets = {}  # hash value -> list of vectors stored under it
        self.index = []    # sorted list of all hash values seen so far
        self.storage = PickleStorage()

    def insert_buckets(self, vector, hashed_array):
        index = self.index
        # force a 2-D array so tolist()[0] always yields the row of bits
        hashed_temp = np.array(hashed_array, dtype=np.int8, ndmin=2)
        hashed_list = hashed_temp.tolist()[0]
        hashed = "".join(map(str, hashed_list))
        self.index = sorted(set(index + [hashed]))
        buckets = self.buckets
        if hashed not in buckets:
            buckets[hashed] = []
        buckets[hashed].append(vector)

    def select_knn(self, k, hashed_array):
        # knn_result: [{'ham_dist': <Hamming distance>, 'vector': <bucket contents>}]
        knn_result = []
        hashed_temp = np.array(hashed_array, dtype=np.int8, ndmin=2)
        hashed_list = hashed_temp.tolist()[0]
        hashed = "".join(map(str, hashed_list))
        buckets = self.buckets
        indexes = self.index
        for index_val in indexes:
            ham_dist = self._ham_dist(hashed, index_val)
            if len(knn_result) < k:
                knn_result.append({'ham_dist': ham_dist,
                                   'vector': buckets[index_val]})
                continue
            # sort ascending by Hamming distance, then replace the current
            # farthest entry if this bucket is closer
            knn_result.sort(key=lambda x: x['ham_dist'])
            maximum_dist = knn_result[k - 1]['ham_dist']
            if ham_dist < maximum_dist:
                knn_result[k - 1] = {'ham_dist': ham_dist,
                                     'vector': buckets[index_val]}
        knn_result.sort(key=lambda x: x['ham_dist'])
        return knn_result

    def store_buckets(self):
        fw1 = open("buckets.data", 'wb')
        self.storage.save(self.buckets, fw1)
        fw1.close()
        fw2 = open("buckets_index.data", 'wb')
        self.storage.save(self.index, fw2)
        fw2.close()

    def load_buckets(self):
        fr1 = open('buckets.data', 'rb')
        self.buckets = self.storage.load(fr1)
        fr1.close()
        fr2 = open('buckets_index.data', 'rb')
        self.index = self.storage.load(fr2)
        fr2.close()

    def _ham_dist(self, hashval1, hashval2):
        # XOR the two bit strings and count the set bits
        xor_result = bitarray(hashval1) ^ bitarray(hashval2)
        return xor_result.count()
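# Sketch of a bucket lookup, assuming hash codes arrive as 1 x b rows of
# 0/1 values (the shape produced by KLSH.insert_matrix above). The vectors,
# codes and k below are illustrative.
if __name__ == '__main__':
    bucket = KlshBucket()
    bucket.insert_buckets([1.0, 2.0], [[0, 1, 1, 0]])
    bucket.insert_buckets([9.0, 8.0], [[1, 1, 0, 0]])
    # the k buckets whose codes are nearest to the query in Hamming distance
    for hit in bucket.select_knn(2, [[0, 1, 1, 1]]):
        print(hit)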