def __init__(self, L, k, d, **kw):
    """Initialise the parent class, then build the hash and bucket helpers.

    ``L``, ``k`` and ``d`` are forwarded unchanged to the parent
    initialiser, to ``HammingHash`` and to ``LshBucket``. Any additional
    keyword arguments in ``kw`` are passed to the parent initialiser only.
    """
    super(LSH, self).__init__(L=L, k=k, d=d, **kw)
    # NOTE(review): _check_parameters is not defined in the visible code;
    # presumably it validates L/k/d -- confirm against the base class.
    self._check_parameters()

    # Both helpers are configured with exactly the same three keywords.
    shared_config = {'L': L, 'k': k, 'd': d}
    self.hash = HammingHash(**shared_config)
    self.bucket = LshBucket(**shared_config)

    # Flag recording whether the stored indexes are used.
    self.index = False
class LSH(LSHBase):
    """LSH-based nearest-neighbour lookup.

    Vectors are hashed with ``HammingHash`` and stored in an ``LshBucket``.
    Queries pull candidate vectors out of the bucket and rank them by
    Euclidean distance.

    Example::

        from LSH import LSH
        lsh = LSH(
            L=L,  # number of hash functions
            k=k,  # number of reductions
            d=d,  # number of dimensions
        )
    """

    def __init__(self, L, k, d, **kw):
        """Initialise the parent class, then build the hash and bucket helpers.

        ``L``, ``k`` and ``d`` are forwarded to the parent initialiser, to
        ``HammingHash`` and to ``LshBucket``; extra keyword arguments go to
        the parent initialiser only.
        """
        super(LSH, self).__init__(L=L, k=k, d=d, **kw)
        # NOTE(review): _check_parameters is not defined in this class;
        # presumably inherited from LSHBase -- confirm.
        self._check_parameters()
        self.hash = HammingHash(L=L, k=k, d=d)
        self.bucket = LshBucket(L=L, k=k, d=d)
        # Whether queries use the stored indexes; overwritten by every
        # call to _neighbours().
        self.index = False

    def loadDataSet(self, filename, delim=','):
        """Insert every row of a delimited text file, then store the buckets.

        Each non-blank line is stripped, split on ``delim`` and every field
        is converted with ``int`` before the row is passed to ``insert``.

        Raises:
            ValueError: if a field cannot be converted by ``int``.
        """
        # The context manager guarantees the handle is closed even when a
        # row fails to parse.
        with open(filename) as data_file:
            for line in data_file:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no vector.
                    continue
                self.insert([int(field) for field in line.split(delim)])
        # Store all the data and index once every row has been inserted.
        self.bucket.store_buckets()

    def insert(self, vector):
        """Hash ``vector`` with ``do_hashing`` and insert it into the buckets."""
        hashed_array = self.hash.do_hashing(vector)
        self.bucket.insert(vector, hashed_array)

    def insert_index(self, vector):
        """Hash ``vector`` with ``do_hashing_index`` and insert it into the buckets."""
        hashed_array = self.hash.do_hashing_index(vector)
        self.bucket.insert(vector, hashed_array)

    def nn(self, vector, without_itself=False, index=False):
        """Return the nearest neighbour of ``vector``.

        Returns a dict with keys ``'vector'`` and ``'distance'``, or an
        empty dict when no candidate neighbours were found.
        """
        neighbours = self._neighbours(vector, without_itself, index)
        return self._nearest(vector, neighbours)

    def knn(self, vector, kk, without_itself=False, index=False):
        """Return up to ``kk`` nearest neighbours of ``vector``.

        Returns a list of ``{'distance': ..., 'vector': ...}`` dicts ordered
        from nearest to farthest.
        """
        neighbours = self._neighbours(vector, without_itself, index)
        return self._k_nearest(vector, kk, neighbours)

    def _neighbours(self, vector, without_itself=False, index=False):
        """Select candidate neighbours of the query ``vector`` from the buckets.

        When ``index`` is truthy the buckets are loaded first and the query
        is hashed with ``do_hashing_index``; otherwise ``do_hashing`` is
        used. The flag is also recorded on ``self.index``.
        """
        self.index = index
        if self.index:
            self.bucket.load_buckets()
            hashed_array = self.hash.do_hashing_index(vector)
        else:
            hashed_array = self.hash.do_hashing(vector)
        return self.bucket.select(vector, hashed_array, without_itself)

    def _k_nearest(self, vector, kk, neighbours):
        """Return the ``kk`` entries of ``neighbours`` closest to ``vector``.

        Each entry is a ``{'distance': ..., 'vector': ...}`` dict and the
        list is sorted by ascending distance. Fewer than ``kk`` entries are
        returned when there are fewer neighbours, and an empty list when
        ``kk`` is not positive.
        """
        if kk <= 0:
            return []
        candidates = [
            {'distance': self._euclidean_dist(vector, n_vector),
             'vector': n_vector}
            for n_vector in neighbours
        ]
        # list.sort is stable, so among equally distant vectors the one
        # encountered first is kept.
        candidates.sort(key=lambda entry: entry['distance'])
        return candidates[:kk]

    def _nearest(self, vector, neighbours):
        """Pick the vector in ``neighbours`` closest to ``vector``.

        Returns a dict with keys ``'vector'`` and ``'distance'``, or an
        empty dict when ``neighbours`` is empty.
        """
        nearest = {}
        for n_vector in neighbours:
            dist = self._euclidean_dist(vector, n_vector)
            # Create the entry on the first candidate, then replace it only
            # when a strictly shorter distance is found.
            if "distance" not in nearest or dist < nearest["distance"]:
                nearest.update(vector=n_vector, distance=dist)
        return nearest

    def _hamming_dist(self, hashval1, hashval2):
        """Return the number of set bits in the XOR of the two hash values."""
        xor_result = bitarray(hashval1) ^ bitarray(hashval2)
        return xor_result.count()

    def _euclidean_dist(self, vector1, vector2):
        """Return the Euclidean distance between two vectors.

        Components are paired with ``zip``, so any surplus components of
        the longer vector are ignored.
        """
        return math.sqrt(
            sum((x1 - x2) ** 2 for x1, x2 in zip(vector1, vector2))
        )