def get_hash2img():
    if os.path.exists(redis_rdb):
        lsh = LSHash(hash_len, 960, storage_config=redis_config,
                     matrices_filename=matrices_file)
        return lsh
    else:
        return create_hash2img()
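# A minimal usage sketch for get_hash2img(); `find_similar_images` is a
# hypothetical helper, not part of the original module. It assumes a 960-dim
# GIST vector `gist_v`; query() yields ((vector, image_name), distance)
# tuples because index() stores the image name as extra_data (see
# create_hash2img below).
def find_similar_images(gist_v, k=5):
    lsh = get_hash2img()
    return [(extra, dist) for (vec, extra), dist in
            lsh.query(gist_v, num_results=k, distance_func='euclidean')]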
def traceLSHash(queryName, hashSize):
    # queryName = "hamming_query_12_3"
    # indices of the trajectories to run the hash query on
    indexList = [14, 249, 479, 689, 899]
    XYMatrix = DateTransform()
    resultList = []
    nearList = []
    lsh = LSHash(hashSize, 44107)
    tid = 1
    for traceList in XYMatrix:
        lsh.index(input_point=traceList, extra_data=tid)
        tid += 1
    resultFile = open(queryName + '.txt', 'w')
    for index in indexList:
        queryList = lsh.query(XYMatrix[index], distance_func="hamming")
        for result in queryList:
            resultStr = str(index + 1) + " : " + str(result[0][1]) + " " + str(result[1]) + "\n"
            nearList.append(result[0][1])
            resultFile.write(resultStr)
        resultList.append(nearList)
        nearList = []
    resultFile.close()
    writeHTML(resultList, queryName, "hashQuery")
    print(resultList)
def test_lshash_redis(self):
    """ Test external lshash module """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    lsh = LSHash(self.hash_size, self.input_dim, 1, config)
    for i in range(self.nb_elements):
        lsh.index(list(self.els[i]))
        # multiple insertions should be prevented by the library
        lsh.index(list(self.els[i]))
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            # have multiple insertions been prevented?
            assert itms.count(itm) == 1
            assert el in self.els
    for el in self.els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        el_v, el_dist = res
        assert el_v in self.els
        assert el_dist == 0
    del lsh
def init_lsh(args):
    d = int(args.d)
    nuse = int(args.n)
    off = int(args.o)
    random_dims = int(args.r)
    random_sampling = True
    if args.q == 'y':
        random_sampling = False
    lsh = LSHash(64, d, random_sampling, args.t, args.u, args.host,
                 random_dims, 1, storage_config=args.s,
                 matrices_filename='project_plane.npz')
    np_feature_vecs = load_features(args.f, args.v, nuse, d, lsh, args.e,
                                    off, args.i)
    return (lsh, np_feature_vecs)
def test_lshash_redis_extra_val(self):
    """ Test external lshash module """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    lsh = LSHash(self.hash_size, self.input_dim, 1, config)
    for i in range(self.nb_elements):
        lsh.index(list(self.els[i]), self.el_names[i])
        lsh.index(list(self.els[i]), self.el_names[i])  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el[0] in self.els
            assert el[1] in self.el_names
    for el in self.els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in self.els
        assert el_name in self.el_names
        assert el_dist == 0
    del lsh
def write_json_lsh(hash_size, grid):
    '''
    Store the generated LSH query results as JSON.
    :param hash_size: list of hash sizes
    :param grid: preprocessed grid array
    :return: none
    '''
    data_lsh = {}
    for size in hash_size:
        print(size)
        print('list')
        data_lsh[size] = []
        lsh = LSHash(size, 44107)
        count = 0
        for line in grid:
            lsh.index(line, extra_data=count)
            count += 1
        for id in road_id:
            roads = []
            res = lsh.query(grid[id])
            print(len(res))
            for r in res:
                roads.append(pack_data(r[0][1]))
            data_lsh[size].append({id: roads})
    with open('result_lsh.json', 'w') as f:
        f.write(str(data_lsh))
def create_feature(list_author, net):
    global example_image_dir
    list_feature = list()
    image_paths = list()
    # Locality Sensitive Hashing
    k = 10  # hash size
    L = 5   # number of tables
    d = 58  # dimension of feature vector
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)
    for subfolder in list_author.keys():
        subfolder_path = os.path.join(example_image_dir, subfolder)
        count_items = len([
            name for name in os.listdir(subfolder_path)
            if os.path.isfile(os.path.join(subfolder_path, name))
        ])
        # print(subfolder)
        sum_acc = 0
        sum_confiden = 0
        for img in os.listdir(subfolder_path):
            image_path = os.path.join(subfolder_path, img)
            author, confidence, feature = predict_author_single_img(net, image_path)
            image_paths.append(image_path)
            list_feature.append(feature)
            lsh.index(feature, extra_data=image_path)
    pickle.dump(lsh, open('lsh.p', "wb"))
    return lsh, image_paths, list_feature
def test_lshash_extra_val(self):
    lsh = LSHash(self.hash_size, self.input_dim, 1,
                 storage_config={'dict': None})
    for i in range(self.nb_elements):
        lsh.index(list(self.els[i]), self.el_names[i])
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            self.assertIn(el[0], self.els)
            self.assertIn(el[1], self.el_names)
    for el in self.els:
        # res is a list, so we need to select the first entry only
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        self.assertIn(el_v, self.els)
        self.assertIn(el_name, self.el_names)
        self.assertEqual(el_dist, 0)
    del lsh
def knn(data_array, data, hash_size_input, data_shape):
    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])
    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])
    # get a random pos
    vipno_pos = rd.randint(0, data_shape[1])
    # calculate and output
    for k in [1, 2, 3, 4, 5]:
        print('hash size: %d' % hash_size_input)
        print('value k: %d' % k)
        print('target vipno: %d' % data.columns[vipno_pos])
        result = []
        for res in lsh.query(data_array[:, vipno_pos], num_results=k + 1,
                             distance_func='euclidean'):
            result.append(res[0][1])
        print('results: ')
        print(result[1:])
def k_nn_lsh(k, word, decade_matrix, index_dict):
    index_dict = dict(map(reversed, index_dict.items()))
    num_rows = decade_matrix.get_shape()[0]
    lsh = LSHash(6, num_rows)
    for i in range(num_rows):
        print(i)
        lsh.index(decade_matrix.getrow(i).todense())
    return lsh.query(word)
def Mainfunc(self, mat_addr):
    np.set_printoptions(suppress=True, precision=6, threshold=8)
    s = sio.loadmat(mat_addr)
    svec = s['FFE']
    datalen = len(svec)
    n1, n2, n3 = np.shape(svec)
    data = np.zeros((n1, 87212))
    m = 0
    for i in range(n2):
        for j in range(n3):
            if svec[:, i, j].all() != 0:
                data[:, m] = svec[:, i, j]
                m = m + 1
    dataves = np.transpose(data)
    modelindex = list(set(np.random.randint(1, 87212, size=10000)))
    lsh_model = LSHash(7, n1)
    for jj in modelindex:
        lsh_model.index(dataves[jj, :])
    # to test the program
    starttest = 1  # start test index
    endtest = 5
    testindex = random.sample(modelindex, 1)  # size is the number of test functions
    test = np.zeros((len(testindex), n1))
    for i in range(len(testindex)):
        test[i, :] = dataves[testindex[i], :]
    output = open('result.txt', 'w')
    timee = open('time.txt', 'w')
    for queryi in range(len(testindex)):
        if test[queryi, :].all() != 0:
            starttime = time.time()
            Atemp = lsh_model.query(test[queryi, :], 5, 'cosine')
            print((str(Atemp[0]).split(')')[0]).replace('(', ''))
            # write the five nearest neighbours, one per line
            for t in range(5):
                output.write((str(Atemp[t]).split(')')[0]).replace('(', '') + '\n')
            endtime = time.time()
            timee.write(str(endtime - starttime) + '\n')
            output.write('\n')
    output.close()
    timee.close()
def build_index(self, X):
    f = X.shape[1]
    n = X.shape[0]
    lsh = LSHash(hash_size=32, input_dim=f, num_hashtables=100)
    for i in range(n):
        lsh.index(X[i], i)
    return lsh
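# Hedged companion sketch for build_index() above; `query_index` is a
# hypothetical helper, not part of the original class. Each hit comes back
# as ((vector, row_index), distance) because build_index stored the row
# index as extra_data.
def query_index(lsh, x, k=5):
    return [(extra, dist) for (vec, extra), dist in
            lsh.query(x, num_results=k, distance_func='euclidean')]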
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    with open(bits_tid_pickle, "rb") as f:
        data = pickle.load(f)
    # these parameters are tunable; see https://github.com/kayzh/LSHash
    lsh = LSHash(13, 128, num_hashtables=1)
    for key in data.keys():
        lsh.index(np.array([int(tmp) for tmp in key]))
    with open(lsh_pickle, "wb") as out:
        pickle.dump(lsh, out, -1)
def lshSearch(dataBase2, test2, num):
    lsh = LSHash(30, 216)

    def CreateIndex(array):
        for item in array:
            lsh.index(item)

    CreateIndex(dataBase2)
    test2 = test2.reshape((216,))
    res = lsh.query(test2, num, distance_func='true_euclidean')
    return res
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    with open(bits_tid_pickle, "rb") as f:
        data = pickle.load(f)
    # 10 is the length of the binary bit strings being indexed
    # (see github.com/kayzh/LSHash)
    lsh = LSHash(13, 10, num_hashtables=1)
    for key in data.keys():
        lsh.index(np.array([int(tmp) for tmp in key]))
    with open(lsh_pickle, "wb") as out:
        pickle.dump(lsh, out, -1)
def get_lshash(filename):
    lsh = LSHash(30, 8)
    try:
        with open(filename) as f:
            content = f.readlines()
        content = [x.strip('\n') for x in content]
    except Exception:
        # return the empty index early so `content` is never unbound
        print("Cannot find the file.")
        return lsh
    for row in content:
        row = row.split(",")
        row = list(map(int, row))
        tmp = row[:8]
        lsh.index(tmp, str(row[8]))
    return lsh
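# Minimal sketch, assuming each CSV row holds 8 integer features plus a label
# in column 9 (as get_lshash() above expects); `nearest_labels` is a
# hypothetical helper, not part of the original.
def nearest_labels(lsh, row8, k=3):
    # each hit is ((vector, label), distance); return the labels of the
    # k nearest indexed rows
    return [extra for (vec, extra), dist in
            lsh.query(row8, num_results=k, distance_func='euclidean')]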
def generateSingleHash(X, planesFileName, n_bits=64):
    """
    Generate an n_bits long hash for the input X
    :param X: feature vector
    :param planesFileName: name of the .npz file holding the projection planes
    :param n_bits: length of the hash
    :return: the binary hash string
    """
    import utils
    # reuse the stored random projection planes (do not overwrite them)
    fileName = os.path.join(utils.lsh_planes_dir, planesFileName + '.npz')
    lsh = LSHash(n_bits, np.shape(X)[0], matrices_filename=fileName,
                 overwrite=False)
    return lsh._hash(lsh.uniform_planes[0], X.tolist())
def lshTOfind(path):
    lsh = LSHash(50, 361)
    f = open('newindex.csv')
    index = csv.reader(f)
    features = []
    count = 0
    for r in index:
        features = [int(float(i)) for i in r[1:]]
        lsh.index(features)
        count += 1
    try:
        f_v = getfeatures(path)
        ans = lsh.query(f_v)
        if ans != []:
            return searchid(int(ans[0][0][360] / 10000))
    except Exception:
        return []
def knn(data_array, data, hash_size_input, data_shape, vipno_pos, k):
    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])
    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])
    # calculate and output
    result = []
    for res in lsh.query(data_array[:, vipno_pos], num_results=k + 1,
                         distance_func='euclidean'):
        result.append(res[0][1])
    return result[1:]
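# Usage sketch for the knn() helper above, assuming `data` is a pandas
# DataFrame whose columns are vipnos and data_array = data.values (as the
# indexing loop implies). The first hit is the query column itself, hence
# the result[1:] slice above.
#
# neighbours = knn(data_array, data, hash_size_input=13,
#                  data_shape=data_array.shape, vipno_pos=0, k=5)
# print(neighbours)  # the k vipnos closest to column 0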
def test_lshash():
    lsh = LSHash(6, 8, 1)
    for i in range(num_elements):
        lsh.index(list(els[i]))
        lsh.index(list(els[i]))  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el in els
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # res is a tuple containing the vector and the distance
        el_v, el_dist = res
        assert el_v in els
        assert el_dist == 0
    del lsh
def classify_nearest_neighbor_lsh(k):
    lsh = LSHash(3, 12)
    labels = load_labels()
    for genre, song_genres_ids in labels.groupby('category'):
        print('Indexing genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2)):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id), header=None)
            for val in song.values:
                lsh.index(val, extra_data=genre)
    total_count = 0
    match_count = 0
    for genre, song_genres_ids in labels.groupby('category'):
        print('Expected genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2), num_values):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id), header=None)
            genre_freqs = {}
            split_song = np.array_split(song, 5, axis=0)  # Split song into sections
            for s in split_song:
                avg_song_val = np.mean(s)  # Take average of each section
                neighbours = lsh.query(avg_song_val, num_results=k)
                for neighbour in neighbours:
                    # use a separate name so the expected genre is not clobbered
                    neighbour_genre = neighbour[0][1]
                    genre_freqs[neighbour_genre] = genre_freqs.get(neighbour_genre, 0) + 1
            actual_genre = max(genre_freqs, key=genre_freqs.get)
            print('Predicted genre: {}'.format(actual_genre))
            total_count += 1
            if genre == actual_genre:
                match_count += 1
    print('Matched {} out of {} songs: {}%'.format(
        match_count, total_count, (match_count / total_count) * 100))
def detect_subevent(filename):
    dictionaryFile = filename + ".dict"
    corpusFile = filename + ".mm"
    outputFile = filename + ".out"
    outputVector = []
    tempDict = {}
    outputdict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, len(dictionary))
    count = 0
    for index in range(len(corpus)):
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        result = lsh.query(denseVector, num_results=5, distance_func="cosine")
        count += 1
        # no similar tweets yet: start a new sub-event
        if result == []:
            outputdict[index] = []
            tempDict[getSparseVector(denseVector)] = index
            lsh.index(denseVector)
        else:
            for r in result:
                if tempDict[getSparseVector(r[0])] in outputdict:
                    outputdict[tempDict[getSparseVector(r[0])]].append(index)
                    break
    with open(outputFile, 'w') as out:
        for key in outputdict.keys():
            line = str(key)
            for i in outputdict[key]:
                line += ", " + str(i)
            out.write(line + "\n")
    print("Please check the output file:", outputFile)
def create_feature(train_image_dir, classes, net):
    list_feature = list()
    image_paths = list()
    # Locality Sensitive Hashing
    k = 10  # hash size
    L = 5   # number of tables
    d = 58  # dimension of feature vector
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)
    for each_object in classes:
        each_object_path = os.path.join(train_image_dir, each_object)
        list_img = next(os.walk(each_object_path))[2]
        print("hashing class: ", each_object, " which has: ", len(list_img))
        for img in list_img:
            image_path = os.path.join(each_object_path, img)
            feature = get_feature_single_img(net, image_path)
            image_paths.append(image_path)
            list_feature.append(feature)
            lsh.index(feature, extra_data=image_path)
    return lsh, image_paths, list_feature
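# Hedged companion sketch: look up the closest training images for a single
# query image. Reuses get_feature_single_img() from create_feature() above;
# `query_similar` itself is hypothetical, not part of the original.
def query_similar(lsh, net, image_path, k=5):
    feature = get_feature_single_img(net, image_path)
    # each hit is ((feature_vector, image_path), distance)
    return [(extra, dist) for (vec, extra), dist in
            lsh.query(feature, num_results=k)]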
def test():
    import utils
    trueIds, testSet = utils.load_test_set('fc7', 'raw', 0)
    lsh = LSHash(128, np.shape(testSet[0])[0],
                 matrices_filename='lsh_planes.data.npz', overwrite=True)
    for idx, input_point in enumerate(testSet):
        hashValue = lsh._hash(lsh.uniform_planes[0], input_point.tolist())
        print(hashValue)
        lsh.index(input_point, idx)
    print(lsh.query(testSet[3], 3))
    return None
def test_lshash(self):
    lsh = LSHash(self.hash_size, self.input_dim, 1)
    for i in range(self.nb_elements):
        lsh.index(list(self.els[i]))
        lsh.index(list(self.els[i]))  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        self.assertEqual(itms.count(itm), 1)
        for el in itm:
            self.assertIn(el, self.els)
    for el in self.els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # res is a tuple containing the vector and the distance
        el_v, el_dist = res
        self.assertIn(el_v, self.els)
        self.assertEqual(el_dist, 0)
    del lsh
def index_room():
    '''
    Index image features with LSH.
    :return: none
    '''
    files = glob("./data/features/*.csv")
    files_ids = [
        filename.split("\\")[-1].replace(".csv", "") for filename in files
    ]
    X = np.load("data/train.npy")
    X = X.reshape(X.shape[0], -1)
    encoder = load_model("data/encoder.h5")
    dimension = 100
    lsh_hash = LSHash(hash_size=32, input_dim=dimension)
    compress_feature = encoder.predict(X)
    for num, ele in enumerate(compress_feature.tolist()):
        lsh_hash.index(ele, extra_data=files_ids[num])
    with open("data/lsh.pkl", "wb") as fh:
        pickle.dump(lsh_hash, fh)
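# Companion sketch for index_room(): reload the pickled index and query it.
# Assumes a 100-dim encoded feature vector `code` (matching `dimension`
# above); `load_and_query` is hypothetical, not part of the original.
def load_and_query(code, k=10):
    with open("data/lsh.pkl", "rb") as fh:
        lsh_hash = pickle.load(fh)
    # hits are ((vector, file_id), distance) tuples
    return lsh_hash.query(code, num_results=k)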
def generateHashes(X, scalar, planesFileName, n_bits=64):
    """
    Generate an n_bits long hash for each input in X
    :param X: list of feature vectors
    :param scalar: fitted scaler used to normalise each input
    :param planesFileName: name of the .npz file holding the projection planes
    :param n_bits: length of each hash
    :return: list of binary hash strings
    """
    import utils
    # reuse the stored random projection planes (do not overwrite them)
    fileName = os.path.join(utils.lsh_planes_dir, planesFileName + '.npz')
    lsh = LSHash(n_bits, np.shape(X[0])[0], matrices_filename=fileName,
                 overwrite=False)
    hashValues = []
    for input_point in X:
        input_point = scalar.transform(input_point)
        hashValues.append(lsh._hash(lsh.uniform_planes[0], input_point))
    return hashValues
def test_lshash_extra_val():
    lsh = LSHash(6, 8, 1)
    for i in range(num_elements):
        lsh.index(list(els[i]), el_names[i])
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert el[0] in els
            assert el[1] in el_names
    for el in els:
        # res is a list, so we need to select the first entry only
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in els
        assert el_name in el_names
        assert el_dist == 0
    del lsh
def lshTOfind(path):
    lsh = LSHash(10, 360)
    f = open('copyindex.csv')
    index = csv.reader(f)
    features = []
    count = 0
    for r in index:
        features = [float(i) for i in r[1:]]
        lsh.index(features[:360], features[360])
        count += 1
    try:
        f_v = getfeatures(path)
        ans = lsh.query(f_v[:360], 15)
        if ans != []:
            res = []
            for i in ans:
                res.append(int(i[0][1] / 10000))
            return res
    except Exception:
        return []
def create_hash2img():
    img2gist = get_img2gist()
    lsh = LSHash(hash_len, 960, storage_config=redis_config,
                 matrices_filename=matrices_file)
    count = 0
    total_num = len(img2gist)
    for name, gist_v in img2gist.items():
        count += 1
        lsh.index(gist_v, name)
        sys.stdout.write('%d/%d\r' % (count, total_num))
        sys.stdout.flush()
    print('bucket ratio: %d/%d' % (len(lsh.hash_tables[0].keys()), 2 ** hash_len))
    return lsh