def write_json_lsh(hash_size, grid):
    '''
    Store the generated LSH query results as JSON.
    :param hash_size: list of hash sizes
    :param grid: processed grid array
    :return: None
    '''
    data_lsh = {}
    for size in hash_size:
        print size
        print 'list'
        data_lsh[size] = []
        lsh = LSHash(size, 44107)
        count = 0
        for line in grid:
            lsh.index(line, extra_data=count)
            count += 1
        for id in road_id:
            roads = []
            res = lsh.query(grid[id])
            print len(res)
            for r in res:
                roads.append(pack_data(r[0][1]))
            data_lsh[size].append({id: roads})
    with open('result_lsh.json', 'w') as f:
        f.write(str(data_lsh))
def knn(data_array, data, hash_size_input, data_shape):
    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])

    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])

    # get a random pos
    vipno_pos = rd.randint(0, data_shape[1])

    # calculate and output
    for k in [1, 2, 3, 4, 5]:
        print 'hash size: %d' % hash_size_input
        print 'value k: %d' % k
        print 'target vipno: %d' % data.columns[vipno_pos]

        result = []
        for res in lsh.query(data_array[:, vipno_pos],
                             num_results=k + 1, distance_func='euclidean'):
            result.append(res[0][1])

        print 'results: '
        print result[1:]
def create_feature(list_author, net):
    global example_image_dir
    list_feature = list()
    image_paths = list()

    ## Locality Sensitive Hashing
    k = 10  # hash size
    L = 5   # number of tables
    d = 58  # Dimension of Feature vector
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)

    for subfolder in list_author.keys():
        subfolder_path = os.path.join(example_image_dir, subfolder)
        count_items = len([
            name for name in os.listdir(subfolder_path)
            if os.path.isfile(os.path.join(subfolder_path, name))
        ])
        # print(subfolder)
        sum_acc = 0
        sum_confiden = 0
        for img in os.listdir(subfolder_path):
            image_path = os.path.join(subfolder_path, img)
            author, confidence, feature = predict_author_single_img(net, image_path)
            image_paths.append(image_path)
            list_feature.append(feature)
            lsh.index(feature, extra_data=image_path)

    pickle.dump(lsh, open('lsh.p', "wb"))
    return lsh, image_paths, list_feature
def test_lshash_redis_extra_val():
    """ Test external lshash module """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    sr = StrictRedis(**config['redis'])
    sr.flushdb()

    lsh = LSHash(6, 8, 1, config)
    for i in xrange(num_elements):
        lsh.index(list(els[i]), el_names[i])
        lsh.index(list(els[i]), el_names[i])  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el[0] in els
            assert el[1] in el_names
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in els
        assert el_name in el_names
        assert el_dist == 0
    del lsh
    sr.flushdb()
def traceLSHash(queryName, hashSize):
    # queryName = "hamming_query_12_3"
    # indices of the trajectories to run the hash query against
    indexList = [14, 249, 479, 689, 899]
    XYMatrix = DateTransform()
    resultList = []
    nearList = []

    lsh = LSHash(hashSize, 44107)
    tid = 1
    for traceList in XYMatrix:
        lsh.index(input_point=traceList, extra_data=tid)
        tid += 1

    resultFile = open(queryName + '.txt', 'w')
    for index in indexList:
        queryList = lsh.query(XYMatrix[index], distance_func="hamming")
        for result in queryList:
            resultStr = str(index + 1) + " : " + str(result[0][1]) + " " + str(result[1]) + "\n"
            nearList.append(result[0][1])
            resultFile.write(resultStr)
        resultList.append(nearList)
        nearList = []
    resultFile.close()

    writeHTML(resultList, queryName, "hashQuerry")
    print resultList
def test_lshash_redis():
    """ Test external lshash module """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    sr = StrictRedis(**config['redis'])
    sr.flushdb()

    lsh = LSHash(6, 8, 1, config)
    for i in xrange(num_elements):
        lsh.index(list(els[i]))
        lsh.index(list(els[i]))  # multiple insertions should be prevented by the library
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert itms.count(itm) == 1  # have multiple insertions been prevented?
            assert el in els
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        el_v, el_dist = res
        assert el_v in els
        assert el_dist == 0
    del lsh
    sr.flushdb()
def test_lshash_extra_val(self):
    lsh = LSHash(self.hash_size, self.input_dim, 1,
                 storage_config={'dict': None})
    for i in range(self.nb_elements):
        lsh.index(list(self.els[i]), self.el_names[i])
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            self.assertIn(el[0], self.els)
            self.assertIn(el[1], self.el_names)
    for el in self.els:
        # res is a list, so we need to select the first entry only
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        self.assertIn(el_v, self.els)
        self.assertIn(el_name, self.el_names)
        self.assertEqual(el_dist, 0)
    del lsh
def k_nn_lsh(k, word, decade_matrix, index_dict):
    index_dict = dict(map(reversed, index_dict.items()))
    num_rows = decade_matrix.get_shape()[0]
    lsh = LSHash(6, num_rows)
    for i in range(num_rows):
        print(i)
        lsh.index(decade_matrix.getrow(i).todense())
    return lsh.query(word)
def build_index(self, X):
    f = X.shape[1]
    n = X.shape[0]

    lsh = LSHash(hash_size=32, input_dim=f, num_hashtables=100)
    for i in range(n):
        lsh.index(X[i], i)

    return lsh
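# Hypothetical usage sketch for build_index() above: index a small random
# matrix and query one of its rows. The enclosing class is not shown, so the
# unused `self` parameter is simply passed as None here; the data is made up.
import numpy as np

X = np.random.rand(50, 16)        # 50 vectors, 16 dimensions
lsh = build_index(None, X)
matches = lsh.query(X[0], num_results=5)
for (vector, row_id), distance in matches:
    print(row_id, distance)      # row 0 should come back with distance 0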
def Mainfunc(self, mat_addr):
    np.set_printoptions(suppress=True, precision=6, threshold=8)
    s = sio.loadmat(mat_addr)
    svec = s['FFE']
    datalen = len(svec)
    n1, n2, n3 = np.shape(svec)
    data = np.zeros((n1, 87212))
    m = 0
    for i in range(n2):
        for j in range(n3):
            if svec[:, i, j].all() != 0:
                data[:, m] = svec[:, i, j]
                m = m + 1
    # print data[:,0]
    dataves = np.transpose(data)
    modelindex = list(set(np.random.randint(1, 87212, size=10000)))
    lsh_model = LSHash(7, n1)
    for jj in modelindex:
        lsh_model.index(dataves[jj, :])

    # if you want to test a program
    starttest = 1  # start test index
    endtest = 5
    testindex = random.sample(modelindex, 1)  # SIZE IS THE NUMBER OF TEST FUNCTIONS
    test = np.zeros((len(testindex), n1))
    for i in range(len(testindex)):
        # print dataves[testindex[i],:]
        test[i, :] = dataves[testindex[i], :]
    # print len(test)

    output = open('result.txt', 'w')
    timee = open('time.txt', 'w')
    for queryi in range(len(testindex)):
        if test[queryi, :].all() != 0:
            starttime = time.time()
            Atemp = lsh_model.query(test[queryi, :], 5, 'cosine')
            print (str(Atemp[0]).split(')')[0]).replace('(', '')
            output.write((str(Atemp[0]).split(')')[0]).replace('(', '') + '\n')
            output.write((str(Atemp[1]).split(')')[0]).replace('(', '') + '\n')
            output.write((str(Atemp[2]).split(')')[0]).replace('(', '') + '\n')
            output.write((str(Atemp[3]).split(')')[0]).replace('(', '') + '\n')
            output.write((str(Atemp[4]).split(')')[0]).replace('(', '') + '\n')
            endtime = time.time()
            timee.write(str(endtime - starttime) + '\n')
            # output.write(A)
            output.write('\n')
    output.close()
    timee.close()
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    f = file(bits_tid_pickle, "rb")
    data = pickle.load(f)
    f.close()

    # These parameters can be tuned; see https://github.com/kayzh/LSHash
    lsh = LSHash(13, 128, num_hashtables=1)
    map(lambda x: lsh.index(np.array([int(tmp) for tmp in x])), data.keys())

    out = file(lsh_pickle, "wb")
    pickle.dump(lsh, out, -1)
    out.close()
def lshSearch(dataBase2, test2, num):
    lsh = LSHash(30, 216)

    def CreateIndex(array):
        for item in array:
            lsh.index(item)

    CreateIndex(dataBase2)
    test2 = test2.reshape((216,))
    res = lsh.query(test2, num, distance_func='true_euclidean')
    return res
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    f = file(bits_tid_pickle, "rb")
    data = pickle.load(f)
    f.close()

    # '10' means the bit binary (github.com/kayzh/LSHash)
    lsh = LSHash(13, 10, num_hashtables=1)
    map(lambda x: lsh.index(np.array([int(tmp) for tmp in x])), data.keys())

    out = file(lsh_pickle, "wb")
    pickle.dump(lsh, out, -1)
    out.close()
def generateSingleHash(X, planesFileName, n_bits=64):
    """
    Generate an n_bits long hash for the input X
    :param X:
    :param n_bits:
    :return:
    """
    import utils

    # overwrite old matrices and build some random new ones
    fileName = os.path.join(utils.lsh_planes_dir, planesFileName + '.npz')
    lsh = LSHash(n_bits, np.shape(X)[0], matrices_filename=fileName, overwrite=False)

    return lsh._hash(lsh.uniform_planes[0], X.tolist())
def init():
    parser = argparse.ArgumentParser(description='Tools for hamming distance-based image retrieval by cuda')
    parser.add_argument('-f', help='The filename of image raw features (SIFT).')
    parser.add_argument('-v', default='fvecs', help='The format of image raw features.')
    parser.add_argument('-s', default='dict', help='The method of indexing storage.')
    parser.add_argument('-d', default='128', help='Dimensions of raw image feature.')
    parser.add_argument('-o', default='0', help='Offset of accessing raw image features.')
    parser.add_argument('-n', default='1', help='Number of raw image features to read.')
    parser.add_argument('-i', default='n', help='Whether to perform indexing step.')
    parser.add_argument('-e', help='The dirname of indexing folder.')
    parser.add_argument('-k', default='10', help='Number of retrieved images.')
    parser.add_argument('-r', default='32', help='Number of dimensions randomly sampled.')
    parser.add_argument('-c', default='n', help='Whether to perform compressing step.')
    parser.add_argument('-q', default='n', help='Whether to sequentially sampling.')
    parser.add_argument('-p', default='n', help='Whether to perform querying in compressed domain.')
    parser.add_argument('-g', default='y', help='GPU mode. default is "yes".')
    parser.add_argument('-l', default='n', help='VLQ base64 mode. Load VLQ base64 encoding compressed dict.')
    parser.add_argument('-b', default='1', help='Expanding level of search buckets.')
    parser.add_argument('-t', default='int32', help='FastDict type (int32, int8, string).')

    args = parser.parse_args()

    d = int(args.d)
    nuse = int(args.n)
    off = int(args.o)
    random_dims = int(args.r)

    random_sampling = True
    if args.q == 'y':
        random_sampling = False

    lsh = LSHash(64, d, random_sampling, args.t, random_dims, 1,
                 storage_config=args.s, matrices_filename='project_plane.npz')

    np_feature_vecs = load_features(args.f, args.v, nuse, d, off)

    if args.c != 'y' and args.i != 'y' and args.e != None and args.s == 'random':
        if args.p == 'y':
            print "loading compressed index."
            lsh.load_compress_index(args.e, (args.l == 'y'))
            print "loading done."
        else:
            print "loading index."
            lsh.load_index(args.e)
            print "loading done."

    print "indexing done. Ready for querying."

    return (lsh, np_feature_vecs, args)
def __init__(self):
    self.lshIndexList = []  # create a list of lsh indexes
    self.lsh = LSHash(NUMBER_OF_BITS_PER_HASH, NUM_TOPICS,
                      num_hashtables=NUMBER_OF_LSH_INDEXES,
                      storage_config={"redis": {"host": "localhost", "port": 6379}})
def plot_similar_tats_idx(idx: int,
                          feature_dict: dict,
                          lsh_variable: LSHash,
                          n_items: int = 6,
                          distance_func: str = 'hamming') -> plt.Figure:
    """Takes an input index for the training set and plots the closest
    matching tattoos to that input tattoo.

    Args:
        idx : index to a tattoo in the training set
        feature_dict : wraps both image locations and feature vectors at the
            output of the cnn before the final layer.
        lsh_variable : trained lsh model to query the input image
        n_items : number of items to return
        distance_func : The distance function. Currently it needs to be one of
            ("hamming", "euclidean", "true_euclidean", "centred_euclidean",
            "cosine", "l1norm"). By default "hamming" will be used.

    Returns:
        Matplotlib grid plot of the index image first and n other similar
        images.
    """
    response = lsh_variable.query(
        feature_dict[list(feature_dict.keys())[idx]].flatten(),
        num_results=n_items + 1,
        distance_func=distance_func)

    return plot_similar_tats_query(response,
                                   n_items=n_items + 1,
                                   distance_func=distance_func)
def init_lsh(args):
    d = int(args.d)
    nuse = int(args.n)
    off = int(args.o)
    random_dims = int(args.r)

    random_sampling = True
    if args.q == 'y':
        random_sampling = False

    lsh = LSHash(64, d, random_sampling, args.t, args.u, args.host,
                 random_dims, 1, storage_config=args.s,
                 matrices_filename='project_plane.npz')

    np_feature_vecs = load_features(args.f, args.v, nuse, d, lsh, args.e, off, args.i)

    return (lsh, np_feature_vecs)
def get_lshash(filename):
    lsh = LSHash(30, 8)
    try:
        with open(filename) as f:
            content = f.readlines()
            content = [x.strip('\n') for x in content]
    except Exception as e:
        print("Cannot find the file.")

    for row in content:
        row = row.split(",")
        row = list(map(int, row))
        tmp = row[:8]
        lsh.index(tmp, str(row[8]))
    return lsh
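# A hypothetical query against the index built by get_lshash(). The CSV layout
# (8 integer feature columns followed by a label in column 9) is inferred from
# row[:8] and row[8] above; the filename and query vector are made up, and the
# loop prints nothing if no indexed row lands in the same bucket as the query.
lsh = get_lshash('vectors.csv')
for (vector, label), distance in lsh.query([3, 1, 4, 1, 5, 9, 2, 6], num_results=3):
    print(label, distance)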
def get_hash2img():
    if os.path.exists(redis_rdb):
        lsh = LSHash(hash_len, 960,
                     storage_config=redis_config,
                     matrices_filename=matrices_file)
        return lsh
    else:
        return create_hash2img()
def knn(data_array, data, hash_size_input, data_shape, vipno_pos, k):
    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])

    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])

    # calculate and output
    result = []
    for res in lsh.query(data_array[:, vipno_pos],
                         num_results=k + 1, distance_func='euclidean'):
        result.append(res[0][1])

    return result[1:]
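# A minimal smoke test for knn() above, under the assumption (taken from how
# the function reads data.columns and data_array[:, col_index]) that each
# DataFrame column is one item and each row is one feature dimension. The
# column labels and random data are made up.
import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.rand(16, 5), columns=[101, 102, 103, 104, 105])
data_array = data.values
neighbours = knn(data_array, data, hash_size_input=6,
                 data_shape=data_array.shape, vipno_pos=2, k=3)
print(neighbours)  # up to k column labels other than the query column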
def lshTOfind(path):
    lsh = LSHash(50, 361)
    f = open('newindex.csv')
    index = csv.reader(f)
    features = []
    count = 0
    for r in index:
        features = [int(float(i)) for i in r[1:]]
        lsh.index(features)
        count += 1
    try:
        f_v = getfeatures(path)
        ans = lsh.query(f_v)
        if ans != []:
            return searchid(int(ans[0][0][360] / 10000))
    except:
        return []
def test_lshash():
    lsh = LSHash(6, 8, 1)
    for i in xrange(num_elements):
        lsh.index(list(els[i]))
        lsh.index(list(els[i]))  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el in els
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # res is a tuple containing the vector and the distance
        el_v, el_dist = res
        assert el_v in els
        assert el_dist == 0
    del lsh
class LshIndexer(Indexer):
    PARAMETERS = {'hash_size': 6,
                  'input_dim': 128,
                  'num_of_hashtables': 1,
                  'storage': {'redis': {'host': 'localhost', 'port': 6379}}}

    def initialize_store(self, parameters):
        self.store = LSHash(parameters['hash_size'],
                            parameters['input_dim'],
                            parameters['num_of_hashtables'],
                            parameters['storage'])

    def index(self, features):
        for feature in features:
            self.store.index(feature.data, feature.file_id)

    def query(self, feature, num_results=5):
        return self.store.query(feature, num_results)
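# A hypothetical driver for LshIndexer. The Indexer base class and the feature
# objects are not shown above, so this sketch assumes the base class can be
# constructed without arguments and stands in a namedtuple with .data and
# .file_id; it also assumes a Redis server is reachable as per PARAMETERS.
from collections import namedtuple
import numpy as np

Feature = namedtuple('Feature', ['data', 'file_id'])

indexer = LshIndexer()
indexer.initialize_store(LshIndexer.PARAMETERS)
features = [Feature(np.random.rand(128).tolist(), 'img_{}.jpg'.format(i))
            for i in range(10)]
indexer.index(features)
print(indexer.query(features[0].data, num_results=3))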
def generateHashes(X, scalar, planesFileName, n_bits=64):
    """
    Generate an n_bits long hash for each input in X
    :param X:
    :param n_bits:
    :return:
    """
    import utils

    # overwrite old matrices and build some random new ones
    fileName = os.path.join(utils.lsh_planes_dir, planesFileName + '.npz')
    lsh = LSHash(n_bits, np.shape(X[0])[0], matrices_filename=fileName, overwrite=False)

    hashValues = []
    for input_point in X:
        input_point = scalar.transform(input_point)
        hashValues.append(lsh._hash(lsh.uniform_planes[0], input_point))

    return hashValues
def classify_nearest_neighbor_lsh(k):
    lsh = LSHash(3, 12)
    labels = load_labels()

    # index the first half of every genre's songs
    for genre, song_genres_ids in labels.groupby('category'):
        print('Indexing genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2)):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id), header=None)
            for val in song.values:
                lsh.index(val, extra_data=genre)

    # classify the second half and compare against the expected genre
    total_count = 0
    match_count = 0
    for genre, song_genres_ids in labels.groupby('category'):
        print('Expected genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2), num_values):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id), header=None)
            genre_freqs = {}
            split_song = np.array_split(song, 5, axis=0)  # Split song into sections
            for s in split_song:
                avg_song_val = np.mean(s)  # Take average of each section
                neighbours = lsh.query(avg_song_val, num_results=k)
                for neighbour in neighbours:
                    # use a separate name so the expected `genre` is not overwritten
                    neighbour_genre = neighbour[0][1]
                    genre_freqs[neighbour_genre] = genre_freqs.get(neighbour_genre, 0) + 1
            actual_genre = max(genre_freqs, key=genre_freqs.get)
            print('Predicted genre: {}'.format(actual_genre))
            total_count += 1
            if genre == actual_genre:
                match_count += 1

    print('Matched {} out of {} songs: {}%'.format(
        match_count, total_count, (match_count / total_count) * 100))
def learn(routes):
    global global_training_route
    global next_hop_index

    extra_data_len = 2  # destination, next_hop
    ndims = len(routes[0]) - extra_data_len  # Number of dimensions
    hash_length = len(routes[0]) * 2  # arbitrarily chosen hash_length
    next_hop_index = len(routes[0]) - 1  # NextHop index at the last

    for i in range(0, len(routes) - 1):
        if routes[i][next_hop_index] >= routes[i + 1][next_hop_index]:
            routes[i][next_hop_index] = i + 1
        else:
            routes[i][next_hop_index] = -1

    global_training_route = routes
    lsh = LSHash(hash_length, ndims)
    for entry in routes:
        lsh.index(entry[:-extra_data_len], extra_data=entry[-extra_data_len:])
    return lsh
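# A hypothetical lookup against the index returned by learn(). The route layout
# (feature fields followed by destination and next hop) mirrors the assumptions
# made inside learn(); the sample values below are made up.
routes = [
    [10, 1, 4, 0, 192, 2],   # ...feature fields..., destination, next_hop
    [10, 2, 4, 1, 193, 3],
    [11, 1, 5, 0, 194, 1],
]
lsh = learn([list(r) for r in routes])
matches = lsh.query(routes[0][:-2], num_results=1)
(vector, extra), distance = matches[0]
destination, next_hop = extra
print(destination, next_hop, distance)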
def create_feature(train_image_dir, classes, net):
    list_feature = list()
    image_paths = list()

    ## Locality Sensitive Hashing
    k = 10  # hash size
    L = 5   # number of tables
    d = 58  # Dimension of Feature vector
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)

    for each_object in classes:
        each_object_path = os.path.join(train_image_dir, each_object)
        list_img = next(os.walk(each_object_path))[2]
        print("hashing class: ", each_object, " which has: ", len(list_img))
        for img in list_img:
            image_path = os.path.join(each_object_path, img)
            feature = get_feature_single_img(net, image_path)
            image_paths.append(image_path)
            list_feature.append(feature)
            lsh.index(feature, extra_data=image_path)

    return lsh, image_paths, list_feature
def index_room():
    '''
    Index image features with the LSH algorithm.
    :return: None
    '''
    files = glob("./data/features/*.csv")
    files_ids = [filename.split("\\")[-1].replace(".csv", "") for filename in files]

    X = np.load("data/train.npy")
    X = X.reshape(X.shape[0], -1)
    encoder = load_model("data/encoder.h5")

    dimension = 100
    lsh_hash = LSHash(hash_size=32, input_dim=dimension)
    compress_feature = encoder.predict(X)
    for num, ele in enumerate(compress_feature.tolist()):
        lsh_hash.index(ele, extra_data=files_ids[num])

    with open("data/lsh.pkl", "wb") as fh:
        pickle.dump(lsh_hash, fh)
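# A hypothetical companion to index_room(): reload the pickled index and look
# up the nearest stored items for one encoded vector. The file names mirror
# those used above; the keras-style load_model import is an assumption based
# on the call in index_room().
import pickle
import numpy as np
from keras.models import load_model

with open("data/lsh.pkl", "rb") as fh:
    lsh_hash = pickle.load(fh)

encoder = load_model("data/encoder.h5")
X = np.load("data/train.npy")
X = X.reshape(X.shape[0], -1)

query_vec = encoder.predict(X[:1])[0]
for (vector, file_id), distance in lsh_hash.query(query_vec, num_results=5):
    print(file_id, distance)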
def test_lshash(self):
    lsh = LSHash(self.hash_size, self.input_dim, 1)
    for i in range(self.nb_elements):
        lsh.index(list(self.els[i]))
        lsh.index(list(self.els[i]))  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        self.assertEqual(itms.count(itm), 1)
        for el in itm:
            self.assertIn(el, self.els)
    for el in self.els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # res is a tuple containing the vector and the distance
        el_v, el_dist = res
        self.assertIn(el_v, self.els)
        self.assertEqual(el_dist, 0)
    del lsh
def detect_subevent(filename):
    dictionaryFile = filename + ".dict"
    corpusFile = filename + ".mm"
    outputFile = filename + ".out"
    outputVector = []
    tempDict = {}
    outputdict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, dictionary.__len__())
    index = 0
    for index in range(len(corpus)):
        #print str(index)+",",
        #print corpus[index]
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        #print getSparseVector(denseVector)
        result = lsh.query(denseVector, num_results=50, distance_func="euclidean")
        #print result
        # no similar tweets
        if result == []:
            outputdict[index] = []
            tempDict[getSparseVector(denseVector)] = index
            lsh.index(denseVector)
            #continue
        else:
            for r in result:
                if outputdict.has_key(tempDict[getSparseVector(r[0])]):
                    outputdict[tempDict[getSparseVector(r[0])]].append(index)
                    break
    #print outputdict
    with open(outputFile, 'w') as out:
        for key in outputdict.iterkeys():
            line = str(key)
            for i in outputdict[key]:
                line += ", " + str(i)
            out.write(line + "\n")
    print "Please check the output file:", outputFile
def detect_subevent(filename):
    dictionaryFile = filename + ".dict"
    corpusFile = filename + ".mm"
    outputFile = filename + ".out"
    outputVector = []
    tempDict = {}
    outputdict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, dictionary.__len__())
    index = 0
    count = 0
    for index in range(len(corpus)):
        #print str(index)+",",
        #print corpus[index]
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        #print getSparseVector(denseVector)
        result = lsh.query(denseVector, num_results=5, distance_func="cosine")
        #print result
        # no similar tweets
        count += 1
        if result == []:
            outputdict[index] = []
            tempDict[getSparseVector(denseVector)] = index
            lsh.index(denseVector)
            #continue
        else:
            for r in result:
                if outputdict.has_key(tempDict[getSparseVector(r[0])]):
                    outputdict[tempDict[getSparseVector(r[0])]].append(index)
                    break
    #print count,
    #print outputdict
    with open(outputFile, 'w') as out:
        for key in outputdict.iterkeys():
            line = str(key)
            for i in outputdict[key]:
                line += ", " + str(i)
            out.write(line + "\n")
    print "Please check the output file:", outputFile
def getBuckets(fromFile):
    global nameDict
    global lsh
    nameDict = {}
    lsh = LSHash(bWidth, 26, num_hashtables=num_ht)
    if fromFile:
        f = open(datafile, 'r')
        nameList = f.readlines()
    else:
        nameList = surnames.dic.keys()
    for l in nameList:
        name = l.split(" ")[0].strip()
        nameArr = getvec(name)
        arrStr = toStr(nameArr)
        if arrStr in nameDict:
            nameDict[arrStr].append(name)
        else:
            nameDict[arrStr] = [name]
    for k in nameDict.keys():
        lsh.index(toArr(k))
def test_lshash_extra_val():
    lsh = LSHash(6, 8, 1)
    for i in xrange(num_elements):
        lsh.index(list(els[i]), el_names[i])
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert el[0] in els
            assert el[1] in el_names
    for el in els:
        # res is a list, so we need to select the first entry only
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in els
        assert el_name in el_names
        assert el_dist == 0
    del lsh
def lshTOfind(path):
    lsh = LSHash(10, 360)
    f = open('copyindex.csv')
    index = csv.reader(f)
    features = []
    count = 0
    for r in index:
        features = [float(i) for i in r[1:]]
        lsh.index(features[:360], features[360])
        count += 1
    try:
        f_v = getfeatures(path)
        #print f_v
        ans = lsh.query(f_v[:360], 15)
        if ans != []:
            res = []
            for i in ans:
                res.append(int(i[0][1] / 10000))
            return res  # searchid(int(ans[0][0][360]/10000))
    except:
        return []
def subEventDetection(dictionaryFile, corpusFile, outputFile):
    outputVector = []
    tempDict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, dictionary.__len__())
    index = 0
    for index in range(len(corpus)):
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        result = lsh.query(denseVector, num_results=50, distance_func="cosine")
        # no similar tweets
        if result == []:
            outputVector.append([index])
            continue
        assigned = False
        for vector in result:
            if getDistance(vector, denseVector) == True:
                for ind in range(len(outputVector)):
                    done = False
                    for tweetNo in outputVector[ind]:
                        if tweetNo == tempDict[vector]:
                            outputVector[ind].append(index)
                            done = True
                            break
                    if done == True:
                        break
                assigned = True
                break
        if assigned == False:
            outputVector.append([index])
        lsh.index(denseVector)
        tempDict[tuple(denseVector)] = index
    with open(outputFile, 'w') as out:
        for vector in outputVector:
            line = ""
            for index in vector:
                line += ", " + str(index)
            out.write(line[2:] + "\n")
    print "Please check the output file:", outputFile
class Searcher:
    _DIST_FUNCTIONS = ["hamming", "euclidean", "true_euclidean",
                       "centred_euclidean", "cosine", "l1norm"]
    index = None

    def __init__(self, dataset):
        self.create_index(dataset)

    def create_index(self, items, hash_size=6):
        input_dim = len(items.values()[0])
        self.index = LSHash(hash_size, input_dim)
        for key in items:
            self.index.index(items[key], extra_data=key)
        return True

    def query(self, query_item, num_results=10, distance_function='cosine'):
        if distance_function not in self._DIST_FUNCTIONS:
            raise Exception("{0} not supported".format(distance_function))
        results = self.index.query(query_item, num_results=num_results,
                                   distance_func=distance_function)
        return self.parse_results(results)

    def parse_results(self, results):
        return {x[0][1]: x[1] for x in results}
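# A small, hypothetical driver for Searcher. The keys and vectors are made up;
# note that create_index() subscripts items.values(), which assumes a Python 2
# dict (under Python 3 it would need an explicit list(...) wrapper).
import numpy as np

dataset = {'doc_%d' % i: np.random.rand(32).tolist() for i in range(20)}
searcher = Searcher(dataset)
matches = searcher.query(dataset['doc_0'], num_results=5)
print(matches)  # mapping of extra_data key -> distance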
def eventIdentification(dictionaryFile, corpusFile, outputFile):
    outputVector = []
    tempDict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    #print "Unique Tokens:", dictionary.__len__()

    lsh = LSHash(20, dictionary.__len__())
    index = 0
    for index in range(len(corpus)):
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        result = lsh.query(denseVector)
        #print denseVector

        # no similar tweets
        if result == []:
            #print "No Similar Tweets for: ", index
            tempDict[tuple(denseVector)] = len(outputVector)
            outputVector.append([index])
            lsh.index(denseVector)
            continue

        assigned = False
        for vector in result:
            if getDistance(vector, denseVector) == True:
                ev = tempDict[tuple(vector[0])]
                outputVector[ev].append(index)
                tempDict[tuple(denseVector)] = ev
                #for ind in range(len(outputVector)):
                    #done = False
                    #for tweetNo in outputVector[ind]:
                        #if (tweetNo == tempDict[tuple(vector[0])]):
                            #outputVector[ind].append(index)
                            #done = True
                            #break
                    #if done == True:
                        #break
                assigned = True
                break

        if assigned == False:
            tempDict[tuple(denseVector)] = len(outputVector)
            outputVector.append([index])
            lsh.index(denseVector)

    with open(outputFile, 'w') as out:
        for vector in outputVector:
            line = ""
            for index in vector:
                line += "," + str(index)
            out.write(line[1:] + "\n")

    del outputVector
    del tempDict
def test():
    import utils

    trueIds, testSet = utils.load_test_set('fc7', 'raw', 0)

    lsh = LSHash(128, np.shape(testSet[0])[0],
                 matrices_filename='lsh_planes.data.npz', overwrite=True)

    for idx, input_point in enumerate(testSet):
        hashValue = lsh._hash(lsh.uniform_planes[0], input_point.tolist())
        print hashValue
        lsh.index(input_point, idx)

    print lsh.query(testSet[3], 3)
    return None
def initialize_store(self, parameters):
    self.store = LSHash(parameters['hash_size'],
                        parameters['input_dim'],
                        parameters['num_of_hashtables'],
                        parameters['storage'])
def run():
    initial = True
    size = 2000
    tweet_ids = []
    tweet_text = []
    counter = 0
    num_hashtables = 13   ## recompute the random vectors if this is changed
    dimension = 50000     ## recompute the random vectors if this is changed
    hash_size = 13        ## length of the LSHash of the tweets
    bucket_size = 100     ## size of the queue for each hash in the hash tables
    comparisons = 50      ## upper bound on the number of comparisons (dot product) to find the nearest neighbor
    cos_threshold = .7    ## threshold for the similarity of two tweets

    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df=1, smooth_idf=True,
                                       stop_words='english',
                                       min_dict_size=dimension)
    ## initialize the hash tables, specify the hash size, number of hash tables and the queue size
    lsh = LSHash(hash_size=hash_size, input_dim=dimension,
                 num_hashtables=num_hashtables, max_queue_size=bucket_size)

    clusters = {}     ## maintain the clusters
    num_clusters = 0
    inv_index = {}    ## inverse mapping from tweet_id to clusters
    Y = None
    Y1 = None
    f_d = open("output.txt", 'w')

    loc = "/Users/dilpreet/Documents/mtp_documents/markedData/data/"
    for root, dirs, filenames in os.walk(loc):
        for f in filenames:
            with open(loc + f) as infile:
                for line in infile:
                    ## load 2000 tweets at a time
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['text'])
                    counter = counter + 1
                    t2 = 0
                    if counter % size == 0:
                        t1 = time.clock()

                        ## X contains the tf-idf scores of the tweets in the "sparse row matrix" format
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        print X.get_shape()
                        print len(vectorizer.vocabulary_)

                        ## if the total number of keywords exceeds the pre-specified dimension, raise an error
                        if X.get_shape()[0] > dimension:
                            print X.get_shape()
                            print "dimension exceeded"
                            raise

                        for i in range(X.get_shape()[0]):
                            temp_tweet = X.getrow(i)

                            ## query for the nearest neighbor from the lshash tables
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
                            c = 2
                            scase = False

                            ## if the nearest neighbor is not null and the cosine distance is below the
                            ## threshold, add the tweet to the respective cluster
                            if nn is not None:
                                ((a, b), c) = nn
                                if c <= cos_threshold:
                                    inv_index[tweet_ids[i]] = inv_index[b]
                                    clusters.setdefault(inv_index[b], []).append(tweet_ids[i])
                                #else:
                                #    scase = True

                            ## else, linearly search through the previous 2000 + i tweets to find the nearest neighbor
                            """ code to linearly search through the tweets """
                            if (c > cos_threshold or nn is None or scase):
                                searchY = False
                                if (i == 0 and not initial):
                                    searchY = True
                                if (i == 0 and initial):
                                    inv_index[tweet_ids[i]] = num_clusters
                                    clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                    num_clusters = num_clusters + 1
                                if (i != 0):
                                    Z = X[:i]
                                    #print temp_tweet.shape
                                    t2 = temp_tweet.transpose()
                                    #print i
                                    a1 = Z.dot(t2).toarray()
                                    a2 = Z.multiply(Z).sum(axis=1)
                                    a3 = sp.csr_matrix(t2.multiply(t2).sum()).toarray()
                                    a2 = sp.csc_matrix(a2).toarray()
                                    b = [j for j in range(Z.shape[0])]
                                    a = min(b, key=lambda x: 1 - float(a1[x][0]) / ((a2[x][0] + a3[0][0]) ** .5))
                                    #a = min(Z, key = lambda x: cosine_dist(x[0], temp_tweet))
                                    #print a
                                    t3 = tweet_ids[a]
                                    if (1 - float(a1[a][0]) / ((a2[a][0] + a3[0][0]) ** .5)) > cos_threshold:
                                        if not initial and i != size - 1:
                                            searchY = True
                                        else:
                                            inv_index[tweet_ids[i]] = num_clusters
                                            clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                            num_clusters = num_clusters + 1
                                    else:
                                        inv_index[tweet_ids[i]] = inv_index[t3]
                                        clusters.setdefault(inv_index[t3], []).append(tweet_ids[i])
                                if searchY == True:
                                    Z = Y[i:]
                                    t2 = temp_tweet.transpose()
                                    #print i
                                    a1 = Z.dot(t2).toarray()
                                    a2 = Z.multiply(Z).sum(axis=1)
                                    a3 = sp.csr_matrix(t2.multiply(t2).sum()).toarray()
                                    a2 = sp.csc_matrix(a2).toarray()
                                    b1 = [j for j in range(Z.shape[0])]
                                    a = min(b1, key=lambda x: 1 - float(a1[x][0]) / ((a2[x][0] + a3[0][0]) ** .5))
                                    t3 = Y1[a + i]
                                    if (1 - float(a1[a][0]) / ((a2[a][0] + a3[0][0]) ** .5)) < cos_threshold:
                                        inv_index[tweet_ids[i]] = inv_index[t3]
                                    else:
                                        inv_index[tweet_ids[i]] = num_clusters
                                        clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                        num_clusters = num_clusters + 1

                            ### index the tweet into the hash tables
                            lsh.index(input_point=temp_tweet, extra_data=tweet_ids[i])

                        initial = False
                        Y = X
                        Y1 = tweet_ids[:]
                        tweet_ids = []
                        tweet_text = []
                        print counter
                        print time.clock() - t1
                        f2 = open('time.txt', 'a')
                        f2.write(str(time.clock() - t1) + '\n')
                        f2.close()
                        if counter % 10000 == 0:
                            f2 = open('result.txt', 'a')
                            f2.write(json.dumps(clusters) + "\n")
                            f3 = open('vocab.txt', 'a')
                            f4 = open('vectorizer.txt', 'a')
                            f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
                            f4.write(json.dumps(vectorizer.idf_) + "\n")
                            #print clusters
                            #print vectorizer.vocabulary_
                            f2.close()
                            f3.close()
                            f4.close()

    f2 = open('result.txt', 'w')
    f2.write(json.dumps(clusters) + "\n")
    f3 = open('vocab.txt', 'w')
    f4 = open('vectorizer.txt', 'w')
    f5 = open('inv_index.txt', 'w')
    f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
    f4.write(json.dumps(vectorizer.idf_) + "\n")
    f5.write(json.dumps(inv_index))
    #print clusters
    #print vectorizer.vocabulary_
    f2.close()
    f3.close()
    f4.close()
    f5.close()
numberRadius = 55
usedDataset = 'CVRR_dataset_trajectory_clustering\i5sim3.mat'

runtime = str(dt.datetime.now().timetuple()[1]) + str(dt.datetime.now().timetuple()[2]) + \
          str(dt.datetime.now().timetuple()[3]) + str(dt.datetime.now().timetuple()[4])
fileContainer = open('Pointwise LSH Classification Experiment (' + usedDataset[35:-4] + ') at ' + runtime +
                     ' HFs_' + str(numberHFs) + '_R_' + str(numberRadius), 'a')
fileContainer.write('\n')
fileContainer.write('Welcome to our experiment : ')
fileContainer.write(str('\nSTARTING TIME : ' + time.asctime(time.localtime(time.time()))))
print str('\nSTARTING TIME : ' + time.asctime(time.localtime(time.time())))
fileContainer.write('\n')
fileContainer.write('The description needed for each result will be provided accordingly .....')
fileContainer.write('\n')
fileContainer.write('The used Dataset is : ' + usedDataset)
fileContainer.write('\n')

print '\nStarting LSH initialization ...'
fileContainer.write(str('\nTime before LSH initialization : ' + time.asctime(time.localtime(time.time()))))
newLsh = LSHash(numberHFs, dimensionNumber, num_hashtables=1)
fileContainer.write(str('\nTime after LSH initialization : ' + time.asctime(time.localtime(time.time()))))

print '\nStarting loading the trajectory dataset ...'
fileContainer.write(str('\nTime before loading the trajectory dataset : ' + time.asctime(time.localtime(time.time()))))

#------------------------------------------------------------------------------
# The Trajectory dataset - I5SIM3
#------------------------------------------------------------------------------
mat = scipy.io.loadmat(usedDataset)
datasetSize = len(mat.values()[0])
trajectoriesContainer = []
for i in range(datasetSize):
    trajectoriesContainer.append([(mat.values()[0][i][0][0][j], mat.values()[0][i][0][1][j])
                                  for j in range(len(mat.values()[0][i][0][0]))])
allPoints = []
fileContainer.write(str('\nTime after loading trajectory dataset : ' + time.asctime(time.localtime(time.time()))))
#------------------------------------------------------------------------------
    pv = pv - np.median(pv)
    #filtered = del_outlier_pitches(pv.copy())
    #adjusted = filtered - filtered.mean()
    pv = note_segment(pv)
    for x in sliding_window(pv, ws=10):
        note = x
        loc = np.isnan(note)
        note[loc] = 0
        yield note, file


if __name__ == '__main__':
    from lshash import LSHash

    hash_len = 10
    dm = 10
    lsh = LSHash(hash_len, dm)

    mid1 = '00001.mid'
    mid2 = '00002.mid'
    mid3 = '00003.mid'
    mid4 = '00004.mid'
    mid5 = '00005.mid'
    mid6 = '00006.mid'
    mid7 = '00007.mid'
    mid8 = '00008.mid'
    mid9 = '00009.mid'
    mid10 = '00010.mid'
    mid11 = '00011.mid'
    mid12 = '00012.mid'
    mid13 = '00013.mid'
    mid14 = '00014.mid'
from lshash import LSHash

lsh = LSHash(hash_size=6, input_dim=8, num_hashtables=1,
             storage_config={"lmdb": {'path': '/Users/christianburger/Downloads/testlmdb'}})

lsh.index([1, 2, 3, 4, 5, 6, 7, 8], 'a')
lsh.index([2, 3, 4, 5, 6, 7, 8, 9], 'b')
lsh.index([10, 12, 99, 1, 5, 31, 2, 3], 'c')

print lsh.query([1, 2, 3, 4, 5, 6, 7, 7])
from lshash import LSHash
import numpy as np

s = LSHash(10, 8)
s.index([1, 2, 3, 4, 5, 6, 7, 8])
print s.hash_tables[0].keys()[0]
from __future__ import print_function
from __future__ import division

from scipy.spatial.distance import cosine
from tqdm import tqdm
import numpy
from lshash import LSHash
import time

start = time.time()
lsh = LSHash(8, 300)
sample_word_embeds = []
for i in tqdm(xrange(20000)):
    word_embed = numpy.random.rand(300)
    lsh.index(word_embed)
    if i % 500 == 0:
        sample_word_embeds.append(word_embed)
print("Indexing takes {} seconds".format(time.time() - start))

start = time.time()
for word_embed in sample_word_embeds:
    print('-' * 80)
    results = lsh.query(word_embed, num_results=None, distance_func='cosine')
    print("Num result: {}".format(len(results)))
    print('Nearest neighbor cosine distance:')
    print(" {} | {}".format(results[1][1], cosine(results[1][0], word_embed)))
print('Query takes average {} seconds'.format(
    (time.time() - start) / len(sample_word_embeds)))
# numberHFs = 25
# numberRadius = 151
usedDataset = 'CVRR_dataset_trajectory_clustering\labomni.mat'

fileContainer = open('Pointwise LSH Clustering SegDiff' + str(segment) + 'Overlap Experiment (' + usedDataset[35:-4] + ')', 'a')
fileContainer.write('\n')
fileContainer.write('Welcome to our experiment : -Segmentation more segments with Overlapping-' +
                    ' HFs_' + str(numberHFs) + '_R_' + str(numberRadius))
fileContainer.write(str('\nSTARTING TIME : ' + time.asctime(time.localtime(time.time()))))
print str('\nSTARTING TIME : ' + time.asctime(time.localtime(time.time())))
fileContainer.write('\n')
fileContainer.write('The description needed for each result will be provided accordingly .....')
fileContainer.write('\n')
fileContainer.write('The used Dataset is : ' + usedDataset)
fileContainer.write('\n')

print '\nStarting LSH initialization ...'
fileContainer.write(str('\nTime before LSH initialization : ' + time.asctime(time.localtime(time.time()))))
newLsh = LSHash(numberHFs, dimensionNumber, num_hashtables=1)
fileContainer.write(str('\nTime after LSH initialization : ' + time.asctime(time.localtime(time.time()))))

print '\nStarting loading the trajectory dataset ...'
fileContainer.write(str('\nTime before loading the trajectory dataset : ' + time.asctime(time.localtime(time.time()))))

#------------------------------------------------------------------------------
# The Trajectory dataset - LABOMNI
#------------------------------------------------------------------------------
mat = scipy.io.loadmat(usedDataset)
datasetSize = len(mat.values()[0])
trajectoriesContainer = []
for i in range(datasetSize):
    trajectoriesContainer.append([(mat.values()[0][i][0][0][j], mat.values()[0][i][0][1][j])
                                  for j in range(len(mat.values()[0][i][0][0]))])
allPoints = []
fileContainer.write(str('\nTime after loading trajectory dataset : ' + time.asctime(time.localtime(time.time()))))
#------------------------------------------------------------------------------
numberHFs = 25
numberRadius = 251
usedDataset = 'CVRR_dataset_trajectory_clustering/cross.mat'

fileContainer = open('Pointwise LSH Clustering Experiment -ROUND-100- (' + usedDataset[35:-4] + ') HFs_' +
                     str(numberHFs) + '_R_' + str(numberRadius), 'a')
fileContainer.write('\n')
fileContainer.write('Welcome to our experiment : ')
fileContainer.write(str('\nSTARTING TIME : ' + time.asctime(time.localtime(time.time()))))
print str('\nSTARTING TIME : ' + time.asctime(time.localtime(time.time())))
fileContainer.write('\n')
fileContainer.write('The description needed for each result will be provided accordingly .....')
fileContainer.write('\n')
fileContainer.write('The used Dataset is : ' + usedDataset)
fileContainer.write('\n')

print '\nStarting LSH initialization ...'
fileContainer.write(str('\nTime before LSH initialization : ' + time.asctime(time.localtime(time.time()))))
newLsh = LSHash(numberHFs, dimensionNumber, num_hashtables=1)
fileContainer.write(str('\nTime after LSH initialization : ' + time.asctime(time.localtime(time.time()))))

print '\nStarting loading the trajectory dataset ...'
fileContainer.write(str('\nTime before loading the trajectory dataset : ' + time.asctime(time.localtime(time.time()))))

#------------------------------------------------------------------------------
# The Trajectory dataset - cross
#------------------------------------------------------------------------------
mat = scipy.io.loadmat(usedDataset)
datasetSize = len(mat.values()[0])
trajectoriesContainer = []
for i in range(datasetSize):
    trajectoriesContainer.append([(mat.values()[0][i][0][0][j], mat.values()[0][i][0][1][j])
                                  for j in range(len(mat.values()[0][i][0][0]))])
allPoints = []
fileContainer.write(str('\nTime after loading trajectory dataset : ' + time.asctime(time.localtime(time.time()))))
#------------------------------------------------------------------------------
def main():
    parser = argparse.ArgumentParser(description='Tools for hamming distance-based image retrieval by cuda')
    parser.add_argument('-f', help='The filename of image raw features (SIFT).')
    parser.add_argument('-v', default='fvecs', help='The format of image raw features.')
    parser.add_argument('-s', default='dict', help='The method of indexing storage.')
    parser.add_argument('-d', default='128', help='Dimensions of raw image feature.')
    parser.add_argument('-o', default='0', help='Offset of accessing raw image features.')
    parser.add_argument('-n', default='1', help='Number of raw image features to read.')
    parser.add_argument('-i', default='n', help='Whether to perform indexing step.')
    parser.add_argument('-e', help='The dirname of indexing folder.')
    parser.add_argument('-k', default='10', help='Number of retrieved images.')
    parser.add_argument('-r', default='32', help='Number of dimensions randomly sampled.')
    parser.add_argument('-c', default='n', help='Whether to perform compressing step.')
    parser.add_argument('-q', default='n', help='Whether to sequentially sampling.')
    parser.add_argument('-p', default='n', help='Whether to perform querying in compressed domain.')
    parser.add_argument('-g', default='y', help='GPU mode. default is "yes".')
    parser.add_argument('-l', default='n', help='VLQ base64 mode. Load VLQ base64 encoding compressed dict.')
    parser.add_argument('-b', default='1', help='Expanding level of search buckets.')
    parser.add_argument('-t', default='int32', help='FastDict type (int32, int8, string).')
    parser.add_argument('-u', default='local', help='CUDA client type (local, net).')
    parser.add_argument('-host', default='localhost', help='CUDA server address.')

    args = parser.parse_args()

    d = int(args.d)
    nuse = int(args.n)
    off = int(args.o)
    random_dims = int(args.r)

    random_sampling = True
    if args.q == 'y':
        random_sampling = False

    lsh = LSHash(64, d, random_sampling, args.t, args.u, args.host, random_dims, 1,
                 storage_config=args.s, matrices_filename='project_plane.npz')

    np_feature_vecs = load_features(args.f, args.v, nuse, d, lsh, args.e, off, args.i)

    if args.c == 'y':
        if args.e != None and args.s == 'random':
            lsh.load_index(args.e)
            print "compressing index..."
            lsh.compress_index(args.e)
            print "compressing done."
        else:
            print "Please specify generated indexing file."
            sys.exit(0)

    if args.c != 'y' and args.i != 'y' and args.e != None and args.s == 'random':
        if args.p == 'y':
            print "loading compressed index."
            lsh.load_compress_index(args.e, (args.l == 'y'))
            print "loading done."
        else:
            print "loading index."
            lsh.load_index(args.e)
            print "loading done."

    if args.p != 'y':
        retrived = lsh.query(np_feature_vecs[1], num_results=int(args.k),
                             expand_level=int(args.b), distance_func='hamming')
    else:
        retrived = lsh.query_in_compressed_domain(np_feature_vecs[1], num_results=int(args.k),
                                                  expand_level=int(args.b), distance_func='hamming',
                                                  gpu_mode=args.g, vlq_mode=args.l)
    print retrived
start_rowx = 1
#First_cs = ord(u'\u4e00')
#Last_cs = ord(u'\ufaff')
jieba.initialize()
Index = dict(zip(jieba.FREQ.keys(), range(len(jieba.FREQ))))
dim = len(Index) + 1  # -1 for excluded

data = xlrd.open_workbook(fname)
sht = data.sheet_by_name(shtname)
head = sht.row_values(0)
tweets = sht.col_values(head.index(target), start_rowx)

hash_size = int(np.ceil(np.log2(len(tweets))))
print 'hash_size: %d, dim: %d' % (hash_size, dim)
lsh = LSHash(hash_size, dim)

for tweet in tweets:
    x = spar.csr_matrix((1, dim), dtype=np.int8)
    # x = np.zeros(dim, np.bool8)
    ws = jieba.cut(tweet)
    try:
        for w in ws:
            x[Index.get(w, -1)] = 1
        lsh.index(x)
    except Exception, e:
        print e
        print tweet

sent = True
while sent:
def create_hash2img():
    img2gist = get_img2gist()
    lsh = LSHash(hash_len, 960,
                 storage_config=redis_config,
                 matrices_filename=matrices_file)
    count = 0
    total_num = len(img2gist)
    for name, gist_v in img2gist.iteritems():
        count += 1
        lsh.index(gist_v, name)
        sys.stdout.write('%d/%d\r ' % (count, total_num))
        sys.stdout.flush()

    print 'bucket ratio: %d/%d' % (len(lsh.hash_tables[0].keys()), 2 ** hash_len)
    return lsh