def test_lshash_redis(self):
    """Index every element twice with Redis storage, then check storage and lookup.

    The storage config points LSHash at localhost:6379, database 15.
    """
    redis_config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    lsh = LSHash(self.hash_size, self.input_dim, 1, redis_config)

    # Each vector is submitted twice; the library is expected to keep one copy.
    for position in range(self.nb_elements):
        for _ in range(2):
            lsh.index(list(self.els[position]))

    table = lsh.hash_tables[0]
    buckets = [table.get_list(key) for key in table.keys()]
    for bucket in buckets:
        for stored in bucket:
            # Have multiple insertions been prevented?
            assert buckets.count(bucket) == 1
            assert stored in self.els

    # Querying with an indexed vector must return that vector at distance 0.
    for original in self.els:
        hits = lsh.query(list(original), num_results=1,
                         distance_func='euclidean')
        found_vector, found_distance = hits[0]
        assert found_vector in self.els
        assert found_distance == 0
    del lsh
def create_feature(list_author, net):
    """Build an LSH index over the features of every example image.

    For each key of ``list_author`` the folder ``example_image_dir/<key>`` is
    listed, every entry is passed to ``predict_author_single_img`` and the
    returned feature is indexed with the image path as extra data.  The
    finished index is also pickled to ``lsh.p`` in the working directory.

    Args:
        list_author: mapping whose keys are sub-folder names of
            ``example_image_dir``.
        net: model object handed through to ``predict_author_single_img``.

    Returns:
        Tuple ``(lsh, image_paths, list_feature)``; the two lists are
        parallel, in indexing order.
    """
    global example_image_dir
    list_feature = []
    image_paths = []

    # Locality Sensitive Hashing parameters
    k = 10  # hash size
    L = 5  # number of tables
    d = 58  # dimension of the feature vector
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)

    for subfolder in list_author.keys():
        subfolder_path = os.path.join(example_image_dir, subfolder)
        for img in os.listdir(subfolder_path):
            image_path = os.path.join(subfolder_path, img)
            # Only the feature vector is used; the predicted author and the
            # confidence returned alongside it are ignored here.
            _, _, feature = predict_author_single_img(net, image_path)
            image_paths.append(image_path)
            list_feature.append(feature)
            lsh.index(feature, extra_data=image_path)

    # Context manager so the pickle file is flushed and closed deterministically.
    with open('lsh.p', "wb") as handle:
        pickle.dump(lsh, handle)
    return lsh, image_paths, list_feature
def knn(data_array, data, hash_size_input, data_shape):
    """Index every column of ``data_array`` and print neighbours of a random one.

    Columns are indexed with the matching ``data.columns`` label as extra
    data.  For k = 1..5 the k + 1 nearest columns (euclidean) of one randomly
    chosen column are queried and the labels of all hits except the first
    are printed.
    """
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])

    n_columns = data_shape[1]
    for col_index in range(n_columns):
        lsh.index(data_array[:, col_index],
                  extra_data=data.columns[col_index])

    # NOTE(review): if `rd` is the stdlib `random` module, randint includes
    # the upper bound and can select a column that does not exist - confirm.
    vipno_pos = rd.randint(0, data_shape[1])

    for k in (1, 2, 3, 4, 5):
        print('hash size: %d' % hash_size_input)
        print('value k: %d' % k)
        print('target vipno: %d' % data.columns[vipno_pos])
        neighbours = lsh.query(data_array[:, vipno_pos], num_results=k + 1,
                               distance_func='euclidean')
        result = [hit[0][1] for hit in neighbours]
        print('results: ')
        print(result[1:])
def test_lshash_redis():
    """Index vectors twice in a Redis-backed LSHash and check storage and lookup.

    The Redis database named in the config (localhost:6379, db 15) is flushed
    before and after the checks.
    """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    redis_client = StrictRedis(**config['redis'])
    redis_client.flushdb()

    lsh = LSHash(6, 8, 1, config)
    # Each vector is submitted twice; the library should prevent the duplicate.
    for position in range(num_elements):
        for _ in range(2):
            lsh.index(list(els[position]))

    table = lsh.hash_tables[0]
    buckets = [table.get_list(key) for key in table.keys()]
    for bucket in buckets:
        for stored in bucket:
            # Have multiple insertions been prevented?
            assert buckets.count(bucket) == 1
            assert stored in els

    for original in els:
        hits = lsh.query(list(original), num_results=1,
                         distance_func='euclidean')
        found_vector, found_distance = hits[0]
        assert found_vector in els
        assert found_distance == 0
    del lsh
    redis_client.flushdb()
def test_lshash_redis_extra_val():
    """Check Redis-backed indexing when every vector carries a name as extra data.

    The Redis database named in the config (localhost:6379, db 15) is flushed
    before and after the checks.
    """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    redis_client = StrictRedis(**config['redis'])
    redis_client.flushdb()

    lsh = LSHash(6, 8, 1, config)
    # Multiple insertions of the same (vector, name) pair.
    for position in range(num_elements):
        for _ in range(2):
            lsh.index(list(els[position]), el_names[position])

    table = lsh.hash_tables[0]
    buckets = [table.get_list(key) for key in table.keys()]
    for bucket in buckets:
        assert buckets.count(bucket) == 1
        for stored in bucket:
            assert stored[0] in els
            assert stored[1] in el_names

    for original in els:
        hit = lsh.query(list(original), num_results=1,
                        distance_func='euclidean')[0]
        # hit[0] holds (vector, name); hit[1] holds the distance.
        (found_vector, found_name), found_distance = hit[0], hit[1]
        assert found_vector in els
        assert found_name in el_names
        assert found_distance == 0
    del lsh
    redis_client.flushdb()
def test_lshash_extra_val(self):
    """Check dict-backed indexing when every vector carries a name as extra data."""
    lsh = LSHash(self.hash_size, self.input_dim, 1,
                 storage_config={'dict': None})
    for position in range(self.nb_elements):
        lsh.index(list(self.els[position]), self.el_names[position])

    table = lsh.hash_tables[0]
    buckets = [table.get_list(key) for key in table.keys()]
    for bucket in buckets:
        for stored in bucket:
            self.assertIn(stored[0], self.els)
            self.assertIn(stored[1], self.el_names)

    for original in self.els:
        # query() returns a list; only its first entry is inspected.
        hit = lsh.query(list(original), num_results=1,
                        distance_func='euclidean')[0]
        # hit[0] holds (vector, name); hit[1] holds the distance.
        (found_vector, found_name), found_distance = hit[0], hit[1]
        self.assertIn(found_vector, self.els)
        self.assertIn(found_name, self.el_names)
        self.assertEqual(found_distance, 0)
    del lsh
def traceLSHash(queryName, hashSize):
    """Run hamming LSH queries for a fixed set of traces and record the hits.

    Every row returned by ``DateTransform()`` is indexed with its 1-based
    position as extra data.  For each entry of the hard-coded index list the
    matching row is queried; one line per hit (``<index + 1> : <trace id>
    <distance>``) is written to ``<queryName>.txt`` and the hit ids are
    collected per query, handed to ``writeHTML`` and printed.

    Args:
        queryName: base name of the output text file, also passed to
            ``writeHTML`` (for example "hamming_query_12_3").
        hashSize: hash size handed to ``LSHash``.
    """
    # Indexes of the traces that need a hash query.
    indexList = [14, 249, 479, 689, 899]
    XYMatrix = DateTransform()
    resultList = []

    lsh = LSHash(hashSize, 44107)
    # Trace ids are the 1-based positions of the rows.
    for tid, traceList in enumerate(XYMatrix, 1):
        lsh.index(input_point=traceList, extra_data=tid)

    # Context manager: the result file is closed even if a query raises.
    with open(queryName + '.txt', 'w') as resultFile:
        for index in indexList:
            nearList = []
            for result in lsh.query(XYMatrix[index], distance_func="hamming"):
                resultFile.write(str(index + 1) + " : " + str(result[0][1])
                                 + " " + str(result[1]) + "\n")
                nearList.append(result[0][1])
            resultList.append(nearList)

    writeHTML(resultList, queryName, "hashQuerry")
    print(resultList)
def write_json_lsh(hash_size, grid):
    '''
    Collect the LSH neighbour roads for every hash size and store them in a file.

    NOTE(review): the file is named result_lsh.json but receives ``str()`` of
    the dictionary, not ``json.dumps`` output - confirm what readers expect.

    :param hash_size: list of hash sizes
    :param grid: the processed grid array
    :return: none
    '''
    data_lsh = {}
    for size in hash_size:
        print(size)
        print('list')
        data_lsh[size] = []

        lsh = LSHash(size, 44107)
        # Each grid row is indexed with its position as extra data.
        for row_number, line in enumerate(grid):
            lsh.index(line, extra_data=row_number)

        for road in road_id:
            res = lsh.query(grid[road])
            print(len(res))
            roads = [pack_data(hit[0][1]) for hit in res]
            data_lsh[size].append({road: roads})

    with open('result_lsh.json', 'w') as f:
        f.write(str(data_lsh))
def test_lshash_redis_extra_val(self):
    """Check Redis-backed indexing when every vector carries a name as extra data.

    The storage config points LSHash at localhost:6379, database 15.
    """
    redis_config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    lsh = LSHash(self.hash_size, self.input_dim, 1, redis_config)

    # Multiple insertions of the same (vector, name) pair.
    for position in range(self.nb_elements):
        for _ in range(2):
            lsh.index(list(self.els[position]), self.el_names[position])

    table = lsh.hash_tables[0]
    buckets = [table.get_list(key) for key in table.keys()]
    for bucket in buckets:
        assert buckets.count(bucket) == 1
        for stored in bucket:
            assert stored[0] in self.els
            assert stored[1] in self.el_names

    for original in self.els:
        hit = lsh.query(list(original), num_results=1,
                        distance_func='euclidean')[0]
        # hit[0] holds (vector, name); hit[1] holds the distance.
        (found_vector, found_name), found_distance = hit[0], hit[1]
        assert found_vector in self.els
        assert found_name in self.el_names
        assert found_distance == 0
    del lsh
def k_nn_lsh(k, word, decade_matrix, index_dict):
    """Return up to ``k`` LSH neighbours of ``word`` among the matrix rows.

    Args:
        k: maximum number of results requested from the index.
        word: query vector passed to ``LSHash.query`` (assumed to have one
            entry per matrix column - TODO confirm against callers).
        decade_matrix: sparse matrix; ``get_shape`` and ``getrow`` are used.
        index_dict: unused; kept so existing callers keep working.

    Returns:
        Whatever ``LSHash.query`` returns for ``word``.
    """
    num_rows, num_cols = decade_matrix.get_shape()
    # The indexed points are rows, so their dimension is the column count
    # (the row count only coincides with it for square matrices).
    lsh = LSHash(6, num_cols)
    for i in range(num_rows):
        print(i)
        # Flatten the 1 x n sparse row into a plain 1-D vector before indexing.
        lsh.index(decade_matrix.getrow(i).toarray().ravel())
    return lsh.query(word, num_results=k)
def build_index(self, X):
    """Return a new LSHash holding every row of ``X``, tagged with its row number.

    The index uses a hash size of 32 and 100 hash tables; the input
    dimension is taken from ``X.shape[1]``.
    """
    n_rows, n_features = X.shape[0], X.shape[1]
    index = LSHash(hash_size=32, input_dim=n_features, num_hashtables=100)
    for row_number in range(n_rows):
        index.index(X[row_number], row_number)
    return index
def build_index(self, X):
    """Build an LSHash (hash size 32, 100 tables) over the rows of ``X``.

    Each row is indexed with its row number as extra data; the input
    dimension is ``X.shape[1]``.
    """
    sample_count = X.shape[0]
    feature_count = X.shape[1]
    table = LSHash(hash_size=32, input_dim=feature_count, num_hashtables=100)
    for sample_id in range(sample_count):
        table.index(X[sample_id], sample_id)
    return table
def Mainfunc(self, mat_addr):
    """Index random feature columns of a .mat file and time cosine LSH queries.

    The ``'FFE'`` array of the file is read; every ``svec[:, i, j]`` column
    whose entries are all non-zero is copied into a dense matrix.  A random
    subset of rows of its transpose is indexed, one of them is drawn as a
    test vector, and its (up to five) cosine neighbours are written to
    ``result.txt`` while the elapsed time goes to ``time.txt``.

    Args:
        mat_addr: path handed to ``scipy.io.loadmat``.
    """
    np.set_printoptions(suppress=True, precision=6, threshold=8)
    s = sio.loadmat(mat_addr)
    svec = s['FFE']
    n1, n2, n3 = np.shape(svec)

    # NOTE(review): 87212 is a hard-coded column capacity; more qualifying
    # columns than that would overflow `data` - confirm against the data set.
    data = np.zeros((n1, 87212))
    m = 0
    for i in range(n2):
        for j in range(n3):
            if svec[:, i, j].all() != 0:
                data[:, m] = svec[:, i, j]
                m = m + 1
    dataves = np.transpose(data)

    modelindex = list(set(np.random.randint(1, 87212, size=10000)))
    lsh_model = LSHash(7, n1)
    for jj in modelindex:
        lsh_model.index(dataves[jj, :])

    # The sample size is the number of test vectors.
    testindex = random.sample(modelindex, 1)
    test = np.zeros((len(testindex), n1))
    for i, row in enumerate(testindex):
        test[i, :] = dataves[row, :]

    # Context managers close both files even if a query or write fails.
    with open('result.txt', 'w') as output, open('time.txt', 'w') as timee:
        for queryi in range(len(testindex)):
            if test[queryi, :].all() != 0:
                starttime = time.time()
                Atemp = lsh_model.query(test[queryi, :], 5, 'cosine')
                # Keep the text before the first ')' and drop every '('.
                # Fewer than five hits are written as-is instead of raising.
                formatted = [str(hit).split(')')[0].replace('(', '')
                             for hit in Atemp[:5]]
                if formatted:
                    print(formatted[0])
                for text in formatted:
                    output.write(text + '\n')
                endtime = time.time()
                timee.write(str(endtime - starttime) + '\n')
                output.write('\n')
def eventIdentification(dictionaryFile, corpusFile, outputFile):
    """Group corpus documents into events with an incremental LSH index.

    Documents are processed in order.  Each one is queried against the
    vectors indexed so far; it joins the event of the first candidate that
    ``getDistance`` accepts, otherwise it opens a new event.  One line per
    event (comma-separated document indexes) is written to ``outputFile``.
    """
    events = []          # events[e] lists the document indexes of event e
    event_of_vector = {}  # tuple(dense vector) -> event number
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(20, len(dictionary))

    for index in range(len(corpus)):
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        candidates = lsh.query(denseVector)
        vector_key = tuple(denseVector)

        for candidate in candidates:
            if getDistance(candidate, denseVector) == True:
                event_number = event_of_vector[tuple(candidate[0])]
                events[event_number].append(index)
                event_of_vector[vector_key] = event_number
                break
        else:
            # No candidates at all, or none accepted: open a new event.
            event_of_vector[vector_key] = len(events)
            events.append([index])
        lsh.index(denseVector)

    with open(outputFile, 'w') as out:
        for members in events:
            out.write(",".join(str(member) for member in members) + "\n")
def get_lshash(filename):
    """Build an LSHash from a comma-separated file of integer rows.

    The first eight values of each row are indexed as the point and the
    ninth value, converted to a string, is stored as extra data.

    Args:
        filename: path of the text file to read.

    Returns:
        The populated ``LSHash``; it is empty when the file cannot be read.
    """
    lsh = LSHash(30, 8)
    try:
        with open(filename) as f:
            content = [line.strip('\n') for line in f.readlines()]
    except (IOError, OSError):
        print("Cannot find the file.")
        # Nothing to index: return the empty index instead of failing later
        # with a NameError on the never-assigned `content`.
        return lsh

    for row in content:
        if not row.strip():
            # A blank line would otherwise crash in int('').
            continue
        values = list(map(int, row.split(",")))
        lsh.index(values[:8], str(values[8]))
    return lsh
def test():
    """Hash and index the 'fc7' test set, printing each hash and one sample query.

    The index is created with ``matrices_filename='lsh_planes.data.npz'`` and
    ``overwrite=True``.
    """
    import utils

    trueIds, testSet = utils.load_test_set('fc7', 'raw', 0)
    input_dim = np.shape(testSet[0])[0]
    lsh = LSHash(128, input_dim,
                 matrices_filename='lsh_planes.data.npz', overwrite=True)

    for position, input_point in enumerate(testSet):
        # Print the hash from the first set of planes, then index the point.
        hash_value = lsh._hash(lsh.uniform_planes[0], input_point.tolist())
        print(hash_value)
        lsh.index(input_point, position)

    print(lsh.query(testSet[3], 3))
    return None
def knn(data_array, data, hash_size_input, data_shape, vipno_pos, k):
    """Return the labels of the LSH neighbours of one column, first hit dropped.

    Every column of ``data_array`` is indexed with the matching
    ``data.columns`` label as extra data; the column at ``vipno_pos`` is then
    queried for ``k + 1`` euclidean neighbours.

    Returns:
        Labels of all hits except the first one.
    """
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])

    n_columns = data_shape[1]
    for col_index in range(n_columns):
        lsh.index(data_array[:, col_index],
                  extra_data=data.columns[col_index])

    neighbours = lsh.query(data_array[:, vipno_pos], num_results=k + 1,
                           distance_func='euclidean')
    labels = [hit[0][1] for hit in neighbours]
    # The first hit is skipped (presumably the query column itself).
    return labels[1:]
def lshTOfind(path):
    """Look up the entry whose stored features are closest to those of ``path``.

    Every row of ``newindex.csv`` (first column skipped) is converted to
    integers and indexed.  The features of ``path`` are then queried and the
    best hit is resolved through ``searchid``.

    Args:
        path: argument handed to ``getfeatures``.

    Returns:
        The ``searchid`` result for the best hit, ``None`` when the query
        returns no hit, or ``[]`` when feature extraction or lookup raises.
    """
    lsh = LSHash(50, 361)
    # Context manager so the CSV handle is closed once indexing is done.
    with open('newindex.csv') as f:
        for r in csv.reader(f):
            lsh.index([int(float(i)) for i in r[1:]])

    try:
        f_v = getfeatures(path)
        ans = lsh.query(f_v)
        if ans != []:
            # Element 360 of the stored vector encodes the id, scaled by 10000.
            return searchid(int(ans[0][0][360] / 10000))
    except Exception:
        # Best-effort lookup; KeyboardInterrupt and SystemExit still propagate.
        return []
    return None
def test_lshash():
    """Index every element twice in a default LSHash and check storage and lookup."""
    lsh = LSHash(6, 8, 1)
    # Multiple insertions of the same vector.
    for position in range(num_elements):
        for _ in range(2):
            lsh.index(list(els[position]))

    table = lsh.hash_tables[0]
    buckets = [table.get_list(key) for key in table.keys()]
    for bucket in buckets:
        assert buckets.count(bucket) == 1
        for stored in bucket:
            assert stored in els

    for original in els:
        hits = lsh.query(list(original), num_results=1,
                         distance_func='euclidean')
        # The first hit is a tuple containing the vector and the distance.
        found_vector, found_distance = hits[0]
        assert found_vector in els
        assert found_distance == 0
    del lsh
class LshIndexer(Indexer):
    """Indexer that stores feature vectors in an LSHash."""

    # Parameter set naming the keys read by initialize_store().
    PARAMETERS = {
        'hash_size': 6,
        'input_dim': 128,
        'num_of_hashtables': 1,
        'storage': {'redis': {'host': 'localhost', 'port': 6379}},
    }

    def initialize_store(self, parameters):
        """Create the LSHash store from the given parameter mapping."""
        ordered_keys = ('hash_size', 'input_dim', 'num_of_hashtables',
                        'storage')
        self.store = LSHash(*[parameters[key] for key in ordered_keys])

    def index(self, features):
        """Add each feature's data to the store, tagged with its file id."""
        for item in features:
            self.store.index(item.data, item.file_id)

    def query(self, feature, num_results=5):
        """Return the store's query result for ``feature``."""
        hits = self.store.query(feature, num_results)
        return hits
def classify_nearest_neighbor_lsh(k):
    """Classify songs by genre with LSH nearest neighbours and print accuracy.

    For every genre the first half of its songs is indexed row by row with
    the genre as extra data.  Each song of the second half is split into
    five sections; the column-wise mean of every section is queried for
    ``k`` neighbours and the most frequent neighbour genre is the prediction.

    Args:
        k: number of neighbours requested per section query.
    """
    lsh = LSHash(3, 12)
    labels = load_labels()

    for genre, song_genres_ids in labels.groupby('category'):
        print('Indexing genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2)):
            song_id = song_genres_ids.values[i][0]
            song = pd.read_csv('song_data/training/{}'.format(song_id),
                               header=None)
            for frame in song.values:
                lsh.index(frame, extra_data=genre)

    total_count = 0
    match_count = 0
    for expected_genre, song_genres_ids in labels.groupby('category'):
        print('Expected genre: {}'.format(expected_genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2), num_values):
            song_id = song_genres_ids.values[i][0]
            song = pd.read_csv('song_data/training/{}'.format(song_id),
                               header=None)
            genre_freqs = {}
            # Split the song into sections and query each section's average.
            for section in np.array_split(song, 5, axis=0):
                # Explicit axis keeps one mean per column; the implicit
                # np.mean(DataFrame) axis differs between pandas versions.
                avg_song_val = section.mean(axis=0)
                for neighbour in lsh.query(avg_song_val, num_results=k):
                    # A separate name keeps the expected genre of the outer
                    # loop intact for the comparison below.
                    neighbour_genre = neighbour[0][1]
                    genre_freqs[neighbour_genre] = (
                        genre_freqs.get(neighbour_genre, 0) + 1)

            # No neighbours at all means no prediction, counted as a miss.
            predicted_genre = (max(genre_freqs, key=genre_freqs.get)
                               if genre_freqs else None)
            print('Predicted genre: {}'.format(predicted_genre))
            total_count += 1
            if predicted_genre == expected_genre:
                match_count += 1

    percentage = (match_count / total_count) * 100 if total_count else 0
    print('Matched {} out of {} songs: {}%'.format(
        match_count, total_count, percentage))
def detect_subevent(filename):
    """Cluster the documents of ``<filename>.mm`` into sub-events via LSH.

    Documents are processed in order.  A document whose euclidean query
    (up to 50 results) returns nothing opens a new cluster and is indexed;
    any other document is appended to the cluster of the first returned
    neighbour that has one.  Each cluster is written to ``<filename>.out``
    as ``<first index>, <member>, ...``.

    Args:
        filename: common prefix of the ``.dict``, ``.mm`` and ``.out`` files.
    """
    dictionaryFile = filename + ".dict"
    corpusFile = filename + ".mm"
    outputFile = filename + ".out"
    tempDict = {}    # sparse-vector key -> index of the document that was indexed
    outputdict = {}  # cluster-opening document index -> later member indexes
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, len(dictionary))

    for index in range(len(corpus)):
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        result = lsh.query(denseVector, num_results=50,
                           distance_func="euclidean")
        if not result:
            # No similar documents: open a new cluster and index the vector.
            outputdict[index] = []
            tempDict[getSparseVector(denseVector)] = index
            lsh.index(denseVector)
        else:
            for r in result:
                leader = tempDict[getSparseVector(r[0])]
                if leader in outputdict:
                    outputdict[leader].append(index)
                    break

    with open(outputFile, 'w') as out:
        for key in outputdict:
            members = [str(key)] + [str(i) for i in outputdict[key]]
            out.write(", ".join(members) + "\n")
    print("Please check the output file:" + " " + outputFile)
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    """Index the keys of a pickled mapping in an LSHash and pickle the index.

    Args:
        bits_tid_pickle: path of a pickle holding a mapping; each key is an
            iterable of values convertible with ``int``.
        lsh_pickle: path the populated ``LSHash`` is pickled to.
    """
    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(bits_tid_pickle, "rb") as f:
        data = pickle.load(f)

    # These parameters can be tuned; see https://github.com/kayzh/LSHash
    lsh = LSHash(13, 128, num_hashtables=1)
    # Explicit loop: map() is lazy on Python 3, so using it only for its
    # side effects would index nothing there.
    for bits in data.keys():
        lsh.index(np.array([int(tmp) for tmp in bits]))

    with open(lsh_pickle, "wb") as out:
        pickle.dump(lsh, out, -1)
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    """Read a pickled mapping, index its keys with LSHash, and pickle the result.

    Args:
        bits_tid_pickle: path of a pickle holding a mapping; each key is an
            iterable of values convertible with ``int``.
        lsh_pickle: path the populated ``LSHash`` is pickled to.
    """
    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(bits_tid_pickle, "rb") as source:
        data = pickle.load(source)

    # These parameters can be tuned; see https://github.com/kayzh/LSHash
    lsh = LSHash(13, 128, num_hashtables=1)
    # A plain loop replaces map(): on Python 3 map() is lazy and the
    # lsh.index calls would never run.
    for key in data.keys():
        lsh.index(np.array([int(tmp) for tmp in key]))

    with open(lsh_pickle, "wb") as target:
        pickle.dump(lsh, target, -1)
def create_feature(train_image_dir, classes, net):
    """Index the feature of every training image in an LSHash.

    For each name in ``classes`` the files directly inside
    ``train_image_dir/<name>`` are passed to ``get_feature_single_img`` and
    the feature is indexed with the image path as extra data.

    Returns:
        Tuple ``(lsh, image_paths, list_feature)``; the two lists are
        parallel, in indexing order.
    """
    # Locality Sensitive Hashing parameters: hash size, number of tables,
    # dimension of the feature vector.
    hash_size, num_tables, feature_dim = 10, 5, 58
    lsh = LSHash(hash_size=hash_size, input_dim=feature_dim,
                 num_hashtables=num_tables)

    image_paths, list_feature = [], []
    for each_object in classes:
        each_object_path = os.path.join(train_image_dir, each_object)
        # First os.walk entry: only the files directly inside the folder.
        _, _, list_img = next(os.walk(each_object_path))
        print("hashing class: ", each_object, " which has: ", len(list_img))
        for img in list_img:
            image_path = os.path.join(each_object_path, img)
            feature = get_feature_single_img(net, image_path)
            image_paths.append(image_path)
            list_feature.append(feature)
            lsh.index(feature, extra_data=image_path)
    return lsh, image_paths, list_feature
def index_room():
    '''
    Index image features with the LSH algorithm.

    The rows of data/train.npy are flattened, compressed with the encoder
    stored in data/encoder.h5 and indexed; the n-th row is tagged with the
    id of the n-th file matched by ./data/features/*.csv.  The index is
    pickled to data/lsh.pkl.

    NOTE(review): pairing rows with files relies on glob() returning the
    files in the same order the rows were produced - confirm.

    :return: None
    '''
    import os  # local import: the file's import block is not visible here

    files = glob("./data/features/*.csv")
    # basename() handles the path separator of the running platform; the
    # previous split on a backslash only isolated the file name on Windows.
    files_ids = [
        os.path.basename(filename).replace(".csv", "") for filename in files
    ]
    X = np.load("data/train.npy")
    X = X.reshape(X.shape[0], -1)
    encoder = load_model("data/encoder.h5")
    dimension = 100
    lsh_hash = LSHash(hash_size=32, input_dim=dimension)
    compress_feature = encoder.predict(X)
    for num, ele in enumerate(compress_feature.tolist()):
        lsh_hash.index(ele, extra_data=files_ids[num])
    with open("data/lsh.pkl", "wb") as fh:
        pickle.dump(lsh_hash, fh)
def learn(routes):
    """Relabel the next-hop column of ``routes`` and index the routes in an LSHash.

    The last column of every route except the final one is overwritten:
    with ``i + 1`` when its value is greater than or equal to the next
    route's value, otherwise with ``-1``.  The modified list is stored in
    ``global_training_route``.  Each route is indexed without its last two
    columns, which are kept as extra data.

    Returns:
        The populated ``LSHash``.
    """
    global global_training_route
    global next_hop_index

    extra_data_len = 2  # destination, next_hop
    row_width = len(routes[0])
    ndims = row_width - extra_data_len  # number of dimensions
    hash_length = row_width * 2  # arbitrarily chosen hash length
    next_hop_index = row_width - 1  # next hop is the last column

    for i in range(len(routes) - 1):
        keeps_order = (routes[i][next_hop_index]
                       >= routes[i + 1][next_hop_index])
        routes[i][next_hop_index] = i + 1 if keeps_order else -1
    global_training_route = routes

    lsh = LSHash(hash_length, ndims)
    for entry in routes:
        point = entry[:-extra_data_len]
        payload = entry[-extra_data_len:]
        lsh.index(point, extra_data=payload)
    return lsh
def test_lshash(self):
    """Index every element twice in a default LSHash and check storage and lookup."""
    lsh = LSHash(self.hash_size, self.input_dim, 1)
    # Multiple insertions of the same vector.
    for position in range(self.nb_elements):
        for _ in range(2):
            lsh.index(list(self.els[position]))

    table = lsh.hash_tables[0]
    buckets = [table.get_list(key) for key in table.keys()]
    for bucket in buckets:
        self.assertEqual(buckets.count(bucket), 1)
        for stored in bucket:
            self.assertIn(stored, self.els)

    for original in self.els:
        hits = lsh.query(list(original), num_results=1,
                         distance_func='euclidean')
        # The first hit is a tuple containing the vector and the distance.
        found_vector, found_distance = hits[0]
        self.assertIn(found_vector, self.els)
        self.assertEqual(found_distance, 0)
    del lsh
def detect_subevent(filename):
    """Cluster the documents of ``<filename>.mm`` into sub-events via LSH.

    Documents are processed in order.  A document whose cosine query (up to
    5 results) returns nothing opens a new cluster and is indexed; any
    other document is appended to the cluster of the first returned
    neighbour that has one.  Each cluster is written to ``<filename>.out``
    as ``<first index>, <member>, ...``.

    Args:
        filename: common prefix of the ``.dict``, ``.mm`` and ``.out`` files.
    """
    dictionaryFile = filename + ".dict"
    corpusFile = filename + ".mm"
    outputFile = filename + ".out"
    tempDict = {}    # sparse-vector key -> index of the document that was indexed
    outputdict = {}  # cluster-opening document index -> later member indexes
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, len(dictionary))

    for index in range(len(corpus)):
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        result = lsh.query(denseVector, num_results=5, distance_func="cosine")
        if not result:
            # No similar documents: open a new cluster and index the vector.
            outputdict[index] = []
            tempDict[getSparseVector(denseVector)] = index
            lsh.index(denseVector)
        else:
            for r in result:
                leader = tempDict[getSparseVector(r[0])]
                if leader in outputdict:
                    outputdict[leader].append(index)
                    break

    with open(outputFile, 'w') as out:
        for key in outputdict:
            members = [str(key)] + [str(i) for i in outputdict[key]]
            out.write(", ".join(members) + "\n")
    print("Please check the output file:" + " " + outputFile)
def dump_lsh_data_to_pickle(bits_tid_pickle, lsh_pickle):
    """Index the keys of a pickled mapping in a 10-dimensional LSHash and pickle it.

    Args:
        bits_tid_pickle: path of a pickle holding a mapping; each key is an
            iterable of values convertible with ``int``.
        lsh_pickle: path the populated ``LSHash`` is pickled to.
    """
    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(bits_tid_pickle, "rb") as f:
        data = pickle.load(f)

    # '10' means the bit binary (github.com/kayzh/LSHash)
    lsh = LSHash(13, 10, num_hashtables=1)
    # Explicit loop: map() is lazy on Python 3, so using it only for its
    # side effects would index nothing there.
    for bits in data.keys():
        lsh.index(np.array([int(tmp) for tmp in bits]))

    with open(lsh_pickle, "wb") as out:
        pickle.dump(lsh, out, -1)
def getBuckets(fromFile):
    """Rebuild the global name dictionary and LSH index from the name list.

    Names come from the lines of ``datafile`` when ``fromFile`` is true,
    otherwise from the keys of ``surnames.dic``.  Only the text before the
    first space of each entry is used.  Names are grouped in ``nameDict``
    by the string form of their vector, and one vector per group is indexed
    in the global ``lsh``.

    Args:
        fromFile: read names from ``datafile`` instead of ``surnames.dic``.
    """
    global nameDict
    global lsh
    nameDict = {}
    lsh = LSHash(bWidth, 26, num_hashtables=num_ht)

    if fromFile:
        # Context manager so the data file is closed after reading.
        with open(datafile, 'r') as f:
            nameList = f.readlines()
    else:
        nameList = surnames.dic.keys()

    for l in nameList:
        name = l.split(" ")[0].strip()
        arrStr = toStr(getvec(name))
        nameDict.setdefault(arrStr, []).append(name)

    for k in nameDict.keys():
        lsh.index(toArr(k))
def test_lshash_extra_val():
    """Check default-storage indexing when every vector carries a name as extra data."""
    lsh = LSHash(6, 8, 1)
    for position in range(num_elements):
        lsh.index(list(els[position]), el_names[position])

    table = lsh.hash_tables[0]
    buckets = [table.get_list(key) for key in table.keys()]
    for bucket in buckets:
        for stored in bucket:
            assert stored[0] in els
            assert stored[1] in el_names

    for original in els:
        # query() returns a list; only its first entry is inspected.
        hit = lsh.query(list(original), num_results=1,
                        distance_func='euclidean')[0]
        # hit[0] holds (vector, name); hit[1] holds the distance.
        (found_vector, found_name), found_distance = hit[0], hit[1]
        assert found_vector in els
        assert found_name in el_names
        assert found_distance == 0
    del lsh
def subEventDetection(dictionaryFile, corpusFile, outputFile):
    """Group corpus documents into sub-events with an incremental LSH index.

    Documents are processed in order.  Each one is queried (cosine, up to
    50 results) against the vectors indexed so far.  It joins the cluster
    containing the first neighbour that ``getDistance`` accepts, otherwise
    it opens a new cluster.  Every document is then indexed so that later
    documents can match it.  One line per cluster (``", "``-separated
    document indexes) is written to ``outputFile``.

    Args:
        dictionaryFile: path loaded with ``corpora.Dictionary.load``.
        corpusFile: path opened with ``corpora.MmCorpus``.
        outputFile: path of the text file to write.
    """
    outputVector = []  # clusters, each a list of document indexes
    tempDict = {}      # tuple(dense vector) -> document index
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, len(dictionary))

    for index in range(len(corpus)):
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        result = lsh.query(denseVector, num_results=50,
                           distance_func="cosine")

        assigned = False
        for vector in result:
            if getDistance(vector, denseVector) == True:
                # vector is a query hit; its first item is the stored point,
                # which is what tempDict is keyed by.
                neighbour_index = tempDict[tuple(vector[0])]
                for cluster in outputVector:
                    if neighbour_index in cluster:
                        cluster.append(index)
                        break
                assigned = True
                break

        # Covers both "no similar documents" and "no neighbour accepted".
        if not assigned:
            outputVector.append([index])

        # Index every document, including the first of each cluster;
        # skipping that left the index permanently empty.
        lsh.index(denseVector)
        tempDict[tuple(denseVector)] = index

    with open(outputFile, 'w') as out:
        for vector in outputVector:
            out.write(", ".join(str(member) for member in vector) + "\n")
    print("Please check the output file:" + " " + outputFile)
def lshTOfind(path):
    """Return the ids of the stored entries closest to the features of ``path``.

    Every row of ``copyindex.csv`` (first column skipped) is converted to
    floats; the first 360 values are indexed and value 360 is kept as extra
    data.  The first 360 features of ``path`` are then queried for up to 15
    neighbours.

    Args:
        path: argument handed to ``getfeatures``.

    Returns:
        A list with ``int(extra_data / 10000)`` for every hit, ``None`` when
        the query returns no hit, or ``[]`` when feature extraction or the
        query raises.
    """
    lsh = LSHash(10, 360)
    # Context manager so the CSV handle is closed once indexing is done.
    with open('copyindex.csv') as f:
        for r in csv.reader(f):
            features = [float(i) for i in r[1:]]
            lsh.index(features[:360], features[360])

    try:
        f_v = getfeatures(path)
        ans = lsh.query(f_v[:360], 15)
        if ans != []:
            return [int(hit[0][1] / 10000) for hit in ans]
    except Exception:
        # Best-effort lookup; KeyboardInterrupt and SystemExit still propagate.
        return []
    return None
def get_lshash(self):
    """Index all existing reactions based on specified headers into an lshash.

    A reaction row is indexed only when every header in ``headers_to_use``
    can be read from it.  The index and the header list are stored on
    ``self.lshash`` and ``self.headers``, and the number of indexed rows is
    printed.
    """
    from lshash import LSHash

    headers = headers_to_use
    lsh = LSHash(1, len(headers))
    indexed = 0
    for reaction in PerformedReaction.objects.all().rows(True):
        values = []
        for header in headers:
            try:
                value = reaction[header]
            except KeyError:
                continue
            values.append(value)
        # Rows missing any header are skipped.
        if len(values) != len(headers):
            continue
        lsh.index(values)
        indexed += 1
    print('count', indexed)
    self.lshash = lsh
    self.headers = headers
class Searcher:
    """Nearest-neighbour search over a keyed set of vectors, backed by LSHash."""

    # Distance function names accepted by query().
    _DIST_FUNCTIONS = ["hamming", "euclidean", "true_euclidean",
                       "centred_euclidean", "cosine", "l1norm"]
    # Set to the LSHash instance by create_index().
    index = None

    def __init__(self, dataset):
        """Build the index from ``dataset``, a mapping of key -> vector."""
        self.create_index(dataset)

    def create_index(self, items, hash_size=6):
        """Index every vector of ``items`` with its key as extra data.

        The input dimension is taken from the first vector.

        Args:
            items: mapping of key -> vector.
            hash_size: hash size handed to ``LSHash``.

        Returns:
            True once the index is built.

        Raises:
            IndexError: if ``items`` is empty.
        """
        if not items:
            raise IndexError("cannot build an index from an empty dataset")
        # next(iter(...)) works on Python 2 lists and Python 3 dict views.
        input_dim = len(next(iter(items.values())))
        self.index = LSHash(hash_size, input_dim)
        for key in items:
            self.index.index(items[key], extra_data=key)
        return True

    def query(self, query_item, num_results=10, distance_function='cosine'):
        """Return ``{key: distance}`` for the neighbours of ``query_item``.

        Raises:
            ValueError: if ``distance_function`` is not in ``_DIST_FUNCTIONS``.
        """
        if distance_function not in self._DIST_FUNCTIONS:
            raise ValueError("{0} not supported".format(distance_function))
        results = self.index.query(query_item, num_results=num_results,
                                   distance_func=distance_function)
        return self.parse_results(results)

    def parse_results(self, results):
        """Map each ``((vector, key), distance)`` result to ``key -> distance``."""
        return {x[0][1]: x[1] for x in results}
def filterDataset(fileIn, fileOut, fileNodes, threshold):
    '''
    Reads the input file and filters out lines that are farther away from
    every node in the OSM graph than a threshold value (0.1 mile). A data
    entry is kept if the distance from its point to the closest node found
    by the LSH query is below this threshold; kept lines have their point
    replaced by that node.

    :param fileIn: path of the input file, one data entry per line
    :param fileOut: path of the filtered output file
    :param fileNodes: argument handed to GetNodes
    :param threshold: maximum distance in miles
    '''
    # Dimension of our vector space
    lsh = LSHash(hash_size=10, input_dim=2)
    nodes = GetNodes(fileNodes)
    for node in nodes:
        v = np.array(node, dtype=float)
        lsh.index(v)

    bunch = []
    bunch_size = 5000
    count_lines_read = 0
    count_lines_written = 0
    with open(fileIn, "r") as fin, open(fileOut, "w") as fout:
        for line in fin:
            [latitude, longitude] = dataToGraph.lineToPoint(line)
            query = np.array((latitude, longitude), dtype=float)
            result = lsh.query(query, num_results=1)
            count_lines_read += 1
            if not result:
                # The approximate lookup found no candidate node, so the
                # point cannot be matched; drop the line instead of failing
                # on result[0].
                continue
            closest_node = result[0][0]
            if vin((latitude, longitude), closest_node).miles < threshold:
                line = replacePointByOSMnode(line, closest_node)
                bunch.append(line)
                # Lines are written in batches to limit the number of writes.
                if len(bunch) == bunch_size:
                    fout.writelines(bunch)
                    count_lines_written += len(bunch)
                    bunch = []
                    if (count_lines_written % 10 == 0):
                        print("%d written / %d read" %
                              (count_lines_written, count_lines_read))
        # Flush the last, partially filled batch.
        fout.writelines(bunch)
        count_lines_written += len(bunch)
    print("%d lines written" % count_lines_written)
class feature_comparer():
    """Match feature vectors against stored ones through a cosine LSH lookup."""

    def __init__(self, fea_dim, compare_thresh):
        """Create an empty index for ``fea_dim``-dimensional features.

        Args:
            fea_dim: dimension of the feature vectors.
            compare_thresh: distance below which a query counts as a match.
        """
        self.lsh = LSHash(bit_num, fea_dim, compare_kernel_num)
        # Maps the text form of a feature vector to its id string.
        self.fv_dict = {}
        self.compare_thresh = compare_thresh

    def load(self, filename):
        """Load ``<vector>:<id>`` lines from ``filename`` into the index.

        The vector text is expected to be comma-separated numbers with one
        enclosing character on each side, which is stripped before parsing.
        """
        with open(filename, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                parts = line.split(':')
                fv = parts[0]
                # Strip the line break so ids match those stored by insert().
                ident = parts[1].strip()
                fv_array = [float(value) for value in fv[1:-1].split(',')]
                # Same key form as insert() and match(): the text of the
                # parsed vector without its enclosing brackets.
                self.fv_dict[str(fv_array)[1:-1]] = ident
                self.lsh.index(fv_array)

    def insert(self, feature, id):
        """Store ``feature`` under ``id`` and add it to the index."""
        self.fv_dict[str(feature)[1:-1]] = str(id)
        self.lsh.index(feature)

    def match(self, feature):
        """Return ``(True, id)`` for a close enough neighbour, else ``(False, -1)``."""
        q = self.lsh.query(feature, distance_func='cosine')
        if len(q) == 0:
            return False, -1
        mindis = q[0][1]
        if mindis < self.compare_thresh:
            return True, self.fv_dict[str(q[0][0])[1:-1]]
        return False, -1
def hash_item_pic_v1(pic_folder):
    """
    Index one HardNet feature vector per item picture and print its neighbours.

    This version takes the feature output of HardNet directly, using the
    whole picture (grayscale, resized to 32x32) as the feature region. A
    file named ``<pic_folder>_item_hash.txt`` is opened for writing next to
    the picture folder; nothing is written to it in this function.
    With this approach the feature vectors differ if a picture is shifted.

    :param pic_folder: folder containing all pictures
    :return: whether the run succeeded
    """
    try:
        # Compute the feature vector of every picture.
        descriptor = HardNetDescriptor()
        print(colored("HardNet模型加载完成", color='blue'))
        # Use LSH.
        lsh = LSHash(16, 128)
        feature_by_file = {}
        with open(pic_folder + '_item_hash.txt', 'w') as to_write:
            img_file_list = glob(os.path.join(pic_folder, '*_[0-9].jpg'))
            for img_file in tqdm(img_file_list, desc='训练中'):
                patch = np.array(
                    Image.open(img_file).convert('L').resize((32, 32)))
                feature = descriptor.describle([patch])[0]
                feature_by_file[img_file] = feature
                lsh.index(feature, extra_data=img_file)
            for img_file in tqdm(img_file_list, desc='输出中'):
                neighbours = lsh.query(feature_by_file[img_file],
                                       distance_func='centred_euclidean')
                # Print every neighbouring picture.
                print(img_file, '|'.join(hit[0][1] for hit in neighbours))
        return True
    except Exception as e:
        print(colored("错误:%s" % str(e), color='red'))
        return False
def main(argv):
    """Index the feature vectors of a tab-separated metadata file and run one query.

    The file needs a ``features`` column (comma-separated numbers) and a
    ``filename`` column.  Every row is indexed with its file name as extra
    data; the row selected by ``--query-index`` is then queried and the
    response pretty-printed.

    Args:
        argv: full argument vector; ``argv[0]`` is skipped.
    """
    parser = argparse.ArgumentParser(prog='INDEX')
    parser.add_argument('source', help='path to the source metadata file')
    int_options = (
        ('--hash-size', 'Hash size.', 10),
        ('--num-tables', 'Number of tables.', 5),
        ('--query-index', 'Index to use for query.', 0),
    )
    for flag, description, default in int_options:
        parser.add_argument(flag, help=description, type=int, default=default)
    args = parser.parse_args(argv[1:])

    # Read in the data file.
    data = pandas.read_csv(args.source, sep='\t')

    def to_vector(text):
        # "1,2,3" -> float64 array
        return np.asarray(text.split(',')).astype('float64')

    # The input dimension is taken from the first row.
    input_dim = len(data['features'][0].split(','))
    lsh = LSHash(hash_size=args.hash_size, input_dim=input_dim,
                 num_hashtables=args.num_tables)

    # Indexing
    for row in range(0, len(data)):
        lsh.index(to_vector(data['features'][row]),
                  extra_data=data['filename'][row])

    # Query one of the indexed vectors.
    response = lsh.query(to_vector(data['features'][args.query_index]))
    pprint(response)
from lshash import LSHash

# Small demo: index three labelled vectors in an LMDB-backed LSHash and print
# the query result for a vector close to the first one.
storage = {"lmdb": {'path': '/Users/christianburger/Downloads/testlmdb'}}
lsh = LSHash(hash_size=6, input_dim=8, num_hashtables=1,
             storage_config=storage)

for vector, label in (
        ([1, 2, 3, 4, 5, 6, 7, 8], 'a'),
        ([2, 3, 4, 5, 6, 7, 8, 9], 'b'),
        ([10, 12, 99, 1, 5, 31, 2, 3], 'c')):
    lsh.index(vector, label)

print(lsh.query([1, 2, 3, 4, 5, 6, 7, 7]))
from __future__ import print_function from __future__ import division from scipy.spatial.distance import cosine from tqdm import tqdm import numpy from lshash import LSHash import time start = time.time() lsh = LSHash(8, 300) sample_word_embeds = [] for i in tqdm(xrange(20000)): word_embed = numpy.random.rand(300) lsh.index(word_embed) if i % 500 == 0: sample_word_embeds.append(word_embed) print("Indexing takes {} seconds".format(time.time() - start)) start = time.time() for word_embed in sample_word_embeds: print('-' * 80) results = lsh.query(word_embed, num_results=None, distance_func='cosine') print("Num result: {}".format(len(results))) print('Nearest neighbor cosine distance:') print(" {} | {}".format(results[1][1], cosine(results[1][0], word_embed))) print('Query takes average {} seconds'.format((time.time() - start) / len(sample_word_embeds)))
mid14 = '00014.mid'
mid15 = '00015.mid'
mid16 = '00016.mid'
mid17 = '00017.mid'
mid18 = '00018.mid'
mid19 = '00019.mid'
mid20 = '00020.mid'
s1 = 'yldw.mid'
s2 = 'alphaville-forever_young.mid'
s3 = 'counting_stars.mid'
s4 = 'baba-go.mid'

# Index every note of the listed MIDI files, in this order; each note carries
# (name, 0.8) as extra data.
for midi_file in (s1, s2, s3, s4, mid1, mid2, mid3, mid4):
    for note, name in note_from_midi(midi_file):
        lsh.index(note, extra_data=(name, 0.8))
def run():
    """Cluster a stream of tweets using TF-IDF vectors and an LSH index.

    Tweets are read as JSON lines from every file found under ``loc`` and
    processed in batches of ``size``. For each tweet in a batch the LSH index
    is asked for an approximate nearest neighbour. If the returned score is
    at most ``cos_threshold`` the tweet joins that neighbour's cluster.
    Otherwise the earlier tweets of the current batch (and, in some cases,
    the previous batch) are searched linearly. If that also fails, a new
    cluster is opened. Every tweet is then added to the LSH index.

    Side effects: appends timings to ``time.txt``, appends snapshots to
    ``result.txt`` / ``vocab.txt`` / ``vectorizer.txt`` every 10000 tweets,
    and overwrites those files plus ``inv_index.txt`` at the end.

    NOTE(review): this code reached review with its line breaks and
    indentation lost; the nesting below is reconstructed from statement
    order and variable usage - confirm against the original file.
    NOTE(review): Python 2 only (print statements, time.clock()).
    """
    initial = True  # True until the first batch has been vectorized
    size = 2000  # number of tweets per batch
    tweet_ids = []
    tweet_text = []
    counter = 0  # total number of tweets read so far
    num_hashtables = 13  ## recompute the random vectors if this is changed
    dimension = 50000  ## recompute the random vectors if this is changed
    hash_size = 13  ## length of the LSHash of the tweets
    bucket_size = 100  ## size of the queue for each hash in the hash tables
    comparisons = 50  ## upper bound on the number of comparisons (dot product) to find the nearest neighbor
    cos_threshold = .7  ## threshold for the similarity of two tweets
    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df = 1, smooth_idf=True, stop_words='english', min_dict_size = dimension)
    ## initialize the hash tables, specify the hash size, number of hash tables and the queue size
    lsh = LSHash(hash_size = hash_size, input_dim = dimension, num_hashtables=num_hashtables, max_queue_size= bucket_size)
    clusters = {}  ## maintain the clusters (cluster id -> list of tweet ids)
    num_clusters = 0
    inv_index = {}  ## inverse mapping from tweet_id to clusters
    Y = None  # TF-IDF matrix of the previous batch
    Y1 = None  # tweet ids of the previous batch, aligned with the rows of Y
    # NOTE(review): f_d is never written to or closed in this function.
    f_d = open("output.txt",'w')
    loc = "/Users/dilpreet/Documents/mtp_documents/markedData/data/"
    for root, dirs, filenames in os.walk(loc):
        for f in filenames:
            # NOTE(review): the path is built from loc, not root, so files in
            # sub-directories of loc would not be found - confirm layout is flat.
            with open(loc+f) as infile:
                for line in infile:
                    ## load 2000 tweets at a time
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['text'])
                    counter = counter + 1
                    t2 = 0
                    if counter%size == 0:
                        t1 = time.clock()
                        ## X contains the tf-idf score of the tweets in the "sparse row matrix" format
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        print X.get_shape()
                        print len(vectorizer.vocabulary_)
                        ## if the total number of keywords exceed the pre-specified dimension, raise error
                        # NOTE(review): get_shape()[0] is the row count (it bounds the
                        # getrow() loop below), not the number of keywords - confirm.
                        # NOTE(review): a bare raise with no active exception does not
                        # raise a meaningful error type.
                        if X.get_shape()[0] > dimension:
                            print X.get_shape()
                            print "dimension exceeded"
                            raise
                        for i in range(X.get_shape()[0]):
                            temp_tweet = X.getrow(i)
                            ## query for the nearest neighbor from the lshash tables
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
                            c = 2  # default score, above any threshold, used when nn is None
                            scase = False  # never set to True (the assignment below is commented out)
                            ## if the nearest neighbor is not null and its score is within the threshold, add the tweet to the respective cluster
                            if nn is not None:
                                # b is the extra_data stored at index time, i.e. a tweet id
                                ((a, b),c) = nn
                                if c <= cos_threshold:
                                    inv_index[tweet_ids[i]] = inv_index[b]
                                    clusters.setdefault(inv_index[b],[]).append(tweet_ids[i])
                                #else:
                                #    scase = True
                            ## else, linearly search through the previous 2000 + i tweets to find the nearest neighbor
                            """ code to linearly search through the tweets"""
                            if (c > cos_threshold or nn is None or scase):
                                searchY = False  # whether the previous batch must be searched too
                                if (i==0 and not initial):
                                    searchY = True
                                if (i==0 and initial):
                                    # very first tweet overall: open the first cluster
                                    inv_index[tweet_ids[i]] = num_clusters
                                    clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                    num_clusters = num_clusters + 1
                                if (i!=0):
                                    # compare against the earlier rows of the current batch
                                    Z = X[:i]
                                    #print temp_tweet.shape
                                    t2 = temp_tweet.transpose()
                                    #print i
                                    a1 = Z.dot(t2).toarray()  # dot product of each earlier row with this tweet
                                    a2 = Z.multiply(Z).sum(axis = 1)  # squared norm of each earlier row
                                    a3 = sp.csr_matrix(t2.multiply(t2).sum()).toarray()  # squared norm of this tweet
                                    a2 = sp.csc_matrix(a2).toarray()
                                    b = [j for j in range(Z.shape[0])]
                                    # NOTE(review): the denominator is sqrt(|z|^2 + |t|^2), not
                                    # |z|*|t| as in a cosine distance - confirm this is intended.
                                    a = min(b, key = lambda x: 1-float(a1[x][0])/((a2[x][0] + a3[0][0])**.5))
                                    #a = min(Z, key = lambda x: cosine_dist(x[0], temp_tweet))
                                    #print a
                                    t3 = tweet_ids[a]
                                    if (1-float(a1[a][0])/((a2[a][0] + a3[0][0])**.5))> cos_threshold:
                                        if not initial and i != size-1:
                                            searchY = True
                                        else:
                                            inv_index[tweet_ids[i]] = num_clusters
                                            clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                            num_clusters = num_clusters + 1
                                    else:
                                        inv_index[tweet_ids[i]] = inv_index[t3]
                                        clusters.setdefault(inv_index[t3], []).append(tweet_ids[i])
                                if searchY == True:
                                    # compare against rows i.. of the previous batch
                                    Z = Y[i:]
                                    t2 = temp_tweet.transpose()
                                    #print i
                                    a1 = Z.dot(t2).toarray()
                                    a2 = Z.multiply(Z).sum(axis = 1)
                                    a3 = sp.csr_matrix(t2.multiply(t2).sum()).toarray()
                                    a2 = sp.csc_matrix(a2).toarray()
                                    b1 = [j for j in range(Z.shape[0])]
                                    a = min(b1, key = lambda x: 1-float(a1[x][0])/((a2[x][0] + a3[0][0])**.5))
                                    t3 = Y1[a + i]  # a is relative to the slice Y[i:]
                                    if (1-float(a1[a][0])/((a2[a][0] + a3[0][0])**.5))< cos_threshold:
                                        # NOTE(review): unlike the other branches, the tweet id is
                                        # not appended to clusters here - confirm.
                                        inv_index[tweet_ids[i]] = inv_index[t3]
                                    else:
                                        inv_index[tweet_ids[i]] = num_clusters
                                        clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                        num_clusters = num_clusters + 1
                            ### index the tweet into the hash tables
                            lsh.index(input_point = temp_tweet, extra_data = tweet_ids[i])
                        # batch finished: remember it as the "previous" batch and start a new one
                        initial = False
                        Y = X
                        Y1 = tweet_ids[:]
                        tweet_ids = []
                        tweet_text = []
                        print counter
                        print time.clock() - t1
                        f2 = open('time.txt','a')
                        f2.write(str(time.clock()-t1) + '\n')
                        f2.close()
                    # periodic snapshot (append mode) every 10000 tweets
                    if counter%10000==0:
                        f2 = open('result.txt', 'a')
                        f2.write(json.dumps(clusters) + "\n")
                        f3 = open('vocab.txt', 'a')
                        f4 = open('vectorizer.txt', 'a')
                        f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
                        f4.write(json.dumps(vectorizer.idf_) + "\n")
                        #print clusters
                        #print vectorizer.vocabulary_
                        f2.close()
                        f3.close()
                        f4.close()
    # final dump: overwrite the snapshot files with the end state
    f2 = open('result.txt', 'w')
    f2.write(json.dumps(clusters) + "\n")
    f3 = open('vocab.txt', 'w')
    f4 = open('vectorizer.txt', 'w')
    f5 = open('inv_index.txt', 'w')
    f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
    f4.write(json.dumps(vectorizer.idf_) + "\n")
    f5.write(json.dumps(inv_index))
    #print clusters
    #print vectorizer.vocabulary_
    f2.close()
    f3.close()
    f4.close()
    f5.close()
# Build a binary bag-of-words LSH index over the tweets of one spreadsheet
# column, then answer nearest-neighbour queries typed on stdin.
dim = len(Index) + 1  # -1 for excluded: the extra last slot collects words missing from Index
data = xlrd.open_workbook(fname)
sht = data.sheet_by_name(shtname)
head = sht.row_values(0)
tweets = sht.col_values(head.index(target), start_rowx)
hash_size = int(np.ceil(np.log2(len(tweets))))
print('hash_size: %d, dim: %d' % (hash_size, dim))
lsh = LSHash(hash_size, dim)


def _bag_of_words(text):
    """Return the 1 x dim binary bag-of-words row vector for ``text``."""
    vec = spar.csr_matrix((1, dim), dtype=np.int8)
    # x = np.zeros(dim, np.bool8)
    for w in jieba.cut(text):
        # A sparse matrix is 2-D: vec[k] addresses row k, so the column must
        # be given explicitly. The original ``x[k] = 1`` raised IndexError
        # for any k other than 0 or -1.
        vec[0, Index.get(w, -1)] = 1
    return vec


for tweet in tweets:
    try:
        x = _bag_of_words(tweet)
        # NOTE(review): assumes this LSHash build accepts 1 x dim scipy
        # sparse rows, as the original call did - confirm.
        lsh.index(x)
    except Exception as e:
        # best effort: report the offending tweet and keep indexing
        print(e)
        print(tweet)

while True:
    sent = raw_input('input sentence...\n')
    if not sent:
        # empty input ends the session; it is no longer sent to the index
        break
    # Vectorize the query the same way as the indexed tweets; the original
    # passed the raw string to lsh.query().
    res = lsh.query(_bag_of_words(sent), distance_func='hamming')
    for i in res:
        print('%s %s' % (i[0], i[-1]))
class LshManager(object):
    """Wrapper around a Redis-backed LSHash index over LSA topic vectors."""

    def __init__(self):
        self.lshIndexList = []
        # create a list of lsh indexes
        self.lsh = LSHash(NUMBER_OF_BITS_PER_HASH, NUM_TOPICS, num_hashtables=NUMBER_OF_LSH_INDEXES,
                          storage_config={"redis": {"host": "localhost", "port": 6379}})

    def clearIndex(self):
        """Empty the index by flushing Redis."""
        # NOTE(review): flushall() removes every key in every database of the
        # default Redis server, not only the LSH buckets - confirm the server
        # is dedicated to this index.
        redis.Redis().flushall()

    # adds a document to all lsh indexes
    def addDocument(self, document):
        """Index ``document``'s LSA vector, storing its ``_id`` as extra data.

        Also stamps ``document.timestamp`` with the current time when the
        attribute is missing.
        """
        lsa_vector = document.vectors["LSA"]
        dense_vector = self._sparseToDenseConverter(lsa_vector)
        if not hasattr(document, "timestamp"):
            document.timestamp = str(datetime.datetime.now())
        extra = json.dumps(str(document._id))
        # (duplicate detection via a num_results=1 cosine query is disabled)
        self.lsh.index(dense_vector, extra_data=extra)  # extra MUST be hashable

    # takes a document and returns database ids of similar documents
    # uses cosine function to determine similarity
    def getSimilarDocuments(self, document, num_docs=7):
        """Return the stored articles most similar to ``document``.

        ``document`` is either a ``Document`` (its ``vectors["LSA"]`` is
        used) or a sparse LSA vector given as ``(dim, value)`` pairs. Each
        hit is loaded from ``holist.articles`` and its ``lsa`` field is
        converted to dense form. Hits that serialize to the same JSON are
        returned once.
        """
        if isinstance(document, Document):
            lsa_vector = document.vectors["LSA"]
        else:
            lsa_vector = document
        dense_vector = self._sparseToDenseConverter(lsa_vector)
        client = getDatabaseConnection()
        resultSet = set()
        results = []
        for result in self.lsh.query(dense_vector, num_results=num_docs, distance_func="cosine"):
            # example:
            # [
            #   (((1, 2, 3), "{'extra1':'data'}"), 0),
            #   (((1, 1, 3), "{'extra':'data'}"), 1)
            # ]
            # result[0] is the stored "(vector, extra)" string; the extra is
            # itself a quoted string, hence the double literal_eval.
            extra = ast.literal_eval(ast.literal_eval(result[0])[1])
            # next() works on Python 2 and 3; as before, StopIteration
            # propagates if the article is no longer in the database.
            clientDoc = bsonToClientBson(next(client.holist.articles.find({"_id": extra})))
            clientDoc['lsa'] = self._sparseToDenseConverter(clientDoc['lsa'])
            jsonstr = json.dumps(clientDoc)
            if jsonstr not in resultSet:
                resultSet.add(jsonstr)
                results.append(clientDoc)
        ln.debug("retrieved %s documents.", len(results))
        return results

    # converts a vector in sparse format to a vector in dense format
    def _sparseToDenseConverter(self, sparseVector):
        """Return ``sparseVector`` as a dense list ordered by dimension.

        ``sparseVector`` is an iterable of ``(dim, value)`` pairs. Dimensions
        ``0 .. NUM_TOPICS-1`` that are not mentioned are filled with 0. Any
        dimension outside that range is kept (as before), so the result is
        then longer than ``NUM_TOPICS``.
        """
        dense = dict.fromkeys(range(NUM_TOPICS), 0)
        for dim, val in sparseVector:
            dense[dim] = val
        # Emit values in explicit dimension order; the original relied on
        # dict iteration order, which Python 2 does not guarantee.
        return [dense[dim] for dim in sorted(dense)]
#vector = np.matrix(onedarray) big_array=[] image_number=[] #hist = cv2.calcHist([img],[0],None,[256],[0,256]) #hist,bins = np.histogram(pixel.ravel(),256,[0,256]) lsh = LSHash(3, 255) for x in range(1, 100000): img = Image.open(dataset+str(x)+fileext) pixel = np.array(img) #onedarray = pixel.ravel() hist,bins = np.histogram(pixel.ravel(),256,[0,256]) listing=list(hist[0:255]) big_array.append(listing) lsh.index(listing) input_array=np.array(big_array) img = Image.open(queryset+"10"+fileext) pixel = np.array(img) #onedarray = pixel.ravel() hist,bins = np.histogram(pixel.ravel(),256,[0,256]) listing=list(hist[0:255]) k=lsh.query(listing,distance_func="l1norm") vector = np.matrix(k) length=len(k) if length > 0: for output in range(length): if (k[output][1] < 800): test=np.array(k[output][0])
fileContainer.write(str( '\nTime after loading trajectory dataset : '+ time.asctime( time.localtime(time.time()) )))
#------------------------------------------------------------------------------
# indexing all trajectories
# Each indexed item is a sliding window of consecutive trajectory points,
# flattened to [x0, y0, x1, y1, ...]. queryDictionary maps the key returned
# by newLsh.index() to the set of trajectory indices that produced it.
print('\nStarting the indexing procedure ...')
fileContainer.write(str( 'Time before indixing all trajectories points : '+ time.asctime( time.localtime(time.time()) ) ))
queryDictionary = {}
numberOfPoints = 0
# Number of follow-up points appended to each starting point. Floor division
# keeps this an int on Python 3 as well.
followingPoints = (dimensionNumber - 2) // 2
for i, trajectory in enumerate(trajectoriesContainer):
    # Explicit window count: the original slice trajectory[:-followingPoints]
    # is empty when followingPoints == 0 (dimensionNumber == 2), so nothing
    # was indexed in that case.
    for j in range(max(0, len(trajectory) - followingPoints)):
        involvedPoints = []
        for s in range(j, j + followingPoints + 1):
            involvedPoints.append(trajectory[s][0])
            involvedPoints.append(trajectory[s][1])
        # named bucketKey so the builtin hash() is no longer shadowed
        bucketKey = newLsh.index((involvedPoints), loadF=numberRadius)
        # setdefault replaces the Python-2-only has_key() branch
        queryDictionary.setdefault(bucketKey, set()).add(i)
        numberOfPoints += 1
print(len(trajectoriesContainer))
fileContainer.write('\nThe following is the hash table used for querying or clustering ...')
fileContainer.write(str(queryDictionary))
fileContainer.write('\n')
fileContainer.write(str( 'The number of generated buckets is : '+ str(len(queryDictionary.keys()))))
fileContainer.write('\n')
fileContainer.write(str( 'Time after indixing all trajectories points : '+ time.asctime( time.localtime(time.time()) ) ))
fileContainer.write('\n')
fileContainer.write(str( 'The number of point have beenindexed is : '+ str(numberOfPoints)))
def run():
    """Continuously cluster incoming tweet files and publish cluster reports.

    Runs forever. Each round collects the files under ``/tmp/tweets_tmp/``
    that are not yet listed in ``/tmp/completed_tmp.txt`` and reads them as
    JSON lines. Every ``size`` tweets, the batch is vectorized with TF-IDF
    and each tweet is assigned either to the cluster of its approximate
    nearest neighbour (score at most ``cos_threshold``) or to a new cluster.
    After all files of the round, cluster sizes, the largest clusters and a
    size-ratio ranking are written under ``/home/y/share/htdocs/``.

    NOTE(review): this code reached review with its line breaks and
    indentation lost; the nesting below is reconstructed from statement
    order and variable usage - confirm against the original file.
    NOTE(review): Python 2 only (print statements, time.clock(), list.sort on keys()).
    """
    initial = True  # True until the first batch has been vectorized
    size = 200000  # number of tweets per batch
    tweet_ids = []
    tweet_text = []
    counter = 0  # total number of tweets read so far
    num_hashtables = 4  ## recompute the random vectors if this is changed
    dimension = 5000000  ## recompute the random vectors if this is changed
    hash_size = 13  ## length of the LSHash of the tweets
    bucket_size = 100  ## size of the queue for each hash in the hash tables
    comparisons = 50  ## upper bound on the number of comparisons (dot product) to find the nearest neighbor
    cos_threshold = .7  ## threshold for the similarity of two tweets
    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df = 1, smooth_idf=True, stop_words='english', min_dict_size = dimension)
    ## initialize the hash tables, specify the hash size, number of hash tables and the queue size
    lsh = LSHash(hash_size = hash_size, input_dim = dimension, num_hashtables=num_hashtables, max_queue_size= bucket_size)
    clusters = {}  ## maintain the clusters (cluster id -> list of tweet ids)
    num_clusters = 0
    # names of the files that were already processed (one per line)
    # NOTE(review): this file handle is rebound and never explicitly closed.
    completed = open('/tmp/completed_tmp.txt')
    completed = completed.readlines()
    completed = set([x.replace('\n', '') for x in completed])
    while(True):
        # NOTE(review): this reset runs at the top of every round, so the
        # sizes saved at the bottom of the previous round are discarded
        # before the ratio computation can use them - confirm intent.
        clusters_size_prev = {}
        files = []
        for root, dirs, filenames in os.walk('/tmp/tweets_tmp/'):
            for fname in filenames:
                if fname != '.DS_Store':
                    files.append(fname)
        files = set(files)
        files = files - completed
        if len(files) == 0:
            # nothing new: wait 3000 seconds and look again
            print 'sleeping'
            time.sleep(3000)
            print 'checking'
            continue
        #print files
        tweets_dump = {}  # tweet id (as string) -> original tweet text
        tweet_ids = []
        tweet_text = []
        time_sleep = time.time()  # start of this round, used for the hourly pacing below
        for fn in files:
            print fn
            time_tmp2 = time.time()
            with open('/tmp/tweets_tmp/' + fn) as infile:
                for line in infile:
                    ## load one batch of tweets at a time
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['filtered_text'])
                    tweets_dump[str(tweet['id'])] = tweet['text']
                    counter = counter + 1
                    t2 = 0
                    if counter%size == 0:
                        t1 = time.clock()
                        ## X contains the tf-idf score of the tweets in the "sparse row matrix" format
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        #print X.get_shape()
                        #print len(vectorizer.vocabulary_)
                        ## if the total number of keywords exceed the pre-specified dimension, raise error
                        # NOTE(review): get_shape()[0] is the row count (it bounds the
                        # getrow() loop below), not the number of keywords - confirm.
                        # NOTE(review): a bare raise with no active exception does not
                        # raise a meaningful error type.
                        if X.get_shape()[0] > dimension:
                            print X.get_shape()
                            print "dimension exceeded"
                            raise
                        for i in range(X.get_shape()[0]):
                            temp_tweet = X.getrow(i)
                            ## query for the nearest neighbor from the lshash tables
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
                            c = 2  # default score, above any threshold, used when nn is None
                            scase = False  # never set to True (the assignment below is commented out)
                            ## if the nearest neighbor is not null and its score is within the threshold, add the tweet to the respective cluster
                            cluster_id = -1
                            if nn is not None:
                                # (b, d) is the extra_data stored at index time: (tweet id, cluster id)
                                ((a, (b,d)),c) = nn
                                if c <= cos_threshold:
                                    cluster_id = d
                                    clusters.setdefault(d,[]).append(tweet_ids[i])
                                #else:
                                #    scase = True
                            ## else, linearly search through the previous 2000 + i tweets to find the nearest neighbor
                            # (no linear search is performed in this variant; a new cluster is opened instead)
                            """ code to linearly search through the tweets"""
                            if (c > cos_threshold or nn is None or scase):
                                cluster_id = num_clusters
                                clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                num_clusters = num_clusters + 1
                            ### index the tweet into the hash tables
                            lsh.index(input_point = temp_tweet, extra_data = tuple([tweet_ids[i], cluster_id]))
                        # batch finished: start collecting the next one
                        initial = False
                        tweet_ids = []
                        tweet_text = []
                        #print counter
                        #print time.clock() - t1
                        f2 = open('time.txt','a')
                        f2.write(str(time.clock()-t1) + '\n')
                        f2.close()
                    # disabled periodic snapshot, kept as an inert string literal
                    """ if counter%100000==0: f2 = open('result.txt', 'w') f2.write(json.dumps(clusters) + "\n") f3 = open('vocab.txt', 'w') f4 = open('vectorizer.txt', 'w') f3.write(json.dumps(vectorizer.vocabulary_) + "\n") f4.write(json.dumps(vectorizer.idf_) + "\n") #print clusters #print vectorizer.vocabulary_ f2.close() f3.close() f4.close() """
            print 'done'
            print counter
            print str(time.time() - time_tmp2)
            # record the file as processed, on disk and in memory
            f = open('/tmp/completed_tmp.txt', 'a')
            f.write(fn + '\n')
            f.close()
            completed.add(fn)
        print "all done"
        # publish the reports into a directory named after the current timestamp and into "current"
        time_temp = time.time()
        if not os.path.exists('/home/y/share/htdocs/clusters/' + str(time_temp)):
            os.makedirs('/home/y/share/htdocs/clusters/' + str(time_temp))
        if not os.path.exists('/home/y/share/htdocs/clusters/current'):
            os.makedirs('/home/y/share/htdocs/clusters/current')
        clusters_size = {}
        for x in clusters:
            clusters_size[x] = len(clusters[x])
        f = open('/home/y/share/htdocs/clusters/' + str(time_temp) + '/sizes.txt', 'w')
        f.write(json.dumps(clusters_size))
        f.close()
        f = open('/home/y/share/htdocs/clusters/current/sizes.txt', 'w')
        f.write(json.dumps(clusters_size))
        f.close()
        # cluster ids ordered by decreasing size, keeping clusters with at least 10 tweets
        cls = clusters_size.keys()
        cls.sort(key = lambda x : -1 * clusters_size[x])
        cl = []
        for x in cls:
            if clusters_size[x] >=10:
                cl.append(x)
        arr = []
        for i in range(len(cl)):
            write_clusters(i, cl, clusters, tweets_dump, time_temp, '/home/y/share/htdocs/clusters')
            arr.append(cl[i])
        f = open('/home/y/share/htdocs/clusters/' + str(time_temp) + '/list.txt', 'w')
        f.write(json.dumps(arr))
        f.close()
        f = open('/home/y/share/htdocs/clusters/current/list.txt', 'w')
        f.write(json.dumps(arr))
        f.close()
        f = open('/home/y/share/htdocs/clusters/list.txt', 'a')
        f.write(str(time_temp) + '\n')
        f.close()
        if not os.path.exists('/home/y/share/htdocs/ratio_clusters/' + str(time_temp)):
            os.makedirs('/home/y/share/htdocs/ratio_clusters/' + str(time_temp))
        if not os.path.exists('/home/y/share/htdocs/ratio_clusters/current'):
            os.makedirs('/home/y/share/htdocs/ratio_clusters/current')
        # ratio of the current size to the previous size (previous size taken as 1 when unknown or zero)
        ratio = {}
        for x in clusters_size:
            if clusters_size[x]>=10:
                r = 1
                if (x in clusters_size_prev and clusters_size_prev[x] != 0):
                    r = clusters_size_prev[x]
                ratio[x] = clusters_size[x]*1.0/r
        # the 300 clusters with the largest ratio
        ratio_keys = ratio.keys()
        ratio_keys.sort(key = lambda x : -1 * ratio[x])
        ratio_keys = ratio_keys[:300]
        arr = []
        for i in range(len(ratio_keys)):
            write_clusters(i, ratio_keys, clusters, tweets_dump, time_temp, '/home/y/share/htdocs/ratio_clusters')
            arr.append(ratio_keys[i])
        f = open('/home/y/share/htdocs/ratio_clusters/' + str(time_temp) + '/list.txt', 'w')
        f.write(json.dumps(arr))
        f.close()
        f = open('/home/y/share/htdocs/ratio_clusters/current/list.txt', 'w')
        f.write(json.dumps(arr))
        f.close()
        f = open('/home/y/share/htdocs/ratio_clusters/list.txt', 'a')
        f.write(str(time_temp) + '\n')
        f.close()
        # remember this round's sizes and start the next round with empty clusters
        clusters_size_prev = {}
        for x in clusters_size:
            clusters_size_prev[x] = clusters_size[x]
        clusters = {}
        # pace the rounds to at most one per hour
        time.sleep(max(0, 3600 - (time.time() - time_sleep)))
def run():
    """Cluster tweets online using TF-IDF vectors and an LSH index.

    Tweets are read as JSON lines from every file under ``processed_tweets/``
    and processed in batches of ``size``. Each tweet of a batch is assigned
    to the cluster of its approximate nearest neighbour when the score
    returned by ``lsh.arpoxNN`` is at most ``cos_threshold``; otherwise it
    opens a new cluster. The tweet is then indexed with
    ``(tweet id, cluster id)`` as extra data.

    Side effects: truncates ``output.txt``, appends one timing per batch to
    ``time.txt``, and appends snapshots to ``result.txt`` / ``vocab.txt`` /
    ``vectorizer.txt`` every 100000 tweets.

    Raises:
        ValueError: when a batch matrix has more rows than ``dimension``
            (the original used a bare ``raise`` with no active exception).

    NOTE(review): tweets left over after the last full batch are never
    clustered - confirm that this is intended.
    NOTE(review): time.clock() is kept as in the original; it no longer
    exists on Python 3.8+.
    """
    initial = True  # True until the first batch has been vectorized
    size = 2000  # number of tweets per batch
    tweet_ids = []
    tweet_text = []
    counter = 0  # total number of tweets read so far
    num_hashtables = 5  ## recompute the random vectors if this is changed
    dimension = 5000000  ## recompute the random vectors if this is changed
    hash_size = 13  ## length of the LSHash of the tweets
    bucket_size = 100  ## size of the queue for each hash in the hash tables
    comparisons = 50  ## upper bound on the number of comparisons (dot product) to find the nearest neighbor
    cos_threshold = .5  ## threshold for the similarity of two tweets

    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df=1, smooth_idf=True, stop_words='english', min_dict_size=dimension)
    ## initialize the hash tables, specify the hash size, number of hash tables and the queue size
    lsh = LSHash(hash_size=hash_size, input_dim=dimension, num_hashtables=num_hashtables, max_queue_size=bucket_size)

    clusters = {}  ## maintain the clusters (cluster id -> list of tweet ids)
    num_clusters = 0

    # The original opened output.txt for writing and then never used or
    # closed the handle. Keep the truncation side effect, drop the leak.
    open("output.txt", 'w').close()

    loc = "processed_tweets/"
    for root, dirs, filenames in os.walk(loc):
        for f in filenames:
            # Join with root (not loc) so files in sub-directories resolve;
            # for files directly under loc the path is unchanged.
            with open(os.path.join(root, f)) as infile:
                for line in infile:
                    ## load 2000 tweets at a time
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['text'])
                    counter = counter + 1

                    if counter % size == 0:
                        t1 = time.clock()
                        ## X contains the tf-idf score of the tweets in the "sparse row matrix" format
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        print(X.get_shape())
                        print(len(vectorizer.vocabulary_))
                        ## if the total number of keywords exceed the pre-specified dimension, raise error
                        # NOTE(review): get_shape()[0] is the row count, not the
                        # number of keywords - confirm which one was meant.
                        if X.get_shape()[0] > dimension:
                            print(X.get_shape())
                            print("dimension exceeded")
                            raise ValueError("dimension exceeded")

                        for i in range(X.get_shape()[0]):
                            temp_tweet = X.getrow(i)
                            ## query for the nearest neighbor from the lshash tables
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
                            c = 2  # default score, above any threshold, used when nn is None
                            cluster_id = -1
                            if nn is not None:
                                # (b, d) is the extra data stored at index time: (tweet id, cluster id)
                                ((a, (b, d)), c) = nn
                                if c <= cos_threshold:
                                    cluster_id = d
                                    clusters.setdefault(d, []).append(tweet_ids[i])
                            if c > cos_threshold or nn is None:
                                # no close enough neighbour: open a new cluster
                                cluster_id = num_clusters
                                clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                num_clusters = num_clusters + 1
                            ### index the tweet into the hash tables
                            lsh.index(input_point=temp_tweet, extra_data=(tweet_ids[i], cluster_id))

                        # batch finished: start collecting the next one
                        initial = False
                        tweet_ids = []
                        tweet_text = []
                        print(counter)
                        print(time.clock() - t1)
                        with open('time.txt', 'a') as f2:
                            f2.write(str(time.clock() - t1) + '\n')

                    # periodic snapshot (append mode) every 100000 tweets
                    if counter % 100000 == 0:
                        with open('result.txt', 'a') as f2:
                            f2.write(json.dumps(clusters) + "\n")
                        with open('vocab.txt', 'a') as f3:
                            f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
                        with open('vectorizer.txt', 'a') as f4:
                            f4.write(json.dumps(vectorizer.idf_) + "\n")
# Extract a colour GIST descriptor and a class label for every file, then
# index the first half of the samples in an LSHash.
print('extracting features ...')
t1 = time.time()
for i in files:
    # label = 1-based position in CHARS of the second-to-last dot-separated part of the file name
    responses.append(CHARS.index(i.split(".")[-2].decode("utf8")) + 1)
    samples.append(leargist.color_gist(Image.open(i), nblocks=BLOCKS, orientations=ORIENTATIONS))
t2 = time.time()
print('done, %d file took %0.3f ms' % (len(files), (t2 - t1) * 1000.0))

train_n = int(len(files)*0.5)  # first half of the files is used for indexing
lsh = LSHash(3, DIMENSION, num_hashtables=5)
print("indexing ...")
t1 = time.time()
for i, sample in enumerate(samples[:train_n]):
    # responses[i] is the same element as responses[:train_n][i] for every
    # i in this loop, without copying the list on each iteration.
    lsh.index(sample, extra_data=responses[i])
t2 = time.time()
print("done. %d files took %0.3f ms" % (train_n, (t2 - t1) * 1000.0))

################## test ##########################
print("testing ...")
#correct = 0
#total = 0
#t1 = time.time()
#for i, sample in enumerate(samples[:train_n]):
#    total = total + 1
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 04 15:29:56 2015

@author: MaGesh
"""
import numpy as np
from scipy.ndimage import imread
from lshash import LSHash

# Directories holding the numbered .bmp files (1.bmp, 2.bmp, ...).
DATASET_DIR = "F:\\Fall 2015\\Data Mining\\Programming Assignments\\PA5\\data\\dataset\\"
QUERY_DIR = "F:\\Fall 2015\\Data Mining\\Programming Assignments\\PA5\\data\\Query\\"

# 20 is the hash size in bits (so up to 2**20 distinct buckets, not 20 buckets);
# 32*32 is the input dimension, one value per pixel.
lsh = LSHash(20, 32*32)
resultSet = []  # one query result list per query image, in order

# NOTE(review): scipy.ndimage.imread is deprecated/removed in newer SciPy
# releases; it is kept because no replacement is imported by this script.
for i in range(1, 100001):
    print(i)
    X = DATASET_DIR + str(i) + ".bmp"
    im = imread(X, flatten=True)  # flatten=True: converting to grey scale
    single_array = im.flatten()  # 2-D image -> 1-D vector
    lsh.index(single_array)  # hash the vector into its bucket

for i in range(1, 11):
    # same output as the Python 2 statement `print i,"for querying"`
    print("%d for querying" % i)
    X1 = QUERY_DIR + str(i) + ".bmp"
    imQ = imread(X1, flatten=True)  # converting to grey scale
    imFlatten = imQ.flatten()
    value = lsh.query(imFlatten, distance_func="euclidean")  # querying the nearest points
    resultSet.append(value)
# Convert the loaded data into one list of (x, y) tuples per trajectory.
# NOTE(review): mat.values()[0] picks whichever value the dict lists first,
# which depends on dict ordering (and only works on Python 2) - confirm the
# intended key and look it up by name.
rawTrajectories = mat.values()[0]  # looked up once instead of for every point
trajectoriesContainer = []
for i in range(datasetSize):
    xs = rawTrajectories[i][0][0]
    ys = rawTrajectories[i][0][1]
    trajectoriesContainer.append([(xs[j], ys[j]) for j in range(len(xs))])
allPoints = []
fileContainer.write(str( '\nTime after loading trajectory dataset : '+ time.asctime( time.localtime(time.time()) )))
#------------------------------------------------------------------------------
# indexing all trajectories
# queryDictionary maps the key returned by newLsh.index() to the set of
# trajectory indices that have at least one point under that key.
print('\nStarting the indexing procedure ...')
fileContainer.write(str( 'Time before indixing all trajectories points : '+ time.asctime( time.localtime(time.time()) ) ))
queryDictionary = {}
numberOfPoints = 0
for i, trajectory in enumerate(trajectoriesContainer):
    for point in trajectory:
        # named bucketKey so the builtin hash() is no longer shadowed
        bucketKey = newLsh.index(point, loadF=numberRadius)
        # setdefault replaces the Python-2-only has_key() branch
        queryDictionary.setdefault(bucketKey, set()).add(i)
        numberOfPoints += 1
fileContainer.write('\nThe following is the hash table used for querying or clustering ...')
fileContainer.write(str(queryDictionary))
fileContainer.write('\n')
fileContainer.write(str( 'The number of generated buckets is : '+ str(len(queryDictionary.keys()))))
fileContainer.write('\n')
fileContainer.write(str( 'Time after indixing all trajectories points : '+ time.asctime( time.localtime(time.time()) ) ))
fileContainer.write('\n')
fileContainer.write(str( 'The number of point have beenindexed is : '+ str(numberOfPoints)))
# Index a single 8-dimensional vector and print the hash key it was stored under.
from lshash import LSHash
import numpy as np

s = LSHash(10, 8)  # 10-bit hash keys, 8-dimensional input
s.index([1, 2, 3, 4, 5, 6, 7, 8])
# list(...) makes the first key reachable whether keys() returns a list
# (Python 2) or a non-indexable view (Python 3).
print(list(s.hash_tables[0].keys())[0])
def create_hash2img():
    """Index every image's GIST vector in an LSHash and return the index.

    The vectors come from ``get_img2gist()`` (a mapping of image name to
    GIST vector) and are indexed with the image name as extra data. Progress
    is written to stdout as ``count/total``, and the number of used buckets
    of the first hash table is printed at the end.

    Returns:
        The populated ``LSHash`` instance.
    """
    img2gist = get_img2gist()
    lsh = LSHash(hash_len, 960, storage_config=redis_config, matrices_filename=matrices_file)
    total_num = len(img2gist)
    # items() works on Python 2 and 3 (iteritems() is Python-2-only);
    # enumerate(..., 1) replaces the hand-maintained counter.
    for count, (name, gist_v) in enumerate(img2gist.items(), 1):
        lsh.index(gist_v, name)
        # '\r' returns to the start of the line so the counter overwrites itself
        sys.stdout.write('%d/%d\r ' % (count, total_num))
        sys.stdout.flush()
    # used buckets out of the 2 ** hash_len possible keys
    print('bucket ratio: %d/%d' % (len(lsh.hash_tables[0].keys()), 2 ** hash_len))
    return lsh