def test():
    import utils
    trueIds, testSet = utils.load_test_set('fc7', 'raw', 0)
    lsh = LSHash(128, np.shape(testSet[0])[0],
                 matrices_filename='lsh_planes.data.npz', overwrite=True)
    for idx, input_point in enumerate(testSet):
        hash_value = lsh._hash(lsh.uniform_planes[0], input_point.tolist())
        print hash_value
        lsh.index(input_point, idx)
    print lsh.query(testSet[3], 3)
    return None
def test_lshash_extra_val(self):
    lsh = LSHash(self.hash_size, self.input_dim, 1,
                 storage_config={'dict': None})
    for i in range(self.nb_elements):
        lsh.index(list(self.els[i]), self.el_names[i])
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            self.assertIn(el[0], self.els)
            self.assertIn(el[1], self.el_names)
    for el in self.els:
        # res is a list, so we need to select the first entry only
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        self.assertIn(el_v, self.els)
        self.assertIn(el_name, self.el_names)
        self.assertEqual(el_dist, 0)
    del lsh
def plot_similar_tats_idx(idx: int, feature_dict: dict, lsh_variable: LSHash,
                          n_items: int = 6,
                          distance_func: str = 'hamming') -> plt.Figure:
    """Takes an input index for the training set and plots the closest
    matching tattoos to that input tattoo.

    Args:
        idx : index of a tattoo in the training set
        feature_dict : wraps both image locations and feature vectors at the
            output of the cnn before the final layer.
        lsh_variable : trained lsh model to query the input image
        n_items : number of items to return
        distance_func : The distance function. Currently it needs to be one of
            ("hamming", "euclidean", "true_euclidean", "centred_euclidean",
            "cosine", "l1norm"). By default "hamming" will be used.

    Returns:
        Matplotlib grid plot of the index image first and n other similar
        images.
    """
    response = lsh_variable.query(
        feature_dict[list(feature_dict.keys())[idx]].flatten(),
        num_results=n_items + 1,
        distance_func=distance_func)
    return plot_similar_tats_query(response, n_items=n_items + 1,
                                   distance_func=distance_func)
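# A hedged usage sketch for plot_similar_tats_idx. `load_tattoo_features` is a
# hypothetical loader, not part of the original code; any
# {image_path: feature_vector} dict with numpy vectors works the same way.
from lshash import LSHash

feature_dict = load_tattoo_features()  # assumed: {path: np.ndarray}
first_vec = feature_dict[list(feature_dict.keys())[0]].flatten()
lsh = LSHash(hash_size=12, input_dim=first_vec.shape[0])
for path, vec in feature_dict.items():
    lsh.index(vec.flatten(), extra_data=path)
fig = plot_similar_tats_idx(0, feature_dict, lsh, n_items=6)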
def test_lshash_redis():
    """ Test external lshash module """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    sr = StrictRedis(**config['redis'])
    sr.flushdb()
    lsh = LSHash(6, 8, 1, config)
    for i in xrange(num_elements):
        lsh.index(list(els[i]))
        lsh.index(list(els[i]))  # multiple insertions should be prevented by the library
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert itms.count(itm) == 1  # have multiple insertions been prevented?
            assert el in els
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        el_v, el_dist = res
        assert el_v in els
        assert el_dist == 0
    del lsh
    sr.flushdb()
def test_lshash_redis_extra_val():
    """ Test external lshash module """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    sr = StrictRedis(**config['redis'])
    sr.flushdb()
    lsh = LSHash(6, 8, 1, config)
    for i in xrange(num_elements):
        lsh.index(list(els[i]), el_names[i])
        lsh.index(list(els[i]), el_names[i])  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el[0] in els
            assert el[1] in el_names
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in els
        assert el_name in el_names
        assert el_dist == 0
    del lsh
    sr.flushdb()
def knn(data_array, data, hash_size_input, data_shape):
    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])
    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])
    # get a random pos
    vipno_pos = rd.randint(0, data_shape[1])
    # calculate and output
    for k in [1, 2, 3, 4, 5]:
        print 'hash size: %d' % hash_size_input
        print 'value k: %d' % k
        print 'target vipno: %d' % data.columns[vipno_pos]
        result = []
        for res in lsh.query(data_array[:, vipno_pos], num_results=k + 1,
                             distance_func='euclidean'):
            result.append(res[0][1])
        print 'results: '
        print result[1:]
def test_lshash_redis_extra_val(self):
    """ Test external lshash module """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    lsh = LSHash(self.hash_size, self.input_dim, 1, config)
    for i in range(self.nb_elements):
        lsh.index(list(self.els[i]), self.el_names[i])
        lsh.index(list(self.els[i]), self.el_names[i])  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el[0] in self.els
            assert el[1] in self.el_names
    for el in self.els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in self.els
        assert el_name in self.el_names
        assert el_dist == 0
    del lsh
def traceLSHash(queryName, hashSize):
    # queryName = "hamming_query_12_3"
    # indices of the trajectories to run the hash query on
    indexList = [14, 249, 479, 689, 899]
    XYMatrix = DateTransform()
    resultList = []
    nearList = []
    lsh = LSHash(hashSize, 44107)
    tid = 1
    for traceList in XYMatrix:
        lsh.index(input_point=traceList, extra_data=tid)
        tid += 1
    resultFile = open(queryName + '.txt', 'w')
    for index in indexList:
        queryList = lsh.query(XYMatrix[index], distance_func="hamming")
        for result in queryList:
            resultStr = str(index + 1) + " : " + str(result[0][1]) + " " + \
                str(result[1]) + "\n"
            nearList.append(result[0][1])
            resultFile.write(resultStr)
        resultList.append(nearList)
        nearList = []
    resultFile.close()
    writeHTML(resultList, queryName, "hashQuerry")
    print resultList
def test_lshash_redis(self):
    """ Test external lshash module """
    config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}}
    lsh = LSHash(self.hash_size, self.input_dim, 1, config)
    for i in range(self.nb_elements):
        lsh.index(list(self.els[i]))
        lsh.index(list(self.els[i]))  # multiple insertions should be prevented by the library
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert itms.count(itm) == 1  # have multiple insertions been prevented?
            assert el in self.els
    for el in self.els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        el_v, el_dist = res
        assert el_v in self.els
        assert el_dist == 0
    del lsh
def write_json_lsh(hash_size, grid):
    '''
    Store the generated lsh routes in a json file.
    :param hash_size: list of hash sizes
    :param grid: the processed grid array
    :return: none
    '''
    data_lsh = {}
    for size in hash_size:
        print size
        print 'list'
        data_lsh[size] = []
        lsh = LSHash(size, 44107)
        count = 0
        for line in grid:
            lsh.index(line, extra_data=count)
            count += 1
        for id in road_id:
            roads = []
            res = lsh.query(grid[id])
            print len(res)
            for r in res:
                roads.append(pack_data(r[0][1]))
            data_lsh[size].append({id: roads})
    with open('result_lsh.json', 'w') as f:
        f.write(str(data_lsh))
def k_nn_lsh(k, word, decade_matrix, index_dict):
    index_dict = dict(map(reversed, index_dict.items()))
    num_rows = decade_matrix.get_shape()[0]
    lsh = LSHash(6, num_rows)
    for i in range(num_rows):
        print(i)
        lsh.index(decade_matrix.getrow(i).todense())
    return lsh.query(word)
def eventIdentification(dictionaryFile, corpusFile, outputFile):
    outputVector = []
    tempDict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    #print "Unique Tokens:", dictionary.__len__()
    lsh = LSHash(20, dictionary.__len__())
    index = 0
    for index in range(len(corpus)):
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        result = lsh.query(denseVector)
        #print denseVector
        # no similar tweets
        if result == []:
            #print "No Similar Tweets for: ", index
            tempDict[tuple(denseVector)] = len(outputVector)
            outputVector.append([index])
            lsh.index(denseVector)
            continue
        assigned = False
        for vector in result:
            if getDistance(vector, denseVector) == True:
                ev = tempDict[tuple(vector[0])]
                outputVector[ev].append(index)
                tempDict[tuple(denseVector)] = ev
                #for ind in range(len(outputVector)):
                #    done = False
                #    for tweetNo in outputVector[ind]:
                #        if (tweetNo == tempDict[tuple(vector[0])]):
                #            outputVector[ind].append(index)
                #            done = True
                #            break
                #    if done == True:
                #        break
                assigned = True
                break
        if assigned == False:
            tempDict[tuple(denseVector)] = len(outputVector)
            outputVector.append([index])
            lsh.index(denseVector)
    with open(outputFile, 'w') as out:
        for vector in outputVector:
            line = ""
            for index in vector:
                line += "," + str(index)
            out.write(line[1:] + "\n")
    del outputVector
    del tempDict
def Mainfunc(self, mat_addr):
    np.set_printoptions(suppress=True, precision=6, threshold=8)
    s = sio.loadmat(mat_addr)
    svec = s['FFE']
    datalen = len(svec)
    n1, n2, n3 = np.shape(svec)
    data = np.zeros((n1, 87212))
    m = 0
    for i in range(n2):
        for j in range(n3):
            if svec[:, i, j].all() != 0:
                data[:, m] = svec[:, i, j]
                m = m + 1
    # print data[:,0]
    dataves = np.transpose(data)
    modelindex = list(set(np.random.randint(1, 87212, size=10000)))
    lsh_model = LSHash(7, n1)
    for jj in modelindex:
        lsh_model.index(dataves[jj, :])
    # if you want to test the program
    starttest = 1  # start test index
    endtest = 5
    testindex = random.sample(modelindex, 1)  # SIZE IS THE NUMBER OF TEST FUNCTIONS
    test = np.zeros((len(testindex), n1))
    for i in range(len(testindex)):
        # print dataves[testindex[i],:]
        test[i, :] = dataves[testindex[i], :]
    # print len(test)
    output = open('result.txt', 'w')
    timee = open('time.txt', 'w')
    for queryi in range(len(testindex)):
        if test[queryi, :].all() != 0:
            starttime = time.time()
            Atemp = lsh_model.query(test[queryi, :], 5, 'cosine')
            print (str(Atemp[0]).split(')')[0]).replace('(', '')
            output.write((str(Atemp[0]).split(')')[0]).replace('(', '') + '\n')
            output.write((str(Atemp[1]).split(')')[0]).replace('(', '') + '\n')
            output.write((str(Atemp[2]).split(')')[0]).replace('(', '') + '\n')
            output.write((str(Atemp[3]).split(')')[0]).replace('(', '') + '\n')
            output.write((str(Atemp[4]).split(')')[0]).replace('(', '') + '\n')
            endtime = time.time()
            timee.write(str(endtime - starttime) + '\n')
            # output.write(A)
            output.write('\n')
    output.close()
    timee.close()
def lshSearch(dataBase2, test2, num):
    lsh = LSHash(30, 216)

    def CreateIndex(array):
        for item in array:
            lsh.index(item)

    CreateIndex(dataBase2)
    test2 = test2.reshape((216,))
    res = lsh.query(test2, num, distance_func='true_euclidean')
    return res
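# A minimal sketch of driving lshSearch above; the random (100, 216) database
# is made-up illustration data. The probe is taken from the database so it is
# guaranteed to land in a non-empty hash bucket.
import numpy as np

dataBase2 = np.random.rand(100, 216)
probe = dataBase2[0].reshape((216, 1))  # any shape reshapeable to (216,)
for point, dist in lshSearch(dataBase2, probe, 5):
    print(dist)  # 0.0 for the probe itself, then its bucket neighbours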
def lshTOfind(path):
    lsh = LSHash(50, 361)
    f = open('newindex.csv')
    index = csv.reader(f)
    features = []
    count = 0
    for r in index:
        features = [int(float(i)) for i in r[1:]]
        lsh.index(features)
        count += 1
    try:
        f_v = getfeatures(path)
        ans = lsh.query(f_v)
        if ans != []:
            return searchid(int(ans[0][0][360] / 10000))
    except:
        return []
def knn(data_array, data, hash_size_input, data_shape, vipno_pos, k):
    # init LSHash
    lsh = LSHash(hash_size=hash_size_input, input_dim=data_shape[0])
    # index
    for col_index in range(data_shape[1]):
        lsh.index(data_array[:, col_index], extra_data=data.columns[col_index])
    # calculate and output
    result = []
    for res in lsh.query(data_array[:, vipno_pos], num_results=k + 1,
                         distance_func='euclidean'):
        result.append(res[0][1])
    return result[1:]
class LshIndexer(Indexer):
    PARAMETERS = {'hash_size': 6,
                  'input_dim': 128,
                  'num_of_hashtables': 1,
                  'storage': {'redis': {'host': 'localhost', 'port': 6379}}}

    def initialize_store(self, parameters):
        self.store = LSHash(parameters['hash_size'],
                            parameters['input_dim'],
                            parameters['num_of_hashtables'],
                            parameters['storage'])

    def index(self, features):
        for feature in features:
            self.store.index(feature.data, feature.file_id)

    def query(self, feature, num_results=5):
        return self.store.query(feature, num_results)
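# A hedged usage sketch for LshIndexer. `Feature` is a stand-in namedtuple for
# whatever feature objects the surrounding codebase supplies (assumed to
# expose .data and .file_id, as index() above expects); this also assumes the
# Indexer base class needs no constructor arguments and that a local redis
# instance is running.
from collections import namedtuple

Feature = namedtuple('Feature', ['data', 'file_id'])

indexer = LshIndexer()
indexer.initialize_store(LshIndexer.PARAMETERS)
indexer.index([Feature([0.5] * 128, 'image_001.jpg')])
print(indexer.query([0.5] * 128, num_results=1))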
def test_lshash():
    lsh = LSHash(6, 8, 1)
    for i in xrange(num_elements):
        lsh.index(list(els[i]))
        lsh.index(list(els[i]))  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        assert itms.count(itm) == 1
        for el in itm:
            assert el in els
    for el in els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # res is a tuple containing the vector and the distance
        el_v, el_dist = res
        assert el_v in els
        assert el_dist == 0
    del lsh
def classify_nearest_neighbor_lsh(k):
    lsh = LSHash(3, 12)
    labels = load_labels()
    for genre, song_genres_ids in labels.groupby('category'):
        print('Indexing genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2)):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id), header=None)
            for val in song.values:
                lsh.index(val, extra_data=genre)
    total_count = 0
    match_count = 0
    for genre, song_genres_ids in labels.groupby('category'):
        print('Expected genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2), num_values):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id), header=None)
            genre_freqs = {}
            split_song = np.array_split(song, 5, axis=0)  # Split song into sections
            for s in split_song:
                avg_song_val = np.mean(s)  # Take average of each section
                neighbours = lsh.query(avg_song_val, num_results=k)
                for neighbour in neighbours:
                    # use a distinct name so the expected genre from the outer
                    # loop is not clobbered before the comparison below
                    neighbour_genre = neighbour[0][1]
                    genre_freqs[neighbour_genre] = genre_freqs.get(neighbour_genre, 0) + 1
            actual_genre = max(genre_freqs, key=genre_freqs.get)
            print('Predicted genre: {}'.format(actual_genre))
            total_count += 1
            if genre == actual_genre:
                match_count += 1
    print('Matched {} out of {} songs: {}%'.format(
        match_count, total_count, (match_count / total_count) * 100))
def detect_subevent(filename):
    dictionaryFile = filename + ".dict"
    corpusFile = filename + ".mm"
    outputFile = filename + ".out"
    outputVector = []
    tempDict = {}
    outputdict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, dictionary.__len__())
    index = 0
    for index in range(len(corpus)):
        #print str(index)+",",
        #print corpus[index]
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        #print getSparseVector(denseVector)
        result = lsh.query(denseVector, num_results=50, distance_func="euclidean")
        #print result
        # no similar tweets
        if result == []:
            outputdict[index] = []
            tempDict[getSparseVector(denseVector)] = index
            lsh.index(denseVector)
            #continue
        else:
            for r in result:
                if outputdict.has_key(tempDict[getSparseVector(r[0])]):
                    outputdict[tempDict[getSparseVector(r[0])]].append(index)
                    break
    #print outputdict
    with open(outputFile, 'w') as out:
        for key in outputdict.iterkeys():
            line = str(key)
            for i in outputdict[key]:
                line += ", " + str(i)
            out.write(line + "\n")
    print "Please check the output file:", outputFile
def detect_subevent(filename):
    dictionaryFile = filename + ".dict"
    corpusFile = filename + ".mm"
    outputFile = filename + ".out"
    outputVector = []
    tempDict = {}
    outputdict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, dictionary.__len__())
    index = 0
    count = 0
    for index in range(len(corpus)):
        #print str(index)+",",
        #print corpus[index]
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        #print getSparseVector(denseVector)
        result = lsh.query(denseVector, num_results=5, distance_func="cosine")
        #print result
        # no similar tweets
        count += 1
        if result == []:
            outputdict[index] = []
            tempDict[getSparseVector(denseVector)] = index
            lsh.index(denseVector)
            #continue
        else:
            for r in result:
                if outputdict.has_key(tempDict[getSparseVector(r[0])]):
                    outputdict[tempDict[getSparseVector(r[0])]].append(index)
                    break
    #print count,
    #print outputdict
    with open(outputFile, 'w') as out:
        for key in outputdict.iterkeys():
            line = str(key)
            for i in outputdict[key]:
                line += ", " + str(i)
            out.write(line + "\n")
    print "Please check the output file:", outputFile
def test_lshash(self):
    lsh = LSHash(self.hash_size, self.input_dim, 1)
    for i in range(self.nb_elements):
        lsh.index(list(self.els[i]))
        lsh.index(list(self.els[i]))  # multiple insertions
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        self.assertEqual(itms.count(itm), 1)
        for el in itm:
            self.assertIn(el, self.els)
    for el in self.els:
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # res is a tuple containing the vector and the distance
        el_v, el_dist = res
        self.assertIn(el_v, self.els)
        self.assertEqual(el_dist, 0)
    del lsh
def test_lshash_extra_val():
    lsh = LSHash(6, 8, 1)
    for i in xrange(num_elements):
        lsh.index(list(els[i]), el_names[i])
    hasht = lsh.hash_tables[0]
    itms = [hasht.get_list(k) for k in hasht.keys()]
    for itm in itms:
        for el in itm:
            assert el[0] in els
            assert el[1] in el_names
    for el in els:
        # res is a list, so we need to select the first entry only
        res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0]
        # vector and name are in the first element of the tuple res[0]
        el_v, el_name = res[0]
        # the distance is in the second element of the tuple
        el_dist = res[1]
        assert el_v in els
        assert el_name in el_names
        assert el_dist == 0
    del lsh
def subEventDetection(dictionaryFile, corpusFile, outputFile):
    outputVector = []
    tempDict = {}
    corpus = corpora.MmCorpus(corpusFile)
    dictionary = corpora.Dictionary.load(dictionaryFile)
    lsh = LSHash(30, dictionary.__len__())
    index = 0
    for index in range(len(corpus)):
        denseVector = getDenseVector(corpus[index], lsh.input_dim)
        result = lsh.query(denseVector, num_results=50, distance_func="cosine")
        # no similar tweets
        if result == []:
            outputVector.append([index])
            continue
        assigned = False
        for vector in result:
            if getDistance(vector, denseVector) == True:
                for ind in range(len(outputVector)):
                    done = False
                    for tweetNo in outputVector[ind]:
                        # tempDict is keyed by the stored vector tuple, so look
                        # up the result's vector component
                        if tweetNo == tempDict[tuple(vector[0])]:
                            outputVector[ind].append(index)
                            done = True
                            break
                    if done == True:
                        break
                assigned = True
                break
        if assigned == False:
            outputVector.append([index])
        lsh.index(denseVector)
        tempDict[tuple(denseVector)] = index
    with open(outputFile, 'w') as out:
        for vector in outputVector:
            line = ""
            for index in vector:
                line += ", " + str(index)
            out.write(line[2:] + "\n")
    print "Please check the output file:", outputFile
def lshTOfind(path):
    lsh = LSHash(10, 360)
    f = open('copyindex.csv')
    index = csv.reader(f)
    features = []
    count = 0
    for r in index:
        features = [float(i) for i in r[1:]]
        lsh.index(features[:360], features[360])
        count += 1
    try:
        f_v = getfeatures(path)
        #print f_v
        ans = lsh.query(f_v[:360], 15)
        if ans != []:
            res = []
            for i in ans:
                res.append(int(i[0][1] / 10000))
            return res  # searchid(int(ans[0][0][360]/10000))
    except:
        return []
class Searcher:
    _DIST_FUNCTIONS = ["hamming", "euclidean", "true_euclidean",
                       "centred_euclidean", "cosine", "l1norm"]
    index = None

    def __init__(self, dataset):
        self.create_index(dataset)

    def create_index(self, items, hash_size=6):
        input_dim = len(items.values()[0])
        self.index = LSHash(hash_size, input_dim)
        for key in items:
            self.index.index(items[key], extra_data=key)
        return True

    def query(self, query_item, num_results=10, distance_function='cosine'):
        if distance_function not in self._DIST_FUNCTIONS:
            raise Exception("{0} not supported".format(distance_function))
        results = self.index.query(query_item, num_results=num_results,
                                   distance_func=distance_function)
        return self.parse_results(results)

    def parse_results(self, results):
        return {x[0][1]: x[1] for x in results}
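# A minimal sketch of driving Searcher (Python 2, matching the
# items.values()[0] call above); the three 8-dimensional vectors are made-up
# illustration data.
dataset = {
    'a': [1, 2, 3, 4, 5, 6, 7, 8],
    'b': [2, 3, 4, 5, 6, 7, 8, 9],
    'c': [10, 12, 99, 1, 5, 31, 2, 3],
}
searcher = Searcher(dataset)
print searcher.query([1, 2, 3, 4, 5, 6, 7, 7], num_results=2)
# -> {key: distance, ...} for the stored vectors sharing the probe's buckets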
def filterDataset(fileIn, fileOut, fileNodes, threshold):
    '''
    Reads filteredTaxiData.txt and filters out lines that are farther than a
    threshold distance (0.1 mile) from every node in the OSM graph. A data
    entry is kept if its point lies within the threshold of at least one
    OSM node.
    '''
    # Dimension of our vector space
    lsh = LSHash(hash_size=10, input_dim=2)
    nodes = GetNodes(fileNodes)
    for node in nodes:
        v = np.array(node, dtype=float)
        lsh.index(v)
    bunch = []
    bunch_size = 5000
    count_lines_read = 0
    count_lines_written = 0
    with open(fileIn, "r") as fin, open(fileOut, "w") as fout:
        for line in fin:
            [latitude, longitude] = dataToGraph.lineToPoint(line)
            query = np.array((latitude, longitude), dtype=float)
            result = lsh.query(query, num_results=1)
            closest_node = result[0][0]
            count_lines_read += 1
            if vin((latitude, longitude), closest_node).miles < threshold:
                line = replacePointByOSMnode(line, closest_node)
                bunch.append(line)
                if len(bunch) == bunch_size:
                    fout.writelines(bunch)
                    count_lines_written += len(bunch)
                    bunch = []
                    if count_lines_written % 10 == 0:
                        print("%d written / %d read" % (count_lines_written, count_lines_read))
        fout.writelines(bunch)
        count_lines_written += len(bunch)
    print("%d lines written" % count_lines_written)
def hash_item_pic_v1(pic_folder):
    """
    This version feeds images straight through HardNet, using the whole image
    as the feature region, and writes a mapping file between each image and
    its hash code next to the image folder.

    A drawback of this approach: if an image is translated (shifted), its
    feature vector drifts accordingly.

    :param pic_folder: folder containing all the images
    :return: whether the run succeeded
    """
    try:
        # compute feature vectors for all images
        desc = HardNetDescriptor()
        print(colored("HardNet model loaded", color='blue'))
        # use LSH
        lsh = LSHash(16, 128)
        img_feature_vector = {}
        with open(pic_folder + '_item_hash.txt', 'w') as to_write:
            img_file_list = glob(os.path.join(pic_folder, '*_[0-9].jpg'))
            for m_img_file in tqdm(img_file_list, desc='training'):
                fv = desc.describle([
                    np.array(Image.open(m_img_file).convert('L').resize((32, 32))),
                ])[0]
                img_feature_vector[m_img_file] = fv
                lsh.index(fv, extra_data=m_img_file)
            for m_img_file in tqdm(img_file_list, desc='writing output'):
                res = lsh.query(img_feature_vector[m_img_file],
                                distance_func='centred_euclidean')
                # print all neighbouring images
                print(m_img_file, '|'.join(map(lambda x: x[0][1], res)))
        return True
    except Exception as e:
        print(colored("error: %s" % str(e), color='red'))
        return False
def main(argv):
    parser = argparse.ArgumentParser(prog='INDEX')
    parser.add_argument('source', help='path to the source metadata file')
    parser.add_argument('--hash-size', help='Hash size.', type=int, default=10)
    parser.add_argument('--num-tables', help='Number of tables.', type=int, default=5)
    parser.add_argument('--query-index', help='Index to use for query.', type=int, default=0)
    args = parser.parse_args(argv[1:])

    # read in the data file
    data = pandas.read_csv(args.source, sep='\t')

    # params
    k = args.hash_size   # hash size
    L = args.num_tables  # number of tables
    d = len(data['features'][0].split(','))
    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)

    # indexing
    for i in range(0, len(data)):
        lsh.index(np.asarray(data['features'][i].split(',')).astype('float64'),
                  extra_data=data['filename'][i])

    # query a vector q_vec
    response = lsh.query(
        np.asarray(data['features'][args.query_index].split(',')).astype('float64'))
    pprint(response)
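# A hypothetical invocation of the tool above, assuming a tab-separated
# metadata file with 'features' (comma-separated floats) and 'filename'
# columns; the script and file names are illustrative only:
#
#   python index_tool.py metadata.tsv --hash-size 10 --num-tables 5 --query-index 0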
class feature_comparer():
    def __init__(self, fea_dim, compare_thresh):
        self.lsh = LSHash(bit_num, fea_dim, compare_kernel_num)
        self.fv_dict = {}
        self.compare_thresh = compare_thresh

    def load(self, filename):
        f = open(filename, 'r')
        while True:
            line = f.readline()
            if not line:
                break
            fv = line.split(':')[0]
            id = line.split(':')[1]
            self.fv_dict[fv] = id
            fv_array = []
            s = fv[1:-1].split(',')
            for i in range(0, len(s)):
                fv_array.append(float(s[i]))
            self.lsh.index(fv_array)

    def insert(self, feature, id):
        self.fv_dict[str(feature)[1:-1]] = str(id)
        self.lsh.index(feature)

    def match(self, feature):
        q = self.lsh.query(feature, distance_func='cosine')
        if len(q) == 0:
            return False, -1
        mindis = q[0][1]
        if mindis < self.compare_thresh:
            return True, self.fv_dict[str(q[0][0])[1:-1]]
        else:
            return False, -1
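# A hedged usage sketch for feature_comparer; bit_num and compare_kernel_num
# are module-level globals in the original, so they are set here only to make
# the example self-contained.
bit_num, compare_kernel_num = 16, 1

comparer = feature_comparer(fea_dim=4, compare_thresh=0.2)
comparer.insert([0.1, 0.2, 0.3, 0.4], 42)
matched, matched_id = comparer.match([0.1, 0.2, 0.3, 0.4])
print(matched, matched_id)  # expect (True, '42'): an exact match has cosine distance 0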
from __future__ import print_function
from __future__ import division

from scipy.spatial.distance import cosine
from tqdm import tqdm
import numpy
from lshash import LSHash
import time

start = time.time()
lsh = LSHash(8, 300)
sample_word_embeds = []
for i in tqdm(xrange(20000)):
    word_embed = numpy.random.rand(300)
    lsh.index(word_embed)
    if i % 500 == 0:
        sample_word_embeds.append(word_embed)
print("Indexing takes {} seconds".format(time.time() - start))

start = time.time()
for word_embed in sample_word_embeds:
    print('-' * 80)
    results = lsh.query(word_embed, num_results=None, distance_func='cosine')
    print("Num result: {}".format(len(results)))
    print('Nearest neighbor cosine distance:')
    print(" {} | {}".format(results[1][1], cosine(results[1][0], word_embed)))
print('Query takes average {} seconds'.format(
    (time.time() - start) / len(sample_word_embeds)))
def vectorize(string):
    vec = numpy.zeros(25, dtype=numpy.int)
    for i in range(len(string)):
        vec[i] = ord(string[i])
    #print vec
    return vec


def decode(vec):
    vec = [unichr(int(vec[i])) for i in range(len(vec))]
    s = ''
    s = s.encode('utf-8', 'ignore')
    for i in range(len(vec)):
        if vec[i] != '\x00':
            s = s + vec[i]
    return s


lsh = LSHash(1, 25, storage_config={'dict': '9'},
             matrices_filename='../advs/THE FIVE ORANGE PIPS.npz')
f = open('../advs/THE FIVE ORANGE PIPS.tok')
tok = pickle.load(f)
for word in tok:
    lsh.index(vectorize(word))
res = lsh.query(vectorize('orang'), num_results=3, distance_func='l1norm')
print len(res)
print [decode(r[0]) for r in res]
for i, sample in enumerate(samples[train_n:]):
    total = total + 1
    #rs = lsh.query(get_img(fn), num_results=1, distance_func="cosine")            # test rate: 91.326531, 196 files took 52901.431 ms
    #rs = lsh.query(get_img(fn), num_results=1, distance_func="l1norm")            # test rate: 91.326531, 196 files took 35271.345 ms
    #rs = lsh.query(get_img(fn), num_results=1, distance_func="euclidean")         # test rate: 90.816327, 196 files took 24904.888 ms
    #rs = lsh.query(get_img(fn), num_results=1, distance_func="true_euclidean")    # test rate: 89.795918, 196 files took 17713.646 ms
    #rs = lsh.query(get_img(fn), num_results=1, distance_func="centred_euclidean") # test rate: 52.040816, 196 files took 9000.577 ms

    # BLOCKS = 1, ORIENTATIONS = (8, 8, 3), DIMENSION = 57,   test rate: 89.285714, 196 files took 9997.003 ms
    # BLOCKS = 2, ORIENTATIONS = (8, 8, 3), DIMENSION = 228,  test rate: 91.326531, 196 files took 17227.878 ms
    # BLOCKS = 3, ORIENTATIONS = (8, 8, 3), DIMENSION = 513,  test rate: 98.469388, 196 files took 64944.190 ms
    # BLOCKS = 4, ORIENTATIONS = (8, 8, 4), DIMENSION = 960,  test rate: 95.408163, 196 files took 47667.006 ms
    # BLOCKS = 5, ORIENTATIONS = (8, 8, 3), DIMENSION = 1425, test rate: 93.367347, 196 files took 71029.642 ms
    #rs = lsh.query(leargist.color_gist(Image.open(fn), nblocks=BLOCKS, orientations=ORIENTATIONS), num_results=1, distance_func="l1norm")

    rs = lsh.query(sample, num_results=1, distance_func=DISTANCE_FUNC)
    if rs and rs[0][0][1] == responses[train_n:][i]:
        correct = correct + 1
    # if rs:
    #     rs = [r[0][1] for r in rs]
    #     try:
    #         idx = rs.index(responses[train_n:][i])
    #     except ValueError:
    #         idx = -1
    #     if idx != -1:
    #         correct = correct + 1
    #else:
    #    print CHARS[rs[0][0][1]], " => ", CHARS[responses[train_n:][i]]

t2 = time.time()
print "test rate: %f, %d files took %0.3f ms" % (correct / float(total) * 100, total, (t2 - t1) * 1000.0)
# Bloom filter
from pybloom import BloomFilter
from random import randrange
import numpy.random as nprnd

#### How does it work for a range of numbers?
f = BloomFilter(capacity=10000, error_rate=0.001)
[f.add(x) for x in range(10000)]
hits = 0
for i in range(10000):
    #print(i in f)
    hits = hits + (i in f)
print("Accuracy for range of numbers", (hits / 10000) * 100)

#### How does it work for random numbers?
f = BloomFilter(capacity=10000, error_rate=0.001)
randomNumbers = nprnd.randint(10000000, size=10000)
[f.add(x) for x in randomNumbers]
hits = 0
for i in range(10000):
    #print(i in f)
    hits = hits + (i in f)
print("Accuracy for random numbers", (hits / 10000) * 100)

# Locality-sensitive hashing
from lshash import LSHash

lsh = LSHash(6, 8)
lsh.index([1, 2, 3, 4, 5, 6, 7, 8])
# NOTE: the original queried [6.8], a 1-dimensional point, which cannot be
# hashed against the 8-dimensional planes; the probe must match input_dim=8.
lsh.query([1, 2, 3, 4, 5, 6, 7, 8], num_results=None, distance_func="euclidean")
def Mainfunc(self, mat_addr, base, result_folder, binary_file):
    # all binary_func_name entries in the base data
    Total_binary_func = []  # binary:function#
    # np.set_printoptions(suppress=True, precision=6, threshold=8)
    s = sio.loadmat(mat_addr)
    svec = s['FFE']
    datalen = len(svec)
    n1, n2, n3 = np.shape(svec)
    test_dict = {
        'core': [0, 12], 'curl': [48, 60], 'libgmp': [60, 72],
        'busybox': [72, 84], 'openssl': [84, 96], 'sqlite': [96, 108]
    }
    compareDict = {
        'core_arm_o0': 4, 'core_arm_o1': 5, 'core_arm_o2': 6, 'core_arm_o3': 7,
        'curl_arm_o0': 52, 'curl_arm_o1': 53, 'curl_arm_o2': 54, 'curl_arm_o3': 55,
        'libgmp.so.10.3.2_arm_O0': 64, 'libgmp.so.10.3.2_arm_O1': 65,
        'libgmp.so.10.3.2_arm_O2': 66, 'libgmp.so.10.3.2_arm_O3': 67,
        'busybox_arm_o0': 72, 'busybox_arm_o1': 73, 'busybox_arm_o2': 74, 'busybox_arm_o3': 75,
        'openssl_arm_o0': 84, 'openssl_arm_o1': 85, 'openssl_arm_o2': 86, 'openssl_arm_o3': 87,
        'sqlite_arm_o0': 96, 'sqlite_arm_o1': 97, 'sqlite_arm_o2': 98, 'sqlite_arm_o3': 99,
    }
    FUNCTIONNUMBER = {
        'coreutils_dir_X86_O0': 290, 'coreutils_dir_X86_O1': 239, 'coreutils_dir_X86_O2': 291, 'coreutils_dir_X86_O3': 255,
        'coreutils_dir_arm_O0': 451, 'coreutils_dir_arm_O1': 368, 'coreutils_dir_arm_O2': 377, 'coreutils_dir_arm_O3': 334,
        'coreutils_dir_mips_O0': 306, 'coreutils_dir_mips_O1': 247, 'coreutils_dir_mips_O2': 242, 'coreutils_dir_mips_O3': 244,
        'coreutils_du_X86_O0': 237, 'coreutils_du_X86_O1': 182, 'coreutils_du_X86_O2': 211, 'coreutils_du_X86_O3': 176,
        'coreutils_du_arm_O0': 529, 'coreutils_du_arm_O1': 393, 'coreutils_du_arm_O2': 387, 'coreutils_du_arm_O3': 329,
        'coreutils_du_mips_O0': 401, 'coreutils_du_mips_O1': 288, 'coreutils_du_mips_O2': 273, 'coreutils_du_mips_O3': 248,
        'coreutils_ls_X86_O0': 290, 'coreutils_ls_X86_O1': 239, 'coreutils_ls_X86_O2': 291, 'coreutils_ls_X86_O3': 255,
        'coreutils_ls_arm_O0': 451, 'coreutils_ls_arm_O1': 368, 'coreutils_ls_arm_O2': 377, 'coreutils_ls_arm_O3': 334,
        'coreutils_ls_mips_O0': 306, 'coreutils_ls_mips_O1': 247, 'coreutils_ls_mips_O2': 242, 'coreutils_ls_mips_O3': 244,
        'coreutils_vdir_X86_O0': 290, 'coreutils_vdir_X86_O1': 239, 'coreutils_vdir_X86_O2': 291, 'coreutils_vdir_X86_O3': 255,
        'coreutils_vdir_arm_O0': 451, 'coreutils_vdir_arm_O1': 368, 'coreutils_vdir_arm_O2': 377, 'coreutils_vdir_arm_O3': 334,
        'coreutils_vdir_mips_O0': 306, 'coreutils_vdir_mips_O1': 247, 'coreutils_vdir_mips_O2': 242, 'coreutils_vdir_mips_O3': 244,
        'curl_X86_O0': 128, 'curl_X86_O1': 102, 'curl_X86_O2': 152, 'curl_X86_O3': 134,
        'curl_arm_O0': 263, 'curl_arm_O1': 223, 'curl_arm_O2': 213, 'curl_arm_O3': 209,
        'curl_mips_O0': 130, 'curl_mips_O1': 107, 'curl_mips_O2': 169, 'curl_mips_O3': 186,
        'libgmp.so.10.3.2_X86_O0': 621, 'libgmp.so.10.3.2_X86_O1': 568, 'libgmp.so.10.3.2_X86_O2': 591, 'libgmp.so.10.3.2_X86_O3': 571,
        'libgmp.so.10.3.2_arm_O0': 971, 'libgmp.so.10.3.2_arm_O1': 876, 'libgmp.so.10.3.2_arm_O2': 854, 'libgmp.so.10.3.2_arm_O3': 844,
        'libgmp.so.10.3.2_mips_O0': 606, 'libgmp.so.10.3.2_mips_O1': 551, 'libgmp.so.10.3.2_mips_O2': 545, 'libgmp.so.10.3.2_mips_O3': 544,
        'busybox_arm_o0': 3216, 'busybox_arm_o1': 2128, 'busybox_arm_o2': 2099, 'busybox_arm_o3': 1730,
        'busybox_mips_o0': 2900, 'busybox_mips_o1': 2243, 'busybox_mips_o2': 1726, 'busybox_mips_o3': 1381,
        'busybox_x86_o0': 3196, 'busybox_x86_o1': 2390, 'busybox_x86_o2': 2542, 'busybox_x86_o3': 2045,
        'openssl_arm_o0': 1778, 'openssl_arm_o1': 1692, 'openssl_arm_o2': 1675, 'openssl_arm_o3': 1658,
        'openssl_mips_o0': 414, 'openssl_mips_o1': 333, 'openssl_mips_o2': 333, 'openssl_mips_o3': 324,
        'openssl_x86_o0': 414, 'openssl_x86_o1': 322, 'openssl_x86_o2': 350, 'openssl_x86_o3': 333,
        'sqlite_arm_o0': 2876, 'sqlite_arm_o1': 2058, 'sqlite_arm_o2': 1972, 'sqlite_arm_o3': 1805,
        'sqlite_mips_o0': 2701, 'sqlite_mips_o1': 1936, 'sqlite_mips_o2': 1830, 'sqlite_mips_o3': 1705,
        'sqlite_x86_o0': 2693, 'sqlite_x86_o1': 1931, 'sqlite_x86_o2': 1967, 'sqlite_x86_o3': 1772,
    }
    FUNCTIONNAME = []
    func_name = open(binary_file, 'r')
    func_contents = func_name.readlines()
    for func_content in func_contents:
        FUNCTIONNAME.append(func_content.split("'")[1])

    # # confirm the database offsets
    # binary_db_num = []
    # for binary in FUNCTIONNAME:
    #     sql = "select * from " + self.table + " where binary_name=" + "'" + binary + "'"
    #     self.DODB.cursor.execute(sql)
    #     rows = self.DODB.cursor.fetchall()
    #     binary_db_num.append({binary: len(rows)})
    # print(binary_db_num)
    # exit()

    # core: only applies to DIR
    imodel_name = 'openssl_arm_o3'
    imodel_BIN_name = 'openssl_arm_o3'
    imodel = compareDict[imodel_name]  # full binary name as input
    # determine the database range
    imdel_s = self.GetSqlStart(FUNCTIONNUMBER, FUNCTIONNAME, imodel_BIN_name)
    imodel_s_n = [imdel_s, FUNCTIONNUMBER[imodel_BIN_name]]
    itest_name = 'openssl_arm_o0'
    itest_BIN_name = 'openssl_arm_o0'
    itest = compareDict[itest_name]
    itest_s = self.GetSqlStart(FUNCTIONNUMBER, FUNCTIONNAME, itest_BIN_name)
    itest_s_n = [itest_s, FUNCTIONNUMBER[itest_BIN_name]]

    ######## pairwise comparison
    data = np.zeros((n1, 3500))
    test = np.zeros((n1, 3500))
    model_num = 0
    test_num = 0
    for j in range(n3):
        if svec[:, imodel, j].all() != 0:
            data[:, model_num] = svec[:, imodel, j]
            model_num = model_num + 1
        if svec[:, itest, j].all() != 0:
            test[:, test_num] = svec[:, itest, j]
            test_num = test_num + 1
    dataves = np.transpose(data)
    testves = np.transpose(test)
    # output_total = open(result_folder + 'result_total.txt', 'w')
    model = np.zeros((model_num, n1))
    lsh_model = LSHash(7, n1)
    for jj in range(model_num):
        lsh_model.index(dataves[jj, :])
        model[jj, :] = dataves[jj, :]
    test = np.zeros((test_num, n1))
    for ii in range(test_num):
        test[ii, :] = testves[ii, :]

    ##############################################################################
    itest_func_list = self.GetFuncListFromFeature(test, itest_s_n[0], itest_s_n[1])
    #imodel_func_list = self.GetFuncListFromFeature(model, imodel_s_n[0], imodel_s_n[1])
    print('target_list get success\n')
    # Inmodel_Total = self.GetInmodelTotal(imodel_func_list, itest_func_list)
    Inmodel_NUM = 0.0
    output = open(result_folder + 'BetweenTestRecored' + '.txt', 'a')
    # SelectDB = Date_Analysis()
    for queryi in range(test_num):
        key = 20
        test_funcname = itest_func_list[queryi]
        if test[queryi, :].all() != 0:
            Atemp = lsh_model.query(test[queryi, :], key, 'euclidean')
            for i in range(0, key):
                if i < len(Atemp):
                    try:
                        feature_str = str(Atemp[i]).split(')')[0].split('(')[2]
                        feature_list = feature_str.split(',')
                        feature_array = self.SelectDB.ListStr2ArrayFloat(feature_list)
                        temp = self.SelectDB.DataAccuray(feature_array)
                        str_data = temp.astype(str)
                        feature = "-".join(str_data)
                        rows = self.SelectDB.DatafromFeature(feature, imodel_s_n[0], imodel_s_n[1])
                        select_funcname = rows[0][1]
                        # containment check (str.find returns -1 when absent)
                        if test_funcname.find(select_funcname) != -1:
                            Inmodel_NUM = Inmodel_NUM + 1
                            print('Get One')
                            break
                    except Exception as e:
                        print(e)
                        print(str(Atemp[i]))
                else:
                    print('AtempLen:', len(Atemp), ' ', 'key:', key)
                    break
    res = str(float('%.4f' % (Inmodel_NUM / len(itest_func_list))))
    msg = itest_name + '----->' + imodel_name + \
        ' Res:' + res + ' Inmodel_NUM:' + str(Inmodel_NUM) + \
        ' Test_NUM:' + str(len(itest_func_list))
        # + ' Model_NUM:' + str(len(imodel_func_list)) + ' Inmodel_Total:' + str(Inmodel_Total)
    output.write(msg + '\n')
    print(msg)
    output.close()
img = Image.open(dataset + str(x) + fileext)
pixel = np.array(img)
#onedarray = pixel.ravel()
hist, bins = np.histogram(pixel.ravel(), 256, [0, 256])
listing = list(hist[0:255])
big_array.append(listing)
lsh.index(listing)

input_array = np.array(big_array)
img = Image.open(queryset + "10" + fileext)
pixel = np.array(img)
#onedarray = pixel.ravel()
hist, bins = np.histogram(pixel.ravel(), 256, [0, 256])
listing = list(hist[0:255])
k = lsh.query(listing, distance_func="l1norm")
vector = np.matrix(k)
length = len(k)
if length > 0:
    for output in range(length):
        if k[output][1] < 800:
            test = np.array(k[output][0])
            result = np.where((input_array == test).all(axis=1))
            image_number.append(result[0][0] + 1)
            #arr2 = np.asarray(k[output][0]).reshape(shape)
            #vector = np.matrix(np.uint8(arr2))
            #img2 = Image.fromarray(np.uint8(arr2),'RGB')
            #img2.show()
            #onedarray = pixel.ravel()
            #image_size.append[x]=onedarray
        # (tail of a lookup helper, likely back_to_string, truncated in this excerpt)
        if value[:INPUT_DIMENSION] == list(search_item):
            return key
    return None


# Getting all tweets from the twitter_search3 library
temp_db_collection = db.twitter_search3.find().limit(490)

# Converting string to ASCII and storing in store_ascii_tweets.
for data in temp_db_collection:
    lsh.index(conversion_to_get_ascii(data, True))
    store_ascii_tweets.append(data)

# Formation of Groups
for data in store_ascii_tweets:
    if data["id"] not in temp:
        output = lsh.query(conversion_to_get_ascii(data, False))
        temp.add(data["id"])
        if len(output) > 0:
            for value in output:
                loc = None
                tweetid = back_to_string(value[0])
                temp.add(tweetid)
                tweetobj = dictionary_group[tweetid]
                if tweetobj["user"]["geo_enabled"] == True:
                    loc = tweetobj["place"]["name"]
                    groups[loc] = len(output)
                    break
        if (data["user"]["geo_enabled"] == True and data["place"] is not None):
            place = data["place"]["name"]
            t = 1
def main(): parser = argparse.ArgumentParser(description = 'Tools for hamming distance-based image retrieval by cuda') parser.add_argument('-f', help = 'The filename of image raw features (SIFT).') parser.add_argument('-v', default = 'fvecs', help = 'The format of image raw features.') parser.add_argument('-s', default = 'dict', help = 'The method of indexing storage.') parser.add_argument('-d', default = '128', help = 'Dimensions of raw image feature.') parser.add_argument('-o', default = '0', help = 'Offset of accessing raw image features.') parser.add_argument('-n', default = '1', help = 'Number of raw image features to read.') parser.add_argument('-i', default = 'n', help = 'Whether to perform indexing step.') parser.add_argument('-e', help = 'The dirname of indexing folder.') parser.add_argument('-k', default = '10', help = 'Number of retrieved images.') parser.add_argument('-r', default = '32', help = 'Number of dimensions randomly sampled.') parser.add_argument('-c', default = 'n', help = 'Whether to perform compressing step.') parser.add_argument('-q', default = 'n', help = 'Whether to sequentially sampling.') parser.add_argument('-p', default = 'n', help = 'Whether to perform querying in compressed domain.') parser.add_argument('-g', default = 'y', help = 'GPU mode. default is "yes".') parser.add_argument('-l', default = 'n', help = 'VLQ base64 mode. Load VLQ base64 encoding compressed dict.') parser.add_argument('-b', default = '1', help = 'Expanding level of search buckets.') parser.add_argument('-t', default = 'int32', help = 'FastDict type (int32, int8, string).') parser.add_argument('-u', default = 'local', help = 'CUDA client type (local, net).') parser.add_argument('-host', default = 'localhost', help = 'CUDA server address.') args = parser.parse_args() d = int(args.d) nuse = int(args.n) off = int(args.o) random_dims = int(args.r) random_sampling = True if args.q == 'y': random_sampling = False lsh = LSHash(64, d, random_sampling, args.t, args.u, args.host, random_dims, 1, storage_config = args.s, matrices_filename = 'project_plane.npz') np_feature_vecs = load_features(args.f, args.v, nuse, d, lsh, args.e, off, args.i) if args.c == 'y': if args.e != None and args.s == 'random': lsh.load_index(args.e) print "compressing index..." lsh.compress_index(args.e) print "compressing done." else: print "Please specify generated indexing file." sys.exit(0) if args.c != 'y' and args.i != 'y' and args.e != None and args.s == 'random': if args.p == 'y': print "loading compressed index." lsh.load_compress_index(args.e, (args.l == 'y')) print "loading done." else: print "loading index." lsh.load_index(args.e) print "loading done." if args.p != 'y': retrived = lsh.query(np_feature_vecs[1], num_results = int(args.k), expand_level = int(args.b), distance_func = 'hamming') else: retrived = lsh.query_in_compressed_domain(np_feature_vecs[1], num_results = int(args.k), expand_level = int(args.b), distance_func = 'hamming', gpu_mode = args.g, vlq_mode = args.l) print retrived
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 04 15:29:56 2015

@author: MaGesh
"""
import numpy as np
from scipy.ndimage import imread
from lshash import LSHash

lsh = LSHash(20, 32 * 32)  # hash size 20 over 32*32-dimensional inputs
resultSet = []
for i in range(1, 100001):
    print i
    X = "F:\\Fall 2015\\Data Mining\\Programming Assignments\\PA5\\data\\dataset\\" + str(i) + ".bmp"
    im = imread(X, flatten=True)
    single_array = im.flatten()
    lsh.index(single_array)  # hashing each image into its bucket
for i in range(1, 11):
    print i, "for querying"
    X1 = "F:\\Fall 2015\\Data Mining\\Programming Assignments\\PA5\\data\\Query\\" + str(i) + ".bmp"
    imQ = imread(X1, flatten=True)  # converting to grey scale
    imFlatten = imQ.flatten()
    value = lsh.query(imFlatten, distance_func="euclidean")  # querying the nearest points
    resultSet.append(value)
for u in range(6):
    lsh = LSHash(int(hash_size[u]), dimension)
    for i in range(1000):
        lsh.index(metric[i])
    for v in trade:
        f.write("Hash_size: ")
        f.write(str(int(hash_size[u])))
        f.write('\n')
        f.write("As for Tid ")
        f.write(str(v))
        f.write('\n')
        f.write('--------------------------------')
        f.write('\n')
        row = lsh.query(metric[v - 1], distance_func="hamming")
        for g in row:
            f.write("Index ")
            f.write(str(1 + (findByRow(metric, list(g[0])))[0]))
            f.write(" Distance is :")
            f.write(str(g[1]))
            f.write('\n')
        f.write("Distance Func Hamming")
        f.write('\n')
        f.write('\n')
        f.write('############################')
        f.write('\n')

for u in range(6):
class LshManager(object):
    def __init__(self):
        self.lshIndexList = []  # create a list of lsh indexes
        self.lsh = LSHash(NUMBER_OF_BITS_PER_HASH, NUM_TOPICS,
                          num_hashtables=NUMBER_OF_LSH_INDEXES,
                          storage_config={"redis": {"host": "localhost", "port": 6379}})

    def clearIndex(self):
        redis.Redis().flushall()

    # adds a document to all lsh indexes
    def addDocument(self, document):
        lsa_vector = document.vectors["LSA"]
        dense_vector = self._sparseToDenseConverter(lsa_vector)
        if not hasattr(document, "timestamp"):
            document.timestamp = str(datetime.datetime.now())
        extra = json.dumps(str(document._id))

        # detect duplicates
        #result = self.lsh.query(dense_vector, num_results=1, distance_func="cosine")
        #if result:
        #    nearest = result[0]
        #    if nearest[1] > DUPLICATE_SIMILARITY_THRESHOLD:
        #        extra = ast.literal_eval(ast.literal_eval(nearest[0])[1])
        #        doctitle = getDatabaseConnection().holist.articles.find({"_id": extra}).next()["title"]
        #        ln.warn("Detected duplicate for %s (ID %s): %s.", document.title, document._id, extra)
        #        return

        self.lsh.index(dense_vector, extra_data=extra)  # extra MUST be hashable

    # takes a document and returns database ids of similar documents
    # uses cosine function to determine similarity
    def getSimilarDocuments(self, document, num_docs=7):
        if isinstance(document, Document):
            lsa_vector = document.vectors["LSA"]
        else:
            lsa_vector = document
        dense_vector = self._sparseToDenseConverter(lsa_vector)
        client = getDatabaseConnection()
        resultSet = set()
        results = []
        for result in self.lsh.query(dense_vector, num_results=num_docs, distance_func="cosine"):
            # example:
            # [
            #     (((1, 2, 3), "{'extra1':'data'}"), 0),
            #     (((1, 1, 3), "{'extra':'data'}"), 1)
            # ]
            extra = ast.literal_eval(ast.literal_eval(result[0])[1])
            clientDoc = bsonToClientBson(client.holist.articles.find({"_id": extra}).next())
            clientDoc['lsa'] = self._sparseToDenseConverter(clientDoc['lsa'])
            jsonstr = json.dumps(clientDoc)
            if not jsonstr in resultSet:
                resultSet.add(jsonstr)
                results.append(clientDoc)
        ln.debug("retrieved %s documents.", len(results))
        return results

    # converts a vector in sparse format to a vector in dense format
    def _sparseToDenseConverter(self, sparseVector):
        dense = {}
        for x in range(NUM_TOPICS):
            dense[x] = 0
        for dim, val in sparseVector:
            dense[dim] = val
        return [value for key, value in dense.items()]
    # (tail of the to_img helper used below, truncated in this excerpt)
    return np.array(data).reshape(20, 20).T


limit = 10
data = loadmat('ex3data1.mat')
X, y = data['X'], data['y']
dim = X.shape[1]
hash_size = int(np.ceil(np.log2(len(y))))
print 'hash_size:', hash_size
lsh = LSHash(hash_size, dim)

# prepare
for x in X:
    lsh.index(x)

# test
for _ in range(10):
    i = np.random.randint(0, y.shape[0])
    res = lsh.query(X[i], distance_func='hamming')
    n = len(res)
    fig = plt.figure()
    fig.suptitle('y=%d, found: %d' % (y[i][0] % 10, n))
    n = n > limit and limit or n
    ax = fig.add_subplot(2, n, 1)
    ax.set_axis_off()
    ax.imshow(to_img(X[i]))
    ax.set_title('original img')
    for k, j in enumerate(res[:n]):
        ax = fig.add_subplot(2, n, k + 1 + n)
        ax.set_axis_off()
        ax.imshow(to_img(j[0]))
        ax.set_title('distance: %.2f' % j[-1])
    plt.show()
from lshash import LSHash

lsh = LSHash(hash_size=6, input_dim=8, num_hashtables=1,
             storage_config={"lmdb": {'path': '/Users/christianburger/Downloads/testlmdb'}})
lsh.index([1, 2, 3, 4, 5, 6, 7, 8], 'a')
lsh.index([2, 3, 4, 5, 6, 7, 8, 9], 'b')
lsh.index([10, 12, 99, 1, 5, 31, 2, 3], 'c')
print lsh.query([1, 2, 3, 4, 5, 6, 7, 7])
dim = len(Index) + 1  # -1 for excluded
data = xlrd.open_workbook(fname)
sht = data.sheet_by_name(shtname)
head = sht.row_values(0)
tweets = sht.col_values(head.index(target), start_rowx)
hash_size = int(np.ceil(np.log2(len(tweets))))
print 'hash_size: %d, dim: %d' % (hash_size, dim)
lsh = LSHash(hash_size, dim)
for tweet in tweets:
    x = spar.csr_matrix((1, dim), dtype=np.int8)
    # x = np.zeros(dim, np.bool8)
    ws = jieba.cut(tweet)
    try:
        for w in ws:
            x[Index.get(w, -1)] = 1
        lsh.index(x)
    except Exception, e:
        print e
        print tweet

sent = True
while sent:
    sent = raw_input('input sentence...\n')
    res = lsh.query(sent, distance_func='hamming')
    for i in res:
        print i[0], i[-1]
from lshash import LSHash

"""
Use locality-sensitive hashing to look up approximate nearest neighbours.
"""
lsh = LSHash(6, 8)
lsh.index([1, 2, 3, 4, 5, 6, 7, 8])
lsh.index([2, 3, 4, 5, 6, 7, 8, 9])
lsh.index([10, 12, 99, 1, 5, 31, 2, 3])
lsh.query([1, 2, 3, 4, 5, 6, 7, 7])
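# For reference, the shape of query() results (consistent with the tests
# elsewhere in this collection): without extra_data each hit is
# (stored_point_tuple, distance); with extra_data it is
# ((stored_point_tuple, extra_data), distance). Hits only come from buckets
# the probe hashes into, so the list can be empty.
for point, distance in lsh.query([1, 2, 3, 4, 5, 6, 7, 7]):
    print(point, distance)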
def main():
    parser = argparse.ArgumentParser(
        description='Tools for hamming distance-based image retrieval by cuda')
    parser.add_argument('-f', help='The filename of image raw features (SIFT).')
    parser.add_argument('-v', default='fvecs', help='The format of image raw features.')
    parser.add_argument('-s', default='dict', help='The method of indexing storage.')
    parser.add_argument('-d', default='128', help='Dimensions of raw image feature.')
    parser.add_argument('-o', default='0', help='Offset of accessing raw image features.')
    parser.add_argument('-n', default='1', help='Number of raw image features to read.')
    parser.add_argument('-i', default='n', help='Whether to perform indexing step.')
    parser.add_argument('-e', help='The dirname of indexing folder.')
    parser.add_argument('-k', default='10', help='Number of retrieved images.')
    parser.add_argument('-r', default='32', help='Number of dimensions randomly sampled.')
    parser.add_argument('-c', default='n', help='Whether to perform compressing step.')
    parser.add_argument('-q', default='n', help='Whether to sample sequentially.')
    parser.add_argument('-p', default='n', help='Whether to perform querying in compressed domain.')
    parser.add_argument('-g', default='y', help='GPU mode. default is "yes".')
    parser.add_argument('-l', default='n', help='VLQ base64 mode. Load VLQ base64 encoding compressed dict.')
    parser.add_argument('-b', default='1', help='Expanding level of search buckets.')
    parser.add_argument('-t', default='int32', help='FastDict type (int32, int8, string).')
    parser.add_argument('-u', default='local', help='CUDA client type (local, net).')
    parser.add_argument('-host', default='localhost', help='CUDA server address.')
    args = parser.parse_args()

    d = int(args.d)
    nuse = int(args.n)
    off = int(args.o)
    random_dims = int(args.r)
    random_sampling = True
    if args.q == 'y':
        random_sampling = False

    lsh = LSHash(64, d, random_sampling, args.t, args.u, args.host,
                 random_dims, 1, storage_config=args.s,
                 matrices_filename='project_plane.npz')
    np_feature_vecs = load_features(args.f, args.v, nuse, d, lsh, args.e, off, args.i)

    if args.c == 'y':
        if args.e != None and args.s == 'random':
            lsh.load_index(args.e)
            print "compressing index..."
            lsh.compress_index(args.e)
            print "compressing done."
        else:
            print "Please specify generated indexing file."
            sys.exit(0)

    if args.c != 'y' and args.i != 'y' and args.e != None and args.s == 'random':
        if args.p == 'y':
            print "loading compressed index."
            lsh.load_compress_index(args.e, (args.l == 'y'))
            print "loading done."
        else:
            print "loading index."
            lsh.load_index(args.e)
            print "loading done."
        if args.p != 'y':
            retrieved = lsh.query(np_feature_vecs[1], num_results=int(args.k),
                                  expand_level=int(args.b), distance_func='hamming')
        else:
            retrieved = lsh.query_in_compressed_domain(np_feature_vecs[1],
                                                       num_results=int(args.k),
                                                       expand_level=int(args.b),
                                                       distance_func='hamming',
                                                       gpu_mode=args.g,
                                                       vlq_mode=args.l)
        print retrieved
def Mainfunc(self, mat_addr, base, result_folder):
    # all binary_func_name entries in the base data
    Total_binary_func = []  # binary:function#
    SelectDB = Date_Analysis()
    # np.set_printoptions(suppress=True, precision=6, threshold=8)
    s = sio.loadmat(mat_addr)
    svec = s['X']
    datalen = len(svec)
    n1, n2, n3 = np.shape(svec)
    #test_dict = {'core':[0,12],'curl':[48,60],'libgmp':[60,72],'busybox':[72,84],'openssl':[84,96],'sqlite':[96,108]}
    test_dict = {
        'busybox': [0, 12], 'core': [12, 60], 'curl': [60, 72],
        'libgmp': [72, 84], 'openssl': [84, 96], 'sqlite': [96, 108]
    }
    compareDict = {
        'core_arm_o0': 4, 'core_arm_o1': 5, 'core_arm_o2': 6, 'core_arm_o3': 7,
        'curl_arm_o0': 52, 'curl_arm_o1': 53, 'curl_arm_o2': 54, 'curl_arm_o3': 55,
        'libgmp.so.10.3.2_arm_O0': 64, 'libgmp.so.10.3.2_arm_O1': 65,
        'libgmp.so.10.3.2_arm_O2': 66, 'libgmp.so.10.3.2_arm_O3': 67,
        'busybox_arm_o0': 73, 'busybox_arm_o1': 74, 'busybox_arm_o2': 75, 'busybox_arm_o3': 76,
        'openssl_arm_o0': 84, 'openssl_arm_o1': 85, 'openssl_arm_o2': 86, 'openssl_arm_o3': 87,
        'sqlite_arm_o0': 96, 'sqlite_arm_o1': 97, 'sqlite_arm_o2': 98, 'sqlite_arm_o3': 99,
        'core_x86_o0': 0, 'core_x86_o1': 1, 'core_x86_o2': 2, 'core_x86_o3': 3,
        'curl_x86_o0': 48, 'curl_x86_o1': 49, 'curl_x86_o2': 50, 'curl_x86_o3': 51,
        'libgmp.so.10.3.2_x86_O0': 60, 'libgmp.so.10.3.2_x86_O1': 61,
        'libgmp.so.10.3.2_x86_O2': 62, 'libgmp.so.10.3.2_x86_O3': 63,
        'busybox_x86_o0': 80, 'busybox_x86_o1': 81, 'busybox_x86_o2': 82, 'busybox_x86_o3': 83,
        'openssl_x86_o0': 92, 'openssl_x86_o1': 93, 'openssl_x86_o2': 94, 'openssl_x86_o3': 95,
        'sqlite_x86_o0': 104, 'sqlite_x86_o1': 105, 'sqlite_x86_o2': 106, 'sqlite_x86_o3': 107,
    }
    # FUNCTIONNUMBER = {
    #     'coreutils_dir_X86_O0': 290, 'coreutils_dir_X86_O1': 239, 'coreutils_dir_X86_O2': 291, 'coreutils_dir_X86_O3': 255,
    #     'coreutils_dir_arm_O0': 451, 'coreutils_dir_arm_O1': 368, 'coreutils_dir_arm_O3': 334, 'coreutilsr_mips_O0': 306,
    #     'coreutils_dir_mips_O1': 247, 'coreutils_dir_mips_O2': 242, 'coreutils_dir_mips_O3': 244,
    #     'coreutils_du_X86_O0': 237, 'coreutils_du_X86_O1': 182, 'coreutils_du_X86_O2': 211, 'coreutils_du_X86_O3': 176,
    #     'coreutils_du_arm_O0': 529, 'coreutils_du_arm_O1': 393, 'coreutils_du_arm_O2': 387, 'coreutils_du_arm_O3': 329,
    #     'coreutils_du_mips_O0': 401, 'coreutils_du_mips_O1': 288, 'coreutils_du_mips_O2': 273, 'coreutils_du_mips_O3': 248,
    #     'coreutils_ls_X86_O0': 290, 'coreutils_ls_X86_O1': 239, 'coreutils_ls_X86_O2': 291, 'coreutils_ls_X86_O3': 255,
    #     'coreutils_ls_arm_O0': 451, 'coreutils_ls_arm_O1': 368, 'coreutils_ls_arm_O2': 377, 'coreutils_ls_arm_O3': 334,
    #     'coreutils_ls_mips_O0': 306, 'coreutils_ls_mips_O1': 247, 'coreutils_ls_mips_O2': 242, 'coreutils_ls_mips_O3': 244,
    #     'coreutils_vdir_X86_O0': 290, 'coreutils_vdir_X86_O1': 239, 'coreutils_vdir_X86_O2': 291, 'coreutils_vdir_X86_O3': 255,
    #     'coreutils_vdir_arm_O0': 451, 'coreutils_vdir_arm_O1': 368, 'coreutils_vdir_arm_O2': 377, 'coreutils_vdir_arm_O3': 334,
    #     'coreutils_vdir_mips_O0': 306, 'coreutils_vdir_mips_O1': 247, 'coreutils_vdir_mips_O2': 242, 'coreutils_vdir_mips_O3': 244,
    #     'curl_X86_O0': 128, 'curl_X86_O1': 102, 'curl_X86_O2': 152, 'curl_X86_O3': 134,
    #     'curl_arm_O0': 263, 'curl_arm_O1': 223, 'curl_arm_O2': 213, 'curl_arm_O3': 209,
    #     'curl_mips_O0': 130, 'curl_mips_O1': 107, 'curl_mips_O2': 169, 'curl_mips_O3': 186,
    #     'libgmp.so.10.3.2_X86_O0': 621, 'libgmp.so.10.3.2_X86__O1': 568, 'libgmp.so.10.3.2_X86__O2': 591, 'libgmp.so.10.3.2_X86__O3': 571,
    #     'libgmp.so.10.3.2_arm_O0': 971, 'libgmp.so.10.3.2_arm_O1': 876, 'libgmp.so.10.3.2_arm_O2': 854, 'libgmp.so.10.3.2_arm_O3': 844,
    #     'libgmp.so.10.3.2_mips_O0': 606, 'libgmp.so.10.3.2_mips_O1': 551, 'libgmp.so.10.3.2_mips_O2': 545, 'libgmp.so.10.3.2_mipsO3': 544,
    #     'busybox_arm_o0': 3216, 'busybox_arm_o1': 2128, 'busybox_arm_o2': 2099, 'busybox_arm_o3': 1730,
    #     'busybox_mips_o0': 2900, 'busybox_mips_o1': 2243, 'busybox_mips_o2': 1726, 'busybox_mips_o3': 1381,
    #     'busybox_x86_o0': 3196, 'busybox_x86_o1': 2390, 'busybox_x86_o2': 2542, 'busybox_x86_o3': 2045,
    #     'openssl_arm_o0': 1778, 'openssl_arm_o1': 1692, 'openssl_arm_o2': 1675, 'openssl_arm_o3': 1658,
    #     'openssl_mips_o0': 414, 'openssl_mips_o1': 333, 'openssl_mips_o2': 333, 'openssl_mips_o3': 324,
    #     'openssl_x86_o0': 414, 'openssl_x86_o1': 322, 'openssl_x86_o2': 350, 'openssl_x86_o3': 333,
    #     'sqlite_arm_o0': 2876, 'sqlite_arm_o1': 2058, 'sqlite_arm_o2': 1972, 'sqlite_arm_o3': 1805,
    #     'sqlite_mips_o0': 2701, 'sqlite_mips_o1': 1936, 'sqlite_mips_o2': 1830, 'sqlite_mips_o3': 1705,
    #     'sqlite_x86_o0': 2693, 'sqlite_x86_o1': 1931, 'sqlite_x86_o2': 1967, 'sqlite_x86_o3': 1772,
    # }

    data = np.zeros((n1, 30000))
    test = np.zeros((n1, 3500))
    m = 0
    imodel = compareDict['core_arm_o0']
    itest = compareDict['core_arm_o3']
    for i in range(test_dict['core'][0], test_dict['core'][1]):
        for j in range(n3):
            if svec[:, i, j].all() != 0:
                data[:, m] = svec[:, i, j]
                m = m + 1
    dataves = np.transpose(data)
    #testves = np.transpose(test)

    ######## pairwise comparison
    # for j in range(n3):
    #     if svec[:, imodel, j].all() != 0:
    #         data[:, m] = svec[:, imodel, j]
    #         m = m + 1
    #     if svec[:, itest, j].all() != 0:
    #         test[:, mm] = svec[:, itest, j]
    #         mm = mm + 1
    #dataves = np.transpose(data)
    #testves = np.transpose(test)

    # modelindex = list(set(np.random.randint(0, m, size=10000)))
    # output_total = open(result_folder + 'result_total.txt', 'w')
    lsh_model = LSHash(7, n1)
    for jj in range(m):
        # for jj in range(87212):
        lsh_model.index(dataves[jj, :])
    testindex = list(set(np.random.randint(0, m, size=base)))  # SIZE IS THE NUMBER OF TEST FUNCTIONS
    test = np.zeros((len(testindex), n1))
    for i in range(len(testindex)):
        test[i, :] = dataves[testindex[i], :]
    # output = open(result_folder + 'result_key' + str(key) + '_base' + str(base) + '.txt', 'w')
    # testindex = mm

    ##############################################################################
    timee = open(result_folder + 'coreutils_time.txt', 'a')
    target_list = []
    M_list = []
    for queryi in range(len(testindex)):
        target = test[queryi, :]
        temp_target = SelectDB.DataAccuray(target)
        str_target = temp_target.astype(str)
        feature_target = "-".join(str_target)
        rows = SelectDB.DatafromFeature(feature_target)
        target_data = self.Row2Str(rows)
        target_list.append(target_data)
        Global_M = self.GetGlobalM(rows[0][1])
        M_list.append(Global_M)
    print('Global_M get success\n')
    Totaltime = 0.0
    # SelectDB = Date_Analysis()
    for queryi in range(len(testindex)):
        flag_over = 0
        keylist = [i for i in range(1, 10001, 5)]
        #keylist = [5]
        target_data = target_list[queryi].split('#')[0]
        output = open(result_folder + 'coreutils_result_base' + str(base) +
                      '_No' + str(queryi) + '.txt', 'w')
        output.write('Target:' + target_data + '\n')
        print(target_data + '\n')
        for key in keylist:
            if flag_over == 0:
                msg = 'Key:' + str(key) + ' Base:' + str(base) + \
                    ' No:' + str(queryi) + ' M:' + str(M_list[queryi])
                print(msg + '\n')
                output.write(msg + '\n')
                if test[queryi, :].all() != 0:
                    starttime = time.time()
                    Atemp = lsh_model.query(test[queryi, :], key, 'euclidean')
                    endtime = time.time()
                    Totaltime = Totaltime + endtime - starttime
                    for i in range(0, key):
                        if i < len(Atemp):
                            try:
                                flag_over = 0
                                feature_str = str(Atemp[i]).split(')')[0].split('(')[2]
                                feature_list = feature_str.split(',')
                                feature_array = SelectDB.ListStr2ArrayFloat(feature_list)
                                temp = SelectDB.DataAccuray(feature_array)
                                str_data = temp.astype(str)
                                feature = "-".join(str_data)
                                rows = SelectDB.DatafromFeature(feature)
                                select_data = self.Row2Str(rows)
                            except Exception as e:
                                print(e)
                                print(str(Atemp[i]))
                                select_data = 'null:null#'
                        else:
                            print('AtempLen:', len(Atemp), ' ', 'key:', key, '\n')
                            select_data = 'null:null#'
                            flag_over = 1
                        output.write(select_data + '\n')
                        print(select_data + '\n')
            else:
                break
        msg = 'Key:' + str(key) + ' Base:' + str(base) + \
            ' No:' + str(queryi) + ' Time:' + str(float(Totaltime / base)) + '\n'
        timee.write(msg)
        print(msg)
        output.close()
    timee.close()
for note, name in note_from_midi(mid18):
    lsh.index(note, extra_data=(name, 0.8))
for note, name in note_from_midi(mid17):
    lsh.index(note, extra_data=(name, 0.8))
for note, name in note_from_midi(mid19):
    lsh.index(note, extra_data=(name, 0.8))
for note, name in note_from_midi(mid20):
    lsh.index(note, extra_data=(name, 0.8))

kk = []
i = 0
result = {}
for note, name in nlsh('xml.wav'):
    q = note
    kk.extend(q)
    r = lsh.query(q)
    print '--------' + str(i) + '-----------'
    i += 1
    if len(r) > 0:
        print len(r)
        # keep 3 candidates
        nn = min(3, len(r))
        # let's vote (weighted by distance rank)
        for k in range(nn):
            w = r[k][1]
            name = r[k][0][1][0]
            if not result.has_key(name):
                result[name] = 0.0
            else:
                w *= 0.93
            result[name] += w