def lsh(p_hash_size, distance_func):
    """Simulate KNN with locality-sensitive hashing for a random vipno.

    :param p_hash_size: multiplied by the deduplicated vipno count to form the
        final hash_size
    :param distance_func: distance function used by the LSH query
        (the docstring previously said ``distance_funcs`` -- the parameter is
        singular)
    :return: (5 nearest vipnos excluding the query itself, the queried vipno)
    """
    datas_set, datas_matrix = get_data()
    # Number of distinct vipnos (columns of the matrix).
    vipno_nums = len(datas_matrix[0])
    # Pick one vipno at random (its column index).
    random_vipno = random.randint(0, vipno_nums - 1)
    # Renamed from `lsh` so the local no longer shadows this function's name.
    hasher = LSHash(int(vipno_nums * p_hash_size), len(datas_matrix[:, 0]))
    for i in range(vipno_nums):
        # extra_data carries the column's vipno so query hits map back to vipnos.
        hasher.index(datas_matrix[:, i], extra_data=datas_set.columns[i])
    vipno_res = []
    # num_results=6: the first hit is the query column itself, keep 5 neighbours.
    for res in hasher.query(datas_matrix[:, random_vipno], num_results=6,
                            distance_func=distance_func):
        vipno_res.append(res[0][1])
    print("distance func:", distance_func)
    print("knn output(from 1 to 5): {}".format(vipno_res[1:]))
    return vipno_res[1:], datas_set.columns[random_vipno]
def build_index():
    """Build an LSHash index over every stored image hash, keyed by image id."""
    index = LSHash(32, HASH_SIZE ** 2)
    rows = session.query(models.Image.hash, models.Image.id).all()
    for hex_digest, image_id in rows:
        # Rehydrate the perceptual hash from its hex form and index its bytes.
        flat_bits = hash.hex_to_hash(hex_digest).hash.flatten()
        index.index(flat_bits.view(np.uint8), extra_data=image_id)
    return index
def constructfeature(self, hash_size, input_dim, num_hashtables):
    """Extract deep features for all images and index them with LSH.

    :param hash_size: length of the binary hash produced by LSHash
    :param input_dim: dimensionality of the indexed feature vectors
    :param num_hashtables: number of hash tables for multiple lookups
    :return: (feature_dict mapping image path -> feature vector, LSHash index)
    """
    print(">> Loading network:\n>>>> '{}'".format(self.network))
    state = torch.load(self.network)

    # Parse net params from meta: architecture, pooling, mean, std are
    # required; the rest fall back to defaults when absent.
    net_params = {
        'architecture': state['meta']['architecture'],
        'pooling': state['meta']['pooling'],
        'local_whitening': state['meta'].get('local_whitening', False),
        'regional': state['meta'].get('regional', False),
        'whitening': state['meta'].get('whitening', False),
        'mean': state['meta']['mean'],
        'std': state['meta']['std'],
        'pretrained': False,
    }

    # Network initialization.
    net = init_network(net_params)
    net.load_state_dict(state['state_dict'])
    print(">>>> loaded network: ")
    print(net.meta_repr())

    # Multi-scale evaluation scales; previously built via eval('[1]') --
    # a plain literal gives the same value without eval().
    ms = [1]
    print(">>>> Evaluating scales: {}".format(ms))

    # Move network to GPU when available; always switch to eval mode.
    if torch.cuda.is_available():
        net.cuda()
    net.eval()

    # Set up the image transform from the network's own normalization stats.
    normalize = transforms.Normalize(mean=net.meta['mean'], std=net.meta['std'])
    transform = transforms.Compose([transforms.ToTensor(), normalize])

    # Extract database vectors.
    print('>> database images...')
    images = ImageProcess(self.img_dir).process()
    vecs, img_paths = extract_vectors(net, images, 1024, transform, ms=ms)
    feature_dict = dict(zip(img_paths, list(vecs.detach().cpu().numpy().T)))

    # Index every feature vector, keyed by its image path.
    lsh = LSHash(hash_size=int(hash_size), input_dim=int(input_dim),
                 num_hashtables=int(num_hashtables))
    for img_path, vec in feature_dict.items():
        lsh.index(vec.flatten(), extra_data=img_path)

    print("extract feature is done")
    return feature_dict, lsh
def getLSHashOutput(filename, hash_size, k):
    """Index every column of the data matrix with LSH, then print the k+1
    nearest columns (the query column itself included) for a random column."""
    matrix = getMatrix(filename)
    n_cols = len(matrix.iloc[0])
    n_rows = len(matrix.iloc[:, 0])
    table = LSHash(hash_size=int(hash_size * n_cols), input_dim=n_rows)
    for col in range(n_cols):
        table.index(input_point=matrix.iloc[:, col],
                    extra_data=matrix.columns[col])
    picked = rand.randint(0, n_cols - 1)
    # Several lshash distance functions exist; 'euclidean' is the default.
    print(table.query(query_point=matrix.iloc[:, picked],
                      num_results=k + 1,
                      distance_func='euclidean'))
def sphere():
    """Index 1000 random unit vectors in 3-D and sanity-check a cosine query."""
    points = np.random.normal(size=(1000, 3))
    table = LSHash(10, 3, num_hashtables=5)
    for p in points:
        # Project onto the unit sphere (mutates `points` in place), then index.
        p /= np.linalg.norm(p)
        table.index(p)
    # Query a slightly perturbed copy of the first point.
    perturbed = points[0] + np.array([-0.001, 0.001, -0.001])
    closest = table.query(perturbed, distance_func="cosine")
    assert len(closest) >= 10
    assert 0.05 >= closest[9][-1] > 0.0003
def build_index(self, k: int, hash_size: int = 10, num_hashtables: int = 1,
                store_file: str = None, overwrite: bool = False):
    """Build an LSH index over all pictures.

    K-means first finds ``k`` key features among the previously extracted
    features; each image is then summarised by the histogram of its feature
    assignments (its fingerprint), and the histograms are indexed with
    LSHash (locality-sensitive hashing).

    Args:
        k: number of K-means centroids (key features).
        hash_size: length of the resulting binary hash array.
        num_hashtables: number of hash tables for multiple lookups.
        store_file: path to the .npz file where the random matrices are
            stored, or will be stored if the file does not exist yet.
        overwrite: whether to overwrite the matrices file if it exists.

    Returns:
    """
    assert 0 < k < len(self._all_feats)
    assert hash_size > 0 and num_hashtables > 0

    # Mini-batch K-means copes better with large amounts of data.
    logger.info('Calculating {} key featurs...'.format(k))
    self._kmeans = MiniBatchKMeans(n_clusters=k)
    self._kmeans.fit(np.array(self._all_feats))
    assignments = self._kmeans.labels_

    logger.info('Start indexing each image.')
    self._lsh = LSHash(hash_size=hash_size, input_dim=k,
                       num_hashtables=num_hashtables,
                       matrices_filename=store_file, overwrite=overwrite)

    success = 0
    progress = tqdm(total=len(self._img_dict))
    edges = np.arange(-0.5, k + 0.5, 1)  # one bin per centroid id
    for img_name, meta in self._img_dict.items():
        try:
            lo = meta['start_index']
            hi = lo + meta['num_feats']
            # Histogram of this image's feature-to-centroid assignments.
            hist, _ = np.histogram(assignments[lo:hi], bins=edges)
            meta['histogram'] = hist
            # Store the fingerprint in the hash tables, keyed by image name.
            self._lsh.index(input_point=hist, extra_data=img_name)
            success += 1
        except Exception as e:
            logger.warning(e)
            logger.warning('Error when indexing image: {}'.format(img_name))
        progress.update(1)
    progress.close()
    logger.info('Successfully indexed {} images.'.format(success))
def hyperspheres(X=16, num_samples=200000):
    """ Demonstrate curse of dimensionality and where LSH starts to fail

    Returns:
      lsh, X, secondclosest, tenthclosest

    >>> import pandas as pd
    >>> lsh, vectors, dfs = hyperspheres(16)
    >>> for df in dfs:
    ...     print(df)
    """
    # An int X is the dimensionality -> draw uniform random samples.
    X = np.random.uniform(size=(num_samples, X)) if isinstance(X, int) else X
    closest = []
    secondclosest = []
    tenthclosest = []
    hundredthclosest = []
    for D in range(2, X.shape[1] + 1):
        # Shrink the hash as D grows, but add D more tables per dimension.
        lsh = LSHash(int(64 / D) + D, D, num_hashtables=D)
        # query vector
        q = np.random.uniform(size=(D,))
        q /= np.linalg.norm(q)
        distances = []
        for x in X[:, :D]:
            # NOTE: in-place normalisation mutates X itself; rows are indexed
            # only after being normalised.
            x /= np.linalg.norm(x)
            distances += [1. - np.sum(x * q)]  # cosine similarity
            lsh.index(x)
        # Brute-force ground truth for comparison with the LSH result.
        distances = sorted(distances)
        print(distances[:10])
        closest10 = lsh.query(q, distance_func='cosine')
        N = len(closest10)
        # Each record: [dimension, hits, LSH distance at rank r, true distance
        # at rank r]; distance 2. is a sentinel for "no hits at all".
        hundredthclosest += [[D, N, closest10[min(99, N - 1)][-1] if N else 2., distances[min(99, N - 1)]]]
        tenthclosest += [[D, N, closest10[min(9, N - 1)][-1] if N else 2., distances[min(9, N - 1)]]]
        secondclosest += [[D, N, closest10[min(1, N - 1)][-1] if N else 2., distances[min(1, N - 1)]]]
        closest += [[D, N, closest10[0][-1] if N else 2., distances[0]]]
        print("is correct: 100th 10th 2nd 1st")
        print(round(hundredthclosest[-1][-1], 14) == round(hundredthclosest[-1][-2], 14))
        print(round(tenthclosest[-1][-1], 14) == round(tenthclosest[-1][-2], 14))
        print(round(secondclosest[-1][-1], 14) == round(secondclosest[-1][-2], 14))
        print(round(closest[-1][-1], 14) == round(closest[-1][-2], 14))
        print("distances: 100th 10th 2nd 1st")
        print(hundredthclosest[-1])
        print(tenthclosest[-1])
        print(secondclosest[-1])
        print(closest[-1])
    # Summarise each rank as a DataFrame and flag where LSH agrees with the
    # brute-force ground truth.
    dfs = []
    for k, (i, df) in enumerate(zip([100, 10, 2, 1], [hundredthclosest, tenthclosest, secondclosest, closest])):
        df = pd.DataFrame(df, columns='D N dist{} true_dist{}'.format(i, i).split()).round(14)
        df['correct{}'.format(i)] = df['dist{}'.format(i)] == df['true_dist{}'.format(i)]
        dfs += [df]
    # for i, tc in enumerate(tenthclosest):
    #     assert 1e-9 < tc[-2] or 1e-6 < 0.2
    return lsh, X, dfs
def b(r, dim, vector):
    """Time a 10-NN cosine query against an LSHash index built from `xxx`.

    Returns (similarity of the best hit, elapsed query seconds); similarity
    is -2 when the query returns nothing.
    NOTE(review): `xxx` is a free name expected at module level -- confirm.
    """
    table = LSHash(r, dim)
    for _, vec in xxx:
        table.index(vec.tolist())
    t0 = time.perf_counter()
    raw = table.query(vector.tolist(), 10, 'cosine')
    t1 = time.perf_counter()
    # Convert cosine distance to similarity (1 - distance).
    scored = [(item, 1 - dist) for item, dist in raw]
    if scored:
        return scored[0][1], t1 - t0
    return -2, t1 - t0
def save_embedding_hash(hash_params, save_path, img_names, features_dict):
    """Build an LSH index over all image feature vectors and pickle it.

    :param hash_params: dict with 'hash_size' (hash length), 'num_tables'
        (number of hash tables) and 'dim' (feature-vector dimension)
    :param save_path: destination path for the pickled index
    :param img_names: unused here; kept for interface compatibility
    :param features_dict: mapping of image path -> feature vector
    """
    # Locality Sensitive Hashing parameters.
    k = hash_params['hash_size']   # hash size
    L = hash_params['num_tables']  # number of tables
    d = hash_params['dim']         # dimension of the feature vector

    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)

    # LSH on all the images.
    for img_path, vec in tqdm(features_dict.items()):
        lsh.index(vec.flatten(), extra_data=img_path)

    # Export as pickle; `with` closes the handle the original version leaked.
    with open(save_path, "wb") as fh:
        pickle.dump(lsh, fh)
def lsh(p_hash_size, distance_funcs):
    """KNN via locality-sensitive hashing, benchmarked per distance function.

    :param p_hash_size: multiplied by the deduplicated vipno count to form the
        final hash_size
    :param distance_funcs: distance functions to compare; elapsed query times
        are plotted as a bar chart
    """
    datas_set, datas_matrix = get_data()
    vipno_nums = len(datas_matrix[0])                  # distinct vipno count
    random_vipno = random.randint(0, vipno_nums - 1)   # random query column
    hasher = LSHash(int(vipno_nums * p_hash_size), len(datas_matrix[:, 0]))
    for col in range(vipno_nums):
        # extra_data carries the vipno so query hits map back to vipnos.
        hasher.index(datas_matrix[:, col], extra_data=datas_set.columns[col])
    print("hash size: {}".format(vipno_nums * p_hash_size))
    print("input vipno: {}".format(datas_set.columns[random_vipno]))
    ends = []
    for distance_func in distance_funcs:
        start = datetime.datetime.now()
        # num_results=6: first hit is the query column itself, keep 5.
        vipno_res = [res[0][1]
                     for res in hasher.query(datas_matrix[:, random_vipno],
                                             num_results=6,
                                             distance_func=distance_func)]
        end = (datetime.datetime.now() - start).total_seconds()
        ends.append(end)
        print("distance func:", distance_func)
        print("knn output(from 1 to 5): {}".format(vipno_res[1:]))
        print("time:", end)
    # Bar chart comparing elapsed query time per distance function.
    plt.bar(distance_funcs, ends, alpha=0.9, width=0.35,
            facecolor='lightskyblue', edgecolor='white', label='time', lw=1)
    plt.legend(loc="upper left")
    plt.show()
def hyperspheres_10D(X=None):
    """ Demonstrate curse of dimensionality and where LSH starts to fail

    Args:
        X: (num_samples, dims) sample matrix; a fresh 200000x10 uniform random
           matrix is drawn when omitted.  The previous default
           ``X=np.random.uniform(size=(200000, 10))`` was evaluated once at
           import time and mutated in place by the normalisation below,
           corrupting later calls -- hence the None sentinel.

    Returns:
      lsh, X, closest, secondclosest, tenthclosest

    >>> import pandas as pd
    >>> lsh, vectors, rank1, rank2, rank10 = test_hyperspheres()
    >>> pd.DataFrame(rank2)
    >>> pd.DataFrame(rank10)
    """
    if X is None:
        X = np.random.uniform(size=(200000, 10))
    tenthclosest = []
    secondclosest = []
    closest = []
    for D in range(2, X.shape[1]):
        lsh = LSHash(int(1024 * 8182. / D) + D, D, num_hashtables=D)
        # query vector, normalised onto the unit hypersphere
        q = np.random.uniform(size=(D, ))
        q /= np.linalg.norm(q)
        distances = []
        for x in X[:, :D]:
            # NOTE(review): rows are indexed *before* normalisation here,
            # unlike the distance computed below -- confirm intentional.
            lsh.index(x)
            x /= np.linalg.norm(x)  # in place: mutates X
            distances += [1. - np.sum(x * q)]  # cosine similarity
        # Brute-force ground truth to compare against the LSH answer.
        distances = sorted(distances)
        print(distances[:10])
        closest10 = lsh.query(q, distance_func='cosine')
        N = len(closest10)
        # Record: [dimension, rank actually available (-1 if no hits),
        # LSH distance (2. sentinel when empty), true distance at that rank].
        tenthclosest += [[
            D,
            min(9, N - 1) if N else -1,
            closest10[min(9, N - 1)][-1] if N else 2.,
            distances[min(9, N - 1)]
        ]]
        secondclosest += [[
            D,
            min(1, N - 1) if N else -1,
            closest10[min(1, N - 1)][-1] if N else 2.,
            distances[min(1, N - 1)]
        ]]
        closest += [[
            D,
            0 if N else -1,
            closest10[0][-1] if N else 2.,
            distances[0]
        ]]
        print(tenthclosest[-1])
        print(secondclosest[-1])
        print(closest[-1])
    # for i, tc in enumerate(tenthclosest):
    #     assert 1e-9 < tc[-2] or 1e-6 < 0.2
    return lsh, X, closest, secondclosest, tenthclosest
def generate_lsh(images):
    """Build an LSHash engine indexing the four edges of every image.

    :param images: array whose first axis is the image index and whose second
        axis length is used as the LSH input dimension
    :return: the populated engine; each entry's extra_data is
        (image index, edge tag)
    """
    # (removed unused local `num_images = images.shape[0]`)
    image_size = images.shape[1]
    engine = LSHash(8, image_size)
    # Create a locality-sensitive hash from all four edges of every image.
    for idx, image in enumerate(tqdm(images)):
        (top, right, bottom, left) = get_all_edges_from_array(image)
        engine.index(top, extra_data=(idx, TOP))
        engine.index(right, extra_data=(idx, RIGHT))
        engine.index(bottom, extra_data=(idx, BOTTOM))
        engine.index(left, extra_data=(idx, LEFT))
    return engine
def knn(df, k, coefficient):
    """Return a random vipno and its k nearest neighbours via LSH.

    :param df: DataFrame whose columns are vipnos and rows are features
    :param k: number of neighbours to return
    :param coefficient: multiplied by the column count to form the hash size
    :return: (the randomly chosen vipno, up to k neighbouring vipnos)
    """
    hash_size = int(coefficient * df.shape[1])
    lsh = LSHash(hash_size, input_dim=df.shape[0])
    for vipno in df:
        lsh.index(df[vipno], extra_data=vipno)
    # Sample one column as the query point.
    random_column = df[df.columns.to_series().sample(1)]
    random_vip = random_column.columns.values[0]
    logging.info('random vipno: {}'.format(random_vip))
    # k + 1 results because the query column itself is expected among them.
    res = lsh.query(random_column.values.flatten())[0:k + 1]
    logging.info('vipno in ranked order using kNN(k = {}):'.format(k))
    knns = []
    for item in res:
        if item[0][1] != random_vip:  # drop the query vip itself
            logging.info(item[0][1])
            knns.append(item[0][1])
    # Previously truncated to a hard-coded 5; use the k parameter instead
    # (identical behaviour for the original k=5 usage).
    return random_vip, knns[:k]
def getLSHashOutput(filename, hash_size, k):
    """LSH-query a random column and return the matrix indices of its bucket.

    :param filename: data file passed to getMatrix
    :param hash_size: multiplied by the column count to form the hash size
    :param k: number of neighbours requested (k + 1 results including self)
    :return: list of column indices of the returned vipnos
    """
    matrix = getMatrix(filename)
    # (removed an unused accumulator that shadowed the builtin `list` and was
    # populated but never read)
    total_num = len(matrix.iloc[0])
    lsh = LSHash(hash_size=int(hash_size * total_num),
                 input_dim=len(matrix.iloc[:, 0]))
    for i in range(total_num):
        lsh.index(input_point=matrix.iloc[:, i], extra_data=matrix.columns[i])
    out_num = rand.randint(0, total_num - 1)
    # Several lshash distance functions exist; 'euclidean' is the default.
    m = lsh.query(query_point=matrix.iloc[:, out_num], num_results=k + 1,
                  distance_func='euclidean')
    print("输入的vipno是" + str(matrix.columns[out_num]) + "\n其桶中的vipno有:")
    bucket = []
    for i in range(len(m)):
        print(m[i][0][1])
        # Map the returned vipno back to its column index.
        tag = np.argwhere(matrix.columns == m[i][0][1])
        bucket.append(int(tag))
    return bucket
def __init__(self,
             hash_size,
             input_dim,
             num_of_hashtables=1,
             storage=None,
             matrices_filename=None,
             overwrite=False):
    """Construct the wrapped LSHash object.

    Attributes:
    :param hash_size: The length of the resulting binary hash in integer.
        E.g., 32 means the resulting binary hash will be 32-bit long.
    :param input_dim: The dimension of the input vector. E.g., a grey-scale
        picture of 30x30 pixels will have an input dimension of 900.
    :param num_of_hashtables: (optional) The number of hash tables used for
        multiple lookups.
    :param storage: (optional) Name of the storage backend used for the
        index; options include "redis".
    :param matrices_filename: (optional) Path to the compressed numpy file
        ending with extension `.npz`, where the uniform random planes are
        stored, or to be stored if the file does not exist yet.
    :param overwrite: (optional) Whether to overwrite the matrices file if
        it already exists.
    """
    self.hash_object = LSHash(
        hash_size=hash_size,                  # length of the binary hash
        input_dim=input_dim,                  # dimension of the input vector
        num_of_hashtables=num_of_hashtables,  # hash tables for multiple lookups (optional)
        storage=storage,                      # (optional) storage backend name, e.g. "redis"
        matrices_filename=matrices_filename,  # (optional) .npz path for the stored random planes
        overwrite=overwrite)                  # (optional) overwrite the matrices file if present
# NOTE(review): tail of a word-count vectoriser whose `def` line lies above
# this chunk -- counts each input word against the vocabulary list.
for word in inputset:
    if word in vocablist:
        # Bag-of-words: increment the count at the word's vocabulary index.
        returnvec[vocablist.index(word)] += 1
    else:
        print('word:', word, 'is not in the list_vec')
return returnvec


if __name__ == '__main__':
    # Build each paper's word set, vectorise the corpus and LSH-index it,
    # then find the nearest training vector to a test document.
    datalist, classlist, vocabset = textprocess('./paper')  # word set per paper
    stop_word_file = './stopwords_cn.txt'
    stop_word_set = make_word_set(stop_word_file)
    feature_words = word_dict(vocabset, 0, stop_word_set)
    trainMat = []
    lsh = LSHash(hash_size=10, input_dim=len(feature_words))
    for postinDoc in datalist:
        trainMat_vec = bagof_word2vec(feature_words, postinDoc)  # vectorise the training set
        trainMat.append(trainMat_vec)
        lsh.index(trainMat_vec)
    testfile = './test.txt'
    testlist = []
    with open(testfile, 'r', encoding='utf-8') as f:
        sequence = f.read()
    testlist.append(jieba.lcut(sequence, cut_all=False))
    testvect = bagof_word2vec(feature_words, testlist[0])
    # Nearest training vector; re[0][0] is the stored input point.
    re = lsh.query(testvect, num_results=1)
    print(list(re[0][0]))
    print(trainMat.index(list(re[0][0])))
trade_mat # show the table # # 2. LSH # In[9]: from lshash.lshash import LSHash import random # In[10]: o = open('lsh_output.txt', 'w') # create a file to write the results # loop with different hash size for e in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]: lsh = LSHash(round(n_vip * e), n_plu) for v in vipno: feature = list(trade_mat[v]) lsh.index(feature, extra_data=v) # pick up a random vipno pick_vip = random.randint(1, n_vip) pick_vip = vipno[pick_vip] o.write("Hash_size = {} * n_plu \n".format(e)) o.write("Pick up a vip: {}\n".format(pick_vip)) # lsh query and write the results candi = lsh.query(list(trade_mat[pick_vip]), 6, distance_func='hamming') # print(len(candi)) for i, item in enumerate(candi[1:]): dist = item[1]
# convert the words into word frequency matrix vectorizer1 = CountVectorizer() X = vectorizer1.fit_transform(corpus) # get the keywords in corpus word = vectorizer1.get_feature_names() transformer = TfidfTransformer() # calculate the TF-IDF values tfidf = transformer.fit_transform(X) lsh = LSHash(6, 8) # construct the centriodSet centriodSet = [] Ind = 0 for i in range(0, DBconnection.DBconnection.count(a.dbconnect_to_collection()) - 1): centriod = [] j = 0 while j < (tfidf.indptr[i + 1] - tfidf.indptr[i]): if j > 7: break centriod.append(round(tfidf.data[Ind + j], 2)) j += 1 if len(centriod) < 8: for index in range(8 - len(centriod)): centriod.append(0 + 1) lsh.index(centriod)
from src.utils import generate_data
import numpy as np
from lshash.lshash import LSHash

np.random.seed(18)  # fixed seed so both implementations see identical data

dimension = 10
size = 1000
hash_size = 3
num_tables = 8

if __name__ == '__main__':
    # Cross-check a custom LSH implementation against the reference LSHash:
    # with identical projection planes, every hash table must agree.
    # NOTE(review): `LSH` is not imported in this chunk -- presumably a
    # project-local class; verify.
    data = generate_data(size, dimension)
    # =============
    lsh = LSH(hash_size, dimension, num_tables)
    lsh.index(data)
    lsh_1 = LSHash(hash_size, dimension, num_tables)
    # make the projection planes the same
    lsh_1.uniform_planes = lsh.projections
    for d in data:
        lsh_1.index(d)
    # Every bucket of every table must hold the same points in both indexes.
    for i in range(num_tables):
        t1 = lsh.hash_tables[i]
        t2 = lsh_1.hash_tables[i]
        for k in t1:
            assert t1[k] == t2.get_val(k)
# NOTE(review): this chunk begins mid-way through a model/optimizer
# construction call -- its opening lines are outside this view.
        weight_decay=1e-4),
    ContrastiveLoss(),
    metric=None,
    device='cuda')
model.load_weights(
    '/home/palm/PycharmProjects/seven2/snapshots/pairs/4/epoch_0_0.016697616640688282.pth'
)
model.model.eval()
# ImageNet normalisation statistics for the 224x224 input transform.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
transform = transforms.Compose(
    [transforms.Resize((224, 224)),
     transforms.ToTensor(), normalize])
lsh = LSHash(hash_size=16, input_dim=1024, num_hashtables=5)
target_path = '/home/palm/PycharmProjects/seven/images/cropped2/unknown/obj'
query_path = '/home/palm/PycharmProjects/seven/images/cropped2/train'
cache_path = '/home/palm/PycharmProjects/seven/caches'
cache_dict = {}
with torch.no_grad():
    # Embed every target crop and search for its closest training match
    # (the inner loop continues beyond this chunk).
    for target_image_path in os.listdir(target_path):
        target = os.path.join(target_path, target_image_path)
        target_image_ori = Image.open(target)
        target_image = transform(target_image_ori)
        x = torch.zeros((1, 3, 224, 224))
        x[0] = target_image
        target_features = model.model._forward_impl(x.cuda())
        minimum = (float('inf'), 0)  # (best distance so far, its label)
        for query_folder in os.listdir(query_path):
def test_lshash():
    """Smoke test: build a 6-bit hash over 8-dim points and query a neighbour."""
    table = LSHash(6, 8)
    samples = [
        [1, 2, 3, 4, 5, 6, 7, 8],
        [2, 3, 4, 5, 6, 7, 8, 9],
        [10, 12, 99, 1, 5, 31, 2, 3],
    ]
    for point in samples:
        table.index(point)
    # Query a point one unit away from the first sample.
    print(table.query([1, 2, 3, 4, 5, 6, 7, 7]))
feature_dict = dict(zip(img_path,sf.features)) # key val of 'image_path':'512_dim_visual_embedding' pickle.dump(feature_dict, open(path/"feature_dict.p", "wb")) feature_dict = pickle.load(open(path/'feature_dict.p','rb')) ## Locality Sensitive Hashing # params k = 10 # hash size L = 5 # number of tables d = 512 # Dimension of Feature vector lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L) # LSH on all the images # for img_path, vec in tqdm_notebook(feature_dict.items()): for img_path, vec in (feature_dict.items()): print(img_path) print(vec) lsh.index(vec.flatten(), extra_data=img_path) ## Exporting as pickle pickle.dump(lsh, open(path/'lsh.p', "wb"))
# @Time    : 2017/10/15 21:35
# @Author  : Jalin Hu
# @File    : note.py
# @Software: PyCharm
from lshash.lshash import LSHash

if __name__ == '__main__':
    # Build a 6-bit LSH over 8-dimensional points and query the two nearest.
    table = LSHash(hash_size=6, input_dim=8)
    points = [
        [1, 2, 3, 4, 5, 6, 7, 8],
        [2, 3, 4, 5, 6, 7, 8, 9],
        [3, 4, 5, 6, 7, 8, 9, 10],
        [10, 12, 99, 1, 5, 6, 24, 20],
    ]
    for point in points:
        table.index(point)
    res = table.query([1, 2, 3, 4, 5, 6, 7, 7], num_results=2)
    print(res)
def createlshash():
    """Create the LSHash used for 128-dim features: 32-bit hashes, 8 tables,
    fixed seed for reproducible projection planes."""
    hash_bits = 32   # hash size (previously tried: 12)
    n_tables = 8     # number of tables (previously tried: 8)
    feat_dim = 128   # dimension of the feature vector
    return LSHash(hash_size=hash_bits, input_dim=feat_dim,
                  num_hashtables=n_tables, seed=40)
import gym
import numpy as np
from PIL import Image
from lshash.lshash import LSHash
from collections import deque
from random import random
from diskcache import FanoutCache, Cache

# Disk-backed Q-table, cleared at start-up.
qtable = Cache('cache')
qtable.clear()
env = gym.make('Breakout-v0')
# LSH over flattened binary frames.
# NOTE(review): a 64x64 1-bit frame flattens to 4096 values, but the index is
# built with input_dim=8192 -- verify the intended dimension.
lshs = LSHash(500, 8192)

LEARNING_RATE = 0.15
DISCOUNT = 0.95
EPISODES = 25000


def preprocess(obs):
    """Downscale an observation to a flat 64x64 binary uint8 array.

    NOTE(review): the body reads the global `observation`, not the `obs`
    parameter -- almost certainly a bug; confirm and switch to `obs`.
    """
    image = Image.fromarray(observation)
    image = image.resize((64, 64))
    image = image.convert(mode='1')  # 1-bit black & white
    array = np.array(image, dtype=np.uint8).flatten()
    return array


def get_action(obs_seq):
    # Look the state up in the LSH table; unseen states get indexed with an
    # optimistic all-ones action-value row.
    # NOTE(review): this definition continues beyond the visible chunk.
    query = lshs.query(obs_seq, num_results=1)
    if len(query) <= 0:
        lshs.index(obs_seq)
        actions = np.ones(env.action_space.n)
        qtable[obs_seq] = actions
    elif query[0][1] >= 10:
        lshs.index(obs_seq)