class LOPQRetriever(object):
    """Deprecated and soon to be removed.

    Approximate retriever: entry metadata is kept in ``self.entries`` and the
    matching LOPQ codes are registered with an ``LOPQSearcher`` under the
    entry's position in that list.
    """

    def __init__(self, name, approximator):
        """Load the approximator's model and build a searcher around it."""
        self.approximate = True
        self.name = name
        self.loaded_entries = set()
        self.entries = []
        self.support_batching = False
        self.approximator = approximator
        # The model has to be loaded before the searcher can wrap it.
        approximator.load()
        self.searcher = LOPQSearcher(model=approximator.model)

    def add_entries(self, entries, video_id, entry_type):
        """Index ``entries`` and remember their metadata.

        Each entry is read as ``entry[0]`` (stored as ``"id"``) and
        ``entry[1]``, whose first two items are converted to tuples and handed
        to the searcher as one code pair (presumably coarse and fine codes --
        confirm against the producer of these entries).
        """
        codes, ids = [], []
        # Searcher ids continue from the current end of self.entries so that
        # a result id can be used directly as an index into that list.
        for position, entry in enumerate(entries, len(self.entries)):
            payload = entry[1]
            codes.append((tuple(payload[0]), tuple(payload[1])))
            ids.append(position)
            self.entries.append(
                {"id": entry[0], "type": entry_type, "video": video_id})
        self.searcher.add_codes(codes, ids)

    def nearest(self, vector=None, n=12):
        """Return the stored entries for the searcher's hits on ``vector``.

        ``n`` is forwarded to the searcher as its ``quota``.
        """
        query = self.approximator.get_pca_vector(vector)
        hits, _visited = self.searcher.search(query, quota=n)
        return [self.entries[hit.id] for hit in hits]
class LOPQ(BaseANN):
    """Nearest-neighbour index backed by an LOPQ model and searcher.

    ``v`` is passed to ``LOPQModel`` as ``V``; the number of subquantizers
    ``M`` is fixed at 4.
    """

    def __init__(self, v):
        m = 4
        self.name = 'LOPQ(v={}, m={})'.format(v, m)
        self._m = m
        self._model = LOPQModel(V=v, M=m)
        self._searcher = None  # created by fit()
        print("Init done")

    def fit(self, X):
        """Fit the model on ``X`` (cast to float32) and index the same data."""
        X = numpy.array(X)
        X = X.astype(numpy.float32)
        self._model.fit(X)
        self._searcher = LOPQSearcher(self._model)
        self._searcher.add_data(X)
        print("Fit done")

    def query(self, v, n):
        """Search the index for ``v`` and return the searcher's raw result.

        ``n`` is accepted for interface compatibility but is not used: the
        search quota is fixed at 100. ``fit`` must have been called first.
        """
        v = v.astype(numpy.float32)
        # The original referenced the undefined names ``searcher`` and ``x``
        # here, which raised NameError on every query. It also dumped the
        # query vector to stdout on each call; that debug output is dropped.
        nns = self._searcher.search(v, quota=100)
        return nns

    def use_threads(self):
        return True
class LOPQRetriever(BaseRetriever):
    """Approximate retriever that looks entries up through an LOPQSearcher."""

    def __init__(self, name, approximator):
        """Initialise the base retriever, load the model, build the searcher."""
        super(LOPQRetriever, self).__init__(
            name=name, approximator=approximator, algorithm="LOPQ")
        self.approximate = True
        self.name = name
        self.loaded_entries = {}
        self.entries = []
        self.support_batching = False
        self.approximator = approximator
        # The searcher wraps the approximator's model, so load it first.
        self.approximator.load()
        self.searcher = LOPQSearcher(model=self.approximator.model)

    def load_index(self, numpy_matrix=None, entries=None):
        """Add ``entries`` to the index.

        ``numpy_matrix`` is not used by this retriever. Every entry must
        provide ``entry['codes']`` whose first two items form the code pair
        given to the searcher; the entry itself is appended to
        ``self.entries``.
        """
        codes, ids = [], []
        # Ids continue from the current length of self.entries so a search
        # result's id indexes straight into that list.
        for position, entry in enumerate(entries, len(self.entries)):
            first, second = entry['codes'][0], entry['codes'][1]
            codes.append((tuple(first), tuple(second)))
            ids.append(position)
            self.entries.append(entry)
        self.searcher.add_codes(codes, ids)

    def nearest(self, vector=None, n=12):
        """Return stored entries for the hits on ``vector`` (quota ``n``)."""
        query = self.approximator.get_pca_vector(vector)
        hits, _visited = self.searcher.search(query, quota=n)
        return [self.entries[hit.id] for hit in hits]
def fit(self, X):
    """Train the LOPQ model on ``X`` and index the same samples.

    ``X`` is converted to a float32 numpy array before use.
    """
    samples = numpy.array(X).astype(numpy.float32)
    self._model.fit(samples)
    # A fresh searcher is created on every fit and filled with the training
    # data itself.
    self._searcher = LOPQSearcher(self._model)
    self._searcher.add_data(samples)
    print("Fit done")
def __init__(self, name, approximator):
    """Set up retriever state and build a searcher from the loaded model."""
    self.approximate = True
    self.name = name
    self.loaded_entries = set()
    self.entries = []
    self.support_batching = False
    self.approximator = approximator
    # load() must run before the model is handed to the searcher.
    approximator.load()
    self.searcher = LOPQSearcher(model=approximator.model)
def __init__(self, name, approximator):
    """Initialise the base retriever, then the LOPQ-specific state."""
    super(LOPQRetriever, self).__init__(
        name=name, approximator=approximator, algorithm="LOPQ")
    self.approximate = True
    self.name = name
    self.loaded_entries = {}
    self.entries = []
    self.support_batching = False
    self.approximator = approximator
    # The searcher wraps the approximator's model, so load it first.
    self.approximator.load()
    self.searcher = LOPQSearcher(model=self.approximator.model)
class ApproximateIndexer(object):
    """Train, persist and query an LOPQ model behind a PCA front end."""

    def __init__(self, index_name, model_path, lmdb_path, V=16, M=16):
        self.model = LOPQModel(V, M)
        self.index_name = index_name
        self.searcher = None  # created by fit_model()
        self.model_path = model_path
        self.lmdb_path = lmdb_path

    def load(self):
        """Replace ``self.model`` with the model stored at ``model_path``.

        Raises:
            IOError: if the model could not be read.
        """
        # load_proto hands back a new model instead of filling in the object
        # it is called on, so the result has to be stored. The original call
        # threw it away and left the untrained model in place.
        loaded = LOPQModel.load_proto(self.model_path)
        if loaded is None:
            raise IOError(
                "could not load LOPQ model from {}".format(self.model_path))
        self.model = loaded

    def fit(self, train):
        """Fit the PCA reduction, the rotation ``P``/``mu`` and the model."""
        print(train.shape)
        self.pca_reduction = PCA(n_components=256)
        self.pca_reduction.fit(train)
        train = self.pca_reduction.transform(train)
        self.P, self.mu = pca(train)
        # Centre before rotating, exactly as transform() does, so the model
        # is trained in the same space that indexed and query vectors use.
        train = train - self.mu
        train = np.dot(train, self.P)
        print(train.shape)
        self.model.fit(train, n_init=1)

    def transform(self, test):
        """Project ``test`` with the fitted PCA, centre it and rotate it.

        Requires ``fit`` to have been called on this instance.
        """
        print(test.shape)
        test = self.pca_reduction.transform(test)
        test = test - self.mu
        test = np.dot(test, self.P)
        print(test.shape)
        return test

    def fit_model(self, train):
        """Fit on ``train``, export the model and create an empty searcher."""
        self.fit(train)
        self.model.export_proto(self.model_path)
        # In-memory searcher; an LMDB-backed searcher using self.lmdb_path
        # was the alternative noted here.
        self.searcher = LOPQSearcher(self.model)

    def experiment(self, data):
        """Hold out 10% of ``data``, train on the rest and print the recall."""
        train, test = train_test_split(data, test_size=0.1)
        print('%s %s %s' % (data.shape, train.shape, test.shape))
        nns = compute_all_neighbors(test, train)
        self.fit_model(train)
        self.searcher.add_data(self.transform(train))
        recall, _ = get_recall(self.searcher, self.transform(test), nns)
        print('Recall (V={}, M={}, subquants={}): {}'.format(
            self.model.V, self.model.M, self.model.subquantizer_clusters,
            str(recall)))

    def add_data(self, data):
        """Add ``data`` to the searcher unchanged (no transform is applied)."""
        self.searcher.add_data(data)

    def search(self, x):
        """Search for ``x`` with a fixed quota of 100."""
        return self.searcher.search(x, quota=100)
class LOPQ(BaseANN):
    """Nearest-neighbour index backed by an LOPQ model and searcher.

    ``v`` becomes the model's ``V``; ``M`` is fixed at 4.
    """

    def __init__(self, v):
        subquantizers = 4
        self.name = 'LOPQ(v={}, m={})'.format(v, subquantizers)
        self._m = subquantizers
        self._model = LOPQModel(V=v, M=subquantizers)
        self._searcher = None  # created by fit()

    def fit(self, X):
        """Train the model on ``X`` (as float32) and index the same data."""
        samples = numpy.array(X).astype(numpy.float32)
        self._model.fit(samples)
        self._searcher = LOPQSearcher(self._model)
        self._searcher.add_data(samples)

    def query(self, v, n):
        """Return the searcher's result for ``v``.

        ``n`` is not used; the search quota is fixed at 100.
        """
        return self._searcher.search(v.astype(numpy.float32), quota=100)
def cluster(self):
    """Reduce, rotate and quantize ``self.data``; annotate every entry.

    Steps: fit a PCA reduction to ``self.n_components`` dimensions, centre
    and rotate the data with ``self.pca()``, train an LOPQ model on an 80%
    split, and write the ``coarse`` / ``fine`` codes of each data row into
    the matching entry. In test mode the recall on the held-out 20% is
    printed as well.
    """
    self.pca_reduction = PCA(n_components=self.n_components)
    self.pca_reduction.fit(self.data)
    self.data = self.pca_reduction.transform(self.data)
    self.P, self.mu = self.pca()
    self.data = self.data - self.mu
    self.data = np.dot(self.data, self.P)
    train, test = train_test_split(self.data, test_size=0.2)
    self.model = LOPQModel(V=self.v, M=self.m,
                           subquantizer_clusters=self.sub)
    self.model.fit(train, n_init=1)
    self.searcher = LOPQSearcher(self.model)
    if self.test_mode:
        self.searcher.add_data(train)
        nns = compute_all_neighbors(test, train)
        recall, _ = get_recall(self.searcher, test, nns)
        print('Recall (V=%d, M=%d, subquants=%d): %s' % (
            self.model.V, self.model.M, self.model.subquantizer_clusters,
            str(recall)))
    for i, e in enumerate(self.entries):
        # Encode each row once; the original ran predict() twice per entry
        # to read the two fields of the same code.
        code = self.model.predict(self.data[i])
        e['coarse'] = code.coarse
        e['fine'] = code.fine
class Clustering(object):
    """Train an LOPQ model over features loaded from ``.npy`` files.

    Every feature file is paired with a JSON file of entries whose path is
    obtained by replacing ``'npy'`` with ``'json'`` in the feature path.
    """

    def __init__(self, fnames, n_components, model_proto_filename, m, v, sub,
                 test_mode=False):
        """Load all feature matrices and entries and derive output paths.

        Args:
            fnames: paths of the ``.npy`` feature files.
            n_components: target dimensionality of the PCA reduction.
            model_proto_filename: where the model is exported; the P, mu,
                PCA and permutation files are named after it.
            m, v, sub: passed to ``LOPQModel`` as ``M``, ``V`` and
                ``subquantizer_clusters``.
            test_mode: when true, ``cluster`` also prints the recall.
        """
        data = []
        self.fnames = fnames
        self.entries = []
        for fname in fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                nmat = nmat.squeeze()
            data.append(nmat)
            # open() inside a with-block: the original used the Python 2
            # only file() builtin and never closed the handle.
            with open(fname.replace('npy', 'json')) as entries_file:
                self.entries.extend(json.load(entries_file))
        if len(data) > 1:
            self.data = np.concatenate(data)
        else:
            self.data = data[0]
        logging.info(self.data.shape)
        self.test_mode = test_mode
        self.n_components = n_components
        self.m = m
        self.v = v
        self.sub = sub
        self.model = None
        # ``searcher`` is the attribute cluster() and find() actually use;
        # ``search`` is kept only because the original initialised it.
        self.searcher = None
        self.search = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.model_proto_filename = model_proto_filename
        self.P_filename = model_proto_filename.replace('.proto', '.P.npy')
        self.mu_filename = model_proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = model_proto_filename.replace('.proto', '.pca.pkl')
        self.permuted_inds_filename = model_proto_filename.replace(
            '.proto', '.permuted_inds.pkl')
        self.permuted_inds = None

    def pca(self):
        """Compute a variance-balancing rotation for ``self.data``.

        A simple PCA implementation that demonstrates how eigenvalue
        allocation is used to permute dimensions in order to balance the
        variance across subvectors. There are plenty of PCA implementations
        elsewhere. What is important is that the eigenvalues can be used to
        compute a variance-balancing dimension permutation.

        Returns:
            ``(P, mu)``: the permuted eigenvector matrix and the data mean.
            The permutation itself is stored in ``self.permuted_inds``.
        """
        count, D = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        # The sum of the outer products of all rows is data^T . data; one
        # matrix product replaces the per-row Python-level reduce().
        summed_covar = np.dot(self.data.T, self.data)
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        """Reduce, rotate and quantize the data; annotate every entry.

        Trains the model on an 80% split, optionally prints the recall on
        the remaining 20% (test mode), then stores the ``coarse`` and
        ``fine`` codes of each data row in the matching entry.
        """
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        train, test = train_test_split(self.data, test_size=0.2)
        self.model = LOPQModel(V=self.v, M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(train, n_init=1)
        self.searcher = LOPQSearcher(self.model)
        if self.test_mode:
            self.searcher.add_data(train)
            nns = compute_all_neighbors(test, train)
            recall, _ = get_recall(self.searcher, test, nns)
            print('Recall (V=%d, M=%d, subquants=%d): %s' % (
                self.model.V, self.model.M, self.model.subquantizer_clusters,
                str(recall)))
        for i, e in enumerate(self.entries):
            # Encode each row once instead of once per code field.
            code = self.model.predict(self.data[i])
            e['coarse'] = code.coarse
            e['fine'] = code.fine

    def find(self):
        """Print a random entry and the searcher's results for its vector.

        Requires ``cluster`` to have been called so a searcher exists.
        """
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        """Write the model, PCA object, P, mu and permutation to disk."""
        self.model.export_proto(self.model_proto_filename)
        # pickle and np.save emit binary data, so the files are opened in
        # binary mode; text mode corrupts them on Windows and is rejected
        # outright on Python 3.
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)

    def load(self):
        """Load the LOPQ model from ``model_proto_filename``.

        Only the model is restored; P, mu and the PCA object are not.
        """
        self.model = LOPQModel.load_proto(self.model_proto_filename)
def main(new=True):
    """Compare a brute-force nearest-neighbour search with an LOPQ index.

    Args:
        new: when true, random data is generated, a model is fitted, and
            both are saved (``data.npy``, ``params.mat``); when false, both
            are loaded from those files instead.

    Prints the brute-force top 10, the LOPQ top results, timings, and how
    many of the LOPQ results appear in the brute-force top 10.
    """
    # data: 3000 x 128dim
    if not new:
        # load data
        data = np.load('./data.npy')
    else:
        data = np.vstack((np.random.rand(1000, 128),
                          np.random.rand(1000, 128) + 1,
                          np.random.rand(1000, 128) - 1))
        print('make data')
        # save data
        np.save('data.npy', data)

    # The query vector whose nearest neighbours we want.
    x = np.ones(128) * 2

    print('naive implementation')
    start = time.time()
    dist = np.sum(np.power((data - x), 2), axis=1)
    res = np.argsort(dist)
    print(res[0:10])  # indices of the top 10
    print('%s %s' % (time.time() - start, 's taken for naive NNsearch'))

    model = None
    if not new:
        # load model
        model = LOPQModel.load_mat('params.mat')
    else:
        # Define a model and fit it to data
        model = LOPQModel(V=3, M=2, subquantizer_clusters=64)
        start = time.time()
        model.fit(data)
        print('%s %s' % (time.time() - start, 's taken for model fitting'))
        # save model
        model.export_mat('params.mat')

    # Author's note on the codes: model.predict(y) computes the LOPQ code of
    # a 128-dim vector y. With SC = subquantizer_clusters and M = 2 the
    # output is described as coarse codes (V, V) and fine codes (SC, SC).

    # Create a searcher to index data with the model
    searcher = LOPQSearcher(model)
    searcher.add_data(data)

    start = time.time()
    # Retrieve ranked nearest neighbors
    nns = searcher.search(x, quota=10)
    # Slice rather than index 0..9 so fewer than ten hits cannot raise.
    ans = [hit[0] for hit in nns[0][:10]]
    print(ans)
    print('%s %s' % (time.time() - start, 's taken for prediction top 10'))

    # Report the overlap once, after counting. The original tied the print
    # to an ``else`` branch of the counting loop.
    exact_top = set(res[0:10].tolist())
    count = sum(1 for element in ans if element in exact_top)
    print('accuracy:  %s / %s' % (count, 10))
def main(input_dir='/Users/aub3/temptest/gtin/', output_dir="/Users/aub3/temptest/products"):
    """Measure LOPQ recall on the products index for three model settings.

    ``input_dir`` is currently unused (the ``prepare`` step is disabled).
    The data is reduced to 32 dimensions, centred and rotated, then split
    80/20 into an indexed set and a query set.
    """
    recall_line = 'Recall (V=%d, M=%d, subquants=%d): %s'

    def report(model):
        # Index the training split with ``model`` and print its recall.
        index = LOPQSearcher(model)
        index.add_data(train)
        score, _ = get_recall(index, test, nns)
        print(recall_line % (
            model.V, model.M, model.subquantizer_clusters, str(score)))

    products = external_indexed.ProductsIndex(path=output_dir)
    products.build_approximate()
    data = products.data
    print(data.shape)

    reducer = PCA(n_components=32)
    reducer.fit(data)
    data = reducer.transform(data)
    print(data.shape)

    P, mu = pca(data)
    data = np.dot(data - mu, P)

    train, test = train_test_split(data, test_size=0.2)
    print('%s %s' % (train.shape, test.shape))
    nns = compute_all_neighbors(test, train)

    # Baseline model; the progress messages are printed for this one only.
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1)
    print("fitted")
    searcher = LOPQSearcher(m)
    print("adding data")
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print(recall_line % (m.V, m.M, m.subquantizer_clusters, str(recall)))

    # Same coarse quantizers, twice as many subquantizers.
    m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1)
    report(m2)

    # Same coarse quantizers and rotations, more subquantizer clusters.
    m3 = LOPQModel(V=16, M=8, subquantizer_clusters=512,
                   parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1)
    report(m3)
def fit_model(self, train):
    """Fit on ``train``, export the model, and create an empty searcher."""
    self.fit(train)
    trained = self.model
    trained.export_proto(self.model_path)
    # In-memory searcher; an LMDB-backed searcher using self.lmdb_path was
    # the alternative noted here.
    self.searcher = LOPQSearcher(trained)
def main():
    """
    Short demo: train several LOPQ models on the oxford dataset and print
    the recall of each, with notes on the trade-offs involved.
    """

    def report_recall(model):
        # Fill a fresh LOPQSearcher with the training data and print the
        # recall statistics. By default 1000 ranked results are retrieved
        # per query vector for the evaluation.
        index = LOPQSearcher(model)
        index.add_data(train)
        stats, _ = get_recall(index, test, nns)
        print('Recall (V=%d, M=%d, subquants=%d): %s' % (model.V, model.M, model.subquantizer_clusters, str(stats)))

    # Oxford dataset; see the README in data/oxford for details.
    data = load_oxford_data()

    # PCA of the dataset, followed by mean centring and rotation (which
    # includes the dimension permutation). It is worth checking how this
    # affects recall: on this dataset, which is already PCA'd from higher
    # dimensional features, the extra variance-balancing step typically
    # improves recall@1 by 3-5%, and the benefit can be much larger on
    # other datasets.
    P, mu = pca(data)
    data = np.dot(data - mu, P)

    # The test split becomes the query set for which the true nearest
    # neighbours are computed.
    train, test = train_test_split(data, test_size=0.2)

    # Distance-sorted neighbours in the training set for every test point;
    # this is the ground truth for the recall evaluation.
    nns = compute_all_neighbors(test, train)

    # Baseline model. No random seed is given, so runs differ. A warning
    # that some local projections cannot be estimated because too few
    # points fall in a cluster is fine for this demo; to avoid it, use more
    # training data or fewer clusters (the V hyperparameter).
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1)
    report_recall(m)

    # Other hyperparameters can be explored without discarding all fitted
    # parameters every time. Here the coarse quantizers are reused and the
    # number of subquantizers M is increased.
    m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1)

    # Recall is probably higher now: a finer quantization buys recall at
    # the cost of more data per index item.
    report_recall(m2)

    # Alternatively keep both the coarse quantizers and the rotations fixed
    # and see what more subquantizer clusters do to performance.
    m3 = LOPQModel(V=16, M=8, subquantizer_clusters=512, parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1)
    report_recall(m3)
def cluster(self):
    """Reduce and rotate ``self.data``, then print recall for three models.

    The data is reduced to 32 dimensions, centred with ``mu`` and rotated
    with ``P`` from ``self.pca()``, and split 80/20 into an indexed set and
    a query set. Three LOPQ configurations are then fitted and evaluated.
    """

    def report(model):
        # Index the training split with ``model`` and print its recall.
        searcher = LOPQSearcher(model)
        searcher.add_data(train)
        recall, _ = get_recall(searcher, test, nns)
        print('Recall (V=%d, M=%d, subquants=%d): %s' % (
            model.V, model.M, model.subquantizer_clusters, str(recall)))

    print(self.data.shape)
    pca_reduction = PCA(n_components=32)
    pca_reduction.fit(self.data)
    self.data = pca_reduction.transform(self.data)
    print(self.data.shape)
    P, mu = self.pca()
    self.data = self.data - mu
    # The rotated data used to be assigned to an unused local ``data``, so
    # the split below worked on unrotated vectors and the rotation was
    # silently skipped. Store it back on self.data.
    self.data = np.dot(self.data, P)
    train, test = train_test_split(self.data, test_size=0.2)
    print('%s %s' % (train.shape, test.shape))
    nns = compute_all_neighbors(test, train)

    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1)
    print("fitted")
    searcher = LOPQSearcher(m)
    print("adding data")
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        m.V, m.M, m.subquantizer_clusters, str(recall)))

    # Reuse the coarse quantizers, double the number of subquantizers.
    m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1)
    report(m2)

    # Reuse coarse quantizers and rotations, more subquantizer clusters.
    m3 = LOPQModel(V=16, M=8, subquantizer_clusters=512,
                   parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1)
    report(m3)
def main():
    """
    Demo script: fit three LOPQ models on the oxford dataset and print the
    recall of each, discussing the trade-offs along the way.
    """

    def fit_and_score(model):
        # Fit ``model`` on the training split, index that split in a new
        # LOPQSearcher and print the recall. By default 1000 ranked results
        # are retrieved per query vector for the evaluation.
        model.fit(train, n_init=1)
        index = LOPQSearcher(model)
        index.add_data(train)
        stats, _ = get_recall(index, test, nns)
        print('Recall (V=%d, M=%d, subquants=%d): %s' % (model.V, model.M, model.subquantizer_clusters, str(stats)))

    # Load the oxford dataset (details in the README under data/oxford).
    data = load_oxford_data()

    # Compute its PCA, then mean-centre and rotate the data; the rotation
    # includes the dimension permutation. On this dataset, already PCA'd
    # from higher dimensional features, this variance-balancing step
    # typically improves recall@1 by 3-5%; elsewhere the gain can be much
    # greater, so it is worth measuring.
    P, mu = pca(data)
    centred = data - mu
    data = np.dot(centred, P)

    # Train/test split: the test part serves as the queries whose true
    # nearest neighbours are computed next.
    train, test = train_test_split(data, test_size=0.2)

    # Ground truth for recall: distance-sorted neighbours in the training
    # set for each test point.
    nns = compute_all_neighbors(test, train)

    # First model. Fitting is unseeded, so results vary between runs. If a
    # warning says some local projections can't be estimated because too
    # few points fall in a cluster, that is acceptable here; more training
    # data or a smaller V avoids it.
    m = LOPQModel(V=16, M=8)
    fit_and_score(m)

    # Hyperparameters can be varied without refitting everything: this
    # model shares the coarse quantizers but uses more subquantizers (a
    # larger M). Recall is probably higher, paid for with more data per
    # index item.
    m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
    fit_and_score(m2)

    # Holding coarse quantizers and rotations fixed, increase the number of
    # subquantizer clusters and observe the effect.
    m3 = LOPQModel(V=16, M=8, subquantizer_clusters=512, parameters=(m.Cs, m.Rs, m.mus, None))
    fit_and_score(m3)
gtobj = GTOBJ()
# Labels that count as relevant for each retrieval task.
relevant_labels_mapping = {
    'DSVR': ['ND', 'DS'],
    'CSVR': ['ND', 'DS', 'CS'],
    'ISVR': ['ND', 'DS', 'CS', 'IS'],
}
print('LOPQModel!')
start = time.time()
final_vids, features, vid2features = load_features(
    '/home/camp/FIVR/features/vcms_v1', is_gv=False)

# Define a model and fit it to data.
# Build the (N, 512) matrix once and use it both to fit the model and to
# fill the index; the original indexed the raw ``features`` object while
# fitting on the reshaped matrix.
feature_matrix = np.array(features).reshape(-1, 512)
model = LOPQModel(V=8, M=4)
model.fit(feature_matrix)

# Create a searcher to index data with the model
searcher = LOPQSearcher(model)
searcher.add_data(feature_matrix)
print('Read time: %.2f' % (time.time() - start))

# Load features
vids = list(vid2features.keys())
print(vids[:10])
global_features = np.squeeze(
    np.asarray(list(vid2features.values()), np.float32))
print(np.shape(global_features))

# Load vid2name and name2vid
with open('/home/camp/FIVR/vid2name.pk', 'rb') as pk_file:
    vid2names = pk.load(pk_file)
# NOTE(review): name2vids is read from the same vid2name.pk file as
# vid2names, which looks like a copy-paste slip -- confirm the intended
# path (presumably a name2vid pickle) before relying on this mapping.
with open('/home/camp/FIVR/vid2name.pk', 'rb') as pk_file:
    name2vids = pk.load(pk_file)