def cluster(self):
    """
    Reduce and rotate ``self.data``, fit an LOPQ model, and fill the searcher.

    Side effects (all on ``self``):
      * ``pca_reduction`` - PCA fitted with ``self.n_components`` components.
      * ``data``          - replaced by the PCA projection, then centred with
                            ``mu`` and multiplied by ``P`` from ``self.pca()``.
      * ``P``, ``mu``     - the values returned by ``self.pca()``.
      * ``model``         - ``LOPQModel`` fitted on the 80% training split.
      * ``entries``       - each entry gains ``'coarse'``, ``'fine'`` and
                            ``'index'`` keys.
      * ``searcher``      - ``LOPQSearcherLMDB`` built from ``self.model`` and
                            ``self.model_lmdb_filename``.

    When ``self.test_mode`` is truthy only the training split is indexed and
    recall against the held-out split is printed; otherwise all of
    ``self.data`` is indexed.
    """
    # Dimensionality reduction.
    self.pca_reduction = PCA(n_components=self.n_components)
    self.pca_reduction.fit(self.data)
    self.data = self.pca_reduction.transform(self.data)

    # Centre and rotate using the parameters reported by self.pca().
    self.P, self.mu = self.pca()
    self.data = np.dot(self.data - self.mu, self.P)

    fit_rows, held_out = train_test_split(self.data, test_size=0.2)
    self.model = LOPQModel(
        V=self.v,
        M=self.m,
        subquantizer_clusters=self.sub,
    )
    self.model.fit(fit_rows, n_init=1)

    # Store the codes on the entries so they are not recomputed later
    # (original note: "avoid doing this twice again in searcher").
    for position, entry in enumerate(self.entries):
        code = self.model.predict(self.data[position])
        entry['coarse'] = code.coarse
        entry['fine'] = code.fine
        entry['index'] = position

    self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)

    if not self.test_mode:
        self.searcher.add_data(self.data)
        return

    # Test mode: index the training rows only and score the held-out rows.
    self.searcher.add_data(fit_rows)
    truth = compute_all_neighbors(held_out, fit_rows)
    recall, _ = get_recall(self.searcher, held_out, truth)
    summary = 'Recall (V=%d, M=%d, subquants=%d): %s' % (
        self.model.V,
        self.model.M,
        self.model.subquantizer_clusters,
        str(recall),
    )
    print(summary)
def experiment(self, data):
    """
    Fit on a random 90/10 split of ``data`` and print the resulting recall.

    The true neighbours are computed on the split as given, while the
    searcher is filled and queried with the output of ``self.transform``.

    :param data: array-like exposing ``.shape``; split with
                 ``train_test_split(data, test_size=0.1)``.
    """
    fit_rows, queries = train_test_split(data, test_size=0.1)
    print('%s %s %s' % (data.shape, fit_rows.shape, queries.shape))

    truth = compute_all_neighbors(queries, fit_rows)

    self.fit_model(fit_rows)
    indexed = self.transform(fit_rows)
    self.searcher.add_data(indexed)

    projected_queries = self.transform(queries)
    recall, _ = get_recall(self.searcher, projected_queries, truth)

    report = 'Recall (V={}, M={}, subquants={}): {}'.format(
        self.model.V,
        self.model.M,
        self.model.subquantizer_clusters,
        str(recall),
    )
    print(report)
def test_oxford5k():
    """
    Regression test: fit LOPQ models on the oxford data with a fixed seed.

    Checks the code predicted for the first test vector, the number of empty
    cells, and the recall of a fully fitted model and of two partially
    re-fitted models (coarse quantizers only; coarse quantizers plus
    rotations).
    """
    seed = 40

    features = load_oxford_data()
    train, test = train_test_split(features, test_size=0.2, random_state=seed)

    # Distance-sorted neighbours in the training set for each test point.
    nns = compute_all_neighbors(test, train)

    def assert_recall(model):
        # Index the training set with `model` and require every recall
        # figure to exceed its threshold.
        index = LOPQSearcher(model)
        index.add_data(train)
        recall, _ = get_recall(index, test, nns)
        assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Full fit.
    full = LOPQModel(V=16, M=8)
    full.fit(train, n_init=1, random_state=seed)

    # The code computed for the first test vector must match exactly.
    expected_code = ((3, 2), (14, 164, 83, 49, 185, 29, 196, 250))
    assert_equal(full.predict(test[0]), expected_code)

    # Only a handful of cells may be empty.
    histogram = get_cell_histogram(train, full)
    assert_equal(np.count_nonzero(histogram == 0), 6)

    # True nearest-neighbour recall on the test set.
    assert_recall(full)

    # Partial fit reusing only the coarse quantizers.
    coarse_only = LOPQModel(V=16, M=8, parameters=(full.Cs, None, None, None))
    coarse_only.fit(train, n_init=1, random_state=seed)
    assert_recall(coarse_only)

    # Partial fit reusing the coarse quantizers and the rotations.
    coarse_and_rotations = LOPQModel(
        V=16, M=8, parameters=(full.Cs, full.Rs, full.mus, None))
    coarse_and_rotations.fit(train, n_init=1, random_state=seed)
    assert_recall(coarse_and_rotations)
def main(input_dir='/Users/aub3/temptest/gtin/',
         output_dir="/Users/aub3/temptest/products"):
    """
    Train three LOPQ configurations on the products index and print recall.

    :param input_dir: currently unused, because the ``products.prepare``
                      step below is disabled.
    :param output_dir: passed as ``path`` to ``external_indexed.ProductsIndex``.
    """
    products = external_indexed.ProductsIndex(path=output_dir)
    # products.prepare(input_dir)
    products.build_approximate()
    features = products.data
    # features = load_oxford_data()
    print(features.shape)

    # Reduce to 32 dimensions.
    reducer = PCA(n_components=32)
    reducer.fit(features)
    features = reducer.transform(features)
    print(features.shape)

    # Centre and rotate with the parameters returned by pca().
    P, mu = pca(features)
    features = np.dot(features - mu, P)

    train, test = train_test_split(features, test_size=0.2)
    print('%s %s' % (train.shape, test.shape))
    nns = compute_all_neighbors(test, train)

    def show_recall(model, searcher):
        # Score the already-populated searcher and print one summary line.
        recall, _ = get_recall(searcher, test, nns)
        print('Recall (V=%d, M=%d, subquants=%d): %s' % (
            model.V, model.M, model.subquantizer_clusters, str(recall)))

    # Baseline model.
    baseline = LOPQModel(V=16, M=8)
    baseline.fit(train, n_init=1)
    print("fitted")
    index = LOPQSearcher(baseline)
    print("adding data")
    index.add_data(train)
    show_recall(baseline, index)

    # Same coarse quantizers, M raised to 16.
    finer = LOPQModel(V=16, M=16, parameters=(baseline.Cs, None, None, None))
    finer.fit(train, n_init=1)
    index = LOPQSearcher(finer)
    index.add_data(train)
    show_recall(finer, index)

    # Coarse quantizers and rotations reused, 512 subquantizer clusters.
    wider = LOPQModel(
        V=16, M=8, subquantizer_clusters=512,
        parameters=(baseline.Cs, baseline.Rs, baseline.mus, None))
    wider.fit(train, n_init=1)
    index = LOPQSearcher(wider)
    index.add_data(train)
    show_recall(wider, index)
def cluster(self):
    """
    Reduce ``self.data`` to 32 dimensions, rotate it, and print the recall
    of three LOPQ configurations trained on an 80/20 split.

    Side effect: ``self.data`` is replaced by its PCA projection, centred
    with ``mu`` and multiplied by ``P`` (both returned by ``self.pca()``).

    Fix over the previous version: the product ``np.dot(self.data, P)`` was
    bound to an unused local, so the split, the model fit and the recall
    evaluation all ran on centred but un-rotated data. The result is now
    stored back on ``self.data`` before splitting.
    """
    print(self.data.shape)

    pca_reduction = PCA(n_components=32)
    pca_reduction.fit(self.data)
    self.data = pca_reduction.transform(self.data)
    print(self.data.shape)

    P, mu = self.pca()
    self.data = self.data - mu
    # Keep the rotated data: everything below must see the rotation.
    self.data = np.dot(self.data, P)

    train, test = train_test_split(self.data, test_size=0.2)
    print('%s %s' % (train.shape, test.shape))

    # Ground-truth neighbours of every test point within the training set.
    nns = compute_all_neighbors(test, train)

    # Baseline model.
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1)
    print("fitted")
    searcher = LOPQSearcher(m)
    print("adding data")
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        m.V, m.M, m.subquantizer_clusters, str(recall)))

    # Reuse the coarse quantizers, raise M to 16.
    m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1)
    searcher = LOPQSearcher(m2)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        m2.V, m2.M, m2.subquantizer_clusters, str(recall)))

    # Reuse coarse quantizers and rotations, use 512 subquantizer clusters.
    m3 = LOPQModel(V=16, M=8, subquantizer_clusters=512,
                   parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1)
    searcher = LOPQSearcher(m3)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        m3.V, m3.M, m3.subquantizer_clusters, str(recall)))
def cluster(self):
    """
    Prepare ``self.data``, train the LOPQ model and populate the searcher.

    The data is projected with a PCA of ``self.n_components`` components,
    centred with ``mu`` and multiplied by ``P`` (both from ``self.pca()``).
    An ``LOPQModel`` is fitted on 80% of the rows, every element of
    ``self.entries`` is tagged with its ``'coarse'`` and ``'fine'`` codes
    and its ``'index'``, and a ``LOPQSearcherLMDB`` is created from the
    model and ``self.model_lmdb_filename``.

    If ``self.test_mode`` is truthy the searcher receives the training rows
    and recall on the remaining rows is printed; otherwise it receives all
    of ``self.data``.
    """
    reducer = PCA(n_components=self.n_components)
    self.pca_reduction = reducer
    reducer.fit(self.data)
    self.data = reducer.transform(self.data)

    self.P, self.mu = self.pca()
    centred = self.data - self.mu
    self.data = np.dot(centred, self.P)

    train_part, test_part = train_test_split(self.data, test_size=0.2)

    self.model = LOPQModel(V=self.v, M=self.m,
                           subquantizer_clusters=self.sub)
    self.model.fit(train_part, n_init=1)

    # Record the codes on each entry now so the work is not repeated later
    # (original note: "avoid doing this twice again in searcher").
    for idx, entry in enumerate(self.entries):
        predicted = self.model.predict(self.data[idx])
        entry['coarse'] = predicted.coarse
        entry['fine'] = predicted.fine
        entry['index'] = idx

    self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)

    if self.test_mode:
        self.searcher.add_data(train_part)
        neighbours = compute_all_neighbors(test_part, train_part)
        recall, _ = get_recall(self.searcher, test_part, neighbours)
        model = self.model
        print('Recall (V=%d, M=%d, subquants=%d): %s' % (
            model.V, model.M, model.subquantizer_clusters, str(recall)))
        return

    self.searcher.add_data(self.data)
def main():
    """
    A brief demo script showing how to train various LOPQ models with brief
    discussion of trade offs.
    """
    # Load the oxford dataset and compute its PCA. See the README in
    # data/oxford for details about this dataset.
    features = load_oxford_data()
    P, mu = pca(features)

    # Mean center and rotate the data; this includes dimension permutation.
    # It is worthwhile to see how this affects recall performance. On this
    # dataset, which is already PCA'd from higher dimensional features,
    # this additional variance-balancing step typically improves recall@1
    # by 3-5%. The benefit can be much greater depending on the dataset.
    features = np.dot(features - mu, P)

    # Split into train and test. The test split becomes the set of queries
    # for which the true nearest neighbors are computed.
    train, test = train_test_split(features, test_size=0.2)

    # Distance-sorted neighbors in the training set for each test point;
    # this is the ground truth for recall evaluation.
    nns = compute_all_neighbors(test, train)

    def evaluate(model):
        # Populate a LOPQSearcher with the training data and print recall
        # stats. By default, 1000 ranked results are retrieved for each
        # query vector for recall evaluation.
        index = LOPQSearcher(model)
        index.add_data(train)
        recall, _ = get_recall(index, test, nns)
        print('Recall (V=%d, M=%d, subquants=%d): %s' % (
            model.V, model.M, model.subquantizer_clusters, str(recall)))

    # Fit the baseline model.
    # No random seed is specified for fitting, so different runs will
    # differ. You may also see a warning that some local projections can't
    # be estimated because too few points fall in a cluster. This is ok for
    # the purposes of this demo, but you might want to avoid it by
    # increasing the amount of training data or decreasing the number of
    # clusters (the V hyperparameter).
    baseline = LOPQModel(V=16, M=8)
    baseline.fit(train, n_init=1)

    # With a model in hand, we can test its recall.
    evaluate(baseline)

    # Other hyperparameters can be explored without discarding all
    # parameters every time. Here a new model reuses the same coarse
    # quantizers but has a higher number of subquantizers, i.e. a larger M.
    more_subquantizers = LOPQModel(
        V=16, M=16, parameters=(baseline.Cs, None, None, None))
    more_subquantizers.fit(train, n_init=1)

    # Evaluate again. The recall is probably higher: a finer quantization
    # gives better recall at the expense of more data per index item.
    evaluate(more_subquantizers)

    # Both the coarse quantizers and the rotations can also be held fixed
    # to see what more subquantizer clusters do to performance.
    more_clusters = LOPQModel(
        V=16, M=8, subquantizer_clusters=512,
        parameters=(baseline.Cs, baseline.Rs, baseline.mus, None))
    more_clusters.fit(train, n_init=1)
    evaluate(more_clusters)
def main():
    """
    A brief demo script showing how to train various LOPQ models with brief
    discussion of trade offs.

    Results are written with the single-argument ``print(...)`` form, which
    prints the same text on Python 2 and is valid syntax on Python 3.
    """

    # Get the oxford dataset
    data = load_oxford_data()

    # Compute PCA of oxford dataset. See README in data/oxford for details
    # about this dataset.
    P, mu = pca(data)

    # Mean center and rotate the data; includes dimension permutation.
    # It is worthwhile to see how this affects recall performance. On this
    # dataset, which is already PCA'd from higher dimensional features,
    # this additional step to variance balance the dimensions typically
    # improves recall@1 by 3-5%. The benefit can be much greater depending
    # on the dataset.
    data = data - mu
    data = np.dot(data, P)

    # Create a train and test split. The test split will become
    # a set of queries for which we will compute the true nearest neighbors.
    train, test = train_test_split(data, test_size=0.2)

    # Compute distance-sorted neighbors in training set for each point in
    # test set. These will be our ground truth for recall evaluation.
    nns = compute_all_neighbors(test, train)

    # Fit model
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1)

    # Note that we didn't specify a random seed for fitting the model, so
    # different runs will be different. You may also see a warning that some
    # local projections can't be estimated because too few points fall in a
    # cluster. This is ok for the purposes of this demo, but you might want
    # to avoid this by increasing the amount of training data or decreasing
    # the number of clusters (the V hyperparameter).

    # With a model in hand, we can test its recall. We populate a
    # LOPQSearcher instance with data and get recall stats. By default, we
    # will retrieve 1000 ranked results for each query vector for recall
    # evaluation.
    searcher = LOPQSearcher(m)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        m.V, m.M, m.subquantizer_clusters, str(recall)))

    # We can experiment with other hyperparameters without discarding all
    # parameters every time. Here we train a new model that uses the same
    # coarse quantizers but a higher number of subquantizers, i.e. we
    # increase M.
    m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1)

    # Let's evaluate again.
    searcher = LOPQSearcher(m2)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        m2.V, m2.M, m2.subquantizer_clusters, str(recall)))

    # The recall is probably higher. We got better recall with a finer
    # quantization at the expense of more data required for index items.

    # We can also hold both coarse quantizers and rotations fixed and see
    # what increasing the number of subquantizer clusters does to
    # performance.
    m3 = LOPQModel(V=16, M=8, subquantizer_clusters=512,
                   parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1)

    searcher = LOPQSearcher(m3)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        m3.V, m3.M, m3.subquantizer_clusters, str(recall)))