Example #1
 def cluster(self):
     self.pca_reduction = PCA(n_components=self.n_components)
     self.pca_reduction.fit(self.data)
     self.data = self.pca_reduction.transform(self.data)
     self.P, self.mu = self.pca()
     self.data = self.data - self.mu
     self.data = np.dot(self.data, self.P)
     train, test = train_test_split(self.data, test_size=0.2)
     self.model = LOPQModel(V=self.v,
                            M=self.m,
                            subquantizer_clusters=self.sub)
     self.model.fit(train, n_init=1)
     for i, e in enumerate(self.entries):  # avoid computing these codes again in the searcher
         r = self.model.predict(self.data[i])
         e['coarse'] = r.coarse
         e['fine'] = r.fine
         e['index'] = i
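     # LOPQSearcherLMDB is an LMDB-backed searcher that persists the index on disk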
     self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
     if self.test_mode:
         self.searcher.add_data(train)
         nns = compute_all_neighbors(test, train)
         recall, _ = get_recall(self.searcher, test, nns)
         print('Recall (V=%d, M=%d, subquants=%d): %s' % (
             self.model.V, self.model.M, self.model.subquantizer_clusters,
             str(recall)))
     else:
         self.searcher.add_data(self.data)
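
All of the snippets on this page omit their imports. A minimal set that should cover most of them, based on the yahoo/lopq package and scikit-learn, is sketched below; exact module paths vary between versions, so treat them as assumptions. load_oxford_data and pca are helpers defined in lopq's example script, the test examples also use nose.tools (assert_equal, assert_true), and external_indexed in Example #4 is project-specific.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older releases
from lopq import LOPQModel, LOPQSearcher
from lopq.search import LOPQSearcherLMDB
from lopq.eval import compute_all_neighbors, get_recall, get_cell_histogram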
Example #2
 def experiment(self, data):
     train, test = train_test_split(data, test_size=0.1)
     print(data.shape, train.shape, test.shape)
     nns = compute_all_neighbors(test, train)
     self.fit_model(train)
     self.searcher.add_data(self.transform(train))
     recall, _ = get_recall(self.searcher, self.transform(test), nns)
     print('Recall (V={}, M={}, subquants={}): {}'.format(
         self.model.V, self.model.M, self.model.subquantizer_clusters, recall))
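
Example #2 calls two helpers, fit_model and transform, that are not shown. A hypothetical sketch of what they might look like, assuming the same PCA-plus-rotation pipeline as the surrounding examples (the attributes self.P and self.mu and the hyperparameter values here are assumptions, not the original code):

 def fit_model(self, train):
     # hypothetical: fit a LOPQ model and wrap it in a searcher,
     # mirroring the other examples on this page
     self.model = LOPQModel(V=16, M=8)
     self.model.fit(train, n_init=1)
     self.searcher = LOPQSearcher(self.model)

 def transform(self, data):
     # hypothetical: apply the centering and rotation computed by pca()
     return np.dot(data - self.mu, self.P)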
Example #3
def test_oxford5k():

    random_state = 40
    data = load_oxford_data()
    train, test = train_test_split(data,
                                   test_size=0.2,
                                   random_state=random_state)

    # Compute distance-sorted neighbors in training set for each point in test set
    nns = compute_all_neighbors(test, train)

    # Fit model
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1, random_state=random_state)

    # Assert correct code computation
    assert_equal(m.predict(test[0]),
                 ((3, 2), (14, 164, 83, 49, 185, 29, 196, 250)))

    # Assert low number of empty cells
    h = get_cell_histogram(train, m)
    assert_equal(np.count_nonzero(h == 0), 6)

    # Assert true NN recall on test set
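    # get_recall returns recall at increasing retrieval cutoffs; the four
    # entries below presumably correspond to recall@1, @10, @100, and @1000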
    searcher = LOPQSearcher(m)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Test partial fitting with just coarse quantizers
    m2 = LOPQModel(V=16, M=8, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1, random_state=random_state)

    searcher = LOPQSearcher(m2)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Test partial fitting with coarse quantizers and rotations
    m3 = LOPQModel(V=16, M=8, parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1, random_state=random_state)

    searcher = LOPQSearcher(m3)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))
Example #4
def main(input_dir='/Users/aub3/temptest/gtin/',
         output_dir="/Users/aub3/temptest/products"):
    products = external_indexed.ProductsIndex(path=output_dir)
    # products.prepare(input_dir)
    products.build_approximate()
    data = products.data
    # data = load_oxford_data()
    print(data.shape)
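    # Reduce dimensionality with PCA, then center and rotate using the
    # variance-balancing pca() helper (sketched after this example)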
    pca_reduction = PCA(n_components=32)
    pca_reduction.fit(data)
    data = pca_reduction.transform(data)
    print(data.shape)
    P, mu = pca(data)
    data = data - mu
    data = np.dot(data, P)
    train, test = train_test_split(data, test_size=0.2)
    print(train.shape, test.shape)
    nns = compute_all_neighbors(test, train)
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1)
    print "fitted"
    searcher = LOPQSearcher(m)
    print "adding data"
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        m.V, m.M, m.subquantizer_clusters, str(recall)))
    m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1)
    searcher = LOPQSearcher(m2)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        m2.V, m2.M, m2.subquantizer_clusters, str(recall)))
    m3 = LOPQModel(V=16,
                   M=8,
                   subquantizer_clusters=512,
                   parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1)
    searcher = LOPQSearcher(m3)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        m3.V, m3.M, m3.subquantizer_clusters, str(recall)))
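
Several examples call a free function pca(data), or a self.pca() method, that no snippet defines. A sketch of what it plausibly computes, modeled on the helper in lopq's example script: eigendecompose the data covariance and use lopq's eigenvalue_allocation to permute dimensions so that variance is balanced across the coarse subspaces. The module path and details are assumptions.

import numpy as np
from lopq.model import eigenvalue_allocation  # assumed location

def pca(data):
    # Mean and covariance of the data
    mu = data.mean(axis=0)
    A = np.cov(data, rowvar=False)
    # Eigendecomposition of the symmetric covariance matrix
    eigenvalues, P = np.linalg.eigh(A)
    # Permute dimensions so total variance is balanced across the two
    # halves handled by the coarse quantizers
    permuted_inds = eigenvalue_allocation(2, eigenvalues)
    return P[:, permuted_inds], mu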
Example #5
 def cluster(self):
     print(self.data.shape)
     pca_reduction = PCA(n_components=32)
     pca_reduction.fit(self.data)
     self.data = pca_reduction.transform(self.data)
     print(self.data.shape)
     P, mu = self.pca()
     self.data = self.data - mu
     self.data = np.dot(self.data, P)
     train, test = train_test_split(self.data, test_size=0.2)
     print(train.shape, test.shape)
     nns = compute_all_neighbors(test, train)
     m = LOPQModel(V=16, M=8)
     m.fit(train, n_init=1)
     print "fitted"
     searcher = LOPQSearcher(m)
     print "adding data"
     searcher.add_data(train)
     recall, _ = get_recall(searcher, test, nns)
     print('Recall (V=%d, M=%d, subquants=%d): %s' % (
         m.V, m.M, m.subquantizer_clusters, str(recall)))
     m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
     m2.fit(train, n_init=1)
     searcher = LOPQSearcher(m2)
     searcher.add_data(train)
     recall, _ = get_recall(searcher, test, nns)
     print('Recall (V=%d, M=%d, subquants=%d): %s' % (
         m2.V, m2.M, m2.subquantizer_clusters, str(recall)))
     m3 = LOPQModel(V=16,
                    M=8,
                    subquantizer_clusters=512,
                    parameters=(m.Cs, m.Rs, m.mus, None))
     m3.fit(train, n_init=1)
     searcher = LOPQSearcher(m3)
     searcher.add_data(train)
     recall, _ = get_recall(searcher, test, nns)
     print('Recall (V=%d, M=%d, subquants=%d): %s' % (
         m3.V, m3.M, m3.subquantizer_clusters, str(recall)))
Example #6
def main():
    """
    A brief demo script showing how to train various LOPQ models with brief
    discussion of trade offs.
    """

    # Get the oxford dataset
    data = load_oxford_data()

    # Compute PCA of oxford dataset. See README in data/oxford for details
    # about this dataset.
    P, mu = pca(data)

    # Mean center and rotate the data; includes dimension permutation.
    # It is worthwhile to see how this affects recall performance. On this
    # dataset, which is already PCA'd from higher dimensional features,
    # this additional step to variance-balance the dimensions typically
    # improves recall@1 by 3-5%. The benefit can be much greater depending
    # on the dataset.
    data = data - mu
    data = np.dot(data, P)

    # Create a train and test split. The test split will become
    # a set of queries for which we will compute the true nearest neighbors.
    train, test = train_test_split(data, test_size=0.2)

    # Compute distance-sorted neighbors in training set for each point in test set.
    # These will be our groundtruth for recall evaluation.
    nns = compute_all_neighbors(test, train)

    # Fit model
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1)

    # Note that we didn't specify a random seed for fitting the model, so different
    # runs will be different. You may also see a warning that some local projections
    # can't be estimated because too few points fall in a cluster. This is ok for the
    # purposes of this demo, but you might want to avoid this by increasing the amount
    # of training data or decreasing the number of clusters (the V hyperparameter).

    # With a model in hand, we can test its recall. We populate a LOPQSearcher
    # instance with data and get recall stats. By default, we will retrieve 1000
    # ranked results for each query vector for recall evaluation.
    searcher = LOPQSearcher(m)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' %
          (m.V, m.M, m.subquantizer_clusters, str(recall)))

    # We can experiment with other hyperparameters without discarding all
    # parameters every time. Here we train a new model that uses the same coarse
    # quantizers but a higher number of subquantizers, i.e. we increase M.
    m2 = LOPQModel(V=16, M=16, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1)

    # Let's evaluate again.
    searcher = LOPQSearcher(m2)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' %
          (m2.V, m2.M, m2.subquantizer_clusters, str(recall)))

    # The recall is probably higher. We got better recall with a finer quantization
    # at the expense of more data required for index items.

    # We can also hold both coarse quantizers and rotations fixed and see what
    # increasing the number of subquantizer clusters does to performance.
    m3 = LOPQModel(V=16,
                   M=8,
                   subquantizer_clusters=512,
                   parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1)

    searcher = LOPQSearcher(m3)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    print('Recall (V=%d, M=%d, subquants=%d): %s' %
          (m3.V, m3.M, m3.subquantizer_clusters, str(recall)))
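
None of the examples above actually issue a query. Following the usage pattern in the lopq README (the exact return shape of search() may differ between versions):

# Compute the LOPQ codes for a single vector
code = m.predict(test[0])

# Retrieve ranked approximate nearest neighbors for a query vector;
# quota controls how many index items are retrieved before ranking
results = searcher.search(test[0], quota=100)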