Esempio n. 1
0
def test_oxford5k():
    """End-to-end regression test of LOPQ fitting and search on the Oxford data.

    Fits a full model, checks one predicted code and the number of empty
    cells against pinned values, then verifies search recall for the full
    model and for two models that are fitted starting from a subset of the
    first model's parameters.
    """
    seed = 40
    # Lower bounds that every entry of the measured recall must exceed.
    min_recall = [0.51, 0.92, 0.97, 0.97]

    train_set, test_set = train_test_split(load_oxford_data(),
                                           test_size=0.2,
                                           random_state=seed)

    # Distance-sorted neighbors in the training set for each test point
    true_neighbors = compute_all_neighbors(test_set, train_set)

    def check_recall(model):
        # Index the training set with `model` and compare recall on the
        # test set against the pinned lower bounds.
        index = LOPQSearcher(model)
        index.add_data(train_set)
        recall, _ignored = get_recall(index, test_set, true_neighbors)
        assert_true(np.all(recall > min_recall))

    # Fit the full model
    full_model = LOPQModel(V=16, M=8)
    full_model.fit(train_set, n_init=1, random_state=seed)

    # The code computed for the first test point must match the pinned value
    expected_code = ((3, 2), (14, 164, 83, 49, 185, 29, 196, 250))
    assert_equal(full_model.predict(test_set[0]), expected_code)

    # The number of empty cells must match the pinned value
    histogram = get_cell_histogram(train_set, full_model)
    assert_equal(np.count_nonzero(histogram == 0), 6)

    # True nearest-neighbor recall of the full model
    check_recall(full_model)

    # Partial fitting: only the coarse quantizers are supplied
    coarse_only = LOPQModel(V=16, M=8,
                            parameters=(full_model.Cs, None, None, None))
    coarse_only.fit(train_set, n_init=1, random_state=seed)
    check_recall(coarse_only)

    # Partial fitting: coarse quantizers and rotations are supplied
    coarse_and_rotations = LOPQModel(
        V=16, M=8,
        parameters=(full_model.Cs, full_model.Rs, full_model.mus, None))
    coarse_and_rotations.fit(train_set, n_init=1, random_state=seed)
    check_recall(coarse_and_rotations)
Esempio n. 2
0
def test_proto_partial():
    """Round-trip a partially specified model through the protobuf format.

    A model is built from coarse quantizers only (the remaining three
    parameters are passed as ``None``), exported to a temporary file and
    loaded back. The loaded model must agree with the original on every
    compared attribute.
    """
    import os

    filename = './temp_proto_partial.lopq'
    c = (np.random.rand(8, 8), np.random.rand(8, 8))
    m = LOPQModel(parameters=(c, None, None, None))
    m.export_proto(filename)

    # Remove the temporary file even when an assertion below fails, so a
    # failing run does not leave stale files in the working directory.
    try:
        m2 = LOPQModel.load_proto(filename)

        assert_equal(m.V, m2.V)
        assert_equal(m.M, m2.M)
        assert_equal(m.subquantizer_clusters, m2.subquantizer_clusters)

        assert_true(np.allclose(m.Cs[0], m2.Cs[0]))
        assert_true(m.Rs == m2.Rs)
        assert_true(m.mus == m2.mus)
        # Compare against the reloaded model; comparing `m` with itself
        # would always pass and verify nothing.
        assert_true(m.subquantizers == m2.subquantizers)
    finally:
        os.remove(filename)
    if STEP_COARSE in args.steps:
        Cs = train_coarse(sc, data, args.V, seed=args.seed)
    else:
        Cs = model.Cs

    # Get rotations
    if STEP_ROTATION in args.steps:
        Rs, mus, counts = train_rotations(sc, data, args.M, Cs)
    else:
        Rs = model.Rs
        mus = model.mus

    # Get subquantizers
    if STEP_SUBQUANT in args.steps:
        model = LOPQModel(V=args.V,
                          M=args.M,
                          subquantizer_clusters=args.subquantizer_clusters,
                          parameters=(Cs, Rs, mus, None))

        if args.subquantizer_sampling_ratio != 1.0:
            data = data.sample(False, args.subquantizer_sampling_ratio,
                               args.seed)

        subs = train_subquantizers(sc,
                                   data,
                                   args.M,
                                   args.subquantizer_clusters,
                                   model,
                                   seed=args.seed)

    # Final output model
    model = LOPQModel(V=args.V,
    def train_index(self):
        """Train the model selected by ``self.model_type``, store and return it.

        For ``"lopq"`` a ``LOPQModel`` is fitted on the training features,
        but only when at least ``self.nb_train`` features were obtained;
        otherwise an error message is printed and ``None`` is returned.

        For ``"lopq_pca"`` a ``LOPQModelPCA`` is used. Its PCA parameters are
        loaded from ``self.storer`` when available, or fitted and saved
        first; the model is then fitted with PCA fitting/application
        switched off in the ``fit`` call.

        Returns:
            The fitted model, or ``None`` in the "not enough samples" case.

        Raises:
            ValueError: if ``self.model_type`` is neither ``"lopq"`` nor
                ``"lopq_pca"``.
        """
        kind = self.model_type

        if kind == "lopq":
            features = self.get_train_features(
                self.nb_train, nb_min_train=self.nb_min_train)
            shape_msg = "Got train features array with shape: {}"
            print(shape_msg.format(features.shape))
            feature_count = features.shape[0]
            sys.stdout.flush()

            if feature_count < self.nb_train:
                error_tpl = "[{}.train_model: error] Could not train model, not enough training samples."
                print(error_tpl.format(self.pp))
                return None

            # Imported lazily so the dependency is only needed when training.
            from lopq.model import LOPQModel
            # NOTE: defaults or a data-count heuristic could replace these
            # explicitly configured parameters.
            model = LOPQModel(V=self.model_params['V'],
                              M=self.model_params['M'],
                              subquantizer_clusters=self.model_params['subq'])
            start_tpl = "[{}.train_model: info] Starting local training of 'lopq' model with parameters {} using {} features."
            print(start_tpl.format(self.pp, self.model_params, feature_count))
            fit_started = time.time()
            # NOTE: passing n_init < 10 (the default) would speed up training.
            model.fit(features, verbose=True)
            # Persist the trained model
            self.storer.save(self.build_model_str(), model)
            done_tpl = "[{}.train_model: info] Trained lopq model in {}s."
            print(done_tpl.format(self.pp, time.time() - fit_started))
            return model

        if kind == "lopq_pca":
            # Imported lazily so the dependency is only needed when training.
            from lopq.model import LOPQModelPCA
            # NOTE: defaults or a data-count heuristic could replace these
            # explicitly configured parameters.
            model = LOPQModelPCA(V=self.model_params['V'],
                                 M=self.model_params['M'],
                                 subquantizer_clusters=self.model_params['subq'],
                                 renorm=True)

            # Step 1: obtain the PCA parameters, from storage if possible.
            stored_pca = self.storer.load(self.build_pca_model_str())
            if stored_pca is not None:
                model.pca_P = stored_pca["P"]
                model.pca_mu = stored_pca["mu"]
            else:
                pca_features = self.get_train_features(
                    self.nb_train_pca, nb_min_train=self.nb_min_train_pca)
                pca_start_tpl = "[{}.train_model: info] Training PCA model, keeping {} dimensions from features {}."
                print(pca_start_tpl.format(self.pp,
                                           self.model_params['pca'],
                                           pca_features.shape))
                sys.stdout.flush()
                pca_started = time.time()
                model.fit_pca(pca_features, pca_dims=self.model_params['pca'])
                pca_done_tpl = "[{}.train_model: info] Trained pca model in {}s."
                print(pca_done_tpl.format(self.pp, time.time() - pca_started))
                # Drop the reference before the next batch of features is
                # fetched.
                del pca_features
                pca_payload_key = self.build_pca_model_str()
                self.storer.save(pca_payload_key,
                                 {"P": model.pca_P, "mu": model.pca_mu})

            # Step 2: train the model itself.
            features = self.get_train_features(self.nb_train,
                                               lopq_pca_model=model,
                                               nb_min_train=self.nb_min_train)
            start_tpl = "[{}.train_model: info] Training 'lopq_pca' model with parameters {} using features {}"
            print(start_tpl.format(self.pp, self.model_params, features.shape))
            sys.stdout.flush()
            fit_started = time.time()
            # NOTE: passing n_init < 10 (the default) would speed up training.
            model.fit(features, verbose=True, apply_pca=False, train_pca=False)
            # TODO: the model could be evaluated by reconstructing some
            # randomly sampled features.
            self.storer.save(self.build_model_str(), model)
            done_tpl = "[{}.train_model: info] Trained lopq model in {}s."
            print(done_tpl.format(self.pp, time.time() - fit_started))
            sys.stdout.flush()
            return model

        unknown_tpl = "[{}.train_model: error] Unknown 'lopq' type {}."
        raise ValueError(unknown_tpl.format(self.pp, kind))
Esempio n. 5
0
def make_random_model():
    """Return a small LOPQModel fitted on seeded random data.

    The training data is a 200 x 8 array drawn from
    ``np.random.RandomState(42)``, so the data itself is reproducible.
    """
    model = LOPQModel(V=5, M=4, subquantizer_clusters=10)
    rng = np.random.RandomState(42)
    samples = rng.rand(200, 8)
    model.fit(samples, n_init=1)
    return model