Example #1
def test_searcher_lmdb():
    import shutil

    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl')))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    lmbd_test_path = './test_lopq_lmbd'
    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcherLMDB(m, lmbd_test_path)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Clean up
    shutil.rmtree(lmbd_test_path)

    # Test add_codes
    searcher = LOPQSearcherLMDB(m, lmbd_test_path)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)

    # Clean up
    shutil.rmtree(lmbd_test_path)
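The searcher_instance_battery helper used here (and again in Example #6) is not reproduced in this listing. A minimal sketch, assuming the nose-style assert helpers the other tests use and consistent with the explicit assertions of the plain LOPQSearcher test in Example #5:

def searcher_instance_battery(searcher, q):
    # Exercise a searcher with query vector q; the expected counts mirror
    # the assertions in Example #5 below.
    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)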
Example #2
def test_reconstruction():
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    code = ((0, 1), (0, 1, 2, 3))
    r = m.reconstruct(code)
    expected = [-2.27444688, 6.47126941, 4.5042611, 4.76683476, 0.83671082, 9.36027283, 8.11780532, 6.34846377]

    assert_true(np.allclose(expected, r))
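The code passed to reconstruct pairs the coarse cluster indices with the subquantizer indices, in the same format that predict returns. A small round-trip sketch (illustrative only; the variable names are not from the test suite):

x = np.random.rand(8)
code = m.predict(x)          # ((coarse codes), (subquantizer codes))
x_hat = m.reconstruct(code)  # approximate reconstruction of x from its code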
Example #3
def test_proto_partial():
    import os

    filename = './temp_proto_partial.lopq'
    c = (np.random.rand(8, 8), np.random.rand(8, 8))
    m = LOPQModel(parameters=(c, None, None, None))
    m.export_proto(filename)
    m2 = LOPQModel.load_proto(filename)

    assert_equal(m.V, m2.V)
    assert_equal(m.M, m2.M)
    assert_equal(m.subquantizer_clusters, m2.subquantizer_clusters)

    assert_true(np.allclose(m.Cs[0], m2.Cs[0]))
    assert_true(m.Rs == m2.Rs)
    assert_true(m.mus == m2.mus)
    assert_true(m.subquantizers == m2.subquantizers)

    os.remove(filename)
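The parameters tuple is ordered (Cs, Rs, mus, subquantizers), with None for stages that have not been trained; Example #4 below uses the same mechanism for partial fits. A hedged one-line sketch (m_seeded is an illustrative name):

# Seed a new model with the coarse quantizers of an existing model m;
# rotations, residual means and subquantizers are left to be fit later.
m_seeded = LOPQModel(V=m.V, M=m.M, parameters=(m.Cs, None, None, None))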
Example #4
def test_oxford5k():

    random_state = 40
    data = load_oxford_data()
    train, test = train_test_split(data, test_size=0.2, random_state=random_state)

    # Compute distance-sorted neighbors in training set for each point in test set
    nns = compute_all_neighbors(test, train)

    # Fit model
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1, random_state=random_state)

    # Assert correct code computation
    assert_equal(m.predict(test[0]), ((3, 2), (14, 164, 83, 49, 185, 29, 196, 250)))

    # Assert low number of empty cells
    h = get_cell_histogram(train, m)
    assert_equal(np.count_nonzero(h == 0), 6)

    # Assert true NN recall on test set
    searcher = LOPQSearcher(m)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Test partial fitting with just coarse quantizers
    m2 = LOPQModel(V=16, M=8, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1, random_state=random_state)

    searcher = LOPQSearcher(m2)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Test partial fitting with coarse quantizers and rotations
    m3 = LOPQModel(V=16, M=8, parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1, random_state=random_state)

    searcher = LOPQSearcher(m3)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))
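load_oxford_data, compute_all_neighbors, get_cell_histogram and get_recall are test-suite helpers not shown in this listing. As a rough illustration only (an assumption, not the library's implementation), the distance-sorted ground-truth neighbors could be computed by brute force:

def compute_all_neighbors(test, train):
    # Squared Euclidean distance from every test vector to every train vector,
    # then train indices sorted by increasing distance for each test vector.
    dists = ((test[:, np.newaxis, :] - train[np.newaxis, :, :]) ** 2).sum(axis=2)
    return np.argsort(dists, axis=1)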
Example #5
def test_searcher():
    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl')))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    searcher = LOPQSearcher(m)
    searcher.add_data(data)

    q = np.ones(8)

    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)
Example #6
def test_searcher():
    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl')))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcher(m)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Test add_codes
    searcher = LOPQSearcher(m)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)
Example #7
def test_mat():
    import os

    filename = './temp_mat.mat'
    m = make_random_model()
    m.export_mat(filename)
    m2 = LOPQModel.load_mat(filename)

    assert_equal(m.V, m2.V)
    assert_equal(m.M, m2.M)
    assert_equal(m.subquantizer_clusters, m2.subquantizer_clusters)

    assert_true(np.allclose(m.Cs[0], m2.Cs[0]))
    assert_true(np.allclose(m.Rs[0], m2.Rs[0]))
    assert_true(np.allclose(m.mus[0], m2.mus[0]))
    assert_true(np.allclose(m.subquantizers[0][0], m2.subquantizers[0][0]))

    os.remove(filename)
Example #8
def main(sc, args, data_load_fn=default_data_loading):

    # Load model
    model = None
    if args.model_pkl:
        model = pkl.load(open(args.model_pkl))
    elif args.model_proto:
        model = LOPQModel.load_proto(args.model_proto)

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
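main expects data_load_fn to return an RDD of (id, vector) pairs, since the code computation above maps over x[0] and x[1]. A hedged sketch of such a loader (the pickleFile storage format is an assumption):

def default_data_loading(sc, data_path, sampling_ratio, seed):
    # Load (id, vector) pairs and optionally subsample them.
    d = sc.pickleFile(data_path)
    if sampling_ratio < 1.0:
        d = d.sample(False, sampling_ratio, seed)
    return d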
Example #9
def main(sc, args, data_load_fn=default_data_loading):

    # Load model
    model = None
    if args.model_pkl:
        filename = copy_from_hdfs(args.model_pkl)
        model = pkl.load(open(filename))
        os.remove(filename)
    elif args.model_proto:
        filename = copy_from_hdfs(args.model_proto)
        model = LOPQModel.load_proto(filename)
        os.remove(filename)

    print 'LOPQModel is of type: {}'.format(type(model))

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # Deprecated. Now assume m.value.predict will apply PCA if needed
    # # Apply PCA before encoding if needed
    # if args.pca_model is not None:
    #     # Check if we should get PCA model
    #     print 'Loading PCA model from {}'.format(args.pca_model)
    #     filename = copy_from_hdfs(args.pca_model)
    #     params = pkl.load(open(filename))
    #     # TODO: we should also remove tmp dir
    #     os.remove(filename)
    #     P = params['P']
    #     mu = params['mu']
    #     print 'Applying PCA from model {}'.format(args.pca_model)
    #     # Use mapValues this time as we DO have the ids as keys
    #     d = d.mapValues(lambda x: apply_PCA(x, mu, P))

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(
        lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
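copy_from_hdfs is not shown in this listing; a minimal sketch of the behaviour the code above relies on (fetching to a local temporary file and returning its path; the hadoop fs -get call is an assumption):

import os
import subprocess
import tempfile

def copy_from_hdfs(hdfs_path):
    # Copy a file from HDFS into a fresh local temp directory and return the local path.
    # The caller removes the file but, as the TODO above notes, not the temp directory.
    tmp_dir = tempfile.mkdtemp()
    local_path = os.path.join(tmp_dir, os.path.basename(hdfs_path))
    subprocess.check_call(['hadoop', 'fs', '-get', hdfs_path, local_path])
    return local_path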
Example #10
def make_random_model():
    m = LOPQModel(V=5, M=4, subquantizer_clusters=10)
    m.fit(np.random.RandomState(42).rand(200, 8), n_init=1)
    return m
Example #11
    def train_index(self):

        if self.model_type == "lopq":
            train_np = self.get_train_features(self.nb_train,
                                               nb_min_train=self.nb_min_train)
            print("Got train features array with shape: {}".format(
                train_np.shape))
            nb_train_feats = train_np.shape[0]
            sys.stdout.flush()

            if nb_train_feats >= self.nb_train:
                from lopq.model import LOPQModel
                # we could have default values for those parameters and/or heuristic to estimate them based on data count...
                lopq_model = LOPQModel(
                    V=self.model_params['V'],
                    M=self.model_params['M'],
                    subquantizer_clusters=self.model_params['subq'])
                # we could have separate training/indexing features
                msg = "[{}.train_model: info] Starting local training of 'lopq' model with parameters {} using {} features."
                print(msg.format(self.pp, self.model_params, nb_train_feats))
                start_train = time.time()
                # specify a n_init < 10 (default value) to speed-up training?
                lopq_model.fit(train_np, verbose=True)
                # save model
                self.storer.save(self.build_model_str(), lopq_model)
                print(
                    "[{}.train_model: info] Trained lopq model in {}s.".format(
                        self.pp,
                        time.time() - start_train))
                return lopq_model
            else:
                msg = "[{}.train_model: error] Could not train model, not enough training samples."
                print(msg.format(self.pp))

        elif self.model_type == "lopq_pca":
            # lopq_pca training.
            from lopq.model import LOPQModelPCA
            # we could have default values for those parameters
            # and/or heuristic to estimate them based on data count...
            lopq_model = LOPQModelPCA(
                V=self.model_params['V'],
                M=self.model_params['M'],
                subquantizer_clusters=self.model_params['subq'],
                renorm=True)
            # pca loading/training first
            pca_model = self.storer.load(self.build_pca_model_str())
            if pca_model is None:
                train_np = self.get_train_features(
                    self.nb_train_pca, nb_min_train=self.nb_min_train_pca)
                msg = "[{}.train_model: info] Training PCA model, keeping {} dimensions from features {}."
                print(
                    msg.format(self.pp, self.model_params['pca'],
                               train_np.shape))
                sys.stdout.flush()
                start_train_pca = time.time()
                lopq_model.fit_pca(train_np, pca_dims=self.model_params['pca'])
                info_msg = "[{}.train_model: info] Trained pca model in {}s."
                print(info_msg.format(self.pp, time.time() - start_train_pca))
                del train_np
                self.storer.save(self.build_pca_model_str(), {
                    "P": lopq_model.pca_P,
                    "mu": lopq_model.pca_mu
                })
            else:
                lopq_model.pca_P = pca_model["P"]
                lopq_model.pca_mu = pca_model["mu"]
            # train model
            train_np = self.get_train_features(self.nb_train,
                                               lopq_pca_model=lopq_model,
                                               nb_min_train=self.nb_min_train)
            msg = "[{}.train_model: info] Training 'lopq_pca' model with parameters {} using features {}"
            print(msg.format(self.pp, self.model_params, train_np.shape))
            sys.stdout.flush()
            start_train = time.time()
            # specify a n_init < 10 (default value) to speed-up training?
            lopq_model.fit(train_np,
                           verbose=True,
                           apply_pca=False,
                           train_pca=False)
            # TODO: we could evaluate model based on reconstruction of some randomly sampled features?
            # save model
            self.storer.save(self.build_model_str(), lopq_model)
            info_msg = "[{}.train_model: info] Trained lopq model in {}s."
            print(info_msg.format(self.pp, time.time() - start_train))
            sys.stdout.flush()
            return lopq_model
            # err_msg = "[{}.train_model: error] Local training of 'lopq_pca' model not yet implemented."
            # raise NotImplementedError(err_msg.format(self.pp))
        else:
            err_msg = "[{}.train_model: error] Unknown 'lopq' type {}."
            raise ValueError(err_msg.format(self.pp, self.model_type))
Example #12
    parser.add_argument('--pca_model', dest='pca_model', type=str, default=None, help='hdfs path to pickle file containing PCA model to be used')
    parser.add_argument('--model_pkl', dest='model_pkl', type=str, default=None, help='hdfs path to save pickle file of resulting LOPQModel')
    parser.add_argument('--model_proto', dest='model_proto', type=str, default=None, help='hdfs path to save protobuf file of resulting model parameters')

    args = parser.parse_args()

    # Check that some output format was provided
    if args.model_pkl is None and args.model_proto is None:
        parser.error('at least one of --model_pkl and --model_proto is required')

    # Load existing model if provided
    model = None
    if args.existing_model_pkl:
        model = pkl.load(open(args.existing_model_pkl))
    elif args.existing_model_proto:
        model = LOPQModel.load_proto(args.existing_model_proto)

    args = validate_arguments(args, model)

    # Build descriptive app name
    get_step_name = lambda x: {STEP_COARSE: 'coarse', STEP_ROTATION: 'rotations', STEP_SUBQUANT: 'subquantizers'}.get(x, None)
    steps_str = ', '.join(filter(lambda x: x is not None, map(get_step_name, sorted(args.steps))))
    APP_NAME = 'LOPQ{V=%d,M=%d}; training %s' % (args.V, args.M, steps_str)

    sc = SparkContext(appName=APP_NAME)

    # Load UDF module if provided and load training data RDD
    if args.data_udf:
        sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/memex_udf.py')
        sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/deepsentibanktf_udf.py')
        udf_module = __import__(args.data_udf, fromlist=['udf'])