def test_searcher_lmdb():
    import shutil

    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl')))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    lmdb_test_path = './test_lopq_lmdb'
    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcherLMDB(m, lmdb_test_path)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Clean up
    shutil.rmtree(lmdb_test_path)

    # Test add_codes
    searcher = LOPQSearcherLMDB(m, lmdb_test_path)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)

    # Clean up
    shutil.rmtree(lmdb_test_path)
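# The LMDB test above drives the searcher through a searcher_instance_battery
# helper that is not shown in this section. A minimal sketch, assuming the
# same result counts asserted inline by the test_searcher variant further
# below (the expected values depend on the pickled test fixtures):
def searcher_instance_battery(searcher, q):
    # With the default quota, 12 items are retrieved from 3 cells
    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    # A larger quota forces more cells to be visited
    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)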
def test_reconstruction():
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    code = ((0, 1), (0, 1, 2, 3))
    r = m.reconstruct(code)
    expected = [-2.27444688, 6.47126941, 4.5042611, 4.76683476,
                0.83671082, 9.36027283, 8.11780532, 6.34846377]
    assert_true(np.allclose(expected, r))
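# These tests resolve fixture paths with a relpath helper that is not defined
# in this section. A minimal sketch, assuming the fixtures live alongside the
# test module:
import os

def relpath(path):
    # Resolve a path relative to this file rather than the working directory
    return os.path.abspath(os.path.join(os.path.dirname(__file__), path))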
def test_proto_partial():
    import os

    filename = './temp_proto_partial.lopq'
    c = (np.random.rand(8, 8), np.random.rand(8, 8))
    m = LOPQModel(parameters=(c, None, None, None))
    m.export_proto(filename)
    m2 = LOPQModel.load_proto(filename)

    assert_equal(m.V, m2.V)
    assert_equal(m.M, m2.M)
    assert_equal(m.subquantizer_clusters, m2.subquantizer_clusters)

    assert_true(np.allclose(m.Cs[0], m2.Cs[0]))
    assert_true(m.Rs == m2.Rs)
    assert_true(m.mus == m2.mus)
    assert_true(m.subquantizers == m2.subquantizers)

    os.remove(filename)
def test_oxford5k():
    random_state = 40
    data = load_oxford_data()
    train, test = train_test_split(data, test_size=0.2, random_state=random_state)

    # Compute distance-sorted neighbors in training set for each point in test set
    nns = compute_all_neighbors(test, train)

    # Fit model
    m = LOPQModel(V=16, M=8)
    m.fit(train, n_init=1, random_state=random_state)

    # Assert correct code computation
    assert_equal(m.predict(test[0]), ((3, 2), (14, 164, 83, 49, 185, 29, 196, 250)))

    # Assert low number of empty cells
    h = get_cell_histogram(train, m)
    assert_equal(np.count_nonzero(h == 0), 6)

    # Assert true NN recall on test set
    searcher = LOPQSearcher(m)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Test partial fitting with just coarse quantizers
    m2 = LOPQModel(V=16, M=8, parameters=(m.Cs, None, None, None))
    m2.fit(train, n_init=1, random_state=random_state)

    searcher = LOPQSearcher(m2)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))

    # Test partial fitting with coarse quantizers and rotations
    m3 = LOPQModel(V=16, M=8, parameters=(m.Cs, m.Rs, m.mus, None))
    m3.fit(train, n_init=1, random_state=random_state)

    searcher = LOPQSearcher(m3)
    searcher.add_data(train)
    recall, _ = get_recall(searcher, test, nns)
    assert_true(np.all(recall > [0.51, 0.92, 0.97, 0.97]))
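# test_oxford5k relies on helpers (load_oxford_data, compute_all_neighbors,
# get_cell_histogram, get_recall) defined elsewhere. For illustration, a
# plausible sketch of get_cell_histogram, assuming LOPQModel.predict_coarse
# returns a pair of coarse cluster indices (the flat cell-id convention below
# is an assumption):
def get_cell_histogram(data, model):
    # Map each point to a flat id over the V * V coarse cells and count
    # occupancy; empty cells show up as zero bins
    cells = [model.predict_coarse(x) for x in data]
    cell_ids = [c0 * model.V + c1 for c0, c1 in cells]
    return np.bincount(cell_ids, minlength=model.V * model.V)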
def test_searcher():
    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl')))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    searcher = LOPQSearcher(m)
    searcher.add_data(data)

    q = np.ones(8)

    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)
def test_searcher():
    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl')))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcher(m)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Test add_codes
    searcher = LOPQSearcher(m)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)
def test_mat():
    import os

    filename = './temp_mat.mat'
    m = make_random_model()
    m.export_mat(filename)
    m2 = LOPQModel.load_mat(filename)

    assert_equal(m.V, m2.V)
    assert_equal(m.M, m2.M)
    assert_equal(m.subquantizer_clusters, m2.subquantizer_clusters)

    assert_true(np.allclose(m.Cs[0], m2.Cs[0]))
    assert_true(np.allclose(m.Rs[0], m2.Rs[0]))
    assert_true(np.allclose(m.mus[0], m2.mus[0]))
    assert_true(np.allclose(m.subquantizers[0][0], m2.subquantizers[0][0]))

    os.remove(filename)
def main(sc, args, data_load_fn=default_data_loading):

    # Load model
    model = None
    if args.model_pkl:
        model = pkl.load(open(args.model_pkl))
    elif args.model_proto:
        model = LOPQModel.load_proto(args.model_proto)

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
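# main takes a data_load_fn whose default, default_data_loading, is not shown
# in this section. A minimal sketch, assuming the input is a text file of
# tab-separated (id, base64-encoded pickled numpy vector) records (the record
# format is an assumption):
import base64

def default_data_loading(sc, data_path, sampling_ratio, seed):
    # Read the raw text records, optionally downsampling by ratio/seed
    d = sc.textFile(data_path)
    if sampling_ratio < 1.0:
        d = d.sample(False, sampling_ratio, seed)

    def parse(line):
        key, blob = line.split('\t')
        return key, pkl.loads(base64.decodestring(blob))

    return d.map(parse)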
def main(sc, args, data_load_fn=default_data_loading):

    # Load model
    model = None
    if args.model_pkl:
        filename = copy_from_hdfs(args.model_pkl)
        model = pkl.load(open(filename))
        os.remove(filename)
    elif args.model_proto:
        filename = copy_from_hdfs(args.model_proto)
        model = LOPQModel.load_proto(filename)
        os.remove(filename)

    print 'LOPQModel is of type: {}'.format(type(model))

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # Deprecated. Now assume m.value.predict will apply PCA if needed
    # # Apply PCA before encoding if needed
    # if args.pca_model is not None:
    #     # Check if we should get PCA model
    #     print 'Loading PCA model from {}'.format(args.pca_model)
    #     filename = copy_from_hdfs(args.pca_model)
    #     params = pkl.load(open(filename))
    #     # TODO: we should also remove tmp dir
    #     os.remove(filename)
    #     P = params['P']
    #     mu = params['mu']
    #     print 'Applying PCA from model {}'.format(args.pca_model)
    #     # Use mapValues this time as we DO have the ids as keys
    #     d = d.mapValues(lambda x: apply_PCA(x, mu, P))

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
def make_random_model():
    m = LOPQModel(V=5, M=4, subquantizer_clusters=10)
    m.fit(np.random.RandomState(42).rand(200, 8), n_init=1)
    return m
def train_index(self):
    if self.model_type == "lopq":
        train_np = self.get_train_features(self.nb_train, nb_min_train=self.nb_min_train)
        print("Got train features array with shape: {}".format(train_np.shape))
        nb_train_feats = train_np.shape[0]
        sys.stdout.flush()

        if nb_train_feats >= self.nb_train:
            from lopq.model import LOPQModel
            # We could have default values for these parameters and/or a
            # heuristic to estimate them based on data count...
            lopq_model = LOPQModel(V=self.model_params['V'],
                                   M=self.model_params['M'],
                                   subquantizer_clusters=self.model_params['subq'])
            # We could have separate training/indexing features
            msg = "[{}.train_model: info] Starting local training of 'lopq' model with parameters {} using {} features."
            print(msg.format(self.pp, self.model_params, nb_train_feats))
            start_train = time.time()
            # Specify an n_init < 10 (the default) to speed up training?
            lopq_model.fit(train_np, verbose=True)
            # Save model
            self.storer.save(self.build_model_str(), lopq_model)
            print("[{}.train_model: info] Trained lopq model in {}s.".format(self.pp, time.time() - start_train))
            return lopq_model
        else:
            msg = "[{}.train_model: error] Could not train model, not enough training samples."
            print(msg.format(self.pp))
    elif self.model_type == "lopq_pca":
        # lopq_pca training.
        from lopq.model import LOPQModelPCA
        # We could have default values for these parameters and/or a
        # heuristic to estimate them based on data count...
        lopq_model = LOPQModelPCA(V=self.model_params['V'],
                                  M=self.model_params['M'],
                                  subquantizer_clusters=self.model_params['subq'],
                                  renorm=True)
        # PCA loading/training first
        pca_model = self.storer.load(self.build_pca_model_str())
        if pca_model is None:
            train_np = self.get_train_features(self.nb_train_pca, nb_min_train=self.nb_min_train_pca)
            msg = "[{}.train_model: info] Training PCA model, keeping {} dimensions from features {}."
            print(msg.format(self.pp, self.model_params['pca'], train_np.shape))
            sys.stdout.flush()
            start_train_pca = time.time()
            lopq_model.fit_pca(train_np, pca_dims=self.model_params['pca'])
            info_msg = "[{}.train_model: info] Trained pca model in {}s."
            print(info_msg.format(self.pp, time.time() - start_train_pca))
            del train_np
            self.storer.save(self.build_pca_model_str(),
                             {"P": lopq_model.pca_P, "mu": lopq_model.pca_mu})
        else:
            lopq_model.pca_P = pca_model["P"]
            lopq_model.pca_mu = pca_model["mu"]

        # Train model
        train_np = self.get_train_features(self.nb_train, lopq_pca_model=lopq_model,
                                           nb_min_train=self.nb_min_train)
        msg = "[{}.train_model: info] Training 'lopq_pca' model with parameters {} using features {}"
        print(msg.format(self.pp, self.model_params, train_np.shape))
        sys.stdout.flush()
        start_train = time.time()
        # Specify an n_init < 10 (the default) to speed up training?
        lopq_model.fit(train_np, verbose=True, apply_pca=False, train_pca=False)
        # TODO: we could evaluate the model based on reconstruction of some randomly sampled features?
        # Save model
        self.storer.save(self.build_model_str(), lopq_model)
        info_msg = "[{}.train_model: info] Trained lopq model in {}s."
        print(info_msg.format(self.pp, time.time() - start_train))
        sys.stdout.flush()
        return lopq_model
        # err_msg = "[{}.train_model: error] Local training of 'lopq_pca' model not yet implemented."
        # raise NotImplementedError(err_msg.format(self.pp))
    else:
        err_msg = "[{}.train_model: error] Unknown 'lopq' type {}."
        raise ValueError(err_msg.format(self.pp, self.model_type))
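# For context: once train_index returns, the saved model can serve
# nearest-neighbor queries. A hypothetical usage sketch, where lopq_model is
# the value returned above and index_features / query_vector are illustrative
# numpy arrays not defined in this code:
from lopq.search import LOPQSearcher

searcher = LOPQSearcher(lopq_model)
searcher.add_data(index_features)
results, visited = searcher.search(query_vector, quota=10)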
parser.add_argument('--pca_model', dest='pca_model', type=str, default=None,
                    help='hdfs path to pickle file containing PCA model to be used')
parser.add_argument('--model_pkl', dest='model_pkl', type=str, default=None,
                    help='hdfs path to save pickle file of resulting LOPQModel')
parser.add_argument('--model_proto', dest='model_proto', type=str, default=None,
                    help='hdfs path to save protobuf file of resulting model parameters')

args = parser.parse_args()

# Check that some output format was provided
if args.model_pkl is None and args.model_proto is None:
    parser.error('at least one of --model_pkl and --model_proto is required')

# Load existing model if provided
model = None
if args.existing_model_pkl:
    model = pkl.load(open(args.existing_model_pkl))
elif args.existing_model_proto:
    model = LOPQModel.load_proto(args.existing_model_proto)

args = validate_arguments(args, model)

# Build descriptive app name
get_step_name = lambda x: {STEP_COARSE: 'coarse',
                           STEP_ROTATION: 'rotations',
                           STEP_SUBQUANT: 'subquantizers'}.get(x, None)
steps_str = ', '.join(filter(lambda x: x is not None, map(get_step_name, sorted(args.steps))))
APP_NAME = 'LOPQ{V=%d,M=%d}; training %s' % (args.V, args.M, steps_str)
sc = SparkContext(appName=APP_NAME)

# Load UDF module if provided and load training data RDD
if args.data_udf:
    sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/memex_udf.py')
    sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/deepsentibanktf_udf.py')
    udf_module = __import__(args.data_udf, fromlist=['udf'])
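# The __import__(args.data_udf, fromlist=['udf']) call above expects the
# shipped module to expose a callable named udf. A minimal sketch of such a
# module, assuming it follows the same (sc, data_path, sampling_ratio, seed)
# contract as the default_data_loading sketch shown earlier (an assumption;
# the real memex_udf / deepsentibanktf_udf modules are not shown here):
# my_udf.py -- hypothetical data-loading UDF module
import numpy as np

def udf(sc, data_path, sampling_ratio, seed):
    # Return an RDD of (id, numpy vector) pairs; records here are assumed
    # to be tab-separated "id<TAB>comma-separated floats" lines
    d = sc.textFile(data_path)
    if sampling_ratio < 1.0:
        d = d.sample(False, sampling_ratio, seed)

    def parse(line):
        key, vec = line.split('\t')
        return key, np.fromstring(vec, sep=',')

    return d.map(parse)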