def test_searcher_lmdb():
    """Run the searcher battery against ``LOPQSearcherLMDB``.

    The searcher is populated twice -- once through ``add_data`` and once
    through ``add_codes`` -- and ``searcher_instance_battery`` is run on each.
    The directory at ``lmbd_test_path`` is removed after each pass, including
    when the battery raises, so a failed run does not leave state behind for
    the next one.
    """
    import os
    import shutil

    # Use a context manager so the fixture file handle is closed
    # deterministically; pickles should be read in binary mode.
    with open(relpath('./testdata/test_searcher_data.pkl'), 'rb') as f:
        data = pkl.load(f)
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    lmbd_test_path = './test_lopq_lmbd'
    q = np.ones(8)

    def _remove_index():
        # Guarded so that a failure before the directory exists is not
        # masked by an error from rmtree.
        if os.path.exists(lmbd_test_path):
            shutil.rmtree(lmbd_test_path)

    # Test add_data
    try:
        searcher = LOPQSearcherLMDB(m, lmbd_test_path)
        searcher.add_data(data)
        searcher_instance_battery(searcher, q)
    finally:
        # Clean up
        _remove_index()

    # Test add_codes
    try:
        searcher = LOPQSearcherLMDB(m, lmbd_test_path)
        codes = [m.predict(x) for x in data]
        searcher.add_codes(codes)
        searcher_instance_battery(searcher, q)
    finally:
        # Clean up
        _remove_index()
def test_reconstruction():
    """Check ``reconstruct`` on a fixed code against stored reference values."""
    model = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    # Reference values for the bundled random test model.
    reference = [-2.27444688, 6.47126941, 4.5042611, 4.76683476,
                 0.83671082, 9.36027283, 8.11780532, 6.34846377]

    fixed_code = ((0, 1), (0, 1, 2, 3))
    reconstructed = model.reconstruct(fixed_code)

    # Compare with a floating-point tolerance rather than exact equality.
    assert_true(np.allclose(reference, reconstructed))
def test_reconstruction():
    """Reconstruct a known code and compare it with the expected vector."""
    model_path = relpath('./testdata/random_test_model.lopq')
    loaded = LOPQModel.load_proto(model_path)

    known_code = ((0, 1), (0, 1, 2, 3))
    actual = loaded.reconstruct(known_code)

    # Expected reconstruction for the bundled random test model.
    wanted = [
        -2.27444688,
        6.47126941,
        4.5042611,
        4.76683476,
        0.83671082,
        9.36027283,
        8.11780532,
        6.34846377,
    ]
    is_close = np.allclose(wanted, actual)
    assert_true(is_close)
def test_searcher():
    """Run the searcher battery against ``LOPQSearcher``.

    The searcher is populated once through ``add_data`` and once through
    ``add_codes`` (codes produced by ``m.predict``); the same battery is run
    on both.
    """
    # Use a context manager so the fixture file handle is closed
    # deterministically; pickles should be read in binary mode.
    with open(relpath('./testdata/test_searcher_data.pkl'), 'rb') as f:
        data = pkl.load(f)
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcher(m)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Test add_codes
    searcher = LOPQSearcher(m)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)
def test_searcher():
    """Check ``get_result_quota`` for the default quota and ``quota=20``.

    The expected counts are specific to the bundled test data and model.
    """
    # Use a context manager so the fixture file handle is closed
    # deterministically; pickles should be read in binary mode.
    with open(relpath('./testdata/test_searcher_data.pkl'), 'rb') as f:
        data = pkl.load(f)
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    searcher = LOPQSearcher(m)
    searcher.add_data(data)

    q = np.ones(8)

    # Default quota
    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    # Explicit quota; the expected result holds more items (28) than the
    # quota requested (20).
    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)
def test_searcher():
    """Check ``get_result_quota`` for the default quota and ``quota=20``.

    The expected counts are specific to the bundled test data and model.
    """
    # Use a context manager so the fixture file handle is closed
    # deterministically; pickles should be read in binary mode.
    with open(relpath("./testdata/test_searcher_data.pkl"), "rb") as f:
        data = pkl.load(f)
    m = LOPQModel.load_proto(relpath("./testdata/random_test_model.lopq"))

    searcher = LOPQSearcher(m)
    searcher.add_data(data)

    q = np.ones(8)

    # Default quota
    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    # Explicit quota; the expected result holds more items (28) than the
    # quota requested (20).
    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)
def test_proto():
    """Round-trip a random model through ``export_proto`` / ``load_proto``.

    The reloaded model must agree with the original on ``V``, ``M``,
    ``subquantizer_clusters`` and on the first entry of ``Cs``, ``Rs``,
    ``mus`` and ``subquantizers``. The temporary file is removed even when an
    assertion fails.
    """
    import os

    filename = './temp_proto.lopq'
    m = make_random_model()

    try:
        m.export_proto(filename)
        m2 = LOPQModel.load_proto(filename)

        assert_equal(m.V, m2.V)
        assert_equal(m.M, m2.M)
        assert_equal(m.subquantizer_clusters, m2.subquantizer_clusters)

        assert_true(np.allclose(m.Cs[0], m2.Cs[0]))
        assert_true(np.allclose(m.Rs[0], m2.Rs[0]))
        assert_true(np.allclose(m.mus[0], m2.mus[0]))
        # Compare against the reloaded model (m2); comparing m with itself
        # would always pass and verify nothing about the round trip.
        assert_true(np.allclose(m.subquantizers[0][0], m2.subquantizers[0][0]))
    finally:
        # Guarded so that a failed export is not masked by a remove error.
        if os.path.exists(filename):
            os.remove(filename)
def main(sc, args, data_load_fn=default_data_loading):
    """Encode a dataset with a stored model and save the codes as text.

    Args:
        sc: context object; passed to ``data_load_fn`` and used to broadcast
            the model via ``sc.broadcast``.
        args: parsed arguments. Reads ``model_pkl``, ``model_proto``,
            ``data``, ``sampling_ratio``, ``seed`` and ``output``.
            ``model_pkl`` takes precedence over ``model_proto``; if neither
            is set, ``None`` is broadcast as the model.
        data_load_fn: callable invoked as
            ``data_load_fn(sc, args.data, args.sampling_ratio, args.seed)``.
            Its result must support ``map``; each record is indexed as
            ``x[0]`` (kept as the key) and ``x[1]`` (passed to ``predict``).

    Each output line has the form ``<x[0]>\\t<json of predicted code>``.
    """
    # Load model
    model = None
    if args.model_pkl:
        # NOTE: unpickling can execute arbitrary code -- only load model
        # files from a trusted source. The context manager closes the handle
        # deterministically instead of leaking it.
        with open(args.model_pkl, 'rb') as f:
            model = pkl.load(f)
    elif args.model_proto:
        model = LOPQModel.load_proto(args.model_proto)

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
def test_proto_partial():
    """Round-trip a partially specified model through the proto format.

    The model is built with only the first parameter (a pair of random 8x8
    arrays) and ``None`` for the remaining three. After
    ``export_proto`` / ``load_proto`` the reloaded model must agree with the
    original on every compared attribute. The temporary file is removed even
    when an assertion fails.
    """
    import os

    filename = './temp_proto_partial.lopq'
    c = (np.random.rand(8, 8), np.random.rand(8, 8))
    m = LOPQModel(parameters=(c, None, None, None))

    try:
        m.export_proto(filename)
        m2 = LOPQModel.load_proto(filename)

        assert_equal(m.V, m2.V)
        assert_equal(m.M, m2.M)
        assert_equal(m.subquantizer_clusters, m2.subquantizer_clusters)

        assert_true(np.allclose(m.Cs[0], m2.Cs[0]))
        assert_true(m.Rs == m2.Rs)
        assert_true(m.mus == m2.mus)
        # Compare against the reloaded model (m2); comparing m with itself
        # would always pass and verify nothing about the round trip.
        assert_true(m.subquantizers == m2.subquantizers)
    finally:
        # Guarded so that a failed export is not masked by a remove error.
        if os.path.exists(filename):
            os.remove(filename)
def main(sc, args, data_load_fn=default_data_loading):
    """Encode a dataset with a stored model and save the codes as text.

    Args:
        sc: context object; passed to ``data_load_fn`` and used to broadcast
            the model via ``sc.broadcast``.
        args: parsed arguments. Reads ``model_pkl``, ``model_proto``,
            ``data``, ``sampling_ratio``, ``seed`` and ``output``.
            ``model_pkl`` takes precedence over ``model_proto``; if neither
            is set, ``None`` is broadcast as the model.
        data_load_fn: callable invoked as
            ``data_load_fn(sc, args.data, args.sampling_ratio, args.seed)``.
            Its result must support ``map``; each record is indexed as
            ``x[0]`` (kept as the key) and ``x[1]`` (passed to ``predict``).

    Each output line has the form ``<x[0]>\\t<json of predicted code>``.
    """
    # Load model
    model = None
    if args.model_pkl:
        # NOTE: unpickling can execute arbitrary code -- only load model
        # files from a trusted source. The context manager closes the handle
        # deterministically instead of leaking it.
        with open(args.model_pkl, 'rb') as f:
            model = pkl.load(f)
    elif args.model_proto:
        model = LOPQModel.load_proto(args.model_proto)

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(
        lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
def main(sc, args, data_load_fn=default_data_loading):
    """Fetch a stored model, encode a dataset with it and save the codes.

    Args:
        sc: context object; passed to ``data_load_fn`` and used to broadcast
            the model via ``sc.broadcast``.
        args: parsed arguments. Reads ``model_pkl``, ``model_proto``,
            ``data``, ``sampling_ratio``, ``seed`` and ``output``.
            ``model_pkl`` takes precedence over ``model_proto``; if neither
            is set, ``None`` is broadcast as the model.
        data_load_fn: callable invoked as
            ``data_load_fn(sc, args.data, args.sampling_ratio, args.seed)``.
            Its result must support ``map``; each record is indexed as
            ``x[0]`` (kept as the key) and ``x[1]`` (passed to ``predict``).

    Each output line has the form ``<x[0]>\\t<json of predicted code>``.
    """
    # Load model. The path returned by copy_from_hdfs is treated as a
    # temporary local copy: it is read and then deleted, even if loading
    # fails.
    model = None
    if args.model_pkl:
        filename = copy_from_hdfs(args.model_pkl)
        try:
            # NOTE: unpickling can execute arbitrary code -- only load model
            # files from a trusted source.
            with open(filename, 'rb') as f:
                model = pkl.load(f)
        finally:
            os.remove(filename)
    elif args.model_proto:
        filename = copy_from_hdfs(args.model_proto)
        try:
            # Read the fetched copy, mirroring the pickle branch; loading
            # args.model_proto directly would ignore the file just copied.
            model = LOPQModel.load_proto(filename)
        finally:
            os.remove(filename)

    # Parenthesised single argument prints identically as a Python 2 print
    # statement and as a Python 3 function call.
    print('LOPQModel is of type: {}'.format(type(model)))

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # NOTE: PCA is no longer applied here (deprecated); m.value.predict is
    # assumed to apply PCA if needed, so args.pca_model is not consulted.

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(
        lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
def main(sc, args, data_load_fn=default_data_loading):
    """Fetch a stored model, encode a dataset with it and save the codes.

    Args:
        sc: context object; passed to ``data_load_fn`` and used to broadcast
            the model via ``sc.broadcast``.
        args: parsed arguments. Reads ``model_pkl``, ``model_proto``,
            ``data``, ``sampling_ratio``, ``seed`` and ``output``.
            ``model_pkl`` takes precedence over ``model_proto``; if neither
            is set, ``None`` is broadcast as the model.
        data_load_fn: callable invoked as
            ``data_load_fn(sc, args.data, args.sampling_ratio, args.seed)``.
            Its result must support ``map``; each record is indexed as
            ``x[0]`` (kept as the key) and ``x[1]`` (passed to ``predict``).

    Each output line has the form ``<x[0]>\\t<json of predicted code>``.
    """
    # Load model. The path returned by copy_from_hdfs is treated as a
    # temporary local copy: it is read and then deleted, even if loading
    # fails.
    model = None
    if args.model_pkl:
        filename = copy_from_hdfs(args.model_pkl)
        try:
            # NOTE: unpickling can execute arbitrary code -- only load model
            # files from a trusted source.
            with open(filename, 'rb') as f:
                model = pkl.load(f)
        finally:
            os.remove(filename)
    elif args.model_proto:
        filename = copy_from_hdfs(args.model_proto)
        try:
            # Read the fetched copy, mirroring the pickle branch; loading
            # args.model_proto directly would ignore the file just copied.
            model = LOPQModel.load_proto(filename)
        finally:
            os.remove(filename)

    # Parenthesised single argument prints identically as a Python 2 print
    # statement and as a Python 3 function call.
    print('LOPQModel is of type: {}'.format(type(model)))

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # NOTE: PCA is no longer applied here (deprecated); m.value.predict is
    # assumed to apply PCA if needed, so args.pca_model is not consulted.

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
default=None, help='hdfs path to save protobuf file of resulting model parameters') args = parser.parse_args() # Check that some output format was provided if args.model_pkl is None and args.model_proto is None: parser.error( 'at least one of --model_pkl and --model_proto is required') # Load existing model if provided model = None if args.existing_model_pkl: model = pkl.load(open(args.existing_model_pkl)) elif args.existing_model_proto: model = LOPQModel.load_proto(args.existing_model_proto) args = validate_arguments(args, model) # Build descriptive app name get_step_name = lambda x: { STEP_COARSE: 'coarse', STEP_ROTATION: 'rotations', STEP_SUBQUANT: 'subquantizers' }.get(x, None) steps_str = ', '.join( filter(lambda x: x is not None, map(get_step_name, sorted(args.steps)))) APP_NAME = 'LOPQ{V=%d,M=%d}; training %s' % (args.V, args.M, steps_str) sc = SparkContext(appName=APP_NAME)
parser.add_argument('--pca_model', dest='pca_model', type=str, default=None, help='hdfs path to pickle file containing PCA model to be used') parser.add_argument('--model_pkl', dest='model_pkl', type=str, default=None, help='hdfs path to save pickle file of resulting LOPQModel') parser.add_argument('--model_proto', dest='model_proto', type=str, default=None, help='hdfs path to save protobuf file of resulting model parameters') args = parser.parse_args() # Check that some output format was provided if args.model_pkl is None and args.model_proto is None: parser.error('at least one of --model_pkl and --model_proto is required') # Load existing model if provided model = None if args.existing_model_pkl: model = pkl.load(open(args.existing_model_pkl)) elif args.existing_model_proto: model = LOPQModel.load_proto(args.existing_model_proto) args = validate_arguments(args, model) # Build descriptive app name get_step_name = lambda x: {STEP_COARSE: 'coarse', STEP_ROTATION: 'rotations', STEP_SUBQUANT: 'subquantizers'}.get(x, None) steps_str = ', '.join(filter(lambda x: x is not None, map(get_step_name, sorted(args.steps)))) APP_NAME = 'LOPQ{V=%d,M=%d}; training %s' % (args.V, args.M, steps_str) sc = SparkContext(appName=APP_NAME) # Load UDF module if provided and load training data RDD if args.data_udf: sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/memex_udf.py') sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/deepsentibanktf_udf.py') udf_module = __import__(args.data_udf, fromlist=['udf'])