Esempio n. 1
0
def test_searcher_lmdb():
    """Run the searcher battery against LOPQSearcherLMDB for both indexing paths.

    The searcher is built twice on the same on-disk path: once populated via
    ``add_data`` and once via ``add_codes`` (codes produced by ``m.predict``).
    After each build, ``searcher_instance_battery`` is run with an all-ones
    query vector of length 8.
    """
    import shutil

    # Use a context manager so the fixture file is closed deterministically;
    # binary mode is what pickle expects on every platform.
    with open(relpath('./testdata/test_searcher_data.pkl'), 'rb') as f:
        data = pkl.load(f)
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    lmbd_test_path = './test_lopq_lmbd'
    q = np.ones(8)

    # Test add_data
    try:
        searcher = LOPQSearcherLMDB(m, lmbd_test_path)
        searcher.add_data(data)
        searcher_instance_battery(searcher, q)
    finally:
        # Clean up even when an assertion fails, so a stale directory cannot
        # leak into the next phase or a later test run. ignore_errors keeps a
        # cleanup problem from masking the original failure.
        shutil.rmtree(lmbd_test_path, ignore_errors=True)

    # Test add_codes
    try:
        searcher = LOPQSearcherLMDB(m, lmbd_test_path)
        codes = [m.predict(x) for x in data]
        searcher.add_codes(codes)
        searcher_instance_battery(searcher, q)
    finally:
        shutil.rmtree(lmbd_test_path, ignore_errors=True)
Esempio n. 2
0
File: tests.py Progetto: CVML/lopq
def test_searcher_lmdb():
    """Check LOPQSearcherLMDB when filled via ``add_data`` and via ``add_codes``.

    Each phase creates a fresh searcher at the same path, fills it, runs
    ``searcher_instance_battery`` with a length-8 all-ones query, and then
    removes the directory again.
    """
    import shutil

    # Close the pickle fixture explicitly rather than relying on garbage
    # collection; pickle data must be read in binary mode.
    with open(relpath('./testdata/test_searcher_data.pkl'), 'rb') as fixture:
        data = pkl.load(fixture)
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    lmbd_test_path = './test_lopq_lmbd'
    q = np.ones(8)

    # Test add_data
    try:
        searcher = LOPQSearcherLMDB(m, lmbd_test_path)
        searcher.add_data(data)
        searcher_instance_battery(searcher, q)
    finally:
        # Always remove the on-disk index: a failed assertion must not leave
        # data behind that the next phase (or next run) would pick up.
        shutil.rmtree(lmbd_test_path, ignore_errors=True)

    # Test add_codes
    try:
        searcher = LOPQSearcherLMDB(m, lmbd_test_path)
        codes = [m.predict(x) for x in data]
        searcher.add_codes(codes)
        searcher_instance_battery(searcher, q)
    finally:
        shutil.rmtree(lmbd_test_path, ignore_errors=True)
Esempio n. 3
0
File: tests.py Progetto: CVML/lopq
def test_reconstruction():
    """Reconstruct a fixed code with the fixture model and compare to known values."""
    model_path = relpath('./testdata/random_test_model.lopq')
    model = LOPQModel.load_proto(model_path)

    # Reference output for the code below, as recorded for the fixture model.
    reference = [-2.27444688, 6.47126941, 4.5042611, 4.76683476, 0.83671082, 9.36027283, 8.11780532, 6.34846377]

    first_part = (0, 1)
    second_part = (0, 1, 2, 3)
    reconstructed = model.reconstruct((first_part, second_part))

    matches = np.allclose(reference, reconstructed)
    assert_true(matches)
Esempio n. 4
0
def test_reconstruction():
    """Verify ``reconstruct`` on the fixture model against a stored reference vector."""
    fixture_model = LOPQModel.load_proto(
        relpath('./testdata/random_test_model.lopq'))

    # Expected reconstruction for the code ((0, 1), (0, 1, 2, 3)).
    wanted = [
        -2.27444688, 6.47126941, 4.5042611, 4.76683476,
        0.83671082, 9.36027283, 8.11780532, 6.34846377,
    ]
    got = fixture_model.reconstruct(((0, 1), (0, 1, 2, 3)))

    assert_true(np.allclose(wanted, got))
Esempio n. 5
0
def test_searcher():
    """Run the searcher battery on LOPQSearcher filled by data and by codes.

    The first searcher is populated with ``add_data``; the second with
    ``add_codes`` using codes computed by ``m.predict``. Both are checked by
    ``searcher_instance_battery`` with a length-8 all-ones query.
    """
    # Context manager closes the fixture file promptly; pickle needs binary mode.
    with open(relpath('./testdata/test_searcher_data.pkl'), 'rb') as f:
        data = pkl.load(f)
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcher(m)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Test add_codes
    searcher = LOPQSearcher(m)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)
Esempio n. 6
0
def test_searcher():
    """Check ``get_result_quota`` on an LOPQSearcher built from the fixture data.

    With the default quota the call is expected to return 12 results after
    visiting 3 cells; with ``quota=20`` it is expected to return 28 results
    after visiting 5 cells.
    """
    # Context manager closes the fixture file promptly; pickle needs binary mode.
    with open(relpath('./testdata/test_searcher_data.pkl'), 'rb') as f:
        data = pkl.load(f)
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    searcher = LOPQSearcher(m)
    searcher.add_data(data)

    q = np.ones(8)

    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)
Esempio n. 7
0
File: tests.py Progetto: CVML/lopq
def test_searcher():
    """Exercise LOPQSearcher through ``add_data`` and ``add_codes``.

    A fresh searcher is created for each path and validated with
    ``searcher_instance_battery`` using a length-8 all-ones query.
    """
    # Read the pickled fixture in binary mode and close the handle right away
    # instead of leaking it until garbage collection.
    with open(relpath('./testdata/test_searcher_data.pkl'), 'rb') as fixture:
        data = pkl.load(fixture)
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcher(m)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Test add_codes
    searcher = LOPQSearcher(m)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)
Esempio n. 8
0
File: tests.py Progetto: ml-lab/lopq
def test_searcher():
    """Validate result counts from ``get_result_quota`` on the fixture data.

    Expects 12 results / 3 visited cells for the default quota, and
    28 results / 5 visited cells for ``quota=20``.
    """
    # Read the pickled fixture in binary mode and close the handle right away
    # instead of leaking it until garbage collection.
    with open(relpath("./testdata/test_searcher_data.pkl"), "rb") as fixture:
        data = pkl.load(fixture)
    m = LOPQModel.load_proto(relpath("./testdata/random_test_model.lopq"))

    searcher = LOPQSearcher(m)
    searcher.add_data(data)

    q = np.ones(8)

    retrieved, visited = searcher.get_result_quota(q)
    assert_equal(len(retrieved), 12)
    assert_equal(visited, 3)

    retrieved, visited = searcher.get_result_quota(q, quota=20)
    assert_equal(len(retrieved), 28)
    assert_equal(visited, 5)
Esempio n. 9
0
def test_proto():
    """Round-trip a random model through ``export_proto`` / ``load_proto``.

    Compares V, M, subquantizer_clusters and the first entries of Cs, Rs,
    mus and subquantizers between the original and the reloaded model.
    """
    import os

    filename = './temp_proto.lopq'
    m = make_random_model()
    try:
        m.export_proto(filename)
        m2 = LOPQModel.load_proto(filename)

        assert_equal(m.V, m2.V)
        assert_equal(m.M, m2.M)
        assert_equal(m.subquantizer_clusters, m2.subquantizer_clusters)

        assert_true(np.allclose(m.Cs[0], m2.Cs[0]))
        assert_true(np.allclose(m.Rs[0], m2.Rs[0]))
        assert_true(np.allclose(m.mus[0], m2.mus[0]))
        # Compare against the reloaded model (m2); comparing m with itself
        # would always pass and never test the subquantizer round trip.
        assert_true(np.allclose(m.subquantizers[0][0], m2.subquantizers[0][0]))
    finally:
        # Remove the temp file even if an assertion failed; guard against the
        # case where export itself failed before creating it.
        if os.path.exists(filename):
            os.remove(filename)
Esempio n. 10
0
File: tests.py Progetto: CVML/lopq
def test_proto():
    """Check that a model survives a protobuf export/load cycle.

    A random model is written to a temporary file, loaded back, and its
    scalar settings plus the first Cs, Rs, mus and subquantizers entries are
    compared with the original.
    """
    import os

    filename = './temp_proto.lopq'
    m = make_random_model()
    try:
        m.export_proto(filename)
        m2 = LOPQModel.load_proto(filename)

        assert_equal(m.V, m2.V)
        assert_equal(m.M, m2.M)
        assert_equal(m.subquantizer_clusters, m2.subquantizer_clusters)

        assert_true(np.allclose(m.Cs[0], m2.Cs[0]))
        assert_true(np.allclose(m.Rs[0], m2.Rs[0]))
        assert_true(np.allclose(m.mus[0], m2.mus[0]))
        # The right-hand side must be the reloaded model; the previous
        # m-versus-m comparison was a tautology.
        assert_true(np.allclose(m.subquantizers[0][0], m2.subquantizers[0][0]))
    finally:
        # Do not leave the temp file behind when an assertion fails.
        if os.path.exists(filename):
            os.remove(filename)
Esempio n. 11
0
def main(sc, args, data_load_fn=default_data_loading):
    """Encode the input data with a stored model and write the codes as text.

    :param sc: context object providing ``broadcast`` (used to ship the model).
    :param args: parsed arguments; reads ``model_pkl``, ``model_proto``,
        ``data``, ``sampling_ratio``, ``seed`` and ``output``.
    :param data_load_fn: callable invoked as
        ``data_load_fn(sc, args.data, args.sampling_ratio, args.seed)``; its
        result must support ``map`` over ``(id, vector)`` pairs.

    Each output line has the form ``<id>\\t<json-encoded code>``.
    """
    # Load model. It stays None when neither model argument is set.
    model = None
    if args.model_pkl:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        # The context manager closes the handle instead of leaking it.
        with open(args.model_pkl, 'rb') as f:
            model = pkl.load(f)
    elif args.model_proto:
        model = LOPQModel.load_proto(args.model_proto)

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(
        lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
Esempio n. 12
0
def test_proto_partial():
    """Round-trip a partially specified model through the protobuf format.

    The model is built with only the first parameter slot filled (a pair of
    random 8x8 arrays) and the remaining three set to ``None``; the test
    checks that the reloaded model agrees on every compared attribute.
    """
    import os

    filename = './temp_proto_partial.lopq'
    c = (np.random.rand(8, 8), np.random.rand(8, 8))
    m = LOPQModel(parameters=(c, None, None, None))
    try:
        m.export_proto(filename)
        m2 = LOPQModel.load_proto(filename)

        assert_equal(m.V, m2.V)
        assert_equal(m.M, m2.M)
        assert_equal(m.subquantizer_clusters, m2.subquantizer_clusters)

        assert_true(np.allclose(m.Cs[0], m2.Cs[0]))
        assert_true(m.Rs == m2.Rs)
        assert_true(m.mus == m2.mus)
        # Compare with the reloaded model; m == m could never fail.
        assert_true(m.subquantizers == m2.subquantizers)
    finally:
        # Remove the temp file even when an assertion fails.
        if os.path.exists(filename):
            os.remove(filename)
Esempio n. 13
0
def main(sc, args, data_load_fn=default_data_loading):
    """Compute codes for every input item and save them as tab-separated text.

    :param sc: context object; its ``broadcast`` method distributes the model.
    :param args: parsed arguments; reads ``model_pkl``, ``model_proto``,
        ``data``, ``sampling_ratio``, ``seed`` and ``output``.
    :param data_load_fn: loader called with
        ``(sc, args.data, args.sampling_ratio, args.seed)``; the returned
        collection is mapped over as ``(id, vector)`` pairs.

    The pickle model takes precedence over the proto model when both are set.
    """
    # Load model. It stays None when neither model argument is set.
    model = None
    if args.model_pkl:
        # NOTE: pickle can execute arbitrary code on load; the file must be
        # trusted. Open in binary mode and close the handle deterministically.
        with open(args.model_pkl, 'rb') as model_file:
            model = pkl.load(model_file)
    elif args.model_proto:
        model = LOPQModel.load_proto(args.model_proto)

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(
        lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
Esempio n. 14
0
def main(sc, args, data_load_fn=default_data_loading):
    """Fetch a model from HDFS, encode the input data and save the codes.

    :param sc: context object; its ``broadcast`` method distributes the model.
    :param args: parsed arguments; reads ``model_pkl``, ``model_proto``,
        ``data``, ``sampling_ratio``, ``seed`` and ``output``.
    :param data_load_fn: loader called with
        ``(sc, args.data, args.sampling_ratio, args.seed)``; the returned
        collection is mapped over as ``(id, vector)`` pairs.

    Each output line has the form ``<id>\\t<json-encoded code>``.
    """
    # Load model. It stays None when neither model argument is set.
    # NOTE(review): copy_from_hdfs presumably returns the path of a local
    # copy; any temp directory it creates is not removed here - confirm.
    model = None
    if args.model_pkl:
        filename = copy_from_hdfs(args.model_pkl)
        try:
            # NOTE: unpickling executes arbitrary code; only load trusted files.
            with open(filename, 'rb') as f:
                model = pkl.load(f)
        finally:
            # Remove the local copy even if loading fails.
            os.remove(filename)
    elif args.model_proto:
        filename = copy_from_hdfs(args.model_proto)
        try:
            # Load from the local copy, mirroring the pickle branch; the
            # previous code fetched the file and then ignored it, passing the
            # original HDFS path to load_proto instead.
            model = LOPQModel.load_proto(filename)
        finally:
            os.remove(filename)

    # Parenthesised single-argument form prints the same text whether or not
    # print is a statement.
    print('LOPQModel is of type: {}'.format(type(model)))

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # PCA is no longer applied here: m.value.predict is assumed to apply it
    # when needed.

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(
        lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
Esempio n. 15
0
def main(sc, args, data_load_fn=default_data_loading):
    """Load a model copied from HDFS, compute codes for the data, write them out.

    :param sc: context object; its ``broadcast`` method distributes the model.
    :param args: parsed arguments; reads ``model_pkl``, ``model_proto``,
        ``data``, ``sampling_ratio``, ``seed`` and ``output``.
    :param data_load_fn: loader called with
        ``(sc, args.data, args.sampling_ratio, args.seed)``; the returned
        collection is mapped over as ``(id, vector)`` pairs.

    The pickle model takes precedence over the proto model when both are set.
    """
    # Load model. It stays None when neither model argument is set.
    # NOTE(review): copy_from_hdfs presumably returns a local file path; a
    # temp directory it may create is not cleaned up here - confirm.
    model = None
    if args.model_pkl:
        filename = copy_from_hdfs(args.model_pkl)
        try:
            # NOTE: pickle can execute arbitrary code on load; the file must
            # be trusted. Binary mode plus a context manager avoids leaking
            # the handle.
            with open(filename, 'rb') as model_file:
                model = pkl.load(model_file)
        finally:
            # Always drop the local copy, including on a failed load.
            os.remove(filename)
    elif args.model_proto:
        filename = copy_from_hdfs(args.model_proto)
        try:
            # Use the fetched local copy, consistent with the pickle branch;
            # previously the HDFS path itself was passed to load_proto and
            # the local copy was never read.
            model = LOPQModel.load_proto(filename)
        finally:
            os.remove(filename)

    # Single-argument parenthesised print gives identical output under both
    # the print statement and the print function.
    print('LOPQModel is of type: {}'.format(type(model)))

    # Load data
    d = data_load_fn(sc, args.data, args.sampling_ratio, args.seed)

    # PCA is no longer applied here: m.value.predict is assumed to apply it
    # when needed.

    # Distribute model instance
    m = sc.broadcast(model)

    # Compute codes and convert to string
    codes = d.map(lambda x: (x[0], m.value.predict(x[1]))).map(lambda x: '%s\t%s' % (x[0], json.dumps(x[1])))

    codes.saveAsTextFile(args.output)
        default=None,
        help='hdfs path to save protobuf file of resulting model parameters')

    args = parser.parse_args()

    # Check that some output format was provided
    if args.model_pkl is None and args.model_proto is None:
        parser.error(
            'at least one of --model_pkl and --model_proto is required')

    # Load existing model if provided
    model = None
    if args.existing_model_pkl:
        model = pkl.load(open(args.existing_model_pkl))
    elif args.existing_model_proto:
        model = LOPQModel.load_proto(args.existing_model_proto)

    args = validate_arguments(args, model)

    # Build descriptive app name
    get_step_name = lambda x: {
        STEP_COARSE: 'coarse',
        STEP_ROTATION: 'rotations',
        STEP_SUBQUANT: 'subquantizers'
    }.get(x, None)
    steps_str = ', '.join(
        filter(lambda x: x is not None, map(get_step_name,
                                            sorted(args.steps))))
    APP_NAME = 'LOPQ{V=%d,M=%d}; training %s' % (args.V, args.M, steps_str)

    sc = SparkContext(appName=APP_NAME)
    parser.add_argument('--pca_model', dest='pca_model', type=str, default=None, help='hdfs path to pickle file containing PCA model to be used')
    parser.add_argument('--model_pkl', dest='model_pkl', type=str, default=None, help='hdfs path to save pickle file of resulting LOPQModel')
    parser.add_argument('--model_proto', dest='model_proto', type=str, default=None, help='hdfs path to save protobuf file of resulting model parameters')

    args = parser.parse_args()

    # Check that some output format was provided
    if args.model_pkl is None and args.model_proto is None:
        parser.error('at least one of --model_pkl and --model_proto is required')

    # Load existing model if provided
    model = None
    if args.existing_model_pkl:
        model = pkl.load(open(args.existing_model_pkl))
    elif args.existing_model_proto:
        model = LOPQModel.load_proto(args.existing_model_proto)

    args = validate_arguments(args, model)

    # Build descriptive app name
    get_step_name = lambda x: {STEP_COARSE: 'coarse', STEP_ROTATION: 'rotations', STEP_SUBQUANT: 'subquantizers'}.get(x, None)
    steps_str = ', '.join(filter(lambda x: x is not None, map(get_step_name, sorted(args.steps))))
    APP_NAME = 'LOPQ{V=%d,M=%d}; training %s' % (args.V, args.M, steps_str)

    sc = SparkContext(appName=APP_NAME)

    # Load UDF module if provided and load training data RDD
    if args.data_udf:
        sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/memex_udf.py')
        sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/deepsentibanktf_udf.py')
        udf_module = __import__(args.data_udf, fromlist=['udf'])