Ejemplo n.º 1
0
def write_output(train, test, fn, distance, point_type='float', count=100):
    """Write a dense dataset and its brute-force nearest neighbours to HDF5.

    The file at ``fn`` is created (mode ``'w'``) with the attributes
    ``type`` (``'dense'``), ``distance``, ``dimension`` and ``point_type``
    and the datasets ``train``, ``test``, ``neighbors`` and ``distances``.
    Row ``i`` of ``neighbors``/``distances`` holds the ids and distances
    that ``BruteForceBLAS.query_with_distances`` yields for ``test[i]``,
    sorted by ascending distance.

    Args:
        train: 2-D array of points to index; its ``shape`` and ``dtype``
            are read.
        test: 2-D array of query points.
        fn: Path of the HDF5 file to write.
        distance: Metric identifier, stored as an attribute and passed to
            ``BruteForceBLAS``.
        point_type: Value stored in the ``point_type`` attribute.
        count: Number of neighbours requested per query.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    print('train size: %9d * %4d' % train.shape)
    print('test size:  %9d * %4d' % test.shape)

    # The context manager guarantees the HDF5 handle is released even if the
    # (potentially long) brute-force pass below raises.
    with h5py.File(fn, 'w') as f:
        f.attrs['type'] = 'dense'
        f.attrs['distance'] = distance
        f.attrs['dimension'] = len(train[0])
        f.attrs['point_type'] = point_type

        f.create_dataset('train', (len(train), len(train[0])),
                         dtype=train.dtype)[:] = train
        f.create_dataset('test', (len(test), len(test[0])),
                         dtype=test.dtype)[:] = test
        neighbors = f.create_dataset(
            'neighbors', (len(test), count), dtype='i')
        distances = f.create_dataset(
            'distances', (len(test), count), dtype='f')

        bf = BruteForceBLAS(distance, precision=train.dtype)
        bf.fit(train)

        total = len(test)
        for i, x in enumerate(test):
            if i % 1000 == 0:
                print('%d/%d...' % (i, total))
            # Each result is an (id, distance) pair; order by distance.
            res = list(bf.query_with_distances(x, count))
            res.sort(key=lambda t: t[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
Ejemplo n.º 2
0
def write_sparse_output(train, test, fn, distance, dimension, count=100):
    """Write a sparse (set-valued) dataset and its ground truth to HDF5.

    Every point is sorted, then all points are concatenated into flat
    ``train``/``test`` datasets; the per-point lengths are stored in
    ``size_train``/``size_test``.  The file attributes are ``type``
    (``'sparse'``), ``distance``, ``dimension`` and ``point_type``
    (``'bit'``).  Row ``i`` of ``neighbors``/``distances`` holds the results
    of ``BruteForceBLAS.query_with_distances`` for ``test[i]``, sorted by
    ascending distance.

    Args:
        train: Collection of points to index; ``shape[0]`` is read for the
            progress banner and each element must be sortable.
        test: Collection of query points, same layout as ``train``.
        fn: Path of the HDF5 file to write (opened in ``'w'`` mode).
        distance: Metric identifier, stored as an attribute and passed to
            ``BruteForceBLAS``.
        dimension: Value stored in the ``dimension`` attribute.
        count: Number of neighbours requested per query.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    print('train size: %9d * %4d' % (train.shape[0], dimension))
    print('test size:  %9d * %4d' % (test.shape[0], dimension))

    # We ensure the sets are sorted
    train = numpy.array(list(map(sorted, train)))
    test = numpy.array(list(map(sorted, test)))

    # Concatenate all points into one flat vector per split; the size_*
    # datasets written below record each point's length.
    flat_train = numpy.hstack(train.flatten())
    flat_test = numpy.hstack(test.flatten())

    # The context manager guarantees the HDF5 handle is released even if the
    # brute-force pass below raises.
    with h5py.File(fn, 'w') as f:
        f.attrs['type'] = 'sparse'
        f.attrs['distance'] = distance
        f.attrs['dimension'] = dimension
        f.attrs['point_type'] = 'bit'

        f.create_dataset('train', (len(flat_train),),
                         dtype=flat_train.dtype)[:] = flat_train
        f.create_dataset('test', (len(flat_test),),
                         dtype=flat_test.dtype)[:] = flat_test
        neighbors = f.create_dataset(
            'neighbors', (len(test), count), dtype='i')
        distances = f.create_dataset(
            'distances', (len(test), count), dtype='f')

        f.create_dataset('size_test', (len(test),),
                         dtype='i')[:] = list(map(len, test))
        f.create_dataset('size_train', (len(train),),
                         dtype='i')[:] = list(map(len, train))

        bf = BruteForceBLAS(distance, precision=train.dtype)
        bf.fit(train)

        total = len(test)
        for i, x in enumerate(test):
            if i % 1000 == 0:
                print('%d/%d...' % (i, total))
            # Each result is an (id, distance) pair; order by distance.
            res = list(bf.query_with_distances(x, count))
            res.sort(key=lambda t: t[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
def my_write_output(train, test, out_fn, distance, point_type='float', count=100):
    """Write a truncated dataset and its brute-force ground truth to HDF5.

    Only the first ``TRAIN_SIZE`` training points and the first
    ``QUERY_NUM`` query points (module-level constants) are stored and
    searched.  The ``train``/``test`` datasets hold the data as passed in;
    the ground truth is computed on the output of
    ``datasets.dataset_transform[distance]`` applied to each split.

    Args:
        train: 2-D array of points to index; ``shape`` and ``dtype`` are
            read.
        test: 2-D array of query points.
        out_fn: Path of the HDF5 file to write (opened in ``'w'`` mode).
        distance: Metric identifier; stored as an attribute, used to pick
            the dataset transform, and passed to ``BruteForceBLAS``.
        point_type: Value stored in the ``point_type`` attribute.
        count: Number of neighbours requested per query.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    print('train size: %9d * %4d' % train.shape)
    print('test size:  %9d * %4d' % test.shape)

    # The context manager guarantees the HDF5 handle is released even if the
    # transform or the brute-force pass below raises.
    with h5py.File(out_fn, 'w') as f:
        f.attrs['distance'] = distance
        f.attrs['point_type'] = point_type

        f.create_dataset('train', (TRAIN_SIZE, len(train[0])),
                         dtype=train.dtype)[:] = train[:TRAIN_SIZE]
        f.create_dataset('test', (QUERY_NUM, len(test[0])),
                         dtype=test.dtype)[:] = test[:QUERY_NUM]
        neighbors = f.create_dataset(
            'neighbors', (QUERY_NUM, count), dtype='i')
        distances = f.create_dataset(
            'distances', (QUERY_NUM, count), dtype='f')

        # Precision is taken from the input dtype, before the transform.
        bf = BruteForceBLAS(distance, precision=train.dtype)
        train = datasets.dataset_transform[distance](train)
        test = datasets.dataset_transform[distance](test)
        bf.fit(train[:TRAIN_SIZE])

        queries = test[:QUERY_NUM]
        total = len(queries)
        for i, x in enumerate(queries):
            if i % 1000 == 0:
                print('%d/%d...' % (i, total))
            # Each result is an (id, distance) pair; order by distance.
            res = list(bf.query_with_distances(x, count))
            res.sort(key=lambda t: t[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
Ejemplo n.º 4
0
def compute_distances(distance, count, X_train, X_test):
    """Compute brute-force neighbours and the largest distance per query.

    Args:
        distance: Metric identifier passed to ``BruteForceBLAS``.
        count: Number of neighbours requested per query.
        X_train: Points to index; ``dtype`` is read for the precision.
        X_test: Iterable of query points.

    Returns:
        A list of ``(query, max_distance, neighbours)`` tuples, one per
        query that returned at least one neighbour.  ``neighbours`` is the
        list of ``(id, distance)`` pairs and ``max_distance`` the largest
        distance among them.
    """
    print('computing max distances for queries...')

    bf = BruteForceBLAS(distance, precision=X_train.dtype)
    # Prepare queries
    bf.fit(X_train)
    queries = []
    for x in X_test:
        # Materialise the result so len() works even when the backend
        # returns a lazy iterable.
        correct = list(bf.query_with_distances(x, count))
        # disregard queries that don't have near neighbors.
        if len(correct) > 0:
            # Tuple-unpacking lambda parameters are a syntax error on
            # Python 3, so index into the (id, distance) pair instead.
            max_distance = max(correct, key=lambda pair: pair[1])[1]
            queries.append((x, max_distance, correct))
        if len(queries) % 100 == 0:
            print(len(queries), '...')

    return queries
Ejemplo n.º 5
0
def compute_distances(distance, count, X_train, X_test):
    """Compute brute-force neighbours and the largest distance per query.

    Args:
        distance: Metric identifier passed to ``BruteForceBLAS``.
        count: Number of neighbours requested per query.
        X_train: Points to index; ``dtype`` is read for the precision.
        X_test: Iterable of query points.

    Returns:
        A list of ``(query, max_distance, neighbours)`` tuples, one per
        query that returned at least one neighbour.  ``neighbours`` is the
        list of ``(id, distance)`` pairs and ``max_distance`` the largest
        distance among them.
    """
    print('computing max distances for queries...')

    bf = BruteForceBLAS(distance, precision=X_train.dtype)
    # Prepare queries
    bf.fit(X_train)
    queries = []
    for x in X_test:
        # Materialise the result so len() works even when the backend
        # returns a lazy iterable.
        correct = list(bf.query_with_distances(x, count))
        # disregard queries that don't have near neighbors.
        if len(correct) > 0:
            # Tuple-unpacking lambda parameters are a syntax error on
            # Python 3, so index into the (id, distance) pair instead.
            max_distance = max(correct, key=lambda pair: pair[1])[1]
            queries.append((x, max_distance, correct))
        if len(queries) % 100 == 0:
            print(len(queries), '...')

    return queries
Ejemplo n.º 6
0
def write_output(train, test, fn, distance, count=100):
    """Write a dataset and its brute-force nearest neighbours to HDF5.

    The file at ``fn`` is created (mode ``'w'``) with a ``distance``
    attribute and the datasets ``train``, ``test``, ``neighbors`` and
    ``distances``.  Row ``i`` of ``neighbors``/``distances`` holds the ids
    and distances that ``BruteForceBLAS.query_with_distances`` yields for
    ``test[i]``, sorted by ascending distance.

    Args:
        train: 2-D array of points to index; its ``shape`` and ``dtype``
            are read.
        test: 2-D array of query points.
        fn: Path of the HDF5 file to write.
        distance: Metric identifier, stored as an attribute and passed to
            ``BruteForceBLAS``.
        count: Number of neighbours requested per query.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    print('train size: %9d * %4d' % train.shape)
    print('test size:  %9d * %4d' % test.shape)

    # The context manager guarantees the HDF5 handle is released even if the
    # brute-force pass below raises.
    with h5py.File(fn, 'w') as f:
        f.attrs['distance'] = distance

        f.create_dataset('train', (len(train), len(train[0])),
                         dtype=train.dtype)[:] = train
        f.create_dataset('test', (len(test), len(test[0])),
                         dtype=test.dtype)[:] = test
        neighbors = f.create_dataset(
            'neighbors', (len(test), count), dtype='i')
        distances = f.create_dataset(
            'distances', (len(test), count), dtype='f')

        # The search always runs in float32, independent of train.dtype.
        bf = BruteForceBLAS(distance, precision=numpy.float32)
        bf.fit(train)

        total = test.shape[0]
        for i, x in enumerate(test):
            if i % 1000 == 0:
                print('%d/%d...' % (i, total))
            # Each result is an (id, distance) pair; order by distance.
            res = list(bf.query_with_distances(x, count))
            res.sort(key=lambda t: t[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
Ejemplo n.º 7
0
def write_output(train, test, fn, distance, count=3000):
    """Write a dataset and its brute-force nearest neighbours to HDF5.

    The file at ``fn`` is created (mode ``'w'``) with a ``distance``
    attribute and the datasets ``train``, ``test``, ``neighbors`` and
    ``distances``.  Row ``i`` of ``neighbors``/``distances`` holds the ids
    and distances that ``BruteForceBLAS.query_with_distances`` yields for
    ``test[i]``, stored in the reverse of the order the backend returns.

    Args:
        train: 2-D array of points to index; its ``shape`` and ``dtype``
            are read.
        test: 2-D array of query points.
        fn: Path of the HDF5 file to write.
        distance: Metric identifier, stored as an attribute and passed to
            ``BruteForceBLAS``.
        count: Number of neighbours requested per query.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    print('train size: %9d * %4d' % train.shape)
    print('test size:  %9d * %4d' % test.shape)

    # The context manager guarantees the HDF5 handle is released even if the
    # brute-force pass below raises.
    with h5py.File(fn, 'w') as f:
        f.attrs['distance'] = distance

        f.create_dataset('train', (len(train), len(train[0])),
                         dtype=train.dtype)[:] = train
        f.create_dataset('test', (len(test), len(test[0])),
                         dtype=test.dtype)[:] = test
        neighbors = f.create_dataset(
            'neighbors', (len(test), count), dtype='i')
        distances = f.create_dataset(
            'distances', (len(test), count), dtype='f')

        # The search always runs in float32, independent of train.dtype.
        bf = BruteForceBLAS(distance, precision=numpy.float32)
        bf.fit(train)

        total = test.shape[0]
        for i, x in enumerate(test):
            if i % 1000 == 0:
                print('%d/%d...' % (i, total))
            res = list(bf.query_with_distances(x, count))
            # NOTE(review): the results are reversed rather than sorted;
            # presumably the backend returns them in descending distance
            # order -- confirm against BruteForceBLAS.
            res = res[::-1]
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
Ejemplo n.º 8
0
                print("epbprtv0 ok")
            else:
                print("epbprtv0 fail")
        else:
            print("epbprtv0 fail")
    if point_type and distance:
        print("epbprtv0 ok")
    else:
        print("epbprtv0 fail")
        sys.exit(1)

    obj = None
    if not fast:
        obj = BruteForce(distance)
    else:
        obj = BruteForceBLAS(distance)

    parser = point_type["parse_entry"]
    # Training mode
    points = []
    for line in next_line():
        if not line:
            break
        elif len(line) == 1:
            point = line[0]
            try:
                parsed = parser(point)
                print("epbprtv0 ok %d" % len(points))
                points.append(parsed)
            except ValueError:
                print("epbprtv0 fail")