def write_sparse_output(train, test, fn, distance, dimension, count=100):
    """Write a sparse (set-valued) dataset and its ground truth to HDF5.

    Every point is sorted, then all points are concatenated into one flat
    ``train`` / ``test`` dataset.  The length of each point is stored in
    ``size_train`` / ``size_test`` so the flat arrays can be split back into
    the individual points.  For every test point the ``count`` results
    returned by a brute-force search over the training points are written to
    ``neighbors`` (indices) and ``distances``, ordered by ascending distance.

    Args:
        train: numpy array of training points (``train.shape[0]`` is read).
        test: numpy array of query points.
        fn: path of the HDF5 file to create (opened in ``'w'`` mode).
        distance: metric name; stored as the ``distance`` attribute and
            passed to ``BruteForceBLAS``.
        dimension: value stored as the ``dimension`` attribute.
        count: number of neighbours recorded per query.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    # The context manager closes the file even when the (long-running)
    # brute-force search below raises.
    with h5py.File(fn, 'w') as f:
        f.attrs['type'] = 'sparse'
        f.attrs['distance'] = distance
        f.attrs['dimension'] = dimension
        f.attrs['point_type'] = 'bit'
        print('train size: %9d * %4d' % (train.shape[0], dimension))
        print('test size: %9d * %4d' % (test.shape[0], dimension))

        # We ensure the sets are sorted
        train = numpy.array(list(map(sorted, train)))
        test = numpy.array(list(map(sorted, test)))

        # Concatenate all points into one 1-D array per split.
        flat_train = numpy.hstack(train.flatten())
        flat_test = numpy.hstack(test.flatten())

        f.create_dataset('train', (len(flat_train),),
                         dtype=flat_train.dtype)[:] = flat_train
        f.create_dataset('test', (len(flat_test),),
                         dtype=flat_test.dtype)[:] = flat_test
        neighbors = f.create_dataset('neighbors', (len(test), count),
                                     dtype='i')
        distances = f.create_dataset('distances', (len(test), count),
                                     dtype='f')

        # Per-point lengths, needed to split the flat arrays again.
        f.create_dataset('size_test', (len(test),),
                         dtype='i')[:] = list(map(len, test))
        f.create_dataset('size_train', (len(train),),
                         dtype='i')[:] = list(map(len, train))

        bf = BruteForceBLAS(distance, precision=train.dtype)
        bf.fit(train)

        for i, x in enumerate(test):
            if i % 1000 == 0:
                print('%d/%d...' % (i, len(test)))
            # Each result is an (index, distance) pair; order by distance.
            res = sorted(bf.query_with_distances(x, count),
                         key=lambda t: t[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
def write_output(train, test, fn, distance, point_type='float', count=100):
    """Write a dense dataset and its brute-force ground truth to HDF5.

    The file receives the attributes ``type`` (``'dense'``), ``distance``,
    ``dimension`` (``len(train[0])``) and ``point_type``, the datasets
    ``train`` and ``test`` holding the input arrays, and the datasets
    ``neighbors`` / ``distances`` holding, for every test point, the
    ``count`` results of a brute-force search ordered by ascending distance.

    Args:
        train: 2-D numpy array of training points.
        test: 2-D numpy array of query points.
        fn: path of the HDF5 file to create (opened in ``'w'`` mode).
        distance: metric name; stored as an attribute and passed to
            ``BruteForceBLAS``.
        point_type: value stored as the ``point_type`` attribute.
        count: number of neighbours recorded per query.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    # The context manager closes the file even when the search below raises.
    with h5py.File(fn, 'w') as f:
        f.attrs['type'] = 'dense'
        f.attrs['distance'] = distance
        f.attrs['dimension'] = len(train[0])
        f.attrs['point_type'] = point_type
        print('train size: %9d * %4d' % train.shape)
        print('test size: %9d * %4d' % test.shape)

        f.create_dataset('train', (len(train), len(train[0])),
                         dtype=train.dtype)[:] = train
        f.create_dataset('test', (len(test), len(test[0])),
                         dtype=test.dtype)[:] = test
        neighbors = f.create_dataset('neighbors', (len(test), count),
                                     dtype='i')
        distances = f.create_dataset('distances', (len(test), count),
                                     dtype='f')

        bf = BruteForceBLAS(distance, precision=train.dtype)
        bf.fit(train)

        for i, x in enumerate(test):
            if i % 1000 == 0:
                print('%d/%d...' % (i, len(test)))
            # Each result is an (index, distance) pair; order by distance.
            res = sorted(bf.query_with_distances(x, count),
                         key=lambda t: t[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
def my_write_output(train, test, out_fn, distance, point_type='float',
                    count=100):
    """Write a truncated dataset and its brute-force ground truth to HDF5.

    Only the first ``TRAIN_SIZE`` training points and the first
    ``QUERY_NUM`` test points (both module-level names) are used.  The
    ``train`` and ``test`` datasets store the points as passed in; the
    brute-force search runs on the points after they have been passed through
    ``datasets.dataset_transform[distance]``.  For every query the ``count``
    results are written to ``neighbors`` / ``distances`` ordered by ascending
    distance.

    Args:
        train: 2-D numpy array of training points.
        test: 2-D numpy array of query points.
        out_fn: path of the HDF5 file to create (opened in ``'w'`` mode).
        distance: metric name; stored as an attribute, used to select the
            dataset transform, and passed to ``BruteForceBLAS``.
        point_type: value stored as the ``point_type`` attribute.
        count: number of neighbours recorded per query.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    # The context manager closes the file even when the search below raises.
    with h5py.File(out_fn, 'w') as f:
        f.attrs['distance'] = distance
        f.attrs['point_type'] = point_type
        print('train size: %9d * %4d' % train.shape)
        print('test size: %9d * %4d' % test.shape)

        # The stored points are the untransformed inputs, truncated.
        f.create_dataset('train', (TRAIN_SIZE, len(train[0])),
                         dtype=train.dtype)[:] = train[:TRAIN_SIZE]
        f.create_dataset('test', (QUERY_NUM, len(test[0])),
                         dtype=test.dtype)[:] = test[:QUERY_NUM]
        neighbors = f.create_dataset('neighbors', (QUERY_NUM, count),
                                     dtype='i')
        distances = f.create_dataset('distances', (QUERY_NUM, count),
                                     dtype='f')

        # Precision is taken from the input array, before the transform.
        bf = BruteForceBLAS(distance, precision=train.dtype)
        train = datasets.dataset_transform[distance](train)
        test = datasets.dataset_transform[distance](test)
        bf.fit(train[:TRAIN_SIZE])

        selected = test[:QUERY_NUM]
        total = len(selected)
        for i, x in enumerate(selected):
            if i % 1000 == 0:
                print('%d/%d...' % (i, total))
            # Each result is an (index, distance) pair; order by distance.
            res = sorted(bf.query_with_distances(x, count),
                         key=lambda t: t[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
def compute_distances(distance, count, X_train, X_test):
    """Pair every query with its brute-force neighbours and largest distance.

    Args:
        distance: metric name passed to ``BruteForceBLAS``.
        count: number of neighbours requested per query.
        X_train: training points; ``X_train.dtype`` is used as the precision.
        X_test: iterable of query points.

    Returns:
        A list of ``(x, max_distance, correct)`` tuples, one for each query
        whose search returned at least one result.  ``correct`` is the list
        of ``(index, distance)`` pairs returned by the search and
        ``max_distance`` is the largest distance among them.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    print('computing max distances for queries...')
    bf = BruteForceBLAS(distance, precision=X_train.dtype)
    # Prepare queries
    bf.fit(X_train)

    queries = []
    for x in X_test:
        # Materialise the result so that the emptiness check and the second
        # pass in max() work even if the search returns a lazy iterable.
        correct = list(bf.query_with_distances(x, count))
        # disregard queries that don't have near neighbors.
        if correct:
            # Tuple-unpacking lambdas are a SyntaxError on Python 3, so the
            # distance is extracted with a generator expression instead.
            max_distance = max(dist for _, dist in correct)
            queries.append((x, max_distance, correct))
        if len(queries) % 100 == 0:
            print(len(queries), '...')
    return queries
def write_output(train, test, fn, distance, count=100):
    """Write a dataset and its brute-force ground truth to HDF5.

    The file receives the ``distance`` attribute, the datasets ``train`` and
    ``test`` holding the input arrays, and the datasets ``neighbors`` /
    ``distances`` holding, for every test point, the ``count`` results of a
    float32 brute-force search ordered by ascending distance.

    Args:
        train: 2-D numpy array of training points.
        test: 2-D numpy array of query points.
        fn: path of the HDF5 file to create (opened in ``'w'`` mode).
        distance: metric name; stored as an attribute and passed to
            ``BruteForceBLAS``.
        count: number of neighbours recorded per query.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    # The context manager closes the file even when the search below raises.
    with h5py.File(fn, 'w') as f:
        f.attrs['distance'] = distance
        print('train size: %9d * %4d' % train.shape)
        print('test size: %9d * %4d' % test.shape)

        f.create_dataset('train', (len(train), len(train[0])),
                         dtype=train.dtype)[:] = train
        f.create_dataset('test', (len(test), len(test[0])),
                         dtype=test.dtype)[:] = test
        neighbors = f.create_dataset('neighbors', (len(test), count),
                                     dtype='i')
        distances = f.create_dataset('distances', (len(test), count),
                                     dtype='f')

        bf = BruteForceBLAS(distance, precision=numpy.float32)
        bf.fit(train)

        for i, x in enumerate(test):
            if i % 1000 == 0:
                print('%d/%d...' % (i, test.shape[0]))
            # Each result is an (index, distance) pair; order by distance.
            res = sorted(bf.query_with_distances(x, count),
                         key=lambda t: t[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
def write_output(train, test, fn, distance, count=3000):
    """Write a dataset and its brute-force ground truth to HDF5.

    The file receives the ``distance`` attribute, the datasets ``train`` and
    ``test`` holding the input arrays, and the datasets ``neighbors`` /
    ``distances`` holding, for every test point, the ``count`` results of a
    float32 brute-force search in the reverse of the order in which the
    search returned them.

    Args:
        train: 2-D numpy array of training points.
        test: 2-D numpy array of query points.
        fn: path of the HDF5 file to create (opened in ``'w'`` mode).
        distance: metric name; stored as an attribute and passed to
            ``BruteForceBLAS``.
        count: number of neighbours recorded per query.
    """
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS

    # The context manager closes the file even when the search below raises.
    with h5py.File(fn, 'w') as f:
        f.attrs['distance'] = distance
        print('train size: %9d * %4d' % train.shape)
        print('test size: %9d * %4d' % test.shape)

        f.create_dataset('train', (len(train), len(train[0])),
                         dtype=train.dtype)[:] = train
        f.create_dataset('test', (len(test), len(test[0])),
                         dtype=test.dtype)[:] = test
        neighbors = f.create_dataset('neighbors', (len(test), count),
                                     dtype='i')
        distances = f.create_dataset('distances', (len(test), count),
                                     dtype='f')

        bf = BruteForceBLAS(distance, precision=numpy.float32)
        bf.fit(train)

        for i, x in enumerate(test):
            if i % 1000 == 0:
                print('%d/%d...' % (i, test.shape[0]))
            # NOTE(review): the results are reversed rather than sorted;
            # presumably this relies on query_with_distances returning them
            # in descending distance order -- confirm against BruteForceBLAS.
            res = list(bf.query_with_distances(x, count))[::-1]
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]