def kmeans(v, k, distance_type=2, nt=1, niter=30, seed=0, redo=1,
           verbose=True, normalize=False, init='random',
           output='centroids'):
    """Run yael.kmeans on the rows of v and return the k centroids.

    Parameters:
      v              matrix accepted by _check_row_float32; its shape is
                     read as (n, d).
      k              number of centroids allocated and requested.
      distance_type  2 adds no flag (default), 1 sets yael.KMEANS_L1,
                     3 sets yael.KMEANS_CHI2; other values add no flag.
      nt             starting value of the flags word handed to yael
                     (presumably the thread count -- TODO confirm).
      niter, seed, redo
                     forwarded unchanged to yael.kmeans.
      verbose        when false, yael.KMEANS_QUIET is set.
      normalize      when true, yael.KMEANS_NORMALIZE_CENTS is set.
      init           'random' sets yael.KMEANS_INIT_RANDOM, 'kmeans++'
                     sets yael.KMEANS_INIT_BERKELEY; other values add
                     no flag.
      output         'centroids' returns only the centroid matrix; any
                     other value returns the full tuple described below.

    Returns the (k, d) float32 centroid matrix, or the tuple
    (centroids, qerr, dis, assign, nassign) where qerr is the value
    returned by yael.kmeans, dis is float32 of length n, assign is int32
    of length n and nassign is int32 of length k.

    Raises RuntimeError when yael.kmeans returns a negative value.
    """
    _check_row_float32(v)
    npoints, dim = v.shape

    # Output buffers, handed by reference to the yael call below.
    centers = numpy.zeros((k, dim), dtype=numpy.float32)
    point_dis = numpy.empty(npoints, dtype=numpy.float32)
    point_label = numpy.empty(npoints, dtype=numpy.int32)
    cluster_size = numpy.empty(k, dtype=numpy.int32)

    # nt seeds the flags word; option bits are OR-ed on top of it.
    flags = nt
    if not verbose:
        flags |= yael.KMEANS_QUIET

    if not (distance_type == 2):  # 2 is the default and needs no flag
        if distance_type == 1:
            flags |= yael.KMEANS_L1
        elif distance_type == 3:
            flags |= yael.KMEANS_CHI2

    if init == 'random':
        flags |= yael.KMEANS_INIT_RANDOM  # also default
    elif init == 'kmeans++':
        flags |= yael.KMEANS_INIT_BERKELEY

    if normalize:
        flags |= yael.KMEANS_NORMALIZE_CENTS

    qerr = yael.kmeans(
        dim, npoints, k, niter,
        yael.numpy_to_fvec_ref(v),
        flags, seed, redo,
        yael.numpy_to_fvec_ref(centers),
        yael.numpy_to_fvec_ref(point_dis),
        yael.numpy_to_ivec_ref(point_label),
        yael.numpy_to_ivec_ref(cluster_size),
    )

    if qerr < 0:
        raise RuntimeError(
            "kmeans: clustering failed. Is dataset diverse enough?")

    if output == 'centroids':
        return centers
    return (centers, qerr, point_dis, point_label, cluster_size)
def _kmeans_flags(nt, verbose, distance_type, init, normalize):
    """Assemble the flags word passed to yael.kmeans, starting from nt."""
    word = nt
    if not verbose:
        word |= yael.KMEANS_QUIET
    if distance_type == 2:
        pass  # default
    elif distance_type == 1:
        word |= yael.KMEANS_L1
    elif distance_type == 3:
        word |= yael.KMEANS_CHI2
    if init == 'random':
        word |= yael.KMEANS_INIT_RANDOM  # also default
    elif init == 'kmeans++':
        word |= yael.KMEANS_INIT_BERKELEY
    if normalize:
        word |= yael.KMEANS_NORMALIZE_CENTS
    return word


def kmeans(v, k, distance_type=2, nt=1, niter=30, seed=0, redo=1,
           verbose=True, normalize=False, init='random',
           output='centroids'):
    """Cluster the rows of v into k groups by delegating to yael.kmeans.

    v must pass _check_row_float32 and is read as an (n, d) matrix.
    The keyword options are translated into bits OR-ed onto nt:
    verbose=False -> KMEANS_QUIET, distance_type 1 -> KMEANS_L1,
    distance_type 3 -> KMEANS_CHI2 (2 is the default and sets nothing),
    init 'random' -> KMEANS_INIT_RANDOM, init 'kmeans++' ->
    KMEANS_INIT_BERKELEY, normalize=True -> KMEANS_NORMALIZE_CENTS.
    Unrecognised distance_type or init values set no bit.
    niter, seed and redo are forwarded to yael.kmeans untouched.

    With output == 'centroids' the (k, d) float32 centroid matrix is
    returned; otherwise the tuple (centroids, qerr, dis, assign, nassign)
    is returned, where qerr is yael.kmeans' return value and the three
    remaining arrays (float32[n], int32[n], int32[k]) are the buffers
    filled by the call.

    Raises RuntimeError if yael.kmeans reports a negative qerr.
    """
    _check_row_float32(v)
    n, d = v.shape

    # Result buffers; yael writes into them through the *_ref wrappers.
    centroid_buf = numpy.zeros((k, d), dtype=numpy.float32)
    dis_buf = numpy.empty(n, dtype=numpy.float32)
    assign_buf = numpy.empty(n, dtype=numpy.int32)
    nassign_buf = numpy.empty(k, dtype=numpy.int32)

    option_bits = _kmeans_flags(nt, verbose, distance_type, init, normalize)

    v_ref = yael.numpy_to_fvec_ref(v)
    centroid_ref = yael.numpy_to_fvec_ref(centroid_buf)
    dis_ref = yael.numpy_to_fvec_ref(dis_buf)
    assign_ref = yael.numpy_to_ivec_ref(assign_buf)
    nassign_ref = yael.numpy_to_ivec_ref(nassign_buf)

    qerr = yael.kmeans(d, n, k, niter, v_ref, option_bits, seed, redo,
                       centroid_ref, dis_ref, assign_ref, nassign_ref)
    if qerr < 0:
        raise RuntimeError(
            "kmeans: clustering failed. Is dataset diverse enough?")

    full_result = (centroid_buf, qerr, dis_buf, assign_buf, nassign_buf)
    return centroid_buf if output == 'centroids' else full_result
def extract_rows_cols(K, subset_rows, subset_cols):
    """Return K[numpy.ix_(subset_rows, subset_cols)] (slow in pure numpy).

    Parameters:
      K            matrix accepted by _check_row_float32.
      subset_rows  index array accepted by _check_row_int32; every entry
                   must lie in [0, K.shape[0]).
      subset_cols  index array accepted by _check_row_int32; every entry
                   must lie in [0, K.shape[1]).

    Returns a new float32 array of shape (subset_rows.size,
    subset_cols.size).  An empty row or column selection yields an empty
    result instead of failing inside min()/max().

    Raises AssertionError when an index is out of range.
    """
    _check_row_float32(K)
    _check_row_int32(subset_rows)
    _check_row_int32(subset_cols)
    nr = subset_rows.size
    nc = subset_cols.size

    # min()/max() raise ValueError on zero-size arrays, so only range-check
    # the selections that actually contain indices.
    assert nr == 0 or (subset_rows.min() >= 0
                       and subset_rows.max() < K.shape[0])
    assert nc == 0 or (subset_cols.min() >= 0
                       and subset_cols.max() < K.shape[1])

    Ksub = numpy.empty((nr, nc), dtype=numpy.float32)
    if nr == 0 or nc == 0:
        # Nothing to copy: the result has no elements.
        return Ksub

    # NOTE(review): K.shape[0] is passed as the matrix dimension and the
    # column subset precedes the row subset; confirm against the
    # fmat_get_rows_cols signature, in particular for non-square K.
    yael.fmat_get_rows_cols(yael.numpy_to_fvec_ref(K),
                            K.shape[0],
                            nc, yael.numpy_to_ivec_ref(subset_cols),
                            nr, yael.numpy_to_ivec_ref(subset_rows),
                            yael.numpy_to_fvec_ref(Ksub))
    return Ksub
def extract_rows_cols(K, subset_rows, subset_cols):
    """Return K[numpy.ix_(subset_rows, subset_cols)] (also slow in pure numpy).

    K is validated with _check_row_float32 and both index arrays with
    _check_row_int32.  Row indices must lie in [0, K.shape[0]) and column
    indices in [0, K.shape[1]); violations trigger an AssertionError.

    The result is a freshly allocated float32 array of shape
    (subset_rows.size, subset_cols.size).  If either selection is empty
    the (empty) result is returned directly: calling min()/max() on a
    zero-size index array would otherwise raise ValueError.
    """
    _check_row_float32(K)
    _check_row_int32(subset_rows)
    _check_row_int32(subset_cols)

    nr = subset_rows.size
    nc = subset_cols.size

    # Range checks are skipped for an empty selection because numpy's
    # min()/max() have no identity on zero-size arrays.
    if nr:
        assert subset_rows.min() >= 0 and subset_rows.max() < K.shape[0]
    if nc:
        assert subset_cols.min() >= 0 and subset_cols.max() < K.shape[1]

    Ksub = numpy.empty((nr, nc), dtype=numpy.float32)
    if not (nr and nc):
        # Zero rows or zero columns selected: no data to copy.
        return Ksub

    # NOTE(review): the dimension argument is K.shape[0] and columns are
    # listed before rows; verify this against fmat_get_rows_cols when K
    # is not square.
    yael.fmat_get_rows_cols(
        yael.numpy_to_fvec_ref(K),
        K.shape[0],
        nc, yael.numpy_to_ivec_ref(subset_cols),
        nr, yael.numpy_to_ivec_ref(subset_rows),
        yael.numpy_to_fvec_ref(Ksub),
    )
    return Ksub
def extract_lines(a, indices):
    """Return a[indices, :] as a new float32 matrix (slow in pure numpy).

    a is validated with _check_row_float32 and indices with
    _check_row_int32.  Unless indices is empty, every index must lie in
    [0, a.shape[0]); this is enforced with an assert.  The copy itself is
    delegated to yael.fmat_get_columns.
    """
    _check_row_float32(a)
    _check_row_int32(indices)

    nrows, ncols = a.shape
    nsel = indices.size

    # An empty selection has no min()/max(), so only check non-empty ones.
    if nsel != 0:
        assert indices.min() >= 0 and indices.max() < nrows

    selected = numpy.empty((nsel, ncols), dtype=numpy.float32)
    # Rows of `a` are fetched through the "columns" routine -- presumably
    # because yael reads the buffer column-major (TODO confirm).
    yael.fmat_get_columns(
        yael.numpy_to_fvec_ref(a),
        ncols,
        nsel,
        yael.numpy_to_ivec_ref(indices),
        yael.numpy_to_fvec_ref(selected),
    )
    return selected
def extract_lines(a, indices):
    """Gather the rows a[indices, :] into a fresh float32 array.

    Both arguments are validated first (_check_row_float32 for a,
    _check_row_int32 for indices).  A non-empty indices array must only
    contain values in [0, a.shape[0]), otherwise an AssertionError is
    raised.  The result has shape (indices.size, a.shape[1]) and is
    filled by yael.fmat_get_columns.
    """
    _check_row_float32(a)
    _check_row_int32(indices)

    n, d = a.shape
    # Empty selections are allowed; otherwise all indices must be in range.
    assert (indices.size == 0
            or (indices.min() >= 0 and indices.max() < n))

    count = indices.size
    gathered = numpy.empty((count, d), dtype=numpy.float32)

    src_ref = yael.numpy_to_fvec_ref(a)
    idx_ref = yael.numpy_to_ivec_ref(indices)
    dst_ref = yael.numpy_to_fvec_ref(gathered)
    yael.fmat_get_columns(src_ref, d, count, idx_ref, dst_ref)

    return gathered
def knn(queries, base, nnn=1, distance_type=2, nt=1):
    """Find, for each row of queries, its nnn nearest rows of base.

    Parameters:
      queries        matrix accepted by _check_row_float32, read as
                     (nq, d).
      base           matrix accepted by _check_row_float32, read as
                     (n, d).
      nnn            number of neighbours requested per query.
      distance_type  forwarded unchanged as the first argument of
                     yael.knn_full_thread.
      nt             forwarded unchanged as the last argument of
                     yael.knn_full_thread (presumably the thread count
                     -- TODO confirm).

    Returns (idx, dis): an int32 array and a float32 array, both of shape
    (nq, nnn), filled by yael.knn_full_thread.

    Raises AssertionError when base and queries differ in their number of
    columns.
    """
    _check_row_float32(base)
    _check_row_float32(queries)
    n, d = base.shape
    nq, d2 = queries.shape
    # d and d2 are shape[1], i.e. column counts; the message says so.
    assert d == d2, (
        "base and queries must have the same number of columns "
        "(got %d != %d)" % (d, d2))

    idx = numpy.empty((nq, nnn), dtype=numpy.int32)
    dis = numpy.empty((nq, nnn), dtype=numpy.float32)

    yael.knn_full_thread(distance_type,
                         nq, n, d, nnn,
                         yael.numpy_to_fvec_ref(base),
                         yael.numpy_to_fvec_ref(queries),
                         None,
                         yael.numpy_to_ivec_ref(idx),
                         yael.numpy_to_fvec_ref(dis),
                         nt)
    return idx, dis
def knn(queries, base, nnn=1, distance_type=2, nt=1):
    """Nearest-neighbour search of queries against base via yael.

    Both matrices are validated with _check_row_float32.  base is read as
    (n, d) and queries as (nq, d); the two must have the same number of
    columns, otherwise an AssertionError is raised.

    distance_type and nt are passed through to yael.knn_full_thread as
    its first and last arguments respectively; their meaning is defined
    by yael (nt is presumably the thread count -- TODO confirm).

    Returns a pair (idx, dis) of arrays of shape (nq, nnn): idx is int32
    and dis is float32.  Both are allocated here and filled by the yael
    call.
    """
    _check_row_float32(base)
    _check_row_float32(queries)

    n, d = base.shape
    nq, d2 = queries.shape
    # The comparison is on shape[1] (columns), so the message names columns.
    assert d == d2, (
        "base and queries must have the same number of columns "
        "(got %d != %d)" % (d, d2))

    result_shape = (nq, nnn)
    idx = numpy.empty(result_shape, dtype=numpy.int32)
    dis = numpy.empty(result_shape, dtype=numpy.float32)

    yael.knn_full_thread(
        distance_type,
        nq, n, d, nnn,
        yael.numpy_to_fvec_ref(base),
        yael.numpy_to_fvec_ref(queries),
        None,
        yael.numpy_to_ivec_ref(idx),
        yael.numpy_to_fvec_ref(dis),
        nt,
    )
    return idx, dis