def fisher(gmm_npy, v, include='mu'):
    """Run yael.gmm_fisher on the rows of v for the given GMM.

    gmm_npy is converted with _numpy_to_gmm. v must pass
    _check_row_float32 and have as many columns as the GMM has
    dimensions (gmm.d).

    include selects the yael.GMM_FLAGS_* bits: each of 'mu', 'sigma' and
    'w' that is found in include (tested with `in`, so a plain string is
    matched by substring) enables the corresponding flag.

    Returns a 1-D float32 array whose length is yael.gmm_fisher_sizeof.
    """
    _check_row_float32(v)
    n, d = v.shape
    gmm = _numpy_to_gmm(gmm_npy)
    assert d == gmm.d

    # Flag attributes are looked up lazily, only for the requested parts.
    selectable = (
        ('mu', 'GMM_FLAGS_MU'),
        ('sigma', 'GMM_FLAGS_SIGMA'),
        ('w', 'GMM_FLAGS_W'),
    )
    flags = 0
    for part, flag_name in selectable:
        if part in include:
            flags |= getattr(yael, flag_name)

    d_fisher = yael.gmm_fisher_sizeof(gmm, flags)
    fisher_out = numpy.zeros(d_fisher, dtype=numpy.float32)
    yael.gmm_fisher(
        n,
        yael.numpy_to_fvec_ref(v),
        gmm,
        flags,
        yael.numpy_to_fvec_ref(fisher_out),
    )
    return fisher_out
def fisher(gmm_npy, v, include='mu'):
    """Fill and return the output vector of yael.gmm_fisher for v.

    v is validated with _check_row_float32 and read as an (n, d) matrix;
    d must equal the dimension of the GMM built from gmm_npy by
    _numpy_to_gmm.

    The flags passed to yael are the OR of GMM_FLAGS_MU, GMM_FLAGS_SIGMA
    and GMM_FLAGS_W for whichever of 'mu', 'sigma', 'w' satisfy
    `name in include`.

    The result is a float32 vector sized by yael.gmm_fisher_sizeof.
    """
    _check_row_float32(v)
    n_points, dim = v.shape
    model = _numpy_to_gmm(gmm_npy)
    assert dim == model.d

    mask = 0
    if 'mu' in include:
        mask = mask | yael.GMM_FLAGS_MU
    if 'sigma' in include:
        mask = mask | yael.GMM_FLAGS_SIGMA
    if 'w' in include:
        mask = mask | yael.GMM_FLAGS_W

    out_size = yael.gmm_fisher_sizeof(model, mask)
    descriptor = numpy.zeros(out_size, dtype=numpy.float32)

    points_ref = yael.numpy_to_fvec_ref(v)
    descriptor_ref = yael.numpy_to_fvec_ref(descriptor)
    yael.gmm_fisher(n_points, points_ref, model, mask, descriptor_ref)
    return descriptor
def extract_lines(a, indices):
    """Return a[indices, :] from a matrix a (this operation is slow in numpy).

    a must pass _check_row_float32 and indices _check_row_int32. When
    indices is non-empty, every index must lie in [0, a.shape[0]).

    Returns a new float32 array of shape (indices.size, a.shape[1]).
    """
    _check_row_float32(a)
    _check_row_int32(indices)
    n, d = a.shape
    n_selected = indices.size

    # min()/max() are only meaningful (and only evaluated) when there is
    # at least one index.
    if n_selected != 0:
        assert indices.min() >= 0 and indices.max() < n

    out = numpy.empty((n_selected, d), dtype=numpy.float32)
    yael.fmat_get_columns(
        yael.numpy_to_fvec_ref(a),
        d,
        n_selected,
        yael.numpy_to_ivec_ref(indices),
        yael.numpy_to_fvec_ref(out),
    )
    return out
def extract_lines(a, indices):
    """Return a[indices, :] from a matrix a (this operation is slow in numpy).

    Both arguments are validated (_check_row_float32 for a,
    _check_row_int32 for indices); a non-empty indices array must stay
    within the rows of a. The selection is written by
    yael.fmat_get_columns into a freshly allocated float32 array.
    """
    _check_row_float32(a)
    _check_row_int32(indices)
    n_rows, row_len = a.shape
    assert indices.size == 0 or (
        indices.min() >= 0 and indices.max() < n_rows)

    picked = numpy.empty((indices.size, row_len), dtype=numpy.float32)

    src_ref = yael.numpy_to_fvec_ref(a)
    idx_ref = yael.numpy_to_ivec_ref(indices)
    dst_ref = yael.numpy_to_fvec_ref(picked)
    yael.fmat_get_columns(src_ref, row_len, indices.size, idx_ref, dst_ref)
    return picked
def kmeans(v, k, distance_type=2, nt=1, niter=30, seed=0, redo=1,
           verbose=True, normalize=False, init='random',
           output='centroids'):
    """Run yael.kmeans on the rows of v with k centroids.

    v must pass _check_row_float32; its shape is read as (n, d).

    The flags word handed to yael starts from nt and is OR-ed with:
      * yael.KMEANS_QUIET when verbose is false;
      * yael.KMEANS_L1 for distance_type 1, yael.KMEANS_CHI2 for 3
        (2 is the default and adds nothing; other values add nothing);
      * yael.KMEANS_INIT_RANDOM for init 'random' (also the default),
        yael.KMEANS_INIT_BERKELEY for 'kmeans++' (other values add
        nothing);
      * yael.KMEANS_NORMALIZE_CENTS when normalize is true.

    Returns the (k, d) float32 centroid array when output is
    'centroids'; otherwise the tuple
    (centroids, qerr, dis, assign, nassign), where qerr is the value
    returned by yael.kmeans and dis/assign have length n and nassign
    length k.

    Raises RuntimeError when yael.kmeans returns a negative value.
    """
    _check_row_float32(v)
    n, d = v.shape

    # Output buffers filled in by yael.kmeans.
    centroids = numpy.zeros((k, d), dtype=numpy.float32)
    dis = numpy.empty(n, dtype=numpy.float32)
    assign = numpy.empty(n, dtype=numpy.int32)
    nassign = numpy.empty(k, dtype=numpy.int32)

    flags = nt
    if not verbose:
        flags |= yael.KMEANS_QUIET

    # distance_type == 2 is the default and needs no extra flag.
    if distance_type != 2:
        if distance_type == 1:
            flags |= yael.KMEANS_L1
        elif distance_type == 3:
            flags |= yael.KMEANS_CHI2

    if init == 'random':
        flags |= yael.KMEANS_INIT_RANDOM  # also default
    elif init == 'kmeans++':
        flags |= yael.KMEANS_INIT_BERKELEY

    if normalize:
        flags |= yael.KMEANS_NORMALIZE_CENTS

    qerr = yael.kmeans(
        d, n, k, niter,
        yael.numpy_to_fvec_ref(v),
        flags, seed, redo,
        yael.numpy_to_fvec_ref(centroids),
        yael.numpy_to_fvec_ref(dis),
        yael.numpy_to_ivec_ref(assign),
        yael.numpy_to_ivec_ref(nassign),
    )
    if qerr < 0:
        raise RuntimeError(
            "kmeans: clustering failed. Is dataset diverse enough?")

    if output == 'centroids':
        return centroids
    return (centroids, qerr, dis, assign, nassign)
def vlad(centroids, v):
    """Call yael.vlad_compute for the rows of v against centroids.

    Both v (n, d) and centroids (k, d) must pass _check_row_float32 and
    share the same number of columns.

    Returns a (k, d) float32 array, zero-initialised and then filled by
    yael.vlad_compute.
    """
    _check_row_float32(v)
    n_points, dim = v.shape
    _check_row_float32(centroids)
    n_centroids, centroid_dim = centroids.shape
    assert centroid_dim == dim

    result = numpy.zeros((n_centroids, dim), dtype=numpy.float32)
    yael.vlad_compute(
        n_centroids,
        dim,
        yael.numpy_to_fvec_ref(centroids),
        n_points,
        yael.numpy_to_fvec_ref(v),
        yael.numpy_to_fvec_ref(result),
    )
    return result
def vlad(centroids, v):
    """Return the (k, d) float32 output of yael.vlad_compute.

    v is an (n, d) matrix and centroids a (k, d) matrix; each is
    validated with _check_row_float32 and their column counts must
    match.
    """
    _check_row_float32(v)
    n, d = v.shape
    _check_row_float32(centroids)
    k, d2 = centroids.shape
    assert d2 == d

    vlad_out = numpy.zeros((k, d), dtype=numpy.float32)

    centroids_ref = yael.numpy_to_fvec_ref(centroids)
    points_ref = yael.numpy_to_fvec_ref(v)
    out_ref = yael.numpy_to_fvec_ref(vlad_out)
    yael.vlad_compute(k, d, centroids_ref, n, points_ref, out_ref)
    return vlad_out
def extract_rows_cols(K, subset_rows, subset_cols):
    """Return K[numpy.ix_(subset_rows, subset_cols)] (also slow in pure numpy).

    K must pass _check_row_float32; subset_rows and subset_cols must pass
    _check_row_int32. Non-empty subsets must index inside K
    (rows in [0, K.shape[0]), columns in [0, K.shape[1])).

    Returns a new float32 array of shape
    (subset_rows.size, subset_cols.size). If either subset is empty the
    (empty) result is returned without calling into yael.
    """
    _check_row_float32(K)
    _check_row_int32(subset_rows)
    _check_row_int32(subset_cols)
    nr = subset_rows.size
    nc = subset_cols.size

    # ndarray.min()/max() raise ValueError on zero-size arrays, so the
    # bounds are only checked for non-empty subsets (same convention as
    # extract_lines).
    assert nr == 0 or (
        subset_rows.min() >= 0 and subset_rows.max() < K.shape[0])
    assert nc == 0 or (
        subset_cols.min() >= 0 and subset_cols.max() < K.shape[1])

    Ksub = numpy.empty((nr, nc), dtype=numpy.float32)
    if nr == 0 or nc == 0:
        # Nothing to copy: the result has no elements.
        return Ksub

    # NOTE(review): K.shape[0] is passed as the matrix dimension; this
    # equals the row length only when K is square -- confirm behaviour
    # for rectangular K against the yael signature.
    yael.fmat_get_rows_cols(
        yael.numpy_to_fvec_ref(K),
        K.shape[0],
        nc, yael.numpy_to_ivec_ref(subset_cols),
        nr, yael.numpy_to_ivec_ref(subset_rows),
        yael.numpy_to_fvec_ref(Ksub),
    )
    return Ksub
def extract_rows_cols(K, subset_rows, subset_cols):
    """Return K[numpy.ix_(subset_rows, subset_cols)] (also slow in pure numpy).

    K must pass _check_row_float32; subset_rows and subset_cols must pass
    _check_row_int32. Non-empty subsets must index inside K
    (rows in [0, K.shape[0]), columns in [0, K.shape[1])).

    Returns a new float32 array of shape
    (subset_rows.size, subset_cols.size). If either subset is empty the
    (empty) result is returned without calling into yael.
    """
    _check_row_float32(K)
    _check_row_int32(subset_rows)
    _check_row_int32(subset_cols)
    nr = subset_rows.size
    nc = subset_cols.size

    # ndarray.min()/max() raise ValueError on zero-size arrays, so the
    # bounds are only checked for non-empty subsets (same convention as
    # extract_lines).
    assert nr == 0 or (
        subset_rows.min() >= 0 and subset_rows.max() < K.shape[0])
    assert nc == 0 or (
        subset_cols.min() >= 0 and subset_cols.max() < K.shape[1])

    Ksub = numpy.empty((nr, nc), dtype=numpy.float32)
    if nr == 0 or nc == 0:
        # Nothing to copy: the result has no elements.
        return Ksub

    # NOTE(review): K.shape[0] is passed as the matrix dimension; this
    # equals the row length only when K is square -- confirm behaviour
    # for rectangular K against the yael signature.
    yael.fmat_get_rows_cols(
        yael.numpy_to_fvec_ref(K),
        K.shape[0],
        nc, yael.numpy_to_ivec_ref(subset_cols),
        nr, yael.numpy_to_ivec_ref(subset_rows),
        yael.numpy_to_fvec_ref(Ksub),
    )
    return Ksub
def cross_distances(a, b, distance_type=12):
    """Fill a distance table between the rows of a and the rows of b.

    a (na, d) and b (nb, d) must pass _check_row_float32 and have the
    same number of columns. distance_type is forwarded unchanged to
    yael.compute_cross_distances_alt_nonpacked.

    Returns a float32 array of shape (nb, na).
    """
    _check_row_float32(a)
    n_a, dim = a.shape
    _check_row_float32(b)
    n_b, dim_b = b.shape
    assert dim_b == dim

    table = numpy.empty((n_b, n_a), dtype=numpy.float32)
    yael.compute_cross_distances_alt_nonpacked(
        distance_type, dim, n_a, n_b,
        yael.numpy_to_fvec_ref(a), dim,
        yael.numpy_to_fvec_ref(b), dim,
        yael.numpy_to_fvec_ref(table), n_a,
    )
    return table
def knn(queries, base, nnn=1, distance_type=2, nt=1):
    """Find the nnn nearest rows of base for each row of queries.

    base (n, d) and queries (nq, d) must pass _check_row_float32 and
    have the same number of columns. distance_type and nt are forwarded
    unchanged to yael.knn_full_thread (nt is presumably the thread
    count -- confirm against yael).

    Returns (idx, dis): an int32 array and a float32 array, both of
    shape (nq, nnn), filled by yael.knn_full_thread.

    Raises AssertionError when the column counts differ.
    """
    _check_row_float32(base)
    _check_row_float32(queries)
    n, d = base.shape
    nq, d2 = queries.shape
    # d and d2 are shape[1], i.e. column counts of the numpy arrays.
    assert d == d2, (
        "base and queries must have same nb of columns (got %d != %d) "
        % (d, d2))

    idx = numpy.empty((nq, nnn), dtype=numpy.int32)
    dis = numpy.empty((nq, nnn), dtype=numpy.float32)
    yael.knn_full_thread(
        distance_type,
        nq, n, d, nnn,
        yael.numpy_to_fvec_ref(base),
        yael.numpy_to_fvec_ref(queries),
        None,
        yael.numpy_to_ivec_ref(idx),
        yael.numpy_to_fvec_ref(dis),
        nt,
    )
    return idx, dis
def _numpy_to_gmm(gmm_npy):
    """Produce a fake yael gmm_t from 3 numpy arrays (w, mu, sigma).

    gmm_npy is a 3-item sequence (w, mu, sigma). mu and sigma must pass
    _check_row_float32 and have the same (k, d) shape; w must have shape
    (k,).

    The returned gmm_t only references the numpy buffers, so the arrays
    should not be deallocated while the gmm is in use.
    """
    # Unpack in the body: tuple parameters in a signature are not valid
    # syntax on Python 3 (PEP 3113). Callers still pass a single tuple.
    w, mu, sigma = gmm_npy

    _check_row_float32(mu)
    _check_row_float32(sigma)
    k, d = mu.shape
    assert sigma.shape == mu.shape
    assert w.shape == (k,)

    gmm = yael.gmm_t()
    gmm.d = d
    gmm.k = k
    gmm.w = yael.numpy_to_fvec_ref(w)
    gmm.mu = yael.numpy_to_fvec_ref(mu)
    gmm.sigma = yael.numpy_to_fvec_ref(sigma)
    # Presumably replaces the destructor so the borrowed numpy buffers
    # are not freed with the gmm -- confirm against _gmm_del.
    gmm.__del__ = _gmm_del
    return gmm
def partial_pca(mat, nev=6, nt=1):
    """Compute nev PCA components of the rows of mat via yael.

    mat (n, d) must pass _check_row_float32. The per-column mean is
    subtracted before yael.fmat_new_pca_part is called. nt is accepted
    for interface compatibility but is not used in this function.

    Returns (avg, singvals, pcamat):
      avg      -- the column means of the input, mat.mean(axis=0);
      singvals -- float32 array of length nev filled by yael;
      pcamat   -- the yael result converted to a numpy array of shape
                  (nev, d).

    Raises AssertionError when yael.fmat_new_pca_part returns None.
    """
    _check_row_float32(mat)
    n, d = mat.shape
    avg = mat.mean(axis=0)
    # Centre the data; this builds a new array and leaves the caller's
    # matrix untouched.
    mat = mat - avg[numpy.newaxis, :]

    singvals = numpy.empty(nev, dtype=numpy.float32)
    pcamat = yael.fmat_new_pca_part(
        d, n, nev,
        yael.numpy_to_fvec_ref(mat),
        yael.numpy_to_fvec_ref(singvals),
    )
    assert pcamat is not None

    # Presumably hands ownership of the yael-allocated buffer to the
    # wrapper object before conversion -- confirm against yael.
    pcamat = yael.fvec.acquirepointer(pcamat)
    pcamat = yael.fvec_to_numpy(pcamat, (nev, d))
    return avg, singvals, pcamat
def partial_pca(mat, nev=6, nt=1):
    """Compute nev PCA components of the rows of mat via yael.

    mat (n, d) must pass _check_row_float32. The per-column mean is
    subtracted before yael.fmat_new_pca_part is called. nt is accepted
    for interface compatibility but is not used in this function.

    Returns (avg, singvals, pcamat):
      avg      -- the column means of the input, mat.mean(axis=0);
      singvals -- float32 array of length nev filled by yael;
      pcamat   -- the yael result converted to a numpy array of shape
                  (nev, d).

    Raises AssertionError when yael.fmat_new_pca_part returns None.
    """
    _check_row_float32(mat)
    n, d = mat.shape
    avg = mat.mean(axis=0)
    # Centre the data; this builds a new array and leaves the caller's
    # matrix untouched.
    mat = mat - avg[numpy.newaxis, :]

    singvals = numpy.empty(nev, dtype=numpy.float32)
    pcamat = yael.fmat_new_pca_part(
        d, n, nev,
        yael.numpy_to_fvec_ref(mat),
        yael.numpy_to_fvec_ref(singvals),
    )
    assert pcamat is not None

    # Presumably hands ownership of the yael-allocated buffer to the
    # wrapper object before conversion -- confirm against yael.
    pcamat = yael.fvec.acquirepointer(pcamat)
    pcamat = yael.fvec_to_numpy(pcamat, (nev, d))
    return avg, singvals, pcamat
def knn(queries, base, nnn=1, distance_type=2, nt=1):
    """Find the nnn nearest rows of base for each row of queries.

    base (n, d) and queries (nq, d) must pass _check_row_float32 and
    have the same number of columns. distance_type and nt are forwarded
    unchanged to yael.knn_full_thread (nt is presumably the thread
    count -- confirm against yael).

    Returns (idx, dis): an int32 array and a float32 array, both of
    shape (nq, nnn), filled by yael.knn_full_thread.

    Raises AssertionError when the column counts differ.
    """
    _check_row_float32(base)
    _check_row_float32(queries)
    n, d = base.shape
    nq, d2 = queries.shape
    # d and d2 are shape[1], i.e. column counts of the numpy arrays.
    assert d == d2, (
        "base and queries must have same nb of columns (got %d != %d) "
        % (d, d2))

    idx = numpy.empty((nq, nnn), dtype=numpy.int32)
    dis = numpy.empty((nq, nnn), dtype=numpy.float32)
    yael.knn_full_thread(
        distance_type,
        nq, n, d, nnn,
        yael.numpy_to_fvec_ref(base),
        yael.numpy_to_fvec_ref(queries),
        None,
        yael.numpy_to_ivec_ref(idx),
        yael.numpy_to_fvec_ref(dis),
        nt,
    )
    return idx, dis
def gmm_learn_sw(v, sw, k, nt=1, niter=30, seed=0, redo=1, use_weights=True):
    """Learn a k-component GMM with yael.gmm_learn_sw and return it as numpy.

    v (n, d) must pass _check_row_float32. sw is handed to yael through
    numpy_to_fvec_ref without further checks here (presumably one
    weight per row of v -- confirm against yael). When use_weights is
    true, yael.GMM_FLAGS_W is set in the flags.

    The yael gmm is converted with _gmm_to_numpy, then released with
    yael.gmm_delete; the converted value is returned.
    """
    _check_row_float32(v)
    n_points, dim = v.shape

    flags = yael.GMM_FLAGS_W if use_weights else 0

    points_ref = yael.numpy_to_fvec_ref(v)
    weights_ref = yael.numpy_to_fvec_ref(sw)
    learned = yael.gmm_learn_sw(
        dim, n_points, k, niter,
        points_ref, weights_ref,
        nt, seed, redo, flags,
    )

    # Convert before freeing the yael-side structure.
    as_numpy = _gmm_to_numpy(learned)
    yael.gmm_delete(learned)
    return as_numpy
def fvecs_write(filename, matrix):
    """Write the rows of matrix to filename with yael.fvecs_write.

    matrix (n, d) must pass _check_row_float32.

    Raises IOError when yael.fvecs_write returns a value different from
    n (the number of rows).
    """
    _check_row_float32(matrix)
    n, d = matrix.shape
    ret = yael.fvecs_write(filename, d, n, yael.numpy_to_fvec_ref(matrix))
    if ret != n:
        # %-formatting keeps the message readable and also works when
        # filename is not a plain str.
        raise IOError("write error: %s" % (filename,))