def test_encode_options():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='ordinal').fit(X)
    Xt_1 = est.transform(X)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot-dense').fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=False)
                       .fit_transform(Xt_1), Xt_2)
    assert_raise_message(ValueError, "inverse_transform only supports "
                         "'encode = ordinal'. Got encode='onehot-dense' "
                         "instead.", est.inverse_transform, Xt_2)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot').fit(X)
    Xt_3 = est.transform(X)
    assert sp.issparse(Xt_3)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=True)
                       .fit_transform(Xt_1).toarray(),
                       Xt_3.toarray())
    assert_raise_message(ValueError, "inverse_transform only supports "
                         "'encode = ordinal'. Got encode='onehot' "
                         "instead.", est.inverse_transform, Xt_2)
def compute_dr_wrt(self, wrt):
    result = ProjectPoints.compute_dr_wrt(self, wrt)
    if result is None:
        return None

    if sp.issparse(result):
        drz = self.z_coords.dr_wrt(wrt).tocoo()
        result = result.tocoo()
        # map 2-D (x, y) row indices onto 3-D (x, y, z) rows (integer division)
        result.row = result.row * 3 // 2
        IS = np.concatenate((result.row, drz.row * 3 + 2))
        JS = np.concatenate((result.col, drz.col))
        data = np.concatenate((result.data, drz.data))
        result = sp.csc_matrix((data, (IS, JS)),
                               shape=(self.v.r.size, wrt.r.size))
    else:
        try:
            bigger = np.zeros((result.shape[0] // 2, 3, result.shape[1]))
            bigger[:, :2, :] = result.reshape((-1, 2, result.shape[-1]))
            drz = self.z_coords.dr_wrt(wrt)
            if drz is not None:
                if sp.issparse(drz):
                    drz = drz.todense()
                bigger[:, 2, :] = drz.reshape(bigger[:, 2, :].shape)

            result = bigger.reshape((-1, bigger.shape[-1]))
        except Exception:
            import pdb; pdb.set_trace()
    return result
def __call__(self, e1, e2=None, axis=1): """ Method for calculating distances. :param e1: input data instances :type e1: :class:`Orange.data.Table` or :class:`Orange.data.RowInstance` or :class:`numpy.ndarray` :param e2: optional second argument for data instances if provided, distances between each pair, where first item is from e1 and second is from e2, are calculated :type e2: :class:`Orange.data.Table` or :class:`Orange.data.RowInstance` or :class:`numpy.ndarray` :param axis: if axis=1 we calculate distances between rows, if axis=0 we calculate distances between columns :type axis: int :return: the matrix with distances between given examples :rtype: :class:`Orange.misc.DistMatrix` """ x1 = _orange_to_numpy(e1) x2 = _orange_to_numpy(e2) if axis == 0: x1 = x1.T if x2 is not None: x2 = x2.T if not sparse.issparse(x1): x1 = np.atleast_2d(x1) if e2 is not None and not sparse.issparse(x2): x2 = np.atleast_2d(x2) dist = skl_metrics.pairwise.pairwise_distances(x1, x2, metric=self.metric) if isinstance(e1, data.Table) or isinstance(e1, data.RowInstance): dist = DistMatrix(dist, e1, e2) else: dist = DistMatrix(dist) return dist
def kmeans(X, centres, delta=.001, maxiter=10, metric="euclidean", p=2,
           verbose=0):
    if not issparse(X):
        X = numpy.asanyarray(X)
    centres = centres.todense() if issparse(centres) else centres.copy()
    N, dim = X.shape
    k, cdim = centres.shape
    if dim != cdim:
        raise ValueError("kmeans: X %s and centres %s must have the same "
                         "number of columns" % (X.shape, centres.shape))
    if verbose:
        print("kmeans: X %s centres %s delta=%.2g maxiter=%d metric=%s"
              % (X.shape, centres.shape, delta, maxiter, metric))
    allx = numpy.arange(N)
    prevdist = 0
    jiter = None
    xtoc = None
    distances = None
    for jiter in range(1, maxiter + 1):
        D = cdist_sparse(X, centres, metric=metric, p=p)  # |X| x |centres|
        xtoc = D.argmin(axis=1)  # X -> nearest centre
        distances = D[allx, xtoc]
        avdist = distances.mean()  # median ?
        if verbose >= 2:
            print("kmeans: av |X - nearest centre| = %.4g" % avdist)
        if (1 - delta) * prevdist <= avdist <= prevdist or jiter == maxiter:
            break
        prevdist = avdist
        for jc in range(k):  # (1 pass in C)
            c = numpy.where(xtoc == jc)[0]
            if len(c) > 0:
                centres[jc] = X[c].mean(axis=0)
    if verbose:
        print("kmeans: %d iterations cluster sizes:" % jiter,
              numpy.bincount(xtoc))
    return centres, xtoc, distances
def as_float_array(X, copy=True, force_all_finite=True):
    """Converts an array-like to an array of floats

    The new dtype will be np.float32 or np.float64, depending on the original
    type. The function can create a copy or modify the argument depending
    on the argument copy.

    Parameters
    ----------
    X : {array-like, sparse matrix}

    copy : bool, optional
        If True, a copy of X will be created. If False, a copy may still be
        returned if X's dtype is not a floating point type.

    Returns
    -------
    XT : {array, sparse matrix}
        An array of type np.float
    """
    if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray)
                                    and not sp.issparse(X)):
        return safe_asarray(X, dtype=np.float64, copy=copy,
                            force_all_finite=force_all_finite)
    elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:
        return X.copy() if copy else X
    elif X.dtype in [np.float32, np.float64]:  # is numpy array
        return X.copy('F' if X.flags['F_CONTIGUOUS'] else 'C') if copy else X
    else:
        return X.astype(np.float32 if X.dtype == np.int32 else np.float64)
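# Hypothetical usage sketch for as_float_array above (not from the original
# source); it assumes the function and its module-level imports (np, sp,
# safe_asarray) are available. Integer input is promoted to a float dtype,
# float input is returned unchanged when no copy is requested.
import numpy as np

X_int = np.arange(6, dtype=np.int32).reshape(2, 3)
assert as_float_array(X_int).dtype == np.float32      # int32 promotes to float32
X64 = np.ones((2, 2), dtype=np.float64)
assert as_float_array(X64, copy=False) is X64         # no copy requested, none made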
def coef_(self):
    if self.kernel != 'linear':
        raise ValueError('coef_ is only available when using a '
                         'linear kernel')

    if self.dual_coef_.shape[0] == 1:
        # binary classifier
        coef = -safe_sparse_dot(self.dual_coef_, self.support_vectors_)
    else:
        # 1vs1 classifier
        coef = _one_vs_one_coef(self.dual_coef_, self.n_support_,
                                self.support_vectors_)
        if sp.issparse(coef[0]):
            coef = sp.vstack(coef).tocsr()
        else:
            coef = np.vstack(coef)

    # coef_ being a read-only property it's better to mark the value as
    # immutable to avoid hiding potential bugs for the unsuspecting user
    if sp.issparse(coef):
        # sparse matrix do not have global flags
        coef.data.flags.writeable = False
    else:
        # regular dense array
        coef.flags.writeable = False
    return coef
def safe_sparse_dot(a, b):
    """Dot product that handles the sparse matrix case correctly."""
    from scipy import sparse
    if sparse.issparse(a) or sparse.issparse(b):
        return a * b
    else:
        return np.dot(a, b)
def chol_solve(U, b, out=None):
    if isinstance(U, np.ndarray):
        if sparse.issparse(b):
            b = b.toarray()

        # Allocate memory
        U = np.atleast_2d(U)
        B = np.atleast_1d(b)
        sh_u = U.shape[:-2]
        sh_b = B.shape[:-1]
        l_u = len(sh_u)
        l_b = len(sh_b)

        # Check which axis are iterated over with B along with U
        ind_b = [Ellipsis] * l_b
        l_min = min(l_u, l_b)
        jnd_b = tuple(i for i in range(-l_min, 0) if sh_b[i] == sh_u[i])

        if out is None:
            # Shape of the result (broadcasting rules)
            sh = utils.broadcasted_shape(sh_u, sh_b)
            # out = np.zeros(np.shape(B))
            out = np.zeros(sh + B.shape[-1:])
        for i in utils.nested_iterator(np.shape(U)[:-2]):

            # The goal is to run Cholesky solver once for all vectors of B
            # for which the matrices of U are the same (according to the
            # broadcasting rules). Thus, we collect all the axes of B for
            # which U is singleton and form them as a 2-D matrix and then
            # run the solver once.

            # Select those axes of B for which U and B are not singleton
            for j in jnd_b:
                ind_b[j] = i[j]

            # Collect all the axes for which U is singleton
            b = B[tuple(ind_b) + (Ellipsis,)]

            # Reshape it to a 2-D (or 1-D) array
            orig_shape = b.shape
            if b.ndim > 1:
                b = b.reshape((-1, b.shape[-1]))

            # Ellipsis to all preceding axes and ellipsis for the last
            # axis:
            if len(ind_b) < len(sh):
                ind_out = (Ellipsis,) + tuple(ind_b) + (Ellipsis,)
            else:
                ind_out = tuple(ind_b) + (Ellipsis,)

            out[ind_out] = linalg.cho_solve((U[i], False),
                                            b.T).T.reshape(orig_shape)

        return out

    elif isinstance(U, cholmod.Factor):
        if sparse.issparse(b):
            b = b.toarray()
        return U.solve_A(b)

    else:
        raise ValueError("Unknown type of Cholesky factor")
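# A minimal sketch (hypothetical data, not from the original source) of the
# identity used by the dense branch above: with an upper-triangular Cholesky
# factor U of A, cho_solve((U, False), b) reproduces a direct solve of A x = b.
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
A = rng.randn(4, 4)
A = A @ A.T + 4 * np.eye(4)              # symmetric positive definite
b = rng.randn(4)
U = linalg.cholesky(A)                   # upper triangular by default
assert np.allclose(linalg.cho_solve((U, False), b), np.linalg.solve(A, b))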
def inner(A, B):
    if sparse.issparse(A) and sparse.issparse(B):
        return (A * B.T)[0, 0]
    if not sparse.issparse(A) and not sparse.issparse(B):
        return np.inner(A, B)
    else:
        raise ValueError('sparsity of arguments is not consistent')
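# Hypothetical usage sketch for inner above (assumes the function and its
# module imports are in scope): both arguments must be sparse or both dense;
# mixing the two raises ValueError.
import numpy as np
import scipy.sparse as sp

u = np.array([1.0, 2.0, 3.0])
v = np.array([4.0, 5.0, 6.0])
assert inner(u, v) == np.inner(u, v) == 32.0
assert inner(sp.csr_matrix(u), sp.csr_matrix(v)) == 32.0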
def test_fetch_rcv1(): try: data1 = fetch_rcv1(shuffle=False, download_if_missing=False) except IOError as e: if e.errno == errno.ENOENT: raise SkipTest("Download RCV1 dataset to run this test.") X1, Y1 = data1.data, data1.target cat_list, s1 = data1.target_names.tolist(), data1.sample_id # test sparsity assert_true(sp.issparse(X1)) assert_true(sp.issparse(Y1)) assert_equal(60915113, X1.data.size) assert_equal(2606875, Y1.data.size) # test shapes assert_equal((804414, 47236), X1.shape) assert_equal((804414, 103), Y1.shape) assert_equal((804414,), s1.shape) assert_equal(103, len(cat_list)) # test ordering of categories first_categories = [u'C11', u'C12', u'C13', u'C14', u'C15', u'C151'] assert_array_equal(first_categories, cat_list[:6]) # test number of sample for some categories some_categories = ('GMIL', 'E143', 'CCAT') number_non_zero_in_cat = (5, 1206, 381327) for num, cat in zip(number_non_zero_in_cat, some_categories): j = cat_list.index(cat) assert_equal(num, Y1[:, j].data.size) # test shuffling and subset data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77, download_if_missing=False) X2, Y2 = data2.data, data2.target s2 = data2.sample_id # test return_X_y option fetch_func = partial(fetch_rcv1, shuffle=False, subset='train', download_if_missing=False) check_return_X_y(data2, fetch_func) # The first 23149 samples are the training samples assert_array_equal(np.sort(s1[:23149]), np.sort(s2)) # test some precise values some_sample_ids = (2286, 3274, 14042) for sample_id in some_sample_ids: idx1 = s1.tolist().index(sample_id) idx2 = s2.tolist().index(sample_id) feature_values_1 = X1[idx1, :].toarray() feature_values_2 = X2[idx2, :].toarray() assert_almost_equal(feature_values_1, feature_values_2) target_values_1 = Y1[idx1, :].toarray() target_values_2 = Y2[idx2, :].toarray() assert_almost_equal(target_values_1, target_values_2)
def check_svm_model_equal(dense_svm, sparse_svm, X_train, y_train, X_test): dense_svm.fit(X_train.toarray(), y_train) if sparse.isspmatrix(X_test): X_test_dense = X_test.toarray() else: X_test_dense = X_test sparse_svm.fit(X_train, y_train) assert sparse.issparse(sparse_svm.support_vectors_) assert sparse.issparse(sparse_svm.dual_coef_) assert_array_almost_equal(dense_svm.support_vectors_, sparse_svm.support_vectors_.toarray()) assert_array_almost_equal(dense_svm.dual_coef_, sparse_svm.dual_coef_.toarray()) if dense_svm.kernel == "linear": assert sparse.issparse(sparse_svm.coef_) assert_array_almost_equal(dense_svm.coef_, sparse_svm.coef_.toarray()) assert_array_almost_equal(dense_svm.support_, sparse_svm.support_) assert_array_almost_equal(dense_svm.predict(X_test_dense), sparse_svm.predict(X_test)) assert_array_almost_equal(dense_svm.decision_function(X_test_dense), sparse_svm.decision_function(X_test)) assert_array_almost_equal(dense_svm.decision_function(X_test_dense), sparse_svm.decision_function(X_test_dense)) if isinstance(dense_svm, svm.OneClassSVM): msg = "cannot use sparse input in 'OneClassSVM' trained on dense data" else: assert_array_almost_equal(dense_svm.predict_proba(X_test_dense), sparse_svm.predict_proba(X_test), 4) msg = "cannot use sparse input in 'SVC' trained on dense data" if sparse.isspmatrix(X_test): assert_raise_message(ValueError, msg, dense_svm.predict, X_test)
def solveLinear(A, b):
    """ Solve the linear equation Ax=b. Return tuple (x, time to solve). """
    error = np.inf  # just to be safe, initialize error variable here
    if sp.issparse(A):
        # print 'sparse', type(A)
        start_log_time = clock()
        result = slinalg.spsolve(A, b)
        solve_time = deltaT(start_log_time)
        error = linalg.norm((A * result.reshape(-1, 1) - b.reshape(-1, 1))[0])
        # For extensive comparison of methods refer to InversionComparison.txt
    else:
        # print 'not sparse, type', type(A)
        if sp.issparse(A):
            A = A.todense()
        # Regularize A
        # result = linalg.lstsq(A,b); result = result[0] # Extract just the
        # answer
        start_log_time = clock()
        result = linalg.solve(A, b)
        solve_time = deltaT(start_log_time)
        # use numpy matrix multiplication
        if isinstance(A, np.matrixlib.defmatrix.matrix):
            error = np.linalg.norm(
                (A * result.reshape(-1, 1) - b.reshape(-1, 1))[0])
        elif isinstance(A, np.ndarray):
            # use array multiplication
            error = np.linalg.norm(
                (np.dot(A, result.reshape(-1, 1)) - b.reshape(-1, 1))[0])
        else:
            print("Attempted to solve linear equation Ax=b in solveLinear() "
                  "of Tools.py with a non-numpy (array / matrix) type.")
            sys.exit(1)
    if error > RESEDUAL_THRESHOLD:
        print("||Ax-b|| = %0.1f" % error)
    return result.ravel(), solve_time
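# Hypothetical, self-contained sketch of the sparse branch above: solve a small
# sparse system with spsolve and check the residual norm the same way.
import numpy as np
import scipy.sparse as sp
from scipy.sparse import linalg as slinalg

A = sp.csr_matrix(np.array([[4.0, 1.0], [1.0, 3.0]]))
b = np.array([1.0, 2.0])
x = slinalg.spsolve(A, b)
assert np.linalg.norm(A @ x - b) < 1e-10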
def test_sgd_l1(self):
    """Test L1 regularization"""
    n = len(X4)
    rng = np.random.RandomState(13)
    idx = np.arange(n)
    rng.shuffle(idx)

    X = X4[idx, :]
    Y = Y4[idx]

    clf = self.factory(penalty="l1", alpha=0.2, fit_intercept=False,
                       n_iter=2000, shuffle=False)
    clf.fit(X, Y)
    assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,)))
    pred = clf.predict(X)
    assert_array_equal(pred, Y)

    # test sparsify with dense inputs
    clf.sparsify()
    assert_true(sp.issparse(clf.coef_))
    pred = clf.predict(X)
    assert_array_equal(pred, Y)

    # pickle and unpickle with sparse coef_
    clf = pickle.loads(pickle.dumps(clf))
    assert_true(sp.issparse(clf.coef_))
    pred = clf.predict(X)
    assert_array_equal(pred, Y)
def predict_presence_absence_evidences(self, X):
    X = check_array(X, accept_sparse="csr")

    absence_log_prob_ = np.log(1 - np.exp(self.feature_log_prob_))
    presence_log_ratios = self.feature_log_prob_[1] - self.feature_log_prob_[0]
    absence_log_ratios = absence_log_prob_[1] - absence_log_prob_[0]

    presence_neg_log_ratios = presence_log_ratios * (presence_log_ratios < 0)
    presence_pos_log_ratios = presence_log_ratios * (presence_log_ratios > 0)

    if issparse(X):
        p_neg_evi = X * presence_neg_log_ratios
        p_pos_evi = X * presence_pos_log_ratios
    else:
        p_neg_evi = np.dot(X, presence_neg_log_ratios)
        p_pos_evi = np.dot(X, presence_pos_log_ratios)

    absence_neg_log_ratios = absence_log_ratios * (absence_log_ratios < 0)
    absence_pos_log_ratios = absence_log_ratios * (absence_log_ratios > 0)

    default_a_neg_evi = absence_neg_log_ratios.sum()
    default_a_pos_evi = absence_pos_log_ratios.sum()

    if issparse(X):
        a_neg_evi = -(X * absence_neg_log_ratios) + default_a_neg_evi
        a_pos_evi = -(X * absence_pos_log_ratios) + default_a_pos_evi
    else:
        a_neg_evi = -np.dot(X, absence_neg_log_ratios) + default_a_neg_evi
        a_pos_evi = -np.dot(X, absence_pos_log_ratios) + default_a_pos_evi

    return p_neg_evi, p_pos_evi, a_neg_evi, a_pos_evi
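# A minimal numpy sketch (hypothetical values, not from the original source) of
# the decomposition used above: splitting the per-feature log ratios into
# negative and positive parts and dotting each with X recovers the full
# presence contribution.
import numpy as np

rng = np.random.RandomState(0)
log_ratios = rng.randn(5)                            # stand-in per-feature log ratios
X = rng.randint(0, 2, size=(3, 5)).astype(float)     # binary feature matrix
neg = log_ratios * (log_ratios < 0)
pos = log_ratios * (log_ratios > 0)
assert np.allclose(X @ neg + X @ pos, X @ log_ratios)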
def _generate_sample(self, X, nn_data, nn_num, row, col, step): """Generate a synthetic sample with an additional steps for the categorical features. Each new sample is generated the same way than in SMOTE. However, the categorical features are mapped to the most frequent nearest neighbors of the majority class. """ rng = check_random_state(self.random_state) sample = super(SMOTENC, self)._generate_sample(X, nn_data, nn_num, row, col, step) # To avoid conversion and since there is only few samples used, we # convert those samples to dense array. sample = (sample.toarray().squeeze() if sparse.issparse(sample) else sample) all_neighbors = nn_data[nn_num[row]] all_neighbors = (all_neighbors.toarray() if sparse.issparse(all_neighbors) else all_neighbors) categories_size = ([self.continuous_features_.size] + [cat.size for cat in self.ohe_.categories_]) for start_idx, end_idx in zip(np.cumsum(categories_size)[:-1], np.cumsum(categories_size)[1:]): col_max = all_neighbors[:, start_idx:end_idx].sum(axis=0) # tie breaking argmax col_sel = rng.choice(np.flatnonzero( np.isclose(col_max, col_max.max()))) sample[start_idx:end_idx] = 0 sample[start_idx + col_sel] = 1 return sparse.csr_matrix(sample) if sparse.issparse(X) else sample
def safe_sparse_dot(a, b, dense_output=False):
    """Dot product that handles the sparse matrix case correctly

    Uses BLAS GEMM as replacement for numpy.dot where possible
    to avoid unnecessary copies.

    Parameters
    ----------
    a : array or sparse matrix
    b : array or sparse matrix
    dense_output : boolean, default False
        When False, either ``a`` or ``b`` being sparse will yield sparse
        output. When True, output will always be an array.

    Returns
    -------
    dot_product : array or sparse matrix
        sparse if ``a`` or ``b`` is sparse and ``dense_output=False``.
    """
    if issparse(a) or issparse(b):
        ret = a * b
        if dense_output and hasattr(ret, "toarray"):
            ret = ret.toarray()
        return ret
    else:
        return np.dot(a, b)
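# Hypothetical usage sketch for safe_sparse_dot above: with two sparse operands
# the product stays sparse unless dense_output=True is requested.
import numpy as np
import scipy.sparse as sp

a = sp.csr_matrix(np.eye(3))
b = sp.csr_matrix(np.arange(9.0).reshape(3, 3))
assert sp.issparse(safe_sparse_dot(a, b))
dense = safe_sparse_dot(a, b, dense_output=True)
assert isinstance(dense, np.ndarray) and np.allclose(dense, b.toarray())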
def check_equal(x, y):
    """
    Returns True iff x[0] and y[0] are equal (checks the dtype and shape if x
    and y are numpy.ndarray instances). Used internally.
    """
    # I put the import here to allow using theano without scipy.
    import scipy.sparse as sp
    x, y = x[0], y[0]
    # TODO: bug in current scipy, two sparse matrices are never equal,
    # remove when moving to 0.7
    if sp.issparse(x):
        x = x.todense()
    if sp.issparse(y):
        y = y.todense()

    if isinstance(x, numpy.ndarray) and isinstance(y, numpy.ndarray):
        if (x.dtype != y.dtype or x.shape != y.shape
                or numpy.any(abs(x - y) > 1e-10)):
            raise Exception("Output mismatch.",
                            {'performlinker': x, 'clinker': y})
    else:
        if x != y:
            raise Exception("Output mismatch.",
                            {'performlinker': x, 'clinker': y})
def test_svc(): """Check that sparse SVC gives the same result as SVC""" clf = svm.SVC(kernel='linear', probability=True, random_state=0) clf.fit(X, Y) sp_clf = svm.SVC(kernel='linear', probability=True, random_state=0) sp_clf.fit(X_sp, Y) assert_array_equal(sp_clf.predict(T), true_result) assert_true(sparse.issparse(sp_clf.support_vectors_)) assert_array_almost_equal(clf.support_vectors_, sp_clf.support_vectors_.toarray()) assert_true(sparse.issparse(sp_clf.dual_coef_)) assert_array_almost_equal(clf.dual_coef_, sp_clf.dual_coef_.toarray()) assert_true(sparse.issparse(sp_clf.coef_)) assert_array_almost_equal(clf.coef_, sp_clf.coef_.toarray()) assert_array_almost_equal(clf.support_, sp_clf.support_) assert_array_almost_equal(clf.predict(T), sp_clf.predict(T)) # refit with a different dataset clf.fit(X2, Y2) sp_clf.fit(X2_sp, Y2) assert_array_almost_equal(clf.support_vectors_, sp_clf.support_vectors_.toarray()) assert_array_almost_equal(clf.dual_coef_, sp_clf.dual_coef_.toarray()) assert_array_almost_equal(clf.coef_, sp_clf.coef_.toarray()) assert_array_almost_equal(clf.support_, sp_clf.support_) assert_array_almost_equal(clf.predict(T2), sp_clf.predict(T2)) assert_array_almost_equal(clf.predict_proba(T2), sp_clf.predict_proba(T2), 4)
def getKM(self, X):
    """Returns the kernel matrix between the basis vectors and X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]

    Returns
    -------
    K : array, shape = [n_samples, n_bvectors]
        kernel matrix
    """
    X = array_tools.as_2d_array(X, True)
    test_X = X
    if sp.issparse(test_X):
        test_X = array_tools.spmat_resize(test_X, self.train_X.shape[1])
    else:
        test_X = array_tools.as_dense_matrix(test_X)
    gamma = self.gamma
    m = self.train_X.shape[0]
    n = test_X.shape[0]
    # The Gaussian kernel matrix is constructed from a linear kernel matrix
    linkm = self.train_X * test_X.T
    linkm = array_tools.as_dense_matrix(linkm)
    if sp.issparse(test_X):
        test_norms = ((test_X.T.multiply(test_X.T)).sum(axis=0)).T
    else:
        test_norms = (np.multiply(test_X.T, test_X.T).sum(axis=0)).T
    K = mat(np.ones((m, 1), dtype=float64)) * test_norms.T
    K = K + self.train_norms * mat(np.ones((1, n), dtype=float64))
    K = K - 2 * linkm
    K = -gamma * K
    K = np.exp(K)
    return K.A.T
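# A minimal numpy sketch (hypothetical data, independent of the class above) of
# the identity getKM exploits: ||x - z||^2 = ||x||^2 + ||z||^2 - 2<x, z>, so the
# Gaussian kernel can be assembled from a linear kernel matrix plus row norms.
import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.RandomState(0)
train, test, gamma = rng.randn(4, 3), rng.randn(2, 3), 0.5
linkm = train @ test.T
K = (np.sum(train ** 2, axis=1)[:, None]
     + np.sum(test ** 2, axis=1)[None, :] - 2 * linkm)
K = np.exp(-gamma * K)
assert np.allclose(K, np.exp(-gamma * cdist(train, test, 'sqeuclidean')))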
def _superdot(self, lhs, rhs): try: if lhs is None: return None if rhs is None: return None if isinstance(lhs, np.ndarray) and lhs.size==1: lhs = lhs.ravel()[0] if isinstance(rhs, np.ndarray) and rhs.size==1: rhs = rhs.ravel()[0] if isinstance(lhs, numbers.Number) or isinstance(rhs, numbers.Number): return lhs * rhs if isinstance(rhs, LinearOperator): return LinearOperator((lhs.shape[0], rhs.shape[1]), lambda x : lhs.dot(rhs.dot(x))) if isinstance(lhs, LinearOperator): if sp.issparse(rhs): return LinearOperator((lhs.shape[0], rhs.shape[1]), lambda x : lhs.dot(rhs.dot(x))) else: return lhs.dot(rhs) # TODO: Figure out how/whether to do this. #lhs, rhs = utils.convert_inputs_to_sparse_if_possible(lhs, rhs) if not sp.issparse(lhs) and sp.issparse(rhs): return rhs.T.dot(lhs.T).T return lhs.dot(rhs) except: import pdb; pdb.set_trace()
def fit(self, X, y=None): """Compute the centroids on X by chunking it into mini-batches. Parameters ---------- X: array-like, shape = [n_samples, n_features] Coordinates of the data points to cluster """ self.random_state = check_random_state(self.random_state) X = check_arrays(X, sparse_format="csr", copy=False)[0] warn_if_not_float(X, self) n_samples, n_features = X.shape if n_samples < self.k: raise ValueError("Number of samples smaller than number "\ "of clusters.") if hasattr(self.init, '__array__'): self.init = np.asarray(self.init) X_shuffled = shuffle(X, random_state=self.random_state) if sp.issparse(X_shuffled): x_squared_norms = _k_means.csr_row_norm_l2(X) else: x_squared_norms = np.sum(X ** 2.0, axis=1) self.cluster_centers_ = _init_centroids( X_shuffled, self.k, self.init, random_state=self.random_state, x_squared_norms=x_squared_norms) self.counts = np.zeros(self.k, dtype=np.int32) n_batches = int(np.ceil(float(n_samples) / self.chunk_size)) batch_slices = list(gen_even_slices(n_samples, n_batches)) n_iterations = xrange(int(self.max_iter * n_batches)) if sp.issparse(X_shuffled): _mini_batch_step = _mini_batch_step_sparse tol = self.tol else: _mini_batch_step = _mini_batch_step_dense tol = np.mean(np.var(X_shuffled, axis=0)) * self.tol for i, batch_slice in izip(n_iterations, cycle(batch_slices)): old_centers = self.cluster_centers_.copy() _mini_batch_step(X_shuffled, batch_slice, self.cluster_centers_, self.counts, x_squared_norms=x_squared_norms) if np.sum((old_centers - self.cluster_centers_) ** 2) < tol: if self.verbose: print 'Converged to similar centers at iteration', i break self.inertia_ = 0 self.labels_ = np.empty((n_samples,), dtype=np.int) for batch_slice in batch_slices: batch_inertia, batch_labels = _calculate_labels_inertia( X[batch_slice], self.cluster_centers_) self.inertia_ += batch_inertia self.labels_[batch_slice] = batch_labels return self
def test_pair_transformer(): """Test for PairTransformer.""" X = np.array([[0, 1], [2, 0], [2, 5]], dtype=np.float) tf = PairTransformer(element_transformer=FuncTransformer(lambda v: v + 1)) Xt = tf.fit_transform(X) assert_array_almost_equal(Xt, X + 1) X = np.array([[0, 1], [2, 0], [2, 5], [0, 1], [2, 0], [2, 5]], dtype=np.float) tf = PairTransformer(element_transformer=FuncTransformer(lambda v: v + 1), groupby=lambda r: r[0]) Xt = tf.fit_transform(X) assert_array_almost_equal(Xt, X + 1) X = np.array([[0, 1], [2, 3], [4, 5]], dtype=np.float) Xt = PairTransformer(element_transformer=MinMaxScaler()).fit_transform(X) assert_array_almost_equal(Xt, [[0, 0.2], [0.4, 0.6], [0.8, 1.0]]) X = np.array([[0, 1], [2, 3]], dtype=np.float) tf = PairTransformer(element_transformer=OneHotEncoder(sparse=True)) Xt = tf.fit_transform(X) assert sp.issparse(Xt) assert_array_almost_equal(Xt.todense(), [[1, 0, 0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 1]]) X = sp.csr_matrix(np.array([[0, 1], [2, 3]], dtype=np.float)) tf = PairTransformer(element_transformer=StandardScaler(with_mean=False)) Xt = tf.fit_transform(X) assert sp.issparse(Xt) assert_array_almost_equal(Xt.todense(), [[0, 0.89442719], [1.78885438, 2.68328157]])
def cdist_sparse(X, Y, **kwargs):
    """ -> |X| x |Y| cdist array, any cdist metric
    X or Y may be sparse -- best csr
    """
    # todense row at a time, v slow if both v sparse
    sxy = 2 * issparse(X) + issparse(Y)
    if sxy == 0:
        # both dense: delegate straight to cdist
        if kwargs["metric"] == "cosine":
            return 1 - cdist(X, Y, **kwargs)
        else:
            return cdist(X, Y, **kwargs)
    d = np.empty((X.shape[0], Y.shape[0]), np.float64)
    if sxy == 2:
        for j, x in enumerate(X):
            d[j] = cdist(x.todense(), Y, **kwargs)[0]
    elif sxy == 1:
        for k, y in enumerate(Y):
            d[:, k] = cdist(X, y.todense(), **kwargs)[0]
    else:
        for j, x in enumerate(X):
            for k, y in enumerate(Y):
                d[j, k] = cdist(x.todense(), y.todense(), **kwargs)[0]
    if kwargs["metric"] == "cosine":
        return 1 - d
    else:
        return d
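# Hypothetical usage sketch for cdist_sparse above (assumes the function and its
# imports are in scope): a sparse X against a dense Y matches plain cdist on the
# densified input.
import numpy as np
import scipy.sparse as sp
from scipy.spatial.distance import cdist

rng = np.random.RandomState(0)
X = sp.csr_matrix(rng.rand(5, 3))
Y = rng.rand(4, 3)
D = cdist_sparse(X, Y, metric="euclidean")
assert D.shape == (5, 4)
assert np.allclose(D, cdist(X.toarray(), Y, metric="euclidean"))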
def _assert_allclose_sparse(a, b, **kwargs):
    # helper function that can deal with sparse matrices
    if sparse.issparse(a):
        a = a.toarray()
    if sparse.issparse(b):
        b = b.toarray()
    assert_allclose(a, b, **kwargs)
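# Hypothetical usage sketch for the helper above (assumes numpy.testing's
# assert_allclose and scipy.sparse are imported in its module): any mix of
# sparse and dense inputs can be compared.
import numpy as np
import scipy.sparse as sp

M = np.arange(6.0).reshape(2, 3)
_assert_allclose_sparse(sp.csr_matrix(M), M)
_assert_allclose_sparse(M, sp.csr_matrix(M))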
def _compute_output(self, X): """Get the outputs of the network, for use in prediction methods.""" if not self._is_fitted: raise NotFittedError("Call fit before prediction") X = check_array(X, accept_sparse=['csr', 'dok', 'lil', 'csc', 'coo']) if self.is_sparse_: # For sparse input, make the input a CSR matrix since it can be # indexed by row. X = X.tocsr() if sp.issparse(X) else sp.csr_matrix(X) elif sp.issparse(X): # Convert sparse input to dense. X = X.todense().A # Make predictions in batches. pred_batches = [] start_idx = 0 n_examples = X.shape[0] with self.graph_.as_default(): while start_idx < n_examples: X_batch = \ X[start_idx:min(start_idx + self.batch_size, n_examples)] feed_dict = self._make_feed_dict(X_batch) start_idx += self.batch_size pred_batches.append( self._session.run(self.output_layer_, feed_dict=feed_dict)) y_pred = np.concatenate(pred_batches) return y_pred
def commit(self): self.warning(1) self.error(1) metric = METRICS[self.metric_idx] distances = None data = self.data if data is not None and issparse(data.X) and \ not metric.supports_sparse: data = None if data is not None: if isinstance(metric, distance.MahalanobisDistance): metric.fit(self.data, axis=1-self.axis) if not any(a.is_continuous for a in self.data.domain.attributes): self.error(1, "No continuous features") data = None elif any(a.is_discrete for a in self.data.domain.attributes) or \ (not issparse(self.data.X) and numpy.any(numpy.isnan(self.data.X))): data = distance._preprocess(self.data) if len(self.data.domain.attributes) - len(data.domain.attributes) > 0: self.warning(1, "Ignoring discrete features") else: data = self.data if data is not None: shape = (len(data), len(data.domain.attributes)) if numpy.product(shape) == 0: self.error(1, "Empty data (shape == {})".format(shape)) else: distances = metric(data, data, 1 - self.axis, impute=True) self.send("Distances", distances)
def test_SVC(): """Check that sparse SVC gives the same result as SVC""" clf = svm.SVC(kernel='linear').fit(X, Y) sp_clf = svm.SVC(kernel='linear').fit(X_sp, Y) assert_array_equal(sp_clf.predict(T), true_result) assert_true(sparse.issparse(sp_clf.support_vectors_)) assert_array_almost_equal(clf.support_vectors_, sp_clf.support_vectors_.todense()) assert_true(sparse.issparse(sp_clf.dual_coef_)) assert_array_almost_equal(clf.dual_coef_, sp_clf.dual_coef_.todense()) assert_true(sparse.issparse(sp_clf.coef_)) assert_array_almost_equal(clf.coef_, sp_clf.coef_.todense()) assert_array_almost_equal(clf.predict(T), sp_clf.predict(T)) # refit with a different dataset clf.fit(X2, Y2) sp_clf.fit(X2_sp, Y2) assert_array_almost_equal(clf.support_vectors_, sp_clf.support_vectors_.todense()) assert_array_almost_equal(clf.dual_coef_, sp_clf.dual_coef_.todense()) assert_array_almost_equal(clf.coef_, sp_clf.coef_.todense()) assert_array_almost_equal(clf.predict(T2), sp_clf.predict(T2))
def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                         res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                     X_res_both)
def encode(table, include_class=False): """ Return a tuple of (bool (one hot) ndarray, {col: (variable_index, value_index)} mapping) If the input table is sparse, a list of nonzero column indices per row (LIL rows) is returned instead of the one-hot ndarray. """ X, encoded, mapping = table.X, [], {} if issparse(X): encoded = X.tolil().rows.tolist() for i, var in enumerate(table.domain.attributes): mapping[i] = i, 0 else: for i, var in enumerate(table.domain.attributes): if not var.is_discrete: continue for j, val in enumerate(var.values): mapping[len(mapping)] = i, j encoded.append(X[:, i] == j) if include_class and table.domain.has_discrete_class: i, var = len(table.domain.attributes), table.domain.class_var for j, val in enumerate(var.values): mapping[len(mapping)] = i, j if issparse(X): for row in encoded: row.append(i + j) else: encoded.append(table.Y == j) if not issparse(X): encoded = np.column_stack(encoded) return encoded, mapping
def patch(data, rows, cols=None):
    """
    data = data matrix, 1D or 2D array (matrix)
    rows = iterator of rows (list) to select, None means selecting all rows
    cols = iterator of cols (list) to select, None means selecting all cols
    return np.array (of the patch shape), but the DIM of return should be
    the same as data (1D or 2D)
    if data is a sparse matrix, the returned matrix will be a dense np.array
    """
    if not sparse.issparse(data):
        data = np.asarray(data)
    dim = get_dim(data)
    if dim == 1:
        # ignore cols
        return data[rows] if rows is not None else data
    elif dim == 2:
        nrows, ncols = data.shape
        rows = rows if rows is not None else xrange(nrows)
        cols = cols if cols is not None else xrange(ncols)
        if sparse.issparse(data):
            return data.toarray()[np.ix_(rows, cols)]
        else:
            return data[np.ix_(rows, cols)]
    else:
        raise RuntimeError('only supports 1D or 2D array')
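# Hypothetical usage sketch for patch above (assumes the module's get_dim helper
# and imports are available): row/column selection behaves the same for dense
# and sparse inputs.
import numpy as np
import scipy.sparse as sp

M = np.arange(12).reshape(3, 4)
expected = M[np.ix_([0, 2], [1, 3])]
assert np.array_equal(patch(M, [0, 2], [1, 3]), expected)
assert np.array_equal(patch(sp.csr_matrix(M), [0, 2], [1, 3]), expected)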
def fit(self, X, y, group, sample_weight=None, check_input=True, X_idx_sorted=None): """Build a decision tree from the training set (X, y, group). Parameters ---------- X : array-like or sparse matrix, shape = [n_samples, n_features] The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. y : array-like, shape = [n_samples] or [n_samples, n_outputs] The target values (class labels in classification, real numbers in regression). In the regression case, use ``dtype=np.float64`` and ``order='C'`` for maximum efficiency. group : array-like, shape = [n_samples] or [n_samples, n_outputs] The group values, 0 for control, 1 for target. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. X_idx_sorted : array-like, shape = [n_samples, n_features], optional The indexes of the sorted training input samples. If many tree are grown on the same dataset, this allows the ordering to be cached between trees. If None, the data will be sorted here. Don't use this parameter unless you know what to do. Returns ------- self : object Returns self. """ random_state = check_random_state(self.random_state) if check_input: X = check_array(X, dtype=DTYPE, accept_sparse="csc") y = check_array(y, ensure_2d=False, dtype=None) group = check_array(group, ensure_2d=False, dtype=None) if issparse(X): X.sort_indices() if X.indices.dtype != np.intc or X.indptr.dtype != np.intc: raise ValueError("No support for np.int64 index based " "sparse matrices") # Determine output settings n_samples, self.n_features_ = X.shape is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) group = np.atleast_1d(group) expanded_class_weight = None if y.ndim == 1: y = np.reshape(y, (-1, 1)) if group.ndim == 1: group = np.reshape(group, (-1, 1)) self.n_outputs_ = y.shape[1] if is_classification: check_classification_targets(y) # Encode y & group together before passing to the builder. 
y = np.copy(2 * group + y) self.classes_ = [] self.n_classes_ = [] if self.class_weight is not None: y_original = np.copy(y) y_encoded = np.zeros(y.shape, dtype=np.int) for k in range(self.n_outputs_): classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) y = y_encoded # TODO check if binary if self.class_weight is not None: expanded_class_weight = compute_sample_weight( self.class_weight, y_original) else: self.classes_ = [None] * self.n_outputs_ self.n_classes_ = [1] * self.n_outputs_ # TODO encode group and check if binary self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters max_depth = ((2**31) - 1 if self.max_depth is None else self.max_depth) max_leaf_nodes = (-1 if self.max_leaf_nodes is None else self.max_leaf_nodes) if isinstance(self.min_samples_leaf, (numbers.Integral, np.integer)): min_samples_leaf = self.min_samples_leaf else: # float min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) stat_param = self.stat_param if isinstance(self.min_samples_split, (numbers.Integral, np.integer)): min_samples_split = self.min_samples_split else: # float min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) min_samples_split = max(min_samples_split, 2 * min_samples_leaf) if isinstance(self.max_features, str): if self.max_features == "auto": if is_classification: max_features = max(1, int(np.sqrt(self.n_features_))) else: max_features = self.n_features_ elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features_))) elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: raise ValueError( 'Invalid value for max_features. Allowed string ' 'values are "auto", "sqrt" or "log2".') elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float if self.max_features > 0.0: max_features = max(1, int(self.max_features * self.n_features_)) else: max_features = 0 self.max_features_ = max_features if len(y) != n_samples: raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples)) if not (0. < self.min_samples_split <= 1. or 2 <= self.min_samples_split): raise ValueError("min_samples_split must be in at least 2" " or in (0, 1], got %s" % min_samples_split) if not (0. < self.min_samples_leaf <= 0.5 or 1 <= self.min_samples_leaf): raise ValueError("min_samples_leaf must be at least than 1 " "or in (0, 0.5], got %s" % min_samples_leaf) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if max_depth <= 0: raise ValueError("max_depth must be greater than zero. 
") if not (0 < max_features <= self.n_features_): raise ValueError("max_features must be in (0, n_features]") if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)): raise ValueError("max_leaf_nodes must be integral number but was " "%r" % max_leaf_nodes) if -1 < max_leaf_nodes < 2: raise ValueError(("max_leaf_nodes {0} must be either smaller than " "0 or larger than 1").format(max_leaf_nodes)) if sample_weight is not None: if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous): sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE) if len(sample_weight.shape) > 1: raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape)) if len(sample_weight) != n_samples: raise ValueError("Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)) if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * np.sum(sample_weight)) else: min_weight_leaf = 0. presort = self.presort # Allow presort to be 'auto', which means True if the dataset is dense, # otherwise it will be False. if self.presort == 'auto' and issparse(X): presort = False elif self.presort == 'auto': presort = True if presort is True and issparse(X): raise ValueError("Presorting is not supported for sparse " "matrices.") # If multiple trees are built on the same dataset, we only want to # presort once. Splitters now can accept presorted indices if desired, # but do not handle any presorting themselves. Ensemble algorithms # which desire presorting must do presorting themselves and pass that # matrix into each tree. if X_idx_sorted is None and presort: X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0), dtype=np.int32) if presort and X_idx_sorted.shape != X.shape: raise ValueError("The shape of X (X.shape = {}) doesn't match " "the shape of X_idx_sorted (X_idx_sorted" ".shape = {})".format(X.shape, X_idx_sorted.shape)) # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): if is_classification: criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, self.max_features_, min_samples_leaf, min_weight_leaf, random_state, self.presort, stat_param) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, stat_param) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, max_leaf_nodes, stat_param) builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self
def predict_loop(model, f, ins, batch_size=32, verbose=0, steps=None, callbacks=None): """Abstract method to loop over some data in batches. # Arguments model: Keras model instance. f: Keras function returning a list of tensors. ins: list of tensors to be fed to `f`. batch_size: integer batch size. verbose: verbosity mode. steps: Total number of steps (batches of samples) before declaring `predict_loop` finished. Ignored with the default value of `None`. callbacks: List of callbacks or an instance of `keras.callbacks.CallbackList` to be called during prediction. # Returns Array of predictions (if the model has a single output) or list of arrays of predictions (if the model has multiple outputs). """ num_samples = check_num_samples(ins, batch_size=batch_size, steps=steps, steps_name='steps') # Check if callbacks have not been already configured if not isinstance(callbacks, cbks.CallbackList): callbacks = cbks.CallbackList(callbacks) callback_model = model._get_callback_model() callbacks.set_model(callback_model) callback_params = { 'batch_size': batch_size, 'steps': steps, 'samples': num_samples, 'verbose': verbose, } callbacks.set_params(callback_params) if verbose == 1: if steps is not None: progbar = Progbar(target=steps) else: progbar = Progbar(target=num_samples) indices_for_conversion_to_dense = [] for i in range(len(model._feed_inputs)): if issparse(ins[i]) and not K.is_sparse(model._feed_inputs[i]): indices_for_conversion_to_dense.append(i) callbacks.model.stop_training = False callbacks._call_begin_hook('predict') if steps is not None: # Step-based predictions. # Since we do not know how many samples # we will see, we cannot pre-allocate # the returned Numpy arrays. # Instead, we store one array per batch seen # and concatenate them upon returning. unconcatenated_outs = [] for step in range(steps): batch_logs = {'batch': step, 'size': 1} callbacks._call_batch_hook('predict', 'begin', step, batch_logs) batch_outs = f(ins) batch_outs = to_list(batch_outs) if step == 0: for batch_out in batch_outs: unconcatenated_outs.append([]) for i, batch_out in enumerate(batch_outs): unconcatenated_outs[i].append(batch_out) batch_logs['outputs'] = batch_outs callbacks._call_batch_hook('predict', 'end', step, batch_logs) if verbose == 1: progbar.update(step + 1) callbacks.on_predict_end() if len(unconcatenated_outs) == 1: return np.concatenate(unconcatenated_outs[0], axis=0) return [ np.concatenate(unconcatenated_outs[i], axis=0) for i in range(len(unconcatenated_outs)) ] else: # Sample-based predictions. outs = [] batches = make_batches(num_samples, batch_size) index_array = np.arange(num_samples) for batch_index, (batch_start, batch_end) in enumerate(batches): batch_ids = index_array[batch_start:batch_end] if ins and isinstance(ins[-1], float): # Do not slice the training phase flag. ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]] else: ins_batch = slice_arrays(ins, batch_ids) for i in indices_for_conversion_to_dense: ins_batch[i] = ins_batch[i].toarray() batch_logs = {'batch': batch_index, 'size': len(batch_ids)} callbacks._call_batch_hook('predict', 'begin', batch_index, batch_logs) batch_outs = f(ins_batch) batch_outs = to_list(batch_outs) if batch_index == 0: # Pre-allocate the results arrays. 
for batch_out in batch_outs: shape = (num_samples, ) + batch_out.shape[1:] outs.append(np.zeros(shape, dtype=batch_out.dtype)) for i, batch_out in enumerate(batch_outs): outs[i][batch_start:batch_end] = batch_out batch_logs['outputs'] = batch_outs callbacks._call_batch_hook('predict', 'end', batch_index, batch_logs) if verbose == 1: progbar.update(batch_end) callbacks._call_end_hook('predict') return unpack_singleton(outs)
def fit_loop(model, fit_function, fit_inputs, out_labels=None, batch_size=None, epochs=100, verbose=1, callbacks=None, val_function=None, val_inputs=None, shuffle=True, callback_metrics=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1): """Abstract fit function for `fit_function(fit_inputs)`. Assumes that fit_function returns a list, labeled by out_labels. # Arguments model: Keras model instance. fit_function: Keras function returning a list of tensors fit_inputs: List of tensors to be fed to `fit_function` out_labels: List of strings, display names of the outputs of `fit_function` batch_size: Integer batch size or None if unknown. epochs: Number of times to iterate over the data verbose: Verbosity mode, 0, 1 or 2 callbacks: List of callbacks to be called during training and validation (if `val_function` and `val_inputs` are not `None`). val_function: Keras function to call for validation val_inputs: List of tensors to be fed to `val_function` shuffle: Whether to shuffle the data at the beginning of each epoch callback_metrics: List of strings, the display names of the metrics passed to the callbacks. They should be the concatenation of list the display names of the outputs of `fit_function` and the list of display names of the outputs of `fit_inputs`. initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. validation_steps: Number of steps to run validation for (only if doing validation from data tensors). Ignored with the default value of `None`. validation_freq: Only relevant if validation data is provided. Integer or list/tuple/set. If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. validation_freq=2` runs validation every 2 epochs. If a list, tuple, or set, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. # Returns `History` object. """ do_validation = False if val_function and val_inputs: do_validation = True if (verbose and fit_inputs and hasattr(fit_inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')): print('Train on %d samples, validate on %d samples' % (fit_inputs[0].shape[0], val_inputs[0].shape[0])) if validation_steps: do_validation = True if steps_per_epoch is None: raise ValueError('Can only use `validation_steps` ' 'when doing step-wise ' 'training, i.e. 
`steps_per_epoch` ' 'must be set.') elif do_validation: if steps_per_epoch: raise ValueError('Must specify `validation_steps` ' 'to perform validation ' 'when doing step-wise training.') num_train_samples = check_num_samples(fit_inputs, batch_size=batch_size, steps=steps_per_epoch, steps_name='steps_per_epoch') if num_train_samples is not None: index_array = np.arange(num_train_samples) model.history = cbks.History() _callbacks = [ cbks.BaseLogger(stateful_metrics=model.stateful_metric_names) ] if verbose: if steps_per_epoch is not None: count_mode = 'steps' else: count_mode = 'samples' _callbacks.append( cbks.ProgbarLogger(count_mode, stateful_metrics=model.stateful_metric_names)) _callbacks += (callbacks or []) + [model.history] callbacks = cbks.CallbackList(_callbacks) out_labels = out_labels or [] # it's possible to callback a different model than itself # (used by Sequential models) callback_model = model._get_callback_model() callbacks.set_model(callback_model) callbacks.set_params({ 'batch_size': batch_size, 'epochs': epochs, 'steps': steps_per_epoch, 'samples': num_train_samples, 'verbose': verbose, 'do_validation': do_validation, 'metrics': callback_metrics or [], }) callbacks._call_begin_hook('train') callbacks.model.stop_training = False for cbk in callbacks: cbk.validation_data = val_inputs # To prevent a slowdown, # we find beforehand the arrays that need conversion. feed = (model._feed_inputs + model._feed_targets + model._feed_sample_weights) indices_for_conversion_to_dense = [] for i in range(len(feed)): if issparse(fit_inputs[i]) and not K.is_sparse(feed[i]): indices_for_conversion_to_dense.append(i) for epoch in range(initial_epoch, epochs): # Reset stateful metrics for m in model.stateful_metric_functions: m.reset_states() callbacks.on_epoch_begin(epoch) epoch_logs = {} if steps_per_epoch is not None: for step_index in range(steps_per_epoch): batch_logs = {'batch': step_index, 'size': 1} callbacks._call_batch_hook('train', 'begin', step_index, batch_logs) outs = fit_function(fit_inputs) outs = to_list(outs) for l, o in zip(out_labels, outs): batch_logs[l] = o callbacks._call_batch_hook('train', 'end', step_index, batch_logs) if callback_model.stop_training: break if do_validation and should_run_validation(validation_freq, epoch): val_outs = test_loop(model, val_function, val_inputs, steps=validation_steps, callbacks=callbacks, verbose=0) val_outs = to_list(val_outs) # Same labels assumed. for l, o in zip(out_labels, val_outs): epoch_logs['val_' + l] = o else: if shuffle == 'batch': index_array = batch_shuffle(index_array, batch_size) elif shuffle: np.random.shuffle(index_array) batches = make_batches(num_train_samples, batch_size) for batch_index, (batch_start, batch_end) in enumerate(batches): batch_ids = index_array[batch_start:batch_end] try: if isinstance(fit_inputs[-1], float): # Do not slice the training phase flag. ins_batch = slice_arrays(fit_inputs[:-1], batch_ids) + [fit_inputs[-1]] else: ins_batch = slice_arrays(fit_inputs, batch_ids) except TypeError: raise TypeError('TypeError while preparing batch. 
' 'If using HDF5 input data, ' 'pass shuffle="batch".') batch_logs = {'batch': batch_index, 'size': len(batch_ids)} callbacks._call_batch_hook('train', 'begin', batch_index, batch_logs) for i in indices_for_conversion_to_dense: ins_batch[i] = ins_batch[i].toarray() outs = fit_function(ins_batch) outs = to_list(outs) for l, o in zip(out_labels, outs): batch_logs[l] = o callbacks._call_batch_hook('train', 'end', batch_index, batch_logs) if callbacks.model.stop_training: break if batch_index == len(batches) - 1: # Last batch. if do_validation and should_run_validation( validation_freq, epoch): val_outs = test_loop(model, val_function, val_inputs, batch_size=batch_size, callbacks=callbacks, verbose=0) val_outs = to_list(val_outs) # Same labels assumed. for l, o in zip(out_labels, val_outs): epoch_logs['val_' + l] = o callbacks.on_epoch_end(epoch, epoch_logs) if callbacks.model.stop_training: break callbacks._call_end_hook('train') return model.history
def _linprog_highs(lp, solver, time_limit=None, presolve=True, disp=False, maxiter=None, dual_feasibility_tolerance=None, primal_feasibility_tolerance=None, ipm_optimality_tolerance=None, simplex_dual_edge_weight_strategy=None, **unknown_options): r""" Solve the following linear programming problem using one of the HiGHS solvers: User-facing documentation is in _linprog_doc.py. Parameters ---------- lp : _LPProblem A ``scipy.optimize._linprog_util._LPProblem`` ``namedtuple``. solver : "ipm" or "simplex" or None Which HiGHS solver to use. If ``None``, "simplex" will be used. Options ------- maxiter : int The maximum number of iterations to perform in either phase. For ``solver='ipm'``, this does not include the number of crossover iterations. Default is the largest possible value for an ``int`` on the platform. disp : bool Set to ``True`` if indicators of optimization status are to be printed to the console each iteration; default ``False``. time_limit : float The maximum time in seconds allotted to solve the problem; default is the largest possible value for a ``double`` on the platform. presolve : bool Presolve attempts to identify trivial infeasibilities, identify trivial unboundedness, and simplify the problem before sending it to the main solver. It is generally recommended to keep the default setting ``True``; set to ``False`` if presolve is to be disabled. dual_feasibility_tolerance : double Dual feasibility tolerance. Default is 1e-07. The minimum of this and ``primal_feasibility_tolerance`` is used for the feasibility tolerance when ``solver='ipm'``. primal_feasibility_tolerance : double Primal feasibility tolerance. Default is 1e-07. The minimum of this and ``dual_feasibility_tolerance`` is used for the feasibility tolerance when ``solver='ipm'``. ipm_optimality_tolerance : double Optimality tolerance for ``solver='ipm'``. Default is 1e-08. Minimum possible value is 1e-12 and must be smaller than the largest possible value for a ``double`` on the platform. simplex_dual_edge_weight_strategy : str (default: None) Strategy for simplex dual edge weights. The default, ``None``, automatically selects one of the following. ``'dantzig'`` uses Dantzig's original strategy of choosing the most negative reduced cost. ``'devex'`` uses the strategy described in [15]_. ``steepest`` uses the exact steepest edge strategy as described in [16]_. ``'steepest-devex'`` begins with the exact steepest edge strategy until the computation is too costly or inexact and then switches to the devex method. Curently, using ``None`` always selects ``'steepest-devex'``, but this may change as new options become available. unknown_options : dict Optional arguments not used by this particular solver. If ``unknown_options`` is non-empty, a warning is issued listing all unused options. Returns ------- sol : dict A dictionary consisting of the fields: x : 1D array The values of the decision variables that minimizes the objective function while satisfying the constraints. fun : float The optimal value of the objective function ``c @ x``. slack : 1D array The (nominally positive) values of the slack, ``b_ub - A_ub @ x``. con : 1D array The (nominally zero) residuals of the equality constraints, ``b_eq - A_eq @ x``. success : bool ``True`` when the algorithm succeeds in finding an optimal solution. status : int An integer representing the exit status of the algorithm. ``0`` : Optimization terminated successfully. ``1`` : Iteration or time limit reached. ``2`` : Problem appears to be infeasible. 
``3`` : Problem appears to be unbounded. ``4`` : The HiGHS solver ran into a problem. message : str A string descriptor of the exit status of the algorithm. nit : int The total number of iterations performed. For ``solver='simplex'``, this includes iterations in all phases. For ``solver='ipm'``, this does not include crossover iterations. crossover_nit : int The number of primal/dual pushes performed during the crossover routine for ``solver='ipm'``. This is ``0`` for ``solver='simplex'``. ineqlin : OptimizeResult Solution and sensitivity information corresponding to the inequality constraints, `b_ub`. A dictionary consisting of the fields: residual : np.ndnarray The (nominally positive) values of the slack variables, ``b_ub - A_ub @ x``. This quantity is also commonly referred to as "slack". marginals : np.ndarray The sensitivity (partial derivative) of the objective function with respect to the right-hand side of the inequality constraints, `b_ub`. eqlin : OptimizeResult Solution and sensitivity information corresponding to the equality constraints, `b_eq`. A dictionary consisting of the fields: residual : np.ndarray The (nominally zero) residuals of the equality constraints, ``b_eq - A_eq @ x``. marginals : np.ndarray The sensitivity (partial derivative) of the objective function with respect to the right-hand side of the equality constraints, `b_eq`. lower, upper : OptimizeResult Solution and sensitivity information corresponding to the lower and upper bounds on decision variables, `bounds`. residual : np.ndarray The (nominally positive) values of the quantity ``x - lb`` (lower) or ``ub - x`` (upper). marginals : np.ndarray The sensitivity (partial derivative) of the objective function with respect to the lower and upper `bounds`. mip_node_count : int The number of subproblems or "nodes" solved by the MILP solver. Only present when `integrality` is not `None`. mip_dual_bound : float The MILP solver's final estimate of the lower bound on the optimal solution. Only present when `integrality` is not `None`. mip_gap : float The difference between the final objective function value and the final dual bound. Only present when `integrality` is not `None`. Notes ----- The result fields `ineqlin`, `eqlin`, `lower`, and `upper` all contain `marginals`, or partial derivatives of the objective function with respect to the right-hand side of each constraint. These partial derivatives are also referred to as "Lagrange multipliers", "dual values", and "shadow prices". The sign convention of `marginals` is opposite that of Lagrange multipliers produced by many nonlinear solvers. References ---------- .. [15] Harris, Paula MJ. "Pivot selection methods of the Devex LP code." Mathematical programming 5.1 (1973): 1-28. .. [16] Goldfarb, Donald, and John Ker Reid. "A practicable steepest-edge simplex algorithm." Mathematical Programming 12.1 (1977): 361-371. 
""" _check_unknown_options(unknown_options) # Map options to HiGHS enum values simplex_dual_edge_weight_strategy_enum = _convert_to_highs_enum( simplex_dual_edge_weight_strategy, 'simplex_dual_edge_weight_strategy', choices={ 'dantzig': HIGHS_SIMPLEX_EDGE_WEIGHT_STRATEGY_DANTZIG, 'devex': HIGHS_SIMPLEX_EDGE_WEIGHT_STRATEGY_DEVEX, 'steepest-devex': HIGHS_SIMPLEX_EDGE_WEIGHT_STRATEGY_CHOOSE, 'steepest': HIGHS_SIMPLEX_EDGE_WEIGHT_STRATEGY_STEEPEST_EDGE, None: None }) c, A_ub, b_ub, A_eq, b_eq, bounds, x0, integrality = lp lb, ub = bounds.T.copy() # separate bounds, copy->C-cntgs # highs_wrapper solves LHS <= A*x <= RHS, not equality constraints lhs_ub = -np.ones_like(b_ub) * np.inf # LHS of UB constraints is -inf rhs_ub = b_ub # RHS of UB constraints is b_ub lhs_eq = b_eq # Equality constaint is inequality rhs_eq = b_eq # constraint with LHS=RHS lhs = np.concatenate((lhs_ub, lhs_eq)) rhs = np.concatenate((rhs_ub, rhs_eq)) if issparse(A_ub) or issparse(A_eq): A = vstack((A_ub, A_eq)) else: A = np.vstack((A_ub, A_eq)) A = csc_matrix(A) options = { 'presolve': presolve, 'sense': HIGHS_OBJECTIVE_SENSE_MINIMIZE, 'solver': solver, 'time_limit': time_limit, 'highs_debug_level': MESSAGE_LEVEL_NONE, 'dual_feasibility_tolerance': dual_feasibility_tolerance, 'ipm_optimality_tolerance': ipm_optimality_tolerance, 'log_to_console': disp, 'output_flag': disp, 'primal_feasibility_tolerance': primal_feasibility_tolerance, 'simplex_dual_edge_weight_strategy': simplex_dual_edge_weight_strategy_enum, 'simplex_strategy': HIGHS_SIMPLEX_STRATEGY_DUAL, 'simplex_crash_strategy': HIGHS_SIMPLEX_CRASH_STRATEGY_OFF, 'ipm_iteration_limit': maxiter, 'simplex_iteration_limit': maxiter, } # np.inf doesn't work; use very large constant rhs = _replace_inf(rhs) lhs = _replace_inf(lhs) lb = _replace_inf(lb) ub = _replace_inf(ub) if integrality is None or np.sum(integrality) == 0: integrality = np.empty(0) else: integrality = np.array(integrality) res = _highs_wrapper(c, A.indptr, A.indices, A.data, lhs, rhs, lb, ub, integrality.astype(np.uint8), options) # HiGHS represents constraints as lhs/rhs, so # Ax + s = b => Ax = b - s # and we need to split up s by A_ub and A_eq if 'slack' in res: slack = res['slack'] con = np.array(slack[len(b_ub):]) slack = np.array(slack[:len(b_ub)]) else: slack, con = None, None # lagrange multipliers for equalities/inequalities and upper/lower bounds if 'lambda' in res: lamda = res['lambda'] marg_ineqlin = np.array(lamda[:len(b_ub)]) marg_eqlin = np.array(lamda[len(b_ub):]) marg_upper = np.array(res['marg_bnds'][1, :]) marg_lower = np.array(res['marg_bnds'][0, :]) else: marg_ineqlin, marg_eqlin = None, None marg_upper, marg_lower = None, None # this needs to be updated if we start choosing the solver intelligently solvers = {"ipm": "highs-ipm", "simplex": "highs-ds", None: "highs-ds"} # Convert to scipy-style status and message highs_status = res.get('status', None) highs_message = res.get('message', None) status, message = _highs_to_scipy_status_message(highs_status, highs_message) x = np.array(res['x']) if 'x' in res else None sol = { 'x': x, 'slack': slack, 'con': con, 'ineqlin': OptimizeResult({ 'residual': slack, 'marginals': marg_ineqlin, }), 'eqlin': OptimizeResult({ 'residual': con, 'marginals': marg_eqlin, }), 'lower': OptimizeResult({ 'residual': None if x is None else x - lb, 'marginals': marg_lower, }), 'upper': OptimizeResult({ 'residual': None if x is None else ub - x, 'marginals': marg_upper }), 'fun': res.get('fun'), 'status': status, 'success': res['status'] == MODEL_STATUS_OPTIMAL, 
'message': message, 'nit': res.get('simplex_nit', 0) or res.get('ipm_nit', 0), 'crossover_nit': res.get('crossover_nit'), } if np.any(x) and integrality is not None: sol.update({ 'mip_node_count': res.get('mip_node_count', 0), 'mip_dual_bound': res.get('mip_dual_bound', 0.0), 'mip_gap': res.get('mip_gap', 0.0), }) return sol
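The constraint-merging step in the wrapper above (inequality rows get an infinite lower side, equality rows get identical left and right sides) can be shown in isolation. This is a minimal sketch under those assumptions, not SciPy code; merge_constraints is a made-up helper name.

import numpy as np
from scipy.sparse import csc_matrix, issparse, vstack


def merge_constraints(A_ub, b_ub, A_eq, b_eq):
    # A_ub @ x <= b_ub becomes -inf <= A_ub @ x <= b_ub;
    # A_eq @ x == b_eq becomes  b_eq <= A_eq @ x <= b_eq.
    lhs = np.concatenate((np.full_like(b_ub, -np.inf), b_eq))
    rhs = np.concatenate((b_ub, b_eq))
    A = vstack((A_ub, A_eq)) if issparse(A_ub) or issparse(A_eq) else np.vstack((A_ub, A_eq))
    return csc_matrix(A), lhs, rhs


A, lhs, rhs = merge_constraints(np.array([[1.0, 2.0]]), np.array([4.0]),
                                np.array([[1.0, -1.0]]), np.array([0.0]))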
def __init__(self, endog, exog, exog_vc=None, ident=None, family=None, vcp_p=1, fe_p=2, fep_names=None, vcp_names=None, vc_names=None, **kwargs): if len(ident) != exog_vc.shape[1]: msg = "len(ident) should match the number of columns of exog_vc" raise ValueError(msg) # Get the fixed effects parameter names if fep_names is None: if hasattr(exog, "columns"): fep_names = exog.columns.tolist() else: fep_names = ["FE_%d" % (k + 1) for k in range(exog.shape[1])] # Get the variance parameter names if vcp_names is None: vcp_names = ["VC_%d" % (k + 1) for k in range(int(max(ident)) + 1)] else: if len(vcp_names) != len(set(ident)): msg = "The lengths of vcp_names and ident should be the same" raise ValueError(msg) endog = np.asarray(endog) exog = np.asarray(exog) if not sparse.issparse(exog_vc): exog_vc = sparse.csr_matrix(exog_vc) ident = ident.astype(int) vcp_p = float(vcp_p) fe_p = float(fe_p) # Number of fixed effects parameters if exog is None: k_fep = 0 else: k_fep = exog.shape[1] # Number of variance component structure parameters and # variance component realizations. if exog_vc is None: k_vc = 0 k_vcp = 0 else: k_vc = exog_vc.shape[1] k_vcp = max(ident) + 1 # power would be better but not available in older scipy exog_vc2 = exog_vc.multiply(exog_vc) super(_BayesMixedGLM, self).__init__(endog, exog, **kwargs) self.exog_vc = exog_vc self.exog_vc2 = exog_vc2 self.ident = ident self.family = family self.k_fep = k_fep self.k_vc = k_vc self.k_vcp = k_vcp self.fep_names = fep_names self.vcp_names = vcp_names self.vc_names = vc_names self.fe_p = fe_p self.vcp_p = vcp_p
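The exog_vc.multiply(exog_vc) line above computes an elementwise square of a sparse matrix (the comment notes .power was not always available). The small check below is my own sketch, not statsmodels code, confirming the two forms agree on recent SciPy.

import numpy as np
from scipy import sparse

M = sparse.csr_matrix(np.array([[1.0, 0.0], [2.0, 3.0]]))
# Elementwise square via multiply(), as used above, matches power(2).
assert np.allclose(M.multiply(M).toarray(), M.power(2).toarray())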
def partial_fit(self, X, y, monitor=None, sample_weight=None, **kwargs): """Fit the model on a batch of training data. Parameters ---------- X : numpy array or sparse matrix of shape [n_samples, n_features] Training data y : numpy array of shape [n_samples, n_targets] Target values monitor : callable, optional The monitor is called after each iteration with the current iteration, a reference to the estimator, and a dictionary with {'loss': loss_value} representing the loss calculated by the objective function at this iteration. If the callable returns True the fitting procedure is stopped. The monitor can be used for various things such as computing held-out estimates, early stopping, model introspection, and snapshoting. sample_weight : numpy array of shape [n_samples,] Per-sample weights. Re-scale the loss per sample. Higher weights force the estimator to put more emphasis on these samples. Sample weights are normalized per-batch. Returns ------- self : returns an instance of self. """ X, y = self._check_inputs(X, y) assert self.batch_size > 0, "batch_size <= 0" if sample_weight is not None: sample_weight = check_array(sample_weight, ensure_2d=False) # Initialize the model if it hasn't been already by a previous call. if self._is_fitted: y = self._transform_targets(y) else: self._random_state = check_random_state(self.random_state) self._fit_targets(y, **kwargs) y = self._transform_targets(y) self.is_sparse_ = sp.issparse(X) self.input_layer_sz_ = X.shape[1] # Set which layer transform function points to if self.transform_layer_index is None: self._transform_layer_index = len(self.hidden_units) - 1 else: self._transform_layer_index = self.transform_layer_index if (self._transform_layer_index < -1 or self._transform_layer_index >= len(self.hidden_units)): raise ValueError( "`transform_layer_index` must be in the range " "[-1, len(hidden_units)-1]!") # Instantiate the graph. TensorFlow seems easier to use by just # adding to the default graph, and as_default lets you temporarily # set a graph to be treated as the default graph. self.graph_ = Graph() with self.graph_.as_default(): tf_random_seed.set_random_seed( self._random_state.randint(0, 10000000)) tf.get_variable_scope().set_initializer( tf.contrib.layers.xavier_initializer()) self._build_tf_graph() # Train model parameters. self._session.run(tf.global_variables_initializer()) # Set an attributed to mark this as at least partially fitted. self._is_fitted = True # Train the model with the given data. with self.graph_.as_default(): n_examples = X.shape[0] indices = np.arange(n_examples) for epoch in range(self.n_epochs): self._random_state.shuffle(indices) for start_idx in range(0, n_examples, self.batch_size): batch_ind = indices[start_idx:start_idx + self.batch_size] if sample_weight is None: batch_sample_weight = None else: batch_sample_weight = sample_weight[batch_ind] feed_dict = self._make_feed_dict( X[batch_ind], y[batch_ind], sample_weight=batch_sample_weight) obj_val, _ = self._session.run( [self._obj_func, self._train_step], feed_dict=feed_dict) _LOGGER.debug("objective: %.4f, epoch: %d, idx: %d", obj_val, epoch, start_idx) _LOGGER.info("objective: %.4f, epoch: %d, idx: %d", obj_val, epoch, start_idx) if monitor: stop_early = monitor(epoch, self, {'loss': obj_val}) if stop_early: _LOGGER.info( "stopping early due to monitor function.") return self return self
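A monitor callable compatible with the contract described in the docstring above (called with the epoch index, the estimator, and {'loss': ...}; returning True stops fitting) might look like the sketch below. The patience and min_delta values are illustrative assumptions, not part of the estimator.

def make_early_stopping_monitor(patience=3, min_delta=1e-4):
    state = {"best": float("inf"), "bad_epochs": 0}

    def monitor(epoch, estimator, info):
        loss = info["loss"]
        if loss < state["best"] - min_delta:
            state["best"] = loss
            state["bad_epochs"] = 0
        else:
            state["bad_epochs"] += 1
        # Returning True tells partial_fit to stop early.
        return state["bad_epochs"] >= patience

    return monitor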
def transform(self, X): """Impute all missing values in X. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. """ check_is_fitted(self) X = self._validate_input(X, in_fit=False) statistics = self.statistics_ if X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" % (X.shape[1], self.statistics_.shape[0])) # compute mask before eliminating invalid features missing_mask = _get_mask(X, self.missing_values) # Delete the invalid columns if strategy is not constant if self.strategy == "constant": valid_statistics = statistics valid_statistics_indexes = None else: # same as np.isnan but also works for object dtypes invalid_mask = _get_mask(statistics, np.nan) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.flatnonzero(valid_mask) if invalid_mask.any(): missing = np.arange(X.shape[1])[invalid_mask] if self.verbose: warnings.warn("Deleting features without " "observed values: %s" % missing) X = X[:, valid_statistics_indexes] # Do actual imputation if sp.issparse(X): if self.missing_values == 0: raise ValueError("Imputation not possible when missing_values " "== 0 and input is sparse. Provide a dense " "array instead.") else: # if no invalid statistics are found, use the mask computed # before, else recompute mask if valid_statistics_indexes is None: mask = missing_mask.data else: mask = _get_mask(X.data, self.missing_values) indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=int), np.diff(X.indptr))[mask] X.data[mask] = valid_statistics[indexes].astype(X.dtype, copy=False) else: # use mask computed before eliminating invalid mask if valid_statistics_indexes is None: mask_valid_features = missing_mask else: mask_valid_features = missing_mask[:, valid_statistics_indexes] n_missing = np.sum(mask_valid_features, axis=0) values = np.repeat(valid_statistics, n_missing) coordinates = np.where(mask_valid_features.transpose())[::-1] X[coordinates] = values X_indicator = super()._transform_indicator(missing_mask) return super()._concatenate_indicator(X, X_indicator)
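For reference, the mean-imputation path above can be exercised through scikit-learn's public SimpleImputer API; a tiny dense example, assuming scikit-learn is installed:

import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0]])
imp = SimpleImputer(strategy="mean").fit(X)
# The NaN in column 0 is replaced by the column mean (1 + 7) / 2 = 4.
print(imp.transform(X))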
def _fit(self, X, y, sample_weight=None, check_input=True): # check X and y if check_input: X, y = check_X_y( X, y, copy=False, accept_sparse='csc', dtype=[np.float64, np.float32], multi_output=True, y_numeric=True, ) y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False) if not sp.issparse(X): self.fit_shape_good_for_daal_ = \ True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False else: self.fit_shape_good_for_daal_ = False log_str = "sklearn.linear_model." + self.__class__.__name__ + ".fit: " sklearn_ready = sp.issparse(X) or not self.fit_shape_good_for_daal_ or \ X.dtype not in [np.float64, np.float32] or sample_weight is not None if sklearn_ready: if hasattr(self, 'daal_model_'): del self.daal_model_ logging.info( log_str + get_patch_message("sklearn") ) if sklearn_check_version('0.23'): res_new = super(ElasticNet, self).fit( X, y, sample_weight=sample_weight, check_input=check_input) else: res_new = super(ElasticNet, self).fit( X, y, check_input=check_input) self._gap = res_new.dual_gap_ return res_new self.n_iter_ = None self._gap = None if not check_input: # only for compliance with Sklearn, # this assert is not required for Intel(R) oneAPI Data # Analytics Library print(type(X), X.flags['F_CONTIGUOUS']) if isinstance(X, np.ndarray) and \ X.flags['F_CONTIGUOUS'] is False: # print(X.flags) raise ValueError("ndarray is not Fortran contiguous") if sklearn_check_version('1.0'): self._normalize = _deprecate_normalize( self.normalize, default=False, estimator_name=self.__class__.__name__) # only for pass tests # "check_estimators_fit_returns_self(readonly_memmap=True) and # check_regressors_train(readonly_memmap=True) if not X.flags.writeable: X = np.copy(X) if not y.flags.writeable: y = np.copy(y) logging.info(log_str + get_patch_message("daal")) if self.__class__.__name__ == "ElasticNet": res = _daal4py_fit_enet(self, X, y, check_input=check_input) else: res = _daal4py_fit_lasso(self, X, y, check_input=check_input) if res is None: if hasattr(self, 'daal_model_'): del self.daal_model_ logging.info( log_str + get_patch_message("sklearn_after_daal") ) if sklearn_check_version('0.23'): res_new = super(ElasticNet, self).fit( X, y, sample_weight=sample_weight, check_input=check_input) else: res_new = super(ElasticNet, self).fit( X, y, check_input=check_input) self._gap = res_new.dual_gap_ return res_new return res
def fit(self, X, y, sample_weight=None): """Fit Ridge regression model Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Training data y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values sample_weight : float or array-like of shape [n_samples] Sample weight Returns ------- self : Returns self. """ X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, multi_output=True, y_numeric=True) n_samples, n_features = X.shape X, y, X_mean, y_mean, X_std = LinearModel._center_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) gcv_mode = self.gcv_mode with_sw = len(np.shape(sample_weight)) if gcv_mode is None or gcv_mode == 'auto': if sparse.issparse(X) or n_features > n_samples or with_sw: gcv_mode = 'eigen' else: gcv_mode = 'svd' elif gcv_mode == "svd" and with_sw: # FIXME non-uniform sample weights not yet supported warnings.warn("non-uniform sample weights unsupported for svd, " "forcing usage of eigen") gcv_mode = 'eigen' if gcv_mode == 'eigen': _pre_compute = self._pre_compute _errors = self._errors _values = self._values elif gcv_mode == 'svd': # assert n_samples >= n_features _pre_compute = self._pre_compute_svd _errors = self._errors_svd _values = self._values_svd else: raise ValueError('bad gcv_mode "%s"' % gcv_mode) v, Q, QT_y = _pre_compute(X, y) n_y = 1 if len(y.shape) == 1 else y.shape[1] cv_values = np.zeros((n_samples * n_y, len(self.alphas))) C = [] scorer = check_scoring(self, scoring=self.scoring, allow_none=True) error = scorer is None for i, alpha in enumerate(self.alphas): weighted_alpha = (sample_weight * alpha if sample_weight is not None else alpha) if error: out, c = _errors(weighted_alpha, y, v, Q, QT_y) else: out, c = _values(weighted_alpha, y, v, Q, QT_y) cv_values[:, i] = out.ravel() C.append(c) if error: best = cv_values.mean(axis=0).argmin() else: # The scorer want an object that will make the predictions but # they are already computed efficiently by _RidgeGCV. This # identity_estimator will just return them def identity_estimator(): pass identity_estimator.decision_function = lambda y_predict: y_predict identity_estimator.predict = lambda y_predict: y_predict out = [ scorer(identity_estimator, y.ravel(), cv_values[:, i]) for i in range(len(self.alphas)) ] best = np.argmax(out) self.alpha_ = self.alphas[best] self.dual_coef_ = C[best] self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) self._set_intercept(X_mean, y_mean, X_std) if self.store_cv_values: if len(y.shape) == 1: cv_values_shape = n_samples, len(self.alphas) else: cv_values_shape = n_samples, n_y, len(self.alphas) self.cv_values_ = cv_values.reshape(cv_values_shape) return self
def test_sparse(self): self.assertTrue(sp.issparse(self.para.scat()))
def test_dump(): X_sparse, y_dense = load_svmlight_file(datafile) X_dense = X_sparse.toarray() y_sparse = sp.csr_matrix(y_dense) # slicing a csr_matrix can unsort its .indices, so test that we sort # those correctly X_sliced = X_sparse[np.arange(X_sparse.shape[0])] y_sliced = y_sparse[np.arange(y_sparse.shape[0])] for X in (X_sparse, X_dense, X_sliced): for y in (y_sparse, y_dense, y_sliced): for zero_based in (True, False): for dtype in [np.float32, np.float64, np.int32, np.int64]: f = BytesIO() # we need to pass a comment to get the version info in; # LibSVM doesn't grok comments so they're not put in by # default anymore. if (sp.issparse(y) and y.shape[0] == 1): # make sure y's shape is: (n_samples, n_labels) # when it is sparse y = y.T # Note: with dtype=np.int32 we are performing unsafe casts, # where X.astype(dtype) overflows. The result is # then platform dependent and X_dense.astype(dtype) may be # different from X_sparse.astype(dtype).asarray(). X_input = X.astype(dtype) dump_svmlight_file(X_input, y, f, comment="test", zero_based=zero_based) f.seek(0) comment = f.readline() comment = str(comment, "utf-8") assert_in("scikit-learn %s" % sklearn.__version__, comment) comment = f.readline() comment = str(comment, "utf-8") assert_in(["one", "zero"][zero_based] + "-based", comment) X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based) assert_equal(X2.dtype, dtype) assert_array_equal(X2.sorted_indices().indices, X2.indices) X2_dense = X2.toarray() if sp.issparse(X_input): X_input_dense = X_input.toarray() else: X_input_dense = X_input if dtype == np.float32: # allow a rounding error at the last decimal place assert_array_almost_equal( X_input_dense, X2_dense, 4) assert_array_almost_equal( y_dense.astype(dtype), y2, 4) else: # allow a rounding error at the last decimal place assert_array_almost_equal( X_input_dense, X2_dense, 15) assert_array_almost_equal( y_dense.astype(dtype), y2, 15)
def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=None): """Perform DBSCAN clustering from vector array or distance matrix. Read more in the :ref:`User Guide <dbscan>`. Parameters ---------- X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ array of shape (n_samples, n_samples) A feature array, or array of distances between samples if ``metric='precomputed'``. eps : float, optional The maximum distance between two samples for them to be considered as in the same neighborhood. min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. metric : string, or callable The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by :func:`sklearn.metrics.pairwise_distances` for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and must be square. X may be a sparse matrix, in which case only "nonzero" elements may be considered neighbors for DBSCAN. metric_params : dict, optional Additional keyword arguments for the metric function. .. versionadded:: 0.19 algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. See NearestNeighbors module documentation for details. leaf_size : int, optional (default = 30) Leaf size passed to BallTree or cKDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. p : float, optional The power of the Minkowski metric to be used to calculate distance between points. sample_weight : array, shape (n_samples,), optional Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with negative weight may inhibit its eps-neighbor from being core. Note that weights are absolute, and default to 1. n_jobs : int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary <n_jobs>` for more details. Returns ------- core_samples : array [n_core_samples] Indices of core samples. labels : array [n_samples] Cluster labels for each point. Noisy samples are given the label -1. See also -------- DBSCAN An estimator interface for this clustering algorithm. OPTICS A similar estimator interface clustering at multiple values of eps. Our implementation is optimized for memory usage. Notes ----- For an example, see :ref:`examples/cluster/plot_dbscan.py <sphx_glr_auto_examples_cluster_plot_dbscan.py>`. This implementation bulk-computes all neighborhood queries, which increases the memory complexity to O(n.d) where d is the average number of neighbors, while original DBSCAN had memory complexity O(n). It may attract a higher memory complexity when querying these nearest neighborhoods, depending on the ``algorithm``. One way to avoid the query complexity is to pre-compute sparse neighborhoods in chunks using :func:`NearestNeighbors.radius_neighbors_graph <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with ``mode='distance'``, then using ``metric='precomputed'`` here. 
Another way to reduce memory and computation time is to remove (near-)duplicate points and use ``sample_weight`` instead. :func:`cluster.optics <sklearn.cluster.optics>` provides a similar clustering with lower memory usage. References ---------- Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise". In: Proceedings of the 2nd International Conference on Knowledge Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996 """ if not eps > 0.0: raise ValueError("eps must be positive.") X = check_array(X, accept_sparse='csr') if sample_weight is not None: sample_weight = np.asarray(sample_weight) check_consistent_length(X, sample_weight) # Calculate neighborhood for all samples. This leaves the original point # in, which needs to be considered later (i.e. point i is in the # neighborhood of point i. While True, its useless information) if metric == 'precomputed' and sparse.issparse(X): neighborhoods = np.empty(X.shape[0], dtype=object) X.sum_duplicates() # XXX: modifies X's internals in-place # set the diagonal to explicit values, as a point is its own neighbor with warnings.catch_warnings(): warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning) X.setdiag(X.diagonal()) # XXX: modifies X's internals in-place X_mask = X.data <= eps masked_indices = X.indices.astype(np.intp, copy=False)[X_mask] masked_indptr = np.concatenate(([0], np.cumsum(X_mask))) masked_indptr = masked_indptr[X.indptr[1:-1]] # split into rows neighborhoods[:] = np.split(masked_indices, masked_indptr) else: neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm, leaf_size=leaf_size, metric=metric, metric_params=metric_params, p=p, n_jobs=n_jobs) neighbors_model.fit(X) # This has worst case O(n^2) memory complexity neighborhoods = neighbors_model.radius_neighbors(X, eps, return_distance=False) if sample_weight is None: n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods]) else: n_neighbors = np.array( [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]) # Initially, all samples are noise. labels = np.full(X.shape[0], -1, dtype=np.intp) # A list of all core samples found. core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8) dbscan_inner(core_samples, neighborhoods, labels) return np.where(core_samples)[0], labels
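The memory-saving pattern suggested in the Notes section, precomputing a sparse radius-neighbors graph and passing it with metric='precomputed', looks roughly like this. A sketch assuming scikit-learn; the eps and min_samples values are arbitrary.

import numpy as np
from sklearn.cluster import dbscan
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(100, 2)
# Sparse matrix of pairwise distances, kept only for pairs within the radius.
D = NearestNeighbors(radius=0.3).fit(X).radius_neighbors_graph(X, mode='distance')
core_samples, labels = dbscan(D, eps=0.3, min_samples=5, metric='precomputed')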
def fit(self, X, y, sample_weight=None): """ Fit linear model. Parameters ---------- X : numpy array or sparse matrix of shape [n_samples, n_features] Training data y : numpy array of shape [n_samples, n_targets] Target values. Will be cast to X's dtype if necessary sample_weight : numpy array of shape [n_samples] Individual weights for each sample .. versionadded:: 0.17 parameter *sample_weight* support to LinearRegression. Returns ------- self : returns an instance of self. """ n_jobs_ = self.n_jobs X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], y_numeric=True, multi_output=True) if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1: raise ValueError("Sample weights must be 1D array or scalar") X, y, X_offset, y_offset, X_scale = self._preprocess_data( X, y, fit_intercept=self.fit_intercept, normalize=self.normalize, copy=self.copy_X, sample_weight=sample_weight) if sample_weight is not None: # Sample weight can be implemented via a simple rescaling. X, y = _rescale_data(X, y, sample_weight) if sp.issparse(X): if y.ndim < 2: out = sparse_lsqr(X, y) self.coef_ = out[0] self._residues = out[3] else: # sparse_lsqr cannot handle y with shape (M, K) outs = Parallel(n_jobs=n_jobs_)( delayed(sparse_lsqr)(X, y[:, j].ravel()) for j in range(y.shape[1])) self.coef_ = np.vstack([out[0] for out in outs]) self._residues = np.vstack([out[3] for out in outs]) else: self.coef_, self._residues, self.rank_, self.singular_ = \ linalg.lstsq(X, y) self.coef_ = self.coef_.T if y.ndim == 1: self.coef_ = np.ravel(self.coef_) self._set_intercept(X_offset, y_offset, X_scale) return self
def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0): """Solve the ridge equation by the method of normal equations. Parameters ---------- X : {array-like, sparse matrix, LinearOperator}, shape = [n_samples, n_features] Training data y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values alpha : {float, array-like}, shape = [n_targets] if array-like The l_2 penalty to be used. If an array is passed, penalties are assumed to be specific to targets max_iter : int, optional Maximum number of iterations for conjugate gradient solver. The default value is determined by scipy.sparse.linalg. sample_weight : float or numpy array of shape [n_samples] Individual weights for each sample. If sample_weight is set, then the solver will automatically be set to 'cholesky' solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'} Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge coefficients. More stable for singular matrices than 'cholesky'. - 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution via a Cholesky decomposition of dot(X.T, X) - 'sparse_cg' uses the conjugate gradient solver as found in scipy.sparse.linalg.cg. As an iterative algorithm, this solver is more appropriate than 'cholesky' for large-scale data (possibility to set `tol` and `max_iter`). - 'lsqr' uses the dedicated regularized least-squares routine scipy.sparse.linalg.lsqr. It is the fatest but may not be available in old scipy versions. It also uses an iterative procedure. All three solvers support both dense and sparse data. tol : float Precision of the solution. verbose : int Verbosity level. Setting verbose > 0 will display additional information depending on the solver used. Returns ------- coef : array, shape = [n_features] or [n_targets, n_features] Weight vector(s). Notes ----- This function won't compute the intercept. """ n_samples, n_features = X.shape if y.ndim > 2: raise ValueError("Target y has the wrong shape %s" % str(y.shape)) ravel = False if y.ndim == 1: y = y.reshape(-1, 1) ravel = True n_samples_, n_targets = y.shape if n_samples != n_samples_: raise ValueError("Number of samples in X and y does not correspond:" " %d != %d" % (n_samples, n_samples_)) has_sw = sample_weight is not None solver = _deprecate_dense_cholesky(solver) if solver == 'auto': # cholesky if it's a dense array and cg in # any other case if not sparse.issparse(X) or has_sw: solver = 'cholesky' else: solver = 'sparse_cg' elif solver == 'lsqr' and not hasattr(sp_linalg, 'lsqr'): warnings.warn("""lsqr not available on this machine, falling back to sparse_cg.""") solver = 'sparse_cg' if has_sw: if np.atleast_1d(sample_weight).ndim > 1: raise ValueError("Sample weights must be 1D array or scalar") # Sample weight can be implemented via a simple rescaling. 
X, y = _rescale_data(X, y, sample_weight) # There should be either 1 or n_targets penalties alpha = np.asarray(alpha).ravel() if alpha.size not in [1, n_targets]: raise ValueError("Number of targets and number of penalties " "do not correspond: %d != %d" % (alpha.size, n_targets)) if alpha.size == 1 and n_targets > 1: alpha = np.repeat(alpha, n_targets) if solver not in ('sparse_cg', 'cholesky', 'svd', 'lsqr'): raise ValueError('Solver %s not understood' % solver) if solver == 'sparse_cg': coef = _solve_sparse_cg(X, y, alpha, max_iter, tol, verbose) elif solver == "lsqr": coef = _solve_lsqr(X, y, alpha, max_iter, tol) elif solver == 'cholesky': if n_features > n_samples: K = safe_sparse_dot(X, X.T, dense_output=True) try: dual_coef = _solve_cholesky_kernel(K, y, alpha) coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T except linalg.LinAlgError: # use SVD solver if matrix is singular solver = 'svd' else: try: coef = _solve_cholesky(X, y, alpha) except linalg.LinAlgError: # use SVD solver if matrix is singular solver = 'svd' if solver == 'svd': if sparse.issparse(X): raise TypeError('SVD solver does not support sparse' ' inputs currently') coef = _solve_svd(X, y, alpha) if ravel: # When y was passed as a 1d-array, we flatten the coefficients. coef = coef.ravel() return coef
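A minimal call of this helper through scikit-learn's public ridge_regression function, assuming scikit-learn is available; note that no intercept is fitted.

import numpy as np
from sklearn.linear_model import ridge_regression

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = X @ np.array([1.0, -2.0, 0.5])
coef = ridge_regression(X, y, alpha=1.0, solver='cholesky')  # shape (3,)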
def test_enet_sample_weight_consistency(fit_intercept, alpha, normalize, precompute): """Test that the impact of sample_weight is consistent.""" rng = np.random.RandomState(0) n_samples, n_features = 10, 5 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) params = dict(alpha=alpha, fit_intercept=fit_intercept, precompute=precompute, tol=1e-6, l1_ratio=0.5) reg = ElasticNet(**params).fit(X, y) coef = reg.coef_.copy() if fit_intercept: intercept = reg.intercept_ # sample_weight=np.ones(..) should be equivalent to sample_weight=None sample_weight = np.ones_like(y) reg.fit(X, y, sample_weight=sample_weight) assert_allclose(reg.coef_, coef, rtol=1e-6) if fit_intercept: assert_allclose(reg.intercept_, intercept) # sample_weight=None should be equivalent to sample_weight = number sample_weight = 123. reg.fit(X, y, sample_weight=sample_weight) assert_allclose(reg.coef_, coef, rtol=1e-6) if fit_intercept: assert_allclose(reg.intercept_, intercept) # scaling of sample_weight should have no effect, cf. np.average() sample_weight = 2 * np.ones_like(y) reg.fit(X, y, sample_weight=sample_weight) assert_allclose(reg.coef_, coef, rtol=1e-6) if fit_intercept: assert_allclose(reg.intercept_, intercept) # setting one element of sample_weight to 0 is equivalent to removing # the corresponding sample sample_weight = np.ones_like(y) sample_weight[-1] = 0 reg.fit(X, y, sample_weight=sample_weight) coef1 = reg.coef_.copy() if fit_intercept: intercept1 = reg.intercept_ reg.fit(X[:-1], y[:-1]) assert_allclose(reg.coef_, coef1, rtol=1e-6) if fit_intercept: assert_allclose(reg.intercept_, intercept1) # check that multiplying sample_weight by 2 is equivalent # to repeating corresponding samples twice if sparse.issparse(X): X = X.toarray() X2 = np.concatenate([X, X[:n_samples//2]], axis=0) y2 = np.concatenate([y, y[:n_samples//2]]) sample_weight_1 = np.ones(len(y)) sample_weight_1[:n_samples//2] = 2 reg1 = ElasticNet(**params).fit( X, y, sample_weight=sample_weight_1 ) reg2 = ElasticNet(**params).fit( X2, y2, sample_weight=None ) assert_allclose(reg1.coef_, reg2.coef_)
def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, sample_weight=None, return_mean=False, check_input=True): """ Centers data to have mean zero along axis 0. If fit_intercept=False or if the X is a sparse matrix, no centering is done, but normalization can still be applied. The function returns the statistics necessary to reconstruct the input data, which are X_offset, y_offset, X_scale, such that the output X = (X - X_offset) / X_scale X_scale is the L2 norm of X - X_offset. If sample_weight is not None, then the weighted mean of X and y is zero, and not the mean itself. If return_mean=True, the mean, eventually weighted, is returned, independently of whether X was centered (option used for optimization with sparse data in coordinate_descend). This is here because nearly all linear models will want their data to be centered. This function also systematically makes y consistent with X.dtype """ if isinstance(sample_weight, numbers.Number): sample_weight = None if check_input: X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'], dtype=FLOAT_DTYPES) elif copy: if sp.issparse(X): X = X.copy() else: X = X.copy(order='K') y = np.array(y, dtype=X.dtype, copy=copy, order='C') if fit_intercept: if sp.issparse(X): X_offset, X_var = mean_variance_axis(X, axis=0) if not return_mean: X_offset[:] = X.dtype.type(0) if normalize: # TODO: f_normalize could be used here as well but the function # inplace_csr_row_normalize_l2 must be changed such that it # can return also the norms computed internally # transform variance to norm in-place X_var *= X.shape[0] X_scale = np.sqrt(X_var, X_var) del X_var X_scale[X_scale == 0] = 1 inplace_column_scale(X, 1. / X_scale) else: X_scale = np.ones(X.shape[1], dtype=X.dtype) else: X_offset = np.average(X, axis=0, weights=sample_weight) X -= X_offset if normalize: X, X_scale = f_normalize(X, axis=0, copy=False, return_norm=True) else: X_scale = np.ones(X.shape[1], dtype=X.dtype) y_offset = np.average(y, axis=0, weights=sample_weight) y = y - y_offset else: X_offset = np.zeros(X.shape[1], dtype=X.dtype) X_scale = np.ones(X.shape[1], dtype=X.dtype) if y.ndim == 1: y_offset = X.dtype.type(0) else: y_offset = np.zeros(y.shape[1], dtype=X.dtype) return X, y, X_offset, y_offset, X_scale
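The contract documented above (with normalization, X_scale is the L2 norm of X - X_offset for dense input) can be checked with plain NumPy. This is only a sketch of the relationship, not a call into the private helper.

import numpy as np

X_raw = np.array([[1.0, 10.0], [3.0, 30.0], [5.0, 20.0]])
X_offset = X_raw.mean(axis=0)
X_centered = X_raw - X_offset
X_scale = np.sqrt((X_centered ** 2).sum(axis=0))  # L2 norm of X - X_offset
X_out = X_centered / X_scale
# Each column of the preprocessed design now has unit L2 norm.
assert np.allclose(np.linalg.norm(X_out, axis=0), 1.0)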
def plot_contours(A, Cn, thr=None, thr_method='max', maxthr=0.2, nrgthr=0.9, display_numbers=True, max_number=None, cmap=None, swap_dim=False, colors='w', vmin=None, vmax=None, **kwargs): """Plots contour of spatial components against a background image and returns their coordinates Parameters: ----------- A: np.ndarray or sparse matrix Matrix of Spatial components (d x K) Cn: np.ndarray (2D) Background image (e.g. mean, correlation) thr_method: [optional] string Method of thresholding: 'max' sets to zero pixels that have value less than a fraction of the max value 'nrg' keeps the pixels that contribute up to a specified fraction of the energy maxthr: [optional] scalar Threshold of max value nrgthr: [optional] scalar Threshold of energy thr: scalar between 0 and 1 Energy threshold for computing contours (default 0.9) Kept for backwards compatibility. If not None then thr_method = 'nrg', and nrgthr = thr display_number: Boolean Display number of ROIs if checked (default True) max_number: int Display the number for only the first max_number components (default None, display all numbers) cmap: string User specifies the colormap (default None, default colormap) Returns: -------- Coor: list of coordinates with center of mass, contour plot coordinates and bounding box for each component """ if issparse(A): A = np.array(A.todense()) else: A = np.array(A) if swap_dim: Cn = Cn.T print('Swapping dim') d1, d2 = np.shape(Cn) d, nr = np.shape(A) if max_number is None: max_number = nr if thr is not None: thr_method = 'nrg' nrgthr = thr warn("The way to call utilities.plot_contours has changed. Look at the definition for more details.") x, y = np.mgrid[0:d1:1, 0:d2:1] ax = pl.gca() if vmax is None and vmin is None: pl.imshow(Cn, interpolation=None, cmap=cmap, vmin=np.percentile(Cn[~np.isnan(Cn)], 1), vmax=np.percentile(Cn[~np.isnan(Cn)], 99)) else: pl.imshow(Cn, interpolation=None, cmap=cmap, vmin=vmin, vmax=vmax) coordinates = [] cm = com(A, d1, d2) for i in range(np.minimum(nr, max_number)): pars = dict(kwargs) if thr_method == 'nrg': indx = np.argsort(A[:, i], axis=None)[::-1] cumEn = np.cumsum(A[:, i].flatten()[indx]**2) cumEn /= cumEn[-1] Bvec = np.zeros(d) Bvec[indx] = cumEn thr = nrgthr else: # thr_method = 'max' if thr_method != 'max': warn("Unknown threshold method. 
Choosing max") Bvec = A[:, i].flatten() Bvec /= np.max(Bvec) thr = maxthr if swap_dim: Bmat = np.reshape(Bvec, np.shape(Cn), order='C') else: Bmat = np.reshape(Bvec, np.shape(Cn), order='F') cs = pl.contour(y, x, Bmat, [thr], colors=colors) # this fix is necessary for having disjoint figures and borders plotted correctly p = cs.collections[0].get_paths() v = np.atleast_2d([np.nan, np.nan]) for pths in p: vtx = pths.vertices num_close_coords = np.sum(np.isclose(vtx[0, :], vtx[-1, :])) if num_close_coords < 2: if num_close_coords == 0: # case angle newpt = np.round(old_div(vtx[-1, :], [d2, d1])) * [d2, d1] #import ipdb; ipdb.set_trace() vtx = np.concatenate((vtx, newpt[np.newaxis, :]), axis=0) else: # case one is border vtx = np.concatenate((vtx, vtx[0, np.newaxis]), axis=0) #import ipdb; ipdb.set_trace() v = np.concatenate((v, vtx, np.atleast_2d([np.nan, np.nan])), axis=0) pars['CoM'] = np.squeeze(cm[i, :]) pars['coordinates'] = v pars['bbox'] = [np.floor(np.min(v[:, 1])), np.ceil(np.max(v[:, 1])), np.floor(np.min(v[:, 0])), np.ceil(np.max(v[:, 0]))] pars['neuron_id'] = i + 1 coordinates.append(pars) if display_numbers: for i in range(np.minimum(nr, max_number)): if swap_dim: ax.text(cm[i, 0], cm[i, 1], str(i + 1), color=colors) else: ax.text(cm[i, 1], cm[i, 0], str(i + 1), color=colors) return coordinates
def _sort_if_sparse(X): if issparse(X) and not X.has_sorted_indices: X.sort_indices()
def fit(self, X, y=None, sample_weight=None): """ Fit estimator. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Use ``dtype=np.float32`` for maximum efficiency. Sparse matrices are also supported, use sparse ``csc_matrix`` for maximum efficiency. y : Ignored Not used, present for API consistency by convention. sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Returns ------- self : object Fitted estimator. """ X = check_array(X, accept_sparse=['csc']) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. X.sort_indices() rnd = check_random_state(self.random_state) y = rnd.uniform(size=X.shape[0]) # ensure that max_sample is in [1, n_samples]: n_samples = X.shape[0] if isinstance(self.max_samples, str): if self.max_samples == 'auto': max_samples = min(256, n_samples) else: raise ValueError('max_samples (%s) is not supported.' 'Valid choices are: "auto", int or' 'float' % self.max_samples) elif isinstance(self.max_samples, numbers.Integral): if self.max_samples > n_samples: warn("max_samples (%s) is greater than the " "total number of samples (%s). max_samples " "will be set to n_samples for estimation." % (self.max_samples, n_samples)) max_samples = n_samples else: max_samples = self.max_samples else: # float if not 0. < self.max_samples <= 1.: raise ValueError("max_samples must be in (0, 1], got %r" % self.max_samples) max_samples = int(self.max_samples * X.shape[0]) self.max_samples_ = max_samples max_depth = int(np.ceil(np.log2(max(max_samples, 2)))) super()._fit(X, y, max_samples, max_depth=max_depth, sample_weight=sample_weight) if self.contamination == "auto": # 0.5 plays a special role as described in the original paper. # we take the opposite as we consider the opposite of their score. self.offset_ = -0.5 return self # else, define offset_ wrt contamination parameter self.offset_ = np.percentile(self.score_samples(X), 100. * self.contamination) return self
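As the docstring above recommends, sparse CSC input can be passed directly to fit; a short illustrative run, assuming scikit-learn and SciPy are installed.

import numpy as np
from scipy import sparse
from sklearn.ensemble import IsolationForest

X = sparse.random(100, 5, density=0.3, format="csc", random_state=0)
clf = IsolationForest(max_samples="auto", random_state=0).fit(X)
scores = clf.score_samples(X)  # lower scores are more anomalous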
def label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False): """Binarize labels in a one-vs-all fashion Several regression and binary classification algorithms are available in scikit-learn. A simple way to extend these algorithms to the multi-class classification case is to use the so-called one-vs-all scheme. This function makes it possible to compute this transformation for a fixed set of class labels known ahead of time. Parameters ---------- y : array-like Sequence of integer labels or multilabel data to encode. classes : array-like of shape [n_classes] Uniquely holds the label for each class. neg_label : int (default: 0) Value with which negative labels must be encoded. pos_label : int (default: 1) Value with which positive labels must be encoded. sparse_output : boolean (default: False), Set to true if output binary array is desired in CSR sparse format Returns ------- Y : numpy array or CSR matrix of shape [n_samples, n_classes] Shape will be [n_samples, 1] for binary problems. Examples -------- >>> from mrex.preprocessing import label_binarize >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) array([[1, 0, 0, 0], [0, 0, 0, 1]]) The class ordering is preserved: >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) array([[1, 0, 0, 0], [0, 1, 0, 0]]) Binary targets transform to a column vector >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) array([[1], [0], [0], [1]]) See also -------- LabelBinarizer : class used to wrap the functionality of label_binarize and allow for fitting to classes independently of the transform operation """ if not isinstance(y, list): # XXX Workaround that will be removed when list of list format is # dropped y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None) else: if _num_samples(y) == 0: raise ValueError('y has 0 samples: %r' % y) if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " "pos_label={1}.".format(neg_label, pos_label)) if (sparse_output and (pos_label == 0 or neg_label != 0)): raise ValueError("Sparse binarization is only supported with non " "zero pos_label and zero neg_label, got " "pos_label={0} and neg_label={1}" "".format(pos_label, neg_label)) # To account for pos_label == 0 in the dense case pos_switch = pos_label == 0 if pos_switch: pos_label = -neg_label y_type = type_of_target(y) if 'multioutput' in y_type: raise ValueError("Multioutput target data is not supported with label " "binarization") if y_type == 'unknown': raise ValueError("The type of target data is not known") n_samples = y.shape[0] if sp.issparse(y) else len(y) n_classes = len(classes) classes = np.asarray(classes) if y_type == "binary": if n_classes == 1: if sparse_output: return sp.csr_matrix((n_samples, 1), dtype=int) else: Y = np.zeros((len(y), 1), dtype=np.int) Y += neg_label return Y elif len(classes) >= 3: y_type = "multiclass" sorted_class = np.sort(classes) if (y_type == "multilabel-indicator" and classes.size != y.shape[1]): raise ValueError("classes {0} missmatch with the labels {1}" "found in the data".format(classes, unique_labels(y))) if y_type in ("binary", "multiclass"): y = column_or_1d(y) # pick out the known labels from y y_in_classes = np.in1d(y, classes) y_seen = y[y_in_classes] indices = np.searchsorted(sorted_class, y_seen) indptr = np.hstack((0, np.cumsum(y_in_classes))) data = np.empty_like(indices) data.fill(pos_label) Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes)) elif y_type == "multilabel-indicator": Y = sp.csr_matrix(y) if 
pos_label != 1: data = np.empty_like(Y.data) data.fill(pos_label) Y.data = data else: raise ValueError("%s target data is not supported with label " "binarization" % y_type) if not sparse_output: Y = Y.toarray() Y = Y.astype(int, copy=False) if neg_label != 0: Y[Y == 0] = neg_label if pos_switch: Y[Y == pos_label] = 0 else: Y.data = Y.data.astype(int, copy=False) # preserve label ordering if np.any(classes != sorted_class): indices = np.searchsorted(sorted_class, classes) Y = Y[:, indices] if y_type == "binary": if sparse_output: Y = Y.getcol(-1) else: Y = Y[:, -1].reshape((-1, 1)) return Y
def _declare_partials(self, of, wrt, dependent=True, rows=None, cols=None, val=None): """ Store subjacobian metadata for later use. Parameters ---------- of : str or list of str The name of the residual(s) that derivatives are being computed for. May also contain a glob pattern. wrt : str or list of str The name of the variables that derivatives are taken with respect to. This can contain the name of any input or output variable. May also contain a glob pattern. dependent : bool(True) If False, specifies no dependence between the output(s) and the input(s). This is only necessary in the case of a sparse global jacobian, because if 'dependent=False' is not specified and declare_partials is not called for a given pair, then a dense matrix of zeros will be allocated in the sparse global jacobian for that pair. In the case of a dense global jacobian it doesn't matter because the space for a dense subjac will always be allocated for every pair. rows : ndarray of int or None Row indices for each nonzero entry. For sparse subjacobians only. cols : ndarray of int or None Column indices for each nonzero entry. For sparse subjacobians only. val : float or ndarray of float or scipy.sparse Value of subjacobian. If rows and cols are not None, this will contain the values found at each (row, col) location in the subjac. """ if dependent and val is not None and not issparse(val): val = np.atleast_1d(val) # np.promote_types will choose the smallest dtype that can contain both arguments safe_dtype = np.promote_types(val.dtype, float) val = val.astype(safe_dtype, copy=False) if dependent and rows is not None: rows = np.array(rows, dtype=int, copy=False) cols = np.array(cols, dtype=int, copy=False) if rows.shape != cols.shape: raise ValueError('rows and cols must have the same shape,' ' rows: {}, cols: {}'.format( rows.shape, cols.shape)) if val is not None and val.shape != ( 1, ) and rows.shape != val.shape: raise ValueError( 'If rows and cols are specified, val must be a scalar or have the ' 'same shape, val: {}, rows/cols: {}'.format( val.shape, rows.shape)) if val is None: val = np.zeros_like(rows, dtype=float) pattern_matches = self._find_partial_matches(of, wrt) multiple_items = False for of_bundle, wrt_bundle in product(*pattern_matches): of_pattern, of_matches = of_bundle wrt_pattern, wrt_matches = wrt_bundle if not of_matches: raise ValueError( 'No matches were found for of="{}"'.format(of_pattern)) if not wrt_matches: raise ValueError( 'No matches were found for wrt="{}"'.format(wrt_pattern)) make_copies = (multiple_items or len(of_matches) > 1 or len(wrt_matches) > 1) # Setting this to true means that future loop iterations (i.e. if there are multiple # items in either of or wrt) will make copies. multiple_items = True for rel_key in product(of_matches, wrt_matches): abs_key = rel_key2abs_key(self, rel_key) if not dependent: if abs_key in self._subjacs_info: del self._subjacs_info[abs_key] continue meta_changes = { 'rows': rows, 'cols': cols, 'value': deepcopy(val) if make_copies else val, 'dependent': dependent } if abs_key in self._subjacs_info: meta = self._subjacs_info[abs_key] else: meta = SUBJAC_META_DEFAULTS.copy() meta.update(meta_changes) self._check_partials_meta(abs_key, meta) self._subjacs_info[abs_key] = meta
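In user code this metadata is normally supplied through OpenMDAO's public declare_partials call. The component below is a hedged sketch with made-up names, showing a diagonal rows/cols/val subjacobian of the kind handled above.

import numpy as np
import openmdao.api as om


class Doubler(om.ExplicitComponent):
    def setup(self):
        self.add_input('x', shape=3)
        self.add_output('y', shape=3)
        # Only the diagonal of d(y)/d(x) is nonzero, and it is constant (2.0).
        self.declare_partials('y', 'x', rows=np.arange(3), cols=np.arange(3), val=2.0)

    def compute(self, inputs, outputs):
        outputs['y'] = 2.0 * inputs['x']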
def _integrate(self, model, t_eval, inputs=None): """ Solve a model defined by dydt with initial conditions y0. Parameters ---------- model : :class:`pybamm.BaseModel` The model whose solution to calculate. t_eval : numeric type The times at which to compute the solution inputs : dict, optional Any input parameters to pass to the model when solving """ derivs = model.rhs_eval y0 = model.y0 events = model.terminate_events_eval jacobian = model.jacobian_eval def eqsydot(t, y, return_ydot): return_ydot[:] = derivs(t, y) def rootfn(t, y, return_root): return_root[:] = [event(t, y) for event in events] if jacobian: jac_y0_t0 = jacobian(t_eval[0], y0) if sparse.issparse(jac_y0_t0): def jacfn(t, y, fy, J): J[:][:] = jacobian(t, y).toarray() def jac_times_vecfn(v, Jv, t, y, userdata): Jv[:] = userdata._jac_eval * v return 0 else: def jacfn(t, y, fy, J): J[:][:] = jacobian(t, y) def jac_times_vecfn(v, Jv, t, y, userdata): Jv[:] = np.matmul(userdata._jac_eval, v) return 0 def jac_times_setupfn(t, y, fy, userdata): userdata._jac_eval = jacobian(t, y) return 0 extra_options = { "old_api": False, "rtol": self.rtol, "atol": self.atol, "linsolver": self.linsolver, } if jacobian: if self.linsolver in ("dense", "lapackdense"): extra_options.update({"jacfn": jacfn}) elif self.linsolver in ("spgmr", "spbcgs", "sptfqmr"): extra_options.update({ "jac_times_setupfn": jac_times_setupfn, "jac_times_vecfn": jac_times_vecfn, "user_data": self, }) if events: extra_options.update({"rootfn": rootfn, "nr_rootfns": len(events)}) ode_solver = scikits_odes.ode(self.method, eqsydot, **extra_options) sol = ode_solver.solve(t_eval, y0) # return solution, we need to tranpose y to match scipy's ivp interface if sol.flag in [0, 2]: # 0 = solved for all t_eval if sol.flag == 0: termination = "final time" # 2 = found root(s) elif sol.flag == 2: termination = "event" if sol.roots.t is None: t_root = None else: t_root = sol.roots.t return pybamm.Solution( sol.values.t, np.transpose(sol.values.y), t_root, np.transpose(sol.roots.y), termination, ) else: raise pybamm.SolverError(sol.message)
def transform(self, X): """Impute all missing values in X. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. """ if self.axis == 0: check_is_fitted(self, 'statistics_') X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, force_all_finite=False, copy=self.copy) statistics = self.statistics_ if X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" % (X.shape[1], self.statistics_.shape[0])) # Since two different arrays can be provided in fit(X) and # transform(X), the imputation data need to be recomputed # when the imputation is done per sample else: X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, force_all_finite=False, copy=self.copy) if sparse.issparse(X): statistics = self._sparse_fit(X, self.strategy, self.missing_values, self.axis) else: statistics = self._dense_fit(X, self.strategy, self.missing_values, self.axis) # Delete the invalid rows/columns invalid_mask = np.isnan(statistics) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.where(valid_mask)[0] missing = np.arange(X.shape[not self.axis])[invalid_mask] if self.axis == 0 and invalid_mask.any(): if self.verbose: warnings.warn("Deleting features without " "observed values: %s" % missing) X = X[:, valid_statistics_indexes] elif self.axis == 1 and invalid_mask.any(): raise ValueError("Some rows only contain " "missing values: %s" % missing) # Do actual imputation if sparse.issparse(X) and self.missing_values != 0: mask = _get_mask(X.data, self.missing_values) indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), np.diff(X.indptr))[mask] X.data[mask] = valid_statistics[indexes].astype(X.dtype, copy=False) else: if sparse.issparse(X): X = X.toarray() mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=self.axis) values = np.repeat(valid_statistics, n_missing) if self.axis == 0: coordinates = np.where(mask.transpose())[::-1] else: coordinates = mask X[coordinates] = values return X
def partial_fit(self, X, y=None, check_input=True): """Incremental fit with X. All of X is processed as a single batch. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. check_input : bool, default=True Run check_array on X. y : Ignored Returns ------- self : object Returns the instance itself. """ first_pass = not hasattr(self, "components_") if check_input: if sparse.issparse(X): raise TypeError( "IncrementalPCA.partial_fit does not support " "sparse input. Either convert data to dense " "or use IncrementalPCA.fit to do so in batches.") X = self._validate_data(X, copy=self.copy, dtype=[np.float64, np.float32], reset=first_pass) n_samples, n_features = X.shape if first_pass: self.components_ = None if self.n_components is None: if self.components_ is None: self.n_components_ = min(n_samples, n_features) else: self.n_components_ = self.components_.shape[0] elif not 1 <= self.n_components <= n_features: raise ValueError("n_components=%r invalid for n_features=%d, need " "more rows than columns for IncrementalPCA " "processing" % (self.n_components, n_features)) elif not self.n_components <= n_samples: raise ValueError("n_components=%r must be less or equal to " "the batch number of samples " "%d." % (self.n_components, n_samples)) else: self.n_components_ = self.n_components if (self.components_ is not None) and (self.components_.shape[0] != self.n_components_): raise ValueError("Number of input features has changed from %i " "to %i between calls to partial_fit! Try " "setting n_components to a fixed value." % (self.components_.shape[0], self.n_components_)) # This is the first partial_fit if not hasattr(self, 'n_samples_seen_'): self.n_samples_seen_ = 0 self.mean_ = .0 self.var_ = .0 # Update stats - they are 0 if this is the first step col_mean, col_var, n_total_samples = \ _incremental_mean_and_var( X, last_mean=self.mean_, last_variance=self.var_, last_sample_count=np.repeat(self.n_samples_seen_, X.shape[1])) n_total_samples = n_total_samples[0] # Whitening if self.n_samples_seen_ == 0: # If it is the first step, simply whiten X X -= col_mean else: col_batch_mean = np.mean(X, axis=0) X -= col_batch_mean # Build matrix of combined previous basis and new data mean_correction = \ np.sqrt((self.n_samples_seen_ / n_total_samples) * n_samples) * (self.mean_ - col_batch_mean) X = np.vstack((self.singular_values_.reshape( (-1, 1)) * self.components_, X, mean_correction)) U, S, Vt = linalg.svd(X, full_matrices=False, check_finite=False) U, Vt = svd_flip(U, Vt, u_based_decision=False) explained_variance = S**2 / (n_total_samples - 1) explained_variance_ratio = S**2 / np.sum(col_var * n_total_samples) self.n_samples_seen_ = n_total_samples self.components_ = Vt[:self.n_components_] self.singular_values_ = S[:self.n_components_] self.mean_ = col_mean self.var_ = col_var self.explained_variance_ = explained_variance[:self.n_components_] self.explained_variance_ratio_ = \ explained_variance_ratio[:self.n_components_] if self.n_components_ < n_features: self.noise_variance_ = \ explained_variance[self.n_components_:].mean() else: self.noise_variance_ = 0. return self
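A batched use of the partial_fit logic above via the public IncrementalPCA API (dense batches only, since sparse input is rejected by this method):

import numpy as np
from sklearn.decomposition import IncrementalPCA

X = np.random.RandomState(0).rand(200, 10)
ipca = IncrementalPCA(n_components=3)
for batch in np.array_split(X, 4):   # four batches of 50 samples
    ipca.partial_fit(batch)
X_reduced = ipca.transform(X)        # shape (200, 3)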
def query(self, query_point, num_results=None, distance_func=None): """ Takes `query_point` which is a sparse CSR matrix of 1 x `input_dim`, returns `num_results` of results as a list of tuples that are ranked based on the supplied metric function `distance_func`. :param query_point: A sparse CSR matrix. The dimension needs to be 1 * `input_dim`. Used by :meth:`._hash`. :param num_results: (optional) Integer, specifies the max amount of results to be returned. If not specified all candidates will be returned as a list in ranked order. NOTE: You do not save processing by limiting the results. Currently, a similarity ranking and sort is done on all items in the hashtable. :param distance_func: (optional) The distance function to be used. Currently it needs to be one of ("hamming", "euclidean", "true_euclidean", "centred_euclidean", "cosine", "l1norm"). By default "euclidean" will used. """ assert sparse.issparse(query_point), "query_point needs to be sparse" candidates = [] if not distance_func: distance_func = "euclidean" for i, table in enumerate(self.hash_tables): # get hash of query point binary_hash = self._hash(self.uniform_planes[i], query_point) for key in list(table.keys()): # calculate distance from query point hash to all hashes distance = LSH.hamming_dist( self._string_bits_to_array(key), self._string_bits_to_array(binary_hash)) # NOTE: we could make this threshold user defined if distance < 2: members = table.get_list(key) candidates.extend(members) d_func = LSH.euclidean_dist_square else: if distance_func == "euclidean": d_func = LSH.euclidean_dist_square elif distance_func == "true_euclidean": d_func = LSH.euclidean_dist elif distance_func == "centred_euclidean": d_func = LSH.euclidean_dist_centred elif distance_func == "cosine": d_func = LSH.cosine_dist elif distance_func == "l1norm": d_func = LSH.l1norm_dist else: raise ValueError("The distance function name is invalid.") # TODO: pull out into fn w/ optional threshold arg for i, table in enumerate(self.hash_tables): binary_hash = self._hash(self.uniform_planes[i], query_point) candidates.extend(table.get_list(binary_hash)[0]) # # rank candidates by distance function ranked_candidates = [] for ix in candidates: point = self._as_np_array(ix) dist = d_func(query_point, point) ranked_candidates.append((ix, dist)) # TODO: stop sorting when we have top num_results, instead of truncating # TODO: (do this by replacing set with ordered set) # after we've done the entire list ranked_candidates.sort(key=lambda x: x[1]) return ranked_candidates[:num_results] if num_results else ranked_candidates
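The Hamming pre-filter used above (keep buckets whose binary hash differs from the query hash in fewer than 2 bits) can be sketched on its own; the bucket contents below are made up for illustration.

import numpy as np


def hamming_dist(bits_a, bits_b):
    a = np.asarray(list(bits_a), dtype=int)
    b = np.asarray(list(bits_b), dtype=int)
    return int(np.sum(a != b))


query_hash = "10110"
buckets = {"10110": ["a"], "10010": ["b"], "01001": ["c"]}
candidates = [m for key, members in buckets.items()
              if hamming_dist(key, query_hash) < 2 for m in members]
print(candidates)  # ['a', 'b']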
def resample(*arrays, **options): """Resample arrays or sparse matrices in a consistent way The default strategy implements one step of the bootstrapping procedure. Parameters ---------- *arrays : sequence of indexable data-structures Indexable data-structures can be arrays, lists, dataframes or scipy sparse matrices with consistent first dimension. replace : boolean, True by default Implements resampling with replacement. If False, this will implement (sliced) random permutations. n_samples : int, None by default Number of samples to generate. If left to None this is automatically set to the first dimension of the arrays. If replace is False it should not be larger than the length of arrays. random_state : int, RandomState instance or None, optional (default=None) The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Returns ------- resampled_arrays : sequence of indexable data-structures Sequence of resampled views of the collections. The original arrays are not impacted. Examples -------- It is possible to mix sparse and dense arrays in the same run:: >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]]) >>> y = np.array([0, 1, 2]) >>> from scipy.sparse import coo_matrix >>> X_sparse = coo_matrix(X) >>> from sklearn.utils import resample >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0) >>> X array([[ 1., 0.], [ 2., 1.], [ 1., 0.]]) >>> X_sparse # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE <3x2 sparse matrix of type '<... 'numpy.float64'>' with 4 stored elements in Compressed Sparse Row format> >>> X_sparse.toarray() array([[ 1., 0.], [ 2., 1.], [ 1., 0.]]) >>> y array([0, 1, 0]) >>> resample(y, n_samples=2, random_state=0) array([0, 1]) See also -------- :func:`sklearn.utils.shuffle` """ random_state = check_random_state(options.pop('random_state', None)) replace = options.pop('replace', True) max_n_samples = options.pop('n_samples', None) if options: raise ValueError("Unexpected kw arguments: %r" % options.keys()) if len(arrays) == 0: return None first = arrays[0] n_samples = first.shape[0] if hasattr(first, 'shape') else len(first) if max_n_samples is None: max_n_samples = n_samples elif (max_n_samples > n_samples) and (not replace): raise ValueError("Cannot sample %d out of arrays with dim %d " "when replace is False" % (max_n_samples, n_samples)) check_consistent_length(*arrays) if replace: indices = random_state.randint(0, n_samples, size=(max_n_samples,)) else: indices = np.arange(n_samples) random_state.shuffle(indices) indices = indices[:max_n_samples] # convert sparse matrices to CSR for row-based indexing arrays = [a.tocsr() if issparse(a) else a for a in arrays] resampled_arrays = [safe_indexing(a, indices) for a in arrays] if len(resampled_arrays) == 1: # syntactic sugar for the unit argument case return resampled_arrays[0] else: return resampled_arrays
def correlation_matrix(
    adata: AnnData,
    name_list: Optional[Collection[str]] = None,
    groupby: Optional[str] = None,
    group: Optional[int] = None,
    n_genes: int = 20,
    data: Literal['Complete', 'Group', 'Rest'] = 'Complete',
    method: Literal['pearson', 'kendall', 'spearman'] = 'pearson',
    annotation_key: Optional[str] = None,
) -> None:
    """\
    Calculate a correlation matrix.

    Calculate a correlation matrix for genes stored in sample annotation
    using :func:`~scanpy.tl.rank_genes_groups`.

    Parameters
    ----------
    adata
        Annotated data matrix.
    name_list
        Takes a list of genes for which to calculate the correlation matrix.
    groupby
        If no name list is passed, genes are selected from the results of
        rank_genes_groups. Then this is the key of the sample grouping to
        consider. Note that in this case a group index also has to be
        specified.
    group
        Group index for which the correlation matrix for top-ranked genes
        should be calculated. Currently only int is supported; this will
        change soon.
    n_genes
        For how many genes to calculate the correlation matrix. If specified,
        truncates the name list (in whatever order it is passed).
    data
        At the moment, this is only relevant when name_list is drawn from
        rank_genes_groups results. If specified, collects the mask for the
        given group and then takes only those cells.
        If 'Complete', calculate the correlation using the full data.
        If 'Group', calculate the correlation within the selected group.
        If 'Rest', calculate the correlation for everything except the group.
    method
        Which kind of correlation coefficient to use

            pearson
                standard correlation coefficient
            kendall
                Kendall Tau correlation coefficient
            spearman
                Spearman rank correlation
    annotation_key
        Allows defining the name of the anndata entry where results are
        stored.
    """
    # TODO: At the moment, only works for int identifiers

    # If no genes are passed, select ranked genes from sample annotation.
    # At the moment, only calculate one table (think about what comes next).
    if name_list is None:
        name_list = list()
        for j, k in enumerate(adata.uns['rank_genes_groups_gene_names']):
            if j >= n_genes:
                break
            name_list.append(
                adata.uns['rank_genes_groups_gene_names'][j][group])
    else:
        if len(name_list) > n_genes:
            name_list = name_list[0:n_genes]

    # If a special method is added later, truncate here
    adata_relevant = adata[:, name_list]
    # This just makes group_mask access easier. Nothing but 'all' will stand here.
    groups = 'all'
    # Distinguish between sparse and non-sparse data
    if data == 'Complete' or groupby is None:
        if issparse(adata_relevant.X):
            Data_array = adata_relevant.X.todense()
        else:
            Data_array = adata_relevant.X
    else:
        # get group_mask
        groups_order, groups_masks = select_groups(adata, groups, groupby)
        if data == 'Group':
            if issparse(adata_relevant.X):
                Data_array = adata_relevant.X[groups_masks[group], :].todense()
            else:
                Data_array = adata_relevant.X[groups_masks[group], :]
        elif data == 'Rest':
            if issparse(adata_relevant.X):
                Data_array = adata_relevant.X[~groups_masks[group], :].todense()
            else:
                Data_array = adata_relevant.X[~groups_masks[group], :]
        else:
            logg.error(
                'data argument should be either <Complete> or <Group> or <Rest>'
            )
    DF_array = pd.DataFrame(Data_array, columns=name_list)
    cor_table = DF_array.corr(method=method)
    if annotation_key is None:
        if groupby is None:
            adata.uns['Correlation_matrix'] = cor_table
        else:
            adata.uns['Correlation_matrix' + groupby + str(group)] = cor_table
    else:
        adata.uns[annotation_key] = cor_table
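# Hedged usage sketch for correlation_matrix above. It assumes rank_genes_groups
# has already been run and that the ranked gene names are mirrored into the
# (older) adata.uns['rank_genes_groups_gene_names'] layout this function reads;
# the dataset and annotation_key are illustrative choices, not part of the API.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()            # any annotated dataset works
sc.tl.rank_genes_groups(adata, groupby='bulk_labels')
# newer scanpy stores names under adata.uns['rank_genes_groups']['names'];
# mirror them into the key the function expects
adata.uns['rank_genes_groups_gene_names'] = adata.uns['rank_genes_groups']['names']

correlation_matrix(adata, groupby='bulk_labels', group=0, n_genes=10,
                   annotation_key='corr_top10_group0')
cor = adata.uns['corr_top10_group0']             # pandas DataFrame, genes x genes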
def explain_prediction_xgboost( xgb, doc, vec=None, top=None, top_targets=None, target_names=None, targets=None, feature_names=None, feature_re=None, # type: Pattern[str] feature_filter=None, vectorized=False, # type: bool is_regression=None, # type: bool missing=None, # type: bool ): """ Return an explanation of XGBoost prediction (via scikit-learn wrapper XGBClassifier or XGBRegressor, or via xgboost.Booster) as feature weights. See :func:`eli5.explain_prediction` for description of ``top``, ``top_targets``, ``target_names``, ``targets``, ``feature_names``, ``feature_re`` and ``feature_filter`` parameters. Parameters ---------- vec : vectorizer, optional A vectorizer instance used to transform raw features to the input of the estimator ``xgb`` (e.g. a fitted CountVectorizer instance); you can pass it instead of ``feature_names``. vectorized : bool, optional A flag which tells eli5 if ``doc`` should be passed through ``vec`` or not. By default it is False, meaning that if ``vec`` is not None, ``vec.transform([doc])`` is passed to the estimator. Set it to True if you're passing ``vec``, but ``doc`` is already vectorized. is_regression : bool, optional Pass if an ``xgboost.Booster`` is passed as the first argument. True if solving a regression problem ("objective" starts with "reg") and False for a classification problem. If not set, regression is assumed for a single target estimator and proba will not be shown. missing : optional Pass if an ``xgboost.Booster`` is passed as the first argument. Set it to the same value as the ``missing`` argument to ``xgboost.DMatrix``. Matters only if sparse values are used. Default is ``np.nan``. Method for determining feature importances follows an idea from http://blog.datadive.net/interpreting-random-forests/. Feature weights are calculated by following decision paths in trees of an ensemble. Each leaf has an output score, and expected scores can also be assigned to parent nodes. Contribution of one feature on the decision path is how much expected score changes from parent to child. Weights of all features sum to the output score of the estimator. """ booster, is_regression = _check_booster_args(xgb, is_regression) xgb_feature_names = booster.feature_names vec, feature_names = handle_vec(xgb, doc, vec, vectorized, feature_names, num_features=len(xgb_feature_names)) if feature_names.bias_name is None: # XGBoost estimators do not have an intercept, but here we interpret # them as having an intercept feature_names.bias_name = '<BIAS>' X = get_X(doc, vec, vectorized=vectorized) if sp.issparse(X): # Work around XGBoost issue: # https://github.com/dmlc/xgboost/issues/1238#issuecomment-243872543 X = X.tocsc() if missing is None: missing = np.nan if isinstance(xgb, Booster) else xgb.missing dmatrix = DMatrix(X, missing=missing) if isinstance(xgb, Booster): prediction = xgb.predict(dmatrix) n_targets = prediction.shape[-1] # type: int if is_regression is None: # When n_targets is 1, this can be classification too, # but it's safer to assume regression. # If n_targets > 1, it must be classification. 
is_regression = n_targets == 1 if is_regression: proba = None else: if n_targets == 1: p, = prediction proba = np.array([1 - p, p]) else: proba, = prediction else: proba = predict_proba(xgb, X) n_targets = _xgb_n_targets(xgb) if is_regression: names = ['y'] elif isinstance(xgb, Booster): names = np.arange(max(2, n_targets)) else: names = xgb.classes_ scores_weights = _prediction_feature_weights(booster, dmatrix, n_targets, feature_names, xgb_feature_names) x = get_X0(add_intercept(X)) x = _missing_values_set_to_nan(x, missing, sparse_missing=True) return get_decision_path_explanation( xgb, doc, vec, x=x, feature_names=feature_names, feature_filter=feature_filter, feature_re=feature_re, top=top, vectorized=vectorized, original_display_names=names, target_names=target_names, targets=targets, top_targets=top_targets, is_regression=is_regression, is_multiclass=n_targets > 1, proba=proba, get_score_weights=lambda label_id: scores_weights[label_id], )
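# Hedged usage sketch for explain_prediction_xgboost. With a compatible
# eli5/xgboost install, eli5 dispatches to this function through
# eli5.explain_prediction when given an XGBClassifier/XGBRegressor or an
# xgboost.Booster; the toy data and feature names below are made up.
import numpy as np
import eli5
from xgboost import XGBClassifier

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
clf = XGBClassifier(n_estimators=10).fit(X, y)

# per-feature contributions for a single prediction, plus the <BIAS> term
expl = eli5.explain_prediction(clf, X[0],
                               feature_names=['f%d' % i for i in range(5)])
print(eli5.format_as_text(expl))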
def ROC_AUC_analysis(
    adata: AnnData,
    groupby: str,
    group: Optional[str] = None,
    n_genes: int = 100,
):
    """\
    Calculate ROC curves and AUC values.

    Calculate receiver operating characteristic curves and the area under the
    curve for the top-ranked genes stored in sample annotation.

    Parameters
    ----------
    adata
        Annotated data matrix.
    groupby
        The key of the sample grouping to consider.
    group
        Group name or index for which ROC/AUC for top-ranked genes should be
        calculated. If no parameter is passed, ROC/AUC is calculated for all
        groups.
    n_genes
        For how many genes to calculate ROC and AUC. If no parameter is
        passed, calculation is done for all stored top-ranked genes.
    """
    if group is None:
        pass
        # TODO: Loop over all groups instead of just taking one.

    # Assume group takes an int value for one group for the moment.
    name_list = list()
    for j, k in enumerate(adata.uns['rank_genes_groups_gene_names']):
        if j >= n_genes:
            break
        name_list.append(adata.uns['rank_genes_groups_gene_names'][j][group])

    # TODO: For the moment, only comparison against the rest is handled.
    # Resolve remaining issues later.
    groups = 'all'
    groups_order, groups_masks = select_groups(adata, groups, groupby)

    # Use the usual convention, better for looping later.
    mask = groups_masks[group]

    # TODO: Allowing sample weighting requires better mask access... later

    # We store calculated data in dicts and access it via a dict of dicts.
    # Check if this is the best way.
    fpr = {}
    tpr = {}
    thresholds = {}
    roc_auc = {}
    y_true = mask
    for i, j in enumerate(name_list):
        vec = adata[:, [j]].X
        if issparse(vec):
            y_score = vec.todense()
        else:
            y_score = vec

        (
            fpr[name_list[i]],
            tpr[name_list[i]],
            thresholds[name_list[i]],
        ) = metrics.roc_curve(y_true, y_score, pos_label=None,
                              sample_weight=None, drop_intermediate=False)
        roc_auc[name_list[i]] = metrics.auc(fpr[name_list[i]],
                                            tpr[name_list[i]])
    adata.uns['ROCfpr' + groupby + str(group)] = fpr
    adata.uns['ROCtpr' + groupby + str(group)] = tpr
    adata.uns['ROCthresholds' + groupby + str(group)] = thresholds
    adata.uns['ROC_AUC' + groupby + str(group)] = roc_auc
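# Hedged usage sketch for ROC_AUC_analysis, under the same assumptions as the
# correlation_matrix example above (rank_genes_groups results mirrored into
# adata.uns['rank_genes_groups_gene_names'], integer group index).
ROC_AUC_analysis(adata, groupby='bulk_labels', group=0, n_genes=20)
auc_per_gene = adata.uns['ROC_AUC' + 'bulk_labels' + str(0)]   # dict: gene -> AUC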
import scanpy.api as sc
import scipy.sparse as sp_sparse

# alternatively, read the full expression matrix:
# andata = sc.read_h5ad("./ExprMatrix.h5ad")
andata = sc.read_h5ad("./100_test_data.h5ad")
print("Finished reading.")

andata.var_names_make_unique()
# densify the expression matrix if it is stored sparse
if sp_sparse.issparse(andata.X):
    andata.X = andata.X.toarray()

# keep only the first 100 cells
partial_data = andata[:100, :]
print("Finished processing")

sc.write("100_test_data.h5ad", partial_data)
print("Finished writing.")
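# A memory-friendlier variant of the script above (a sketch, not the original
# workflow): subset the AnnData first and densify only the small slice, so the
# full sparse matrix is never expanded. The output file name is a placeholder.
import scanpy.api as sc
import scipy.sparse as sp_sparse

andata = sc.read_h5ad("./100_test_data.h5ad")
andata.var_names_make_unique()

partial_data = andata[:100, :].copy()          # slice while X is still sparse
if sp_sparse.issparse(partial_data.X):
    partial_data.X = partial_data.X.toarray()  # densify only the subset

sc.write("100_test_data_dense.h5ad", partial_data)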