def predict_loo(self):
    """Predict class labels for the training data via leave-one-out.

    Neighbors are obtained from ``self.kneighbors()`` called without a
    query argument; each output column is then decided by a plain or
    weighted vote over the neighbor labels.

    Returns
    -------
    y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
        Class labels for each training data sample.
    """
    distances, indices = self.kneighbors()

    # Handle the single-output case as a one-column multi-output problem.
    if self.outputs_2d_:
        targets = self._y
        class_lists = self.classes_
    else:
        targets = self._y.reshape((-1, 1))
        class_lists = [self.classes_]

    vote_weights = _get_weights(distances, self.weights)

    predictions = np.empty(
        (len(distances), len(class_lists)), dtype=class_lists[0].dtype
    )
    for col, col_classes in enumerate(class_lists):
        neighbor_labels = targets[indices, col]
        if vote_weights is None:
            winners, _ = stats.mode(neighbor_labels, axis=1)
        else:
            winners, _ = weighted_mode(neighbor_labels, vote_weights, axis=1)

        # The winning entries are used as positions into the class array.
        winner_positions = np.asarray(winners.ravel(), dtype=np.intp)
        predictions[:, col] = col_classes.take(winner_positions)

    if self.outputs_2d_:
        return predictions
    return predictions.ravel()
def predict_proba(self, X):
    """Return probability estimates for the test data X.

    While this method runs, the code object of ``check_array`` (or of the
    function it wraps, when it exposes ``__wrapped__``) is temporarily
    replaced by that of ``_check_array_ts``. The original code object is
    always restored, including when validation or the neighbor search
    raises.

    Parameters
    ----------
    X : sktime-format pandas dataframe or array-like,
        shape (n_query, n_features), or (n_query, n_indexed) if
        metric == 'precomputed'
        Test samples.

    Returns
    -------
    p : array of shape = [n_samples, n_classes], or a list of n_outputs
        of such arrays if n_outputs > 1.
        The class probabilities of the input samples. Classes are ordered
        by lexicographic order.
    """
    self.check_is_fitted()

    # check_array may or may not be a decorated wrapper; patch whichever
    # function object actually carries the validation code.
    patch_target = getattr(check_array, '__wrapped__', check_array)
    saved_code = patch_target.__code__
    patch_target.__code__ = _check_array_ts.__code__
    try:
        X = check_array(X, accept_sparse='csr')

        neigh_dist, neigh_ind = self.kneighbors(X)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_samples = X.shape[0]

        weights = _get_weights(neigh_dist, self.weights)
        if weights is None:
            # Uniform voting: every neighbor contributes one vote.
            weights = np.ones_like(neigh_ind)

        all_rows = np.arange(n_samples)
        probabilities = []
        for k, classes_k in enumerate(classes_):
            pred_labels = _y[:, k][neigh_ind]
            proba_k = np.zeros((n_samples, classes_k.size))

            # Accumulate one neighbor rank at a time: within a single
            # fancy-indexed "+=" repeated targets would not be summed.
            for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
                proba_k[all_rows, idx] += weights[:, i]

            # Normalize 'votes' into real [0, 1] probabilities; rows with
            # no votes are divided by 1 to avoid 0/0.
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            normalizer[normalizer == 0.0] = 1.0
            proba_k /= normalizer

            probabilities.append(proba_k)

        if not self.outputs_2d_:
            probabilities = probabilities[0]
    finally:
        # Undo the global patch even on failure so other callers of
        # check_array are never left with the substituted code.
        patch_target.__code__ = saved_code

    return probabilities
def predict_loo(self):
    """Predict the target for the training data via leave-one-out.

    Each prediction is the mean of the neighbor targets, or their weighted
    mean when ``_get_weights`` returns a weight matrix.

    Returns
    -------
    y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
        Target values.
    """
    distances, indices = self.kneighbors()
    neighbor_weights = _get_weights(distances, self.weights)

    # Work on a 2-D target array; flatten again at the end if needed.
    targets = self._y
    if targets.ndim == 1:
        targets = targets.reshape((-1, 1))

    if neighbor_weights is None:
        result = np.mean(targets[indices], axis=1)
    else:
        n_outputs = targets.shape[1]
        result = np.empty((len(distances), n_outputs), dtype=np.float64)
        weight_totals = np.sum(neighbor_weights, axis=1)
        for col in range(n_outputs):
            weighted_sum = np.sum(
                targets[indices, col] * neighbor_weights, axis=1
            )
            result[:, col] = weighted_sum / weight_totals

    return result.ravel() if self._y.ndim == 1 else result
def _get_kdn(self, knn, y):
    """Return, per sample, the weighted share of neighbors with its label.

    Parameters
    ----------
    knn : object exposing ``kneighbors()``
        Called without arguments; must return ``(distances, indices)``.
    y : ndarray
        Labels, indexed by the neighbor indices returned by ``knn``.

    Returns
    -------
    ndarray
        Weighted average, along axis 1, of the "neighbor label equals own
        label" indicator.
    """
    # Distances to, and indices of, the nearest neighbors of every sample.
    distances, neighbor_ids = knn.kneighbors()

    vote_weights = _get_weights(distances, self.weight)
    if vote_weights is None:
        # No weighting scheme: every neighbor counts equally.
        vote_weights = np.ones_like(neighbor_ids)

    own_labels = y.reshape(-1, 1)
    label_matches = y[neighbor_ids] == own_labels
    return np.average(label_matches, axis=1, weights=vote_weights)
def predict(self, X):
    """Predict labels for X with a distance-weighted K-nearest-neighbor vote.

    Parameters
    ----------
    X : sequence of query samples
        Passed to ``self.pairwise_distance`` together with the stored
        training samples; ``len(X)`` is used to index the distance rows.

    Returns
    -------
    ndarray
        Flattened array holding the weighted-mode label of each query.
    """
    train_X, train_Y = self.data_

    dist = self.pairwise_distance(train_X, X)
    assert np.all(dist >= 0)

    # Keep the self.K smallest distances in every row.
    ranking = np.argsort(dist, axis=1)
    nearest = ranking[:, :self.K]
    query_rows = np.arange(len(X))[:, None]
    nearest_dist = dist[query_rows, nearest]
    nearest_labels = train_Y[nearest]

    # Weighted KNN: votes are weighted with the 'distance' scheme.
    vote_weights = _get_weights(nearest_dist, 'distance')
    winners, _ = weighted_mode(nearest_labels, vote_weights, axis=1)
    return winners.reshape(-1)
def predict(self, X):
    """Predict the class labels for the provided data.

    While this method runs, the code object of ``check_array`` (or of the
    function it wraps, when it exposes ``__wrapped__``) is temporarily
    replaced by that of ``_check_array_ts``. The original code object is
    always restored, including when the neighbor search or the voting
    raises.

    Parameters
    ----------
    X : sktime-format pandas dataframe or array-like,
        shape (n_query, n_features), or (n_query, n_indexed) if
        metric == 'precomputed'
        Test samples.

    Returns
    -------
    y : array of shape [n_samples] or [n_samples, n_outputs]
        Class labels for each data sample.
    """
    self.check_is_fitted()

    # Patch the wrapped function when check_array is a decorated wrapper,
    # otherwise patch check_array itself.
    patch_target = getattr(check_array, '__wrapped__', check_array)
    saved_code = patch_target.__code__
    patch_target.__code__ = _check_array_ts.__code__
    try:
        neigh_dist, neigh_ind = self.kneighbors(X)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            # Treat single-output data as a one-column multi-output case.
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_outputs = len(classes_)
        n_samples = X.shape[0]
        weights = _get_weights(neigh_dist, self.weights)

        y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
        for k, classes_k in enumerate(classes_):
            if weights is None:
                mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
            else:
                mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)

            # The winning entries are used as positions into classes_k.
            mode = np.asarray(mode.ravel(), dtype=np.intp)
            y_pred[:, k] = classes_k.take(mode)

        if not self.outputs_2d_:
            y_pred = y_pred.ravel()
    finally:
        # Undo the global patch even on failure so other callers of
        # check_array are never left with the substituted code.
        patch_target.__code__ = saved_code

    return y_pred
def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col, col):
    """Impute a single column from the nearest potential donors.

    Parameters
    ----------
    dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)
        Distance matrix between the receivers and potential donors from
        the training set. There must be at least one non-nan distance
        between a receiver and a potential donor.

    n_neighbors : int
        Number of neighbors to consider.

    fit_X_col : ndarray of shape (n_potential_donors,)
        Column of potential donors from the training set.

    mask_fit_X_col : ndarray of shape (n_potential_donors,)
        Missing mask for fit_X_col.

    col : int
        Index of the column being imputed (an addition relative to the
        original version of this helper). Used to look up ``self.ncat``.

    Returns
    -------
    imputed_values : ndarray of shape (n_receivers,)
        Imputed values for the receivers.
    """
    # Positions of the n_neighbors closest donors for every receiver.
    partitioned = np.argpartition(dist_pot_donors, n_neighbors - 1, axis=1)
    donors_idx = partitioned[:, :n_neighbors]

    # Derive the weight matrix from the distances to the chosen donors.
    receiver_rows = np.arange(donors_idx.shape[0])[:, None]
    donors_dist = dist_pot_donors[receiver_rows, donors_idx]
    weight_matrix = _get_weights(donors_dist, self.weights)

    # NaN weights are replaced by zero.
    if weight_matrix is not None:
        weight_matrix[np.isnan(weight_matrix)] = 0.0

    # Donor values, their missing mask, and the masked view for averaging.
    donors = fit_X_col.take(donors_idx)
    mask = mask_fit_X_col.take(donors_idx)
    donors_mask = np.ma.array(donors, mask=mask)

    # Columns with self.ncat[col] > 1 under the 'nan_cat_euclidean' metric
    # are imputed with the mode of the donors instead of their average.
    use_mode = (
        self.metric == 'nan_cat_euclidean'
        and self.ncat is not None
        and self.ncat[col] > 1
    )
    if use_mode:
        donors[mask] = np.nan
        return mode(donors, axis=1).mode.ravel()

    return np.ma.average(donors_mask, axis=1, weights=weight_matrix).data
def _impute(self, dist, X, fitted_X, mask, mask_fx):
    """Find and impute missing values of X, column by column.

    X is modified in place and also returned. For each column that has
    masked entries, receivers are the rows flagged in ``mask`` and
    potential donors are the rows not flagged in ``mask_fx``. When fewer
    than ``self.n_neighbors`` potential donors exist, a warning is issued
    and the entries are filled with ``self.statistics_`` for that column.
    """
    _, n_columns = X.shape

    for col in range(n_columns):
        # Nothing to impute in this column.
        if not np.any(mask[:, col], axis=0):
            continue

        # Row indices of receivers and of potential donors.
        receiver_rows = np.where(mask[:, col])[0]
        pdonor_rows = np.where(~mask_fx[:, col])[0]
        n_pdonors = len(pdonor_rows)

        # Too few donors: fall back to the stored column statistic.
        if n_pdonors < self.n_neighbors:
            warnings.warn("Insufficient number of neighbors! "
                          "Filling in column mean.")
            X[receiver_rows, col] = self.statistics_[col]
            continue

        # Distances from each receiver to each potential donor.
        pdonor_dist = dist[receiver_rows][:, pdonor_rows]
        pdonor_dist = pdonor_dist.reshape(-1, n_pdonors)

        # Partition so the n_neighbors closest donors come first.
        partitioned = np.argpartition(
            pdonor_dist, self.n_neighbors - 1, axis=1)
        donors_idx = partitioned[:, :self.n_neighbors]

        # Weights (or None) from the distances to the chosen donors.
        row_selector = np.arange(len(donors_idx))[:, None]
        chosen_dist = pdonor_dist[row_selector, donors_idx]
        weight_matrix = _get_weights(chosen_dist, self.weights)

        # Donor values for this column, one row per receiver.
        pdonor_values = fitted_X[pdonor_rows]
        donor_values = pdonor_values[donors_idx.ravel(), col].reshape(
            (-1, self.n_neighbors))
        donor_missing = _get_mask(donor_values, self.missing_values)
        masked_donors = np.ma.array(donor_values, mask=donor_missing)

        # Final imputation: (weighted) average over the donors.
        imputed = np.ma.average(masked_donors, axis=1, weights=weight_matrix)
        X[receiver_rows, col] = imputed.data

    return X
def predict_proba_loo(self):
    """Return probability estimates for the training data via leave-one-out.

    Neighbors are obtained from ``self.kneighbors()`` called without a
    query argument; neighbor votes are accumulated per class and then
    normalized row-wise.

    Returns
    -------
    p : ndarray of shape (n_queries, n_classes), or a list of n_outputs
        of such arrays if n_outputs > 1.
        The class probabilities of the training data samples. Classes are
        ordered by lexicographic order.
    """
    distances, indices = self.kneighbors()

    # Handle the single-output case as a one-column multi-output problem.
    if self.outputs_2d_:
        targets = self._y
        class_lists = self.classes_
    else:
        targets = self._y.reshape((-1, 1))
        class_lists = [self.classes_]

    n_queries = len(distances)

    vote_weights = _get_weights(distances, self.weights)
    if vote_weights is None:
        # Uniform voting: every neighbor contributes one vote.
        vote_weights = np.ones_like(indices)

    row_ids = np.arange(n_queries)
    results = []
    for col, col_classes in enumerate(class_lists):
        neighbor_labels = targets[:, col][indices]
        votes = np.zeros((n_queries, col_classes.size))

        # Add one neighbor rank at a time; a single fancy-indexed update
        # over all neighbors would not accumulate repeated classes.
        for rank, label_positions in enumerate(neighbor_labels.T):
            votes[row_ids, label_positions] += vote_weights[:, rank]

        # Turn votes into [0, 1] probabilities; rows without any vote are
        # divided by 1 instead of 0.
        row_totals = votes.sum(axis=1)[:, np.newaxis]
        row_totals[row_totals == 0.0] = 1.0
        votes /= row_totals

        results.append(votes)

    if self.outputs_2d_:
        return results
    return results[0]