def _daal_dbscan(X, eps=0.5, min_samples=5, sample_weight=None):
    if eps <= 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, dtype=[np.float64, np.float32])
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)
        ww = make2d(sample_weight)
    else:
        ww = None
    XX = make2d(X)

    fpt = getFPType(XX)
    alg = daal4py.dbscan(
        method='defaultDense',
        fptype=fpt,
        epsilon=float(eps),
        minObservations=int(min_samples),
        memorySavingMode=False,
        resultsToCompute="computeCoreIndices")

    daal_res = alg.compute(XX, ww)
    n_clusters = daal_res.nClusters[0, 0]
    assignments = daal_res.assignments.ravel()
    if daal_res.coreIndices is not None:
        core_ind = daal_res.coreIndices.ravel()
    else:
        core_ind = np.array([], dtype=np.intc)

    return (core_ind, assignments)
def evaluate_print(clf_name, y, y_pred):
    """Utility function for evaluating and printing the results for examples.
    Default metrics include ROC and Precision @ n

    Parameters
    ----------
    clf_name : str
        The name of the detector.

    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.
    """
    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    print('{clf_name} ROC:{roc}, precision @ rank n:{prn}'.format(
        clf_name=clf_name,
        roc=np.round(roc_auc_score(y, y_pred), decimals=4),
        prn=np.round(precision_n_scores(y, y_pred), decimals=4)))
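# Minimal usage sketch for evaluate_print above, assuming its dependencies
# (column_or_1d, roc_auc_score, precision_n_scores, numpy as np) are in scope.
# The scores are made-up illustrative values: the two positives get the two
# highest scores, so both metrics come out as 1.0.
y_true = [0, 0, 0, 1, 1]
scores = [0.2, 0.1, 0.3, 0.9, 0.4]
evaluate_print('demo-detector', y_true, scores)
# demo-detector ROC:1.0, precision @ rank n:1.0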
def _check_x_y(x, y):
    x = check_array(x, ensure_2d=False, force_all_finite=True)
    y = check_array(y, ensure_2d=False, force_all_finite=True)
    check_consistent_length(x, y)
    return x, y
def evaluate_print(clf_name, y, y_pred):
    """Utility function for evaluating and printing the results for examples.
    Default metrics include accuracy, roc, and F1 score

    Parameters
    ----------
    clf_name : str
        The name of the estimator.

    y : list or numpy array of shape (n_samples,)
        The ground truth.

    y_pred : list or numpy array of shape (n_samples,)
        The raw scores as returned by a fitted model.
    """
    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    print('{clf_name} Accuracy:{acc}, ROC:{roc}, F1:{f1}'.format(
        clf_name=clf_name,
        acc=np.round(accuracy_score(y, y_pred), decimals=4),
        roc=np.round(roc_auc_score(y, y_pred), decimals=4),
        f1=np.round(f1_score(y, y_pred), decimals=4)))
def _init_fit(self, X, y, lipschitz):
    """Initialise model and check inputs."""
    self.random_state_ = check_random_state(self.random_state)

    check_consistent_length(X, y)
    X, X_means, y, lipschitz = self._prepare_dataset(X, y, lipschitz)
    self.subsampler_ = Subsampler(
        X.shape[0], self.subsampling_scheme, self.random_state_)

    groups = self.groups
    if groups is None:
        groups = np.arange(X.shape[1] - 1)
    self.group_ids_ = np.array(_parse_group_iterable(groups))
    self.groups_ = [
        self.group_ids_ == u for u in np.unique(self.group_ids_) if u >= 0
    ]
    self.group_reg_vector_ = self._get_reg_vector(self.group_reg)

    self.losses_ = []

    if not self.warm_start or not hasattr(self, "coef_"):
        self.coef_ = np.zeros((X.shape[1] - 1, y.shape[1]))
        self.intercept_ = np.zeros((1, self.coef_.shape[1]))

    self._check_valid_parameters()
    self.X_aug_, self.y_, self.lipschitz_ = X, y, lipschitz
    self._X_means_ = X_means
    if not self.old_regularisation and not self.supress_warning:
        warnings.warn(_OLD_REG_WARNING)
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train the detector clf on x_train

    # Labels and raw outlier scores on the training data x_train
    y_train_pred = clf.labels_  # binary labels on training data (0: inlier, 1: outlier)
    y_train_scores = clf.decision_scores_  # raw outlier scores (larger means more abnormal)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # Use the fitted clf to predict outliers in unseen data
    y_test_pred = clf.predict(x_test)  # binary labels on test data (0: inlier, 1: outlier)
    y_test_scores = clf.decision_function(x_test)  # raw outlier scores (larger means more abnormal)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    # no trailing comma here: it would silently turn roc into a 1-tuple
    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
def score_predictor_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance

    * mean absolute error
    * root mean squared error
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """
    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {
        'mean absolute error': skm.mean_absolute_error(y_true, y_pred),
        'root mean squared error':
            np.sqrt(np.maximum(skm.mean_squared_error(y_true, y_pred), 0.)),
        'n_samples': y_true.size,
        'true': {'mean': np.mean(y_true), 'stdev': np.std(y_true)},
        'predicted': {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}}

    # display statistics
    if disp:
        print(json.dumps(
            stats, sort_keys=True, indent=4, separators=(',', ': '),
            ensure_ascii=False), file=sys.stderr)

    return stats
def _preprocess_data_for_fit(self, X, Y, copy):
    """Check if data conforms to expectations and possibly center-scale it.

    Parameters
    ----------
    X : np.ndarray (n_samples, n_X_features)
        data matrix X
    Y : np.ndarray (n_samples, n_Y_features)
        data matrix Y
    copy : bool
        whether a copy of the data is returned

    Returns
    -------
    prepared_X : np.ndarray (n_samples, n_X_features)
        data matrix X
    prepared_Y : np.ndarray (n_samples, n_Y_features)
        data matrix Y
    """
    # pass the arrays as separate arguments; wrapping them in a single list
    # would only measure the list itself and never detect a mismatch
    check_consistent_length(X, Y)
    if Y.ndim == 1:
        Y = Y.reshape(-1, 1)
    X = check_array(X, dtype=np.float64, copy=copy, ensure_min_samples=2)
    Y = check_array(Y, dtype=np.float64, copy=copy, ensure_min_samples=2)

    # Scale (in place)
    X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = \
        _center_scale_xy(X, Y, scale=self.scale, ddof=self.std_ddof)
    return X, Y
def mean_absolute_error(y_true, y_pred):
    """
    Mean absolute error and its standard deviation.

    If you need only the mean absolute error, use
    :func:`sklearn.metrics.mean_absolute_error`

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores

    Returns
    -------
    mean : float
        mean of absolute errors
    stdev : float
        standard deviation of absolute errors
    """
    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calculate errors
    errs = np.abs(y_true - y_pred)
    mean = np.nanmean(errs)
    stdev = np.nanstd(errs)

    return mean, stdev
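# Worked example for mean_absolute_error above (illustrative values, assuming
# its sklearn-utils dependencies are in scope):
# errs = |[1, 2, 3, 4] - [1, 2, 4, 8]| = [0, 0, 1, 4]
# mean = 1.25
# stdev = sqrt(((0-1.25)^2 + (0-1.25)^2 + (1-1.25)^2 + (4-1.25)^2) / 4) ~ 1.6394
mean, stdev = mean_absolute_error([1, 2, 3, 4], [1, 2, 4, 8])
print(mean, stdev)  # 1.25 1.639...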
def obs_fuzziness(y_true, p_values):
    """**Classification** - Calculate the Observed Fuzziness (OF)

    Significance independent metric, smaller is better

    Parameters
    ----------
    y_true : 1D numpy array, list or pandas Series
        True labels

    p_values : 2D numpy array or DataFrame
        The predicted p-values, first column for class 0, second for class 1, ..

    Returns
    -------
    obs_fuzz : float
        Observed fuzziness
    """
    p_values = to_numpy2D(p_values, 'p_values')
    y_true = to_numpy1D_int(y_true, 'y_true')
    check_consistent_length(y_true, p_values)

    of_sum = 0
    for i in range(p_values.shape[0]):
        # Mask the p-value of the true label
        p_vals_masked = np.ma.array(p_values[i, :], mask=False)
        p_vals_masked.mask[y_true[i]] = True
        # Sum the remaining p-values
        of_sum += p_vals_masked.sum()

    return of_sum / len(y_true)
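# Worked example for obs_fuzziness above, assuming the conversion helpers
# to_numpy2D / to_numpy1D_int accept plain numpy input. Row 0 (true label 1)
# contributes its off-label p-value 0.1, row 1 (true label 0) contributes 0.2,
# so OF = (0.1 + 0.2) / 2 = 0.15.
import numpy as np
p_vals = np.array([[0.1, 0.8],
                   [0.6, 0.2]])
print(obs_fuzziness(np.array([1, 0]), p_vals))  # 0.15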
def _check_consistent_input(self, y_true, y_pred, multioutput):
    check_consistent_length(y_true, y_pred)

    y_true = check_array(y_true, ensure_2d=False)

    if not isinstance(y_pred, pd.DataFrame):
        raise ValueError("y_pred should be a dataframe.")

    if not all(y_pred.dtypes == float):
        raise ValueError("Data should be numeric.")

    if y_true.ndim == 1:
        y_true = y_true.reshape((-1, 1))

    n_outputs = y_true.shape[1]

    allowed_multioutput_str = ("raw_values", "uniform_average",
                               "variance_weighted")
    if isinstance(multioutput, str):
        if multioutput not in allowed_multioutput_str:
            raise ValueError("Allowed 'multioutput' string values are {}. "
                             "You provided multioutput={!r}".format(
                                 allowed_multioutput_str, multioutput))
    elif multioutput is not None:
        multioutput = check_array(multioutput, ensure_2d=False)
        if n_outputs == 1:
            raise ValueError(
                "Custom weights are useful only in multi-output case.")
        elif n_outputs != len(multioutput):
            raise ValueError(
                "There must be equally many custom weights (%d) as outputs"
                " (%d)." % (len(multioutput), n_outputs))

    return y_true, y_pred, multioutput
def check_arrays_survival(X, y, force_all_finite=True):
    """Check that all arrays have consistent first dimensions.

    Parameters
    ----------
    X : array-like
        Data matrix containing feature vectors.

    y : structured array with two fields
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    force_all_finite : boolean (default=True)
        Whether to raise an error on np.inf and np.nan in X.

    Returns
    -------
    X : array, shape=[n_samples, n_features]
        Feature vectors.

    event : array, shape=[n_samples,], dtype=bool
        Binary event indicator.

    time : array, shape=[n_samples,], dtype=float
        Time of event or censoring.
    """
    event, time = check_y_survival(y)
    X = check_array(X, dtype=float, ensure_min_samples=2,
                    force_all_finite=force_all_finite)
    check_consistent_length(X, event, time)
    return X, event, time
def _check_targets(y_true, y_pred):
    check_consistent_length(y_true, y_pred)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    y_type = {type_true, type_pred}
    if y_type == {"binary", "multiclass"}:
        y_type = {"multiclass"}

    if len(y_type) > 1:
        raise ValueError("Classification metrics can't handle a mix of {0} "
                         "and {1} targets".format(type_true, type_pred))

    # We can't have more than one value on y_type => The set is no more needed
    y_type = y_type.pop()

    # No metrics support "multiclass-multioutput" format
    if y_type not in ["binary", "multiclass", "multilabel-indicator"]:
        raise ValueError("{0} is not supported".format(y_type))

    if y_type in ["binary", "multiclass"]:
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)
        if y_type == "binary":
            unique_values = np.union1d(y_true, y_pred)
            if len(unique_values) > 2:
                y_type = "multiclass"

    if y_type.startswith('multilabel'):
        y_true = csr_matrix(y_true)
        y_pred = csr_matrix(y_pred)
        y_type = 'multilabel-indicator'

    return y_type, y_true, y_pred
def check_consistent_shape(X_train, y_train, X_test, y_test,
                           y_train_pred, y_test_pred):
    """Internal function to check that the input data shapes are consistent.

    Parameters
    ----------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : numpy array of shape (n_samples, n_features)
        The predicted binary labels of the training samples.

    y_test_pred : numpy array of shape (n_samples, n_features)
        The predicted binary labels of the test samples.

    Returns
    -------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : numpy array of shape (n_samples, n_features)
        The predicted binary labels of the training samples.

    y_test_pred : numpy array of shape (n_samples, n_features)
        The predicted binary labels of the test samples.
    """
    # check input data shapes are consistent
    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)

    y_test_pred = column_or_1d(y_test_pred)
    y_train_pred = column_or_1d(y_train_pred)

    check_consistent_length(y_train, y_train_pred)
    check_consistent_length(y_test, y_test_pred)

    if X_train.shape[1] != X_test.shape[1]:
        raise ValueError("X_train {0} and X_test {1} have different number "
                         "of features.".format(X_train.shape, X_test.shape))

    return X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
def _validate_X_y_sample_weight(self, X, y, sample_weight):
    """Validate if X, y and sample_weight are numeric and of equal length.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input samples used to fit the classifier.

    y : array-like of shape (n_samples)
        Labels of the input samples 'X'. There may be missing labels.

    sample_weight : array-like of shape (n_samples,) (default=None)
        Sample weights for X, used to fit the clf.

    Returns
    -------
    X : array-like of shape (n_samples, n_features)
        Checked input samples.

    y : array-like of shape (n_samples)
        Checked labels of the input samples 'X', converted to a numpy array.

    sample_weight : array-like of shape (n_samples,)
        Checked sample weights, converted to a numpy array if provided.
    """
    if sample_weight is not None:
        sample_weight = np.array(sample_weight)
        check_consistent_length(sample_weight, y)
    if X is not None and y is not None:
        X = check_array(X)
        y = np.array(y)
        check_consistent_length(X, y)
    return X, y, sample_weight
def fit(self, X, y):
    """ Fit model with specified loss.

    Parameters
    ----------
    X : scipy.sparse.csc_matrix, (n_samples, n_features)

    y : float | ndarray, shape = (n_samples, )
        the targets have to be encoded as {-1, 1}.
    """
    y = _validate_class_labels(y)

    self.classes_ = np.unique(y)
    if len(self.classes_) != 2:
        raise ValueError("This solver only supports binary classification"
                         " but the data contains"
                         " class: %r" % self.classes_)

    # fastFM-core expects labels to be in {-1,1}
    y_train = y.copy()
    i_class1 = (y_train == self.classes_[0])
    y_train[i_class1] = -1
    y_train[~i_class1] = 1  # `~`, not unary `-`, which fails on bool arrays

    check_consistent_length(X, y)
    y_train = y_train.astype(np.float64)

    X = X.T
    X = check_array(X, accept_sparse="csc", dtype=np.float64)

    # fit using the converted {-1, 1} labels
    self.w0_, self.w_, self.V_ = ffm.ffm_sgd_fit(self, X, y_train)
    return self
def fit(self, X, y, strata):
    """
    Args:
        X: numpy matrix of predictors.
        y: numpy array.
        strata: numpy array of strata. It's sensible to envision the need
            of a multi-column strata matrix, but there is no such need for
            now.
    """
    X, y = check_X_y(X, y, ensure_2d=True, copy=False, y_numeric=True,
                     multi_output=False)
    strata = check_array(strata, ensure_2d=False)
    check_consistent_length(y, strata)

    uniq = np.unique(strata)
    models = {}
    residues = 0.0
    for key in uniq:
        model = self.model_class(**self.model_kwargs)
        idx = np.where(strata == key)[0]
        model.fit(X[idx, :], y[idx])
        models[key] = model
        residues += model._residues

    self.models_ = models
    self._residues = residues
    self._n_obs = X.shape[0]
    return self
def check_inputs(X, y, sample_weight=None, ensure_2d=True):
    """Input validation for debiasing algorithms.

    Checks all inputs for consistent length, validates shapes (optional for
    X), and returns an array of all ones if sample_weight is ``None``.

    Args:
        X (array-like): Input data.
        y (array-like, shape = (n_samples,)): Target values.
        sample_weight (array-like, optional): Sample weights.
        ensure_2d (bool, optional): Whether to raise a ValueError if X is
            not 2D.

    Returns:
        tuple:

            * **X** (`array-like`) -- Validated X. Unchanged.

            * **y** (`array-like`) -- Validated y. Possibly converted to 1D
              if not a :class:`pandas.Series`.

            * **sample_weight** (`array-like`) -- Validated sample_weight.
              If no sample_weight is provided, returns a consistent-length
              array of ones.
    """
    if ensure_2d and X.ndim != 2:
        raise ValueError("Expected X to be 2D, got ndim == {} instead.".format(
            X.ndim))
    if not isinstance(y, pd.Series):  # don't cast Series -> ndarray
        y = column_or_1d(y)
    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
    else:
        sample_weight = np.ones(X.shape[0])
    check_consistent_length(X, y, sample_weight)
    return X, y, sample_weight
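# Minimal usage sketch for check_inputs above (illustrative data, assuming
# pandas as pd, numpy as np and column_or_1d are in scope). When
# sample_weight is None, a vector of ones with one entry per row of X is
# returned.
import numpy as np
X = np.array([[0., 1.], [2., 3.], [4., 5.]])
X, y, sw = check_inputs(X, [0, 1, 0])
print(sw)  # [1. 1. 1.]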
def _binary_clf_curve(y_true, y_score):
    check_consistent_length(y_true, y_score, None)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    # make y_true a boolean vector
    y_true = (y_true == 1)

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true)[threshold_idxs]
    fps = 1 + threshold_idxs - tps

    return fps, tps, y_score[threshold_idxs]
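# Worked example for _binary_clf_curve above (toy scores, assuming
# stable_cumsum and the other sklearn utils are in scope). After sorting by
# score descending, thresholds fall at the distinct score values 0.9, 0.8,
# 0.3; the cumulative positives give tps = [1, 2, 3] and fps = [0, 1, 1].
import numpy as np
fps, tps, thresholds = _binary_clf_curve(
    np.array([1, 0, 1, 1]), np.array([0.9, 0.8, 0.8, 0.3]))
print(fps, tps, thresholds)  # [0 1 1] [1 2 3] [0.9 0.8 0.3]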
def _prepare_dataset(self, X, y):
    check_consistent_length(X, y)
    check_array(X)
    check_array(y)
    if len(y.shape) == 1:
        y = y.reshape(-1, 1)
    return X, y
def _check_data(X, y, solver=None):
    if solver == 'sag':
        X = check_array(X, accept_sparse=['csr'], dtype=np.float64,
                        order='C')
        y = check_array(y, dtype=np.float64, ensure_2d=False, order='F')
    else:
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                        dtype=np.float64)
        y = check_array(y, dtype='numeric', ensure_2d=False)
    check_consistent_length(X, y)

    n_samples, n_features = X.shape
    if y.ndim > 2:
        raise ValueError("Target y has the wrong shape %s" % str(y.shape))

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape
    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))

    return X, y, n_samples, n_features, n_targets, ravel
def fit(self, X, y, trt, n_trt=None):
    X, y = check_X_y(X, y, accept_sparse="csr")
    self.trt_, self.n_trt_ = check_trt(trt, n_trt)
    check_consistent_length(X, y, self.trt_)

    self.n_models_ = self.n_trt_ + 1
    self.models_ = self._check_base_estimator(self.n_models_)

    self.n_ = np.empty(self.n_models_, dtype=int)
    self.p = X.shape[1]
    self.sigma = np.empty(self.n_models_)

    for i in range(self.n_models_):
        mi = self.models_[i][1]
        ind = (trt == i)
        self.n_[i] = ind.sum()
        Xi = X[ind]
        yi = y[ind]
        Sigma = np.eye(self.p)
        M = GaussianMatrix(Xi, yi, 3 * Sigma)
        # mi.fit(Xi - M[0], yi - M[1])
        mi.fit(Xi, yi)
        mi.coef_ = np.linalg.inv(
            (Xi - M[0]).T @ (Xi - M[0])) @ (Xi - M[0]).T @ (yi - M[1])
        # mi.intercept_ = 0
        # g = GaussianMatrix2(Xi, yi, 3 * Sigma, mi.coef_)
        # mi.fit(Xi, yi - g)
    return self
def _prepare_dataset(self, X, y, lipschitz):
    """Ensure that the inputs are valid and prepare them for fit."""
    self.label_binarizer_ = LabelBinarizer()
    self.label_binarizer_.fit(y)
    y = self._encode(y)
    check_consistent_length(X, y)
    X = check_array(X, accept_sparse="csr")
    check_array(y, ensure_2d=False)
    if set(np.unique(y)) != {0, 1}:
        raise ValueError(
            "The target array must either be a 2D dummy encoded (binary) "
            "array or a 1D array with class labels as array elements.")

    # Add the intercept column and compute Lipschitz bound the correct way
    if self.fit_intercept:
        X = _add_intercept_col(X)
        X = check_array(X, accept_sparse="csr")

    if lipschitz is None:
        lipschitz = self._compute_lipschitz(X, y)

    if not self.fit_intercept:
        X = _add_intercept_col(X)
        X = check_array(X, accept_sparse="csr")

    return X, y, lipschitz
def __call__(self, y_true, y_pred, eps=1e-15, normalize=True,
             sample_weight=None):
    if self.lb_ is None:
        self.lb_ = LabelBinarizer()
        T = self.lb_.fit_transform(y_true)
    else:
        T = self.lb_.transform(y_true)

    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    Y = np.clip(y_pred, eps, 1 - eps)

    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    check_consistent_length(T, Y)
    T = check_array(T)
    Y = check_array(Y)
    if T.shape[1] != Y.shape[1]:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (T.shape[1], Y.shape[1]))

    Y /= Y.sum(axis=1)[:, np.newaxis]
    loss = -(T * np.log(Y)).sum(axis=1)

    return _weighted_sum(loss, sample_weight, normalize)
def fit(self, X_train, y_train, n_more_iter=0):
    """ Fit model with specified loss.

    Parameters
    ----------
    X_train : scipy.sparse.csc_matrix, (n_samples, n_features)

    y_train : float | ndarray, shape = (n_samples, )

    n_more_iter : int
        Number of iterations to continue from the current coefficients.
    """
    check_consistent_length(X_train, y_train)
    y_train = check_array(y_train, ensure_2d=False, dtype=np.float64)

    X_train = check_array(X_train, accept_sparse="csc", dtype=np.float64,
                          order="F")
    self.n_iter = self.n_iter + n_more_iter
    if n_more_iter > 0:
        _check_warm_start(self, X_train)
        self.warm_start = True

    self.w0_, self.w_, self.V_ = ffm.ffm_als_fit(self, X_train, y_train)

    if self.iter_count != 0:
        self.iter_count = self.iter_count + n_more_iter
    else:
        self.iter_count = self.n_iter

    # reset to default setting
    self.warm_start = False
    return self
def check_arrays_survival(X, y, **kwargs):
    """Check that all arrays have consistent first dimensions.

    Parameters
    ----------
    X : array-like
        Data matrix containing feature vectors.

    y : structured array with two fields
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    kwargs : dict
        Additional arguments passed to :func:`sklearn.utils.check_array`.

    Returns
    -------
    X : array, shape=[n_samples, n_features]
        Feature vectors.

    event : array, shape=[n_samples,], dtype=bool
        Binary event indicator.

    time : array, shape=[n_samples,], dtype=float
        Time of event or censoring.
    """
    event, time = check_y_survival(y)
    kwargs.setdefault("dtype", numpy.float64)
    X = check_array(X, ensure_min_samples=2, **kwargs)
    check_consistent_length(X, event, time)
    return X, event, time
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)

    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    # Clipping
    Y = np.clip(y_pred, eps, 1 - eps)

    # This happens in cases when elements in y_pred have type "str".
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # If y_pred is of single dimension, assume y_true to be binary
    # and then check.
    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Check if dimensions are consistent.
    check_consistent_length(T, Y)
    T = check_array(T)
    Y = check_array(Y)
    if T.shape[1] != Y.shape[1]:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (T.shape[1], Y.shape[1]))

    # Renormalize
    Y /= Y.sum(axis=1)[:, np.newaxis]
    loss = -(T * np.log(Y)).sum(axis=1)

    return loss
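# Worked example for the log_loss variant above: unlike sklearn's log_loss,
# it returns per-sample losses rather than an aggregate. With y_true = [0, 1]
# and 1D probabilities for the positive class, the probability matrix becomes
# [[0.9, 0.1], [0.2, 0.8]], so losses = [-log(0.9), -log(0.8)]
# ~ [0.1054, 0.2231].
import numpy as np
print(log_loss(np.array([0, 1]), np.array([0.1, 0.8])))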
def _check_X_y(X, y):
    y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
    X = check_array(X, accept_sparse=['csr', 'csc'], dtype=None)
    y = check_array(y, accept_sparse=['csr', 'csc'], dtype=None,
                    ensure_2d=False)
    check_consistent_length(X, y)
    return X, y, binarize_y
def _process_graphs(graphs, inner_hier_labels, outer_hier_labels,
                    transform, sort_nodes):
    """Handles transformation and sorting of graphs for plotting"""
    for g in graphs:
        check_consistent_length(g, inner_hier_labels, outer_hier_labels)

    graphs = [_transform(arr, transform) for arr in graphs]

    if inner_hier_labels is not None:
        inner_hier_labels = np.array(inner_hier_labels)
        if outer_hier_labels is None:
            outer_hier_labels = np.ones_like(inner_hier_labels)
        else:
            outer_hier_labels = np.array(outer_hier_labels)
    else:
        inner_hier_labels = np.ones(graphs[0].shape[0], dtype=int)
        outer_hier_labels = np.ones_like(inner_hier_labels)

    graphs = [
        _sort_graph(arr, inner_hier_labels, outer_hier_labels, sort_nodes)
        for arr in graphs
    ]
    return graphs
def _check_X_y(self, X, y):
    if hasattr(X, "loc"):
        # store information to build dataframe
        self._X_columns = X.columns
        self._X_dtypes = X.dtypes
    else:
        self._X_columns = None
        self._X_dtypes = None

    if hasattr(y, "loc"):
        # store information to build a series
        self._y_name = y.name
        self._y_dtype = y.dtype
    else:
        self._y_name = None
        self._y_dtype = None

    y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
    X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
                    force_all_finite=False)
    y = check_array(y, accept_sparse=["csr", "csc"], dtype=None,
                    ensure_2d=False)
    check_consistent_length(X, y)
    return X, y, binarize_y
def nonnegative_regression(X, y, sample_weight=None):
    r"""Solve the nonnegative least squares estimate regression problem.

    Solves :math:`\underset{x}{\text{argmin}} \| Ax - b \|_2^2`
    subject to :math:`x \geq 0`
    using `scipy.optimize.nnls <https://docs.scipy.org/doc/scipy/reference/
    generated/scipy.optimize.nnls.html>`_

    Parameters
    ----------
    X : array, shape = (n_samples, n_features)
        Training data.

    y : array, shape = (n_samples,) or (n_samples, n_targets)
        Target values.

    sample_weight : float or array-like, shape (n_samples,), optional (default = None)
        Individual weights for each sample.

    Returns
    -------
    coef : array, shape = (n_features,) or (n_samples, n_features)
        Weight vector(s).

    res : float
        The residual, :math:`\| Ax - y \|_2`.
    """
    # TODO: accept_sparse=['csr', 'csc', 'coo']? check sopt.nnls
    # TODO: order='F'?
    X = check_array(X)
    y = check_array(y, ensure_2d=False)
    check_consistent_length(X, y)

    n_samples, n_features = X.shape

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape

    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))

    has_sw = sample_weight is not None
    if has_sw:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")
        X, y = _rescale_data(X, y, sample_weight)

    coef, res = _solve_nnls(X, y)

    if ravel:
        # When y was passed as 1d-array, we flatten the coefficients
        coef = coef.ravel()

    return coef, res
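# The docstring above points at scipy.optimize.nnls; a quick standalone check
# of the underlying solver on a tiny system (illustrative values). x = [1, 2]
# satisfies all three equations exactly, so the residual norm is (close to) 0.
import numpy as np
from scipy.optimize import nnls
A = np.array([[1., 0.], [0., 1.], [1., 1.]])
b = np.array([1., 2., 3.])
coef, res = nnls(A, b)
print(coef, res)  # approximately [1. 2.] 0.0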
def _check_targets(y_true, y_pred):
    """Check that y_true and y_pred belong to the same classification task

    This converts multiclass or binary types to a common shape, and raises a
    ValueError for a mix of multilabel and multiclass targets, a mix of
    multilabel formats, for the presence of continuous-valued or multioutput
    targets, or for targets of different lengths.

    Column vectors are squeezed to 1d, while multilabel formats are returned
    as CSR sparse label indicators.

    Parameters
    ----------
    y_true : array-like

    y_pred : array-like

    Returns
    -------
    type_true : one of {'multilabel-indicator', 'multiclass', 'binary'}
        The type of the true target data, as output by
        ``utils.multiclass.type_of_target``

    y_true : array or indicator matrix

    y_pred : array or indicator matrix
    """
    check_consistent_length(y_true, y_pred)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    y_type = {type_true, type_pred}
    if y_type == {"binary", "multiclass"}:
        y_type = {"multiclass"}

    if len(y_type) > 1:
        raise ValueError("Classification metrics can't handle a mix of {0} "
                         "and {1} targets".format(type_true, type_pred))

    # We can't have more than one value on y_type => The set is no more needed
    y_type = y_type.pop()

    # No metrics support "multiclass-multioutput" format
    if y_type not in ["binary", "multiclass", "multilabel-indicator"]:
        raise ValueError("{0} is not supported".format(y_type))

    if y_type in ["binary", "multiclass"]:
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)
        if y_type == "binary":
            unique_values = np.union1d(y_true, y_pred)
            if len(unique_values) > 2:
                y_type = "multiclass"

    if y_type.startswith('multilabel'):
        y_true = csr_matrix(y_true)
        y_pred = csr_matrix(y_pred)
        y_type = 'multilabel-indicator'

    return y_type, y_true, y_pred
def _evaluate_pointwise_score(y_true, y_pred, score_func):
    pointwise_scores = score_func(y_true, y_pred)
    check_consistent_length(pointwise_scores, y_true)
    mean_score = np.mean(pointwise_scores)
    n = pointwise_scores.shape[0]
    stderr = np.std(pointwise_scores) / np.sqrt(n - 1)
    return mean_score, stderr
def _dump_df_excel(obj, file, **kwargs):
    '''dump df to excel

    obj: 2d array-like data
    file: str or file obj
    '''
    writer = pd.ExcelWriter(file)
    obj = get_flat_list(obj)
    sheet_name = kwargs.get('sheet_name')
    if sheet_name is None:
        sheet_name = ['sheet' + str(i + 1) for i in range(len(obj))]
    else:
        sheet_name = get_flat_list(sheet_name)
    check_consistent_length(obj, sheet_name)

    for data, name in zip(obj, sheet_name):
        try:
            data = pd.DataFrame(data)
            kw = get_kwargs(data.to_excel, **kwargs)
            kw.update({
                'sheet_name': name,
                'index': kwargs.get('index', False)
            })
            data.to_excel(writer, **kw)
        except Exception as e:
            print(repr(e))
            continue

    writer.save()
def _check_targets_hmc(y_true, y_pred):
    check_consistent_length(y_true, y_pred)
    y_type = {type_of_target(y_true), type_of_target(y_pred)}
    if y_type == {"binary", "multiclass"}:
        y_type = {"multiclass"}
    if y_type != {"multiclass"}:
        raise ValueError("{0} is not supported".format(y_type))
    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)
    return y_true, y_pred
def _validate_mcmc_fit_input(X_train, y_train, X_test):
    check_consistent_length(X_train, y_train)
    assert_all_finite(y_train)
    y_train = check_array(y_train, ensure_2d=False, dtype=np.float64)

    assert X_train.shape[1] == X_test.shape[1]
    X_train = check_array(X_train, accept_sparse="csc", dtype=np.float64,
                          order="F")
    X_test = check_array(X_test, accept_sparse="csc", dtype=np.float64,
                         order="F")
    return X_train, y_train, X_test
def item_finder_statistics(y_true, y_pred):
    """
    Full statistics of prediction performance

    * n_samples
    * true: mean, stdev
    * predicted: mean, stdev
    * area under the curve (when both classes are present)

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores

    Returns
    -------
    stats : dict
        Full statistics of prediction performance
    """
    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {}

    # dataset size
    stats['n_samples'] = y_true.size

    # descriptive statistics of ground truth scores
    stats['true'] = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}

    # descriptive statistics of predicted scores
    stats['predicted'] = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}

    # AUC (area under the curve); defined only when both 0 and 1 occur
    # in the true score array
    if is_binary_score(y_true, allow_uniform=False):
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    return stats
def item_finder_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance

    * AUC
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """
    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {
        'n_samples': y_true.size,
        'true': {'mean': np.mean(y_true), 'stdev': np.std(y_true)},
        'predicted': {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}}

    # AUC is defined only when both 0 and 1 occur in the true score array
    if is_binary_score(y_true, allow_uniform=False):
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    # display statistics
    if disp:
        print(json.dumps(
            stats, sort_keys=True, indent=4, separators=(',', ': '),
            ensure_ascii=False), file=sys.stderr)

    return stats
def __init__(self, x, y, status, time=None):
    self.x, self.y = check_X_y(x, y)
    assert numpy.issubdtype(y.dtype, numpy.integer), \
        "y vector must have integer type, but was {0}".format(y.dtype)
    assert y.min() == 0, "minimum element of y vector must be 0"

    if time is None:
        self.status = check_array(status, dtype=bool, ensure_2d=False)
        check_consistent_length(self.x, self.status)
    else:
        self.status = check_array(status, dtype=bool, ensure_2d=False)
        self.time = check_array(time, ensure_2d=False)
        check_consistent_length(self.x, self.status, self.time)

    self.eps = numpy.finfo(self.x.dtype).eps
def get_label_n(y, y_pred, n=None):
    """Function to turn raw outlier scores into binary labels by assigning
    1 to the top n outlier scores.

    Parameters
    ----------
    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.

    n : int, optional (default=None)
        The number of outliers. If not defined, infer using ground truth.

    Returns
    -------
    labels : numpy array of shape (n_samples,)
        binary labels 0: normal points and 1: outliers

    Examples
    --------
    >>> from pyod.utils.utility import get_label_n
    >>> y = [0, 1, 1, 0, 0]
    >>> y_pred = [0.1, 0.5, 0.3, 0.2, 0.7]
    >>> get_label_n(y, y_pred)
    array([0, 1, 0, 0, 1])
    """
    # enforce formats of inputs
    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    y_len = len(y)  # the length of targets

    # calculate the percentage of outliers
    if n is not None:
        outliers_fraction = n / y_len
    else:
        outliers_fraction = np.count_nonzero(y) / y_len

    threshold = scoreatpercentile(y_pred, 100 * (1 - outliers_fraction))
    y_pred = (y_pred > threshold).astype('int')

    return y_pred
def fit(self, X, y):
    """ Fit model with specified loss.

    Parameters
    ----------
    X : scipy.sparse.csc_matrix, (n_samples, n_features)

    y : float | ndarray, shape = (n_samples, )
    """
    check_consistent_length(X, y)
    y = check_array(y, ensure_2d=False, dtype=np.float64)

    X = X.T
    X = check_array(X, accept_sparse="csc", dtype=np.float64)

    self.w0_, self.w_, self.V_ = ffm.ffm_sgd_fit(self, X, y)
    return self
def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None):
    check_consistent_length(y_true, pred_decision, sample_weight)
    pred_decision = check_array(pred_decision, ensure_2d=False)
    y_true = column_or_1d(y_true)
    y_true_unique = np.unique(y_true)

    if y_true_unique.size > 2:
        if (labels is None and pred_decision.ndim > 1
                and (np.size(y_true_unique) != pred_decision.shape[1])):
            raise ValueError("Please include all labels in y_true "
                             "or pass labels as third argument")
        if labels is None:
            labels = y_true_unique
        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        mask = np.ones_like(pred_decision, dtype=bool)
        mask[np.arange(y_true.shape[0]), y_true] = False
        margin = pred_decision[~mask]
        margin -= np.max(pred_decision[mask].reshape(y_true.shape[0], -1),
                         axis=1)
    else:
        # Handles binary class case
        # this code assumes that positive and negative labels
        # are encoded as +1 and -1 respectively
        pred_decision = column_or_1d(pred_decision)
        pred_decision = np.ravel(pred_decision)

        lbin = LabelBinarizer(neg_label=-1)
        y_true = lbin.fit_transform(y_true)[:, 0]

        try:
            margin = y_true * pred_decision
        except TypeError:
            raise TypeError("pred_decision should be an array of floats.")

    losses = 1 - margin
    # The hinge_loss doesn't penalize good enough predictions.
    losses[losses <= 0] = 0
    return losses
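# Worked example for the binary branch of the hinge_loss variant above, which
# returns per-sample losses (toy values; LabelBinarizer etc. assumed in
# scope). Labels are binarized to {-1, +1}, margins = y * decision
# = [1.5, 2.0, 0.3], and losses = max(0, 1 - margin) = [0, 0, 0.7].
import numpy as np
print(hinge_loss(np.array([0, 1, 1]), np.array([-1.5, 2.0, 0.3])))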
def mean_squared_error(y_true, y_pred):
    """
    Root mean squared error, mean squared error, and its standard deviation.

    If you need only the RMSE, use :func:`sklearn.metrics.mean_squared_error`

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores

    Returns
    -------
    rmse : float
        root mean squared error
    mean : float
        mean of squared errors
    stdev : float
        standard deviation of squared errors
    """
    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calculate errors
    errs = (y_true - y_pred) ** 2
    mean = np.nanmean(errs)
    stdev = np.nanstd(errs)
    rmse = np.sqrt(np.maximum(mean, 0.))

    return rmse, mean, stdev
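# Worked example for the mean_squared_error variant above (it shadows
# sklearn's function of the same name; illustrative values). Squared errors
# for [3, 5] vs [1, 5] are [4, 0], so mean = 2, rmse = sqrt(2) ~ 1.414,
# and stdev = std([4, 0]) = 2.
import numpy as np
rmse, mean, stdev = mean_squared_error(np.array([3., 5.]), np.array([1., 5.]))
print(rmse, mean, stdev)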
def fit(self, X, y, qids, sample_weight=None, monitor=None):
    """Fit lambdamart onto a dataset.

    Parameters
    ----------
    X : array_like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array_like, shape = [n_samples]
        Target values (integers in classification, real numbers in
        regression). For classification, labels must correspond to classes.

    qids : array_like, shape = [n_samples]
        Query ids for each sample. Samples must be grouped by query such
        that all queries with the same qid appear in one contiguous block.

    monitor : callable, optional
        The monitor is called after each iteration with the current
        iteration, a reference to the estimator and the local variables of
        ``_fit_stages`` as keyword arguments ``callable(i, self, locals())``.
        If the callable returns ``True`` the fitting procedure is stopped.
        The monitor can be used for various things such as computing
        held-out estimates, early stopping, model introspection, and
        snapshotting.
    """
    if not self.warm_start:
        self._clear_state()

    X, y = check_X_y(X, y, dtype=DTYPE)
    n_samples, self.n_features = X.shape

    check_consistent_length(X, y, qids)
    if y.dtype.kind == 'O':
        y = y.astype(np.float64)

    random_state = check_random_state(self.random_state)
    self._check_params()

    if not self._is_initialized():
        self._init_state()
        begin_at_stage = 0
        y_pred = np.zeros(y.shape[0])
    else:
        if self.n_estimators < self.estimators_.shape[0]:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'estimators_.shape[0]=%d when '
                             'warm_start==True'
                             % (self.n_estimators,
                                self.estimators_.shape[0]))
        begin_at_stage = self.estimators_.shape[0]
        self.estimators_fitted_ = begin_at_stage
        self.estimators_.resize((self.n_estimators, 1))
        self.train_score_.resize(self.n_estimators)
        if self.query_subsample < 1.0:
            self.oob_improvement_.resize(self.n_estimators)
        y_pred = self.predict(X)

    n_stages = self._fit_stages(X, y, qids, y_pred, random_state,
                                begin_at_stage, monitor)

    if n_stages < self.estimators_.shape[0]:
        self.trim(n_stages)

    return self
def score_predictor_statistics(y_true, y_pred, score_domain=(1, 5, 1)):
    """
    Full statistics of prediction performance

    * n_samples
    * score levels
    * mean absolute error: mean, stdev
    * mean squared error: rmse, mean, stdev
    * true: mean, stdev, histogram, histogram density
    * predicted: mean, stdev, histogram, histogram density

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores
    score_domain : array, shape=(3,)
        Domain of scores, represented by a triple: start, end, and stride,
        default=(1, 5, 1).

    Returns
    -------
    stats : dict
        Full statistics of prediction performance
    """
    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {}

    # dataset size
    stats['n_samples'] = y_true.size

    # a list of possible score levels
    stats['score levels'] = np.hstack([
        np.arange(score_domain[0], score_domain[1], score_domain[2],
                  dtype=float),
        score_domain[1]])

    # mean absolute error
    mean, stdev = mean_absolute_error(y_true, y_pred)
    stats['mean absolute error'] = {'mean': mean, 'stdev': stdev}

    # root mean squared error
    rmse, mean, stdev = mean_squared_error(y_true, y_pred)
    stats['mean squared error'] = {'rmse': rmse, 'mean': mean, 'stdev': stdev}

    # descriptive statistics of ground truth scores
    stats['true'] = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}

    hist, _ = score_histogram(y_true, score_domain=score_domain)
    stats['true']['histogram'] = hist
    stats['true']['histogram density'] = hist / hist.sum()

    # descriptive statistics of predicted scores
    stats['predicted'] = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}

    hist, _ = score_histogram(y_pred, score_domain=score_domain)
    stats['predicted']['histogram'] = hist
    stats['predicted']['histogram density'] = hist / hist.sum()

    return stats
def fit(self, X, y):
    """
    The Gaussian Process model fitting method.

    Parameters
    ----------
    X : double array_like
        An array with shape (n_samples, n_features) with the input at which
        observations were made.

    y : double array_like
        An array with shape (n_samples, ) or shape (n_samples, n_targets)
        with the observations of the output to be predicted.

    Returns
    -------
    gp : self
        A fitted Gaussian Process model object awaiting data to perform
        predictions.
    """
    # Run input checks
    self._check_params()

    self.random_state = check_random_state(self.random_state)

    # Force data to 2D numpy.array
    X = check_array(X)
    y = np.asarray(y)
    self.y_ndim_ = y.ndim
    if y.ndim == 1:
        y = y[:, np.newaxis]
    check_consistent_length(X, y)

    # Check shapes of DOE & observations
    n_samples, n_features = X.shape
    _, n_targets = y.shape

    # Run input checks
    self._check_params(n_samples)

    # Normalize data or don't
    if self.normalize:
        X_mean = np.mean(X, axis=0)
        X_std = np.std(X, axis=0)
        y_mean = np.mean(y, axis=0)
        y_std = np.std(y, axis=0)
        X_std[X_std == 0.] = 1.
        y_std[y_std == 0.] = 1.
        # center and scale X if necessary
        X = (X - X_mean) / X_std
        y = (y - y_mean) / y_std
    else:
        X_mean = np.zeros(1)
        X_std = np.ones(1)
        y_mean = np.zeros(1)
        y_std = np.ones(1)

    # Fit correlation model
    self.corr.fit(X, self.nugget)

    # Regression matrix and parameters
    F = self.regr(X)
    n_samples_F = F.shape[0]
    if F.ndim > 1:
        p = F.shape[1]
    else:
        p = 1
    if n_samples_F != n_samples:
        raise Exception("Number of rows in F and X do not match. Most "
                        "likely something is going wrong with the "
                        "regression model.")
    if p > n_samples_F:
        raise Exception(("Ordinary least squares problem is underdetermined "
                         "n_samples=%d must be greater than the "
                         "regression model size p=%d.") % (n_samples, p))
    if self.beta0 is not None:
        if self.beta0.shape[0] != p:
            raise Exception("Shapes of beta0 and F do not match.")

    # Set attributes
    self.X = X
    self.y = y
    self.F = F
    self.X_mean, self.X_std = X_mean, X_std
    self.y_mean, self.y_std = y_mean, y_std

    # Determine Gaussian Process model parameters
    if self.thetaL is not None and self.thetaU is not None:
        # Maximum a Posteriori estimation of the parameters
        if self.verbose:
            print("Performing Maximum a Posteriori estimation of the "
                  "autocorrelation parameters...")
        self.theta_, self.posterior_function_value_, par = \
            self._arg_max_posterior()
        # compute reduced_likelihood_function_value_ for backward
        # compatibility
        self.reduced_likelihood_function_value_, _ = \
            self.reduced_likelihood_function()
        if np.isinf(self.posterior_function_value_):
            raise Exception("Bad parameter region. "
                            "Try increasing upper bound")
    else:
        # Given parameters
        if self.verbose:
            print("Given autocorrelation parameters. "
                  "Computing Gaussian Process model parameters...")
        self.theta_ = self.theta0
        self.reduced_likelihood_function_value_, par = \
            self.reduced_likelihood_function()
        self.posterior_function_value_ = \
            self.reduced_likelihood_function_value_ \
            + self.corr.log_prior(self.theta_)
        if np.isinf(self.posterior_function_value_):
            raise Exception("Bad point. Try increasing theta0.")

    self.beta = par['beta']
    self.gamma = par['gamma']
    self.sigma2 = par['sigma2']
    self.C = par['C']
    self.Ft = par['Ft']
    self.G = par['G']

    if self.storage_mode == 'light':
        # Delete heavy data (it will be computed again if required)
        # (it is required only when MSE is wanted in self.predict)
        if self.verbose:
            print("Light storage mode specified. "
                  "Flushing autocorrelation matrix...")
        self.F = None
        self.C = None
        self.Ft = None
        self.G = None

    return self
def concordance_index_censored(event_indicator, event_time, estimate):
    """Concordance index for right-censored data

    The concordance index is defined as the proportion of all comparable
    pairs in which the predictions and outcomes are concordant.

    Samples are comparable if for at least one of them an event occurred.
    If the estimated risk is larger for the sample with a higher time of
    event/censoring, the predictions of that pair are said to be concordant.
    If an event occurred for one sample and the other is known to be
    event-free at least until the time of event of the first, the second
    sample is assumed to *outlive* the first.
    When predicted risks are identical for a pair, 0.5 rather than 1 is
    added to the count of concordant pairs.
    A pair is not comparable if an event occurred for both of them at the
    same time or an event occurred for one of them but the time of censoring
    is smaller than the time of event of the first one.

    Parameters
    ----------
    event_indicator : array-like, shape = [n_samples,]
        Boolean array denotes whether an event occurred

    event_time : array-like, shape = [n_samples,]
        Array containing the time of an event or time of censoring

    estimate : array-like, shape = [n_samples,]
        Estimated risk of experiencing an event

    Returns
    -------
    cindex : float
        Concordance index

    concordant : int
        Number of concordant pairs

    discordant : int
        Number of discordant pairs

    tied_risk : int
        Number of pairs having tied estimated risks

    tied_time : int
        Number of pairs having an event at the same time

    References
    ----------
    .. [1] Harrell, F.E., Califf, R.M., Pryor, D.B., Lee, K.L., Rosati, R.A,
           "Multivariable prognostic models: issues in developing models,
           evaluating assumptions and adequacy, and measuring and reducing
           errors", Statistics in Medicine, 15(4), 361-87, 1996.
    """
    check_consistent_length(event_indicator, event_time, estimate)
    event_indicator = check_array(event_indicator, ensure_2d=False)
    event_time = check_array(event_time, ensure_2d=False)
    estimate = check_array(estimate, ensure_2d=False)

    if not numpy.issubdtype(event_indicator.dtype, numpy.bool_):
        raise ValueError(
            'only boolean arrays are supported as class labels for survival'
            ' analysis, got {0}'.format(event_indicator.dtype))

    n_samples = len(event_time)
    if n_samples < 2:
        raise ValueError("Need a minimum of two samples")

    if not event_indicator.any():
        raise ValueError("All samples are censored")

    order = numpy.argsort(event_time)

    tied_time = 0
    comparable = {}
    for i in range(n_samples - 1):
        inext = i + 1
        j = inext
        time_i = event_time[order[i]]
        while j < n_samples and event_time[order[j]] == time_i:
            j += 1

        if event_indicator[order[i]]:
            mask = numpy.zeros(n_samples, dtype=bool)
            mask[inext:] = True
            if j - i > 1:
                # event times are tied, need to check for coinciding events
                event_at_same_time = event_indicator[order[inext:j]]
                mask[inext:j] = numpy.logical_not(event_at_same_time)
                tied_time += event_at_same_time.sum()
            comparable[i] = mask
        elif j - i > 1:
            # events at same time are comparable if at least one of them
            # is positive
            mask = numpy.zeros(n_samples, dtype=bool)
            mask[inext:j] = event_indicator[order[inext:j]]
            comparable[i] = mask

    concordant = 0
    discordant = 0
    tied_risk = 0
    for ind, mask in comparable.items():
        est_i = estimate[order[ind]]
        event_i = event_indicator[order[ind]]
        est = estimate[order[mask]]

        if event_i:
            # an event should have a higher score
            con = (est < est_i).sum()
        else:
            # a non-event should have a lower score
            con = (est > est_i).sum()

        concordant += con
        tie = (est == est_i).sum()
        tied_risk += tie
        discordant += est.size - con - tie

    cindex = (concordant + 0.5 * tied_risk) / (
        concordant + discordant + tied_risk)
    return cindex, concordant, discordant, tied_risk, tied_time
def fit(self, X1, y1, X2, y2, left_right_bounds=None):
    """Fit estimator using RANSAC algorithm.

    Namely, the fit is done in two main steps:
    - pre-fitting: quickly select n_prefits configurations which seem
      suitable given topological constraints;
    - finding best fit: select the pre-fit with the maximum number of
      inliers as the best fit.

    Inputs:
      X1, y1: Left lane points (supposedly);
      X2, y2: Right lane points (supposedly).
    """
    check_consistent_length(X1, y1)
    check_consistent_length(X2, y2)

    # Assume linear model by default
    min_samples = X1.shape[1] + 1
    if min_samples > X1.shape[0] or min_samples > X2.shape[0]:
        raise ValueError("`min_samples` may not be larger than number "
                         "of samples ``X1-2.shape[0]``.")

    # Check additional parameters...
    if self.stop_probability < 0 or self.stop_probability > 1:
        raise ValueError("`stop_probability` must be in range [0, 1].")

    if self.residual_threshold is None:
        # MAD over the combined targets (assumption: the source referenced
        # an undefined bare `y` here).
        y_all = np.concatenate([y1, y2])
        residual_threshold = np.median(np.abs(y_all - np.median(y_all)))
    else:
        residual_threshold = self.residual_threshold
    delta_left_right = (left_right_bounds[0, 0, 1]
                        + left_right_bounds[0, 0, 0]) / 2.
    # random_state = check_random_state(self.random_state)

    # Set up lambdas for computing score.
    score_lambdas = np.copy(self.score_lambdas)
    score_lambdas[0] = score_lambdas[0] / (y1.size + y2.size)

    # Collections...
    self.w_fits = []
    self.w_fits_l2 = []
    self.inliers_masks = []
    self.n_inliers = []
    self.score_fits = []

    # === Left lane, and then, right lane === #
    w_left_prefits = lanes_ransac_prefit(X1, y1,
                                         self.n_prefits, self.max_trials,
                                         self.w_refs_left,
                                         self.is_valid_bounds_left)
    (w_left1, in_mask_left1, score_left1) = \
        lanes_ransac_select_best(X1, y1, w_left_prefits,
                                 residual_threshold,
                                 self.w_refs_left, score_lambdas)
    n_inliers_left1 = np.sum(in_mask_left1)

    w_refs = np.vstack((self.w_refs_right, np.reshape(w_left1, (1, 3))))
    is_valid_bounds = np.vstack((self.is_valid_bounds_right,
                                 left_right_bounds))
    w_right_prefits = lanes_ransac_prefit(X2, y2,
                                          self.n_prefits, self.max_trials,
                                          w_refs, is_valid_bounds)
    w0 = lane_translate(w_left1, delta_left_right)
    w_right_prefits = np.vstack((w0, w_right_prefits))

    (w_right1, in_mask_right1, score_right1) = \
        lanes_ransac_select_best(X2, y2, w_right_prefits,
                                 residual_threshold,
                                 self.w_refs_right, score_lambdas)
    n_inliers_right1 = np.sum(in_mask_right1)

    n_inliers1 = n_inliers_right1 + n_inliers_left1
    self.w_fits.append((w_left1, w_right1))
    self.n_inliers.append(n_inliers1)
    self.inliers_masks.append((in_mask_left1, in_mask_right1))
    self.score_fits.append((score_left1, score_right1))

    # === Right lane and then left lane === #
    w_right_prefits = lanes_ransac_prefit(X2, y2,
                                          self.n_prefits, self.max_trials,
                                          self.w_refs_right,
                                          self.is_valid_bounds_right)
    (w_right2, in_mask_right2, score_right2) = \
        lanes_ransac_select_best(X2, y2, w_right_prefits,
                                 residual_threshold,
                                 self.w_refs_right, score_lambdas)
    n_inliers_right2 = np.sum(in_mask_right2)

    w_refs = np.vstack((self.w_refs_left, np.reshape(w_right2, (1, 3))))
    is_valid_bounds = np.vstack((self.is_valid_bounds_left,
                                 left_right_bounds))
    w_left_prefits = lanes_ransac_prefit(X1, y1,
                                         self.n_prefits, self.max_trials,
                                         w_refs, is_valid_bounds)
    w0 = lane_translate(w_right2, -delta_left_right)
    w_left_prefits = np.vstack((w0, w_left_prefits))
    (w_left2, in_mask_left2, score_left2) = \
        lanes_ransac_select_best(X1, y1, w_left_prefits,
                                 residual_threshold,
                                 self.w_refs_left, score_lambdas)
    n_inliers_left2 = np.sum(in_mask_left2)

    n_inliers2 = n_inliers_right2 + n_inliers_left2
    self.w_fits.append((w_left2, w_right2))
    self.n_inliers.append(n_inliers2)
    self.inliers_masks.append((in_mask_left2, in_mask_right2))
    self.score_fits.append((score_left2, score_right2))

    # === Previous frame??? === #
    if self.w_refs_left.size > 0 and self.w_refs_right.size > 0:
        in_mask_left3 = lanes_inliers(X1, y1, self.w_refs_left[0],
                                      residual_threshold)
        in_mask_right3 = lanes_inliers(X2, y2, self.w_refs_right[0],
                                       residual_threshold)
        n_inliers3 = np.sum(in_mask_left3) + np.sum(in_mask_right3)
        score_left3 = lane_score(np.sum(in_mask_left3),
                                 self.w_refs_left[0],
                                 self.w_refs_left, score_lambdas)
        score_right3 = lane_score(np.sum(in_mask_right3),
                                  self.w_refs_right[0],
                                  self.w_refs_right, score_lambdas)
        self.w_fits.append((self.w_refs_left[0], self.w_refs_right[0]))
        self.n_inliers.append(n_inliers3)
        self.inliers_masks.append((in_mask_left3, in_mask_right3))
        self.score_fits.append((score_left3, score_right3))

    # L2 regression regularisation of fits.
    self.w_fits_l2 = copy.deepcopy(self.w_fits)
    if self.l2_scales is not None:
        for i in range(len(self.w_fits)):
            w1, w2 = self.w_fits[i]
            # Some regression: ignored on inverse-matrix errors.
            try:
                w_left = m_regression_exp(X1, y1, w1, self.l2_scales)
            except Exception:
                w_left = w1
            try:
                w_right = m_regression_exp(X2, y2, w2, self.l2_scales)
            except Exception:
                w_right = w2

            in_mask_left = lanes_inliers(X1, y1, w_left,
                                         residual_threshold)
            in_mask_right = lanes_inliers(X2, y2, w_right,
                                          residual_threshold)
            n_inliers = np.sum(in_mask_left) + np.sum(in_mask_right)
            score_left = lane_score(np.sum(in_mask_left), w_left,
                                    self.w_refs_left, score_lambdas)
            score_right = lane_score(np.sum(in_mask_right), w_right,
                                     self.w_refs_right, score_lambdas)

            self.w_fits_l2[i] = (w_left, w_right)
            self.n_inliers[i] = n_inliers
            self.inliers_masks[i] = (in_mask_left, in_mask_right)
            self.score_fits[i] = (score_left, score_right)

    # Best fit?
    scores = [s1 + s2 for (s1, s2) in self.score_fits]
    idx = np.argmax(scores)
    w_left, w_right = self.w_fits_l2[idx]
    in_mask_left, in_mask_right = self.inliers_masks[idx]

    # Smoothing.
    smoothing = self.smoothing
    if self.w_refs_left.size > 0 and self.w_refs_right.size > 0:
        w_left = smoothing * w_left \
            + (1. - smoothing) * self.w_refs_left[0]
        w_right = smoothing * w_right \
            + (1. - smoothing) * self.w_refs_right[0]

    self.w1_ = w_left
    self.w2_ = w_right

    # Set regression parameters.
    base_estimator1 = LinearRegression(fit_intercept=False)
    base_estimator1.coef_ = w_left
    base_estimator1.intercept_ = 0.0
    base_estimator2 = LinearRegression(fit_intercept=False)
    base_estimator2.coef_ = w_right
    base_estimator2.intercept_ = 0.0

    # Save final model parameters.
    self.estimator1_ = base_estimator1
    self.estimator2_ = base_estimator2
    self.inlier_mask1_ = in_mask_left
    self.inlier_mask2_ = in_mask_right

    # # Estimate final model using all inliers
    # # base_estimator1.fit(X1_inlier_best, y1_inlier_best)
    # # base_estimator2.fit(X2_inlier_best, y2_inlier_best)

    return self
def fit(self, X1, y1, X2, y2):
    """Fit estimator using RANSAC algorithm.

    Namely, the fit is done in two main steps:
    - pre-fitting: quickly select n_prefits configurations which seem
      suitable given topological constraints;
    - finding best fit: select the pre-fit with the maximum number of
      inliers as the best fit.

    Inputs:
      X1, y1: Left lane points (supposedly);
      X2, y2: Right lane points (supposedly).
    """
    check_consistent_length(X1, y1)
    check_consistent_length(X2, y2)

    # Assume linear model by default
    min_samples = X1.shape[1] + 1
    if min_samples > X1.shape[0] or min_samples > X2.shape[0]:
        raise ValueError("`min_samples` may not be larger than number "
                         "of samples ``X1-2.shape[0]``.")

    # Check additional parameters...
    if self.stop_probability < 0 or self.stop_probability > 1:
        raise ValueError("`stop_probability` must be in range [0, 1].")

    if self.residual_threshold is None:
        # MAD over the combined targets (assumption: the source referenced
        # an undefined bare `y` here).
        y_all = np.concatenate([y1, y2])
        residual_threshold = np.median(np.abs(y_all - np.median(y_all)))
    else:
        residual_threshold = self.residual_threshold
    # random_state = check_random_state(self.random_state)

    # === Pre-fit with small subsets (4 points) === #
    # Allows to quickly pre-select some good configurations.
    w1_prefits, w2_prefits = lanes_ransac_prefit(X1, y1, X2, y2,
                                                 self.n_prefits,
                                                 self.max_trials,
                                                 self.is_valid_diffs,
                                                 self.is_valid_bounds)

    # === Select best pre-fit, using the full dataset === #
    post_fit = 0
    (w1, w2, inlier_mask1, inlier_mask2) = \
        lanes_ransac_select_best(X1, y1, X2, y2,
                                 w1_prefits, w2_prefits,
                                 residual_threshold, post_fit)
    self.w1_ = w1
    self.w2_ = w2

    # Set regression parameters.
    base_estimator1 = LinearRegression(fit_intercept=False)
    base_estimator1.coef_ = w1
    base_estimator1.intercept_ = 0.0
    base_estimator2 = LinearRegression(fit_intercept=False)
    base_estimator2.coef_ = w2
    base_estimator2.intercept_ = 0.0

    # Save final model parameters.
    self.estimator1_ = base_estimator1
    self.estimator2_ = base_estimator2
    self.inlier_mask1_ = inlier_mask1
    self.inlier_mask2_ = inlier_mask2

    # # Estimate final model using all inliers
    # # base_estimator1.fit(X1_inlier_best, y1_inlier_best)
    # # base_estimator2.fit(X2_inlier_best, y2_inlier_best)

    return self