def predict_proba(self, X, check_input=True): """Predict class probabilities of the input samples X. The predicted class probability is the fraction of samples of the same class in a leaf. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. Returns ------- p : array of shape = [n_samples, n_classes], or a list of n_outputs such arrays if n_outputs > 1. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ check_is_fitted(self, 'n_outputs_') if check_input: X = check_array(X, dtype=DTYPE) n_samples, n_features = X.shape if self.tree_ is None: raise NotFittedError("Tree not initialized. Perform a fit first.") if self.n_features_ != n_features: raise ValueError("Number of features of the model must " " match the input. Model n_features is %s and " " input n_features is %s " % (self.n_features_, n_features)) proba = self.tree_.predict(X) if self.n_outputs_ == 1: proba = proba[:, :self.n_classes_] normalizer = proba.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba /= normalizer return proba else: all_proba = [] for k in range(self.n_outputs_): proba_k = proba[:, k, :self.n_classes_[k]] normalizer = proba_k.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba_k /= normalizer all_proba.append(proba_k) return all_proba
def fit(self, X, y): """ Fit model with specified loss. Parameters ---------- X : scipy.sparse.csc_matrix, (n_samples, n_features) y : float | ndarray, shape = (n_samples, ) the targets have to be encodes as {-1, 1}. """ y = _validate_class_labels(y) self.classes_ = np.unique(y) if len(self.classes_) != 2: raise ValueError("This solver only supports binary classification" " but the data contains" " class: %r" % self.classes_) # fastFM-core expects labels to be in {-1,1} y_train = y.copy() i_class1 = (y_train == self.classes_[0]) y_train[i_class1] = -1 y_train[-i_class1] = 1 check_consistent_length(X, y) y = y.astype(np.float64) X = X.T X = check_array(X, accept_sparse="csc", dtype=np.float64) self.w0_, self.w_, self.V_ = ffm.ffm_sgd_fit(self, X, y) return self
def _validate_mcmc_fit_input(X_train, y_train, X_test): check_consistent_length(X_train, y_train) assert_all_finite(y_train) y_train = check_array(y_train, ensure_2d=False, dtype=np.float64) assert X_train.shape[1] == X_test.shape[1] X_train = check_array(X_train, accept_sparse="csc", dtype=np.float64, order="F") X_test = check_array(X_test, accept_sparse="csc", dtype=np.float64, order="F") return X_train, y_train, X_test
def fit(self, X, y): """ Fit model with specified loss. Parameters ---------- X : scipy.sparse.csc_matrix, (n_samples, n_features) y : float | ndarray, shape = (n_samples, ) """ check_consistent_length(X, y) y = check_array(y, ensure_2d=False, dtype=np.float64) X = X.T X = check_array(X, accept_sparse="csc", dtype=np.float64) self.w0_, self.w_, self.V_ = ffm.ffm_sgd_fit(self, X, y) return self
def predict(self, X_test): """ Return predictions Parameters ---------- X : scipy.sparse.csc_matrix, (n_samples, n_features) Returns ------ T : array, shape (n_samples) The labels are returned for classification. """ X_test = check_array(X_test, accept_sparse="csc", dtype=np.float64, order="F") assert sp.isspmatrix_csc(X_test) assert X_test.shape[1] == len(self.w_) return ffm.ffm_predict(self.w0_, self.w_, self.V_, X_test)
def fit(self, X, pairs): """ Fit model with specified loss. Parameters ---------- X : scipy.sparse.csc_matrix, (n_samples, n_features) y : float | ndarray, shape = (n_compares, 2) Each row `i` defines a pair of samples such that the first returns a high value then the second FM(X[i,0]) > FM(X[i, 1]). """ X = X.T X = check_array(X, accept_sparse="csc", dtype=np.float64) assert_all_finite(pairs) pairs = pairs.astype(np.float64) # check that pairs contain no real values np.testing.assert_array_equal(pairs, pairs.astype(np.int32)) assert pairs.max() <= X.shape[1] assert pairs.min() >= 0 self.w0_, self.w_, self.V_ = ffm.ffm_fit_sgd_bpr(self, X, pairs) return self
def predict(self, X, check_input=True): """Predict class or regression value for X. For a classification model, the predicted class for each sample in X is returned. For a regression model, the predicted value based on X is returned. Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- y : array of shape = [n_samples] or [n_samples, n_outputs] The predicted classes, or the predict values. """ if check_input: X = check_array(X, dtype=DTYPE) n_samples, n_features = X.shape if self.tree_ is None: raise NotFittedError("Tree not initialized. Perform a fit first") if self.n_features_ != n_features: raise ValueError("Number of features of the model must " " match the input. Model n_features is %s and " " input n_features is %s " % (self.n_features_, n_features)) proba = self.tree_.predict(X) # Classification if isinstance(self, ClassifierMixin): if self.n_outputs_ == 1: return self.classes_.take(np.argmax(proba, axis=1), axis=0) else: predictions = np.zeros((n_samples, self.n_outputs_)) for k in range(self.n_outputs_): predictions[:, k] = self.classes_[k].take(np.argmax(proba[:, k], axis=1), axis=0) return predictions # Regression else: if self.n_outputs_ == 1: return proba[:, 0] else: return proba[:, :, 0]
def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree from the training set (X, y). Parameters ---------- X : array-like or sparse matrix, shape = [n_samples, n_features] The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. y : array-like, shape = [n_samples] or [n_samples, n_outputs] The target values (class labels in classification, real numbers in regression). In the regression case, use ``dtype=np.float64`` and ``order='C'`` for maximum efficiency. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- self : object Returns self. """ random_state = check_random_state(self.random_state) if check_input: X = check_array(X, dtype=DTYPE) # Determine output settings n_samples, self.n_features_ = X.shape is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) expanded_class_weight = None if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs # [:, np.newaxis] that does not. y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] if is_classification: y = np.copy(y) self.classes_ = [] self.n_classes_ = [] if self.class_weight is not None: y_original = np.copy(y) for k in range(self.n_outputs_): classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) if self.class_weight is not None: expanded_class_weight = compute_sample_weight( self.class_weight, y_original) else: self.classes_ = [None] * self.n_outputs_ self.n_classes_ = [1] * self.n_outputs_ self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters max_depth = ((2**31) - 1 if self.max_depth is None else self.max_depth) max_leaf_nodes = (-1 if self.max_leaf_nodes is None else self.max_leaf_nodes) if isinstance(self.max_features, string_types): if self.max_features == "auto": if is_classification: max_features = max(1, int(np.sqrt(self.n_features_))) else: max_features = self.n_features_ elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features_))) elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: raise ValueError( 'Invalid value for max_features. Allowed string ' 'values are "auto", "sqrt" or "log2".') elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float if self.max_features > 0.0: max_features = max(1, int(self.max_features * self.n_features_)) else: max_features = 0 self.max_features_ = max_features if len(y) != n_samples: raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples)) if self.min_samples_split <= 0: raise ValueError("min_samples_split must be greater than zero.") if self.min_samples_leaf <= 0: raise ValueError("min_samples_leaf must be greater than zero.") if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if max_depth <= 0: raise ValueError("max_depth must be greater than zero. ") if not (0 < max_features <= self.n_features_): raise ValueError("max_features must be in (0, n_features]") if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)): raise ValueError("max_leaf_nodes must be integral number but was " "%r" % max_leaf_nodes) if -1 < max_leaf_nodes < 2: raise ValueError(("max_leaf_nodes {0} must be either smaller than " "0 or larger than 1").format(max_leaf_nodes)) if sample_weight is not None: if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous): sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE) if len(sample_weight.shape) > 1: raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape)) if len(sample_weight) != n_samples: raise ValueError("Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)) if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * np.sum(sample_weight)) else: min_weight_leaf = 0. # Set min_samples_split sensibly min_samples_split = max(self.min_samples_split, 2 * self.min_samples_leaf) # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): if is_classification: criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_) SPLITTERS = DENSE_SPLITTERS splitter = self.splitter if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, self.max_features_, self.min_samples_leaf, min_weight_leaf, random_state) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, self.min_samples_leaf, min_weight_leaf, max_depth) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, self.min_samples_leaf, min_weight_leaf, max_depth, max_leaf_nodes) builder.build(self.tree_, X, y, sample_weight) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self
def _validate_class_labels(y): assert len(set(y)) == 2 assert y.min() == -1 assert y.max() == 1 return check_array(y, ensure_2d=False, dtype=np.float64)
def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree from the training set (X, y). Parameters ---------- X : array-like or sparse matrix, shape = [n_samples, n_features] The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. y : array-like, shape = [n_samples] or [n_samples, n_outputs] The target values (class labels in classification, real numbers in regression). In the regression case, use ``dtype=np.float64`` and ``order='C'`` for maximum efficiency. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- self : object Returns self. """ random_state = check_random_state(self.random_state) if check_input: X = check_array(X, dtype=DTYPE) # Determine output settings n_samples, self.n_features_ = X.shape is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) expanded_class_weight = None if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs # [:, np.newaxis] that does not. y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] if is_classification: y = np.copy(y) self.classes_ = [] self.n_classes_ = [] if self.class_weight is not None: y_original = np.copy(y) for k in range(self.n_outputs_): classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) if self.class_weight is not None: expanded_class_weight = compute_sample_weight( self.class_weight, y_original) else: self.classes_ = [None] * self.n_outputs_ self.n_classes_ = [1] * self.n_outputs_ self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters max_depth = ((2 ** 31) - 1 if self.max_depth is None else self.max_depth) max_leaf_nodes = (-1 if self.max_leaf_nodes is None else self.max_leaf_nodes) if isinstance(self.max_features, string_types): if self.max_features == "auto": if is_classification: max_features = max(1, int(np.sqrt(self.n_features_))) else: max_features = self.n_features_ elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features_))) elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: raise ValueError( 'Invalid value for max_features. Allowed string ' 'values are "auto", "sqrt" or "log2".') elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float if self.max_features > 0.0: max_features = max(1, int(self.max_features * self.n_features_)) else: max_features = 0 self.max_features_ = max_features if len(y) != n_samples: raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples)) if self.min_samples_split <= 0: raise ValueError("min_samples_split must be greater than zero.") if self.min_samples_leaf <= 0: raise ValueError("min_samples_leaf must be greater than zero.") if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if max_depth <= 0: raise ValueError("max_depth must be greater than zero. ") if not (0 < max_features <= self.n_features_): raise ValueError("max_features must be in (0, n_features]") if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)): raise ValueError("max_leaf_nodes must be integral number but was " "%r" % max_leaf_nodes) if -1 < max_leaf_nodes < 2: raise ValueError(("max_leaf_nodes {0} must be either smaller than " "0 or larger than 1").format(max_leaf_nodes)) if sample_weight is not None: if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous): sample_weight = np.ascontiguousarray( sample_weight, dtype=DOUBLE) if len(sample_weight.shape) > 1: raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape)) if len(sample_weight) != n_samples: raise ValueError("Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)) if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * np.sum(sample_weight)) else: min_weight_leaf = 0. # Set min_samples_split sensibly min_samples_split = max(self.min_samples_split, 2 * self.min_samples_leaf) # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): if is_classification: criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_) SPLITTERS = DENSE_SPLITTERS splitter = self.splitter if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, self.max_features_, self.min_samples_leaf, min_weight_leaf, random_state) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, self.min_samples_leaf, min_weight_leaf, max_depth) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, self.min_samples_leaf, min_weight_leaf, max_depth, max_leaf_nodes) builder.build(self.tree_, X, y, sample_weight) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self
def predict(self, X, check_input=True): """Predict class or regression value for X. For a classification model, the predicted class for each sample in X is returned. For a regression model, the predicted value based on X is returned. Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- y : array of shape = [n_samples] or [n_samples, n_outputs] The predicted classes, or the predict values. """ if check_input: X = check_array(X, dtype=DTYPE) n_samples, n_features = X.shape if self.tree_ is None: raise NotFittedError("Tree not initialized. Perform a fit first") if self.n_features_ != n_features: raise ValueError("Number of features of the model must " " match the input. Model n_features is %s and " " input n_features is %s " % (self.n_features_, n_features)) proba = self.tree_.predict(X) # Classification if isinstance(self, ClassifierMixin): if self.n_outputs_ == 1: return self.classes_.take(np.argmax(proba, axis=1), axis=0) else: predictions = np.zeros((n_samples, self.n_outputs_)) for k in range(self.n_outputs_): predictions[:, k] = self.classes_[k].take( np.argmax(proba[:, k], axis=1), axis=0) return predictions # Regression else: if self.n_outputs_ == 1: return proba[:, 0] else: return proba[:, :, 0]