def fit(self, X, y, input_checks=True):
    """Fit on the training set (X, y) and pick the exemplars.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The training input samples. Stored on ``self.X`` after being passed
        through ``dataset_properties.positive_dataframe_indices``.
    y : array-like, shape = [n_instances]
        The class labels. Stored on ``self.y`` in label-encoded form.
    input_checks : boolean
        Whether to run ``validate_X_y`` on X and y before fitting.

    Returns
    -------
    self : object
    """
    if input_checks:
        validate_X_y(X, y)

    self.X = dataset_properties.positive_dataframe_indices(X)
    self.random_state = check_random_state(self.random_state)

    # Encode the class labels; the fitted encoder is kept on the instance
    # together with the classes it discovered.
    encoder = LabelEncoder()
    self.label_encoder = encoder
    encoder.fit(y)
    self.classes_ = encoder.classes_
    self.y = encoder.transform(y)

    # A distance measure supplied up front is used as-is. Otherwise one is
    # obtained from the getter, which is itself built on demand.
    measure_missing = self.distance_measure is None
    if measure_missing and self.get_distance_measure is None:
        self.get_distance_measure = self.setup_distance_measure(self)
    if measure_missing:
        self.distance_measure = self.get_distance_measure(self)

    exemplars = self.pick_exemplars(self)
    self.X_exemplar, self.y_exemplar = exemplars
    return self
def fit(self, X, y, input_checks=True):
    """Fit ``self.n_trees`` trees on the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The training input samples.
    y : array-like, shape = [n_instances]
        The class labels.
    input_checks : boolean
        Whether to run ``validate_X_y`` on X and y before fitting.

    Returns
    -------
    self : object
        The fitted trees are stored in ``self.trees``.
    """
    if input_checks:
        validate_X_y(X, y)

    self.X = dataset_properties.positive_dataframe_indices(X)
    self.random_state = check_random_state(self.random_state)

    # setup label encoding
    encoder = LabelEncoder()
    self.label_encoder = encoder
    encoder.fit(y)
    self.classes_ = encoder.classes_
    self.y = encoder.transform(y)

    if self.distance_measure is None:
        if self.get_distance_measure is None:
            self.get_distance_measure = \
                self.setup_distance_measure_getter(self)
        self.distance_measure = self.get_distance_measure(self)

    # Lazily pair every tree index with a value drawn from the forest's
    # random state. Keeping this a generator means the draws happen in
    # index order, one at a time, as each job is created.
    jobs = (
        (index, self.random_state.randint(0, self.n_trees))
        for index in range(self.n_trees)
    )

    # NOTE(review): the trees are given the raw X and y arguments, not
    # self.X / self.y - confirm this is intended.
    run_in_parallel = self.n_jobs > 1 or self.n_jobs < 0
    if run_in_parallel:
        parallel = Parallel(self.n_jobs)
        self.trees = parallel(
            delayed(self._fit_tree)(X, y, index, seed)
            for index, seed in jobs
        )
    else:
        self.trees = [
            self._fit_tree(X, y, index, seed) for index, seed in jobs
        ]
    return self
def fit(self, X, y):
    """Fit the wrapped classifier on the training set (X, y).

    Records the number of distinct labels and the class values on the
    instance, then delegates the actual fitting to ``self.classifier``.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples.
        NOTE(review): the previous docstring required a single-column
        (univariate) pandas DataFrame; presumably this is enforced by
        ``validate_X_y`` or by the wrapped classifier - confirm.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    validate_X_y(X, y)

    self.n_classes = len(np.unique(y))

    # class_distribution works on a 2d (n_instances, n_outputs) array; the
    # class values of the single output column are kept.
    label_column = np.asarray(y).reshape(-1, 1)
    distribution = class_distribution(label_column)
    self.classes_ = distribution[0][0]

    # The explicit shapelet-transform steps that used to follow here were
    # disabled; fitting is handled entirely by self.classifier.
    self.classifier.fit(X, y)
    return self
def fit(self, X, y, input_checks=True):
    """Build the tree on the training set (X, y).

    Finds a stump for this node, then recursively fits a ProximityTree on
    every branch that is not a leaf, until ``self.max_depth`` is reached.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The training input samples.
    y : array-like, shape = [n_instances]
        The class labels.
    input_checks : boolean
        Whether to run ``validate_X_y`` on X and y before fitting.

    Returns
    -------
    self : object
    """
    if input_checks:
        validate_X_y(X, y)
    self.X = dataset_properties.positive_dataframe_indices(X)
    self.random_state = check_random_state(self.random_state)
    if self.find_stump is None:
        self.find_stump = best_of_n_stumps(self.n_stump_evaluations)

    # Setup label encoding. The encoder is only created, fitted and applied
    # when this tree does not already have one: sub-trees are handed their
    # parent's encoder (see below) and presumably receive labels that are
    # already encoded, so they must not be encoded a second time.
    if self.label_encoder is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(y)
        y = self.label_encoder.transform(y)
    self.y = y
    self.classes_ = self.label_encoder.classes_

    if self.distance_measure is None:
        if self.get_distance_measure is None:
            self.get_distance_measure = self.setup_distance_measure(self)
        self.distance_measure = self.get_distance_measure(self)

    self.stump = self.find_stump(self)
    n_branches = len(self.stump.y_exemplar)
    # A branch left as None has no sub-tree (leaf, or depth limit reached).
    self.branches = [None] * n_branches

    if self.depth < self.max_depth:
        for index in range(n_branches):
            sub_y = self.stump.y_branches[index]
            if self.is_leaf(sub_y):
                continue
            # NOTE(review): find_stump / n_stump_evaluations are not
            # forwarded to the sub-tree - confirm that is intended.
            sub_tree = ProximityTree(
                random_state=self.random_state,
                get_exemplars=self.get_exemplars,
                distance_measure=self.distance_measure,
                setup_distance_measure=self.setup_distance_measure,
                get_distance_measure=self.get_distance_measure,
                get_gain=self.get_gain,
                is_leaf=self.is_leaf,
                verbosity=self.verbosity,
                max_depth=self.max_depth,
                n_jobs=self.n_jobs)
            # Share the encoder so every node maps labels identically.
            sub_tree.label_encoder = self.label_encoder
            sub_tree.depth = self.depth + 1
            self.branches[index] = sub_tree
            sub_X = self.stump.X_branches[index]
            sub_tree.fit(sub_X, sub_y)
    return self
def _set_oob_score(self, X, y):
    """Compute the out-of-bag score and decision function.

    For every fitted estimator, class probabilities are predicted for the
    rows that were not part of its bootstrap sample and accumulated per
    output. The normalised totals are stored in
    ``self.oob_decision_function_`` and the accuracy of the arg-max
    prediction, averaged over outputs, in ``self.oob_score_``.

    Parameters
    ----------
    X : pandas DataFrame
        The training input samples (rows are selected with ``X.iloc``).
    y : array of shape = [n_samples, n_outputs]
        The target values, indexed per output as ``y[:, k]``.
    """
    validate_X_y(X, y)
    check_X_is_univariate(X)

    class_counts = self.n_classes_
    n_samples = y.shape[0]
    n_outputs = self.n_outputs_

    # One (n_samples, n_classes) accumulator per output.
    vote_totals = [
        np.zeros((n_samples, class_counts[k])) for k in range(n_outputs)
    ]

    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                   self.max_samples)

    for estimator in self.estimators_:
        oob_rows = _generate_unsampled_indices(
            estimator.random_state, n_samples, n_samples_bootstrap)
        probabilities = estimator.predict_proba(X.iloc[oob_rows, :])

        # Single-output estimators return one array; wrap it so that both
        # cases can be indexed per output.
        if n_outputs == 1:
            probabilities = [probabilities]

        for k, totals in enumerate(vote_totals):
            totals[oob_rows, :] += probabilities[k]

    decision_functions = []
    score_sum = 0.0
    for k, totals in enumerate(vote_totals):
        row_sums = totals.sum(axis=1)
        # A zero row sum means the sample was in-bag for every estimator.
        if (row_sums == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        decision_functions.append(totals / row_sums[:, np.newaxis])
        predicted = np.argmax(totals, axis=1)
        score_sum += np.mean(y[:, k] == predicted, axis=0)

    self.oob_decision_function_ = (
        decision_functions[0] if n_outputs == 1 else decision_functions
    )
    self.oob_score_ = score_sum / n_outputs
def fit(self, X, y, sample_weight=None):
    """Build a forest of trees from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples. A sparse X has its indices sorted in
        place before the trees are built.
    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values. A 1d y is reshaped to a single column.
    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. They are multiplied by the expanded class weights
        when those are available, and handed to the tree builders.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If ``oob_score`` is requested without ``bootstrap``, or if a warm
        start is asked to fit fewer estimators than already exist.
    """
    validate_X_y(X, y)

    # Validate or convert input data
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
    if issparse(X):
        # Sort the indices once here so that the individual trees of the
        # ensemble do not each have to do it.
        X.sort_indices()

    # Remap output
    self.n_features_ = 1 if X.ndim != 2 else X.shape[1]

    y = np.atleast_1d(y)
    is_column_vector = y.ndim == 2 and y.shape[1] == 1
    if is_column_vector:
        warn(
            "A column-vector y was passed when a 1d array was"
            " expected. Please change the shape of y to "
            "(n_samples,), for example using ravel().",
            DataConversionWarning, stacklevel=2)

    if y.ndim == 1:
        # np.reshape keeps the data contiguous, which indexing with
        # [:, np.newaxis] does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    y, expanded_class_weight = self._validate_y_class_weight(y)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    if expanded_class_weight is not None:
        sample_weight = (
            expanded_class_weight if sample_weight is None
            else sample_weight * expanded_class_weight
        )

    # Check parameters
    self._validate_estimator()

    if self.oob_score and not self.bootstrap:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    random_state = check_random_state(self.random_state)

    if not (self.warm_start and hasattr(self, "estimators_")):
        # Free allocated memory, if any
        self.estimators_ = []

    n_fitted = len(self.estimators_)
    n_to_grow = self.n_estimators - n_fitted

    if n_to_grow < 0:
        raise ValueError('n_estimators=%d must be larger or equal to '
                         'len(estimators_)=%d when warm_start==True'
                         % (self.n_estimators, n_fitted))
    elif n_to_grow == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
    else:
        if self.warm_start and n_fitted > 0:
            # Advance the random state past the draws made for the
            # existing estimators, so the new ones get the state they
            # would have had without a warm start.
            random_state.randint(MAX_INT, size=n_fitted)

        fresh_trees = [
            self._make_estimator(append=False, random_state=random_state)
            for _ in range(n_to_grow)
        ]
        n_fresh = len(fresh_trees)

        # Parallel loop: for standard random forests the threading backend
        # is preferred, because the Cython tree-fitting code releases the
        # GIL. Here pipelines are fitted in parallel, for which
        # multiprocessing is more efficient.
        build = delayed(_parallel_build_trees)
        runner = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
        grown_trees = runner(
            build(tree, self, X, y, sample_weight, position, n_fresh,
                  verbose=self.verbose, class_weight=self.class_weight)
            for position, tree in enumerate(fresh_trees))

        # Collect newly grown trees
        self.estimators_.extend(grown_trees)

    if self.oob_score:
        self._set_oob_score(X, y)

    return self