def test_class_distribution():
    y = np.array(
        [
            [1, 0, 0, 1],
            [2, 2, 0, 1],
            [1, 3, 0, 1],
            [4, 2, 0, 1],
            [2, 0, 0, 1],
            [1, 3, 0, 1],
        ]
    )
    # Define the sparse matrix with a mix of implicit and explicit zeros
    data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1])
    indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5])
    indptr = np.array([0, 6, 11, 11, 17])
    y_sp = sp.csc_matrix((data, indices, indptr), shape=(6, 4))

    classes, n_classes, class_prior = class_distribution(y)
    classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp)
    classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]]
    n_classes_expected = [3, 3, 1, 1]
    class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]]

    for k in range(y.shape[1]):
        assert_array_almost_equal(classes[k], classes_expected[k])
        assert_array_almost_equal(n_classes[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior[k], class_prior_expected[k])

        assert_array_almost_equal(classes_sp[k], classes_expected[k])
        assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])

    # Test again with explicit sample weights
    (classes, n_classes, class_prior) = class_distribution(
        y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
    )
    (classes_sp, n_classes_sp, class_prior_sp) = class_distribution(
        y_sp, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
    )
    class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]]

    for k in range(y.shape[1]):
        assert_array_almost_equal(classes[k], classes_expected[k])
        assert_array_almost_equal(n_classes[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior[k], class_prior_expected[k])

        assert_array_almost_equal(classes_sp[k], classes_expected[k])
        assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
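# A minimal usage sketch (separate from the test above) of what
# class_distribution returns per output column; assumes the scikit-learn
# utility sklearn.utils.multiclass.class_distribution, which this test
# exercises.
import numpy as np
from sklearn.utils.multiclass import class_distribution

y_demo = np.array([[1, 0], [2, 0], [1, 0]])  # 3 samples, 2 outputs
classes, n_classes, priors = class_distribution(y_demo)
print(classes)    # [array([1, 2]), array([0])] - sorted labels per output
print(n_classes)  # [2, 1] - distinct label count per output
print(priors)     # [array([0.667, 0.333]), array([1.])] - label frequencies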
def fit(self, X, y):
    """Fit a single boss classifier on n_instances cases (X, y).

    Parameters
    ----------
    X : pd.DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

    sfa = self.transformer.fit_transform(X)
    self.transformed_data = sfa[0]

    self.class_vals = y
    self.num_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    self._is_fitted = True
    return self
def fit(self, X, y, **kwargs):
    self.nb_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    for itr in range(self.nb_iterations):
        # each construction shall have a different random initialisation
        y_cur, fit_time_cur, predict_time_cur = self.load_network_probs(
            self.network_name,
            itr,
            self.res_path,
            self.dataset_name,
            self.random_seed,
        )

        if itr == 0:
            self.y_pred = y_cur
            self.fit_time = fit_time_cur
            self.predict_time = predict_time_cur
        else:
            self.y_pred = self.y_pred + y_cur
            self.fit_time = self.fit_time + fit_time_cur
            self.predict_time = self.predict_time + predict_time_cur

    # average the per-iteration probabilities
    self.y_pred = self.y_pred / self.nb_iterations

    # check if binary classification
    if self.y_pred.shape[1] == 1:
        # first column is probability of class 0 and second is of class 1
        self.y_pred = np.hstack([1 - self.y_pred, self.y_pred])

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Perform a shapelet transform, then build a random forest.

    The contract default for the shapelet transform is 5 hours.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification); this
        classifier has no bespoke method for multivariate classification
        as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    self.classifier.fit(X, y)

    # self.shapelet_transform.fit(X, y)
    # print("Shapelet Search complete")
    # self.st_X = self.shapelet_transform.transform(X)
    # print("Transform complete")
    # X = np.asarray([a.values for a in X.iloc[:, 0]])
    # self.classifier.fit(X, y)
    # print("Build classifier complete")
    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit a random catch22 feature forest classifier.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X = check_X(X, enforce_univariate=False, coerce_to_numpy=True)

    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    c22 = Catch22(outlier_norm=self.outlier_norm)
    c22_list = c22.fit_transform(X)

    self.classifier = RandomForestClassifier(
        n_jobs=self.n_jobs,
        n_estimators=self.n_estimators,
        random_state=self.random_state,
    )

    # replace NaN/inf feature values with 0 in-place before fitting
    X_c22 = np.nan_to_num(
        np.array(c22_list, dtype=np.float32),
        copy=False,
        nan=0.0,
        posinf=0.0,
        neginf=0.0,
    )
    self.classifier.fit(X_c22, y)

    self._is_fitted = True
    return self
def fit(self, X, y):
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    self.constituent_build_times = np.zeros(len(self.classifiers))
    self.train_accs_by_classifier = np.zeros(len(self.classifiers))
    self.training_preds = np.empty(
        (len(self.classifiers), len(y)), dtype=type(y[0])
    )
    self.training_probas = np.empty(
        (len(self.classifiers), len(y), len(self.classes_))
    )

    # build each classifier
    for c_id in range(len(self.classifiers)):
        if self.verbose > 0:
            print("Building " + self.classifier_names[c_id])
        start_time = time.time()

        self.classifiers[c_id].random_state = self.random_state

        # tune hyperparameters first if a parameter grid was supplied
        if (
            self.classifier_param_grids is not None
            and self.classifier_param_grids[c_id] is not None
        ):
            grid = GridSearchCV(
                estimator=self.classifiers[c_id],
                param_grid=self.classifier_param_grids[c_id],
                scoring="accuracy",
                cv=self.param_cv_folds,
                verbose=self.verbose,
            )
            self.classifiers[c_id] = grid.fit(X, y).best_estimator_

        # estimate training accuracy from cross-validated probabilities
        self.training_probas[c_id] = cross_val_predict(
            self.classifiers[c_id],
            X=X,
            y=y,
            cv=self.cv_folds,
            method="predict_proba",
        )
        end_time = time.time()

        self.training_preds[c_id] = np.array(
            [self.classes_[np.argmax(x)] for x in self.training_probas[c_id]]
        )
        self.train_accs_by_classifier[c_id] = accuracy_score(
            y, self.training_preds[c_id]
        )
        self.constituent_build_times[c_id] = end_time - start_time

        # finally, fit the (possibly tuned) classifier on the full data
        self.classifiers[c_id].fit(X, y)

    return self
def fit(self, X, y):
    """Build a pipeline containing the ROCKET transformer and
    RidgeClassifierCV.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    self.classifier = rocket_pipeline = make_pipeline(
        Rocket(
            num_kernels=self.num_kernels,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
        ),
        RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
    )
    rocket_pipeline.fit(X, y)

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using
    supervised intervals and summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification); STSF
        has no bespoke method for multivariate classification as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(
        X,
        y,
        enforce_univariate=True,
        coerce_to_numpy=True,
    )
    X = X.squeeze(1)

    n_instances, _ = X.shape
    rng = check_random_state(self.random_state)

    cls, class_counts = np.unique(y, return_counts=True)
    self.n_classes = class_counts.shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    self.intervals_ = [[[] for _ in range(3)] for _ in range(self.n_estimators)]

    # periodogram and first-difference representations of the series
    _, X_p = signal.periodogram(X)
    X_d = np.diff(X, 1)

    # oversample minority classes so each bootstrap is roughly balanced
    balance_cases = np.zeros(0, dtype=np.int32)
    average = math.floor(n_instances / self.n_classes)
    for i, c in enumerate(cls):
        if class_counts[i] < average:
            cls_idx = np.where(y == c)[0]
            balance_cases = np.concatenate(
                (rng.choice(cls_idx, size=average - class_counts[i]), balance_cases)
            )

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(self._fit_estimator)(
            X,
            X_p,
            X_d,
            y,
            np.concatenate(
                (rng.choice(n_instances, size=n_instances), balance_cases)
            ),
            i,
        )
        for i in range(self.n_estimators)
    )

    self._is_fitted = True
    return self
def _fit(self, X, y):
    """Fit an estimator using transformed data from the Catch22 transformer.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_dims]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    self.n_classes = np.unique(y).shape[0]

    self._transformer = Catch22(outlier_norm=self.outlier_norm)

    self._estimator = _clone_estimator(
        RandomForestClassifier(n_estimators=200)
        if self.estimator is None
        else self.estimator,
        self.random_state,
    )

    m = getattr(self._estimator, "n_jobs", None)
    if m is not None:
        self._estimator.n_jobs = self.n_jobs

    X_t = self._transformer.fit_transform(X, y)
    # replace NaN/inf feature values with 0 in-place before fitting
    X_t = np.nan_to_num(X_t, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

    self._estimator.fit(X_t, y)

    return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y).

    Uses random intervals and catch22/tsf summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_dimensions,
        series_length] or shape = [n_instances, series_length]
        The training input samples.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, coerce_to_numpy=True)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    if self.base_estimator is None or self.base_estimator == "DTC":
        self.tree = DecisionTreeClassifier(criterion="entropy")
    elif self.base_estimator == "CIT":
        self.tree = ContinuousIntervalTree()
    elif isinstance(self.base_estimator, BaseEstimator):
        self.tree = self.base_estimator
    else:
        raise ValueError("DrCIF invalid base estimator given.")

    if self.n_intervals is None:
        self.__n_intervals = int(
            math.sqrt(self.series_length) * math.sqrt(self.n_dims)
        )
    else:
        self.__n_intervals = self.n_intervals
    if self.__n_intervals <= 0:
        self.__n_intervals = 1

    if self.series_length < self.min_interval:
        self.min_interval = self.series_length

    if self.max_interval is None:
        self.__max_interval = self.series_length / 2
    else:
        self.__max_interval = self.max_interval
    if self.__max_interval < self.min_interval:
        self.__max_interval = self.min_interval

    fit = Parallel(n_jobs=self.n_jobs)(
        delayed(self._fit_estimator)(
            X,
            y,
            i,
        )
        for i in range(self.n_estimators)
    )

    self.classifiers, self.intervals, self.dims, self.atts = zip(*fit)

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit a single TD classifier on n_instances cases (X, y).

    Parameters
    ----------
    X : pd.DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, coerce_to_numpy=True)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.class_vals = y
    self.num_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    # select dimensions using accuracy estimate if multivariate
    if self.n_dims > 1:
        self.dims, self.transformers = self._select_dims(X, y)

        words = [defaultdict(int) for _ in range(self.n_instances)]

        for i, dim in enumerate(self.dims):
            X_dim = X[:, dim, :].reshape(self.n_instances, 1, self.series_length)
            dim_words = self.transformers[i].transform(X_dim, y)
            dim_words = dim_words[0]

            for n in range(self.n_instances):
                for word, count in dim_words[n].items():
                    words[n][word << self.highest_dim_bit | dim] = count

        self.transformed_data = words
    else:
        self.transformers.append(
            SFA(
                word_length=self.word_length,
                alphabet_size=self.alphabet_size,
                window_size=self.window_size,
                norm=self.norm,
                levels=self.levels,
                binning_method="information-gain" if self.igb else "equi-depth",
                bigrams=self.bigrams,
                remove_repeat_words=True,
                save_words=False,
                n_jobs=self.n_jobs,
            )
        )
        sfa = self.transformers[0].fit_transform(X, y)
        self.transformed_data = sfa[0]

    self._is_fitted = True
    return self
def fit(self, X, y):
    sfa = self.transform.fit_transform(X)
    self.transformed_data = [series.to_dict() for series in sfa.iloc[:, 0]]

    self.class_vals = y
    self.num_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    return self
def fit(self, X, y):
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # cap the number of CV folds at the smallest class count
    cv_size = 10
    _, counts = np.unique(y, return_counts=True)
    min_class = np.min(counts)
    if min_class < cv_size:
        cv_size = min_class

    self.stc = ShapeletTransformClassifier(
        random_state=self.random_state,
        time_contract_in_mins=60,
    )
    self.stc.fit(X, y)
    train_preds = cross_val_predict(
        ShapeletTransformClassifier(
            random_state=self.random_state,
            time_contract_in_mins=60,
        ),
        X=X,
        y=y,
        cv=cv_size,
    )
    self.stc_weight = accuracy_score(y, train_preds) ** 4

    self.tsf = TimeSeriesForest(random_state=self.random_state)
    self.tsf.fit(X, y)
    train_preds = cross_val_predict(
        TimeSeriesForest(random_state=self.random_state),
        X=X,
        y=y,
        cv=cv_size,
    )
    self.tsf_weight = accuracy_score(y, train_preds) ** 4

    self.rise = RandomIntervalSpectralForest(random_state=self.random_state)
    self.rise.fit(X, y)
    train_preds = cross_val_predict(
        RandomIntervalSpectralForest(random_state=self.random_state),
        X=X,
        y=y,
        cv=cv_size,
    )
    self.rise_weight = accuracy_score(y, train_preds) ** 4

    self.cboss = ContractableBOSS(random_state=self.random_state)
    self.cboss.fit(X, y)
    train_probs = self.cboss._get_train_probs(X)
    train_preds = self.cboss.classes_[np.argmax(train_probs, axis=1)]
    self.cboss_weight = accuracy_score(y, train_preds) ** 4

    return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using random
    intervals and summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification); TSF
        has no bespoke method for multivariate classification as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(
        X,
        y,
        enforce_univariate=not TimeSeriesForest.capabilities["multivariate"],
        coerce_to_numpy=True,
    )
    X = X.squeeze(1)

    n_instances, self.series_length = X.shape
    rng = check_random_state(self.random_state)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    self.n_intervals = int(math.sqrt(self.series_length))
    if self.n_intervals == 0:
        self.n_intervals = 1
    if self.series_length < self.min_interval:
        self.min_interval = self.series_length

    self.intervals_ = [
        _get_intervals(self.n_intervals, self.min_interval, self.series_length, rng)
        for _ in range(self.n_estimators)
    ]

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(
            X,
            y,
            self.base_estimator,
            self.intervals_[i],
            self.random_state,
        )
        for i in range(self.n_estimators)
    )

    self._is_fitted = True
    return self
def fit(self, X, y, sample_weight=None):
    """Build a forest of trees from the training set (X, y) using random
    intervals and summary measures.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samps, num_atts]
        The training input samples. If a Pandas data frame is passed,
        the column dim_to_use is extracted.
    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The class labels.

    Returns
    -------
    self : object
    """
    if isinstance(X, pd.DataFrame):
        if isinstance(X.iloc[0, self.dim_to_use], pd.Series):
            X = np.asarray([a.values for a in X.iloc[:, self.dim_to_use]])
        else:
            raise TypeError(
                "Input should either be a 2d numpy array, or a pandas "
                "dataframe containing Series objects"
            )
    n_samps, self.series_length = X.shape

    self.num_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # draw a random interval for each tree; `random` is numpy.random here,
    # so randint(high) samples from [0, high)
    self.intervals = np.zeros((self.num_trees, 2), dtype=int)
    self.intervals[0][0] = 0
    self.intervals[0][1] = self.series_length
    for i in range(1, self.num_trees):
        self.intervals[i][0] = random.randint(
            self.series_length - self.min_interval
        )
        self.intervals[i][1] = random.randint(
            self.intervals[i][0] + self.min_interval, self.series_length
        )

    # Check lag against global properties
    if self.acf_lag > self.series_length - self.acf_min_values:
        self.acf_lag = self.series_length - self.acf_min_values
    if self.acf_lag < 0:
        self.acf_lag = 1

    self.lags = np.zeros(self.num_trees, dtype=int)
    for i in range(0, self.num_trees):
        temp_lag = self.acf_lag
        if (
            temp_lag
            > self.intervals[i][1] - self.intervals[i][0] - self.acf_min_values
        ):
            temp_lag = (
                self.intervals[i][1] - self.intervals[i][0] - self.acf_min_values
            )
        if temp_lag < 0:
            temp_lag = 1
        self.lags[i] = int(temp_lag)

        # autocorrelation and power-spectrum features for this interval
        acf_x = np.empty(shape=(n_samps, self.lags[i]))
        ps_len = (self.intervals[i][1] - self.intervals[i][0]) / 2
        ps_x = np.empty(shape=(n_samps, int(ps_len)))
        for j in range(0, n_samps):
            acf_x[j] = acf(
                X[j, self.intervals[i][0]:self.intervals[i][1]], temp_lag
            )
            ps_x[j] = ps(X[j, self.intervals[i][0]:self.intervals[i][1]])
        transformed_x = np.concatenate((acf_x, ps_x), axis=1)
        # transformed_x = acf_x

        tree = deepcopy(self.base_estimator)
        tree.fit(transformed_x, y)
        self.classifiers.append(tree)

    return self
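# Hedged stand-ins for the `acf` and `ps` helpers called above: minimal
# sketches of "autocorrelation up to max_lag" and "power spectrum" features,
# not the library's exact implementations.
import numpy as np

def acf(x, max_lag):
    """Autocorrelation of x at lags 1..max_lag (simplified estimator)."""
    x = np.asarray(x, dtype=float) - np.mean(x)
    denom = np.dot(x, x)
    if denom == 0:
        return np.zeros(max_lag)
    return np.array(
        [np.dot(x[:-lag], x[lag:]) / denom for lag in range(1, max_lag + 1)]
    )

def ps(x):
    """First half of the periodogram (power spectrum) of x."""
    fft = np.fft.fft(x)
    return (fft * np.conj(fft)).real[: len(x) // 2]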
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using random
    intervals and catch22/tsf summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_dimensions,
        series_length] or shape = [n_instances, series_length]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification).
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, coerce_to_numpy=True)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    if self.n_intervals is None:
        self.__n_intervals = 4 + int(
            (math.sqrt(self.series_length) * math.sqrt(self.n_dims)) / 3
        )
    else:
        self.__n_intervals = self.n_intervals
    if self.__n_intervals <= 0:
        self.__n_intervals = 1

    if self.series_length < self.min_interval:
        self.min_interval = self.series_length

    if self.max_interval is None:
        self.__max_interval = self.series_length / 2
    else:
        self.__max_interval = self.max_interval
    if self.__max_interval < self.min_interval:
        self.__max_interval = self.min_interval

    # periodogram and first-difference representations of the series
    _, X_p = signal.periodogram(X)
    X_d = np.diff(X, 1)

    self.total_intervals = self.__n_intervals * 2 + int(self.__n_intervals / 2)

    fit = Parallel(n_jobs=self.n_jobs)(
        delayed(self._fit_estimator)(
            X,
            X_p,
            X_d,
            y,
            i,
        )
        for i in range(self.n_estimators)
    )

    self.classifiers, self.intervals, self.dims, self.atts = zip(*fit)

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using random
    intervals and summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification); TSF
        has no bespoke method for multivariate classification as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    if isinstance(X, pd.DataFrame):
        if X.shape[1] > 1:
            raise TypeError("TSF cannot handle multivariate problems yet")
        elif isinstance(X.iloc[0, 0], pd.Series):
            X = np.asarray([a.values for a in X.iloc[:, 0]])
        else:
            raise TypeError(
                "Input should either be a 2d numpy array, or a pandas "
                "dataframe with a single column of Series objects (TSF "
                "cannot yet handle multivariate problems)"
            )

    n_samps, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    self.n_intervals = int(math.sqrt(self.series_length))
    if self.n_intervals == 0:
        self.n_intervals = 1
    if self.series_length < self.min_interval:
        self.min_interval = self.series_length

    self.intervals = np.zeros((self.n_trees, 3 * self.n_intervals, 2), dtype=int)
    for i in range(0, self.n_trees):
        transformed_x = np.empty(shape=(3 * self.n_intervals, n_samps))
        # Find the random intervals for classifier i and concatenate features
        for j in range(0, self.n_intervals):
            self.intervals[i][j][0] = random.randint(
                self.series_length - self.min_interval
            )
            length = random.randint(
                self.series_length - self.intervals[i][j][0] - 1
            )
            if length < self.min_interval:
                length = self.min_interval
            self.intervals[i][j][1] = self.intervals[i][j][0] + length
            # Transforms here, just hard coding it, so not configurable
            means = np.mean(
                X[:, self.intervals[i][j][0]:self.intervals[i][j][1]], axis=1
            )
            std_dev = np.std(
                X[:, self.intervals[i][j][0]:self.intervals[i][j][1]], axis=1
            )
            slope = self.lsq_fit(
                X[:, self.intervals[i][j][0]:self.intervals[i][j][1]]
            )
            transformed_x[3 * j] = means
            transformed_x[3 * j + 1] = std_dev
            transformed_x[3 * j + 2] = slope

        tree = deepcopy(self.base_estimator)
        transformed_x = transformed_x.T
        tree.fit(transformed_x, y)
        self.classifiers.append(tree)

    return self
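# Hedged sketch of the `lsq_fit` helper used above: the least-squares slope
# of each series over the interval, regressed on the time index. An assumed
# stand-in that only returns the slope coefficient.
import numpy as np

def lsq_fit(Y):
    """Least-squares slope of each row of Y (shape [n_samps, interval_len])."""
    t = np.arange(Y.shape[1], dtype=float)
    t -= t.mean()
    # slope_i = sum_j t_j * (Y_ij - mean_i) / sum_j t_j^2
    return (Y - Y.mean(axis=1, keepdims=True)) @ t / np.dot(t, t)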
def fit(self, X, y):
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

    sfa = self.transformer.fit_transform(X, y)
    self.transformed_data = sfa[0]  # .iloc[:, 0]

    self.class_vals = y
    self.num_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    self._is_fitted = True
    return self
def _fit(self, X, y):
    self._n_jobs = check_n_jobs(self.n_jobs)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    if self.time_limit_in_minutes > 0:
        # contract split: 1/3 of the budget for the classifier and 2/3 for
        # the transform, with 1/5 of the transform share held back for the
        # final transform
        third = self.time_limit_in_minutes / 3
        self._classifier_limit_in_minutes = third
        self._transform_limit_in_minutes = (third * 2) / 5 * 4
    elif self.transform_limit_in_minutes > 0:
        self._transform_limit_in_minutes = self.transform_limit_in_minutes

    self._transformer = RandomShapeletTransform(
        n_shapelet_samples=self.n_shapelet_samples,
        max_shapelets=self.max_shapelets,
        max_shapelet_length=self.max_shapelet_length,
        time_limit_in_minutes=self._transform_limit_in_minutes,
        contract_max_n_shapelet_samples=self.contract_max_n_shapelet_samples,
        n_jobs=self.n_jobs,
        batch_size=self.batch_size,
        random_state=self.random_state,
    )

    self._estimator = _clone_estimator(
        RotationForest() if self.estimator is None else self.estimator,
        self.random_state,
    )

    if isinstance(self._estimator, RotationForest):
        self._estimator.save_transformed_data = self.save_transformed_data

    m = getattr(self._estimator, "n_jobs", None)
    if m is not None:
        self._estimator.n_jobs = self._n_jobs

    m = getattr(self._estimator, "time_limit_in_minutes", None)
    if m is not None and self.time_limit_in_minutes > 0:
        self._estimator.time_limit_in_minutes = self._classifier_limit_in_minutes

    X_t = self._transformer.fit_transform(X, y).to_numpy()

    if self.save_transformed_data:
        self.transformed_data = X_t

    self._estimator.fit(X_t, y)
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y).

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame
        Panel training data.
    y : np.ndarray
        The class labels.

    Returns
    -------
    self : object
        A fitted instance of the classifier.
    """
    X, y = check_X_y(
        X,
        y,
        enforce_univariate=not self.capabilities["multivariate"],
        coerce_to_numpy=True,
    )
    X = X.squeeze(1)

    n_instances, self.series_length = X.shape

    n_jobs = check_n_jobs(self.n_jobs)
    rng = check_random_state(self.random_state)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    self.n_intervals = int(math.sqrt(self.series_length))
    if self.n_intervals == 0:
        self.n_intervals = 1
    if self.series_length < self.min_interval:
        self.min_interval = self.series_length

    self.intervals_ = [
        _get_intervals(self.n_intervals, self.min_interval, self.series_length, rng)
        for _ in range(self.n_estimators)
    ]

    self.estimators_ = Parallel(n_jobs=n_jobs)(
        delayed(_fit_estimator)(
            _clone_estimator(self.base_estimator, rng), X, y, self.intervals_[i]
        )
        for i in range(self.n_estimators)
    )

    self._is_fitted = True
    return self
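# Hedged sketch of the `_get_intervals` helper used above, mirroring the
# inline interval-drawing logic elsewhere in this file: n_intervals random
# (start, end) pairs, each at least min_interval long. rng is assumed to be
# a numpy RandomState, so randint(high) samples from [0, high).
import numpy as np

def _get_intervals(n_intervals, min_interval, series_length, rng):
    intervals = np.zeros((n_intervals, 2), dtype=int)
    for j in range(n_intervals):
        intervals[j][0] = rng.randint(series_length - min_interval)
        length = rng.randint(series_length - intervals[j][0] - 1)
        if length < min_interval:
            length = min_interval
        intervals[j][1] = intervals[j][0] + length
    return intervals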
def fit(self, X, y):
    """Perform a shapelet transform, then build a random forest.

    The contract default for the shapelet transform is 5 hours.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification); this
        classifier has no bespoke method for multivariate classification
        as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True)

    # if y is a pd.Series then convert to array
    if isinstance(y, pd.Series):
        y = y.to_numpy()

    # generate pipeline in fit so that random state can be propagated properly
    self.classifier_ = Pipeline(
        [
            (
                "st",
                ContractedShapeletTransform(
                    time_contract_in_mins=self.transform_contract_in_mins,
                    verbose=False,
                    random_state=self.random_state,
                ),
            ),
            (
                "rf",
                RandomForestClassifier(
                    n_estimators=self.n_estimators, random_state=self.random_state
                ),
            ),
        ]
    )
    self.n_classes_ = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    self.classifier_.fit(X, y)

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit a tree on cases (X, y), where y is the target variable.

    Build an information gain based tree for continuous attributes,
    using the margin gain metric for ties.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_attributes]
        The training input samples.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    if not isinstance(X, np.ndarray) or len(X.shape) > 2:
        raise ValueError(
            "ContinuousIntervalTree is not a time series classifier. "
            "A 2d numpy array is required."
        )
    X, y = check_X_y(X, y)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    rng = check_random_state(self.random_state)

    self.root = _TreeNode(random_state=rng)

    # candidate split points: 20 evenly spaced thresholds per attribute
    thresholds = np.linspace(np.min(X, axis=0), np.max(X, axis=0), 20)

    distribution_cls, distribution = unique_count(y)
    e = _entropy(distribution, distribution.sum())

    self.root.build_tree(
        X,
        y,
        thresholds,
        e,
        distribution_cls,
        distribution,
        0,
        self.max_depth,
        False,
    )

    self._is_fitted = True
    return self
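# Hedged sketch of the `_entropy` helper the tree relies on: Shannon entropy
# of a class-count vector. This is an assumption about the helper's contract
# (counts plus their total), not its verbatim source.
import numpy as np

def _entropy(counts, total):
    """Shannon entropy (base 2) of a distribution given as counts."""
    counts = np.asarray(counts, dtype=float)
    p = counts[counts > 0] / total
    return -np.sum(p * np.log2(p))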
def _fit(self, X, y):
    self._n_jobs = check_n_jobs(self.n_jobs)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self._class_vals = y
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    # select dimensions using accuracy estimate if multivariate
    if self.n_dims > 1:
        self._dims, self._transformers = self._select_dims(X, y)

        words = [defaultdict(int) for _ in range(self.n_instances)]

        for i, dim in enumerate(self._dims):
            X_dim = X[:, dim, :].reshape(self.n_instances, 1, self.series_length)
            dim_words = self._transformers[i].transform(X_dim, y)
            dim_words = dim_words[0]

            for n in range(self.n_instances):
                for word, count in dim_words[n].items():
                    words[n][word << self._highest_dim_bit | dim] = count

        self._transformed_data = words
    else:
        self._transformers.append(
            SFA(
                word_length=self.word_length,
                alphabet_size=self.alphabet_size,
                window_size=self.window_size,
                norm=self.norm,
                levels=self.levels,
                binning_method="information-gain" if self.igb else "equi-depth",
                bigrams=self.bigrams,
                remove_repeat_words=True,
                lower_bounding=False,
                save_words=False,
                use_fallback_dft=True,
                n_jobs=self._n_jobs,
            )
        )
        sfa = self._transformers[0].fit_transform(X, y)
        self._transformed_data = sfa[0]
def fit(self, X, y):
    """Build a single or ensemble of pipelines containing the ROCKET
    transformer and RidgeClassifierCV classifier.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    if self.ensemble:
        for i in range(self.ensemble_size):
            rocket_pipeline = make_pipeline(
                Rocket(
                    num_kernels=self.num_kernels, random_state=self.random_state
                ),
                RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
            )
            rocket_pipeline.fit(X, y)
            self.classifiers.append(rocket_pipeline)
            # weight each member by its ridge cross-validation score
            self.weights.append(rocket_pipeline.steps[1][1].best_score_)
            self.weight_sum = self.weight_sum + self.weights[i]
    else:
        rocket_pipeline = make_pipeline(
            Rocket(num_kernels=self.num_kernels, random_state=self.random_state),
            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
        )
        rocket_pipeline.fit(X, y)
        self.classifiers.append(rocket_pipeline)

    self._is_fitted = True
    return self
def _fit(self, X, y):
    self._n_jobs = check_n_jobs(self.n_jobs)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    if self.base_estimator == "DTC":
        self._base_estimator = DecisionTreeClassifier(criterion="entropy")
    elif self.base_estimator == "CIT":
        self._base_estimator = ContinuousIntervalTree()
    elif isinstance(self.base_estimator, BaseEstimator):
        self._base_estimator = self.base_estimator
    else:
        raise ValueError("DrCIF invalid base estimator given.")

    if self.n_intervals is None:
        self._n_intervals = int(
            math.sqrt(self.series_length) * math.sqrt(self.n_dims)
        )
    else:
        self._n_intervals = self.n_intervals
    if self._n_intervals <= 0:
        self._n_intervals = 1

    self._att_subsample_size = self.att_subsample_size
    if self.att_subsample_size > 25:
        self._att_subsample_size = 25

    self._min_interval = self.min_interval
    if self.series_length < self.min_interval:
        self._min_interval = self.series_length
    elif self.min_interval < 3:
        self._min_interval = 3

    if self.max_interval is None:
        self._max_interval = self.series_length / 2
    else:
        self._max_interval = self.max_interval
    if self._max_interval < self._min_interval:
        self._max_interval = self._min_interval

    fit = Parallel(n_jobs=self._n_jobs)(
        delayed(self._fit_estimator)(
            X,
            y,
            i,
        )
        for i in range(self.n_estimators)
    )

    self.estimators_, self.intervals, self.dims, self.atts = zip(*fit)
def fit(self, X, y):
    if isinstance(X, (pd.Series, pd.DataFrame)):
        X, y = check_X_y(X, y, enforce_univariate=True)
        X = tabularize(X, return_array=True)

    sfa = self.transformer.fit_transform(X, y)
    self.transformed_data = sfa[0]  # .iloc[:, 0]

    self.class_vals = y
    self.num_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit an estimator using transformed data from the TSFresh transformer.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_dims]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y)

    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    self.n_classes = np.unique(y).shape[0]

    self._transformer = (
        TSFreshRelevantFeatureExtractor(
            default_fc_parameters=self.default_fc_parameters,
            n_jobs=self.n_jobs,
            chunksize=self.chunksize,
        )
        if self.relevant_feature_extractor
        else TSFreshFeatureExtractor(
            default_fc_parameters=self.default_fc_parameters,
            n_jobs=self.n_jobs,
            chunksize=self.chunksize,
        )
    )

    self._estimator = _clone_estimator(
        RandomForestClassifier(n_estimators=200)
        if self.estimator is None
        else self.estimator,
        self.random_state,
    )

    if self.verbose < 2:
        self._transformer.show_warnings = False
        if self.verbose < 1:
            self._transformer.disable_progressbar = True

    m = getattr(self._estimator, "n_jobs", None)
    if m is not None:
        self._estimator.n_jobs = self.n_jobs

    X_t = self._transformer.fit_transform(X, y)
    self._estimator.fit(X_t, y)

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Build an ensemble of pipelines containing the ROCKET transformer and
    RidgeClassifierCV classifier.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y)
    n_jobs = check_n_jobs(self.n_jobs)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    base_estimator = _make_estimator(self.num_kernels, self.random_state)
    self.estimators_ = Parallel(n_jobs=n_jobs)(
        delayed(_fit_estimator)(
            _clone_estimator(base_estimator, self.random_state), X, y
        )
        for _ in range(self.n_estimators)
    )

    # weight each member by its ridge cross-validation score
    self.weights = []
    self.weight_sum = 0
    for rocket_pipeline in self.estimators_:
        weight = rocket_pipeline.steps[1][1].best_score_
        self.weights.append(weight)
        self.weight_sum += weight

    self._is_fitted = True
    return self
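# A minimal usage sketch of the kind of member pipeline built above, on
# synthetic data. Assumptions: sktime's Rocket is importable from
# sktime.transformations.panel.rocket and accepts 3D numpy input; the
# normalize flag is omitted as it is deprecated in newer scikit-learn.
import numpy as np
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline
from sktime.transformations.panel.rocket import Rocket

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(20, 1, 50))  # 20 univariate series of length 50
y_demo = np.array([0, 1] * 10)

pipe = make_pipeline(
    Rocket(num_kernels=500, random_state=0),
    RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)),
)
pipe.fit(X_demo, y_demo)
print(pipe.score(X_demo, y_demo))  # in-sample accuracy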
def fit(self, X, y):
    """Fit a random catch22 feature forest classifier.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X = check_X(X, enforce_univariate=False, coerce_to_numpy=True)
    n_instances = X.shape[0]
    X = np.reshape(X, (n_instances, -1))

    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # compute the 22 catch22 features for each series
    c22_list = []
    for i in range(n_instances):
        series = X[i, :]
        c22_dict = catch22_all(series)
        c22_list.append(c22_dict["values"])

    self.classifier = RandomForestClassifier(
        n_jobs=self.n_jobs,
        n_estimators=self.n_estimators,
        random_state=self.random_state,
    )

    X_c22 = np.array(c22_list)
    # replace NaN/inf feature values with 0 in-place
    np.nan_to_num(X_c22, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
    self.classifier.fit(X_c22, y)

    self._is_fitted = True
    return self
def _fit(self, X, y):
    self._n_jobs = check_n_jobs(self.n_jobs)

    if self.n_parameter_samples <= self.randomly_selected_params:
        print(  # noqa
            "TDE Warning: n_parameter_samples <= randomly_selected_params, ",
            "ensemble member parameters will be fully randomly selected.",
        )

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    self.estimators_ = []
    self.weights = []
    self._prev_parameters_x = []
    self._prev_parameters_y = []

    # Window length parameter space dependent on series length
    max_window_searches = self.series_length / 4
    max_window = int(self.series_length * self.max_win_len_prop)
    win_inc = int((max_window - self.min_window) / max_window_searches)
    if win_inc < 1:
        win_inc = 1
    if self.min_window > max_window + 1:
        raise ValueError(
            f"Error in TemporalDictionaryEnsemble, min_window ="
            f"{self.min_window} is bigger"
            f" than max_window ={max_window},"
            f" series length is {self.series_length};"
            f" try setting min_window to be smaller than the series length "
            f"in the constructor, but the classifier may not work at "
            f"all with very short series"
        )

    possible_parameters = self._unique_parameters(max_window, win_inc)
    num_classifiers = 0
    subsample_size = int(self.n_instances * 0.7)
    lowest_acc = 1
    lowest_acc_idx = 0

    time_limit = self.time_limit_in_minutes * 60
    start_time = time.time()
    train_time = 0
    if time_limit > 0:
        n_parameter_samples = 0
    else:
        n_parameter_samples = self.n_parameter_samples

    rng = check_random_state(self.random_state)

    if self.bigrams is None:
        use_bigrams = self.n_dims <= 1
    else:
        use_bigrams = self.bigrams

    # use time limit or n_parameter_samples if limit is 0
    while (
        train_time < time_limit or num_classifiers < n_parameter_samples
    ) and len(possible_parameters) > 0:
        if num_classifiers < self.randomly_selected_params:
            parameters = possible_parameters.pop(
                rng.randint(0, len(possible_parameters))
            )
        else:
            # guide parameter selection with a regressor fitted on the
            # accuracies of previously built ensemble members
            scaler = preprocessing.StandardScaler()
            scaler.fit(self._prev_parameters_x)
            gp = KernelRidge(kernel="poly", degree=1)
            gp.fit(
                scaler.transform(self._prev_parameters_x),
                self._prev_parameters_y,
            )
            preds = gp.predict(scaler.transform(possible_parameters))
            parameters = possible_parameters.pop(
                rng.choice(np.flatnonzero(preds == preds.max()))
            )

        subsample = rng.choice(
            self.n_instances, size=subsample_size, replace=False
        )
        X_subsample = X[subsample]
        y_subsample = y[subsample]

        tde = IndividualTDE(
            *parameters,
            alphabet_size=self._alphabet_size,
            bigrams=use_bigrams,
            dim_threshold=self.dim_threshold,
            max_dims=self.max_dims,
            random_state=self.random_state,
        )
        tde.fit(X_subsample, y_subsample)
        tde._subsample = subsample

        if self.save_train_predictions:
            tde._train_predictions = np.zeros(subsample_size)

        tde._accuracy = self._individual_train_acc(
            tde,
            y_subsample,
            subsample_size,
            0 if num_classifiers < self.max_ensemble_size else lowest_acc,
        )
        if tde._accuracy > 0:
            weight = math.pow(tde._accuracy, 4)
        else:
            weight = 0.000000001

        # keep the ensemble at max_ensemble_size, replacing the weakest member
        if num_classifiers < self.max_ensemble_size:
            if tde._accuracy < lowest_acc:
                lowest_acc = tde._accuracy
                lowest_acc_idx = num_classifiers
            self.weights.append(weight)
            self.estimators_.append(tde)
        elif tde._accuracy > lowest_acc:
            self.weights[lowest_acc_idx] = weight
            self.estimators_[lowest_acc_idx] = tde
            lowest_acc, lowest_acc_idx = self._worst_ensemble_acc()

        self._prev_parameters_x.append(parameters)
        self._prev_parameters_y.append(tde._accuracy)

        num_classifiers += 1
        train_time = time.time() - start_time

    self.n_estimators = len(self.estimators_)
    self._weight_sum = np.sum(self.weights)
def fit(self, X, y, sample_weight=None):
    """Fit the random classifier.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        Target values.
    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    self : object
        Returns self.
    """
    if self.strategy not in (
        "most_frequent",
        "stratified",
        "uniform",
        "constant",
    ):
        raise ValueError("Unknown strategy type.")

    if self.strategy == "uniform" and sp.issparse(y):
        y = y.toarray()
        warnings.warn(
            "A local copy of the target data has been converted "
            "to a numpy array. Predicting on sparse target data "
            "with the uniform strategy would not save memory "
            "and would be slower.",
            UserWarning,
        )

    self.sparse_output_ = sp.issparse(y)

    if not self.sparse_output_:
        y = np.atleast_1d(y)

    self.output_2d_ = y.ndim == 2

    if y.ndim == 1:
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    if self.strategy == "constant":
        if self.constant is None:
            raise ValueError(
                "Constant target value has to be specified "
                "when the constant strategy is used."
            )
        else:
            constant = np.reshape(np.atleast_1d(self.constant), (-1, 1))
            if constant.shape[0] != self.n_outputs_:
                raise ValueError(
                    "Constant target value should have shape (%d, 1)."
                    % self.n_outputs_
                )

    (self.classes_, self.n_classes_, self.class_prior_) = class_distribution(
        y, sample_weight
    )

    if self.strategy == "constant":
        for k in range(self.n_outputs_):
            # Checking in case of constant strategy if the constant
            # provided by the user is in y.
            if constant[k] not in self.classes_[k]:
                raise ValueError(
                    "The constant target value must be present in training data"
                )

    if self.n_outputs_ == 1 and not self.output_2d_:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]
        self.class_prior_ = self.class_prior_[0]

    return self
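# A minimal sketch showing the attributes this fit computes, via
# scikit-learn's DummyClassifier, whose fit this function matches.
import numpy as np
from sklearn.dummy import DummyClassifier

X_demo = np.zeros((6, 1))  # features are ignored by the dummy strategies
y_demo = np.array([1, 1, 1, 2, 2, 4])

clf = DummyClassifier(strategy="most_frequent").fit(X_demo, y_demo)
print(clf.classes_)      # [1 2 4]
print(clf.n_classes_)    # 3
print(clf.class_prior_)  # [0.5 0.333... 0.166...]
print(clf.predict(X_demo[:2]))  # [1 1]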