def fit(self, X, y=None, feature_names=None): """Fit and estimate linear combination of rule ensemble """ if type(X) == pd.DataFrame: X = X.values if type(y) in [pd.DataFrame, pd.Series]: y = y.values self.n_features_ = X.shape[1] self.feature_dict_ = get_feature_dict(X.shape[1], feature_names) self.feature_placeholders = list(self.feature_dict_.keys()) self.feature_names = list(self.feature_dict_.values()) extracted_rules = self._extract_rules(X, y) self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules(X, y, extracted_rules) self.rules_ = [ replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_ ] self.complexity = self._get_complexity() return self
def fit(self, X, y=None, feature_names=None): """Fit and estimate linear combination of rule ensemble """ X, y = check_X_y(X, y) self.n_features_in_ = X.shape[1] self.n_features_ = X.shape[1] self.feature_dict_ = get_feature_dict(X.shape[1], feature_names) self.feature_placeholders = list(self.feature_dict_.keys()) self.feature_names = list(self.feature_dict_.values()) extracted_rules = self._extract_rules(X, y) self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules( X, y, extracted_rules) self.rules_ = [ replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_ ] self.complexity_ = self._get_complexity() return self
def fit(self, X, y, feature_names=None, sample_weight=None): """Fit the model according to the given training data. Parameters ---------- X : array-like, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) Target vector relative to X. Has to follow the convention 0 for normal data, 1 for anomalies. sample_weight : array-like, shape (n_samples,) optional Array of weights that are assigned to individual samples, typically the amount in case of transactions data. Used to grow regression trees producing further rules to be tested. If not provided, then each sample is given unit weight. Returns ------- self : object Returns self. """ X, y = check_X_y(X, y) check_classification_targets(y) self.n_features_ = X.shape[1] self.classes_ = unique_labels(y) self.feature_dict_ = get_feature_dict(X.shape[1], feature_names) self.feature_placeholders = list(self.feature_dict_.keys()) self.feature_names = list(self.feature_dict_.values()) n_train = y.shape[0] w = np.ones(n_train) / n_train self.estimators_ = [] self.estimator_weights_ = [] self.estimator_errors_ = [] self.feature_names = feature_names for _ in range(self.n_estimators): # Fit a classifier with the specific weights clf = self.estimator() clf.fit(X, y, sample_weight=w) # uses w as the sampling weight! preds = clf.predict(X) # Indicator function miss = preds != y # Equivalent with 1/-1 to update weights miss2 = np.ones(miss.size) miss2[~miss] = -1 # Error err_m = np.dot(w, miss) / sum(w) if err_m < 1e-3: return self # Alpha alpha_m = 0.5 * np.log((1 - err_m) / float(err_m)) # New weights w = np.multiply(w, np.exp([float(x) * alpha_m for x in miss2])) self.estimators_.append(deepcopy(clf)) self.estimator_weights_.append(alpha_m) self.estimator_errors_.append(err_m) rules = [] for est, est_weight in zip(self.estimators_, self.estimator_weights_): if type(clf) == DecisionTreeClassifier: est_rules_values = tree_to_rules(est, self.feature_placeholders, prediction_values=True) est_rules = list(map(lambda x: x[0], est_rules_values)) # BRS scores are difference between class 1 % and class 0 % in a node est_values = np.array(list(map(lambda x: x[1], est_rules_values))) rule_scores = (est_values[:, 1] - est_values[:, 0]) / est_values.sum(axis=1) compos_score = est_weight * rule_scores rules += [Rule(r, args=[w]) for (r, w) in zip(est_rules, compos_score)] if type(clf) == SlipperClassifier: # SLIPPER uses uniform confidence over in rule observations est_rule = dict_to_rule(est.rule, est.feature_dict) rules += [Rule(est_rule, args=[est_weight])] self.rules_without_feature_names_ = rules self.rules_ = [ replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_ ] self.complexity_ = self._get_complexity() return self
def fit(self, X, y, feature_names=None, sample_weight=None) -> 'SkopeRulesClassifier':
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X. Has to follow the convention 0 for
        normal data, 1 for anomalies.

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples, typically
        the amount in case of transactions data. Used to grow regression
        trees producing further rules to be tested. If not provided, then
        each sample is given unit weight.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]
    self.sample_weight = sample_weight
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    if n_classes < 2:
        raise ValueError(
            "This method needs samples of at least 2 classes in the data, "
            "but the data contains only one class: %r" % self.classes_[0])

    if not isinstance(self.max_depth_duplication, int) and self.max_depth_duplication is not None:
        raise ValueError("max_depth_duplication should be an integer")

    if not set(self.classes_) == {0, 1}:
        warn("Found labels %s. This method assumes target class to be labeled as 1 "
             "and normal data to be labeled as 0. Any label different from 0 will "
             "be considered as being from the target class."
             % set(self.classes_))
        y = (y > 0)

    # ensure that max_samples is in [1, n_samples]:
    n_samples = X.shape[0]
    if isinstance(self.max_samples, six.string_types):
        raise ValueError(
            'max_samples (%s) is not supported. Valid choices are: "auto", int or float'
            % self.max_samples)
    elif isinstance(self.max_samples, INTEGER_TYPES):
        if self.max_samples > n_samples:
            warn("max_samples (%s) is greater than the total number of samples (%s). "
                 "max_samples will be set to n_samples for estimation."
                 % (self.max_samples, n_samples))
            max_samples = n_samples
        else:
            max_samples = self.max_samples
    else:  # float
        if not 0. < self.max_samples <= 1.:
            raise ValueError("max_samples must be in (0, 1], got %r" % self.max_samples)
        max_samples = int(self.max_samples * X.shape[0])
    self.max_samples_ = max_samples

    self._max_depths = self.max_depth \
        if isinstance(self.max_depth, Iterable) else [self.max_depth]

    self.feature_names_, self.feature_dict_ = self._enum_features(X, feature_names)

    # Fit the tree ensemble, then extract, score, and prune candidate rules
    self.tree_generators = self._get_tree_ensemble()
    self._fit_tree_ensemble(X, y)

    extracted_rules = self._extract_rules()
    scored_rules = self._score_rules(X, y, extracted_rules)
    self.rules_ = self._prune_rules(scored_rules)

    self.rules_without_feature_names_ = self.rules_
    self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                   for rule, perf in self.rules_]
    return self
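# Hedged usage sketch for the SkopeRules fit above: after fitting, `rules_`
# holds (rule, performance) tuples with real feature names. The import and
# constructor arguments shown are assumptions about reasonable settings.
import numpy as np
from imodels import SkopeRulesClassifier  # assumed host class for this fit()

X = np.random.rand(200, 3)
y = (X[:, 0] > 0.7).astype(int)  # convention: 1 = target class, 0 = normal
clf = SkopeRulesClassifier(max_depth=3, max_samples=1.0)
clf.fit(X, y, feature_names=['f0', 'f1', 'f2'])
for rule, perf in clf.rules_:
    print(rule, perf)  # each extracted rule with its scored performance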
def fit(self, X, y, sample_weight=None) -> 'SkopeRulesClassifier':
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X. Has to follow the convention 0 for
        normal data, 1 for anomalies.

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples, typically
        the amount in case of transactions data. Used to grow regression
        trees producing further rules to be tested. If not provided, then
        each sample is given unit weight.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    if n_classes < 2:
        raise ValueError(
            "This method needs samples of at least 2 classes in the data, "
            "but the data contains only one class: %r" % self.classes_[0])

    if not isinstance(self.max_depth_duplication, int) and self.max_depth_duplication is not None:
        raise ValueError("max_depth_duplication should be an integer")

    if not set(self.classes_) == {0, 1}:
        warn("Found labels %s. This method assumes target class to be labeled as 1 "
             "and normal data to be labeled as 0. Any label different from 0 will "
             "be considered as being from the target class."
             % set(self.classes_))
        y = (y > 0)

    # ensure that max_samples is in [1, n_samples]:
    n_samples = X.shape[0]
    if isinstance(self.max_samples, six.string_types):
        raise ValueError(
            'max_samples (%s) is not supported. Valid choices are: "auto", int or float'
            % self.max_samples)
    elif isinstance(self.max_samples, INTEGER_TYPES):
        if self.max_samples > n_samples:
            warn("max_samples (%s) is greater than the total number of samples (%s). "
                 "max_samples will be set to n_samples for estimation."
                 % (self.max_samples, n_samples))
            max_samples = n_samples
        else:
            max_samples = self.max_samples
    else:  # float
        if not 0. < self.max_samples <= 1.:
            raise ValueError("max_samples must be in (0, 1], got %r" % self.max_samples)
        max_samples = int(self.max_samples * X.shape[0])
    self.max_samples_ = max_samples

    # default column names:
    feature_names_ = [
        BASE_FEATURE_NAME + x for x in np.arange(X.shape[1]).astype(str)
    ]
    if self.feature_names is not None:
        self.feature_dict_ = {
            BASE_FEATURE_NAME + str(i): feat
            for i, feat in enumerate(self.feature_names)
        }
    else:
        self.feature_dict_ = {
            BASE_FEATURE_NAME + str(i): feat
            for i, feat in enumerate(feature_names_)
        }
    self.feature_names_ = feature_names_

    self._max_depths = self.max_depth \
        if isinstance(self.max_depth, Iterable) else [self.max_depth]

    # define regression target:
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        weights = sample_weight - sample_weight.min()
        contamination = float(sum(y)) / len(y)
        y_reg = (pow(weights, 0.5) * 0.5 / contamination * (y > 0)
                 - pow(weights.mean(), 0.5) * (y == 0))
        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
    else:
        y_reg = y  # same as any other classification bagging

    # Fit both classification and regression bagged ensembles
    clfs = self._get_tree_ensemble(classify=True)
    regs = self._get_tree_ensemble(classify=False)
    self._fit_tree_ensemble(clfs, X, y)
    self._fit_tree_ensemble(regs, X, y_reg)

    self.estimators_, self.estimators_samples_, self.estimators_features_ = [], [], []
    for ensemble in clfs + regs:
        self.estimators_ += ensemble.estimators_
        self.estimators_samples_ += ensemble.estimators_samples_
        self.estimators_features_ += ensemble.estimators_features_

    rules_ = []
    for estimator, samples, features in zip(self.estimators_,
                                            self.estimators_samples_,
                                            self.estimators_features_):
        rules_from_tree = tree_to_rules(
            estimator, np.array(self.feature_names_)[features])
        rules_ += self._add_OOB_scores_to_rules(X, y, rules_from_tree,
                                                samples, features)

    self.rules_ = self._filter_rules(rules_)
    self.rules_ = sorted(self.rules_, key=lambda x: -self.f1_score(x))
    self.rules_without_feature_names_ = self.rules_

    # Replace generic feature names by real feature names
    self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                   for rule, perf in self.rules_]
    return self
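# Minimal reproduction of the regression-target construction above: weighted
# positives are scaled up by 1/contamination, negatives pushed down, and the
# result is squashed through a sigmoid into (0, 1). Inputs are made up.
import numpy as np

y = np.array([0, 0, 0, 1])
sample_weight = np.array([1.0, 2.0, 1.0, 5.0])

weights = sample_weight - sample_weight.min()
contamination = float(y.sum()) / len(y)  # fraction of positive samples
y_reg = (np.sqrt(weights) * 0.5 / contamination * (y > 0)
         - np.sqrt(weights.mean()) * (y == 0))
y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid squashing
print(y_reg)  # per-sample regression target in (0, 1)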
def fit(self, X, y, feature_names: list = None, undiscretized_features=[], verbose=False):
    """Fit rule lists to data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data

    y : array_like, shape = [n_samples]
        Labels

    feature_names : array_like, shape = [n_features], optional (default: [])
        String labels for each feature. If empty and X is a DataFrame,
        column labels are used. If empty and X is not a DataFrame, then
        features are simply enumerated.

    undiscretized_features : array_like, shape = [n_features], optional (default: [])
        String labels for each feature which is NOT to be discretized.
        If empty, all numeric features are discretized.

    verbose : bool
        Currently doesn't do anything

    Returns
    -------
    self : returns an instance of self.
    """
    self.seed()

    if len(set(y)) != 2:
        raise ValueError("Only binary classification is supported at this time!")

    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_in_ = X.shape[1]
    self.classes_ = unique_labels(y)

    self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
    self.feature_placeholders = np.array(list(self.feature_dict_.keys()))
    self.feature_names = np.array(list(self.feature_dict_.values()))

    # Mine frequent itemsets as candidate rule antecedents
    itemsets, self.discretizer = extract_fpgrowth(
        X, y,
        feature_names=self.feature_placeholders,
        minsupport=self.minsupport,
        maxcardinality=self.maxcardinality,
        undiscretized_features=undiscretized_features,
        disc_strategy=self.disc_strategy,
        disc_kwargs=self.disc_kwargs,
        verbose=verbose)
    X_df_onehot = self.discretizer.transform(X)

    # Now form the data-vs.-lhs set:
    # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
    for c in X_df_onehot.columns:
        X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])]
    X = [set() for _ in range(len(itemsets) + 1)]
    X[0] = set(range(len(X_df_onehot)))  # the default rule satisfies all data
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = set(i for (i, xi) in enumerate(X_df_onehot.values)
                       if set(lhs).issubset(xi))

    # now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = np.array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)

    Xtrain = X
    Ytrain = np.vstack((1 - np.array(y), y)).T.astype(int)  # one-hot labels
    self.itemsets = itemsets_all

    # We will store the MCMC results here
    permsdic = defaultdict(default_permsdic)

    # Do MCMC
    res, Rhat = run_bdl_multichain_serial(
        self.max_iter, self.thinning, self.alpha, self.listlengthprior,
        self.listwidthprior, Xtrain, Ytrain, nruleslen, lhs_len,
        self.maxcardinality, permsdic, self.burnin, self.n_chains,
        [None] * self.n_chains, verbose=self.verbose, seed=self.random_state)

    # Merge the chains
    permsdic = merge_chains(res)

    # Get the point estimate, BRL-point
    self.d_star = get_point_estimate(
        permsdic, lhs_len, Xtrain, Ytrain, self.alpha, nruleslen,
        self.maxcardinality, self.listlengthprior, self.listwidthprior,
        verbose=self.verbose)

    if self.d_star:
        # Compute the rule consequent
        self.theta, self.ci_theta = get_rule_rhs(Xtrain, Ytrain, self.d_star,
                                                 self.alpha, True)

    self.final_itemsets = np.array(self.itemsets, dtype=object)[self.d_star]
    rule_strs = itemsets_to_rules(self.final_itemsets)
    self.rules_without_feature_names_ = [Rule(r) for r in rule_strs]
    self.rules_ = [
        replace_feature_name(rule, self.feature_dict_)
        for rule in self.rules_without_feature_names_
    ]
    self.complexity_ = self._get_complexity()
    return self
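# Hedged usage sketch for the Bayesian rule list fit above. The import
# follows the imodels naming convention but is an assumption; only fit()
# and the fitted attributes are taken from the code itself.
import numpy as np
from imodels import BayesianRuleListClassifier  # assumed host class

X = np.random.rand(150, 3)
y = (X[:, 0] + X[:, 1] > 1).astype(int)  # binary labels are required
brl = BayesianRuleListClassifier()
brl.fit(X, y, feature_names=['f0', 'f1', 'f2'])
print(brl.rules_)       # ordered rule list with real feature names
print(brl.complexity_)  # complexity of the fitted list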