def stacking_learning(train_data, train_Y, test_data, test_Y):
    # base classifiers (svm_clf and NB_clf are kept from the original
    # variant but unused below)
    log_clf = LogisticRegression(max_iter=10)
    svm_clf = SVC(probability=True, decision_function_shape='ovo',
                  kernel="linear", C=0.2)
    knnclf = KNeighborsClassifier(n_neighbors=10)
    NB_clf = MultinomialNB(alpha=0.01)
    tree_clf = DecisionTreeClassifier(max_depth=3)
    rnd_clf = RandomForestClassifier()

    # Creating Ensemble
    ensemble = Ensemble([log_clf, knnclf, tree_clf, rnd_clf])
    eclf = EnsembleClassifier(ensemble=ensemble, combiner=Combiner('mean'))

    # Creating Stacking
    layer_1 = Ensemble([log_clf, knnclf, tree_clf, rnd_clf])
    layer_2 = Ensemble([clone(log_clf)])

    stack = EnsembleStack(cv=5)
    stack.add_layer(layer_1)
    stack.add_layer(layer_2)
    sclf = EnsembleStackClassifier(stack)

    clf_list = [log_clf, knnclf, tree_clf, rnd_clf, eclf, sclf]
    lbl_list = ['Logistic Regression', 'KNN', 'Decision Tree',
                'Random Forest', 'Ensemble', 'Stacking']

    print("brew----------------")
    for clf, lab in zip(clf_list, lbl_list):
        clf.fit(train_data, train_Y)
        y_pred = clf.predict(test_data)
        print(lab, accuracy_score(test_Y, y_pred), sep=":")
class BaggingSK(PoolGenerator):
    """This class should not be used; use brew.generation.bagging.Bagging instead."""

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote'):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                            n_estimators=n_classifiers,
                                            max_samples=1.0,
                                            max_features=1.0)
        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.sk_bagging.fit(X, y)
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
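# For reference, a minimal sketch of what BaggingSK wraps: calling sklearn's
# BaggingClassifier directly yields the same pool of bootstrap-trained
# estimators. (Hedged: parameter names follow older scikit-learn; newer
# releases renamed `base_estimator` to `estimator`.)
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
bag = BaggingClassifier(DecisionTreeClassifier(max_depth=3),
                        n_estimators=15, max_samples=1.0, max_features=1.0)
bag.fit(X, y)
print(bag.score(X, y))  # majority vote over the 15 bootstrap trees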
class Bagging(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote'):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.ensemble = Ensemble()
        for _ in range(self.n_classifiers):
            # bootstrap: sample n instances with replacement
            idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
            data, target = X[idx, :], y[idx]
            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            self.ensemble.add(classifier)
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
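# A minimal usage sketch for the hand-rolled Bagging pool above, on
# synthetic data (assumes brew-style Ensemble/Combiner semantics for
# output() and combine(), as used in the class).
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=8, random_state=1)
pool = Bagging(base_classifier=DecisionTreeClassifier(), n_classifiers=25)
pool.fit(X, y)
y_pred = pool.predict(X)
print((y_pred == y).mean())  # training accuracy of the majority vote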
class RandomSubspace(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote', max_features=0.5):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combiner = Combiner(rule=combination_rule)
        self.classifiers = None
        self.ensemble = None
        self.max_features = max_features

    def fit(self, X, y):
        self.ensemble = Ensemble()
        for _ in range(self.n_classifiers):
            # draw a random subset of the features for this member
            chosen_features = np.random.choice(
                X.shape[1],
                int(np.ceil(X.shape[1] * self.max_features)),
                replace=False)
            transformer = FeatureSubsamplingTransformer(
                features=chosen_features)
            classifier = BrewClassifier(
                classifier=sklearn.base.clone(self.base_classifier),
                transformer=transformer)
            classifier.fit(X, y)
            self.ensemble.add(classifier)
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
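# Usage sketch: with 10 features and max_features=0.5, each member is
# trained on ceil(10 * 0.5) = 5 randomly chosen features (assumes
# FeatureSubsamplingTransformer and BrewClassifier behave as used in
# fit() above).
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=2)
rs = RandomSubspace(base_classifier=DecisionTreeClassifier(),
                    n_classifiers=20, max_features=0.5)
rs.fit(X, y)
print(rs.predict(X[:5]))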
class ICSBagging(PoolGenerator):

    def __init__(self, K=10, alpha=0.75, base_classifier=None,
                 n_classifiers=100, combination_rule='majority_vote',
                 diversity_metric='e', positive_label=1):
        self.K = K
        self.alpha = alpha
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combination_rule = combination_rule
        self.positive_label = positive_label
        self.classifiers = None
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)
        self.diversity_metric = diversity_metric
        self.diversity = Diversity(metric=diversity_metric)
        self.validation_X = None
        self.validation_y = None

    def set_validation(self, X, y):
        self.validation_X = X
        self.validation_y = y

    def fitness(self, classifier):
        # TODO: normalize the diversity metric.
        # Temporarily add the candidate, score the ensemble, then remove it.
        self.ensemble.add(classifier)
        out = self.ensemble.output(self.validation_X)
        y_pred = self.combiner.combine(out)
        y_true = self.validation_y

        auc = evaluation.auc_score(y_true, y_pred)
        div = self.diversity.calculate(self.ensemble,
                                       self.validation_X,
                                       self.validation_y)
        self.ensemble.classifiers.pop()
        return self.alpha * auc + (1.0 - self.alpha) * div

    def _calc_pos_prob(self):
        y_pred = self.combiner.combine(
            self.ensemble.output(self.validation_X))
        mask = self.positive_label == self.validation_y
        pos_acc = float(sum(y_pred[mask] == self.validation_y[mask])) / len(
            self.validation_y[mask])
        neg_acc = float(sum(y_pred[~mask] == self.validation_y[~mask])) / len(
            self.validation_y[~mask])
        return 1.0 - (pos_acc / (pos_acc + neg_acc))

    def bootstrap_classifiers(self, X, y, K, pos_prob):
        mask = self.positive_label == y
        negative_label = y[~mask][0]

        clfs = []
        for _ in range(K):
            cX, cy = [], []
            for _ in range(X.shape[0]):
                if np.random.random() < pos_prob:
                    idx = np.random.randint(len(X[mask]))
                    cX.append(X[mask][idx])
                    cy.append(self.positive_label)
                else:
                    idx = np.random.randint(len(X[~mask]))
                    cX.append(X[~mask][idx])
                    cy.append(negative_label)

            # guarantee that both classes appear at least once
            if self.positive_label not in cy:
                idx_1 = np.random.randint(len(cX))
                idx_2 = np.random.randint(len(X[mask]))
                cX[idx_1] = X[mask][idx_2]
                cy[idx_1] = self.positive_label
            elif negative_label not in cy:
                idx_1 = np.random.randint(len(cX))
                idx_2 = np.random.randint(len(X[~mask]))
                cX[idx_1] = X[~mask][idx_2]
                cy[idx_1] = negative_label

            clf = sklearn.base.clone(self.base_classifier)
            clfs.append(clf.fit(cX, cy))
        return clfs

    def fit(self, X, y):
        self.validation_X = X
        self.validation_y = y
        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for _ in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=self.fitness))

        self.validation_X = None
        self.validation_y = None
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
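# Worked example of the two quantities that drive ICSBagging (pure
# arithmetic, no brew dependencies): fitness() blends AUC and diversity
# with weight alpha, and _calc_pos_prob() shifts bootstrap sampling
# toward whichever class the current ensemble handles worse.
alpha, auc, div = 0.75, 0.90, 0.40
print(alpha * auc + (1.0 - alpha) * div)    # fitness -> 0.775

pos_acc, neg_acc = 0.90, 0.60
print(1.0 - pos_acc / (pos_acc + neg_acc))  # pos_prob -> 0.4
# When positives are already well classified, fewer positives are drawn
# into the next bootstrap sample, and vice versa.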
class ICSBaggingNew(PoolGenerator):

    def __init__(self, K=10, alpha=0.75, base_classifier=None,
                 n_classifiers=100, combination_rule='majority_vote',
                 diversity_metric='e', positive_label=1):
        self.K = K
        self.alpha = alpha
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.positive_label = positive_label
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)
        self.diversity = Diversity(metric=diversity_metric)
        self.validation_X = None
        self.validation_y = None

    def set_validation(self, X, y):
        self.validation_X = X
        self.validation_y = y

    def fitness(self, classifier):
        # TODO: normalize the diversity metric.
        self.ensemble.add(classifier)
        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y

        auc = evaluation.auc_score(y_true, y_pred)
        div = self.diversity.calculate(self.ensemble,
                                       self.validation_X, y_true)
        self.ensemble.classifiers.pop()  # create interface for this later
        return self.alpha * auc + (1.0 - self.alpha) * div

    def _calc_pos_prob(self):
        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y
        # obtain recall scores for each label (assuming binary labels)
        pos_acc = recall_score(y_true, y_pred, average='binary',
                               pos_label=self.positive_label)
        neg_acc = recall_score(y_true, y_pred, average='binary',
                               pos_label=int(not self.positive_label))
        return neg_acc / (pos_acc + neg_acc)

    def bootstrap_classifiers(self, X, y, K, pos_prob):
        pos_idx = (y == self.positive_label)
        neg_idx = (y == int(not self.positive_label))
        X_pos = X[pos_idx, :]  # positive examples
        X_neg = X[neg_idx, :]  # negative examples

        classifiers = []
        for _ in range(K):
            X_new = np.zeros(X.shape)
            y_new = np.zeros(y.shape)
            for j in range(X.shape[0]):
                if pos_prob > np.random.random():
                    # add a randomly chosen positive example
                    idx = np.random.randint(X_pos.shape[0])
                    X_new[j, :] = X_pos[idx, :]
                    y_new[j] = self.positive_label
                else:
                    # add a randomly chosen negative example
                    idx = np.random.randint(X_neg.shape[0])
                    X_new[j, :] = X_neg[idx, :]
                    y_new[j] = int(not self.positive_label)

            # if no positive example is present, insert at least one
            if not np.any(y_new == self.positive_label):
                idx_new = np.random.randint(X_new.shape[0])  # spot to replace
                idx_pos = np.random.randint(X_pos.shape[0])  # positive example
                X_new[idx_new, :] = X_pos[idx_pos, :]
                y_new[idx_new] = self.positive_label
            # if no negative example is present, insert at least one
            elif not np.any(y_new == int(not self.positive_label)):
                idx_new = np.random.randint(X_new.shape[0])  # spot to replace
                idx_neg = np.random.randint(X_neg.shape[0])  # negative example
                X_new[idx_new, :] = X_neg[idx_neg, :]
                y_new[idx_new] = int(not self.positive_label)

            # train a classifier on the bootstrapped data
            clf = sklearn.base.clone(self.base_classifier)
            clf.fit(X_new, y_new)
            classifiers.append(clf)

        return classifiers

    def fit(self, X, y):
        self.validation_X = X
        self.validation_y = y
        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for _ in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=self.fitness))

        self.validation_X = None
        self.validation_y = None
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
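# A hedged usage sketch for ICSBaggingNew on an imbalanced binary
# problem. Labels must be 0/1 because of the int(not positive_label)
# convention; evaluation.auc_score and Diversity are assumed available
# as in brew.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=400, weights=[0.9, 0.1],
                           random_state=3)
ics = ICSBaggingNew(K=5, alpha=0.75,
                    base_classifier=DecisionTreeClassifier(max_depth=3),
                    n_classifiers=10, positive_label=1)
ics.fit(X, y)
print(ics.predict(X[:10]))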
# print("各个分类器的预测结果:") # for clf in (log_clf, svm_clf, knnclf, NB_clf, tree_clf, rnd_clf, voting_clf): # clf.fit(train_tf_idf_vectorize, train_label) # y_pred = clf.predict(dev_tf_idf_vectorize) # print(clf.__class__.__name__, clf.score(train_tf_idf_vectorize, train_label)) # print(clf.__class__.__name__, accuracy_score(dev_label, y_pred), sep=":") # # ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.6) # ada_clf.fit(train_tf_idf_vectorize, train_label) # y_pre = ada_clf.predict(dev_tf_idf_vectorize) # print("ada精确率:", accuracy_score(y_pred=y_pre, y_true=dev_label)) # Creating Ensemble ensemble = Ensemble([log_clf, svm_clf, knnclf, NB_clf, tree_clf, rnd_clf]) # ('lr', log_clf), ('SVM', svm_clf), ("KNN_10", knnclf), ('NB', NB_clf), ('Tree', tree_clf), ('rf', rnd_clf) eclf = EnsembleClassifier(ensemble=ensemble, combiner=Combiner('mean')) # Creating Stacking layer_1 = Ensemble([log_clf, svm_clf, knnclf, NB_clf, tree_clf, rnd_clf]) layer_2 = Ensemble([clone(log_clf)]) stack = EnsembleStack(cv=3) stack.add_layer(layer_1) stack.add_layer(layer_2) sclf = EnsembleStackClassifier(stack) clf_list = [log_clf, svm_clf, NB_clf, knnclf, tree_clf, rnd_clf, eclf, sclf] lbl_list = [ 'Logistic Regression', 'SVM', 'knn', 'NB_clf', 'tree_clf', 'rnd_clf',
class SmoteBagging(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote', k=5):
        self.k = k
        self.n_classifiers = n_classifiers
        self.base_classifier = base_classifier
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def smote_bootstrap_sample(self, X, y, b, k):
        classes = np.unique(y)
        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()  # majority class label
        majority_count = count.max()     # majority class size

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        for i in classes:
            class_data = X[(y == i), :]
            if i == majority_class:
                # majority class: regular bootstrap (100% sampling rate)
                idx = np.random.choice(majority_count, (majority_count,))
                data = np.concatenate((data, class_data[idx, :]))
                target = np.concatenate(
                    (target, i * np.ones((majority_count,))))
            else:
                # minority class: bootstrap with the defined sampling rate
                sample_rate = (majority_count / class_data.shape[0]) * (b / 100)
                idx = np.random.choice(
                    class_data.shape[0],
                    (int(sample_rate * class_data.shape[0]),))
                sampled_class_data = class_data[idx, :]

                # run SMOTE on the bootstrapped data to obtain synthetic
                # samples; the ceil makes N_smote a multiple of 100, and
                # the small constant avoids a zero
                N_smote = int(np.ceil(
                    (majority_count / sampled_class_data.shape[0]) *
                    (1 - b / 100 + 10e-8)) * 100)
                synthetic = smote(sampled_class_data, N=N_smote, k=self.k)

                # add synthetic samples until the class matches the
                # majority class size
                n_missing = majority_count - sampled_class_data.shape[0]
                idx = np.random.choice(synthetic.shape[0], (n_missing,))
                new_class_data = np.concatenate(
                    (sampled_class_data, synthetic[idx, :]))

                data = np.concatenate((data, new_class_data))
                target = np.concatenate(
                    (target, i * np.ones((new_class_data.shape[0],))))

        return data, target

    def fit(self, X, y):
        self.ensemble = Ensemble()

        # b cycles over [10, 100] in increments of 10,
        # one value per classifier in the ensemble
        b = 10
        for _ in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(
                X, y, b=float(b), k=self.k)
            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            self.ensemble.add(classifier)
            b = 10 if b >= 100 else b + 10
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
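# Worked example of the sampling arithmetic in smote_bootstrap_sample
# (pure numpy, no smote() call). With 100 majority and 20 minority
# instances at b=50: the minority class is bootstrapped at rate
# (100/20) * (50/100) = 2.5, giving 50 instances; SMOTE is then asked
# for ceil((100/50) * (1 - 0.5 + 1e-7)) * 100 = 200 percent, i.e. two
# synthetic points per instance, and 100 - 50 = 50 of them are kept.
import numpy as np

majority_count, minority_count, b = 100, 20, 50
sample_rate = (majority_count / minority_count) * (b / 100)
n_sampled = int(sample_rate * minority_count)
N_smote = int(np.ceil((majority_count / n_sampled) *
                      (1 - b / 100 + 10e-8)) * 100)
n_missing = majority_count - n_sampled
print(n_sampled, N_smote, n_missing)  # -> 50 200 50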
class EnsembleClassifier(object):

    def __init__(self, ensemble=None, selector=None, combiner=None):
        self.ensemble = ensemble
        self.selector = selector

        if combiner is None:
            self.combiner = Combiner(rule='majority_vote')
        elif isinstance(combiner, str):
            self.combiner = Combiner(rule=combiner)
        elif isinstance(combiner, Combiner):
            self.combiner = combiner
        else:
            raise ValueError('Invalid parameter combiner')

    def fit(self, X, y):
        self.ensemble.fit(X, y)

    def predict(self, X):
        # TODO: warn the user if the ensemble output mode
        # excludes the chosen combiner?
        if self.selector is None:
            out = self.ensemble.output(X)
            y = self.combiner.combine(out)
        else:
            y = []
            for i in range(X.shape[0]):
                # select a (possibly weighted) sub-ensemble per instance
                ensemble, weights = self.selector.select(
                    self.ensemble, X[i, :][np.newaxis, :])

                if self.combiner.combination_rule == 'majority_vote':
                    out = ensemble.output(X[i, :][np.newaxis, :])
                else:
                    out = ensemble.output(X[i, :][np.newaxis, :],
                                          mode='probs')

                if weights is not None:
                    # apply the weights returned by the selector
                    for j in range(out.shape[2]):
                        out[:, :, j] = out[:, :, j] * weights[j]

                [tmp] = self.combiner.combine(out)
                y.append(tmp)

        return np.asarray(y)

    def predict_proba(self, X):
        # TODO: warn the user if the ensemble output mode
        # excludes the chosen combiner?
        if self.selector is None:
            out = self.ensemble.output(X, mode='probs')
            return np.mean(out, axis=2)

        out_full = []
        for i in range(X.shape[0]):
            # select a (possibly weighted) sub-ensemble per instance
            ensemble, weights = self.selector.select(
                self.ensemble, X[i, :][np.newaxis, :])
            out = ensemble.output(X[i, :][np.newaxis, :])

            if weights is not None:
                # apply the weights returned by the selector
                for j in range(out.shape[2]):
                    out[:, :, j] = out[:, :, j] * weights[j]

            out_full.extend(list(np.mean(out, axis=2)))

        return np.array(out_full)

    def score(self, X, y, sample_weight=None):
        return accuracy_score(y, self.predict(X),
                              sample_weight=sample_weight)
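# A minimal sketch of wiring pre-fitted sklearn models into
# EnsembleClassifier (brew's Ensemble is assumed to accept a list of
# fitted classifiers, as in the stacking examples above; passing the
# string 'mean' exercises the str branch of __init__).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=4)
clfs = [LogisticRegression(max_iter=200).fit(X, y),
        DecisionTreeClassifier(max_depth=3).fit(X, y)]
eclf = EnsembleClassifier(ensemble=Ensemble(clfs), combiner='mean')
print(eclf.score(X, y))  # accuracy of the mean-combined predictions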