class RandomSubspace(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote', max_features=0.5):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combiner = Combiner(rule=combination_rule)
        self.classifiers = None
        self.ensemble = None
        self.max_features = max_features

    def fit(self, X, y):
        self.ensemble = Ensemble()

        for i in range(self.n_classifiers):
            chosen_features = np.random.choice(
                X.shape[1],
                int(np.ceil(X.shape[1] * self.max_features)),
                replace=False)
            transformer = FeatureSubsamplingTransformer(
                features=chosen_features)

            classifier = BrewClassifier(
                classifier=sklearn.base.clone(self.base_classifier),
                transformer=transformer)
            classifier.fit(X, y)

            self.ensemble.add(classifier)

        return

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
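A minimal usage sketch for the random subspace generator above. The toy data, the DecisionTreeClassifier base learner, and the assumption that RandomSubspace and its dependencies are importable from the surrounding module are illustrative, not taken from the source.

# Illustrative sketch: train a RandomSubspace pool on synthetic data.
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=20, random_state=0)

pool = RandomSubspace(base_classifier=DecisionTreeClassifier(),
                      n_classifiers=25,
                      combination_rule='majority_vote',
                      max_features=0.5)
pool.fit(X, y)
y_pred = pool.predict(X)  # combined majority-vote predictions
print((y_pred == y).mean())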
def fit(self, X, y):
    self.ensemble = Ensemble()

    # this parameter should change between [10, 100] with
    # increments of 10, for every classifier in the ensemble
    b = 10

    for i in range(self.n_classifiers):
        data, target = self.smote_bootstrap_sample(
            X, y, b=float(b), k=self.k)

        classifier = sklearn.base.clone(self.base_classifier)
        classifier.fit(data, target)

        self.ensemble.add(classifier)

        if b >= 100:
            b = 10
        else:
            b += 10

    return
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    mcb_x = ensemble.output(x, mode='labels')[0, :]

    # initialize variables
    # the indexes of the KNN of x
    [idx] = self.knn.kneighbors(x, return_distance=False)
    X, y = self.Xval[idx], self.yval[idx]

    mcb_v = ensemble.output(X, mode='labels')

    # keep only the neighbors whose behavior vector is
    # similar enough to the behavior vector of x
    idx = []
    for i in range(X.shape[0]):
        sim = np.mean(mcb_x == mcb_v[i, :])
        if sim > self.similarity_threshold:
            idx = idx + [i]

    if len(idx) == 0:
        idx = np.arange(X.shape[0])

    scores = [clf.score(X[idx], y[idx]) for clf in ensemble.classifiers]
    scores = np.array(scores)

    # if the best classifier is significantly better
    # than the second best, use only the best classifier
    best_i = np.argmax(scores)
    best_j_score = np.max(scores[np.arange(len(scores)) != best_i])
    if scores[best_i] - best_j_score >= self.significance_threshold:
        best_classifier = ensemble.classifiers[best_i]
        return Ensemble(classifiers=[best_classifier]), None

    return Ensemble(classifiers=ensemble.classifiers), None
class Bagging(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote'):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            # bootstrap
            idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
            data, target = X[idx, :], y[idx]

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

        return

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
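A minimal usage sketch for the Bagging generator above, assuming the class is importable from the surrounding module; the iris data and decision-tree base learner are illustrative assumptions.

# Illustrative sketch: a bagged pool of decision trees.
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

bag = Bagging(base_classifier=DecisionTreeClassifier(),
              n_classifiers=50,
              combination_rule='majority_vote')
bag.fit(X_tr, y_tr)
print((bag.predict(X_te) == y_te).mean())  # held-out accuracy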
class BaggingSK(PoolGenerator):
    '''
    This class should not be used, use brew.generation.bagging.Bagging
    instead.
    '''

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote'):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                            n_estimators=n_classifiers,
                                            max_samples=1.0,
                                            max_features=1.0)

        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.sk_bagging.fit(X, y)
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)
        # self.classes_ = set(y)

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    # obtain the K nearest neighbors of x in the validation set
    [idx] = self.knn.kneighbors(x, n_neighbors=self.K,
                                return_distance=False)
    neighbors_X = self.Xval[idx]  # k neighbors
    neighbors_y = self.yval[idx]  # k neighbors target

    # pool_output (sample, classifier_output)
    pool_output = np.zeros((neighbors_X.shape[0], len(ensemble)))
    for i, clf in enumerate(ensemble.classifiers):
        pool_output[:, i] = clf.predict(neighbors_X)

    x_outputs = [ensemble.classifiers[j].predict(x)
                 for j in range(len(ensemble))]
    x_outputs = np.asarray(x_outputs).flatten()

    scores = np.zeros(len(ensemble))
    for j in range(pool_output.shape[1]):
        # get correctly classified samples
        mask_classified_correctly = pool_output[:, j] == neighbors_y
        # get classified samples with the same class as 'x'
        mask_classified_same_class = (pool_output[:, j] == x_outputs[j])
        # get correctly classified samples with the same class as 'x'
        mask = mask_classified_correctly * mask_classified_same_class
        # calculate score
        scores[j] = float(sum(mask)) / \
            (sum(mask_classified_same_class) + 10e-24)

    return Ensemble([ensemble.classifiers[np.argmax(scores)]]), None
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    # obtain the K nearest neighbors of x in the validation set
    [idx] = self.knn.kneighbors(x, return_distance=False)
    neighbors_X = self.Xval[idx]  # k neighbors
    neighbors_y = self.yval[idx]  # k neighbors target

    # pool_output (sample, classifier_output)
    pool_output = np.zeros((neighbors_X.shape[0], len(ensemble)))
    for i, clf in enumerate(ensemble.classifiers):
        pool_output[:, i] = clf.predict(neighbors_X)

    x_outputs = [ensemble.classifiers[j].predict(x)
                 for j in range(len(ensemble))]
    x_outputs = np.asarray(x_outputs).flatten()

    d = {}
    scores = np.zeros(len(ensemble))
    for j in range(pool_output.shape[1]):
        # get correctly classified samples
        mask_classified_correctly = pool_output[:, j] == neighbors_y
        # get classified samples with the same class as 'x'
        mask_classified_same_class = (pool_output[:, j] == x_outputs[j])
        # get correctly classified samples with the same class as 'x'
        mask = mask_classified_correctly * mask_classified_same_class
        # calculate score
        scores[j] = float(sum(mask)) / \
            (sum(mask_classified_same_class) + 10e-24)
        d[str(scores[j])] = d[str(scores[j])] + [j] \
            if str(scores[j]) in d else [j]

    best_scores = sorted([float(k) for k in list(d.keys())], reverse=True)

    options = None
    for j, score in enumerate(best_scores):
        pred = [x_outputs[i] for i in d[str(score)]]
        pred = np.asarray(pred).flatten()
        bincount = np.bincount(pred.astype(int))

        if options is not None:
            for i in range(len(bincount)):
                bincount[i] = bincount[i] if i in options else 0

        imx = np.argmax(bincount)
        votes = np.argwhere(bincount == bincount[imx]).flatten()
        count = len(votes)

        if count == 1:
            ens = Ensemble([ensemble.classifiers[np.argmax(pred == imx)]])
            return ens, None
        elif options is None:
            options = votes

    return Ensemble([ensemble.classifiers[np.argmax(scores)]]), None
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    # initialize variables
    # the indexes of the KNN of x
    classifiers = ensemble.classifiers
    [idx] = self.knn.kneighbors(x, return_distance=False)
    X, y = self.Xval[idx], self.yval[idx]

    scores = np.asarray([clf.score(X, y) for clf in classifiers])

    return Ensemble([classifiers[np.argmax(scores)]]), None
def fit(self, X, y):
    self.ensemble = Ensemble()

    for _ in range(self.n_classifiers):
        # bootstrap
        idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
        data, target = X[idx, :], y[idx]

        classifier = sklearn.base.clone(self.base_classifier)
        classifier.fit(data, target)

        self.ensemble.add(classifier)

    return
def select(self, ensemble, x):
    ensemble_mask = None

    neighbors_X, neighbors_y = self.get_neighbors(x)
    pool_output = ensemble.output(neighbors_X, mode='labels')

    # gradually decrease neighborhood size if no
    # classifier predicts ALL the neighbors correctly
    for i in range(self.K, 0, -1):
        pool_mask = _get_pool_mask(pool_output[:i], neighbors_y[:i], np.all)

        # if at least one classifier gets all neighbors right
        if pool_mask is not None:
            ensemble_mask = pool_mask
            break

    # if NO classifiers get the nearest neighbor correctly
    if ensemble_mask is None:

        if self.v2007:
            # Increase neighborhood until one classifier
            # gets at least ONE (i.e. ANY) neighbors correctly.
            # Starts with 2 because mask_all with k=1 is
            # the same as mask_any with k=1
            for i in range(2, self.K + 1):
                pool_mask = _get_pool_mask(pool_output[:i],
                                           neighbors_y[:i], np.any)

                if pool_mask is not None:
                    ensemble_mask = pool_mask
                    break

    [selected_idx] = np.where(ensemble_mask)

    if selected_idx.size > 0:
        pool = Ensemble(
            classifiers=[ensemble.classifiers[i] for i in selected_idx])
    else:
        # use all classifiers
        # pool = ensemble
        classifiers = self._get_best_classifiers(ensemble, neighbors_X,
                                                 neighbors_y, x)
        pool = Ensemble(classifiers=classifiers)

    # KNORA-ELIMINATE-W that supposedly uses weights, does not make
    # any sense, so even if self.weighted is True, always return
    # None for the weights
    return pool, None
def fit(self, X, y):
    self.ensemble = Ensemble()

    # this parameter should change between [10, 100] with
    # increments of 10, for every classifier in the ensemble
    b = 10

    for i in range(self.n_classifiers):
        data, target = self.smote_bootstrap_sample(X, y, b=b, k=self.k)

        classifier = sklearn.base.clone(self.base_classifier)
        classifier.fit(data, target)

        self.ensemble.add(classifier)

        if b >= 100:
            b = 10
        else:
            b += 10

    return
def fit(self, X, y):
    self.ensemble = Ensemble()

    for i in range(self.n_classifiers):
        chosen_features = np.random.choice(
            X.shape[1],
            int(np.ceil(X.shape[1] * self.max_features)),
            replace=False)
        transformer = FeatureSubsamplingTransformer(
            features=chosen_features)

        classifier = BrewClassifier(
            classifier=sklearn.base.clone(self.base_classifier),
            transformer=transformer)
        classifier.fit(X, y)

        self.ensemble.add(classifier)

    return
def select(self, ensemble, x):
    neighbors_X, neighbors_y = self.get_neighbors(x)
    pool_output = ensemble.output(neighbors_X, mode='labels')

    output_mask = (pool_output == neighbors_y[:, np.newaxis])

    [selected_idx] = np.where(np.any(output_mask, axis=0))

    if selected_idx.size > 0:
        if self.weighted:
            weights = 1.0 / \
                (np.sqrt(np.sum((x - neighbors_X) ** 2, axis=1)) + 10e-8)
            weighted_votes = np.dot(weights, output_mask[:, selected_idx])
        else:
            weighted_votes = np.sum(output_mask[:, selected_idx], axis=0)

        pool = Ensemble(
            classifiers=[ensemble.classifiers[i] for i in selected_idx])

    # if no classifiers are selected,
    # use all classifiers with no weights
    else:
        pool = ensemble
        weighted_votes = None

    return pool, weighted_votes
def select(self, ensemble, x):
    neighbors_X, neighbors_y = self.get_neighbors(x)

    k = self.K
    pool = []

    while k > 0:
        nn_X = neighbors_X[:k, :]
        nn_y = neighbors_y[:k]

        for i, c in enumerate(ensemble.classifiers):
            if np.all(c.predict(nn_X) == nn_y[np.newaxis, :]):
                pool.append(c)

        if not pool:  # empty
            k = k - 1
        else:
            break

    if not pool:  # still empty
        # select the classifier that recognizes
        # more samples in the whole neighborhood
        # also select classifiers that recognize
        # the same number of neighbors
        pool = self._get_best_classifiers(ensemble, neighbors_X,
                                          neighbors_y, x)

    return Ensemble(classifiers=pool), None
def __init__(self, base_classifier=None, n_classifiers=100,
             combination_rule='majority_vote'):
    self.base_classifier = base_classifier
    self.n_classifiers = n_classifiers

    # using the sklearn implementation of bagging for now
    self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                        n_estimators=n_classifiers,
                                        max_samples=1.0,
                                        max_features=1.0)

    self.ensemble = Ensemble()
    self.combiner = Combiner(rule=combination_rule)
def test__arguments(self):
    c = MockClassifier()
    pool = Ensemble(classifiers=[c])
    combiner = Combiner(rule='majority_vote')
    model = EnsembleClassifier(ensemble=pool, combiner=combiner)
def fit(self, X, y):
    # if self.validation_X == None and self.validation_y == None:
    self.validation_X = X
    self.validation_y = y

    self.classes_ = set(y)
    self.ensemble = Ensemble()

    clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
    self.ensemble.add(np.random.choice(clfs))

    for i in range(1, self.n_classifiers):
        clfs = self.bootstrap_classifiers(X, y, self.K,
                                          self._calc_pos_prob())
        self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))

    self.validation_X = None
    self.validation_y = None

    return self
def __init__(self, classifierList, combiningMethod):
    classifiers = [None] * len(classifierList)
    for key, clf_tuple in enumerate(classifierList):
        classifiers[key] = clf_tuple[1]

    hybridEnsemble = Ensemble(classifiers=classifiers)
    hybridEnsembleClassifier = EnsembleClassifier(
        ensemble=hybridEnsemble, combiner=Combiner(combiningMethod))

    super().__init__(hybridEnsembleClassifier)
    self.name = "ensemble"
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    n_sel_1, n_sel_2 = self.n_1, self.n_2
    if isinstance(self.n_1, float):
        n_sel_1 = int(n_sel_1 * len(ensemble))

    if isinstance(self.n_2, float):
        n_sel_2 = int(n_sel_2 * len(ensemble))

    n_sel_1 = max(n_sel_1, 1)
    n_sel_2 = max(n_sel_2, 1)

    # initialize variables
    # the indexes of the KNN of x
    classifiers = ensemble.classifiers
    [idx] = self.knn.kneighbors(x, return_distance=False)
    X, y = self.Xval[idx], self.yval[idx]

    acc_scores = np.array([clf.score(X, y) for clf in classifiers])

    out = ensemble.output(X, mode='labels')
    oracle = np.equal(out, y[:, np.newaxis])

    div_scores = np.zeros(len(ensemble), dtype=float)
    for i in range(len(ensemble)):
        tmp = []
        for j in range(len(ensemble)):
            if i != j:
                d = kuncheva_double_fault_measure(oracle[:, [i, j]])
                tmp.append(d)
        div_scores[i] = np.mean(tmp)

    z = zip(np.arange(len(ensemble)), acc_scores, div_scores)
    z = sorted(z, key=lambda e: e[1], reverse=True)[:n_sel_1]
    z = sorted(z, key=lambda e: e[2], reverse=False)[:n_sel_2]
    z = list(zip(*z))[0]  # keep only the classifier indexes

    classifiers = [classifiers[i] for i in z]

    return Ensemble(classifiers=classifiers), None
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    # initialize variables
    # the indexes of the KNN of x
    classifiers = ensemble.classifiers
    [idx] = self.knn.kneighbors(x, return_distance=False)
    X, y = self.Xval[idx], self.yval[idx]

    # d[score] = indexes of the classifiers with that score
    d = {}
    scores = [clf.score(X, y) for clf in ensemble.classifiers]
    for i, scr in enumerate(scores):
        d[scr] = d[scr] + [i] if scr in d else [i]

    best_scores = sorted(list(d.keys()), reverse=True)

    # if there was a single best classifier, return it
    if len(d[best_scores[0]]) == 1:
        i = d[best_scores[0]][0]
        return Ensemble([classifiers[i]]), None

    options = None
    for j, score in enumerate(best_scores):
        pred = [classifiers[i].predict(x) for i in d[score]]
        pred = np.asarray(pred).flatten()
        bincount = np.bincount(pred)

        if options is not None:
            for i in range(len(bincount)):
                bincount[i] = bincount[i] if i in options else 0

        imx = np.argmax(bincount)
        votes = np.argwhere(bincount == bincount[imx]).flatten()
        count = len(votes)

        if count == 1:
            return Ensemble([classifiers[np.argmax(pred == imx)]]), None
        elif options is None:
            options = votes

    return Ensemble([classifiers[np.argmax(scores)]]), None
def select(self, ensemble, x):
    neighbors_X, neighbors_y = self.get_neighbors(x)

    # select every classifier that correctly classifies
    # at least one neighbor of x
    pool = []
    for c in ensemble.classifiers:
        for i, neighbor in enumerate(neighbors_X):
            if c.predict(neighbor) == neighbors_y[i]:
                pool.append(c)
                break

    # weight each selected classifier by the number of
    # neighbors it classifies correctly
    weights = []
    for clf in pool:
        msk = clf.predict(neighbors_X) == neighbors_y
        weights = weights + [sum(msk)]

    return Ensemble(classifiers=pool), weights
def select(self, ensemble, x):
    selected_classifier = None

    nn_X, nn_y, dists = self.get_neighbors(x, return_distance=True)

    idx_selected, prob_selected = [], []
    all_probs = np.zeros(len(ensemble))

    for idx, clf in enumerate(ensemble.classifiers):
        prob = self.probabilities(clf, nn_X, nn_y, dists, x)
        if prob > 0.5:
            idx_selected = idx_selected + [idx]
            prob_selected = prob_selected + [prob]
        all_probs[idx] = prob

    if len(prob_selected) == 0:
        prob_selected = [np.max(all_probs)]
        idx_selected = [np.argmax(all_probs)]

    p_correct_m = max(prob_selected)
    m = np.argmax(prob_selected)

    selected = True
    diffs = []
    for j, p_correct_j in enumerate(prob_selected):
        d = p_correct_m - p_correct_j
        diffs.append(d)
        if j != m and d < self.threshold:
            selected = False

    if selected:
        selected_classifier = ensemble.classifiers[idx_selected[m]]
    else:
        idx_selected = np.asarray(idx_selected)
        mask = np.array(np.array(diffs) < self.threshold, dtype=bool)
        i = np.random.choice(idx_selected[mask])
        selected_classifier = ensemble.classifiers[i]

    return Ensemble([selected_classifier]), None
class SmoteBaggingNew(SmoteBagging):

    def fit(self, X, y):
        self.ensemble = Ensemble()

        # this parameter should change between [10, 100] with
        # increments of 10, for every classifier in the ensemble
        b = 10

        for i in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(
                X, y, b=float(b), k=self.k)

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

            if b >= 100:
                b = 10
            else:
                b += 10

        return

    def smote_bootstrap_sample(self, X, y, b, k):
        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()  # majority class
        majority_count = count.max()  # majority class count

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        # regular bootstrap of the majority class
        class_data = X[(y == majority_class), :]
        idx = np.random.choice(majority_count, (majority_count,))
        data = np.concatenate((data, class_data[idx, :]))
        target = np.concatenate(
            (target, majority_class * np.ones((majority_count,))))

        minority_class = count.argmin()
        minority_count = count.min()

        # number of synthetic and resampled minority samples
        N_syn = int(majority_count * (b / 100))
        N_res = majority_count - N_syn
        N_syn, N_res = N_res, N_syn

        class_data = X[(y == minority_class), :]
        idx = np.random.choice(class_data.shape[0], (N_res,))
        sampled_min_data = class_data[idx, :]

        if N_syn > 0:
            N_smote = np.ceil(N_syn / minority_count) * 100
            N_smote = 100 if N_smote < 100 else int(N_smote - N_smote % 100)
            synthetic = smote(X[y == minority_class],
                              N=int(N_smote), k=self.k)
            idx = np.random.choice(synthetic.shape[0], (N_syn,))
            new_class_data = np.concatenate(
                (sampled_min_data, synthetic[idx, :]))
            data = np.concatenate((data, new_class_data))
            target = np.concatenate(
                (target,
                 minority_class * np.ones((new_class_data.shape[0],))))
        else:
            data = np.concatenate((data, sampled_min_data))
            target = np.concatenate(
                (target,
                 minority_class * np.ones((sampled_min_data.shape[0],))))

        return data, target
def test_none_combiner(self):
    c = MockClassifier()
    pool = Ensemble(classifiers=[c])
    model = EnsembleClassifier(ensemble=pool)
def test_len_with_one_added(self):
    ens = Ensemble()
    ens.add(MockClassifier())
    assert len(ens) == 1
class SmoteBagging(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote', k=2):
        # self.b = b
        self.k = k
        self.n_classifiers = n_classifiers
        self.base_classifier = base_classifier
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def smote_bootstrap_sample(self, X, y, b, k):
        classes = np.unique(y)
        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()  # majority class
        majority_count = count.max()  # majority class count

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        for i in classes:
            class_data = X[(y == i), :]

            if i == majority_class:
                # majority class:
                # regular bootstrap (i.e. 100% sampling rate)
                idx = np.random.choice(majority_count, (majority_count,))
                data = np.concatenate((data, class_data[idx, :]))
                target = np.concatenate(
                    (target, i * np.ones((majority_count,))))

            else:
                # minority classes:
                # bootstrap the class data with the defined sampling rate
                sample_rate = (majority_count / class_data.shape[0]) \
                    * (b / 100)
                idx = np.random.choice(
                    class_data.shape[0],
                    (int(sample_rate * class_data.shape[0]),))
                sampled_class_data = class_data[idx, :]

                # run smote on the bootstrapped data to obtain synthetic
                # samples; ceil to make sure N_smote is a multiple of 100,
                # and the small value to avoid a zero
                N_smote = int(
                    np.ceil((majority_count / sampled_class_data.shape[0])
                            * (1 - b / 100 + 10e-8)) * 100)

                synthetic = smote(sampled_class_data, N=N_smote, k=self.k)

                # add synthetic samples to the sampled class data
                n_missing = majority_count - sampled_class_data.shape[0]
                idx = np.random.choice(synthetic.shape[0], (n_missing,))
                new_class_data = np.concatenate(
                    (sampled_class_data, synthetic[idx, :]))

                data = np.concatenate((data, new_class_data))
                target = np.concatenate(
                    (target, i * np.ones((new_class_data.shape[0],))))

        return data, target

    def fit(self, X, y):
        self.ensemble = Ensemble()

        # this parameter should change between [10, 100] with
        # increments of 10, for every classifier in the ensemble
        b = 10

        for i in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(X, y, b=b, k=self.k)

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

            if b >= 100:
                b = 10
            else:
                b += 10

        return

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
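A minimal usage sketch for SmoteBagging above, on an imbalanced toy problem. The synthetic dataset, class weights, and decision-tree base learner are illustrative assumptions; labels must be non-negative integers because the sampler uses np.bincount.

# Illustrative sketch: SmoteBagging on an imbalanced two-class toy set.
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

# roughly 10% positives so SMOTE has a minority class to oversample
X, y = make_classification(n_samples=500, n_features=10,
                           weights=[0.9, 0.1], random_state=0)

sb = SmoteBagging(base_classifier=DecisionTreeClassifier(),
                  n_classifiers=10, k=2)
sb.fit(X, y)
y_pred = sb.predict(X)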
class ICSBaggingNew(PoolGenerator):

    def __init__(self, K=10, alpha=0.75, base_classifier=None,
                 n_classifiers=100, combination_rule='majority_vote',
                 diversity_metric='e', max_samples=1.0, positive_label=1):
        self.K = K
        self.alpha = alpha
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.positive_label = positive_label

        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)
        self.diversity = Diversity(metric=diversity_metric)

        self.validation_X = None
        self.validation_y = None

    def set_validation(self, X, y):
        self.validation_X = X
        self.validation_y = y

    def fitness(self, classifier):
        '''
        #TODO normalize diversity metric.
        '''
        self.ensemble.add(classifier)

        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y

        auc = evaluation.auc_score(y_true, y_pred)
        div = self.diversity.calculate(self.ensemble,
                                       self.validation_X, y_true)

        self.ensemble.classifiers.pop()  # create interface for this later

        return self.alpha * auc + (1.0 - self.alpha) * div

    def _calc_pos_prob(self):
        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y

        # obtaining recall scores for each label
        # (assuming the labels are binary)
        pos_acc = recall_score(y_true, y_pred, average='binary',
                               pos_label=self.positive_label)
        neg_acc = recall_score(y_true, y_pred, average='binary',
                               pos_label=int(not self.positive_label))

        return neg_acc / (pos_acc + neg_acc)

    def bootstrap_classifiers(self, X, y, K, pos_prob):
        pos_idx = (y == self.positive_label)
        neg_idx = (y == int(not self.positive_label))

        X_pos, y_pos = X[pos_idx, :], y[pos_idx]  # positive examples
        X_neg, y_neg = X[neg_idx, :], y[neg_idx]  # negative examples

        classifiers = []
        for i in range(K):
            X_new = np.zeros(X.shape)
            y_new = np.zeros(y.shape)

            for j in range(X.shape[0]):
                if pos_prob > np.random.random():
                    # add a randomly chosen positive example
                    idx = np.random.randint(X_pos.shape[0])
                    X_new[j, :] = X_pos[idx, :]
                    y_new[j] = self.positive_label
                else:
                    # add a randomly chosen negative example
                    idx = np.random.randint(X_neg.shape[0])
                    X_new[j, :] = X_neg[idx, :]
                    y_new[j] = int(not self.positive_label)

            # if no positive example is present,
            # make sure you insert at least one
            if not np.any(y_new == self.positive_label):
                # chosen spot for replacement on the new array
                idx_new = np.random.randint(X_new.shape[0])
                # chosen positive example index
                idx_pos = np.random.randint(X_pos.shape[0])
                X_new[idx_new, :] = X_pos[idx_pos, :]
                y_new[idx_new] = self.positive_label

            # if no negative example is present,
            # make sure you insert at least one
            elif not np.any(y_new == int(not self.positive_label)):
                # chosen spot for replacement on the new array
                idx_new = np.random.randint(X_new.shape[0])
                # chosen negative example index
                idx_neg = np.random.randint(X_neg.shape[0])
                X_new[idx_new, :] = X_neg[idx_neg, :]
                y_new[idx_new] = int(not self.positive_label)

            # train classifier with the bootstrapped data
            clf = sklearn.base.clone(self.base_classifier)
            clf.fit(X_new, y_new)
            classifiers.append(clf)

        return classifiers

    def fit(self, X, y):
        # if self.validation_X == None and self.validation_y == None:
        self.validation_X = X
        self.validation_y = y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for i in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))

        self.validation_X = None
        self.validation_y = None

        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
class ICSBagging(PoolGenerator):

    def __init__(self, K=10, alpha=0.75, base_classifier=None,
                 n_classifiers=100, combination_rule='majority_vote',
                 diversity_metric='e', max_samples=1.0, positive_label=1):
        self.K = K
        self.alpha = alpha
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combination_rule = combination_rule
        self.positive_label = positive_label

        self.classifiers = None
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

        self.diversity_metric = diversity_metric
        self.diversity = Diversity(metric=diversity_metric)

        self.validation_X = None
        self.validation_y = None

    def set_validation(self, X, y):
        self.validation_X = X
        self.validation_y = y

    def fitness(self, classifier):
        '''
        #TODO normalize diversity metric.
        '''
        self.ensemble.add(classifier)

        out = self.ensemble.output(self.validation_X)
        y_pred = self.combiner.combine(out)
        y_true = self.validation_y

        auc = evaluation.auc_score(y_true, y_pred)
        div = self.diversity.calculate(self.ensemble, self.validation_X,
                                       self.validation_y)
        # diversity = entropy_measure_e(self.ensemble,
        #                               self.validation_X, self.validation_y)

        self.ensemble.classifiers.pop()

        return self.alpha * auc + (1.0 - self.alpha) * div

    def _calc_pos_prob(self):
        y_pred = self.combiner.combine(
            self.ensemble.output(self.validation_X))

        mask = self.positive_label == self.validation_y
        pos_acc = float(sum(y_pred[mask] == self.validation_y[mask])) \
            / len(self.validation_y[mask])
        neg_acc = float(sum(y_pred[~mask] == self.validation_y[~mask])) \
            / len(self.validation_y[~mask])

        return 1.0 - (pos_acc / (pos_acc + neg_acc))

    def bootstrap_classifiers(self, X, y, K, pos_prob):
        mask = self.positive_label == y
        negative_label = y[~mask][0]

        clfs = []
        sets_cX, sets_cy = [], []

        for i in range(K):
            cX, cy = [], []

            for j in range(X.shape[0]):
                if np.random.random() < pos_prob:
                    idx = np.random.randint(0, len(X[mask]))
                    cX = cX + [X[mask][idx]]
                    cy = cy + [self.positive_label]
                else:
                    idx = np.random.randint(0, len(X[~mask]))
                    cX = cX + [X[~mask][idx]]
                    cy = cy + [negative_label]

            # make sure at least one example of each class is present
            if self.positive_label not in cy:
                idx_1 = np.random.randint(0, len(cX))
                idx_2 = np.random.randint(0, len(X[mask]))
                cX[idx_1] = X[mask][idx_2]
                cy[idx_1] = self.positive_label
            elif negative_label not in cy:
                idx_1 = np.random.randint(0, len(cX))
                idx_2 = np.random.randint(0, len(X[~mask]))
                cX[idx_1] = X[~mask][idx_2]
                cy[idx_1] = negative_label

            sets_cX, sets_cy = sets_cX + [cX], sets_cy + [cy]

            clf = sklearn.base.clone(self.base_classifier)
            clfs = clfs + [clf.fit(cX, cy)]

        return clfs

    def fit(self, X, y):
        # if self.validation_X == None and self.validation_y == None:
        self.validation_X = X
        self.validation_y = y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for i in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))

        self.validation_X = None
        self.validation_y = None

        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
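A minimal usage sketch for ICSBagging above, assuming the class and its dependencies (evaluation.auc_score, Diversity) are importable from the surrounding module. The imbalanced toy data and decision-tree base learner are illustrative assumptions; positive_label must match one of the labels in y.

# Illustrative sketch: ICSBagging on binary, slightly imbalanced data.
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=10,
                           weights=[0.8, 0.2], random_state=0)

ics = ICSBagging(K=5, alpha=0.75,
                 base_classifier=DecisionTreeClassifier(),
                 n_classifiers=10, positive_label=1)
ics.fit(X, y)
y_pred = ics.predict(X)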
def test_init_mult_classifiers(self):
    c1 = MockClassifier()
    c2 = MockClassifier()
    c3 = MockClassifier()
    ens = Ensemble(classifiers=[c1, c2, c3])
    assert len(ens.classifiers) == 3
def test_len_with_empty_init(self):
    ens = Ensemble()
    assert len(ens) == 0
def test_len_with_mult_added(self):
    ens = Ensemble()
    ens.add(MockClassifier())
    ens.add(MockClassifier())
    ens.add(MockClassifier())
    assert len(ens) == 3
def test_add_empty_init(self):
    ens = Ensemble()
    c = MockClassifier()
    ens.add(c)
    assert ens.classifiers[0] is c
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools

import brew
from brew.base import Ensemble
from brew.combination.combiner import Combiner
from brew.stacking.stacker import EnsembleStack
from brew.stacking.stacker import EnsembleStackClassifier

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

layer_1 = [
    SVC(probability=True),
    RandomForestClassifier(n_estimators=100),
    ExtraTreesClassifier(n_estimators=100)
]

layer_2 = [SVC(probability=True), LogisticRegression(max_iter=500)]

stack = EnsembleStack(cv=10)  # number of folds per layer
stack.add_layer(Ensemble(layer_1))
stack.add_layer(Ensemble(layer_2))

clf = EnsembleStackClassifier(stack, Combiner('mean'))
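A short usage sketch continuing the stacking setup above. The toy dataset is an assumption, and it is assumed that EnsembleStackClassifier exposes the usual fit/predict interface like the other classifiers in this document.

# Illustrative sketch: fit and apply the stacked ensemble built above.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=10, random_state=0)

clf.fit(X, y)
y_pred = clf.predict(X)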
def test_empty_init(self):
    ens = Ensemble()
    assert ens.classifiers is not None
    assert len(ens.classifiers) == 0
my_data = genfromtxt('/Users/samarth/Desktop/data.csv', delimiter=',')

for item in range(0, my_data.shape[0]):
    var = my_data[item][4]
    my_data[item][4] = int(range_scaler(5538, 600000, 100, 1000, var))

'''
if my_data[item][6] < 100 or my_data[item][6] > 1000 or (my_data[item][6]>my_data[item][4]):
    my_data = np.delete(my_data, (item), axis = 0)
'''

my_data = my_data[np.logical_not(
    np.logical_and(my_data[:, 4] < 100, my_data[:, 4] > 1000))]
my_data = my_data[np.logical_not(my_data[:, 4] > my_data[:, 6])]

ensemble = Ensemble([clf1, clf2, clf3])
eclf = EnsembleClassifier(ensemble=ensemble, combiner=Combiner('mean'))

layer_1 = Ensemble([clf1, clf2, clf3])
layer_2 = Ensemble([sklearn.clone(clf1)])

stack = EnsembleStack(cv=3)
stack.add_layer(layer_1)
stack.add_layer(layer_2)

sclf = EnsembleStackClassifier(stack)

clf_list = [clf1, clf2, clf3, eclf, sclf]
lbl_list = [
    'Logistic Regression',
    'Random Forest',
    'RBF kernel SVM',
    'Ensemble',
def test_init_one_classifier(self):
    c = MockClassifier()
    ens = Ensemble(classifiers=[c])
    assert len(ens.classifiers) == 1