import numpy as np
import sklearn.base
from sklearn.metrics import recall_score

# Ensemble, Combiner, Diversity, PoolGenerator, BrewClassifier,
# FeatureSubsamplingTransformer, evaluation and smote are assumed to be
# importable from the surrounding brew package.


class Bagging(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote'):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.ensemble = Ensemble()
        for _ in range(self.n_classifiers):
            # bootstrap: draw n rows with replacement
            idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
            data, target = X[idx, :], y[idx]
            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            self.ensemble.add(classifier)
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
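# Usage sketch for Bagging (the dataset and base learner below are
# illustrative, not part of the library):
#
#     from sklearn.datasets import make_classification
#     from sklearn.model_selection import train_test_split
#     from sklearn.tree import DecisionTreeClassifier
#
#     X, y = make_classification(n_samples=500, random_state=0)
#     X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
#
#     bag = Bagging(base_classifier=DecisionTreeClassifier(),
#                   n_classifiers=25,
#                   combination_rule='majority_vote')
#     bag.fit(X_tr, y_tr)
#     print((bag.predict(X_te) == y_te).mean())  # accuracy on held-out data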
class RandomSubspace(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote', max_features=0.5):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combiner = Combiner(rule=combination_rule)
        self.classifiers = None
        self.ensemble = None
        self.max_features = max_features

    def fit(self, X, y):
        self.ensemble = Ensemble()
        for i in range(self.n_classifiers):
            # draw a random subset of the features for this ensemble member
            chosen_features = np.random.choice(
                X.shape[1],
                int(np.ceil(X.shape[1] * self.max_features)),
                replace=False)
            transformer = FeatureSubsamplingTransformer(
                features=chosen_features)
            classifier = BrewClassifier(
                classifier=sklearn.base.clone(self.base_classifier),
                transformer=transformer)
            classifier.fit(X, y)
            self.ensemble.add(classifier)
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
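# FeatureSubsamplingTransformer and BrewClassifier come from the surrounding
# package and are not shown here. The stand-in below (hypothetical name, not
# the package's API) illustrates the mechanism RandomSubspace relies on: the
# column subset chosen at fit time must be re-applied at predict time.


class FeatureSubsetWrapper:
    """Illustrative stand-in: train and predict on a fixed column subset."""

    def __init__(self, classifier, features):
        self.classifier = classifier
        self.features = features  # column indices chosen by RandomSubspace

    def fit(self, X, y):
        self.classifier.fit(X[:, self.features], y)
        return self

    def predict(self, X):
        return self.classifier.predict(X[:, self.features])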
def test_len_with_one_added(self):
    ens = Ensemble()
    ens.add(MockClassifier())
    assert len(ens) == 1
def test_add_empty_init(self):
    ens = Ensemble()
    c = MockClassifier()
    ens.add(c)
    assert ens.classifiers[0] is c
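# These Ensemble tests are methods of a test class and rely on a
# MockClassifier defined elsewhere in the test module. A minimal stand-in,
# assuming only that Ensemble stores the objects it is given, could be:


class MockClassifier:
    """Trivial stub: learns nothing, predicts a constant class."""

    def fit(self, X, y):
        return self

    def predict(self, X):
        return np.zeros(len(X))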
class ICSBagging(PoolGenerator):

    def __init__(self, K=10, alpha=0.75, base_classifier=None,
                 n_classifiers=100, combination_rule='majority_vote',
                 diversity_metric='e', max_samples=1.0, positive_label=1):
        self.K = K
        self.alpha = alpha
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combination_rule = combination_rule
        self.positive_label = positive_label
        self.classifiers = None
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)
        self.diversity_metric = diversity_metric
        self.diversity = Diversity(metric=diversity_metric)
        self.validation_X = None
        self.validation_y = None

    def set_validation(self, X, y):
        self.validation_X = X
        self.validation_y = y

    def fitness(self, classifier):
        # TODO: normalize the diversity metric.
        self.ensemble.add(classifier)
        out = self.ensemble.output(self.validation_X)
        y_pred = self.combiner.combine(out)
        y_true = self.validation_y
        auc = evaluation.auc_score(y_true, y_pred)
        div = self.diversity.calculate(self.ensemble, self.validation_X,
                                       self.validation_y)
        self.ensemble.classifiers.pop()
        return self.alpha * auc + (1.0 - self.alpha) * div

    def _calc_pos_prob(self):
        y_pred = self.combiner.combine(
            self.ensemble.output(self.validation_X))
        mask = self.positive_label == self.validation_y
        pos_acc = float(sum(y_pred[mask] == self.validation_y[mask])) / \
            len(self.validation_y[mask])
        neg_acc = float(sum(y_pred[~mask] == self.validation_y[~mask])) / \
            len(self.validation_y[~mask])
        return 1.0 - (pos_acc / (pos_acc + neg_acc))

    def bootstrap_classifiers(self, X, y, K, pos_prob):
        mask = self.positive_label == y
        negative_label = y[~mask][0]
        clfs = []
        sets_cX, sets_cy = [], []  # accumulated but not used further
        for i in range(K):
            cX, cy = [], []
            for j in range(X.shape[0]):
                if np.random.random() < pos_prob:
                    idx = np.random.randint(len(X[mask]))
                    cX = cX + [X[mask][idx]]
                    cy = cy + [self.positive_label]
                else:
                    idx = np.random.randint(len(X[~mask]))
                    cX = cX + [X[~mask][idx]]
                    cy = cy + [negative_label]
            # guarantee that both classes appear at least once
            if self.positive_label not in cy:
                idx_1 = np.random.randint(len(cX))
                idx_2 = np.random.randint(len(X[mask]))
                cX[idx_1] = X[mask][idx_2]
                cy[idx_1] = self.positive_label
            elif negative_label not in cy:
                idx_1 = np.random.randint(len(cX))
                idx_2 = np.random.randint(len(X[~mask]))
                cX[idx_1] = X[~mask][idx_2]
                cy[idx_1] = negative_label
            sets_cX, sets_cy = sets_cX + [cX], sets_cy + [cy]
            clf = sklearn.base.clone(self.base_classifier)
            clfs = clfs + [clf.fit(cX, cy)]
        return clfs

    def fit(self, X, y):
        self.validation_X = X
        self.validation_y = y
        self.classes_ = set(y)
        self.ensemble = Ensemble()
        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))
        for _ in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))
        self.validation_X = None
        self.validation_y = None
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
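# For intuition, _calc_pos_prob raises the sampling probability of the
# positive class as the ensemble's accuracy on it drops relative to the
# negative class. A quick check of the arithmetic (the numbers are made up):
#
#     pos_acc, neg_acc = 0.60, 0.90   # per-class validation accuracies
#     pos_prob = 1.0 - pos_acc / (pos_acc + neg_acc)
#     print(round(pos_prob, 2))       # 0.6: positives are over-sampled next round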
class ICSBaggingNew(PoolGenerator):

    def __init__(self, K=10, alpha=0.75, base_classifier=None,
                 n_classifiers=100, combination_rule='majority_vote',
                 diversity_metric='e', max_samples=1.0, positive_label=1):
        self.K = K
        self.alpha = alpha
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.positive_label = positive_label
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)
        self.diversity = Diversity(metric=diversity_metric)
        self.validation_X = None
        self.validation_y = None

    def set_validation(self, X, y):
        self.validation_X = X
        self.validation_y = y

    def fitness(self, classifier):
        # TODO: normalize the diversity metric.
        self.ensemble.add(classifier)
        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y
        auc = evaluation.auc_score(y_true, y_pred)
        div = self.diversity.calculate(self.ensemble, self.validation_X,
                                       y_true)
        self.ensemble.classifiers.pop()  # create an interface for this later
        return self.alpha * auc + (1.0 - self.alpha) * div

    def _calc_pos_prob(self):
        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y
        # recall of each label (the labels are assumed to be binary 0/1)
        pos_acc = recall_score(y_true, y_pred, average='binary',
                               pos_label=self.positive_label)
        neg_acc = recall_score(y_true, y_pred, average='binary',
                               pos_label=int(not self.positive_label))
        return neg_acc / (pos_acc + neg_acc)

    def bootstrap_classifiers(self, X, y, K, pos_prob):
        pos_idx = (y == self.positive_label)
        neg_idx = (y == int(not self.positive_label))
        X_pos, y_pos = X[pos_idx, :], y[pos_idx]  # positive examples
        X_neg, y_neg = X[neg_idx, :], y[neg_idx]  # negative examples

        classifiers = []
        for i in range(K):
            X_new = np.zeros(X.shape)
            y_new = np.zeros(y.shape)
            for j in range(X.shape[0]):
                if pos_prob > np.random.random():
                    # add a randomly chosen positive example
                    idx = np.random.randint(X_pos.shape[0])
                    X_new[j, :] = X_pos[idx, :]
                    y_new[j] = self.positive_label
                else:
                    # add a randomly chosen negative example
                    idx = np.random.randint(X_neg.shape[0])
                    X_new[j, :] = X_neg[idx, :]
                    y_new[j] = int(not self.positive_label)

            # if no positive example is present, insert at least one
            if not np.any(y_new == self.positive_label):
                idx_new = np.random.randint(X_new.shape[0])  # spot to replace
                idx_pos = np.random.randint(X_pos.shape[0])  # positive example index
                X_new[idx_new, :] = X_pos[idx_pos, :]
                y_new[idx_new] = self.positive_label
            # if no negative example is present, insert at least one
            elif not np.any(y_new == int(not self.positive_label)):
                idx_new = np.random.randint(X_new.shape[0])  # spot to replace
                idx_neg = np.random.randint(X_neg.shape[0])  # negative example index
                X_new[idx_new, :] = X_neg[idx_neg, :]
                y_new[idx_new] = int(not self.positive_label)

            # train a classifier on the bootstrapped data
            clf = sklearn.base.clone(self.base_classifier)
            clf.fit(X_new, y_new)
            classifiers.append(clf)
        return classifiers

    def fit(self, X, y):
        self.validation_X = X
        self.validation_y = y
        self.classes_ = set(y)
        self.ensemble = Ensemble()
        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))
        for i in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))
        self.validation_X = None
        self.validation_y = None
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
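# Usage sketch for ICSBaggingNew on an imbalanced problem (illustrative data
# and base learner; it assumes the brew-internal auc_score and diversity
# metric used by fitness are available):
#
#     from sklearn.datasets import make_classification
#     from sklearn.tree import DecisionTreeClassifier
#
#     X, y = make_classification(n_samples=400, weights=[0.9, 0.1],
#                                random_state=0)
#     ics = ICSBaggingNew(K=10, alpha=0.75,
#                         base_classifier=DecisionTreeClassifier(max_depth=3),
#                         n_classifiers=10, positive_label=1)
#     ics.fit(X, y)                    # fit also uses (X, y) as validation set
#     y_pred = ics.predict(X)
#     print(recall_score(y, y_pred, pos_label=1))  # minority-class recall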
def test_len_with_mult_added(self):
    ens = Ensemble()
    ens.add(MockClassifier())
    ens.add(MockClassifier())
    ens.add(MockClassifier())
    assert len(ens) == 3
class SmoteBagging(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote', k=2):
        self.k = k
        self.n_classifiers = n_classifiers
        self.base_classifier = base_classifier
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def smote_bootstrap_sample(self, X, y, b, k):
        classes = np.unique(y)
        count = np.bincount(y)           # number of instances of each class
        majority_class = count.argmax()  # majority class label
        majority_count = count.max()     # majority class size

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))
        for i in classes:
            class_data = X[(y == i), :]
            if i == majority_class:
                # majority class: regular bootstrap (100% sampling rate)
                idx = np.random.choice(majority_count, (majority_count,))
                data = np.concatenate((data, class_data[idx, :]))
                target = np.concatenate((target,
                                         i * np.ones((majority_count,))))
            else:
                # minority classes: bootstrap with the defined sampling rate
                sample_rate = (majority_count / class_data.shape[0]) * (b / 100)
                idx = np.random.choice(
                    class_data.shape[0],
                    (int(sample_rate * class_data.shape[0]),))
                sampled_class_data = class_data[idx, :]

                # run SMOTE on the bootstrapped data to obtain synthetic
                # samples; ceil makes N_smote a multiple of 100 and the small
                # constant avoids a zero
                N_smote = int(
                    np.ceil((majority_count / sampled_class_data.shape[0]) *
                            (1 - b / 100 + 10e-8)) * 100)
                synthetic = smote(sampled_class_data, N=N_smote, k=self.k)

                # add synthetic samples to the sampled class data
                n_missing = majority_count - sampled_class_data.shape[0]
                idx = np.random.choice(synthetic.shape[0], (n_missing,))
                new_class_data = np.concatenate((sampled_class_data,
                                                 synthetic[idx, :]))
                data = np.concatenate((data, new_class_data))
                target = np.concatenate(
                    (target, i * np.ones((new_class_data.shape[0],))))
        return data, target

    def fit(self, X, y):
        self.ensemble = Ensemble()
        # b cycles through [10, 100] in increments of 10, one value per
        # classifier in the ensemble
        b = 10
        for i in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(X, y, b=float(b),
                                                       k=self.k)
            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            self.ensemble.add(classifier)
            if b >= 100:
                b = 10
            else:
                b += 10
        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
class SmoteBaggingNew(SmoteBagging):

    def fit(self, X, y):
        self.ensemble = Ensemble()
        # b cycles through [10, 100] in increments of 10, one value per
        # classifier in the ensemble
        b = 10
        for i in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(X, y, b=float(b),
                                                       k=self.k)
            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            self.ensemble.add(classifier)
            if b >= 100:
                b = 10
            else:
                b += 10
        return self

    def smote_bootstrap_sample(self, X, y, b, k):
        count = np.bincount(y)           # number of instances of each class
        majority_class = count.argmax()  # majority class label
        majority_count = count.max()     # majority class size

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        # regular bootstrap of the majority class
        class_data = X[(y == majority_class), :]
        idx = np.random.choice(majority_count, (majority_count,))
        data = np.concatenate((data, class_data[idx, :]))
        target = np.concatenate(
            (target, majority_class * np.ones((majority_count,))))

        minority_class = count.argmin()
        minority_count = count.min()

        # split the majority_count quota between resampled and synthetic
        # minority examples
        N_syn = int(majority_count * (b / 100))
        N_res = majority_count - N_syn
        N_syn, N_res = N_res, N_syn

        class_data = X[(y == minority_class), :]
        idx = np.random.choice(class_data.shape[0], (N_res,))
        sampled_min_data = class_data[idx, :]

        if N_syn > 0:
            N_smote = np.ceil(N_syn / minority_count) * 100
            N_smote = 100 if N_smote < 100 else int(N_smote - N_smote % 100)
            synthetic = smote(X[y == minority_class], N=int(N_smote),
                              k=self.k)
            idx = np.random.choice(synthetic.shape[0], (N_syn,))
            new_class_data = np.concatenate((sampled_min_data,
                                             synthetic[idx, :]))
            data = np.concatenate((data, new_class_data))
            target = np.concatenate(
                (target,
                 minority_class * np.ones((new_class_data.shape[0],))))
        else:
            data = np.concatenate((data, sampled_min_data))
            target = np.concatenate(
                (target,
                 minority_class * np.ones((sampled_min_data.shape[0],))))
        return data, target
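# The smote() helper used above is not shown here. The sketch below
# illustrates the interface these classes appear to assume (Chawla et al.'s
# SMOTE, with N given as a percentage in multiples of 100); the name
# smote_sketch and the implementation details are illustrative, not the
# package's actual code.

from sklearn.neighbors import NearestNeighbors


def smote_sketch(T, N=100, k=5):
    # T: (n_samples, n_features) minority-class data
    # N: oversampling amount in percent; N/100 synthetic points per sample
    # k: number of nearest neighbours used for interpolation
    T = np.asarray(T, dtype=float)
    per_sample = int(N // 100)
    nn = NearestNeighbors(n_neighbors=min(k + 1, len(T))).fit(T)
    _, neighbours = nn.kneighbors(T)  # first neighbour is the point itself
    synthetic = []
    for i, x in enumerate(T):
        for _ in range(per_sample):
            choices = neighbours[i][1:]
            j = np.random.choice(choices) if len(choices) else i
            gap = np.random.random()
            synthetic.append(x + gap * (T[j] - x))  # interpolate towards neighbour
    return np.array(synthetic)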