Beispiel #1
0
    def select(self, ensemble, x):
        """Return the single classifier with the best local class score.

        A classifier's score is computed over the ``self.K`` nearest
        validation samples of ``x``: among the neighbors it assigns to the
        class it predicts for ``x``, the fraction it labels correctly.

        Returns a tuple ``(Ensemble, None)`` holding one classifier.
        """
        # a unanimous pool needs no selection; the first member is returned
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        # indices of the K nearest validation samples
        [nn_idx] = self.knn.kneighbors(x,
                                       n_neighbors=self.K,
                                       return_distance=False)
        nn_X = self.Xval[nn_idx]
        nn_y = self.yval[nn_idx]

        # nn_preds[sample, classifier] = label given to that neighbor
        nn_preds = np.zeros((nn_X.shape[0], len(ensemble)))
        for col, member in enumerate(ensemble.classifiers):
            nn_preds[:, col] = member.predict(nn_X)

        # label each classifier gives to the query itself
        query_preds = np.asarray([
            ensemble.classifiers[j].predict(x) for j in range(len(ensemble))
        ]).flatten()

        scores = np.zeros(len(ensemble))
        for j in range(nn_preds.shape[1]):
            column = nn_preds[:, j]
            # neighbors given the same class as 'x'
            same_class = (column == query_preds[j])
            # ... that were also classified correctly
            correct_same_class = (column == nn_y) * same_class
            # the tiny constant keeps the denominator non-zero
            scores[j] = float(
                sum(correct_same_class)) / (sum(same_class) + 10e-24)

        winner = np.argmax(scores)
        return Ensemble([ensemble.classifiers[winner]]), None
Beispiel #2
0
    def select(self, ensemble, x):
        """Select the locally best classifier if it is clearly the best.

        The nearest validation samples of ``x`` are filtered down to those
        on which the pool's label outputs agree with its outputs for ``x``
        in more than ``self.similarity_threshold`` of the classifiers (all
        neighbors are kept if none qualifies). Each classifier is scored on
        that subset. If the best score beats the runner-up by at least
        ``self.significance_threshold`` only the best classifier is
        returned, otherwise the whole pool is.

        Returns a tuple ``(Ensemble, None)``.
        """
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        # label output of every classifier for the query sample
        mcb_x = ensemble.output(x, mode='labels')[0, :]

        # indexes of the nearest validation neighbors of x
        [idx] = self.knn.kneighbors(x, return_distance=False)
        X, y = self.Xval[idx], self.yval[idx]
        mcb_v = ensemble.output(X, mode='labels')

        # keep the neighbors whose pool output is similar enough to x's
        similar = [
            i for i in range(X.shape[0])
            if np.mean(mcb_x == mcb_v[i, :]) > self.similarity_threshold
        ]
        if len(similar) == 0:
            similar = np.arange(X.shape[0])

        scores = np.array(
            [clf.score(X[similar], y[similar])
             for clf in ensemble.classifiers])

        best_i = np.argmax(scores)
        other_scores = scores[np.arange(len(scores)) != best_i]
        if other_scores.size == 0:
            # only one classifier in the pool: nothing to compare against
            return Ensemble(classifiers=[ensemble.classifiers[best_i]]), None

        # if the best classifier is significantly better than the
        # runner-up, use it alone
        runner_up_score = np.max(other_scores)
        if scores[best_i] - runner_up_score >= self.significance_threshold:
            best_classifier = ensemble.classifiers[best_i]
            return Ensemble(classifiers=[best_classifier]), None

        return Ensemble(classifiers=ensemble.classifiers), None
Beispiel #3
0
class BaggingSK(PoolGenerator):
    """Bagging pool generator that delegates training to BaggingClassifier.

    This class should not be used, use brew.generation.bagging.Bagging instead.
    """

    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote'):
        """Store the configuration and build the underlying bagging model.

        Parameters
        ----------
        base_classifier : estimator passed on to BaggingClassifier.
        n_classifiers : number of estimators to train.
        combination_rule : rule name handed to ``Combiner``.
        """
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        # NOTE(review): newer scikit-learn releases renamed `base_estimator`
        # to `estimator` -- confirm against the pinned version.
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                            n_estimators=n_classifiers,
                                            max_samples=1.0,
                                            max_features=1.0)

        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        """Fit the bagging model and expose its estimators as the ensemble."""
        self.sk_bagging.fit(X, y)
        # start from an empty pool so that calling fit again does not keep
        # the classifiers of a previous fit
        self.ensemble = Ensemble()
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)

    def predict(self, X):
        """Combine the ensemble outputs for ``X`` with the combiner."""
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
    def fit(self, X, y):
        """Train ``self.n_classifiers`` clones of the base classifier.

        Each clone is fitted on a sample drawn by
        ``self.smote_bootstrap_sample`` and added to a fresh ensemble.
        Returns ``None``.
        """
        self.ensemble = Ensemble()

        # b runs through 10, 20, ..., 100 and then wraps back to 10,
        # advancing once per trained classifier
        b = 10
        for _ in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(
                X, y, b=float(b), k=self.k)

            model = sklearn.base.clone(self.base_classifier)
            model.fit(data, target)
            self.ensemble.add(model)

            b = 10 if b >= 100 else b + 10
Beispiel #5
0
class Bagging(PoolGenerator):
    """Pool generator that trains each classifier on a bootstrap sample."""

    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote'):
        """Store the base classifier, pool size and combination rule."""
        self.combiner = Combiner(rule=combination_rule)
        self.ensemble = None  # created by fit()
        self.n_classifiers = n_classifiers
        self.base_classifier = base_classifier

    def fit(self, X, y):
        """Fit ``n_classifiers`` clones, each on rows drawn with replacement.

        Returns ``None``.
        """
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            # bootstrap: as many row indices as there are rows
            rows = np.random.choice(X.shape[0], X.shape[0], replace=True)
            sample_X, sample_y = X[rows, :], y[rows]

            model = sklearn.base.clone(self.base_classifier)
            model.fit(sample_X, sample_y)
            self.ensemble.add(model)

    def predict(self, X):
        """Combine the ensemble outputs for ``X`` with the combiner."""
        return self.combiner.combine(self.ensemble.output(X))
Beispiel #6
0
    def select(self, ensemble, x):
        """Select one classifier by local class score, breaking ties by vote.

        Every classifier is scored on the nearest validation samples of
        ``x``: among the neighbors it assigns to the class it predicts for
        ``x``, the fraction it labels correctly. Classifiers are grouped by
        score and the groups are visited from the best score downwards.
        Inside a group the predictions for ``x`` are counted; as soon as one
        class wins the vote alone, a classifier of that group predicting
        that class is returned. Classes tied in the first undecided group
        restrict the vote in the following groups. If no group decides, the
        classifier with the highest score is returned.

        Returns a tuple ``(Ensemble, None)`` holding one classifier.
        """
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        # obtain the nearest neighbors in the validation set
        [idx] = self.knn.kneighbors(x, return_distance=False)
        neighbors_X = self.Xval[idx]  # k neighbors
        neighbors_y = self.yval[idx]  # k neighbors target

        # pool_output (sample, classifier_output)
        pool_output = np.zeros((neighbors_X.shape[0], len(ensemble)))
        for i, clf in enumerate(ensemble.classifiers):
            pool_output[:, i] = clf.predict(neighbors_X)

        x_outputs = [
            ensemble.classifiers[j].predict(x) for j in range(len(ensemble))
        ]
        x_outputs = np.asarray(x_outputs).flatten()

        # groups[score] = indexes of the classifiers with that score
        groups = {}
        scores = np.zeros(len(ensemble))
        for j in range(pool_output.shape[1]):
            # get correctly classified samples
            mask_classified_correctly = pool_output[:, j] == neighbors_y
            # get classified samples with the same class as 'x'
            mask_classified_same_class = (pool_output[:, j] == x_outputs[j])
            # get correctly classified samples with the same class as 'x'
            mask = mask_classified_correctly * mask_classified_same_class
            # calculate score (the constant keeps the denominator non-zero)
            scores[j] = float(
                sum(mask)) / (sum(mask_classified_same_class) + 10e-24)
            groups.setdefault(float(scores[j]), []).append(j)

        options = None
        for score in sorted(groups, reverse=True):
            members = groups[score]
            pred = np.asarray([x_outputs[i] for i in members]).flatten()

            bincount = np.bincount(pred.astype(int))
            if options is not None:
                # only classes still tied from the first undecided group
                # may receive votes
                for i in range(len(bincount)):
                    bincount[i] = bincount[i] if i in options else 0

            imx = np.argmax(bincount)
            votes = np.argwhere(bincount == bincount[imx]).flatten()
            if len(votes) == 1:
                # `pred` is indexed by position inside the group, so map
                # that position back to the classifier's index in the pool
                winner = members[int(np.argmax(pred == imx))]
                return Ensemble([ensemble.classifiers[winner]]), None
            elif options is None:
                options = votes

        return Ensemble([ensemble.classifiers[np.argmax(scores)]]), None
Beispiel #7
0
    def select(self, ensemble, x):
        """Return the classifier that scores best on the neighbors of ``x``.

        The neighbors are looked up with ``self.knn`` in the validation
        data (``self.Xval`` / ``self.yval``) and every classifier is rated
        with its own ``score`` method on them.

        Returns a tuple ``(Ensemble, None)`` holding one classifier.
        """
        # a unanimous pool needs no selection
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        pool = ensemble.classifiers

        # indexes of the nearest validation neighbors of x
        [nn_idx] = self.knn.kneighbors(x, return_distance=False)
        nn_X = self.Xval[nn_idx]
        nn_y = self.yval[nn_idx]

        local_scores = np.asarray(
            [member.score(nn_X, nn_y) for member in pool])
        winner = np.argmax(local_scores)

        return Ensemble([pool[winner]]), None
Beispiel #8
0
    def select(self, ensemble, x):
        """Select the classifiers that are correct on the neighbors of ``x``.

        The neighborhood is shrunk from ``self.K`` down to 1 until
        ``_get_pool_mask`` reports a mask for "all neighbors correct". When
        none is found and ``self.v2007`` is set, the neighborhood is grown
        again from 2 to ``self.K`` looking for "any neighbor correct". If
        there is still no mask, or the mask selects nothing, the result of
        ``self._get_best_classifiers`` is used instead.

        Returns a tuple ``(Ensemble, None)``; no weights are produced.
        """
        ensemble_mask = None

        neighbors_X, neighbors_y = self.get_neighbors(x)
        pool_output = ensemble.output(neighbors_X, mode='labels')

        # gradually decrease neighborhood size if no
        # classifier predicts ALL the neighbors correctly
        for i in range(self.K, 0, -1):
            pool_mask = _get_pool_mask(pool_output[:i], neighbors_y[:i],
                                       np.all)

            # if at least one classifier gets all neighbors right
            if pool_mask is not None:
                ensemble_mask = pool_mask
                break

        # if NO classifiers get the nearest neighbor correctly
        if ensemble_mask is None and self.v2007:
            # Increase neighborhood until one classifier
            # gets at least ONE (i.e. ANY) neighbors correctly.
            # Starts with 2 because mask_all with k=1 is
            # the same as mask_any with k=1
            for i in range(2, self.K + 1):
                pool_mask = _get_pool_mask(pool_output[:i],
                                           neighbors_y[:i], np.any)

                if pool_mask is not None:
                    ensemble_mask = pool_mask
                    break

        if ensemble_mask is None:
            # no mask at all: do not hand None to np.where, which would
            # call nonzero on a 0-d array
            selected_idx = np.array([], dtype=int)
        else:
            [selected_idx] = np.where(ensemble_mask)

        if selected_idx.size > 0:
            pool = Ensemble(
                classifiers=[ensemble.classifiers[i] for i in selected_idx])

        else:  # nothing selected: fall back to the best classifiers
            classifiers = self._get_best_classifiers(ensemble, neighbors_X,
                                                     neighbors_y, x)
            pool = Ensemble(classifiers=classifiers)

        # KNORA-ELIMINATE-W that supposedly uses weights, does not make
        # any sense, so even if self.weighted is True, always return
        # None for the weights

        return pool, None
Beispiel #9
0
    def fit(self, X, y):
        """Fit ``self.n_classifiers`` clones, each on a bootstrap sample.

        A fresh ensemble is created; every clone of the base classifier is
        trained on rows of ``X`` / ``y`` drawn with replacement.
        Returns ``None``.
        """
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            # bootstrap: as many row indices as there are rows
            rows = np.random.choice(X.shape[0], X.shape[0], replace=True)
            sample_X, sample_y = X[rows, :], y[rows]

            model = sklearn.base.clone(self.base_classifier)
            model.fit(sample_X, sample_y)
            self.ensemble.add(model)
Beispiel #10
0
    def select(self, ensemble, x):
        """Select every classifier that gets at least one neighbor right.

        Returns a tuple ``(pool, votes)``. ``votes`` holds, per selected
        classifier, the number of neighbors it labels correctly, or (when
        ``self.weighted`` is set) the sum of inverse Euclidean distances to
        those neighbors. When no classifier is selected, the original
        ensemble is returned with ``None`` votes.
        """
        nn_X, nn_y = self.get_neighbors(x)
        labels = ensemble.output(nn_X, mode='labels')

        # hits[sample, classifier] is True where the label is correct
        hits = (labels == nn_y[:, np.newaxis])

        [chosen] = np.where(np.any(hits, axis=0))

        # if no classifiers are selected,
        # use all classifiers with no weights
        if chosen.size == 0:
            return ensemble, None

        if self.weighted:
            distances = np.sqrt(np.sum((x - nn_X)**2, axis=1))
            # the small constant avoids dividing by a zero distance
            inverse_distances = 1.0 / (distances + 10e-8)
            votes = np.dot(inverse_distances, hits[:, chosen])
        else:
            votes = np.sum(hits[:, chosen], axis=0)

        members = [ensemble.classifiers[i] for i in chosen]
        return Ensemble(classifiers=members), votes
Beispiel #11
0
    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote'):
        """Store the configuration and build the bagging model.

        ``base_classifier`` and ``n_classifiers`` are forwarded to
        ``BaggingClassifier``; ``combination_rule`` configures the
        ``Combiner``.
        """
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        bagging_options = dict(base_estimator=base_classifier,
                               n_estimators=n_classifiers,
                               max_samples=1.0,
                               max_features=1.0)
        self.sk_bagging = BaggingClassifier(**bagging_options)

        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)
    def fit(self, X, y):
        """Fit ``self.n_classifiers`` classifiers on random feature subsets.

        For every classifier, ``ceil(n_features * self.max_features)``
        column indices are drawn without replacement and wrapped in a
        ``FeatureSubsamplingTransformer``; a clone of the base classifier
        is combined with it in a ``BrewClassifier`` and fitted on the full
        ``X`` / ``y``. Returns ``None``.
        """
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            subset_size = int(np.ceil(X.shape[1] * self.max_features))
            columns = np.random.choice(X.shape[1], subset_size,
                                       replace=False)
            subsampler = FeatureSubsamplingTransformer(features=columns)

            base = sklearn.base.clone(self.base_classifier)
            member = BrewClassifier(classifier=base, transformer=subsampler)
            member.fit(X, y)

            self.ensemble.add(member)
Beispiel #13
0
    def fit(self, X, y):
        """Build the ensemble one classifier at a time.

        The training data is used as validation data while fitting. The
        first member is picked at random from ``self.K`` candidates
        bootstrapped with a positive probability of 0.5; each later member
        is the candidate with the highest ``self.fitness`` among ``self.K``
        candidates bootstrapped with ``self._calc_pos_prob()``.

        Returns ``self``.
        """
        # the training set doubles as the validation set during fitting
        self.validation_X, self.validation_y = X, y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        first_round = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(first_round))

        for _ in range(self.n_classifiers - 1):
            candidates = self.bootstrap_classifiers(X, y, self.K,
                                                    self._calc_pos_prob())
            self.ensemble.add(max(candidates, key=self.fitness))

        # drop the references to the validation data once done
        self.validation_X = None
        self.validation_y = None

        return self
Beispiel #14
0
    def select(self, ensemble, x):
        """Select one classifier by local score, breaking ties by vote.

        Every classifier is rated with its ``score`` method on the nearest
        validation samples of ``x``. A unique best classifier is returned
        directly. Otherwise the classifiers are grouped by score and the
        groups are visited from the best score downwards: inside a group
        the predictions for ``x`` are counted, and as soon as one class
        wins the vote alone, a classifier of that group predicting that
        class is returned. Classes tied in the first undecided group
        restrict the vote in the following groups. If no group decides, the
        first classifier with the highest score is returned.

        Returns a tuple ``(Ensemble, None)`` holding one classifier.
        """
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        # indexes of the nearest validation neighbors of x
        classifiers = ensemble.classifiers
        [idx] = self.knn.kneighbors(x, return_distance=False)
        X, y = self.Xval[idx], self.yval[idx]

        # d[score] = indexes of the classifiers with that score
        d = {}
        scores = [clf.score(X, y) for clf in classifiers]
        for i, scr in enumerate(scores):
            d.setdefault(scr, []).append(i)
        best_scores = sorted(d, reverse=True)

        # if there was a single best classifier, return it
        if len(d[best_scores[0]]) == 1:
            i = d[best_scores[0]][0]
            return Ensemble([classifiers[i]]), None

        options = None
        for score in best_scores:
            members = d[score]
            pred = [classifiers[index].predict(x) for index in members]
            pred = np.asarray(pred).flatten()

            bincount = np.bincount(pred.astype(int))
            if options is not None:
                # only classes still tied from the first undecided group
                # may receive votes
                for i in range(len(bincount)):
                    bincount[i] = bincount[i] if i in options else 0

            imx = np.argmax(bincount)
            votes = np.argwhere(bincount == bincount[imx]).flatten()
            if len(votes) == 1:
                # `pred` is indexed by position inside the group, so map
                # that position back to the classifier's index in the pool
                winner = members[int(np.argmax(pred == imx))]
                return Ensemble([classifiers[winner]]), None
            elif options is None:
                options = votes

        return Ensemble([classifiers[np.argmax(scores)]]), None
Beispiel #15
0
    def select(self, ensemble, x):
        """Select classifiers by local accuracy, then by diversity.

        On the nearest validation samples of ``x`` every classifier gets an
        accuracy score (its ``score`` method) and a diversity score (mean
        ``kuncheva_double_fault_measure`` against every other classifier).
        The ``n_1`` most accurate classifiers are kept, and of those the
        ``n_2`` with the lowest diversity score are returned. ``self.n_1``
        and ``self.n_2`` may be floats, in which case they are fractions of
        the pool size; at least one classifier is always kept at each step.

        Returns a tuple ``(Ensemble, None)``.
        """
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        n_sel_1, n_sel_2 = self.n_1, self.n_2
        if isinstance(self.n_1, float):
            n_sel_1 = int(n_sel_1 * len(ensemble))

        if isinstance(self.n_2, float):
            n_sel_2 = int(n_sel_2 * len(ensemble))

        n_sel_1 = max(n_sel_1, 1)
        n_sel_2 = max(n_sel_2, 1)

        # indexes of the nearest validation neighbors of x
        classifiers = ensemble.classifiers
        [idx] = self.knn.kneighbors(x, return_distance=False)
        X, y = self.Xval[idx], self.yval[idx]

        acc_scores = np.array([clf.score(X, y) for clf in classifiers])

        # oracle[sample, classifier] is True where the label is correct
        out = ensemble.output(X, mode='labels')
        oracle = np.equal(out, y[:, np.newaxis])
        div_scores = np.zeros(len(ensemble), dtype=float)

        for i in range(len(ensemble)):
            pairwise = [
                kuncheva_double_fault_measure(oracle[:, [i, j]])
                for j in range(len(ensemble)) if i != j
            ]
            div_scores[i] = np.mean(pairwise)

        # entries are (classifier index, accuracy, diversity score)
        ranked = zip(np.arange(len(ensemble)), acc_scores, div_scores)
        ranked = sorted(ranked, key=lambda e: e[1], reverse=True)[:n_sel_1]
        ranked = sorted(ranked, key=lambda e: e[2], reverse=False)[:n_sel_2]
        # zip() cannot be subscripted on Python 3, so pull the indexes out
        # of the entries directly
        selected = [entry[0] for entry in ranked]

        classifiers = [classifiers[i] for i in selected]
        return Ensemble(classifiers=classifiers), None
class RandomSubspace(PoolGenerator):
    """Pool generator that trains each classifier on a random feature subset."""

    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote',
                 max_features=0.5):
        """Store the configuration; the ensemble is created by ``fit``.

        ``max_features`` is the fraction of the columns of ``X`` given to
        each classifier.
        """
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.max_features = max_features
        self.combiner = Combiner(rule=combination_rule)
        self.classifiers = None
        self.ensemble = None

    def fit(self, X, y):
        """Fit ``n_classifiers`` classifiers, each on its own column subset.

        For every classifier, ``ceil(n_features * max_features)`` column
        indices are drawn without replacement and wrapped in a
        ``FeatureSubsamplingTransformer``; a clone of the base classifier
        is combined with it in a ``BrewClassifier`` and fitted on the full
        ``X`` / ``y``. Returns ``None``.
        """
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            subset_size = int(np.ceil(X.shape[1] * self.max_features))
            columns = np.random.choice(X.shape[1], subset_size,
                                       replace=False)
            subsampler = FeatureSubsamplingTransformer(features=columns)

            base = sklearn.base.clone(self.base_classifier)
            member = BrewClassifier(classifier=base, transformer=subsampler)
            member.fit(X, y)

            self.ensemble.add(member)

    def predict(self, X):
        """Combine the ensemble outputs for ``X`` with the combiner."""
        return self.combiner.combine(self.ensemble.output(X))
Beispiel #17
0
    def select(self, ensemble, x):
        """Select one classifier from its ``self.probabilities`` value.

        Classifiers whose probability exceeds 0.5 are candidates; if there
        are none, the classifier with the highest probability is the only
        candidate. When the best candidate leads every other candidate by
        at least ``self.threshold`` it is selected; otherwise one of the
        candidates within ``self.threshold`` of the best (the best
        included) is drawn at random.

        Returns a tuple ``(Ensemble, None)`` holding one classifier.
        """
        nn_X, nn_y, dists = self.get_neighbors(x, return_distance=True)

        candidate_idx, candidate_prob = [], []
        all_probs = np.zeros(len(ensemble))
        for position, clf in enumerate(ensemble.classifiers):
            prob = self.probabilities(clf, nn_X, nn_y, dists, x)
            if prob > 0.5:
                candidate_idx.append(position)
                candidate_prob.append(prob)
            all_probs[position] = prob

        # nobody passed the 0.5 bar: fall back to the overall best
        if not candidate_prob:
            candidate_prob = [np.max(all_probs)]
            candidate_idx = [np.argmax(all_probs)]

        best_prob = max(candidate_prob)
        best_pos = np.argmax(candidate_prob)

        # gap between the best candidate and each candidate
        gaps = [best_prob - prob for prob in candidate_prob]
        clear_winner = not any(
            pos != best_pos and gap < self.threshold
            for pos, gap in enumerate(gaps))

        if clear_winner:
            chosen = ensemble.classifiers[candidate_idx[best_pos]]
        else:
            close_enough = np.array(np.array(gaps) < self.threshold,
                                    dtype=bool)
            pick = np.random.choice(np.asarray(candidate_idx)[close_enough])
            chosen = ensemble.classifiers[pick]

        return Ensemble([chosen]), None
class SmoteBaggingNew(SmoteBagging):
    """Variant of SmoteBagging with its own two-class resampling scheme."""

    def fit(self, X, y):
        """Train ``self.n_classifiers`` clones of the base classifier.

        Each clone is fitted on a sample drawn by
        ``self.smote_bootstrap_sample`` and added to a fresh ensemble.
        Returns ``None``.
        """
        self.ensemble = Ensemble()

        # this parameter should change between [10, 100] with
        # increments of 10, for every classifier in the ensemble
        b = 10

        for _ in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(
                X, y, b=float(b), k=self.k)

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

            b = 10 if b >= 100 else b + 10

        return

    def smote_bootstrap_sample(self, X, y, b, k):
        """Draw a sample with equally many majority and minority rows.

        The majority class is bootstrapped at its original size. The
        minority part has the same size: ``int(majority_count * b / 100)``
        rows are resampled from the real minority data and the remainder is
        drawn from synthetic samples produced by ``smote``.

        Parameters
        ----------
        X : 2-d array of samples.
        y : array of non-negative integer labels (required by
            ``np.bincount``).
        b : percentage of the minority part taken from real data.
        k : forwarded to ``smote`` as its ``k`` argument.

        Returns
        -------
        (data, target) : the resampled rows and their labels.
        """
        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()
        majority_count = count.max()

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        # regular bootstrap of the majority class
        class_data = X[(y == majority_class), :]
        idx = np.random.choice(majority_count, (majority_count,))
        data = np.concatenate((data, class_data[idx, :]))
        target = np.concatenate(
            (target, majority_class * np.ones((majority_count,))))

        # only labels that actually occur can be the minority class;
        # bincount also reports a zero count for unused label values
        present = np.flatnonzero(count)
        minority_class = present[np.argmin(count[present])]
        minority_count = count[minority_class]

        # N_res rows are resampled from the real minority data, N_syn rows
        # come from synthetic samples
        N_res = int((majority_count) * (b / 100))
        N_syn = majority_count - N_res

        class_data = X[(y == minority_class), :]
        idx = np.random.choice(class_data.shape[0], (N_res,))
        sampled_min_data = class_data[idx, :]

        if N_syn > 0:
            # amount handed to smote: a multiple of 100, at least 100
            N_smote = np.ceil(N_syn / minority_count) * 100
            N_smote = 100 if N_smote < 100 else int(N_smote - N_smote % 100)
            # honor the `k` argument instead of silently using self.k
            synthetic = smote(X[y == minority_class], N=int(N_smote), k=k)

            idx = np.random.choice(synthetic.shape[0], (N_syn,))
            new_class_data = np.concatenate(
                (sampled_min_data, synthetic[idx, :]))
            data = np.concatenate((data, new_class_data))
            target = np.concatenate(
                (target, minority_class * np.ones((new_class_data.shape[0],))))
        else:
            data = np.concatenate((data, sampled_min_data))
            target = np.concatenate(
                (target, minority_class * np.ones((sampled_min_data.shape[0],))))  # noqa

        return data, target
Beispiel #19
0
class ICSBagging(PoolGenerator):
    """Pool generator that grows the ensemble one classifier at a time.

    In every round ``K`` candidates are trained on class-rebalanced
    bootstrap samples and the one with the best ``fitness`` (a mix of the
    ensemble's AUC score and diversity on the validation data) is kept.
    """

    def __init__(self,
                 K=10,
                 alpha=0.75,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote',
                 diversity_metric='e',
                 positive_label=1):
        """Store the configuration.

        Parameters
        ----------
        K : number of candidate classifiers trained per round.
        alpha : weight of the AUC score in ``fitness``; diversity gets
            ``1 - alpha``.
        base_classifier : estimator cloned for every candidate.
        n_classifiers : size of the final ensemble.
        combination_rule : rule name handed to ``Combiner``.
        diversity_metric : metric name handed to ``Diversity``.
        positive_label : label treated as the positive class.
        """
        self.K = K
        self.alpha = alpha

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combination_rule = combination_rule
        self.positive_label = positive_label

        self.classifiers = None
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

        self.diversity_metric = diversity_metric
        self.diversity = Diversity(metric=diversity_metric)

        self.validation_X = None
        self.validation_y = None

    def set_validation(self, X, y):
        """Set the data used by ``fitness`` and ``_calc_pos_prob``."""
        self.validation_X = X
        self.validation_y = y

    def fitness(self, classifier):
        """Score the ensemble as if ``classifier`` were a member.

        The classifier is added temporarily, the ensemble is scored as
        ``alpha * auc + (1 - alpha) * diversity`` on the validation data,
        and the classifier is removed again.

        # TODO normalize diversity metric.
        """
        self.ensemble.add(classifier)
        try:
            out = self.ensemble.output(self.validation_X)
            y_pred = self.combiner.combine(out)
            y_true = self.validation_y

            auc = evaluation.auc_score(y_true, y_pred)
            div = self.diversity.calculate(self.ensemble, self.validation_X,
                                           self.validation_y)
        finally:
            # remove the candidate even when scoring fails, so the
            # ensemble is never left with a stray member
            self.ensemble.classifiers.pop()

        return self.alpha * auc + (1.0 - self.alpha) * div

    def _calc_pos_prob(self):
        """Return the positive-sampling probability for the next round.

        It is ``1 - pos_acc / (pos_acc + neg_acc)``, where the two terms
        are the current ensemble's accuracy on the positive and on the
        remaining validation samples. When both accuracies are zero, 0.5 is
        returned.
        """
        y_pred = self.combiner.combine(self.ensemble.output(self.validation_X))
        y_true = self.validation_y
        mask = self.positive_label == y_true

        pos_acc = float(sum(y_pred[mask] == y_true[mask])) / len(
            y_true[mask])
        neg_acc = float(sum(y_pred[~mask] == y_true[~mask])) / len(
            y_true[~mask])

        total = pos_acc + neg_acc
        if total == 0:
            # both classes completely wrong: no preference for either
            return 0.5
        return 1.0 - (pos_acc / total)

    def bootstrap_classifiers(self, X, y, K, pos_prob):
        """Train ``K`` clones on class-rebalanced bootstrap samples.

        Each sample has as many rows as ``X``; every row is a random
        positive example with probability ``pos_prob`` and a random
        non-positive example otherwise. A sample missing one of the two
        labels gets one row replaced by an example of that label.

        Returns the list of fitted classifiers.
        """
        mask = self.positive_label == y
        # the first non-positive label stands for the negative class
        negative_label = y[~mask][0]

        # slice each class once instead of on every draw
        X_pos, X_neg = X[mask], X[~mask]

        clfs = []
        for _ in range(K):
            cX, cy = [], []
            for _ in range(X.shape[0]):
                if np.random.random() < pos_prob:
                    idx = np.random.randint(0, len(X_pos))
                    cX.append(X_pos[idx])
                    cy.append(self.positive_label)
                else:
                    idx = np.random.randint(0, len(X_neg))
                    cX.append(X_neg[idx])
                    cy.append(negative_label)

            # make sure both labels occur at least once
            if self.positive_label not in cy:
                idx_1 = np.random.randint(0, len(cX))
                idx_2 = np.random.randint(0, len(X_pos))
                cX[idx_1] = X_pos[idx_2]
                cy[idx_1] = self.positive_label
            elif negative_label not in cy:
                idx_1 = np.random.randint(0, len(cX))
                idx_2 = np.random.randint(0, len(X_neg))
                cX[idx_1] = X_neg[idx_2]
                cy[idx_1] = negative_label

            clf = sklearn.base.clone(self.base_classifier)
            # relies on fit() returning the fitted estimator
            clfs.append(clf.fit(cX, cy))

        return clfs

    def fit(self, X, y):
        """Build the ensemble; the training data doubles as validation data.

        The first member is picked at random from ``K`` candidates
        bootstrapped with a positive probability of 0.5; each later member
        is the fittest of ``K`` candidates bootstrapped with
        ``_calc_pos_prob()``. Returns ``self``.
        """
        self.validation_X = X
        self.validation_y = y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for _ in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=self.fitness))

        # drop the references to the validation data once done
        self.validation_X = None
        self.validation_y = None

        return self

    def predict(self, X):
        """Combine the ensemble outputs for ``X`` with the combiner."""
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
Beispiel #20
0
class ICSBaggingNew(PoolGenerator):
    """Pool generator that grows the ensemble one classifier at a time.

    In every round ``K`` candidates are trained on class-rebalanced
    bootstrap samples and the one with the best ``fitness`` (a mix of the
    ensemble's AUC score and diversity on the validation data) is kept.
    The negative label is always ``int(not positive_label)``.
    """

    def __init__(self,
                 K=10,
                 alpha=0.75,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote',
                 diversity_metric='e',
                 positive_label=1):
        """Store the configuration.

        Parameters
        ----------
        K : number of candidate classifiers trained per round.
        alpha : weight of the AUC score in ``fitness``; diversity gets
            ``1 - alpha``.
        base_classifier : estimator cloned for every candidate.
        n_classifiers : size of the final ensemble.
        combination_rule : rule name handed to ``Combiner``.
        diversity_metric : metric name handed to ``Diversity``.
        positive_label : label treated as the positive class.
        """
        self.K = K
        self.alpha = alpha

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.positive_label = positive_label

        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

        self.diversity = Diversity(metric=diversity_metric)

        self.validation_X = None
        self.validation_y = None

    def set_validation(self, X, y):
        """Set the data used by ``fitness`` and ``_calc_pos_prob``."""
        self.validation_X = X
        self.validation_y = y

    def fitness(self, classifier):
        """Score the ensemble as if ``classifier`` were a member.

        The classifier is added temporarily, the ensemble is scored as
        ``alpha * auc + (1 - alpha) * diversity`` on the validation data,
        and the classifier is removed again.

        # TODO normalize diversity metric.
        """
        self.ensemble.add(classifier)
        try:
            y_pred = self.predict(self.validation_X)
            y_true = self.validation_y

            auc = evaluation.auc_score(y_true, y_pred)
            div = self.diversity.calculate(self.ensemble, self.validation_X,
                                           y_true)
        finally:
            # remove the candidate even when scoring fails, so the
            # ensemble is never left with a stray member
            self.ensemble.classifiers.pop()  # create interface for this later

        return self.alpha * auc + (1.0 - self.alpha) * div

    def _calc_pos_prob(self):
        """Return the positive-sampling probability for the next round.

        It is ``neg_recall / (pos_recall + neg_recall)`` for the current
        ensemble on the validation data. When both recalls are zero, 0.5 is
        returned.
        """
        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y
        negative_label = int(not self.positive_label)

        # obtaining recall scores for each label (assuming the labels are
        # binary)
        pos_acc = recall_score(y_true,
                               y_pred,
                               average='binary',
                               pos_label=self.positive_label)
        neg_acc = recall_score(y_true,
                               y_pred,
                               average='binary',
                               pos_label=negative_label)

        total = pos_acc + neg_acc
        if total == 0:
            # both classes completely wrong: no preference for either,
            # rather than dividing zero by zero
            return 0.5
        return neg_acc / total

    def bootstrap_classifiers(self, X, y, K, pos_prob):
        """Train ``K`` clones on class-rebalanced bootstrap samples.

        Each sample has the shape of ``X``; every row is a random positive
        example with probability ``pos_prob`` and a random negative example
        otherwise. A sample missing one of the two labels gets one row
        replaced by an example of that label.

        Returns the list of fitted classifiers.
        """
        negative_label = int(not self.positive_label)

        X_pos = X[(y == self.positive_label), :]  # positive examples
        X_neg = X[(y == negative_label), :]  # negative examples

        classifiers = []
        for _ in range(K):
            X_new = np.zeros(X.shape)
            y_new = np.zeros(y.shape)

            for j in range(X.shape[0]):
                if pos_prob > np.random.random():
                    # add a randomly chosen positive example
                    idx = np.random.randint(X_pos.shape[0])
                    X_new[j, :] = X_pos[idx, :]
                    y_new[j] = self.positive_label
                else:
                    # add a randomly chosen negative example
                    idx = np.random.randint(X_neg.shape[0])
                    X_new[j, :] = X_neg[idx, :]
                    y_new[j] = negative_label

            # if no positive example is present, make sure you insert at
            # least one
            if not np.any(y_new == self.positive_label):
                # chosen spot for replacement on new array
                idx_new = np.random.randint(X_new.shape[0])
                # chosen positive example index
                idx_pos = np.random.randint(X_pos.shape[0])

                X_new[idx_new, :] = X_pos[idx_pos, :]
                y_new[idx_new] = self.positive_label

            # if no negative example is present, make sure you insert at
            # least one
            elif not np.any(y_new == negative_label):
                # chosen spot for replacement on new array
                idx_new = np.random.randint(X_new.shape[0])
                # chosen negative example index
                idx_neg = np.random.randint(X_neg.shape[0])

                X_new[idx_new, :] = X_neg[idx_neg, :]
                y_new[idx_new] = negative_label

            # train classifier with the bootstrapped data
            clf = sklearn.base.clone(self.base_classifier)
            clf.fit(X_new, y_new)

            classifiers.append(clf)

        return classifiers

    def fit(self, X, y):
        """Build the ensemble; the training data doubles as validation data.

        The first member is picked at random from ``K`` candidates
        bootstrapped with a positive probability of 0.5; each later member
        is the fittest of ``K`` candidates bootstrapped with
        ``_calc_pos_prob()``. Returns ``self``.
        """
        self.validation_X = X
        self.validation_y = y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for _ in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=self.fitness))

        # drop the references to the validation data once done
        self.validation_X = None
        self.validation_y = None

        return self

    def predict(self, X):
        """Combine the ensemble outputs for ``X`` with the combiner."""
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
class SmoteBagging(PoolGenerator):
    """Bagging ensemble trained on class-balanced bootstrap samples.

    For every ensemble member a new training sample is drawn in which each
    class has as many rows as the majority class: the majority class is
    bootstrapped directly, while every other class is partly bootstrapped
    and partly filled with synthetic rows returned by ``smote``.
    """

    def __init__(self, base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote', k=5):
        """Store the configuration; nothing is trained here.

        base_classifier  -- prototype that ``fit`` clones with
                            ``sklearn.base.clone`` for every member
        n_classifiers    -- number of members trained by ``fit``
        combination_rule -- rule name handed to ``Combiner``
        k                -- forwarded to ``smote`` as its ``k`` argument
                            (presumably the number of neighbours; confirm
                            against ``smote``)
        """
        self.k = k
        self.n_classifiers = n_classifiers
        self.base_classifier = base_classifier

        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def smote_bootstrap_sample(self, X, y, b, k):
        """Draw one class-balanced bootstrap sample of ``(X, y)``.

        X -- 2-D array-like, one sample per row
        y -- non-negative integer labels (required by ``np.bincount``)
        b -- percentage of the majority-class count that each non-majority
             class contributes through plain bootstrapping; the remainder
             up to the majority-class count is filled with synthetic rows
        k -- forwarded to ``smote`` as its ``k`` argument

        Returns ``(data, target)``; ``target`` is a float array holding the
        label of every row in ``data``.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        class_counts = np.bincount(y)  # index = label, value = occurrences
        majority_class = class_counts.argmax()
        majority_count = class_counts.max()

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        for label in np.unique(y):
            class_data = X[y == label, :]

            if label == majority_class:
                # regular bootstrap (i.e. 100% sampling rate)
                idx = np.random.choice(majority_count, (majority_count,))
                new_class_data = class_data[idx, :]
            else:
                # Bootstrap b% of the majority count from this class.
                # Computed directly (instead of rate * class size) so float
                # rounding cannot drop a row, and clamped to one row so the
                # division below never sees an empty sample.
                n_sampled = max(1, int(majority_count * b / 100.0))
                idx = np.random.choice(class_data.shape[0], (n_sampled,))
                sampled_class_data = class_data[idx, :]

                # Amount requested from smote: ceil keeps it a multiple of
                # 100 and the small constant keeps it from being zero when
                # b == 100.
                N_smote = int(np.ceil(
                    (float(majority_count) / n_sampled) *
                    (1 - b / 100.0 + 10e-8)) * 100)
                synthetic = smote(sampled_class_data, N=N_smote, k=k)

                # top the class up to the majority count with synthetic rows
                n_missing = majority_count - n_sampled
                idx = np.random.choice(synthetic.shape[0], (n_missing,))
                new_class_data = np.concatenate(
                    (sampled_class_data, synthetic[idx, :]))

            data = np.concatenate((data, new_class_data))
            target = np.concatenate(
                (target, label * np.ones((new_class_data.shape[0],))))

        return data, target

    def fit(self, X, y):
        """Train ``n_classifiers`` members, each on its own balanced sample.

        The sampling percentage ``b`` passed to ``smote_bootstrap_sample``
        cycles through 10, 20, ..., 100 and then starts over at 10.

        Returns ``self``.
        """
        self.ensemble = Ensemble()

        for i in range(self.n_classifiers):
            b = 10.0 * (i % 10 + 1)
            data, target = self.smote_bootstrap_sample(X, y, b=b, k=self.k)

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

        return self

    def predict(self, X):
        """Return the combined prediction of all ensemble members for ``X``.

        The output of ``self.ensemble.output(X)`` is passed through
        ``self.combiner.combine``.
        """
        out = self.ensemble.output(X)
        return self.combiner.combine(out)