Example 1
class RandomSubspace(PoolGenerator):

    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote',
                 max_features=0.5):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combiner = Combiner(rule=combination_rule)
        self.classifiers = None
        self.ensemble = None
        self.max_features = max_features

    def fit(self, X, y):
        self.ensemble = Ensemble()

        for i in range(self.n_classifiers):
            chosen_features = np.random.choice(X.shape[1], int(
                np.ceil(X.shape[1] * self.max_features)), replace=False)
            transformer = FeatureSubsamplingTransformer(
                features=chosen_features)

            classifier = BrewClassifier(classifier=sklearn.base.clone(
                self.base_classifier), transformer=transformer)
            classifier.fit(X, y)

            self.ensemble.add(classifier)

        return

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
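A minimal usage sketch for the RandomSubspace generator above. This is an assumption-laden illustration, not part of the original source: it presumes the class is importable (e.g. from brew.generation) and that a scikit-learn estimator such as DecisionTreeClassifier is a valid base_classifier.

# Hypothetical usage sketch; assumes RandomSubspace above is in scope and
# follows the fit/predict interface shown.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pool = RandomSubspace(base_classifier=DecisionTreeClassifier(),
                      n_classifiers=25,
                      combination_rule='majority_vote',
                      max_features=0.5)
pool.fit(X_train, y_train)
print('accuracy:', (pool.predict(X_test) == y_test).mean())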
Example 2
class RandomSubspace(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100, combination_rule='majority_vote', max_features=0.5):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combiner = Combiner(rule=combination_rule)
        self.classifiers = None
        self.ensemble = None
        self.max_features = max_features
       
    def fit(self, X, y):
        self.ensemble = Ensemble()

        for i in range(self.n_classifiers):
            chosen_features = np.random.choice(X.shape[1], int(np.ceil(X.shape[1]*self.max_features)), replace=False)
            transformer = FeatureSubsamplingTransformer(features=chosen_features)

            classifier = BrewClassifier(classifier=sklearn.base.clone(self.base_classifier), transformer=transformer)
            classifier.fit(X, y)
            
            self.ensemble.add(classifier)

        return

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
Example 3
    def fit(self, X, y):

        self.ensemble = Ensemble()

        # this parameter should change between [10, 100] with
        # increments of 10, for every classifier in the ensemble
        b = 10

        for i in range(self.n_classifiers):
            #print()
            #print('classifier : {}'.format(i))
            #print('------------------------')
            #print('b = {}'.format(b))
            data, target = self.smote_bootstrap_sample(X,
                                                       y,
                                                       b=float(b),
                                                       k=self.k)
            #print('data = {}'.format(data.shape))
            #print()

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

            if b >= 100:
                b = 10
            else:
                b += 10

        return
Example 4
    def select(self, ensemble, x):
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        mcb_x = ensemble.output(x, mode='labels')[0, :]

        # initialize variables
        # the indexes of the KNN of x
        [idx] = self.knn.kneighbors(x, return_distance=False)
        X, y = self.Xval[idx], self.yval[idx]
        mcb_v = ensemble.output(X, mode='labels')

        idx = []
        for i in range(X.shape[0]):
            sim = np.mean(mcb_x == mcb_v[i, :])
            if sim > self.similarity_threshold:
                idx = idx + [i]

        if len(idx) == 0:
            idx = np.arange(X.shape[0])

        scores = [clf.score(X[idx], y[idx]) for clf in ensemble.classifiers]
        scores = np.array(scores)

        # if best classifier is significantly better
        # use best_classifier
        best_i = np.argmax(scores)
        best_j_score = np.max(scores[np.arange(len(scores)) != best_i])
        if scores[best_i] - best_j_score >= self.significance_threshold:
            best_classifier = ensemble.classifiers[best_i]
            return Ensemble(classifiers=[best_classifier]), None

        return Ensemble(classifiers=ensemble.classifiers), None
Example 5
class Bagging(PoolGenerator):

    def __init__(self, base_classifier=None, n_classifiers=100, combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            # bootstrap
            idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
            data, target = X[idx, :], y[idx]

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            
            self.ensemble.add(classifier)

        return


    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
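A short, hedged usage sketch for the Bagging pool generator above (assumes the class is in scope; any scikit-learn classifier that supports clone/fit/predict can serve as the base estimator).

# Hypothetical usage; assumes the Bagging class defined above is importable.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
bag = Bagging(base_classifier=DecisionTreeClassifier(max_depth=3),
              n_classifiers=50)
bag.fit(X, y)
y_pred = bag.predict(X)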
Example 6
class BaggingSK(PoolGenerator):
    '''
    This class should not be used, use brew.generation.bagging.Bagging instead.
    '''

    def __init__(self, base_classifier=None, n_classifiers=100, combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                n_estimators=n_classifiers, max_samples=1.0, max_features=1.0)
        
        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.sk_bagging.fit(X, y)
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)
        #self.classes_ = set(y)

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
Example 7
    def select(self, ensemble, x):
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        # obtain the K nearest neighbors in the validation set
        [idx] = self.knn.kneighbors(x,
                                    n_neighbors=self.K,
                                    return_distance=False)
        neighbors_X = self.Xval[idx]  # k neighbors
        neighbors_y = self.yval[idx]  # k neighbors target

        # pool_output (sample, classifier_output)
        pool_output = np.zeros((neighbors_X.shape[0], len(ensemble)))
        for i, clf in enumerate(ensemble.classifiers):
            pool_output[:, i] = clf.predict(neighbors_X)

        x_outputs = [
            ensemble.classifiers[j].predict(x) for j in range(len(ensemble))
        ]
        x_outputs = np.asarray(x_outputs).flatten()

        scores = np.zeros(len(ensemble))
        for j in range(pool_output.shape[1]):
            # get correctly classified samples
            mask_classified_correctly = pool_output[:, j] == neighbors_y
            # get classified samples with the same class as 'x'
            mask_classified_same_class = (pool_output[:, j] == x_outputs[j])
            # get correctly classified samples with the same class as 'x'
            mask = mask_classified_correctly * mask_classified_same_class
            # calculate score
            scores[j] = float(
                sum(mask)) / (sum(mask_classified_same_class) + 10e-24)

        return Ensemble([ensemble.classifiers[np.argmax(scores)]]), None
Example 8
class Bagging(PoolGenerator):
    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            # bootstrap
            idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
            data, target = X[idx, :], y[idx]

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

        return

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
Example 9
class BaggingSK(PoolGenerator):
    '''
    This class should not be used, use brew.generation.bagging.Bagging instead.
    '''
    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                            n_estimators=n_classifiers,
                                            max_samples=1.0,
                                            max_features=1.0)

        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.sk_bagging.fit(X, y)
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)
        #self.classes_ = set(y)

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
Example 10
    def select(self, ensemble, x):

        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        # obtain the K nearest neighbors in the validation set
        [idx] = self.knn.kneighbors(x, return_distance=False)
        neighbors_X = self.Xval[idx]  # k neighbors
        neighbors_y = self.yval[idx]  # k neighbors target

        # pool_output (sample, classifier_output)
        pool_output = np.zeros((neighbors_X.shape[0], len(ensemble)))
        for i, clf in enumerate(ensemble.classifiers):
            pool_output[:, i] = clf.predict(neighbors_X)

        x_outputs = [
            ensemble.classifiers[j].predict(x) for j in range(len(ensemble))
        ]
        x_outputs = np.asarray(x_outputs).flatten()

        d = {}
        scores = np.zeros(len(ensemble))
        for j in range(pool_output.shape[1]):
            # get correctly classified samples
            mask_classified_correctly = pool_output[:, j] == neighbors_y
            # get classified samples with the same class as 'x'
            mask_classified_same_class = (pool_output[:, j] == x_outputs[j])
            # get correctly classified samples with the same class as 'x'
            mask = mask_classified_correctly * mask_classified_same_class
            # calculate score
            scores[j] = float(
                sum(mask)) / (sum(mask_classified_same_class) + 10e-24)
            d[str(scores[j])] = d[str(scores[j])] + [j] if str(
                scores[j]) in d else [j]

        best_scores = sorted([float(k) for k in list(d.keys())], reverse=True)

        options = None
        for j, score in enumerate(best_scores):
            pred = [x_outputs[i] for i in d[str(score)]]
            pred = np.asarray(pred).flatten()

            bincount = np.bincount(pred.astype(int))
            if options is not None:
                for i in range(len(bincount)):
                    bincount[i] = bincount[i] if i in options else 0

            imx = np.argmax(bincount)
            votes = np.argwhere(bincount == bincount[imx]).flatten()
            count = len(votes)
            if count == 1:
                ens = Ensemble([ensemble.classifiers[np.argmax(pred == imx)]])
                return ens, None
            elif options is None:
                options = votes

        return Ensemble([ensemble.classifiers[np.argmax(scores)]]), None
Example 11
    def select(self, ensemble, x):
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        # initialize variables
        # the indexes of the KNN of x
        classifiers = ensemble.classifiers
        [idx] = self.knn.kneighbors(x, return_distance=False)
        X, y = self.Xval[idx], self.yval[idx]

        scores = np.asarray([clf.score(X, y) for clf in classifiers])

        return Ensemble([classifiers[np.argmax(scores)]]), None
Example 12
    def fit(self, X, y):
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            # bootstrap
            idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
            data, target = X[idx, :], y[idx]

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

        return
Example 13
    def select(self, ensemble, x):
        ensemble_mask = None

        neighbors_X, neighbors_y = self.get_neighbors(x)
        pool_output = ensemble.output(neighbors_X, mode='labels')

        # gradually decrease neighborhood size if no
        # classifier predicts ALL the neighbors correctly
        for i in range(self.K, 0, -1):
            pool_mask = _get_pool_mask(pool_output[:i], neighbors_y[:i],
                                       np.all)

            # if at least one classifier gets all neighbors right
            if pool_mask is not None:
                ensemble_mask = pool_mask
                break

        # if NO classifiers get the nearest neighbor correctly
        if ensemble_mask is None:

            if self.v2007:
                # Increase neighborhood until one classifier
                # gets at least ONE (i.e. ANY) neighbors correctly.
                # Starts with 2 because mask_all with k=1 is
                # the same as mask_any with k=1
                for i in range(2, self.K + 1):
                    pool_mask = _get_pool_mask(pool_output[:i],
                                               neighbors_y[:i], np.any)

                    if pool_mask is not None:
                        ensemble_mask = pool_mask
                        break

        [selected_idx] = np.where(ensemble_mask)

        if selected_idx.size > 0:
            pool = Ensemble(
                classifiers=[ensemble.classifiers[i] for i in selected_idx])

        else:  # use all classifiers
            # pool = ensemble
            classifiers = self._get_best_classifiers(ensemble, neighbors_X,
                                                     neighbors_y, x)
            pool = Ensemble(classifiers=classifiers)

        # KNORA-ELIMINATE-W supposedly uses weights, but that does not
        # make sense here, so even if self.weighted is True, always
        # return None for the weights

        return pool, None
Example 14
    def fit(self, X, y):

        self.ensemble = Ensemble()

        # this parameter should change between [10, 100] with
        # increments of 10, for every classifier in the ensemble
        b = 10

        for i in range(self.n_classifiers):
            #print()
            #print('classifier : {}'.format(i))
            #print('------------------------')
            #print('b = {}'.format(b))
            data, target = self.smote_bootstrap_sample(X, y, b=b, k=self.k)
            #print('data = {}'.format(data.shape))
            #print()

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            
            self.ensemble.add(classifier)

            if b >= 100:
                b = 10
            else:
                b += 10

        return
Example 15
    def fit(self, X, y):
        self.ensemble = Ensemble()

        for i in range(self.n_classifiers):
            chosen_features = np.random.choice(X.shape[1], int(
                np.ceil(X.shape[1] * self.max_features)), replace=False)
            transformer = FeatureSubsamplingTransformer(
                features=chosen_features)

            classifier = BrewClassifier(classifier=sklearn.base.clone(
                self.base_classifier), transformer=transformer)
            classifier.fit(X, y)

            self.ensemble.add(classifier)

        return
Example 16
    def select(self, ensemble, x):
        neighbors_X, neighbors_y = self.get_neighbors(x)
        pool_output = ensemble.output(neighbors_X, mode='labels')

        output_mask = (pool_output == neighbors_y[:, np.newaxis])

        [selected_idx] = np.where(np.any(output_mask, axis=0))

        if selected_idx.size > 0:
            if self.weighted:
                weights = 1.0 / \
                    (np.sqrt(np.sum((x - neighbors_X)**2, axis=1)) + 10e-8)
                weighted_votes = np.dot(weights, output_mask[:, selected_idx])
            else:
                weighted_votes = np.sum(output_mask[:, selected_idx], axis=0)

            pool = Ensemble(
                classifiers=[ensemble.classifiers[i] for i in selected_idx])
        # if no classifiers are selected,
        # use all classifiers with no weights
        else:
            pool = ensemble
            weighted_votes = None

        return pool, weighted_votes
Example 17
    def select(self, ensemble, x):
        neighbors_X, neighbors_y = self.get_neighbors(x)

        k = self.K

        pool = []
        while k > 0:
            nn_X = neighbors_X[:k, :]
            nn_y = neighbors_y[:k]

            for i, c in enumerate(ensemble.classifiers):
                if np.all(c.predict(nn_X) == nn_y[np.newaxis, :]):
                    pool.append(c)

            if not pool:  # empty
                k = k - 1
            else:
                break

        if not pool:  # still empty
            # select the classifier that recognizes
            # more samples in the whole neighborhood
            # also select classifiers that recognize
            # the same number of neighbors
            pool = self._get_best_classifiers(ensemble, neighbors_X,
                                              neighbors_y, x)

        return Ensemble(classifiers=pool), None
Example 18
    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                            n_estimators=n_classifiers,
                                            max_samples=1.0,
                                            max_features=1.0)

        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)
Example 19
    def test__arguments(self):

        c = MockClassifier()

        pool = Ensemble(classifiers=[c])
        combiner = Combiner(rule='majority_vote')

        model = EnsembleClassifier(ensemble=pool, combiner=combiner)
Example 20
    def fit(self, X, y):
        #if self.validation_X == None and self.validation_y == None:
        self.validation_X = X
        self.validation_y = y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for i in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))

        self.validation_X = None
        self.validation_y = None

        return self
Example 21
    def __init__(self, base_classifier=None, n_classifiers=100, combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                n_estimators=n_classifiers, max_samples=1.0, max_features=1.0)
        
        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)
Example 22
    def __init__(self, classifierList, combiningMethod):
        classifiers = [None] * (len(classifierList))
        for key, tuple in enumerate(classifierList):
            classifiers[key] = tuple[1]

        hybridEnsemble = Ensemble(classifiers=classifiers)
        hybridEnsembleClassifier = EnsembleClassifier(
            ensemble=hybridEnsemble, combiner=Combiner(combiningMethod))

        super().__init__(hybridEnsembleClassifier)
        self.name = "ensemble"
Example 23
    def select(self, ensemble, x):
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        n_sel_1, n_sel_2 = self.n_1, self.n_2
        if isinstance(self.n_1, float):
            n_sel_1 = int(n_sel_1 * len(ensemble))

        if isinstance(self.n_2, float):
            n_sel_2 = int(n_sel_2 * len(ensemble))

        n_sel_1 = max(n_sel_1, 1)
        n_sel_2 = max(n_sel_2, 1)

        # initialize variables
        # the indexes of the KNN of x
        classifiers = ensemble.classifiers
        [idx] = self.knn.kneighbors(x, return_distance=False)
        X, y = self.Xval[idx], self.yval[idx]

        acc_scores = np.array([clf.score(X, y) for clf in classifiers])

        out = ensemble.output(X, mode='labels')
        oracle = np.equal(out, y[:, np.newaxis])
        div_scores = np.zeros(len(ensemble), dtype=float)

        for i in range(len(ensemble)):
            tmp = []
            for j in range(len(ensemble)):
                if i != j:
                    d = kuncheva_double_fault_measure(oracle[:, [i, j]])
                    tmp.append(d)
            div_scores[i] = np.mean(tmp)

        z = zip(np.arange(len(ensemble)), acc_scores, div_scores)
        z = sorted(z, key=lambda e: e[1], reverse=True)[:n_sel_1]
        z = sorted(z, key=lambda e: e[2], reverse=False)[:n_sel_2]
        z = list(zip(*z))[0]

        classifiers = [classifiers[i] for i in z]
        return Ensemble(classifiers=classifiers), None
Example 24
    def select(self, ensemble, x):
        if ensemble.in_agreement(x):
            return Ensemble([ensemble.classifiers[0]]), None

        # initialize variables
        # the indexes of the KNN of x
        classifiers = ensemble.classifiers
        [idx] = self.knn.kneighbors(x, return_distance=False)
        X, y = self.Xval[idx], self.yval[idx]

        # d[score] = indexes of the classifiers with that score
        d = {}
        scores = [clf.score(X, y) for clf in ensemble.classifiers]
        for i, scr in enumerate(scores):
            d[scr] = d[scr] + [i] if scr in d else [i]
        best_scores = sorted(d.keys(), reverse=True)

        # if there was a single best classifier, return it
        if len(d[best_scores[0]]) == 1:
            i = d[best_scores[0]][0]
            return Ensemble([classifiers[i]]), None

        options = None
        for j, score in enumerate(best_scores):
            pred = [classifiers[i].predict(x) for i in d[score]]
            pred = np.asarray(pred).flatten()

            bincount = np.bincount(pred)
            if options is not None:
                for i in range(len(bincount)):
                    bincount[i] = bincount[i] if i in options else 0

            imx = np.argmax(bincount)
            votes = np.argwhere(bincount == bincount[imx]).flatten()
            count = len(votes)
            if count == 1:
                return Ensemble([classifiers[np.argmax(pred == imx)]]), None
            elif options is None:
                options = votes

        return Ensemble([classifiers[np.argmax(scores)]]), None
Example 25
    def fit(self, X, y):
        self.ensemble = Ensemble()

        for i in range(self.n_classifiers):
            chosen_features = np.random.choice(X.shape[1], int(np.ceil(X.shape[1]*self.max_features)), replace=False)
            transformer = FeatureSubsamplingTransformer(features=chosen_features)

            classifier = BrewClassifier(classifier=sklearn.base.clone(self.base_classifier), transformer=transformer)
            classifier.fit(X, y)
            
            self.ensemble.add(classifier)

        return
Example 26
    def fit(self, X, y):
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            # bootstrap
            idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
            data, target = X[idx, :], y[idx]

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            
            self.ensemble.add(classifier)

        return
Example 27
    def select(self, ensemble, x):
        neighbors_X, neighbors_y = self.get_neighbors(x)

        pool = []
        for c in ensemble.classifiers:
            for i, neighbor in enumerate(neighbors_X):
                if c.predict(neighbor) == neighbors_y[i]:
                    pool.append(c)
                    break

        weights = []
        for clf in pool:
            msk = clf.predict(neighbors_X) == neighbors_y
            weights = weights + [sum(msk)]

        return Ensemble(classifiers=pool), weights
Example 28
    def fit(self, X, y):
        #if self.validation_X == None and self.validation_y == None:
        self.validation_X = X
        self.validation_y = y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for i in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K, self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))

        self.validation_X = None
        self.validation_y = None
        
        return self
Example 29
    def select(self, ensemble, x):
        selected_classifier = None

        nn_X, nn_y, dists = self.get_neighbors(x, 
                return_distance=True)
        
        idx_selected, prob_selected = [], []
        
        all_probs = np.zeros(len(ensemble))
        for idx, clf in enumerate(ensemble.classifiers):
            prob = self.probabilities(clf, nn_X, nn_y, dists, x)
            if prob > 0.5:
                idx_selected = idx_selected + [idx]
                prob_selected = prob_selected + [prob]

            all_probs[idx] = prob

        if len(prob_selected) == 0:
            prob_selected = [np.max(all_probs)]
            idx_selected = [np.argmax(all_probs)]
        
        p_correct_m = max(prob_selected)
        m = np.argmax(prob_selected)

        selected = True
        diffs = []
        for j, p_correct_j in enumerate(prob_selected):
            d = p_correct_m - p_correct_j
            diffs.append(d)
            if j != m and d < self.threshold:
                selected = False

        if selected:
            selected_classifier = ensemble.classifiers[idx_selected[m]]
        else:
            idx_selected = np.asarray(idx_selected)
            mask = np.array(np.array(diffs) < self.threshold, dtype=bool)
            i = np.random.choice(idx_selected[mask])
            selected_classifier = ensemble.classifiers[i]
        
        return Ensemble([selected_classifier]), None
Example 30
class SmoteBaggingNew(SmoteBagging):

    def fit(self, X, y):

        self.ensemble = Ensemble()

        # this parameter should change between [10, 100] with
        # increments of 10, for every classifier in the ensemble
        b = 10

        for i in range(self.n_classifiers):
            # print()
            # print('classifier : {}'.format(i))
            # print('------------------------')
            # print('b = {}'.format(b))
            data, target = self.smote_bootstrap_sample(
                X, y, b=float(b), k=self.k)
            # print('data = {}'.format(data.shape))
            # print()

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

            if b >= 100:
                b = 10
            else:
                b += 10

        return

    def smote_bootstrap_sample(self, X, y, b, k):

        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()  # majority class
        majority_count = count.max()  # majority class count

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        class_data = X[(y == majority_class), :]
        idx = np.random.choice(majority_count, (majority_count,))
        data = np.concatenate((data, class_data[idx, :]))
        target = np.concatenate(
            (target, majority_class * np.ones((majority_count,))))

        minority_class = count.argmin()
        minority_count = count.min()

        # print majority_count
        N_syn = int((majority_count) * (b / 100))
        # print N_syn
        N_res = majority_count - N_syn
        # print N_res
        N_syn, N_res = N_res, N_syn

        class_data = X[(y == minority_class), :]
        idx = np.random.choice(class_data.shape[0], (N_res,))
        sampled_min_data = class_data[idx, :]
        # print sampled_min_data.shape
        if N_syn > 0:
            N_smote = np.ceil(N_syn / minority_count) * 100
            N_smote = 100 if N_smote < 100 else int(N_smote - N_smote % 100)
            synthetic = smote(X[y == minority_class], N=int(N_smote), k=self.k)

            idx = np.random.choice(synthetic.shape[0], (N_syn,))
            new_class_data = np.concatenate(
                (sampled_min_data, synthetic[idx, :]))
            data = np.concatenate((data, new_class_data))
            target = np.concatenate(
                (target, minority_class * np.ones((new_class_data.shape[0],))))
        else:
            data = np.concatenate((data, sampled_min_data))
            target = np.concatenate(
                (target, minority_class * np.ones((sampled_min_data.shape[0],))))  # noqa

        return data, target
Example 31
    def test_none_combiner(self):
        c = MockClassifier()

        pool = Ensemble(classifiers=[c])
        model = EnsembleClassifier(ensemble=pool)
Example 32
    def test_len_with_one_added(self):
        ens = Ensemble()
        ens.add(MockClassifier())
        assert len(ens) == 1
Example 33
class SmoteBagging(PoolGenerator):
    
    def __init__(self, base_classifier=None,
                n_classifiers=100,
                combination_rule='majority_vote', k=2):

        #self.b = b
        self.k = k
        self.n_classifiers = n_classifiers
        self.base_classifier = base_classifier

        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)


    def smote_bootstrap_sample(self, X, y, b, k):
        
        classes = np.unique(y)
        count = np.bincount(y) # number of instances of each class

        majority_class = count.argmax() # majority class
        majority_count = count.max() # majority class count

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        for i in classes:

            class_data = X[(y==i),:]

            if i == majority_class: # majority class
                # regular bootstrap (i.e. 100% sampling rate)
                idx = np.random.choice(majority_count, (majority_count,))
                data = np.concatenate((data, class_data[idx,:]))
                target = np.concatenate((target, i * np.ones((majority_count,))))
                #print('original class data = {}'.format(class_data.shape))
                #print('sampled class data = {}'.format(class_data[idx,:].shape))
                #print()



            else: # minority classes
                # bootstrap the class data with defined sampling rate
                sample_rate = (majority_count / class_data.shape[0]) * (b/100)
                idx = np.random.choice(class_data.shape[0], (int(sample_rate * class_data.shape[0]),))
                sampled_class_data = class_data[idx,:]
                
                #print('original class data = {}'.format(class_data.shape))
                #print('majority_count = {}'.format(majority_count))
                #print('class data = {}'.format(class_data.shape))
                #print('b = {}'.format(b))
                #print('sample rate = {}'.format(sample_rate))
                #print('sampled class data = {}'.format(sampled_class_data.shape))


                # run smote on bootstrapped data to obtain synthetic samples
                # ceil to make sure N_smote is a multiple of 100, and the small value to avoid a zero
                N_smote = int( np.ceil((majority_count / sampled_class_data.shape[0]) * (1 - b/100 + 10e-8)) * 100 )
                #print(N_smote)

                #print('----------')
                #print('smote parameters:')
                #print('T : {}'.format(sampled_class_data.shape))
                #print('N : {}'.format(N_smote))
                synthetic = smote(sampled_class_data, N=N_smote, k=self.k)
                #print('synthetic data = {})'.format(synthetic.shape))
                #print(synthetic)
               
                # add synthetic samples to sampled class data
                n_missing = majority_count - sampled_class_data.shape[0]
                idx = np.random.choice(synthetic.shape[0], (n_missing,))
                new_class_data = np.concatenate((sampled_class_data, synthetic[idx,:]))
                #print('new class data = {})'.format(new_class_data.shape))
                #print()
                data = np.concatenate((data, new_class_data))
                target = np.concatenate((target, i * np.ones((new_class_data.shape[0],))))

        return data, target


    def fit(self, X, y):

        self.ensemble = Ensemble()

        # this parameter should change between [10, 100] with
        # increments of 10, for every classifier in the ensemble
        b = 10

        for i in range(self.n_classifiers):
            #print()
            #print('classifier : {}'.format(i))
            #print('------------------------')
            #print('b = {}'.format(b))
            data, target = self.smote_bootstrap_sample(X, y, b=b, k=self.k)
            #print('data = {}'.format(data.shape))
            #print()

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            
            self.ensemble.add(classifier)

            if b >= 100:
                b = 10
            else:
                b += 10

        return

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
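A hedged usage sketch for SmoteBagging on an imbalanced dataset (assumptions: the class above and the smote() helper it calls are importable, and labels are non-negative integers so np.bincount works inside smote_bootstrap_sample).

# Hypothetical usage; assumes SmoteBagging above is in scope together with
# the smote() helper used by smote_bootstrap_sample.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=400, weights=[0.9, 0.1], random_state=0)
sb = SmoteBagging(base_classifier=DecisionTreeClassifier(),
                  n_classifiers=20, k=2)
sb.fit(X, y)
print(np.bincount(sb.predict(X).astype(int)))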
Example 34
class ICSBaggingNew(PoolGenerator):


    def __init__(self, K=10, alpha=0.75, base_classifier=None, n_classifiers=100,
            combination_rule='majority_vote', diversity_metric='e', max_samples=1.0,
            positive_label=1):

        self.K = K
        self.alpha = alpha

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.positive_label = positive_label

        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

        self.diversity = Diversity(metric=diversity_metric)

        self.validation_X = None
        self.validation_y = None


    def set_validation(self, X, y):
        self.validation_X = X
        self.validation_y = y


    def fitness(self, classifier):
        '''
        #TODO normalize diversity metric.
        '''
        self.ensemble.add(classifier)

        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y

        auc = evaluation.auc_score(y_true, y_pred)
        div = self.diversity.calculate(self.ensemble, self.validation_X, y_true)

        self.ensemble.classifiers.pop() # create interface for this later

        return self.alpha * auc + (1.0 - self.alpha) * div


    def _calc_pos_prob(self):
        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y

        # obtaining recall scores for each label (assuming the labels are binary)
        pos_acc = recall_score(y_true, y_pred, average='binary', pos_label=self.positive_label)
        neg_acc = recall_score(y_true, y_pred, average='binary', pos_label=int(not self.positive_label))

        return neg_acc / (pos_acc + neg_acc)


    def bootstrap_classifiers(self, X, y, K, pos_prob):
        pos_idx = (y == self.positive_label)
        neg_idx = (y == int(not self.positive_label))

        X_pos, y_pos = X[pos_idx,:], y[pos_idx] # positive examples
        X_neg, y_neg = X[neg_idx,:], y[neg_idx] # negative examples

        classifiers = []
        for i in range(K):
            X_new = np.zeros(X.shape)
            y_new = np.zeros(y.shape)

            for j in range(X.shape[0]):
                
                if pos_prob > np.random.random():
                    # add a randomly chosen positive example
                    idx = np.random.randint(X_pos.shape[0])
                    X_new[j,:] = X_pos[idx,:]
                    y_new[j] = self.positive_label

                else:
                    # add a randomly chosen negative example
                    idx = np.random.randint(X_neg.shape[0])
                    X_new[j,:] = X_neg[idx,:]
                    y_new[j] = int(not self.positive_label)

            # if no positive example is present, make sure you insert at least one
            if not np.any(y_new == self.positive_label):
                idx_new = np.random.randint(X_new.shape[0]) # chosen spot for replacement on new array
                idx_pos = np.random.randint(X_pos.shape[0]) # chosen positive example index

                X_new[idx_new,:] = X_pos[idx_pos,:]
                y_new[idx_new] = self.positive_label
            
            # if no negative example is present, make sure you insert at least one
            elif not np.any(y_new == int(not self.positive_label)):
                idx_new = np.random.randint(X_new.shape[0]) # chosen spot for replacement on new array
                idx_neg = np.random.randint(X_neg.shape[0]) # chosen negative example index

                X_new[idx_new,:] = X_neg[idx_neg,:]
                y_new[idx_new] = int(not self.positive_label)

            # train classifier with the bootstrapped data
            clf = sklearn.base.clone(self.base_classifier)
            clf.fit(X_new, y_new)

            classifiers.append(clf)

        return classifiers


    def fit(self, X, y):
        #if self.validation_X == None and self.validation_y == None:
        self.validation_X = X
        self.validation_y = y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for i in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K, self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))

        self.validation_X = None
        self.validation_y = None
        
        return self


    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
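A hedged usage sketch for ICSBaggingNew (assumptions: binary labels 0/1 with 1 as the positive class, and the evaluation.auc_score and Diversity helpers the class relies on are available).

# Hypothetical usage; assumes ICSBaggingNew above is in scope with its helpers.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, weights=[0.8, 0.2], random_state=1)
ics = ICSBaggingNew(K=5, alpha=0.75,
                    base_classifier=DecisionTreeClassifier(),
                    n_classifiers=10, positive_label=1)
ics.fit(X, y)
y_pred = ics.predict(X)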
Example 35
class ICSBagging(PoolGenerator):


    def __init__(self, K=10, alpha=0.75, base_classifier=None, n_classifiers=100,
            combination_rule='majority_vote', diversity_metric='e', max_samples=1.0,
            positive_label=1):

        self.K = K
        self.alpha = alpha

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combination_rule = combination_rule
        self.positive_label = positive_label

        self.classifiers = None
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

        self.diversity_metric = diversity_metric
        self.diversity = Diversity(metric=diversity_metric)

        self.validation_X = None
        self.validation_y = None


    def set_validation(self, X, y):
        self.validation_X = X
        self.validation_y = y


    def fitness(self, classifier):
        '''
        #TODO normalize diversity metric.
        '''
        self.ensemble.add(classifier)
        out = self.ensemble.output(self.validation_X)
        y_pred = self.combiner.combine(out)
        y_true = self.validation_y

        auc = evaluation.auc_score(y_true, y_pred)
        div = self.diversity.calculate(self.ensemble,
                self.validation_X, self.validation_y)

        #diversity = entropy_measure_e(self.ensemble,
        #        self.validation_X, self.validation_y)

        self.ensemble.classifiers.pop()
        return self.alpha * auc + (1.0 - self.alpha) * div


    def _calc_pos_prob(self):
        y_pred = self.combiner.combine(self.ensemble.output(self.validation_X))
        mask = self.positive_label == self.validation_y
        pos_acc = float(sum(y_pred[mask] == self.validation_y[mask]))/len(self.validation_y[mask])
        neg_acc = float(sum(y_pred[~mask] == self.validation_y[~mask]))/len(self.validation_y[~mask])
        return 1.0 - (pos_acc / (pos_acc + neg_acc))


    def bootstrap_classifiers(self, X, y, K, pos_prob):
        mask = self.positive_label == y
        negative_label = y[~mask][0]

        clfs = []
        sets_cX, sets_cy = [], []
        for i in range(K):
            cX, cy = [], []
            for j in range(X.shape[0]):
                if np.random.random() < pos_prob:
                    idx = np.random.random_integers(0, len(X[mask]) - 1)
                    cX = cX + [X[mask][idx]]
                    cy = cy + [self.positive_label]
                else:
                    idx = np.random.random_integers(0, len(X[~mask]) - 1)
                    cX = cX + [X[~mask][idx]]
                    cy = cy + [negative_label]
            if not self.positive_label in cy:
                idx_1 = np.random.random_integers(0, len(cX) - 1)
                idx_2 = np.random.random_integers(0, len(X[mask])- 1)
                cX[idx_1] = X[mask][idx_2]
                cy[idx_1] = self.positive_label
            elif not negative_label in cy:
                idx_1 = np.random.random_integers(0, len(cX) - 1)
                idx_2 = np.random.random_integers(0, len(X[~mask])- 1)
                cX[idx_1] = X[~mask][idx_2]
                cy[idx_1] = negative_label
            #print len(cX), len(cy), X.shape[0], len(X), np.bincount(cy)

            sets_cX, sets_cy = sets_cX + [cX], sets_cy + [cy]
            clf = sklearn.base.clone(self.base_classifier)
            clfs = clfs + [clf.fit(cX, cy)]

        return clfs


    def fit(self, X, y):
        #if self.validation_X == None and self.validation_y == None:
        self.validation_X = X
        self.validation_y = y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for i in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K, self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))

        self.validation_X = None
        self.validation_y = None
        
        return self


    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
Example 36
    def test_init_mult_classifiers(self):
        c1 = MockClassifier()
        c2 = MockClassifier()
        c3 = MockClassifier()
        ens = Ensemble(classifiers=[c1, c2, c3])
        assert len(ens.classifiers) == 3
Example 37
    def test_len_with_empty_init(self):
        ens = Ensemble()
        assert len(ens) == 0
Example 38
    def test_len_with_mult_added(self):
        ens = Ensemble()
        ens.add(MockClassifier())
        ens.add(MockClassifier())
        ens.add(MockClassifier())
        assert len(ens) == 3
Example 39
    def test_add_empty_init(self):
        ens = Ensemble()
        c = MockClassifier()
        ens.add(c)
        assert ens.classifiers[0] is c
Example 40
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools

import brew
from brew.base import Ensemble
from brew.combination.combiner import Combiner
from brew.stacking.stacker import EnsembleStack
from brew.stacking.stacker import EnsembleStackClassifier

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

layer_1 = [
    SVC(probability=True),
    RandomForestClassifier(n_estimators=100),
    ExtraTreesClassifier(n_estimators=100)
]

layer_2 = [SVC(probability=True), LogisticRegression(max_iter=500)]

stack = EnsembleStack(cv=10)  # number of folds per layer
stack.add_layer(Ensemble(layer_1))
stack.add_layer(Ensemble(layer_2))

clf = EnsembleStackClassifier(stack, Combiner('mean'))
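To round off the stacking setup, a minimal continuation sketch (assumption: EnsembleStackClassifier exposes the same fit/predict convention as the other brew classifiers in these examples).

# Hypothetical continuation of the snippet above.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
clf.fit(X, y)
y_pred = clf.predict(X)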
Example 41
    def test_add_empty_init(self):
        ens = Ensemble()
        c = MockClassifier()
        ens.add(c)
        assert ens.classifiers[0] is c
Example 42
    def test_empty_init(self):
        ens = Ensemble()
        assert ens.classifiers is not None
        assert len(ens.classifiers) == 0
Example 43

my_data = genfromtxt('/Users/samarth/Desktop/data.csv', delimiter=',')

for item in range(0, my_data.shape[0]):
    var = my_data[item][4]
    my_data[item][4] = int(range_scaler(5538, 600000, 100, 1000, var))
'''
if my_data[item][6] < 100 or my_data[item][6] > 1000 or (my_data[item][6] > my_data[item][4]):
    my_data = np.delete(my_data, (item), axis=0)
'''
my_data = my_data[np.logical_not(
    np.logical_and(my_data[:, 4] < 100, my_data[:, 4] > 1000))]
my_data = my_data[np.logical_not(my_data[:, 4] > my_data[:, 6])]

ensemble = Ensemble([clf1, clf2, clf3])
eclf = EnsembleClassifier(ensemble=ensemble, combiner=Combiner('mean'))

layer_1 = Ensemble([clf1, clf2, clf3])
layer_2 = Ensemble([sklearn.clone(clf1)])

stack = EnsembleStack(cv=3)

stack.add_layer(layer_1)
stack.add_layer(layer_2)

sclf = EnsembleStackClassifier(stack)

clf_list = [clf1, clf2, clf3, eclf, sclf]
lbl_list = [
    'Logistic Regression', 'Random Forest', 'RBF kernel SVM', 'Ensemble',
Example 44
    def test_init_one_classifier(self):
        c = MockClassifier()
        ens = Ensemble(classifiers=[c])
        assert len(ens.classifiers) == 1