Example #1
def test_transform():
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('gnb', clf3)],
                             voting='soft').fit(X, y)
    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('gnb', clf3)],
                             voting='soft',
                             flatten_transform=True).fit(X, y)
    eclf3 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('gnb', clf3)],
                             voting='soft',
                             flatten_transform=False).fit(X, y)

    warn_msg = ("'flatten_transform' default value will be "
                "changed to True in 0.21. "
                "To silence this warning you may"
                " explicitly set flatten_transform=False.")
    res = assert_warns_message(DeprecationWarning, warn_msg, eclf1.transform,
                               X)
    assert_array_equal(res.shape, (3, 4, 2))
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(
        res.swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X))
    assert_array_almost_equal(
        eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X))
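For reference, a minimal standalone sketch of the shape contract these assertions exercise (using a current scikit-learn, where flatten_transform defaults to True and no deprecation warning is raised):

import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
clf = VotingClassifier([('lr', LogisticRegression()), ('gnb', GaussianNB())],
                       voting='soft', flatten_transform=False).fit(X, y)
proba = clf.transform(X)                      # (n_estimators, n_samples, n_classes) = (2, 4, 2)
flat = proba.swapaxes(0, 1).reshape((4, -1))  # (4, 4), the flatten_transform=True layout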
Example #2
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_set_estimator_none(drop):
    """VotingClassifier set_params should be able to set estimators as None or
    drop."""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        eclf2.set_params(rf=drop).fit(X, y)
    assert record if drop is None else not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] is drop
    assert len(eclf2.estimators_) == 2
    assert all(isinstance(est, (LogisticRegression, GaussianNB))
               for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] is drop

    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        eclf2.set_params(voting='soft').fit(X, y)
    assert record if drop is None else not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr=drop, rf=drop, nb=drop).fit(X, y)
    assert record if drop is None else not record

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        eclf2.set_params(rf=drop).fit(X1, y1)
    assert record if drop is None else not record
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.],
                                         [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
Example #3
def test_transform():
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft').fit(X, y)
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        flatten_transform=True).fit(X, y)
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        flatten_transform=False).fit(X, y)

    warn_msg = ("'flatten_transform' default value will be "
                "changed to True in 0.21. "
                "To silence this warning you may"
                " explicitly set flatten_transform=False.")
    res = assert_warns_message(DeprecationWarning, warn_msg,
                               eclf1.transform, X)
    assert_array_equal(res.shape, (3, 4, 2))
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(res.swapaxes(0, 1).reshape((4, 6)),
                              eclf2.transform(X))
    assert_array_almost_equal(
            eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
            eclf2.transform(X)
    )
Example #4
class deepForestLayer(ClassifierMixin):
    """A really hacky and WIP implementation of the layers required for a Cascade Forest."""
    def __init__(self, n_nodes, nClasses=1, output=False):
        self.output = output
        self.nClasses = nClasses
        self.n_nodes = n_nodes
        nrfs = int(n_nodes / 2)
        nefs = n_nodes - nrfs
        self.estimators = []
        self.final_voters = []
        for i in range(nefs):
            self.estimators.append(
                ('ET' + str(i),
                 ExtraTreesClassifier(n_estimators=1000, min_samples_leaf=10,
                                      n_jobs=-1)))
        for i in range(nrfs):
            self.estimators.append(
                ('RF' + str(i),
                 RandomForestClassifier(n_estimators=1000, min_samples_leaf=10,
                                        n_jobs=-1)))
        self.voter = VotingClassifier(estimators=self.estimators, voting='soft')

    def fit_Kfold(self, X_train, X_test, y_train, y_test):
        """This function implements the growing and validation to determine the
        number of layers required."""
        fold = KFold(n_splits=3)  # 3-fold CV; the arrays below assume 3 splits
        train_preds = np.empty((3, X_train.shape[0],
                                self.nClasses * self.n_nodes))
        train_preds[:] = np.nan
        est_preds = np.empty((3, X_test.shape[0],
                              self.nClasses * self.n_nodes))
        est_preds[:] = np.nan
        i = 0
        for train_idx, test_idx in fold.split(X_train):
            # Fit each of the estimators on this fold of the data.
            self.voter.fit(X_train[train_idx], y_train[train_idx])

            # voter.transform has shape (n_estimators, n_samples, n_classes);
            # insample reshapes it to (n_samples, n_classes * n_estimators).
            insample = (self.voter.transform(X_train[train_idx])
                        .swapaxes(0, 1)
                        .reshape((X_train[train_idx].shape[0], -1)))
            outsample = (self.voter.transform(X_test)
                         .swapaxes(0, 1)
                         .reshape((X_test.shape[0], -1)))
            # insample holds the in-fold (training) predictions, outsample the
            # validation predictions.
            train_preds[i, train_idx] = insample.copy()
            est_preds[i] = outsample.copy()
            i += 1
        # With 3-fold KFold, train_preds has two valid entries and one NaN entry
        # per data point; average over that axis to get one probability
        # prediction per data point.
        return np.nanmean(train_preds, axis=0), np.nanmean(est_preds, axis=0)
        
    def fit(self, X, y):
        """This function does a full fit once the number of layers has been decided."""
        fold = KFold(n_splits=3)
        # Create 3 models, each fitted on one fold of the data. This is the
        # only way I can think of to get the required output at prediction time.
        for train_idx, test_idx in fold.split(X):
            clf = VotingClassifier(estimators=copy.deepcopy(self.estimators),
                                   voting='soft')
            clf.fit(X[train_idx], y[train_idx])
            self.final_voters.append(clf)
        
    def predict(self, X):
        """Return class probabilities averaged over the models fitted per fold."""
        preds = np.zeros((len(self.final_voters), X.shape[0], self.nClasses))
        for i in range(len(self.final_voters)):
            preds[i] = self.final_voters[i].predict_proba(X)
        return np.mean(preds, axis=0)
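A hypothetical usage sketch (the layer sizes and the X_train/X_valid/y_train/y_valid arrays are placeholders, not from the original source): grow one layer on out-of-fold predictions, feed them to the next layer, then do the full fit once the depth is chosen.

layer = deepForestLayer(n_nodes=4, nClasses=3)
# Out-of-fold probabilities for the training split, plus averaged
# probabilities for the validation split:
train_feats, valid_feats = layer.fit_Kfold(X_train, X_valid, y_train, y_valid)
next_train = np.hstack([X_train, train_feats])  # inputs for the next layer
next_valid = np.hstack([X_valid, valid_feats])
layer.fit(X_train, y_train)        # full fit once the depth is decided
probs = layer.predict(X_valid)     # averaged class probabilities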
Example #5
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert_true(dict(eclf2.estimators)["rf"] is None)
    assert_true(len(eclf2.estimators_) == 2)
    assert_true(
        all([
            not isinstance(est, RandomForestClassifier)
            for est in eclf2.estimators_
        ]))
    assert_true(eclf2.get_params()["rf"] is None)

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = ('All estimators are None. At least one is required'
           ' to be a classifier!')
    assert_raise_message(ValueError, msg,
                         eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[1, 0.5],
                             flatten_transform=False)
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
Example #6
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert_true(dict(eclf2.estimators)["rf"] is None)
    assert_true(len(eclf2.estimators_) == 2)
    assert_true(all([not isinstance(est, RandomForestClassifier) for est in
                     eclf2.estimators_]))
    assert_true(eclf2.get_params()["rf"] is None)

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = ('All estimators are None. At least one is required'
           ' to be a classifier!')
    assert_raise_message(
        ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.],
                                         [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
Example #7
class VotingClassifierImpl():
    def __init__(self,
                 estimators=None,
                 voting='hard',
                 weights=None,
                 n_jobs=None,
                 flatten_transform=True):
        self._hyperparams = {
            'estimators': estimators,
            'voting': voting,
            'weights': weights,
            'n_jobs': n_jobs,
            'flatten_transform': flatten_transform
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
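A usage sketch, assuming SKLModel aliases sklearn.ensemble.VotingClassifier (the estimator names and toy data below are illustrative, not from the original source):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
impl = VotingClassifierImpl(estimators=[('lr', LogisticRegression()),
                                        ('gnb', GaussianNB())],
                            voting='soft')
impl.fit(X, y)
probas = impl.predict_proba(X)

Note that scikit-learn's VotingClassifier does not itself expose decision_function, so that wrapper method only works if SKLModel is bound to something else.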
Example #8
def test_notfitted():
    eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
                                        ('lr2', LogisticRegression())],
                            voting='soft')
    ereg = VotingRegressor([('dr', DummyRegressor())])
    msg = ("This %s instance is not fitted yet. Call \'fit\'"
           " with appropriate arguments before using this estimator.")
    with pytest.raises(NotFittedError, match=msg % 'VotingClassifier'):
        eclf.predict(X)
    with pytest.raises(NotFittedError, match=msg % 'VotingClassifier'):
        eclf.predict_proba(X)
    with pytest.raises(NotFittedError, match=msg % 'VotingClassifier'):
        eclf.transform(X)
    with pytest.raises(NotFittedError, match=msg % 'VotingRegressor'):
        ereg.predict(X_r)
    with pytest.raises(NotFittedError, match=msg % 'VotingRegressor'):
        ereg.transform(X_r)
Example #9
def test_transform():
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft').fit(X, y)
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        flatten_transform=True).fit(X, y)
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        flatten_transform=False).fit(X, y)

    assert_array_equal(eclf1.transform(X).shape, (4, 6))
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(eclf1.transform(X),
                              eclf2.transform(X))
    assert_array_almost_equal(
            eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
            eclf2.transform(X)
    )
Example #10
def test_transform():
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('gnb', clf3)],
                             voting='soft').fit(X, y)
    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('gnb', clf3)],
                             voting='soft',
                             flatten_transform=True).fit(X, y)
    eclf3 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('gnb', clf3)],
                             voting='soft',
                             flatten_transform=False).fit(X, y)

    assert_array_equal(eclf1.transform(X).shape, (4, 6))
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
    assert_array_almost_equal(
        eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X))
Example #11
def test_notfitted():
    eclf = VotingClassifier(
        estimators=[("lr1", LogisticRegression()),
                    ("lr2", LogisticRegression())],
        voting="soft",
    )
    ereg = VotingRegressor([("dr", DummyRegressor())])
    msg = ("This %s instance is not fitted yet. Call 'fit'"
           " with appropriate arguments before using this estimator.")
    with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
        eclf.predict(X)
    with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
        eclf.predict_proba(X)
    with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
        eclf.transform(X)
    with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
        ereg.predict(X_r)
    with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
        ereg.transform(X_r)
Example #12
def voting(X_train, y_train, estimators, X_test,
           y_test):  # trains with all of the models
    seed = 7
    kfold = model_selection.KFold(n_splits=10, random_state=seed)  # unused below
    ensemble = VotingClassifier(estimators).fit(X_train, y_train)
    results = ensemble.score(X_test, y_test)
    y_df = ensemble.transform(X_test)
    y_pred = ensemble.predict(X_test)
    precisions, recall, t = precision_recall_curve(y_test, y_df, pos_label=1)
    print(precisions[:10], recall[:10], t[:10])
    precision = precisions[0]
    confmat = confusion_matrix(y_test, y_pred)
    return results, precision, confmat
Example #13
def committee_classify(features_train, labels_train, features_test, classifier="d_tree", n_classifiers=5):
    """
        Using an ensemble committee, classifies and returns calculated entropies for given test features.
    :param numpy.array features_train: array with features to train the classifiers
    :param list of int labels_train: list of labels to train the classifiers
    :param numpy.array features_test: array with features to be classified
    :param str classifier: the classifier type to be used in the committee. Valid values: "d_tree", "nb", "svm"
    :param int n_classifiers: number of classifiers to be created in the committee
    :return: a tuple with predictions and calculated entropies
    :rtype: (list of int, numpy.array)
    """

    def d_tree(randomness):
        return DecisionTreeClassifier(criterion="entropy", splitter="random", random_state=randomness)

    def nb(randomness):
        return MultinomialNB(alpha=randomness * 0.5)

    def svm(randomness):
        kernels = [("linear", 0), ("poly", 2), ("poly", 3), ("rbf", 0), ("sigmoid", 0)]
        return SVC(kernel=kernels[randomness % len(kernels)][0], degree=kernels[randomness % len(kernels)][1])

    classifier_calls = {"d_tree": d_tree, "nb": nb, "svm": svm}

    # Necessary to scale features for svm
    if classifier == "svm":
        scaler = preprocessing.StandardScaler().fit(features_train)
        features_train = scaler.transform(features_train)
        features_test = scaler.transform(features_test)

    estimators = []
    for i in range(n_classifiers):
        # Initialize classifiers for the ensemble
        estimators.append((classifier + str(i), classifier_calls[classifier](i)))

    # Fit and predict
    eclf = VotingClassifier(estimators=estimators, voting="hard", n_jobs=-1)
    eclf.fit(features_train, labels_train)
    prediction = eclf.predict(features_test)

    # Calculate entropies. The individual-votes array is transposed so that the
    # classifiers lie along axis 0.
    individual_votes = numpy.transpose(eclf.transform(features_test))
    entropies = calculate_entropies(individual_votes)

    return prediction, entropies
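A hypothetical call (calculate_entropies is defined elsewhere in the source; the toy arrays below are illustrative placeholders):

import numpy
features_train = numpy.array([[0, 1], [1, 0], [1, 1], [0, 0]])
labels_train = [0, 1, 1, 0]
features_test = numpy.array([[1, 1], [0, 0]])
prediction, entropies = committee_classify(features_train, labels_train,
                                           features_test, classifier="d_tree",
                                           n_classifiers=3)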
Example #14
def test_get_features_names_out_classifier(kwargs, expected_names):
    """Check get_feature_names_out for classifier for different settings."""
    X = [[1, 2], [3, 4], [5, 6], [1, 1.2]]
    y = [0, 1, 2, 0]

    voting = VotingClassifier(
        estimators=[
            ("lr", LogisticRegression(random_state=0)),
            ("tree", DecisionTreeClassifier(random_state=0)),
        ],
        **kwargs,
    )
    voting.fit(X, y)
    X_trans = voting.transform(X)
    names_out = voting.get_feature_names_out()

    assert X_trans.shape[1] == len(expected_names)
    assert_array_equal(names_out, expected_names)
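The parametrize decorator is not shown in this excerpt. A plausible pairing, assuming scikit-learn's usual VotingClassifier naming scheme (estimator name plus class index under soft voting, estimator name alone under hard voting):

import pytest

# Hypothetical parametrization for the test above; the exact names are an
# assumption based on scikit-learn's get_feature_names_out convention.
@pytest.mark.parametrize(
    "kwargs, expected_names",
    [
        ({"voting": "soft", "flatten_transform": True},
         ["votingclassifier_lr0", "votingclassifier_lr1",
          "votingclassifier_lr2", "votingclassifier_tree0",
          "votingclassifier_tree1", "votingclassifier_tree2"]),
        ({"voting": "hard"},
         ["votingclassifier_lr", "votingclassifier_tree"]),
    ],
)
def test_get_features_names_out_classifier(kwargs, expected_names):
    ...  # body as above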
Example #15
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
eclf1 = eclf1.fit(X, y)
print(eclf1.predict(X))

np.array_equal(eclf1.named_estimators_.lr.predict(X),
               eclf1.named_estimators_['lr'].predict(X))

eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft')
eclf2 = eclf2.fit(X, y)
print(eclf2.predict(X))

eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft', weights=[2, 1, 1],
        flatten_transform=True)
eclf3 = eclf3.fit(X, y)
print(eclf3.predict(X))

print(eclf3.transform(X).shape)
Example #16
class VotingWeightSearchCV(BaseEstimator, ClassifierMixin, TransformerMixin):
    """
    Soft voting classifier that chooses its weights on a held-out test split
    """
    def __init__(self,
                 estimators,
                 test_size=0.33,
                 starting_weights=None,
                 verbose=0,
                 random_state=None,
                 refit=False):
        self.test_size = test_size
        self.estimators = estimators
        self.verbose = verbose
        self.random_state = random_state
        self.refit = refit

        if starting_weights is not None:
            self.starting_weights = starting_weights
        else:
            self.starting_weights = [0.5] * len(estimators)

        self.best_estimator_ = None
        self.weights_ = None
        self.peak_score_ = None

    def _log(self, msg, verbosity=0):
        if self.verbose >= verbosity:
            print("{pre} {ind}{msg}".format(pre="(SW)",
                                            ind="".join(["  "] * verbosity),
                                            msg=msg))

    def fit(self, X, y):
        """Train and find the optimum weights.

        https://www.kaggle.com/hsperr/otto-group-product-classification-challenge/finding-ensamble-weights/code
        https://www.kaggle.com/sushanttripathy/otto-group-product-classification-challenge/wrapper-for-models-ensemble/code
        """

        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=y)

        fitted_estimators = []
        predictions = []

        def log_loss_func(weights):
            final_prediction = 0
            for weight, prediction in zip(weights, predictions):
                final_prediction += weight * prediction

            return log_loss(y_test, final_prediction)

        # Fit on train set
        self._log("Fitting on train subset...")

        for label, clf in self.estimators:
            self._log("fitting {0}...".format(label), 1)
            fitted_clf = clone(clf).fit(X_train, y_train)
            fitted_estimators.append((label, fitted_clf))

        # Predict on test set
        self._log("Predict on test subset...")

        for label, clf in fitted_estimators:
            self._log("predict using {0}...".format(label), 1)
            predictions.append(clf.predict_proba(X_test))

        # Search weights
        self._log("Searching weights...")

        cons = ({"type": "eq", "fun": lambda w: 1 - sum(w)})
        bounds = [(0, 1)] * len(predictions)
        res = minimize(log_loss_func,
                       self.starting_weights,
                       method="SLSQP",
                       bounds=bounds,
                       constraints=cons)

        self.weights_ = list(res["x"])
        self.peak_score_ = res["fun"]

        self._log("Best weights: {0}".format(self.weights_), 1)
        self._log("Peak score: {0}".format(self.peak_score_), 1)

        # Build voting classifier
        self.best_estimator_ = VotingClassifier(estimators=self.estimators,
                                                voting="soft",
                                                weights=self.weights_)

        if self.refit:
            self._log("Refitting using best weights...")
            self.best_estimator_.fit(X, y)

        return self

    def predict(self, X):
        return self.best_estimator_.predict(X)

    def predict_proba(self, X):
        return self.best_estimator_.predict_proba(X)

    def transform(self, X):
        return self.best_estimator_.transform(X)
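A hypothetical usage sketch (the dataset and estimator choices are placeholders; fit() needs enough samples per class for the stratified split):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True)
search = VotingWeightSearchCV(
    estimators=[('lr', LogisticRegression(max_iter=1000)), ('gnb', GaussianNB())],
    test_size=0.25, random_state=0, refit=True, verbose=1)
search.fit(X, y)
print(search.weights_, search.peak_score_)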
Example #17
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 0, 0.5],
    ).fit(X, y)

    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 1, 0.5],
    )
    eclf2.set_params(rf="drop").fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] == "drop"
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB))
        for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] == "drop"

    eclf1.set_params(voting="soft").fit(X, y)
    eclf2.set_params(voting="soft").fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = "All estimators are dropped. At least one is required"
    with pytest.raises(ValueError, match=msg):
        eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[0, 0.5],
        flatten_transform=False,
    ).fit(X1, y1)

    eclf2 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[1, 0.5],
        flatten_transform=False,
    )
    eclf2.set_params(rf="drop").fit(X1, y1)
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
    )
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1.0, 0.0], [0.0, 1.0]]]))
    eclf1.set_params(voting="hard")
    eclf2.set_params(voting="hard")
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
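For context, a minimal sketch of the 'drop' mechanism this test exercises (assuming a scikit-learn version in which set_params accepts the string 'drop'):

import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
clf = VotingClassifier([('lr', LogisticRegression()), ('gnb', GaussianNB())])
clf.set_params(lr='drop').fit(X, y)  # only the remaining estimator is fitted
assert len(clf.estimators_) == 1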
Example #18
class VotingWeightSearchCV(BaseEstimator, ClassifierMixin, TransformerMixin):
    """
    Soft voting classifier that chooses its weights on a held-out test split
    """
    def __init__(self, estimators, test_size=0.33, starting_weights=None,
                 verbose=0, random_state=None, refit=False):
        self.test_size = test_size
        self.estimators = estimators
        self.verbose = verbose
        self.random_state = random_state
        self.refit = refit

        if starting_weights is not None:
            self.starting_weights = starting_weights
        else:
            self.starting_weights = [0.5] * len(estimators)

        self.best_estimator_ = None
        self.weights_ = None
        self.peak_score_ = None

    def _log(self, msg, verbosity=0):
        if self.verbose >= verbosity:
            print("{pre} {ind}{msg}".format(
                pre="(SW)",
                ind="".join(["  "] * verbosity),
                msg=msg
            ))

    def fit(self, X, y):
        """Train and find the optimum weights.

        https://www.kaggle.com/hsperr/otto-group-product-classification-challenge/finding-ensamble-weights/code
        https://www.kaggle.com/sushanttripathy/otto-group-product-classification-challenge/wrapper-for-models-ensemble/code
        """

        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=y
        )

        fitted_estimators = []
        predictions = []

        def log_loss_func(weights):
            final_prediction = 0
            for weight, prediction in zip(weights, predictions):
                final_prediction += weight * prediction

            return log_loss(y_test, final_prediction)

        # Fit on train set
        self._log("Fitting on train subset...")

        for label, clf in self.estimators:
            self._log("fitting {0}...".format(label), 1)
            fitted_clf = clone(clf).fit(X_train, y_train)
            fitted_estimators.append((label, fitted_clf))

        # Predict on test set
        self._log("Predict on test subset...")

        for label, clf in fitted_estimators:
            self._log("predict using {0}...".format(label), 1)
            predictions.append(clf.predict_proba(X_test))

        # Search weights
        self._log("Searching weights...")

        cons = ({"type": "eq", "fun": lambda w: 1 - sum(w)})
        bounds = [(0,1)]*len(predictions)
        res = minimize(
            log_loss_func,
            self.starting_weights,
            method = "SLSQP",
            bounds = bounds,
            constraints = cons
        )

        self.weights_ = list(res["x"])
        self.peak_score_ = res["fun"]

        self._log("Best weights: {0}".format(self.weights_), 1)
        self._log("Peak score: {0}".format(self.peak_score_), 1)

        # Build voting classifier
        self.best_estimator_ = VotingClassifier(
            estimators=self.estimators,
            voting="soft",
            weights=self.weights_
        )

        if self.refit:
            self._log("Refitting using best weights...")
            self.best_estimator_.fit(X, y)

        return self

    def predict(self, X):
        return self.best_estimator_.predict(X)

    def predict_proba(self, X):
        return self.best_estimator_.predict_proba(X)

    def transform(self, X):
        return self.best_estimator_.transform(X)
Example #19
def vote(debug=False):
    """
    Local fptp is around 0.60; the online score is expected to be lower.
    ===
    :param debug:
    :return:
    """
    train_path = 'data/atec_anti_fraud_train.csv'
    test_path = 'data/atec_anti_fraud_test_a.csv'
    if debug:
        nrows = 100000
    else:
        nrows = 10000 * 10000
    logging.info('begin main')
    train_df = pd.read_csv(train_path, nrows=nrows)

    train_df = train_df[train_df['label'] != -1]

    test_df = pd.read_csv(test_path, nrows=nrows)
    test_df['label'] = -2
    df = factorize(train_df, test_df)

    # Special handling for the rate features.
    df['frate_1'] = df['f83'] / (df['f84'] + 1)
    df['frate_2'] = df['f85'] / (df['f84'] + 1)
    df['frate_3'] = df['f86'] / (df['f84'] + 1)
    df['frate_82_84'] = df['f82'] / (df['f84'] + 1)

    df['frate_4'] = df['f82'] / (df['f85'] + 1)
    df['frate_5'] = df['f82'] / (df['f86'] + 1)
    df['frate_6'] = df['f85'] / (df['f86'] + 1)

    train_df = df[df['label'] != -2]
    test_df = df[df['label'] == -2]

    logging.info('traindf  shape = {}'.format(train_df.shape))
    logging.info('testdf shape = {} '.format(test_df.shape))

    y = train_df.pop('label')

    train_df.pop('id')
    train_df.pop('date')

    test_id = test_df.pop('id')
    test_df.pop('date')
    test_df.pop('label')

    # y_test = df_test.pop('label')
    #
    # df_test.pop('id')
    # df_test.pop('date')
    #
    # X_train = df_train
    # X_test = df_test
    X = train_df
    logging.info("will train-test split")

    logging.info("fitting..")

    cls1 = lgb.LGBMClassifier(objective='binary',
                              n_estimators=100,
                              subsample=0.8,
                              subsample_freq=1,
                              colsample_bytree=0.8,
                              num_leaves=31,
                              learning_rate=0.05,
                              silent=False)

    rf1 = lgb.LGBMClassifier(boosting_type='rf',
                             objective='binary',
                             n_estimators=200,
                             subsample=0.8,
                             subsample_freq=1,
                             colsample_bytree=0.8,
                             num_leaves=31,
                             learning_rate=0.05,
                             silent=False)

    rf2 = lgb.LGBMClassifier(boosting_type='rf',
                             objective='binary',
                             n_estimators=400,
                             subsample=0.8,
                             subsample_freq=1,
                             colsample_bytree=0.8,
                             num_leaves=31,
                             learning_rate=0.05,
                             silent=False)

    cb = catboost.CatBoostClassifier(iterations=100,
                                     learning_rate=0.05,
                                     depth=6,
                                     loss_function='Logloss')
    vc = VotingClassifier(estimators=[('cb', cb), ('gbdt', cls1),
                                      ('rf200', rf1), ('rf400', rf2)],
                          voting='soft',
                          flatten_transform=False)

    vc.fit(X, y)
    # test_df = train_df
    # With flatten_transform=False, transform returns
    # (n_classifiers, n_samples, n_classes).
    vc_score = vc.transform(test_df)  # type: np.ndarray

    test_labels = vc.predict(test_df)

    n_classifier, n_sample, n_classes = vc_score.shape
    logging.info('orig shape = {}, test_labels = {},  test_mean = {} '.format(
        vc_score.shape, test_labels.shape, np.mean(test_labels)))

    vc_score = vc_score.swapaxes(0, 1)
    logging.info('after swapaxes, shape = {}'.format(vc_score.shape))

    # For each sample, average P(class 1) over the classifiers whose argmax
    # agrees with the ensemble's predicted label.
    score = []
    for i in range(n_sample):
        buf = []
        for j in range(n_classifier):
            k = np.argmax(vc_score[i][j], 0)
            if k == test_labels[i]:
                buf.append(vc_score[i][j][1])

        s = np.mean(buf, axis=0)
        score.append(s)

    # logging.info('train metric = {}'.format(metric(y, score)))
    day = today()
    pd.DataFrame({
        'id': test_id,
        'score': score
    }).to_csv('{}.{}.submit.csv'.format(day, 'vc'),
              index=False,
              float_format='%.6f')

    logging.info('done')
Example #20
class LabelClassifier:
    """Class implements various label classifiers."""
    def __init__(self, categoryToClassify: list, pretrained=None):
        """Constructor for LabelClassifier

        Args:
            categoryToClassify (list): data to save
            pretrained ([type], optional): Pretrained classifier. Defaults to None.
        """
        if not categoryToClassify:
            raise ValueError("no categories to classify have been provided")
        self.category: list = categoryToClassify
        self.estimators = [
            ('MultinomialNB', MultinomialNB()),
            ('SGDClassifier', SGDClassifier(loss='modified_huber', penalty='l2',
                                            alpha=1e-3, random_state=100,
                                            max_iter=200)),
            ('sigmoidSVM', SVC(kernel='sigmoid', gamma=1.0)),
            ('RandomForest', RandomForestClassifier(200, bootstrap=False)),
            ('LogisticRegression', LogisticRegression(solver='sag',
                                                      random_state=100))
        ]
        self.trainedEstimator = pretrained
        self.fileLocation: str = self.generateFilename()
        self.stackingEstimator = None
        self.rbfKernel = None

    def trainingClassifier(self, X_train: numpy.ndarray,
                           y_train: numpy.ndarray):
        """Train (or load) the voting classifier

        Args:
            X_train (numpy.ndarray): training documents
            y_train (numpy.ndarray): labels for the training documents
        """
        if not X_train.size:
            raise ValueError("No X_train data was provided")
        if not y_train.size:
            raise ValueError("No y_train data was provided")
        logging.info("> training classifier")
        voting = None
        if config.getValueFromConfig("classifier loadClassifier"):
            try:
                self.trainedEstimator = joblib.load(self.fileLocation)
                voting = load_classifier.getVotingClassifier()
            except Exception as err:
                raise RuntimeError("loading voting classifier failed") from err
        else:
            self.trainedEstimator = VotingClassifier(self.estimators,
                                                     voting='hard')
            # Fit the ensemble and collect the per-estimator votes.
            voting = self.trainedEstimator.fit_transform(X_train, y_train)
            if config.getValueFromConfig("classifier saveClassifier"):
                joblib.dump(self.trainedEstimator,
                            self.fileLocation,
                            compress=9)
                joblib.dump(
                    voting,
                    '../classifier/trained_classifiers/voting_classifier',
                    compress=9)
                logging.info("> dumped classifier: {}".format(
                    self.fileLocation))
        self.trainKernelApproxSvgOnVoting(voting, y_train)

    def predict(self, X_test: numpy.ndarray) -> numpy.ndarray:
        """Method labels data

        Args:
            X_test (numpy.ndarray): X_test data

        Returns:
            numpy.ndarray: Trained estimator prediction
        """
        if not X_test.size:
            raise ValueError("No test documents were provided")
        logging.info("> predicting")
        prediction = self.trainedEstimator.predict(X_test)
        assert prediction.size, "No documents were predicted"
        return prediction

    def generateFilename(self) -> str:
        """Method generates Filename for classifier

        Returns:
            str: Filename as string
        """
        folder = config.getValueFromConfig("classifier path saveFolder")
        if folder is None:
            raise ValueError("No folder name was provided")
        if len(self.category) < 2 or len(self.category) > 3:
            raise ValueError("Too few or too many categories")
        if len(self.category) == 3:
            return "{}ensembleClassifier_{}-{}-{}.joblib.pkl".format(
                folder, self.category[0], self.category[1], self.category[2])
        else:
            return "{}ensembleClassifier_{}-{}.joblib.pkl".format(
                folder, self.category[0], self.category[1])

    def accuracy(self, X_test: numpy.ndarray, y_test: numpy.ndarray,
                 predicted: numpy.ndarray):
        """Method plots the accuracy of the trained classifier

        Args:
            X_test (numpy.ndarray): The test documents
            y_test (numpy.ndarray): The results for the test documents
            predicted (numpy.ndarray): The predicted test values

        Raises:
            AssertionError: Raised if the classifier wasn't trained previously
        """
        if not X_test.size:
            raise ValueError("X_test was empty")
        if not y_test.size:
            raise ValueError("y_test was empty")
        if not predicted.size:
            raise ValueError("predicted was empty")
        if self.trainedEstimator is None:
            raise AssertionError("Classifier has not been trained yet")
        logging.info("\n ->> ensemble-score:{}\n".format(
            numpy.mean(predicted == y_test)))
        plot_confusion_matrix(
            self.trainedEstimator,
            X_test,
            y_test,
            normalize="all",
            display_labels=[self.category[0], self.category[1]])
        plt.show()

    def trainKernelApproxSvgOnVoting(self, X_predicted: numpy.ndarray,
                                     y: numpy.ndarray):
        """Train kernel for classifier

        Args:
            X_predicted (numpy.ndarray): The prediction of the other classifiers.
            y (numpy.ndarray): The real labels.
        """
        if not X_predicted.size:
            raise ValueError("No X_predicted data was provided")
        if not y.size:
            raise ValueError("No y data was provided")
        logging.info("training stacking classifier")
        self.rbfKernel = RBFSampler(gamma=1, random_state=1)
        X_features = self.rbfKernel.fit_transform(X_predicted)
        self.stackingEstimator = SGDClassifier(
            max_iter=config.getValueFromConfig("SGDClassifierIterations"))
        self.stackingEstimator.fit(X_features, y)
        logging.info("stacking-classifier: " +
                     str(self.stackingEstimator.score(X_features, y)))

    def stackingPrediction(self, X_test: numpy.ndarray) -> numpy.ndarray:
        """This method predicts the result using another classifier - so called "stacking"

        Args:
            X_test (numpy.ndarray): The vectorized documents to test on. 

        Returns:
            numpy.ndarray: The prediction for the labels using stacking.
        """
        if not X_test.size:
            raise ValueError("No X_test data was provided")
        voting = self.trainedEstimator.transform(X_test)
        influencedVoting = self.rbfKernel.transform(voting)
        prediction = self.stackingEstimator.predict(influencedVoting)
        assert prediction.size
        return prediction
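A hypothetical usage sketch (config, load_classifier, and the vectorized arrays are external to this snippet; the categories are illustrative):

clf = LabelClassifier(categoryToClassify=["bug", "enhancement"])
clf.trainingClassifier(X_train, y_train)   # X_train: vectorized documents
predicted = clf.predict(X_test)
clf.accuracy(X_test, y_test, predicted)
stacked = clf.stackingPrediction(X_test)   # stacked prediction via the RBF kernel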
Example #21
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X, y)

    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] == 'drop'
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB))
        for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] == 'drop'

    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(voting='soft').fit(X, y)

    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr='drop', rf='drop', nb='drop').fit(X, y)
    assert not record

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X1, y1)
    assert not record
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
Example #22
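This excerpt begins mid-script: clf1, clf2, clf3, eclf2, X, and y are already defined. A plausible setup, assumed here to mirror Example #15:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                     ('gnb', clf3)],
                         voting='soft')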
# Performance evaluation
for clf in (clf1, clf2, clf3, eclf2):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(clf.__class__.__name__, accuracy_score(y, y_pred))
print()

# predicted result
print('eclf2.predict(X) = \n{0}\n'.format(eclf2.predict(X)))

print(
    '---< fitting: voting="soft", weights=[2, 1, 1], flatten_transform=True >---'
)
eclf3 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                     ('gnb', clf3)],
                         voting='soft',
                         weights=[2, 1, 1],
                         flatten_transform=True)

# Performance evaluation
for clf in (clf1, clf2, clf3, eclf3):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(clf.__class__.__name__, accuracy_score(y, y_pred))
print()

# predicted result
print('eclf3.predict(X) = \n{0}\n'.format(eclf3.predict(X)))

print('eclf3.transform(X).shape = {0}\n'.format(eclf3.transform(X).shape))