Example #1
    def fit(self, x, y):
        '''
        Fit method for the MetaEstimator. Returns the fitted estimator, which
        can then be used for prediction.
        '''
        
        # Determine whether this is a regression or classification problem by
        # comparing the number of unique values in the output against a threshold
        if self.method_type is None:
            is_above = len(np.unique(y, axis=0)) > self.cutoff_categorical
            self.method_type = 'regr' if is_above else 'classif'
        
        # Fetch the appropriate estimator(s) for the problem type
        if self.estimators is None:
            if self.method is not None:
                self.get_estim(y)
            else:
                if self.method_type == 'regr':
                    self.estimators = linear_model.LassoCV(normalize=True)
                elif self.method_type == 'classif':
                    self.estimators = ensemble.RandomForestClassifier(random_state=1)
        else:
            # User-supplied estimators are treated as a (regression, classification) pair
            if self.method_type == 'regr':
                self.estimators = self.estimators[0]
            elif self.method_type == 'classif':
                self.estimators = self.estimators[1]

        # Record the classes present in the training set (needed later for
        # prediction); fitting a DummyClassifier is a cheap way to obtain classes_
        if self.method_type == 'classif':
            self.classes = dummy.DummyClassifier().fit(x, y).classes_

        # Fit according to respective ensembling method
        if self.method == 'stacking':
            if self.method_type == 'regr':
                self.fitted = regressor.StackingRegressor(
                    regressors=self.estimators,
                    meta_regressor=linear_model.LinearRegression()).fit(x, y)

            elif self.method_type == 'classif':
                self.fitted = classifier.StackingClassifier(
                    classifiers=self.estimators,
                    meta_classifier=linear_model.LogisticRegression(random_state=1)).fit(x, y)

        elif self.method == 'multiplexing':
            # For multiplexing, cross-validation determines which estimator is
            # chosen. cross_val_score returns scores (higher is better), so
            # negate them to obtain losses before taking the argmin.
            for i in self.estimators:
                self.losses.append(-np.mean(cross_val_score(i, x, y)))
            self.fitted = self.estimators[np.argmin(self.losses)].fit(x, y)

        else:
            self.fitted = self.estimators.fit(x, y)

        return self
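
# Minimal usage sketch for the fit method above. The MetaEstimator class name
# and constructor signature are assumptions inferred from the attributes that
# fit references (method, method_type, estimators, cutoff_categorical); adapt
# to the actual class definition.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
meta = MetaEstimator(method='multiplexing')   # hypothetical constructor
meta.fit(X, y)                                # picks the best estimator by CV loss
print(meta.fitted.predict(X)[:5])             # self.fitted is the chosen, fitted model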
Example #2
titanic_all.drop(['PassengerId', 'Name', 'Cabin','Ticket','Survived'], axis=1, inplace=True)

features = ['Sex', 'Embarked', 'Pclass', 'Title', 'FamilyCategory']
titanic_all = pd.get_dummies(titanic_all, columns=features)

X_train = titanic_all[0:titanic_train.shape[0]]
y_train = titanic_train['Survived']

#build stacked model on the encoded features
rf1 = ensemble.RandomForestClassifier(random_state=100)
ada2 = ensemble.AdaBoostClassifier(random_state=100)

dtSuper = tree.DecisionTreeClassifier(random_state=100)

stack_estimator = mlxClassifier.StackingClassifier(
    classifiers=[rf1, ada2],
    meta_classifier=dtSuper)  #, store_train_meta_features=True)
stack_grid = {'randomforestclassifier__n_estimators': [5, 10],
              'adaboostclassifier__n_estimators': [10, 50],
              'meta-decisiontreeclassifier__min_samples_split': [2, 3]}
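
# The grid keys above follow mlxtend's naming scheme: lowercased class names
# for the level-1 classifiers and a 'meta-' prefix for the meta-classifier.
# If in doubt (the scheme varies across mlxtend versions), the valid keys can
# be listed from the estimator itself:
for param_name in sorted(stack_estimator.get_params().keys()):
    print(param_name)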

grid_stack_estimator = model_selection.GridSearchCV(stack_estimator, stack_grid, cv=10)
grid_stack_estimator.fit(X_train, y_train)
#grid_stack_estimator.fit(X_train1, y_train)

final_model = grid_stack_estimator.best_estimator_
print(final_model.clfs_)     # fitted level-1 classifiers
print(final_model.meta_clf_) # fitted meta-classifier

X_test = titanic_all[titanic_train.shape[0]:]
titanic_test['Survived'] = grid_stack_estimator.predict(X_test)
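
# Why encode train and test together (titanic_all) before splitting them back
# apart? Applying pd.get_dummies separately can yield mismatched columns when a
# category appears in only one split. A toy illustration (not the Titanic data):
train_toy = pd.DataFrame({'Embarked': ['S', 'C']})
test_toy = pd.DataFrame({'Embarked': ['S', 'Q']})  # 'Q' absent from train_toy
print(pd.get_dummies(train_toy).columns.tolist())  # ['Embarked_C', 'Embarked_S']
print(pd.get_dummies(test_toy).columns.tolist())   # ['Embarked_Q', 'Embarked_S']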
Example #3
# rf is a random forest fitted earlier (not shown in this snippet);
# prefit=True tells SelectFromModel to reuse it without refitting
fs_model = feature_selection.SelectFromModel(rf, prefit=True)
X_train1 = fs_model.transform(X_train)
print(X_train1.shape)
selected_features = X_train.columns[fs_model.get_support()]

#build stacked model using selected features
dt1 = tree.DecisionTreeClassifier(random_state=100)
knn2 = neighbors.KNeighborsClassifier()
gb3 = ensemble.GradientBoostingClassifier(random_state=100)

lr = linear_model.LogisticRegression(random_state=100)

# use_probas=True: the class probabilities predicted by the first-level
# classifiers (rather than their class labels) are used as meta-features to
# train the meta-classifier (the second-level classifier)
stack_estimator = classifier.StackingClassifier(
    classifiers=[dt1, knn2, gb3], meta_classifier=lr,
    use_probas=True)  #store_train_meta_features=True,
stack_grid = {
    'decisiontreeclassifier__min_samples_split': [2, 3],
    'kneighborsclassifier__n_neighbors': [1, 3, 5, 8],
    'gradientboostingclassifier__n_estimators': [10, 50],
    'meta-logisticregression__C': [0.1, 10.0]
}

grid_stack_estimator = model_selection.GridSearchCV(stack_estimator,
                                                    stack_grid,
                                                    cv=10)
grid_stack_estimator.fit(X_train1, y_train)

print(grid_stack_estimator.best_score_)
print(grid_stack_estimator.best_params_)
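
# To inspect what the meta-classifier is trained on, mlxtend can retain the
# level-1 meta-features via store_train_meta_features=True (commented out
# above; assumes the installed mlxtend version supports it). With
# use_probas=True the meta-features are class probabilities, so their width
# is n_classifiers * n_classes rather than n_classifiers.
probe = classifier.StackingClassifier(
    classifiers=[dt1, knn2, gb3], meta_classifier=lr,
    use_probas=True, store_train_meta_features=True)
probe.fit(X_train1, y_train)
print(probe.train_meta_features_.shape)  # here: (n_samples, 3 * n_classes)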
Example #4
cat_columns = ['Sex', 'Embarked', 'Pclass', 'Title', 'Age1', 'FamilySize1']
titanic_train1 = pd.get_dummies(titanic_train, columns=cat_columns)
titanic_train1.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'],
                    axis=1,
                    inplace=True)

X_train = titanic_train1
y_train = titanic_train['Survived']

knn = neighbors.KNeighborsClassifier()
gnb = naive_bayes.GaussianNB()
rf = ensemble.RandomForestClassifier(random_state=100)
lr = linear_model.LogisticRegression(random_state=100)

sclf = classifier.StackingClassifier(classifiers=[knn, gnb, rf],
                                     meta_classifier=lr,
                                     store_train_meta_features=True,
                                     use_probas=True)
st_grid = {
    'kneighborsclassifier__n_neighbors': [3, 4, 5],
    'randomforestclassifier__n_estimators': [10, 50],
    'meta-logisticregression__C': [0.1, 0.5]
}

grid_classifier = model_selection.GridSearchCV(sclf,
                                               st_grid,
                                               cv=10,
                                               refit=True,
                                               return_train_score=True)
grid_classifier.fit(X_train, y_train)
results = grid_classifier.cv_results_
print(results.get('params'))
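
# cv_results_ is a dict of parallel arrays; it is often easier to read as a
# DataFrame (pandas is already imported as pd in these snippets). With
# return_train_score=True, train scores are available alongside test scores:
cv_df = pd.DataFrame(results)
print(cv_df[['params', 'mean_train_score', 'mean_test_score']].head())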
# 'features' is assumed to hold a pandas Series of feature importances
# computed in an earlier step (not shown in this snippet)
features.plot(kind='barh', figsize=(20, 20))

fs_model = feature_selection.SelectFromModel(rf, prefit=True)
X_train1 = fs_model.transform(X_train)
print(X_train1.shape)
selected_features = X_train.columns[fs_model.get_support()]

#build stacked model using selected features
rf1 = ensemble.RandomForestClassifier(random_state=100)
knn2 = neighbors.KNeighborsClassifier()
gb3 = ensemble.GradientBoostingClassifier(random_state=100)

lr = linear_model.LogisticRegression(random_state=100)

stack_estimator = classifier.StackingClassifier(classifiers=[rf1, knn2, gb3],
                                                meta_classifier=lr,
                                                store_train_meta_features=True)
stack_grid = {
    'kneighborsclassifier__n_neighbors': [1, 5],
    'randomforestclassifier__n_estimators': [10, 50],
    'gradientboostingclassifier__n_estimators': [10, 50],
    'meta-logisticregression__C': [0.1, 10.0]
}

grid_stack_estimator = model_selection.GridSearchCV(stack_estimator,
                                                    stack_grid,
                                                    cv=10)
grid_stack_estimator.fit(X_train1, y_train)

print(grid_stack_estimator.best_score_)
print(grid_stack_estimator.best_params_)

#build stacked model (decision tree, random forest, gradient boosting)
dt1 = tree.DecisionTreeClassifier(random_state=100)
rf2 = ensemble.RandomForestClassifier(random_state=100)
gb3 = ensemble.GradientBoostingClassifier(random_state=100)

lr = linear_model.LogisticRegression(random_state=100)

stack_estimator = mlxClassifier.StackingClassifier(
    classifiers=[dt1, rf2, gb3],
    meta_classifier=lr)  #, store_train_meta_features=True)
stack_grid = {
    'decisiontreeclassifier__min_samples_split': [2, 3],
    'randomforestclassifier__n_estimators': [5, 10],
    'gradientboostingclassifier__n_estimators': [10, 50],
    'meta-logisticregression__C': [0.1, 10.0]
}

grid_stack_estimator = model_selection.GridSearchCV(stack_estimator,
                                                    stack_grid,
                                                    cv=10)
grid_stack_estimator.fit(X_train, y_train)
#grid_stack_estimator.fit(X_train1, y_train)

print(grid_stack_estimator.best_score_)
kernel_svm_estimator = svm.SVC(kernel='rbf')
stages = [('features', kutils.KernelTransformer('rbf')),
          ('clf', linear_model.LogisticRegression())]
lr_pipeline = pipeline.Pipeline(stages)
knn_estimator = neighbors.KNeighborsClassifier()  # needed by the grid below
rf_estimator = ensemble.RandomForestClassifier()
gb_estimator = ensemble.GradientBoostingClassifier()

stage1_estimators = [gb_estimator,
                     knn_estimator,
                     rf_estimator,
                     kernel_svm_estimator,
                     lr_pipeline]
stage2_estimator = linear_model.LogisticRegression()

stacking_estimator = mlxtnd.StackingClassifier(classifiers=stage1_estimators,
                                               meta_classifier=stage2_estimator)
stacking_grid = {
    'gradientboostingclassifier__max_depth': [2],
    'gradientboostingclassifier__n_estimators': list(range(300, 500, 100)),
    'gradientboostingclassifier__learning_rate': [0.1, 0.2, 0.5, 1.0],
    'kneighborsclassifier__n_neighbors': list(range(6, 10)),
    'randomforestclassifier__max_depth': list(range(6, 8)),
    'randomforestclassifier__n_estimators': list(range(200, 400, 100)),
    'svc__gamma': [0.001, 0.01],
    'svc__C': [0.001, 0.01, 1, 10],
    'meta_classifier__C': [0.1, 10.0]
}
stacking_final_estimator = cutils.grid_search_best_model(stacking_estimator, stacking_grid, X_train, y_train)
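
# cutils.grid_search_best_model is a project-specific helper not shown in this
# snippet. A minimal sketch of an equivalent built directly on scikit-learn
# (cv=10 mirrors the other examples here; the real helper may differ):
def grid_search_best_model_sketch(estimator, grid, X, y, cv=10):
    search = model_selection.GridSearchCV(estimator, grid, cv=cv)
    search.fit(X, y)
    print(search.best_score_)
    print(search.best_params_)
    return search.best_estimator_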


titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))

print(titanic_test.shape)
print(titanic_test.info())