Code example #1
File: models.py Project: sumitpopli/CharityML
def model_feature_selection(clf_c,best_clf, best_predictions,X_train, y_train, X_test, y_test):
    # Train the supervised model on the training set
    model = clf_c.fit(X_train, y_train)

    # Extract the feature importances using .feature_importances_
    importances = model.feature_importances_

    # Plot
    vs.feature_plot(importances, X_train, y_train)
    # Reduce the feature space
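    # np.argsort(importances)[::-1][:5] gives the column indices of the five most important features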
    X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
    X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances)[::-1])[:5]]]

    # Train on the "best" model found from grid search earlier
    fit_start = dt.datetime.now()
    clf = (clone(best_clf)).fit(X_train_reduced, y_train)
    fit_end = dt.datetime.now()

    fit_time = fit_end - fit_start

    # Make new predictions
    pred_start = dt.datetime.now()
    reduced_predictions = clf.predict(X_test_reduced)
    pred_end = dt.datetime.now()
    pred_time = pred_end - pred_start

    # Report scores from the final model using both versions of data
    print("Final Model trained on full data\n------")
    print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
    print("\nFinal Model trained on reduced data\n------")
    print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta=0.5)))
    print('time taken for training is {0}'.format(fit_time))
    print('time taken for predicting is {0}'.format(pred_time))
Code example #2
def modelfit(alg, dtrain, y_train, dtest, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):

    if useTrainCV:
        xgb_parameters = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain.values, label=y_train.values)
        cvresult = xgb.cv(xgb_parameters, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='rmse', early_stopping_rounds=early_stopping_rounds, show_stdv=False)
        print(cvresult)
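        # With early stopping, cv returns one row per boosting round actually run,
        # so the row count is the tuned n_estimators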
        alg.set_params(n_estimators=cvresult.shape[0])

    alg.fit(dtrain, y_train, eval_metric='rmse')

    dtrain_prediction = alg.predict(dtrain)
    dtest_prediction = alg.predict(dtest)

    # print model report
    print("\nModel Report")
    print("Train RMSE : %.4g" % mean_squared_error(y_train.values, dtrain_prediction)**0.5)
    print("Test RMSE : %.4g" % mean_squared_error(y_test.values, dtest_prediction) ** 0.5)

    # feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    # feat_imp.plot(kind='bar', title='Feature Importance')
    # plt.ylabel('Feature Importance Score')
    # plt.show()
    plot_importance(alg)
    plt.show()

    importances = alg.feature_importances_
    vs.feature_plot(importances, dtrain, y_train)

    return dtrain_prediction, dtest_prediction
Code example #3
def Model_Tuning(features_train, labels_train, features_test, labels_test):
    """
    perform a grid search optimization for the model over the entire training set (features_train and labels_train) by tuning at least one parameter to improve upon the untuned model's F-score.
    """
    clf = DecisionTreeClassifier()
    parameters = {'min_samples_split': [2, 4, 6, 8], 'min_samples_leaf': [1, 2, 3, 4]}
    scorer = make_scorer(fbeta_score, beta = 0.5)
    grid_obj = GridSearchCV(clf, parameters, scoring = scorer)
    grid_fit = grid_obj.fit(features_train, labels_train)
    best_clf = grid_fit.best_estimator_
    predictions = (clf.fit(features_train, labels_train)).predict(features_test)
    best_predictions = best_clf.predict(features_test)
    # Report the before-and-after scores
    print("Unoptimized model\n------")
    print("Accuracy score on testing data: {:.4f}".format(accuracy_score(labels_test, predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(labels_test, predictions, beta = 0.5)))
    print("\nOptimized Model\n------")
    print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(labels_test, best_predictions)))
    print("Final F-score on the testing data: {:.4f}".format(fbeta_score(labels_test, best_predictions, beta = 0.5)))
    # Feature Relevance Observation
    model = ExtraTreesClassifier()
    model.fit(features_train, labels_train)
    importances = model.feature_importances_
    vs.feature_plot(importances, features_train, labels_train)
    # Feature selection
    X_train_reduced = features_train[features_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
    X_test_reduced = features_test[features_test.columns.values[(np.argsort(importances)[::-1])[:5]]]
    # Train on the "best" model found from grid search earlier
    clf = (clone(best_clf)).fit(X_train_reduced, labels_train)
    # Make new predictions
    reduced_predictions = clf.predict(X_test_reduced)
    # Report scores from the final model using both versions of data
    print("Final Model trained on full data\n------")
    print("Accuracy on testing data: {:.4f}".format(accuracy_score(labels_test, best_predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(labels_test, best_predictions, beta = 0.5)))
    print("\nFinal Model trained on reduced data\n------")
    print("Accuracy on testing data: {:.4f}".format(accuracy_score(labels_test, reduced_predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(labels_test, reduced_predictions, beta = 0.5)))
Code example #4
        i = i + 1
        j = 0
    else:
        j += 1

# TODO: Import a supervised learning model that has 'feature_importances_'
clf = GradientBoostingClassifier(random_state=1990)

# TODO: Train the supervised model on the training set
model = clf.fit(X_train, y_train)

# TODO: Extract the feature importances
importances = model.feature_importances_

# Plot
vs.feature_plot(importances, X_train, y_train)

# Import functionality for cloning a model
from sklearn.base import clone

# Reduce the feature space
X_train_reduced = X_train[X_train.columns.values[(
    np.argsort(importances)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(
    np.argsort(importances)[::-1])[:5]]]

# Train on the "best" model found from grid search earlier
clf = (clone(best_clf)).fit(X_train_reduced, y_train)

# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)
Code example #5
model = AdaBoostClassifier(random_state=0)
model = model.fit(X_train_LE, y_train_LE)
y_pred_LE = model.predict(X_test_LE)
LE_score_acc = accuracy_score(y_test_LE, y_pred_LE)
LE_score_fbeta = fbeta_score(y_test_LE, y_pred_LE, beta=0.5)
print("LE Acc: ", LE_score_acc)
print("LE f: ", LE_score_fbeta)

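# Refit the same AdaBoost model on the one-hot encoded split to compare the two encodings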
model.fit(X_train_OHE, y_train_OHE)
y_pred_OHE = model.predict(X_test_OHE)
OHE_score_acc = accuracy_score(y_test_OHE, y_pred_OHE)
OHE_score_fbeta = fbeta_score(y_test_OHE, y_pred_OHE, beta=0.5)
print("OHE Acc: ", OHE_score_acc)
print("OHE f: ", OHE_score_fbeta)
feature_importances = model.feature_importances_
vs.feature_plot(feature_importances, X_train_OHE, y_train_OHE)

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
# TODO: Initialize the classifier
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3))

# TODO: Create the parameters list you wish to tune, using a dictionary if needed.
# HINT: parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]}
parameters = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1, 1]}

# TODO: Make an fbeta_score scoring object using make_scorer()
scorer = make_scorer(fbeta_score, beta=0.5)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
Code example #6
df = pd.DataFrame(grid_fit.cv_results_).sort_values('mean_test_score',
                                                    ascending=False).tail()

print(df)

# Import a supervised learning model that has 'feature_importances_'
from sklearn.tree import DecisionTreeClassifier
# Train the supervised model on the training set
model = DecisionTreeClassifier(criterion="gini", random_state=0)
model.fit(X_train, y_train['>50K'])
# Extract the feature importances
importances = model.feature_importances_

# Plot
vs.feature_plot(importances, X_train, y_train['>50K'])

# Import functionality for cloning a model
from sklearn.base import clone

# Reduce the feature space
X_train_reduced = X_train[X_train.columns.values[(
    np.argsort(importances)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(
    np.argsort(importances)[::-1])[:5]]]

# Train on the "best" model found from grid search earlier
clf = (clone(best_clf)).fit(X_train_reduced, y_train['>50K'])

# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)
Code example #7
# Report the before-and-after scores
print "\nUnoptimized model"
print "Accuracy on testing data: {:.4f}".format(
    accuracy_score(y_test, predictions))
print "F1 score on testing data: {:.4f}".format(f1_score(y_test, predictions))
print "\nOptimized Model"
print "Accuracy on testing data: {:.4f}".format(
    accuracy_score(y_test, optimal_predictions))
print "F1 score on testing data: {:.4f}".format(
    f1_score(y_test, optimal_predictions))
print "\nThe optimized configuration of the decision tree:"
print optimal_clf

#%% Finding the top important features in the model
top_features = clf.feature_importances_
vs.feature_plot(top_features, X_train, y_train)

#%% Train a new model only with top five features
X_train_reduced = X_train[X_train.columns.values[(
    np.argsort(top_features)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(
    np.argsort(top_features)[::-1])[:5]]]

# Reuse previous optimal parameters and train with top features
clf = (clone(optimal_clf)).fit(X_train_reduced, y_train)

# New prediction
new_predictions = clf.predict(X_test_reduced)

# Report scores from the final model using both versions of data
print "Model trained on full data"
Code example #8
def plot_learn_curve(X_train, y_train, X_test, y_test, reglist):

    for e in reglist:
        print(e)
        e.fit(X_train, y_train)
        print("Regressor R2 score on the test set: {:.4f}".format(
            e.score(X_test, y_test)))
        print('size of the test set (x,y)', np.shape(X_test), np.shape(y_test))

        # TODO: Use learning_curve imported above to create learning curves for both the
        #       training data and testing data. You'll need 'size', 'cv' and 'score' from above.

        train_sizes, train_scores, test_scores = learning_curve(
            e,
            X_train,
            y_train,
            cv=KFold(n_splits=10),
            scoring=make_scorer(r2_score),
            train_sizes=np.linspace(.1, 1, 20),
            n_jobs=8)

        # TODO: Plot the training curves and the testing curves
        #       Use plt.plot twice -- one for each score. Be sure to give them labels!

        plt.figure(figsize=(10, 7))

        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()

        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="g")
        plt.plot(train_sizes,
                 train_scores_mean,
                 color="r",
                 label="Training score")
        plt.plot(train_sizes,
                 test_scores_mean,
                 color="g",
                 label="Cross-validation score")

        # Plot aesthetics
        plt.ylim(-1.1, 1.1)
        plt.ylabel("R2 Score")
        plt.xlabel("Training Points")
        plt.legend(bbox_to_anchor=(1.0, 1.15))
        plt.show()
        try:
            importances = e.feature_importances_
            vs.feature_plot(importances, X_train, y_train)
        except AttributeError:
            print('No feature importance available for this learner')
        print('')
    return
Code example #9
# Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf,parameters,scoring=scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(X_train_reduced, y_train.values.ravel())

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and optimized models
predictions = (clf.fit(X_train_reduced, y_train.values.ravel())).predict(X_test_reduced)
best_predictions = best_clf.predict(X_test_reduced)

# Report the before-and-after scores
print "Unoptimized model\n------"
print "Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test.values.ravel(), predictions))
print "F-score on testing data: {:.4f}".format(fbeta_score(y_test.values.ravel(), predictions, beta = 0.5))
print "\nOptimized Model\n------"
print "Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test.values.ravel(), best_predictions))
print "Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test.values.ravel(), best_predictions, beta = 0.5))

# Train the supervised model on the training set 
new_clf = AdaBoostClassifier()
model = new_clf.fit(X_train, y_train.values.ravel())

# Extract the feature importances
importances = model.feature_importances_

# Plot
vs.feature_plot(importances, X_train, y_train.values.ravel())
Code example #10
def plot_shuffle_split_score(features, labels, features2, labels2, reglist,
                             n_Splits, earlyStopRounds):
    sscv = ShuffleSplit(n_splits=n_Splits, test_size=.25, random_state=None)

    for e in reglist:
        score_l = []
        print(
            '-------------------------------------------------------------------------------------------------------'
        )
        print(
            '-------------------------------------------------------------------------------------------------------'
        )
        print(e)
        i = 0
        for train_index, test_index in sscv.split(features):
            print(
                '---------------------------------------------------------------------------------------------------'
            )
            print('ShuffledSplit iteration {} of {}'.format(i + 1, n_Splits))
            i += 1
            X_train, X_test = features.loc[train_index], features.loc[
                test_index]
            y_train, y_test = labels.loc[train_index], labels.loc[test_index]
            y_train = y_train.values.ravel(
            )  # change column vector to 1d array to avoid conversion warning @ regressor.fit()

            regressor = e
            test_set = [(X_test, y_test), (features2, labels2)]
            #test_set = [(features2, labels2)]
            start = time.time()

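            # The code assumes these estimators' fit() accepts XGBoost-style early-stopping kwargs
            # (eval_set, eval_metric); all other estimators are fit on (X, y) only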
            if type(e).__name__ in ("XGBRegressor", "MLPRegressor"):
                if earlyStopRounds > 0:
                    regressor.fit(X_train,
                                  y_train,
                                  early_stopping_rounds=earlyStopRounds,
                                  eval_metric='rmse',
                                  eval_set=test_set,
                                  verbose=False)
                    elapsed = time.time() - start
                elif earlyStopRounds == 0:
                    print('earlyStop disabled')
                    regressor.fit(X_train, y_train, eval_metric='rmse')
                    elapsed = time.time() - start

                results = regressor.evals_result()
                epochs = len(results['validation_0']['rmse'])
                x_axis = range(0, epochs)
                # plot regression error
                fig, ax = plt.subplots()
                ax.plot(x_axis,
                        results['validation_0']['rmse'],
                        label='Validation')
                ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
                ax.legend()
                plt.xlabel('number of epochs')
                plt.ylabel('Regression RMSE')
                plt.title('XGBReg. RMSE')
                plt.show()

            else:
                regressor.fit(X_train, y_train)
                elapsed = time.time() - start

            print("time to fit: %f" % (elapsed))
            score = regressor.score(X_test, y_test)
            score_l.append(score)

        print('')
        print "Regressor R2 score on the validation set: {:.4f}".format(score)
        print(
            '---------------------------------------------------------------------'
        )
        print('size of the training set (features, labels)', np.shape(X_train),
              np.shape(y_train))
        print('size of the validation set (features, labels)',
              np.shape(X_test), np.shape(y_test))
        print('size of the test set (features, labels)', np.shape(features2),
              np.shape(labels2))
        print(
            '---------------------------------------------------------------------'
        )
        print('Variance of the train/valid. set: {}'.format(
            labels['sr_highres'].var()))
        print('Variance of the test set: {}'.format(
            labels2['sr_highres'].var()))
        print('')

        preds = regressor.predict(features)
        preds = pd.DataFrame(preds)
        preds.rename(columns={0: 'sr_predicted'}, inplace=True)

        plt.figure(figsize=(6, 3))
        plt.plot(score_l)
        plt.ylabel("R2 Score")
        plt.xlabel("number of ShuffleSplits")
        plt.show()

        fig = plt.figure(figsize=(18, 3))
        labels['sr_highres'].plot()
        preds['sr_predicted'].plot()
        plt.ylim(-4, 4)
        plt.title('validation data: labels vs. predictions')
        plt.legend(loc='best')
        plt.show()

        print('validation set r2 score:',
              r2_score(labels['sr_highres'], preds['sr_predicted']))
        print('validation set mean squared error: {0:.2f}%'.format(
            mean_squared_error(labels['sr_highres'], preds['sr_predicted']) *
            100))

        lmean = labels['sr_highres'].mean()
        predmean = preds['sr_predicted'].mean()
        devmean = -100 / lmean * (lmean - predmean)
        print(
            'sr mean: {} | predicted mean: {} | pred. deviation from sr: {}%'.
            format(lmean, predmean, devmean))

        preds2 = regressor.predict(features2)
        preds2 = pd.DataFrame(preds2)
        preds2.rename(columns={0: 'sr_predicted'}, inplace=True)

        fig = plt.figure(figsize=(18, 3))
        labels2['sr_highres'].plot()
        preds2['sr_predicted'].plot()
        plt.ylim(-4, 4)
        plt.title('test data: labels vs. predictions')
        plt.legend(loc='best')
        plt.show()

        print('test set r2 score:',
              r2_score(labels2['sr_highres'], preds2['sr_predicted']))
        print('test set mean squared error: {0:.2f}%'.format(
            mean_squared_error(labels2['sr_highres'], preds2['sr_predicted']) *
            100))

        lmean2 = labels2['sr_highres'].mean()
        predmean2 = preds2['sr_predicted'].mean()
        devmean2 = -100 / lmean2 * (lmean2 - predmean2)
        print(
            'sr mean: {} | predicted mean: {} | pred. deviation from sr: {}%'.
            format(lmean2, predmean2, devmean2))

        try:
            importances = regressor.feature_importances_
            vs.feature_plot(importances, X_train, y_train)
        except AttributeError:
            print('')
            print('No feature importance available for this learner')
            print('')
            print('')
        if type(e).__name__ == "XGBRegressor":
            fig, ax = plt.subplots(1, 1, figsize=(8, 13))
            plot_importance(regressor, ax=ax)
            plt.show()

    return regressor, preds, preds2
Code example #11
def plot_kfold_split_score(features, labels, valid_features, valid_labels,
                           reglist, n_Splits):
    kfold = KFold(n_splits=n_Splits, random_state=0, shuffle=True)

    for e in reglist:
        score_l = []
        print(e)
        for train_index, test_index in kfold.split(features):
            X_train, X_test = features.loc[train_index], features.loc[
                test_index]
            y_train, y_test = labels.loc[train_index], labels.loc[test_index]
            y_train = y_train.values.ravel(
            )  # change column vector to 1d array to avoid conversion warning @ regressor.fit()

            regressor = e
            start = time.time()
            regressor.fit(X_train, y_train)
            elapsed = time.time() - start
            print("time to fit: %f" % (elapsed))
            score = regressor.score(X_test, y_test)
            score_l.append(score)
        print('')
        print "Regressor R2 score on the validation set: {:.4f}".format(score)
        # print('size of the training set (x,y)', np.shape(X_train), np.shape(y_train))
        # print('size of the test set (x,y)', np.shape(X_test), np.shape(y_test))
        print('')

        if score > 0.2:

            preds = regressor.predict(features)
            preds = pd.DataFrame(preds)

            # print('mean of orig. sr. (validation data set)', valid_labels.sr_highres.mean())
            print('mean of predicted sr. (validation data set)',
                  preds.values.mean())

            plt.figure(figsize=(6, 3))
            plt.plot(score_l)
            plt.ylabel("R2 Score")
            plt.xlabel("number of ShuffleSplits")
            plt.show()

            fig = plt.figure(figsize=(18, 3))
            labels['sr_highres'].plot()
            preds[0].plot()
            plt.ylim(-4, 4)
            plt.title('labels vs. predictions')
            plt.legend(loc='best')
            plt.show()

            print('validation set r2 score:',
                  r2_score(labels['sr_highres'], preds[0]))
            print('validation set mean squared error: {0:.2f}%'.format(
                mean_squared_error(labels['sr_highres'], preds[0]) * 100))

            preds2 = regressor.predict(valid_features)
            preds2 = pd.DataFrame(preds2)

            fig = plt.figure(figsize=(18, 3))
            valid_labels['sr_highres'].plot()
            preds2[0].plot()
            plt.ylim(-4, 4)
            plt.show()

            print('test set r2 score:',
                  r2_score(valid_labels['sr_highres'], preds2[0]))
            print('test set mean squared error: {0:.2f}%'.format(
                mean_squared_error(valid_labels['sr_highres'], preds2[0]) *
                100))
            try:
                importances = regressor.feature_importances_
                vs.feature_plot(importances, X_train, y_train)
            except AttributeError:
                print('')
                print('No feature importance available for this learner')
                print('')
                print('')
            if type(e).__name__ == "XGBRegressor":
                fig = plt.figure(figsize=(15, 5))
                plot_importance(regressor)
                plt.show()

    return regressor, preds, preds2
Code example #12
def plot_time_split_score(features, labels, valid_features, valid_labels,
                          reglist, n_TSSplits):
    #def plot_time_split_score(features, labels, reglist, n_TSSplits):
    tscv = TimeSeriesSplit(n_splits=n_TSSplits)

    for e in reglist:
        score_l = []
        print(e)
        for train_index, test_index in tscv.split(features):
            X_train, X_test = features.loc[train_index], features.loc[
                test_index]
            y_train, y_test = labels.loc[train_index], labels.loc[test_index]
            y_train = y_train.values.ravel(
            )  # change column vector to 1d array to avoid conversion warning @ regressor.fit()

            regressor = e
            regressor.fit(X_train, y_train)
            score = regressor.score(X_test, y_test)
            score_l.append(score)
        print('')
        print "Regressor R2 score on the test set: {:.4f}".format(score)
        #print('size of the training set (x,y)', np.shape(X_train), np.shape(y_train))
        #print('size of the test set (x,y)', np.shape(X_test), np.shape(y_test))
        print('')

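        # The -20.20 threshold effectively leaves the plotting branch enabled for any realistic R2 score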
        if score > -20.20:

            preds = regressor.predict(features)
            preds = pd.DataFrame(preds)

            #print('mean of orig. sr. (validation data set)', valid_labels.sr_highres.mean())
            print('mean of predicted sr. (validation data set)',
                  preds.values.mean())

            plt.figure(figsize=(6, 3))
            plt.plot(score_l)
            plt.ylabel("R2 Score")
            plt.xlabel("number of TimeSeriesSplits")
            plt.show()

            fig = plt.figure(figsize=(18, 3))
            labels['sr_highres'].plot()
            preds[0].plot()
            plt.ylim(-4, 4)
            plt.title('labels vs. predictions')
            plt.legend(loc='best')
            plt.show()
            print('test set labels mean sr:', labels.mean())
            print('test set predicted mean sr:', preds.mean())

            preds2 = regressor.predict(valid_features)
            preds2 = pd.DataFrame(preds2)
            #
            fig = plt.figure(figsize=(18, 3))
            valid_labels['sr_highres'].plot()
            preds2[0].plot()
            plt.ylim(-4, 4)
            plt.show()
            print('validation set labels mean sr:', valid_labels.mean())
            print('validation set predicted mean sr:', preds2.mean())
            try:
                importances = regressor.feature_importances_
                vs.feature_plot(importances, X_train, y_train)
            except AttributeError:
                print('')
                print('No feature importance available for this learner')
                print('')
                print('')

    return regressor, preds
Code example #13
# Make predictions using the unoptimized and optimized models
predictions = (clf.fit(X_train,  y_train.values.ravel())).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-after scores
print "Unoptimized model\n------"
print "Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions))
print "F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5))
print "\nOptimized Model\n------"
print "Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))
print "Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5))




# Import a supervised learning model that has 'feature_importances_'
from sklearn.tree import DecisionTreeClassifier

# Train the supervised model on the training set 
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Extract the feature importances
importances = model.feature_importances_

# Plot
vs.feature_plot(importances, X_train, y_train)


Code example #14
File: census2.py Project: robinshie/udalearn
    fbeta_score(y_val, predictions, beta=0.5))
print "\nOptimized Model\n------"
print "Final accuracy score on the validation data: {:.4f}".format(
    accuracy_score(y_val, best_predictions))
print "Final F-score on the validation data: {:.4f}".format(
    fbeta_score(y_val, best_predictions, beta=0.5))

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

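# Compare importances from the fresh RandomForest against those of the tuned classifier (best_clf)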
importances = model.feature_importances_
importances_AdaBoost = best_clf.feature_importances_

vs.feature_plot(importances, X_train, y_train)
vs.feature_plot(importances_AdaBoost, X_train, y_train)

from sklearn.base import clone

X_train_reduced = X_train[X_train.columns.values[(
    np.argsort(importances)[::-1])[:5]]]
X_val_reduced = X_val[X_val.columns.values[(
    np.argsort(importances)[::-1])[:5]]]

clf_on_reduced = (clone(best_clf)).fit(X_train_reduced, y_train)

reduced_predictions = clf_on_reduced.predict(X_val_reduced)

print "Final Model trained on full data\n------"
print "Accuracy on validation data: {:.4f}".format(