Example no. 1
0
def display_decision_tree(train_x, valid_x, train_y, valid_y):
    """Fit a depth-limited decision tree, plot it, and print its metrics.

    Fits a ``DecisionTreeClassifier`` (max_depth=4) on the training split,
    visualizes it with dmba's ``plotDecisionTree``, prints precision /
    recall / F1 on the validation split plus accuracy on both splits, and
    finally prints the features ranked by impurity-based importance.

    Parameters
    ----------
    train_x, valid_x : pd.DataFrame
        Training / validation predictors; column names are used for the
        tree plot and the importance table.
    train_y, valid_y : array-like
        Training / validation target labels.
    """
    print('(9) Display Decision Tree\n')
    fullClassTree = DecisionTreeClassifier(max_depth=4, random_state=1)
    fullClassTree.fit(train_x, train_y)
    plotDecisionTree(fullClassTree, feature_names=train_x.columns)

    # Score the fitted tree on both splits.
    prediction_train = fullClassTree.predict(train_x)
    prediction_valid = fullClassTree.predict(valid_x)

    print('precision on test is:', precision_score(valid_y, prediction_valid))
    print('recall on test is:', recall_score(valid_y, prediction_valid))
    print('f1 on test is:', f1_score(valid_y, prediction_valid))
    # BUG FIX: these two labels previously said 'Logistic Regression'
    # although the scores are for the decision tree fitted above
    # (copy-paste mislabel).
    print('Decision Tree:Accuracy on train is:',
          accuracy_score(train_y, prediction_train))
    print('Decision Tree:Accuracy on test is:',
          accuracy_score(valid_y, prediction_valid), '\n')

    # Rank features by the tree's impurity-based importances.
    importances = fullClassTree.feature_importances_
    important_df = pd.DataFrame({
        'feature': train_x.columns,
        'importance': importances
    })
    important_df = important_df.sort_values('importance', ascending=False)
    print(important_df)
Example no. 2
0
# Report precision / recall / F1 of the logistic-regression predictions
# on both the training and the validation split.
_lr_metric_rows = [
    ("Precision_score train is:", precision_score, train_y, lr_prediction_train),
    ("Precision_score on test is:", precision_score, valid_y, lr_prediction_valid),
    ("Recall_score on train is:", recall_score, train_y, lr_prediction_train),
    ("Recall_score on test is:", recall_score, valid_y, lr_prediction_valid),
    ("f1_score on train is:", f1_score, train_y, lr_prediction_train),
    ("f1_score on test is:", f1_score, valid_y, lr_prediction_valid),
]
for _label, _metric, _y_true, _y_pred in _lr_metric_rows:
    print(_label, _metric(_y_true, _y_pred))

"""
Decision Tree
"""

# Fit a depth-4 classification tree on the training split.
DecisionTree = DecisionTreeClassifier(max_depth=4)
DecisionTree.fit(train_X, train_y)

# Render the fitted tree (dmba helper).
plotDecisionTree(DecisionTree, feature_names=train_X.columns)

# Feature-importance table, most important feature first.
importances = DecisionTree.feature_importances_
im = pd.DataFrame(
    {'feature': train_X.columns, 'importance': importances}
).sort_values('importance', ascending=False)
print(im)

# Predict on both splits with the fitted tree.
dt_prediction_train = DecisionTree.predict(train_X)
dt_prediction_valid = DecisionTree.predict(valid_X)

# Accuracy / precision / recall report.
print("Accuracy score on train is:", accuracy_score(train_y, dt_prediction_train))
print("Accuracy score on test is:", accuracy_score(valid_y, dt_prediction_valid))
print("Precision score on train is:", precision_score(train_y, dt_prediction_train))
print("Precision score on test is:", precision_score(valid_y, dt_prediction_valid))
print("Recall score on train is:", recall_score(train_y, dt_prediction_train))
# The package _scikit-learn_ has the class `DecisionTreeClassifier` to build a decision tree model. The function `plotDecisionTree` from the _dmba_ package can be used to visualize the tree.

# Load the loan sample; LOAN3000_CSV is a path constant defined elsewhere
# in the file (presumably a 3,000-row CSV — confirm against its definition).
loan3000 = pd.read_csv(LOAN3000_CSV)

# Two predictor columns and one outcome column.
predictors = ['borrower_score', 'payment_inc_ratio']
outcome = 'outcome'

X = loan3000[predictors]
y = loan3000[outcome]

# Entropy splitting criterion; min_impurity_decrease=0.003 suppresses splits
# that barely reduce impurity, keeping the tree small. random_state=1 makes
# the fit reproducible.
loan_tree = DecisionTreeClassifier(random_state=1,
                                   criterion='entropy',
                                   min_impurity_decrease=0.003)
loan_tree.fit(X, y)
# Graphical rendering of the fitted tree (dmba helper), labeled with the
# predictor names and the classes learned during fit.
plotDecisionTree(loan_tree,
                 feature_names=predictors,
                 class_names=loan_tree.classes_)

# Plain-text rendering of the same tree (dmba helper).
print(textDecisionTree(loan_tree))

### The Recursive Partitioning Algorithm

# Scatter plot of the two predictors; this first call draws the
# 'paid off' records as open circles (color C1) on a shared axis.
fig, ax = plt.subplots(figsize=(6, 4))

loan3000.loc[loan3000.outcome == 'paid off'].plot(x='borrower_score',
                                                  y='payment_inc_ratio',
                                                  style='.',
                                                  markerfacecolor='none',
                                                  markeredgecolor='C1',
                                                  ax=ax)
loan3000.loc[loan3000.outcome == 'default'].plot(x='borrower_score',
Example no. 4
0
        18,
        20,
    ],
}
# 5-fold cross-validated grid search over param_grid (defined above, partly
# outside this excerpt) for a regression tree; n_jobs=-1 uses all cores.
gridSearch = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5, n_jobs=-1)
gridSearch.fit(train_X, train_y)
print('Improved parameters: ', gridSearch.best_params_)

# Best tree found by the search, refit on the full training data.
regTree = gridSearch.best_estimator_

# Regression error summaries (dmba helper) on train and validation splits.
regressionSummary(train_y, regTree.predict(train_X))
regressionSummary(valid_y, regTree.predict(valid_X))

#plot reg tree

# Two renderings of the same tree: default orientation and rotated.
plotDecisionTree(regTree, feature_names=train_X.columns)
plotDecisionTree(regTree, feature_names=train_X.columns, rotate=True)

# In[48]:

#Classification Tree
retail = pd.read_csv('retailsales1.csv')

# Predictor and outcome column names for the retail classification tree.
predictors = ['inventorygrowth', 'populationgrowth']
outcome = 'yoygtenp'

# One-hot encode any categorical predictors, dropping the first level of
# each to avoid redundant dummy columns.
X = pd.get_dummies(retail[predictors], drop_first=True)
y = retail[outcome]

train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                      y,