def linear_regression_model(train, validation, alpha, depth=None):
    """Fit ridge, decision-tree and random-forest regressors; print and
    return their R^2 and MSE on the train and validation splits.

    Parameters
    ----------
    train, validation : dict-like with 'X' (DataFrame) and 'y' entries.
        The first column of X is dropped — presumably an id/index column;
        TODO confirm with the caller.
    alpha : float
        Assigned to every model's ``alpha`` attribute before fitting.
        Only Ridge uses it (it overrides the alpha=0.1 passed below); on
        the tree models it just sets an unused attribute.
    depth : int or None
        ``max_depth`` for the tree-based models.

    Returns
    -------
    dict mapping 'type'/'model'/'score_train'/'score_valid'/'mse_train'/
    'mse_valid' to parallel lists, one entry per model.

    NOTE(review): ``fit_intercept``, ``normalize``, ``max_iter``, ``tol``
    and ``random_state`` are not defined in this function — presumably
    module-level globals; verify before running.
    """

    # Drop the first feature column; flatten targets to 1-D arrays.
    X_train = train['X'].values[:, 1:]
    y_train = np.ravel(train['y'].values)
    X_validation = validation['X'].values[:, 1:]
    y_validation = np.ravel(validation['y'].values)
    models = {
        'type': ['ridge', 'decision tree', 'random forest'],
        'model': [
            Ridge(alpha=0.1,
                  fit_intercept=fit_intercept,
                  normalize=normalize,
                  max_iter=max_iter,
                  tol=tol,
                  random_state=random_state),
            dt(criterion='mse',
               splitter='best',
               max_depth=depth,
               min_samples_split=2,
               min_samples_leaf=1,
               random_state=random_state),
            rfr(n_estimators=100,
                criterion='mse',
                max_depth=depth,
                min_samples_split=2,
                min_samples_leaf=1,
                random_state=random_state)
        ],
        'score_train': [],
        'score_valid': [],
        'mse_train': [],
        'mse_valid': []
    }
    y_train_predict = []
    y_valid_predict = []
    for i in np.arange(0, len(models['type']), 1):
        m = models['model'][i]
        # Overrides Ridge's alpha=0.1 above; harmless no-op for the trees.
        m.alpha = alpha
        m.fit(X_train, y_train)
        models['score_train'].append(m.score(X_train, y_train))
        models['score_valid'].append(m.score(X_validation, y_validation))
        y_train_predict.append(m.predict(X_train))
        y_valid_predict.append(m.predict(X_validation))
        models['mse_train'].append(mse(y_train, y_train_predict[i]))
        models['mse_valid'].append(mse(y_validation, y_valid_predict[i]))
    print('models: ', models['type'])
    print('R2 training:', models['score_train'])
    print('R2 validation:', models['score_valid'])
    print('MSE training:', models['mse_train'])
    print('MSE validation:', models['mse_valid'])
    return models
def DecisionTree(data_directory, model_dir, features):
    """Grid-search a DecisionTreeRegressor, persist the best model, and
    print a predicted-vs-actual band-gap comparison.

    Parameters
    ----------
    data_directory : passed through to ``pre`` for preprocessing.
    model_dir : directory the fitted model is dumped into.
    features : feature list; ``pre`` may return an updated copy.
    """
    X_train, X_test, y_train, y_test, predict_X, features = pre(data_directory, features)
    os.chdir(model_dir)  # NOTE: changes the process-wide working directory
    model = dt(random_state=1)
    grid = gs(estimator=model, param_grid={'criterion': ['mse', 'friedman_mse', 'mae'], 'splitter': ['best', 'random'],
                                           'max_features': ['auto', 'sqrt', 'log2']}, cv=5)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    # Score the held-out split once and reuse it (was computed twice before).
    test_score = grid.best_estimator_.score(X_test, y_test)
    print(test_score)
    joblib.dump(grid.best_estimator_, 'dtr_%d_%.4f.m' % (len(features), test_score))

    df = pd.DataFrame(columns=['ml_bandgap', 'pbe_bandgap'])
    df['pbe_bandgap'] = y_test
    df['ml_bandgap'] = grid.best_estimator_.predict(X_test)
    print(df)
Ejemplo n.º 3
0
def doit(inp, k):
    """Train a depth-10 decision tree on the first 10k samples and return,
    for each row of ``inp``, the top-``k`` class labels ordered by
    descending predicted probability.

    NOTE(review): test_x/test_y (rows 9000:10000) overlap the training
    slice 0:10000 and are never used in this function — confirm whether
    that is intentional.
    """
    x, y = loadData("train", 225)
    x = x.toarray()  # densify the sparse feature matrix
    train_x = x[0:10000]
    train_y = y[0:10000]

    test_x = x[9000:10000]
    test_y = y[9000:10000]

    model = dt(max_depth=10)
    model.fit(train_x, train_y)
    ret = model.predict_proba(X=inp)
    predict = model.predict(X=inp)
    clas = model.classes_
    for i in range(ret.shape[0]):
        # Overwrite each probability row with the class labels sorted by
        # ascending probability — the most likely class ends up last.
        ret[i] = clas[np.argsort(ret[i])]
        #  print(predict[i],ret[i][ret[0].size-1:ret[0].size])
    # Take the last k columns (the k most likely labels) and flip so the
    # most likely label comes first.
    return np.flip(ret[:, ret[0].size - k:ret[0].size], axis=1)
Ejemplo n.º 4
0
def import_lib(models):
    """Instantiate a default classifier for each recognised model name.

    Imports are performed lazily inside the matching branch so that
    frameworks for unrequested models are never loaded.  Unrecognised
    names are skipped silently.

    Returns a dict mapping model name -> estimator instance.
    """
    instantiated = {}

    for name in models:
        # tree-based models
        if name == "Decision Tree":
            from sklearn.tree import DecisionTreeClassifier as dt
            instantiated[name] = dt()

        # linear models
        elif name == "Logistic Regression":
            from sklearn.linear_model import LogisticRegression as lr
            instantiated[name] = lr()

    return instantiated
Ejemplo n.º 5
0

# In[24]:

# In[22]:


from sklearn.tree import DecisionTreeClassifier as dt


# In[25]:

# In[23]:


model=dt(criterion='entropy')


# In[26]:

# In[24]:


model.fit(train[predictors],train[target])


# In[27]:

# In[25]:

Ejemplo n.º 6
0
plt.legend(loc='upper left')
plt.savefig('iris_petal_lengthvswidth.png')
plt.close()

# Standardise features: fit on the training split only, then apply the
# same transform to both splits.
mean_scaler = StandardScaler()
# use fit to estimate mean and var for X_train individual features
mean_scaler.fit(X_train)
# BUG FIX: the transformed arrays were previously discarded, so the print
# below (and the tree) actually saw unscaled data.
X_train = mean_scaler.transform(X_train)
X_test = mean_scaler.transform(X_test)

print('scaled train:\n{}\nscaled test:\n{}'.format(X_train[:2], X_test[:2]))

# single decision tree using a criterion and max_depth for regularization
tree = dt(
    criterion='entropy',
    max_depth=6,
    random_state=7
)
print('DT params:\n{}'.format(tree))

# using scaled features for better decision boundary viz
tree.fit(X_train, Y_train)

# check decision boundary for test points with 5 max depth of tree
# test labels 45 105 -> 150
plot_decision_region(X_comb, Y_comb, clsfr=tree, test_idx=range(105,150))
plt.xlabel('Petal length(cm)')
plt.ylabel('Petal Width(cm)')
plt.legend(loc='upper left')
plt.title('0->setosa   1->versicolor   2->virginica')
plt.savefig('dt_6dentropy_iris_petal_length_width.png')
Ejemplo n.º 7
0
# In[102]:

from sklearn.model_selection import train_test_split

# 90/10 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)

# # Decision Tree Classification Model

# In[111]:

from sklearn.tree import DecisionTreeClassifier as dt
model = dt(max_depth=10, random_state=100)
model.fit(x_train, y_train)

# # Feature Scaling

# In[112]:

# NOTE(review): scaling happens AFTER the tree above was fitted, so the
# model was trained on unscaled data (harmless for trees, but a later
# predict on the now-scaled x_test would be inconsistent) — confirm intent.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
print(x_train)

# # Predicting Model

# In[113]:
Ejemplo n.º 8
0
from sklearn.datasets import load_iris as ir
import numpy as np
from sklearn.tree import DecisionTreeClassifier as dt

"""	0 - sat
	1 - ves
	2 - vt
"""
# Train a decision tree on iris with one held-out sample per class,
# then compare predictions on the held-out samples to their labels.
iris=ir()

# The dataset is ordered 50/50/50, so these indices pick one sample
# from each of the three classes.
holdout=[0,50,100]

xtrain = np.delete(iris.data, holdout, axis=0)
ytrain = np.delete(iris.target, holdout)

xtest = iris.data[holdout]
ytest = iris.target[holdout]

clf = dt()
clf.fit(xtrain, ytrain)

print(ytest)
print("prediction =  ", clf.predict(xtest))

#Accuracy of Naive bayes classifier
print('accuracy by Naive Bayes classifier with PCA =',
      accuracy_score(Y_test, y_predict_NB))
a_NB = accuracy_score(Y_test, y_predict_NB)

#Matthews correlation coefficient for Naive bayes classifier
m_NB = matthews_corrcoef(Y_test, y_predict_NB)
print(
    'Mathew\'s correlation coefficient for Naive Bayes classifier with PCA =',
    m_NB)

# In[29]:

#Decision Tree Classifier without PCA _withoutpca
from sklearn.tree import DecisionTreeClassifier as dt
classifier_withoutpca = dt(criterion='entropy', random_state=0)

classifier_withoutpca.fit(X_train, y_train)

y_predict_DT_withoutpca = classifier_withoutpca.predict(X_test)

#Accuracy of Decision Tree Classifier
# NOTE(review): the NB section above uses `Y_test` while this section uses
# `y_test` — presumably different label arrays from earlier cells; verify.
a_DT_withoutpca = accuracy_score(y_test, y_predict_DT_withoutpca)
print('accuracy by Decision Tree without PCA=',
      accuracy_score(y_test, y_predict_DT_withoutpca))

#Matthews correlation coefficient for Decision Tree Classifier
m_DT_withoutpca = matthews_corrcoef(y_test, y_predict_DT_withoutpca)
print(
    'Matthew\'s correlation coefficient for Decision Tree Classifier without PCA =',
    m_DT_withoutpca)
Ejemplo n.º 10
0
df_cancer = cancer()

# Inspect the available keys of the sklearn Bunch object.
df_cancer.keys()
# dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

# 2. train set, test set split

x_train, x_test, y_train, y_test = train_test_split(df_cancer['data'],
                                                    df_cancer['target'],
                                                    random_state=0)

# 3. Data learning
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.tree import DecisionTreeRegressor as dt_r

m_dt = dt()
m_dt.fit(x_train, y_train)

# 4. Model evaluation

m_dt.score(x_test, y_test)  # 88%

# 5. Parameter tuning

# Sweep min_samples_split over 2..20 and record train/test accuracy
# to gauge over-/under-fitting.
score_train = []
score_test = []
for i in np.arange(2, 21):
    m_dt = dt(min_samples_split=i)
    m_dt.fit(x_train, y_train)
    score_train.append(m_dt.score(x_train, y_train))
    score_test.append(m_dt.score(x_test, y_test))
# NOTE(review): `predicts` is only assigned further below — this first line
# appears to come from a different notebook cell and would raise NameError
# if the snippet is run top-down; verify the intended run order.
accuracy = accuracy_score(label_test, predicts)
print('Accuracy of Naive Bayes classifier :',accuracy) # =82%

# 3.Using Decision Tree Induction Classification

# First we need to convert continuous values into categorical bins where we can
print(max(featuers.age),min(featuers.age)) # to know bins range #70 #32
ageCategory=pd.cut(featuers.age,bins=[0,17,32,65,100],labels=['child','teenager','adult','elderly'])
cigsPerDayCategory=pd.cut(featuers.cigsPerDay,bins=[-1,2.0,5.0,7.0,20.0],labels=['low','medium','high','veryHigh'])
featuers.insert(2,"ageCategory",ageCategory)
featuers.insert(4,"cigsPerDayCategory",cigsPerDayCategory)
# Drop the continuous originals now that the binned columns exist
del featuers['age']
del featuers['cigsPerDay']
''' All totChol,sysBP,diaBP,BMI,heartRate,glucose has continuous values between : -0.8,+0.8'''
continuousValuesWithSameRange=["totChol","sysBP","diaBP","BMI","heartRate","glucose"]
for column in continuousValuesWithSameRange:
    columnCategory = pd.cut(featuers[column], bins=[-0.9, -0.3,4,9],
                            labels=['low', 'medium', 'high'])
    # Insert each binned column at position 13, then drop the original
    featuers.insert(13, column+"Category",columnCategory)
    del featuers[column]
print("Categorial data set:",featuers) # NOW Data is ready to use Decision Tree Induction Classification
from sklearn.tree import DecisionTreeClassifier as dt
print("here")
model = dt(random_state=1)
model.fit(feature_train, label_train)
predicts=model.predict(feature_test)
print("PREDICT RESULT Using Decision Tree Induction:",predicts)
accuracy = accuracy_score(label_test, predicts)
print('Accuracy of Decision Tree Induction classifier :',accuracy) # =77%
Ejemplo n.º 12
0
# In[124]:

# NOTE(review): despite the name, this is a confusion matrix, not an
# accuracy score.
accuracy = confusion_matrix(y_test, clf_pred)
accuracy

# In[127]:

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, clf_pred)
accuracy

# # This example determines accuracy via a Decision Tree

# In[130]:

# Default-parameter decision tree (dt is imported in an earlier cell).
clf_DT = dt()

# In[131]:

clf_DT.fit(X_train, y_train)

# In[132]:

clf_dt_pred = clf_DT.predict(X_test)

# In[133]:

clf_dt_pred

# In[134]:
Ejemplo n.º 13
0
def main():
    """Train SVM, two decision trees and LDA (plus a random-forest extra),
    then print and plot precision / recall / F-measure on the test split.
    """
    train, test = load("cancer-data-train.csv"), load("cancer-data-test.csv")
    X_train, y_train = train
    X_test, y_test = test
    X_train, X_test, print_pred = arguments(sys.argv, X_train, X_test)
    fig = plot.figure()

    # Cross-validation curves to pick C and max_leaf_nodes; also draws
    # the first two subplots.
    classifier_plotter(X_train, y_train)

    # One subplot per score family.
    ax1 = fig.add_subplot(234)
    ax1.set_title('Average Precsion Scores')
    ax1.set_ylabel('Precsion Score')
    ax1.set_xlabel('Classifier')
    ax2 = fig.add_subplot(235)
    ax2.set_title('Average Recall Scores')
    ax2.set_ylabel('Recall Score')
    ax2.set_xlabel('Classifier')
    ax3 = fig.add_subplot(236)
    ax3.set_title('Average F-measures')
    ax3.set_ylabel('F-measure')
    ax3.set_xlabel('Classifier')

    # Create the classifiers, then train them (same order as before).
    classifier_svm = svm(kernel='linear', C=0.1)
    classifier_gini = dt(criterion='gini', max_leaf_nodes=10)
    classifier_ig = dt(criterion='entropy', max_leaf_nodes=5)
    classifier_lda = lda()
    classifier_svm.fit(X_train, y_train)
    classifier_gini.fit(X_train, y_train)
    classifier_ig.fit(X_train, y_train)
    classifier_lda.fit(X_train, y_train)

    # Make the predictions
    pred_svm = classifier_svm.predict(X_test)
    pred_gini = classifier_gini.predict(X_test)
    pred_ig = classifier_ig.predict(X_test)
    pred_lda = classifier_lda.predict(X_test)

    # Calculate the precision, recall, f-measure
    avg_precision_svm = average_precision_score(y_test, pred_svm)
    avg_precision_gini = average_precision_score(y_test, pred_gini)
    avg_precision_ig = average_precision_score(y_test, pred_ig)
    avg_precision_lda = average_precision_score(y_test, pred_lda)
    recall_svm = recall_score(y_test, pred_svm, average='weighted')
    recall_gini = recall_score(y_test, pred_gini, average='weighted')
    recall_ig = recall_score(y_test, pred_ig, average='weighted')
    recall_lda = recall_score(y_test, pred_lda, average='weighted')
    f_svm = f1_score(y_test, pred_svm, average='weighted')
    f_gini = f1_score(y_test, pred_gini, average='weighted')
    f_ig = f1_score(y_test, pred_ig, average='weighted')
    f_lda = f1_score(y_test, pred_lda, average='weighted')

    ################## Extra Credit #########################
    # Train classifier and make predictions on test set
    classifier_rfc = rfc(n_estimators=100, max_depth=2)
    classifier_rfc.fit(X_train, y_train)
    pred_rfc = classifier_rfc.predict(X_test)

    # Calculate precision, recall and f-measure for Random Forest Classifier
    avg_precision_rfc = average_precision_score(y_test, pred_rfc)
    recall_rfc = recall_score(y_test, pred_rfc, average='weighted')
    f_rfc = f1_score(y_test, pred_rfc, average='weighted')
    #########################################################

    precisions = [avg_precision_svm, avg_precision_gini, avg_precision_ig,
                  avg_precision_lda, avg_precision_rfc]
    recalls = [recall_svm, recall_gini, recall_ig, recall_lda, recall_rfc]
    fmeasures = [f_svm, f_gini, f_ig, f_lda, f_rfc]
    predictions = [pred_svm, pred_gini, pred_ig, pred_lda, pred_rfc]

    # Printing scores and predictions
    print_scores([precisions, recalls, fmeasures])
    print_predictions(predictions, print_pred)

    # Create the graphs for the scores
    score_plotter(ax1, precisions)
    score_plotter(ax2, recalls)
    score_plotter(ax3, fmeasures)

    plot.tight_layout(w_pad=1.5, h_pad=2.0)
    plot.show()
Ejemplo n.º 14
0
def classifier_plotter(X_train, y_train):
    '''
	Takes the training data and runs through SVM, DT-Gini and DT-IG with multiple C values and max_leaf_nodes to try.
	The method then creates a graph by taking the average of cross validation scores for that C value or max_leaf_node.

	Params:
	X_train: 
		List/s of features already standardized from the initial dataset
	y_train: 
		List of classifiers for X_train taken from the original dataset

	Return:
	Outputs a graph of the average cross validation scores.
	'''
    # i counts processed classifiers (never read); d gates subplot creation
    # and is reset to 0 after each classifier block.
    i, d = 1, 0

    # Values to test
    c_values = [0.01, 0.1, 1, 10, 100]
    k_values = [2, 5, 10, 20]
    classifiers = ["SVM", "DT-Gini & DT-IG"]

    for clf in classifiers:
        count = 1
        if clf == "SVM":
            if d == 0:
                # First subplot: SVM F-measure vs C.
                ax = plot.subplot(231)
                ax.set_title(clf)
                plot.ylabel('F-measure')
                plot.xlabel('C values')
                d += 1
            print('SVM')
            for c in c_values:
                # 10-fold CV mean accuracy for each C value.
                classi = svm(kernel='linear', C=c).fit(X_train, y_train)
                scores = cross_val_score(classi, X_train, y_train, cv=10)
                ax.plot(str(c), scores.mean(), 'bs')
                print('%d.) %.4f%%' % (count, scores.mean() * 100))
                count += 1
            plot.axis([None, None, 0.90, 1])
            print('\n')
            i += 1
            d = 0

        elif clf == "DT-Gini & DT-IG":
            count = 1
            if d == 0:
                # Second subplot: both tree criteria vs max_leaf_nodes.
                ax = plot.subplot(232)
                plot.ylabel('F-measure')
                plot.xlabel('Max Leaf Nodes')
            print('    Gini\tIG')
            for k in k_values:
                # One gini and one entropy tree per leaf-node budget.
                gini_class, ig_class = dt(criterion='gini',
                                          max_leaf_nodes=k), dt(
                                              criterion='entropy',
                                              max_leaf_nodes=k)
                score_gini, score_ig = cross_val_score(gini_class,
                                                       X_train,
                                                       y_train,
                                                       cv=10), cross_val_score(
                                                           ig_class,
                                                           X_train,
                                                           y_train,
                                                           cv=10)
                ax.plot(str(k), score_gini.mean(), 'r.', str(k),
                        score_ig.mean(), 'g.')
                print('%d.) %.4f%%\t%.4f%%' %
                      (count, score_gini.mean() * 100, score_ig.mean() * 100))
                count += 1
            plot.axis([None, None, 0.889, 0.96])
            ax.legend(('Gini', 'IG'), loc=2)
            print('\n')
            i += 1
            d = 0

        else:
            # Unreachable with the fixed classifiers list above; note that
            # this returns a string rather than raising.
            return "Should not get here."
0
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

# NOTE: Python 2 print statements — this snippet requires Python 2.
print len(features_train[0])

from sklearn.tree import DecisionTreeClassifier as dt
clf = dt(min_samples_split=40)
t0 = time()
clf.fit(features_train[:], labels_train[:])
print 'Training time:', round(time() - t0, 3), 's'

t0 = time()
pred = clf.predict(features_test).tolist()
# Count predictions per class (class 1 / class 0, named per author).
print 'Chis Occurrences:', pred.count(1)
print 'Shara Occurrences:', pred.count(0)
# print 'Predictions: ', pred
print 'Predicting time:', round(time() - t0, 3), 's'

from sklearn.metrics import accuracy_score

t0 = time()
accuracy = accuracy_score(pred, labels_test)
# 23. Split train into training and validation dataset
X_train, X_test, y_train, y_test = train_test_split(
                                                    X,
                                                    target,
                                                    test_size = 0.3)

# 23.1
X_train.shape    # 43314 X 135  if no kmeans: (43314, 126)
X_test.shape     # 18564 X 135; if no kmeans: (18564, 126)


# 24 Decision tree classification
# 24.1 Create an instance of class
# min_samples_split / min_samples_leaf regularise the tree a little.
clf = dt(min_samples_split = 5,
         min_samples_leaf= 5
        )



# NOTE(review): time.time() requires `import time` (the module); an earlier
# snippet in this file does `from time import time` instead — these chunks
# come from different sources, verify the imports in scope.
start = time.time()
# 24.2 Fit/train the object on training data
#      Build model
clf = clf.fit(X_train, y_train)
end = time.time()
(end-start)/60                     # 1 minute

# 24.3 Use model to make predictions
classes = clf.predict(X_test)

# 24.4 Check accuracy
Ejemplo n.º 17
0
def GridSearchCV_hp_tuning(X_train, X_test, y_train, y_test):
    """Grid-search SVM, KNN and decision-tree classifiers, then sweep
    individual hyperparameters (C, kernel, max_depth, k) and plot the
    resulting accuracies.

    Parameters are the usual train/test splits.  Relies on a module-level
    ``Accuracy(predictions, truth)`` helper for the reported accuracies.
    """
    ##for SVM for best parameters
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    import matplotlib.pyplot as plt
    from sklearn.metrics import classification_report
    param_grid = {'C': [1, 10],
                  'gamma': ('auto','scale'),
                  'kernel': ['linear']}
    grid = GridSearchCV(SVC(), param_grid, cv=2)
    grid.fit(X_train, y_train)
    print('Best parameters: ', grid.best_params_)
    print('Best estimator: ', grid.best_estimator_)
    grid_predictions = grid.predict(X_test)
    acc = Accuracy(grid_predictions, y_test)
    print('Acc: ', acc)
    print(classification_report(y_test, grid_predictions))

    ##for Knn for best parameters
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier()
    params_knn = {'n_neighbors': [2,3,4,5],
                  'weights': ['uniform'],
                  'metric': ['euclidean']
                 }
    knn_grid = GridSearchCV(knn, params_knn, cv=3)
    knn_grid.fit(X_train, y_train)
    print('Best parameters: ', knn_grid.best_params_)
    print('Best estimator: ', knn_grid.best_estimator_)
    grid_predictions = knn_grid.predict(X_test)
    acc = Accuracy(grid_predictions, y_test)
    print('Acc: ', acc)
    print(classification_report(y_test, grid_predictions))

    ##for Decision Tree for best parameters
    from sklearn.tree import DecisionTreeClassifier as dt
    clf = dt()
    param_grid = {'max_depth':[1,2,3],
                  'min_samples_leaf':[1,2,3,4,5],
                  'min_samples_split':[2,3,4],
                  'criterion':['gini','entropy']
                 }
    grid = GridSearchCV(clf, param_grid, cv=10)
    grid.fit(X_train, y_train)
    print('Best parameters: ', grid.best_params_)
    print('Best estimator: ', grid.best_estimator_)
    grid_predictions = grid.predict(X_test)
    acc = Accuracy(grid_predictions, y_test)
    print('Acc: ', acc)
    print(classification_report(y_test, grid_predictions))

    ##Tuning the hyperparameters
    ##for SVM: parameter C — one single-value grid per C so each run is
    ##printed and plotted separately.
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import classification_report
    c_values = [0.1, 1, 10 , 100]
    acc = []
    for i in c_values:
        param_grid = {'C': [i],
                      'gamma': ('auto','scale'),
                      'kernel': ['linear']}
        grid = GridSearchCV(SVC(), param_grid, cv=2)
        grid.fit(X_train, y_train)
        print('Best parameters: ', grid.best_params_)
        print('Best estimator: ', grid.best_estimator_)
        grid_predictions = grid.predict(X_test)
        acc_1 = Accuracy(grid_predictions, y_test)
        acc.append(acc_1)
    xi = list(range(len(c_values)))
    plt.plot(xi, acc, marker='o', linestyle='--', color='r', label='acc')
    plt.xlabel('C values',fontweight="bold",fontsize = 12)
    plt.ylabel('accuracy',fontweight="bold",fontsize = 12)
    plt.title("C vs accuracy for GridSearchCV SVM",fontweight="bold",fontsize = 16)
    plt.xticks(xi, c_values)
    plt.legend()
    plt.show()

    ##for SVM: parameter kernel
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import classification_report
    kernel_values = ['linear', 'rbf']
    acc_k = []
    for i in kernel_values:
        # defining parameter range 
        param_grid = {'C': [10],
                      'gamma': ('auto','scale'),
                      'kernel': [i]}
        grid = GridSearchCV(SVC(), param_grid, cv=2)
        grid.fit(X_train, y_train)
        print('Best parameters: ', grid.best_params_)
        print('Best estimator: ', grid.best_estimator_)
        grid_predictions = grid.predict(X_test)
        acc_1 = Accuracy(grid_predictions, y_test)
        acc_k.append(acc_1)
    xi = list(range(len(kernel_values)))
    plt.plot(xi, acc_k, marker='o', linestyle='--', color='r', label='acc')
    plt.xlabel('kernel',fontweight="bold",fontsize = 12)
    plt.ylabel('accuracy',fontweight="bold",fontsize = 12)
    plt.title("kernels vs accuracy for GridSearchCV SVM",fontweight="bold",fontsize = 16)
    plt.xticks(xi, kernel_values)
    plt.legend()
    plt.show()

    ##for decision tree: parameter max_depth
    from sklearn.tree import DecisionTreeClassifier as dt
    max_depth_values = [1, 2, 3]
    acc_dep = []
    clf = dt()
    for i in max_depth_values:
        param_grid = {'max_depth':[i],
                      'min_samples_leaf':[1,2,3,4,5],
                      'min_samples_split':[2,3,4],
                      'criterion':['gini','entropy']}
        grid = GridSearchCV(clf, param_grid, cv=10)
        grid.fit(X_train, y_train)
        print('Best parameters: ', grid.best_params_)
        print('Best estimator: ', grid.best_estimator_)
        grid_predictions = grid.predict(X_test)
        acc = Accuracy(grid_predictions, y_test)
        acc_dep.append(acc)
    xi = list(range(len(max_depth_values)))
    plt.plot(xi, acc_dep, marker='o', linestyle='--', color='r', label='acc')
    plt.xlabel('max_depth values',fontweight="bold",fontsize = 12)
    plt.ylabel('accuracy',fontweight="bold",fontsize = 12)
    plt.title("max_depth vs accuracy for GridSearchCV Decision Tree",fontweight="bold",fontsize = 16)
    plt.xticks(xi, max_depth_values)
    plt.legend()
    plt.show()

    ##for Knn: parameter K
    knn = KNeighborsClassifier()
    acc_knn = []
    n_values = [2, 3, 4, 5]
    for i in n_values:
        params_knn = {'n_neighbors': [i],
                      'weights': ['uniform'],
                      'metric': ['euclidean']
                     }
        knn_grid = GridSearchCV(knn, params_knn, cv=3)
        knn_grid.fit(X_train, y_train)
        # BUG FIX: previously printed the stale `grid` (the decision-tree
        # search) instead of this KNN search's results.
        print('Best parameters: ', knn_grid.best_params_)
        print('Best estimator: ', knn_grid.best_estimator_)
        grid_predictions = knn_grid.predict(X_test)
        acc_1 = Accuracy(grid_predictions, y_test)
        print('acc: ',acc_1)
        acc_knn.append(acc_1)
    xi = list(range(len(n_values)))
    plt.plot(xi, acc_knn, marker='o', linestyle='--', color='r', label='acc')
    plt.xlabel('k values',fontweight="bold",fontsize = 12)
    plt.ylabel('accuracy',fontweight="bold",fontsize = 12)
    plt.title("k vs accuracy for GridSearchCV Knn",fontweight="bold",fontsize = 16)
    plt.xticks(xi, n_values)
    plt.legend()
    plt.show()
Ejemplo n.º 18
0
import pandas as pd
from sklearn.tree import DecisionTreeRegressor as dt
from sklearn.metrics import mean_absolute_error as me  # for calculating errors
from sklearn.ensemble import RandomForestRegressor as rf  #another model

# Fit a decision-tree regressor (then a random forest) on the iris table
# and report the in-sample mean absolute error.
iris_data = pd.read_csv('iris.csv')

# Features and target for the model y = f(x).
x = iris_data[['sepal length', 'sepal width', 'petal length', 'petal width']]
y = iris_data[['category']]

# Decision-tree model: train, spot-check a few predictions, measure error.
tree_model = dt()
tree_model.fit(x, y)
print(tree_model.predict(x.head()))
predicted_y = tree_model.predict(x)
# In-sample error — optimistic without a held-out test set.
print(me(y, predicted_y))

# randomforest model
forest_model = rf()
forest_model.fit(x, y)
print(forest_model.predict(x.head()))
Ejemplo n.º 19
0
            y[index] = 0
        else:
            y[index] = 1
    np.save('x.npy', x)
    np.save('y.npy', y)
else:
    print('Loading from x.npy, y.npy')
    x = np.load('x.npy')
    y = np.load('y.npy')


# Training
# Split into test/train
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Depth-capped tree to limit over-fitting on the churn data.
clf = dt(max_depth=10)
print("Starting decision tree training...")
clf = clf.fit(x_train, y_train)
print("DT result: ", clf.score(x_test, y_test))
target_names = ['Not Churn', 'Churn']
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred, target_names=target_names))

# Same evaluation with an RBF-kernel SVM for comparison.
print("Starting SVM training...")
clf = SVC(C=1, kernel='rbf', gamma=0.125,
            decision_function_shape='ovr')
clf = clf.fit(x_train, y_train)
print("SVM result: ", clf.score(x_test, y_test))
target_names = ['Not Churn', 'Churn']
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred, target_names=target_names))
Ejemplo n.º 20
0
# Fit the label encoder on the training labels, then encode them.
enc.fit(y_train)                   # Let the object learn data
y_tr = enc.transform(y_train)      # Let it encode
y_tr

# 2.3 Check mapping
enc.classes_     # array(['setosa', 'versicolor', 'virginica']
                 # Corresponds to 0,1,2
# 2.4 Verify:
enc.transform(['setosa','versicolor', 'virginica'])


# 3. Start modeling
# 3.1 Initialize our decision tree object.
#     Supply relevant parameters
ct = dt( criterion="gini",    # Alternative 'entropy'
         max_depth=None       # Alternative, specify an integer
                              # 'None' means full tree till single leaf
         )


# 3.2 Train our decision tree
c_tree = ct.fit(X_train,y_tr)

# 4.0 Make predictions of test data
# 4.1 First transform y_test into integers
#     just as in y_tr
#     We use the already trained enc() object
y_te = enc.transform(y_test)

# 4.2 Now make prediction
# NOTE(review): y_te is computed but never compared to `out` here —
# presumably the evaluation happens in a later cell; verify.
out = ct.predict(X_test)
out
Ejemplo n.º 21
0
import sklearn
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.model_selection import train_test_split

# Keep only the three relevant columns.
datasub = data[['Age', 'EstimatedSalary', 'Purchased']]
datasub.head(5)

X = datasub[['Age', 'EstimatedSalary']]
y = datasub['Purchased']
# shuffle=False keeps the original row order in the 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    shuffle=False,
                                                    test_size=0.2)
X_train

dtclf = dt()
dtclf.fit(X_train, y_train)

pred = dtclf.predict(X_test)

from sklearn.metrics import accuracy_score
# NOTE: accuracy_score's signature is (y_true, y_pred); the swapped order
# here does not change the accuracy value, but it is unconventional.
print('accuracy is ', accuracy_score(pred, y_test))

from sklearn.metrics import classification_report

df = pd.DataFrame({'Actual': y_test, 'Predicted': pred})

df

import matplotlib.pyplot as plt
Ejemplo n.º 22
0
        scores.append(sum(stats['test_score']) / len(stats['test_score']))
    for score in scores:
        print('monk', i, ':', score)
    print('-----------------------------')
    return sum(scores) / len(scores)


x1, y1 = read_data("monks-1.csv")
x2, y2 = read_data("monks-2.csv")
x3, y3 = read_data("monks-3.csv")

feats = [x1, x2, x3]
labs = [y1, y2, y3]

print('***Using 3-fold validation***')

# Compare four classifiers under 3-fold CV; keep the perceptron (expected
# worst) and decision-tree (expected best) scores for a t-test.
worst = show_stats(feats, labs, pct(max_iter=100, tol=0), 3, 'perceptron')
best = show_stats(feats, labs, dt(max_depth=10), 3, 'decision tree')
show_stats(feats, labs, knn(n_neighbors=3), 3, 'K-nearest-neighbors')
show_stats(feats, labs, gnb(), 3, 'Gaussian Naive Bayes')

print('t test between perceptron and decision tree:', ttest_ind(worst, best))

print('***Using Leave-one-out***')

# Same comparison under leave-one-out cross-validation.
worst = show_stats(feats, labs, pct(max_iter=50, tol=0), loo(), 'perceptron')
best = show_stats(feats, labs, dt(max_depth=10), loo(), 'decision tree')
show_stats(feats, labs, knn(n_neighbors=3), loo(), 'K-nearest-neighbors')
show_stats(feats, labs, gnb(), loo(), 'Gaussian Naive Bayes')

print('t test between perceptron and decision tree:', ttest_ind(worst, best))
#%% imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor as dt

#%% load the data (semicolon-separated)
df = pd.read_csv("dataset.csv", sep=";")

#%% split the two columns into single-column frames
temperature = pd.DataFrame(df.iloc[:, 0].values)
rainy = pd.DataFrame(df.iloc[:, 1].values)

#%% fit the decision tree
tree = dt()
tree.fit(temperature, rainy)

#%% dense grid over the temperature range for a smooth step plot
temp_grid = np.arange(min(df.iloc[:, 0].values),
                      max(df.iloc[:, 0].values), 0.001).reshape(-1, 1)

#%% model predictions on the grid
y_results = tree.predict(temp_grid).reshape(-1, 1)

#%% plot the observations and the fitted step function
plt.scatter(temperature, rainy, color="black")
plt.xlabel("Temperature")
plt.ylabel("Rainy")
plt.plot(temp_grid, y_results, color="gray")
plt.show()