Example #1
0
 def test_cross_validation_5_folds(self):
     """Check that 5-fold ``cross_validation`` agrees with scikit-learn.

     A fresh ``KNeighborsClassifier`` built with ``self.neighbors`` is scored
     with ``cross_val_score`` on ``self.x`` / ``self.y``; each fold score
     must equal the one returned by ``self.classifier.cross_validation``.
     """
     folds = 5
     model = KNeighborsClassifier(n_neighbors=self.neighbors)
     expected = cross_val_score(model, self.x, self.y, cv=folds)
     returned = self.classifier.cross_validation(folds)
     # Compare lengths first: the element-wise loop alone would not notice
     # a result that carries extra folds.
     self.assertEqual(len(expected), len(returned))
     for expected_score, returned_score in zip(expected, returned):
         self.assertEqual(expected_score, returned_score)
def eval_linear(data_set, test_size=0.4):
    """Train a linear regression on a hold-out split and report its test error.

    Args:
        data_set: Object whose ``load_training_data()`` returns ``(x, y)``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Prints the mean squared error of the fitted model on the test split and
    shows a scatter plot of true versus predicted test targets.
    """
    # Imported locally so this block does not rely on a file-level import.
    from sklearn.metrics import mean_squared_error

    # load training data from feature matrix
    x, y = data_set.load_training_data()

    # split data into train and test set
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=test_size,
                                                        shuffle=True,
                                                        random_state=0)

    # train model on train set
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
    # removed in 1.2 -- confirm the pinned version before upgrading.
    model = linear_model.LinearRegression(normalize=True)
    model.fit(x_train, y_train)

    # evaluate on test set: score the model fitted above. cross_val_score
    # would clone the estimator and refit it on folds of the test data, so
    # its result would not describe the model trained on the train split.
    predict = model.predict(x_test)
    print('Mean squared error: {}'.format(mean_squared_error(y_test, predict)))

    # plot true targets against the same predictions that were scored
    plt.scatter(y_test, predict)
    plt.show()
Example #3
0
def runsvc(kernel, data, target, nfolds):
    """Cross-validate an SVC with the given kernel and time the run.

    Args:
        kernel: Kernel passed to ``svm.SVC``.
        data: Feature matrix passed to ``cross_val_score``.
        target: Labels passed to ``cross_val_score``.
        nfolds: Number of stratified folds.

    Returns:
        Tuple ``(scores, seconds)``: the value returned by
        ``cross_val_score`` and the elapsed time of that call in seconds.
    """
    # No random_state here: the folds are not shuffled, so a seed has no
    # effect, and recent scikit-learn releases reject the combination with
    # a ValueError.
    skf = StratifiedKFold(n_splits=nfolds)
    svc = svm.SVC(C=1, kernel=kernel)

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    ret = cross_val_score(svc, data, target, cv=skf, n_jobs=-1)
    elapsed = time.perf_counter() - start

    return ret, elapsed
Example #4
0
    def evalModel(self):
        """Log the mean score of a 10-fold, unshuffled cross-validation.

        The estimator is obtained from ``self.mlEngine.getEstimator()`` and
        scored on ``self.totalFeatureMatrix`` / ``self.totalLabels``.
        """
        Log(LOG_INFO) << "Evaluate CV score ..."

        splitter = KFold(n_splits=10, shuffle=False)
        estimator = self.mlEngine.getEstimator()
        fold_scores = cross_val_score(
            estimator,
            self.totalFeatureMatrix,
            self.totalLabels,
            cv=splitter,
            n_jobs=-1,
        )
        Log(LOG_INFO) << "CV accuracy: %f" % fold_scores.mean()
Example #5
0
 def cross_validation(self, folds=None):
     """Score ``self.model`` by cross-validation.

     When ``folds`` is given, delegates to ``cross_val_score`` on
     ``self.x`` / ``self.y`` with ``cv=folds`` and returns its result.

     When ``folds`` is ``None``, the model is fitted on ``(x1, y1)`` to
     predict ``x2`` and then fitted on ``(x2, y2)`` to predict ``x1``; the
     return value is ``[accuracy on (x1, y1), accuracy on (x2, y2)]``.
     """
     if folds is not None:
         return cross_val_score(self.model, self.x, self.y, cv=folds)

     # Fit on one partition and predict the other, then swap the roles.
     predicted_second = self.model.fit(self.x1, self.y1).predict(self.x2)
     predicted_first = self.model.fit(self.x2, self.y2).predict(self.x1)

     first_accuracy = accuracy_score(self.y1, predicted_first)
     second_accuracy = accuracy_score(self.y2, predicted_second)
     return [first_accuracy, second_accuracy]
def eval_cc_linear(train_data_set, test_data_set):
    """Evaluate a linear model trained on one data set against another.

    Args:
        train_data_set: Passed to ``train_linear`` to obtain the model.
        test_data_set: Object whose ``load_training_data()`` returns the
            ``(x, y)`` pair the model is evaluated on.

    Prints the mean squared error of the trained model on the test data set
    and shows a scatter plot of true versus predicted targets.
    """
    # Imported locally so this block does not rely on a file-level import.
    from sklearn.metrics import mean_squared_error

    # train model on train data set
    model = train_linear(train_data_set)

    # evaluate model on test data set (cross corpus): score the trained
    # model's own predictions. cross_val_score would clone the estimator
    # and refit it on folds of the test data, which never exercises the
    # model trained on the other corpus.
    x, y = test_data_set.load_training_data()
    predict = model.predict(x)
    print('Mean squared error: {}'.format(mean_squared_error(y, predict)))

    # plot true targets against the same predictions that were scored
    plt.scatter(y, predict)
    plt.show()
Example #7
0
        def score_pri(slices, x0, y0):
            """Score a feature selection with ``self.estimator``.

            Returns ``-np.inf`` when ``slices`` holds at most one entry.
            Otherwise the entries are expanded with ``self.feature_unfold``,
            the estimator is fitted on those columns of ``x0``, and the
            result is the mean of the estimator's ``best_score_`` when it
            has one, else the mean of a 5-fold ``cross_val_score``.
            """
            selected = list(slices)
            # Selections of zero or one entry are never fitted.
            if len(selected) <= 1:
                return -np.inf

            columns = self.feature_unfold(selected)
            subset_x = x0[:, columns]

            self.estimator.fit(subset_x, y0)
            if hasattr(self.estimator, 'best_score_'):
                raw_score = self.estimator.best_score_
            else:
                raw_score = cross_val_score(self.estimator, subset_x, y0, cv=5)
            return np.mean(raw_score)
def autoencoder_dim_tuning_graph():
    '''Cross-validate the autoencoder over several hidden-layer sizes and plot the errors.

    For each first-layer size, an ``Autoencoder`` is cross-validated (5
    shuffled folds) for every latent size from 4 up to that first-layer size.
    Each error is ``-10.0`` times the mean cross-validation score. One
    semilog curve is drawn per first-layer size and the collected errors are
    printed.
    '''
    scaledData = read_atoms_data() / 10 - 0.5
    splitter = KFold(n_splits=5, shuffle=True)
    errors = []

    # Only a first-layer size of 4 is swept; the wider sweep over
    # range(6, 16) is currently disabled.
    for layer1Dim in range(4, 5):
        print('LAYER 1 DIMENSIONALITY: ', layer1Dim)
        latent_sizes = range(4, layer1Dim + 1)
        row = []
        errors.append(row)
        for latent_size in latent_sizes:
            net = Autoencoder(hiddenDims=[layer1Dim, latent_size])
            fold_scores = cross_val_score(net, scaledData, cv=splitter)
            row.append(-10.0 * np.mean(fold_scores))
        plt.semilogy(latent_sizes, row, label=layer1Dim)
    print(errors)
Example #9
0
def eval_linear(data_set, test_size=0.4):
    """Cross-validate a linear regression and plot a hold-out fit.

    Args:
        data_set: Object whose ``load_training_data()`` returns ``(x, y)``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Prints the negated ``neg_mean_squared_error`` cross-validation scores,
    then fits a second model on a train split, prints its coefficients and
    the model itself, and shows one scatter plot of true versus predicted
    targets for the train split and one for the test split.
    """
    features, targets = data_set.load_training_data()

    # Cross-validated error on the complete data set.
    cv_model = LinearRegression(normalize=True)
    neg_mse = cross_val_score(cv_model,
                              features,
                              targets,
                              scoring='neg_mean_squared_error')
    print('Mean squared error: {}'.format(-neg_mse))

    # The hold-out split below is used only for the visualisation.
    x_train, x_test, y_train, y_test = train_test_split(
        features,
        targets,
        test_size=test_size,
        shuffle=True,
        random_state=0,
    )

    fitted = LinearRegression(normalize=True).fit(x_train, y_train)
    print(fitted.coef_)

    pprint(fitted)

    # One figure per split: train performance first, then test performance.
    splits = (('train', x_train, y_train), ('test', x_test, y_test))
    for title, inputs, truth in splits:
        predictions = fitted.predict(inputs)
        plt.figure()
        plt.title(title)
        plt.scatter(truth, predictions)
    plt.show()
Example #10
0
    ("Gradient Boosting", GradientBoostingClassifier()),
    ("Extra Trees", ExtraTreesClassifier()),
    # ("SVM", SVC(kernel="linear")),
    ("XGBOOST Classifer", XGBClassifier()),
]

## Model comparison ###
start = timeit.default_timer()

accuracies = []
for name, model in models:

    # kfold = model_selection.KFold(n_splits=10)

    cv_results = model_selection.cross_val_score(model, X, y, cv=5)
    precision = cross_val_score(model, X, y, cv=5, scoring="precision")
    recall = cross_val_score(model, X, y, cv=5, scoring="recall")
    f1 = cross_val_score(model, X, y, cv=5, scoring="f1")

    print(
        "\n ### Classifier :",
        name,
        " ###",
        "\nAccuracy :",
        cv_results.mean(),
        "\nprecision :",
        precision.mean(),
        "\nRecall :",
        recall.mean(),
        "\nF1 Score :",
        f1.mean(),
Example #11
0
 def cross_validation_leave_one(self):
     """Return the ``cross_val_score`` result for ``self.model`` using a ``LeaveOneOut`` splitter."""
     splitter = LeaveOneOut()
     return cross_val_score(self.model, self.x, self.y, cv=splitter)
Example #12
0
 def test_cross_validation_leave_one_out(self):
     """Check the leave-one-out mean score against scikit-learn directly.

     A fresh ``KNeighborsClassifier`` built with ``self.neighbors`` is scored
     with ``cross_val_score`` and ``LeaveOneOut``; its mean must almost equal
     the mean returned by ``self.classifier.cross_validation_leave_one``.
     """
     reference = KNeighborsClassifier(n_neighbors=self.neighbors)
     expected_scores = cross_val_score(
         reference, self.x, self.y, cv=LeaveOneOut())
     returned_scores = self.classifier.cross_validation_leave_one()
     expected_mean = expected_scores.mean()
     returned_mean = returned_scores.mean()
     self.assertAlmostEqual(expected_mean, returned_mean)
Example #13
0
    'Sex': train_df['Sex'].astype('int32'),
    'Age': train_df['Age'].astype('int32'),
    'Embarked': train_df['Embarked'].astype('int32')
}
# Assemble the selected columns (resultData is built above) into one frame.
result_df = pd.DataFrame(resultData)
# Task1 Q3: fit a gini decision tree on all rows and draw it.
resultDecisionTree = DecisionTreeClassifier(criterion='gini')
# Features are every column except the 'Survived' target.
X = result_df.drop('Survived', axis=1)
y = result_df['Survived']
resultDecisionTree.fit(X, y)
# Large canvas for the tree drawing.
fig = plt.figure(figsize=(35, 30))
plot_tree(resultDecisionTree, filled=True)
# Displaying the figure is currently disabled.
#plt.show()
# Task1 Q4: cross-validate a default decision tree (cross_val_score's
# default cv and scoring are used) and print the mean score.
clf = DecisionTreeClassifier()
scoresforDTC = cross_val_score(clf, X=X, y=y)
print("Average score of DTC:", scoresforDTC.mean())
# Task1 Q5: same evaluation for a default random forest.
rf = ensemble.RandomForestClassifier()
scoresforRFC = cross_val_score(rf, X=X, y=y)
print("Average score for RFC:", scoresforRFC.mean())

#HW1 As a reference########################################################################################################
# validPassenger=[0 for i in range(len(train_df.columns))]
# #Q7###########################################################
# for i in [5,6,7,9]:#Age, SibSp, Parch, Fare
#     validPassenger[i] = list(filter(lambda x: not pd.isnull(x), train_df[train_df.columns[i]]))
#     print(train_df.columns[i])
#     print('count ', len(validPassenger[i]))
#     print('mean ', sum(validPassenger[i])/len(validPassenger[i]))
#     print('std ', np.std(validPassenger[i]))
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
# cross_val_predict and cross_val_score come from the public
# sklearn.model_selection namespace; the underscore-prefixed _validation
# module is a private implementation detail.
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Load the iris data set and build a decision tree with a fixed seed.
iris = load_iris()
dt_clf = DecisionTreeClassifier(random_state=156)

data = iris.data
label = iris.target

# Accuracy of the tree under 3-fold cross-validation.
scores = cross_val_score(dt_clf, data, label, scoring='accuracy', cv=3)

# Per-fold accuracies, then their mean, both rounded to 4 decimals.
print(np.round(scores, 4))
print(np.round(np.mean(scores), 4))
Example #15
0
# Cross-validation evaluation using the default k-fold cross-validation.

import urllib.request

import numpy as np
from sklearn import preprocessing
# Kept so that any code still referring to `_validation` keeps working; the
# call below uses the public cross_val_score instead of the private module.
from sklearn.model_selection import _validation
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data"
# download the file and load the CSV as a numpy matrix; the context manager
# closes the HTTP response as soon as the data has been parsed
with urllib.request.urlopen(url) as raw_data:
    dataset = np.loadtxt(raw_data, delimiter=',')
# separate the data from the target attributes
X = dataset[:, 1:-1]
y = dataset[:, -1]
# normalize the data attributes
# NOTE(review): normalized_X is not used by the evaluation below, which
# scores the model on the raw X -- confirm which one is intended.
normalized_X = preprocessing.normalize(X)

# 10-fold cross-validation of a default decision tree
model = DecisionTreeClassifier()
score = cross_val_score(estimator=model, X=X, y=y, cv=10)
print(score)
Example #16
0
# CORRELATION MATRIX
# corr_matrix is a helper defined elsewhere; its return value is not used.
corr_matrix(X)

# PCA
# show_PCA is defined elsewhere; its result is used below as a feature matrix.
X_pca = show_PCA(X)

# LDA
# show_LDA is defined elsewhere; its result is used below as a feature matrix.
X_lda = show_LDA(X)

# Support Vector Machines
# For each C, print the mean 5-fold accuracy of a linear SVC on X, on the
# PCA output and on the LDA output.
C = [0.001, 0.01, 1, 10, 100]
for c in C:
    svm = SVC(kernel='linear', C=c)
    print("Values of C = ", c)
    print("X_stand avg_accuracy_score",
          np.mean(cross_val_score(svm, X, y, cv=5, scoring="accuracy")))
    print("X_PCA avg_accuracy_score",
          np.mean(cross_val_score(svm, X_pca, y, cv=5, scoring="accuracy")))
    print("X_LDA avg_accuracy_score",
          np.mean(cross_val_score(svm, X_lda, y, cv=5, scoring="accuracy")),
          "\n")

# Fit a linear SVC (C=10) on 80% of the LDA output and pass it, with the
# held-out 20%, to getContourImage (defined elsewhere).
# NOTE(review): no random_state is given, so the split differs between runs.
X_lda_train, X_lda_test, y_lda_train, y_lda_test = train_test_split(
    X_lda, y, test_size=0.2)
svm = SVC(kernel='linear', C=10)
svm.fit(X_lda_train, y_lda_train)
getContourImage(svm, X_lda_test)

# KNN
K = [1, 3, 5, 7]
for k in K:
Example #17
0
# Write the values from updatedEmbarkedData (built above) back into train_df.
train_df.update(updatedEmbarkedData)
#print('Embarked correlation',(train_df['Embarked'].astype('int32')).corr(train_df[train_df.columns[1]]))

# Pearson correlation between the columns of train_df.
corr_matrix = train_df.corr(method='pearson')
#print(corr_matrix['Survived']['Pclass'])
#print(corr_matrix)

# Columns kept for modelling; all but 'Survived' and 'Pclass' are cast to int32.
resultData = {
    'Survived': train_df['Survived'],
    'Pclass': train_df['Pclass'],
    'Sex': train_df['Sex'].astype('int32'),
    'Age': train_df['Age'].astype('int32'),
    'Fare': train_df['Fare'].astype('int32'),
    'Embarked': train_df['Embarked'].astype('int32')
}
result_df = pd.DataFrame(resultData)
#print(result_df)
# Features are every column except the 'Survived' target.
X = result_df.drop('Survived', axis=1)
y = result_df['Survived']
# Cross-validate an SVC (C=1) with a linear kernel and print the mean score;
# cross_val_score's default cv and scoring are used.
SVCClf1 = SVC(kernel='linear', C=1)
scores1 = cross_val_score(SVCClf1, X=X, y=y)
print(scores1.mean())

# Same evaluation with a polynomial kernel.
SVCClf2 = SVC(kernel='poly', C=1)
scores2 = cross_val_score(SVCClf2, X=X, y=y)
print(scores2.mean())

# Same evaluation with an RBF kernel.
SVCClf3 = SVC(kernel='rbf', C=1)
scores3 = cross_val_score(SVCClf3, X=X, y=y)
print(scores3.mean())
    predict = model.predict(x)
    plt.scatter(y, predict)
    plt.show()


if __name__ == '__main__':
    # Script entry point: fit a linear regression on a hold-out split of the
    # 'cepp' data set, print a score and plot true vs. predicted targets.
    data_set = DataSet('cepp')

    x, y = data_set.load_training_data()

    # 60/40 train/test split with a fixed seed.
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.4,
                                                        shuffle=True,
                                                        random_state=0)

    regr = linear_model.LinearRegression(normalize=True)
    #regr = linear_model.Ridge(alpha=0.001, normalize=True)

    regr.fit(x_train, y_train)
    predict = regr.predict(x_test)

    # NOTE(review): cross_val_score clones and refits the estimator on folds
    # of the test split, so this value does not describe the `regr` fitted
    # above -- confirm that this is intended. The printed value is the mean
    # of the negated mean squared errors.
    scores = cross_val_score(regr,
                             x_test,
                             y_test,
                             scoring='neg_mean_squared_error')
    print(scores.mean())

    # Scatter of true test targets against the fitted model's predictions.
    plt.scatter(y_test, predict)
    plt.show()
    kernel='rbf',
    C=32,
    gamma=8,
)

print("K-Folds scores:")

# Module-level accumulators filled by the scorer below on every call.
originalclass = []
predictedclass = []


def classification_report_with_accuracy_score(y_true, y_pred):
    """Record the labels of one scoring call and return its accuracy.

    Appends ``y_true`` to ``originalclass`` and ``y_pred`` to
    ``predictedclass`` (both module-level lists), then returns
    ``accuracy_score(y_true, y_pred)``.
    """
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    return accuracy_score(y_true, y_pred)  # return accuracy score


#inner_cv = StratifiedKFold(n_splits=10)
# 10 stratified folds for the cross-validation below.
outer_cv = StratifiedKFold(n_splits=10)

# Nested CV with parameter optimization
# `clf` is defined above this section. The custom scorer collects the labels
# it is called with, so the report below covers everything it has seen.
nested_score = cross_val_score(
    clf,
    X=X,
    y=y,
    cv=outer_cv,
    scoring=make_scorer(classification_report_with_accuracy_score))

# Average values in classification report for all folds in a K-fold Cross-validation
print(classification_report(originalclass, predictedclass))
# `start` is set above this section; `time()` is called as a function here.
print("10 folds processing seconds: {}".format(time() - start))
Example #20
0
y_train = np.array(label_list)

# Convert label strings to numerical encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

# Create classifier
clf = svm.SVC(kernel='linear')

# Set up 5-fold cross-validation. The first positional argument of
# sklearn.model_selection.KFold is n_splits, so passing len(X_train) there
# asked for one fold per sample instead of the five folds intended here.
# NOTE(review): assumes KFold is sklearn.model_selection.KFold -- confirm
# against the import at the top of the file.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Perform cross-validation
scores = _validation.cross_val_score(cv=kf,
                                     estimator=clf,
                                     X=X_train,
                                     y=y_train,
                                     scoring='accuracy')
print('Scores: ' + str(scores))
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2 * scores.std()))

# Gather predictions
predictions = _validation.cross_val_predict(cv=kf,
                                            estimator=clf,
                                            X=X_train,
                                            y=y_train)

# Accuracy of the out-of-fold predictions over the whole training set.
accuracy_score = metrics.accuracy_score(y_train, predictions)
print('accuracy score: ' + str(accuracy_score))

# Confusion matrix of the out-of-fold predictions.
confusion_matrix = metrics.confusion_matrix(y_train, predictions)