def CV(clf, X, y, seeds=range(3)):
    """Print shuffled 10-fold cross-validation statistics for ``clf``.

    For every seed the data is split with a shuffled 10-fold ``KFold`` and
    ``clf`` is scored through ``cvs`` with the module-level ``scorer``. The
    fold scores are negated and square-rooted (presumably ``scorer`` yields a
    negated squared error -- confirm), then scaled by 1e4 for display.

    Parameters
    ----------
    clf : estimator handed to ``cvs``.
    X, y : data and target handed to ``cvs`` and ``KFold.split``.
    seeds : int or iterable of int
        A single integer shuffles once and prints the mean and the worst
        (max) fold score. An iterable repeats the procedure once per seed and
        prints the average of the per-seed means and per-seed maxima.

    Returns
    -------
    None
        Results are only printed.
    """

    def fold_scores(seed):
        # shuffle/random_state are keyword-only in current scikit-learn, so
        # they must not be passed positionally.
        folds = KFold(n_splits=10, shuffle=True, random_state=seed)
        return np.sqrt(-cvs(clf, X, y, scoring=scorer, cv=folds.split(X, y)))

    if isinstance(seeds, (int, np.integer)):
        # Shuffle only once.
        cv = fold_scores(seeds)
        print('Mean: ', round(np.mean(cv)*1e4, 3),
              '\tMax: ', round(np.max(cv)*1e4, 3))
    else:
        worst = []
        meen = []
        for seed in seeds:
            cv = fold_scores(seed)
            worst.append(np.max(cv))
            meen.append(np.mean(cv))
        print('Mean3: ', round(np.mean(meen)*1e4, 3),
              '\tMax3: ', round(np.mean(worst)*1e4, 3))
def runBestRegsCompKFold(dataSets=[], regModels=[], names=[]):
    """Pick, per data set, the regressor with the lowest 10-fold CV RMSE.

    Every data set is loaded through ``dataEncoding(ds, taskID='filesReg')``;
    all non-object columns of the training frame are min-max scaled in place.
    Each model is fitted on the full data and then cross-validated with a
    shuffled 10-fold split (``random_state=42``) using
    ``neg_mean_squared_error``.

    Parameters
    ----------
    dataSets : iterable
        Keys understood by ``dataEncoding`` and ``filesReg``.
    regModels : sequence of estimators
        Candidates to compare; must not be empty.
    names : sequence
        ``names[i]`` is the key into ``RegsCompNames`` for ``regModels[i]``.

    Returns
    -------
    dict
        ``{filesReg[ds]: {1: best model name, 2: CV scores of that model,
        3: its RMSE}}``.

    Raises
    ------
    ValueError
        If ``regModels`` is empty while there is a data set to evaluate.
    """
    myResults = {}
    for ds in dataSets:
        if not regModels:
            raise ValueError("regModels must contain at least one model")
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        for name in myTrain.columns:
            if not (myTrain[name].dtype == 'O'):
                myTrain[name] = pre.minmax_scale(myTrain[name].astype('float'))
        splits = kf(n_splits=10, shuffle=True, random_state=42)

        best_rmse = float("inf")
        best_index = -1
        best_scores = None
        for count, reg in enumerate(regModels):
            # Callers receive the models fitted on the full data set.
            reg.fit(myTrain, myVal)
            cvsScores = cvs(reg, myTrain, myVal, cv=splits,
                            scoring='neg_mean_squared_error')
            meanSquareRootError = np.sqrt(-1 * cvsScores.mean())
            print(RegsCompNames[names[count]], meanSquareRootError)
            if meanSquareRootError < best_rmse:
                # Keep the winner's own fold scores, not whichever model
                # happened to be evaluated last.
                best_rmse = meanSquareRootError
                best_index = count
                best_scores = cvsScores

        best_name = RegsCompNames[names[best_index]]
        print(filesReg[ds], best_name, best_rmse)
        myResults[filesReg[ds]] = {1: best_name, 2: best_scores, 3: best_rmse}
        print('\n')
    return myResults
def runBestClassificationKFold(dataSets=[], Classifiers=[], names=[]):
    """Pick, per data set, the classifier with the highest mean CV ROC AUC.

    Every data set is loaded through
    ``dataEncoding(ds, taskID='filesBinClass')`` and its target is label
    encoded. Each classifier is fitted on the full data and then scored with
    ``roc_auc`` over 10 ``sss`` splits that hold out 20% of the samples
    (``random_state=42``).

    Parameters
    ----------
    dataSets : iterable
        Keys understood by ``dataEncoding`` and ``filesBinClass``.
    Classifiers : sequence of estimators
        Candidates to compare; must not be empty.
    names : sequence
        ``names[i]`` is the key into ``ClassifiersNames`` for
        ``Classifiers[i]``.

    Returns
    -------
    dict
        ``{filesBinClass[ds]: {1: best classifier name, 2: CV scores of that
        classifier, 3: its mean AUC}}``.

    Raises
    ------
    ValueError
        If ``Classifiers`` is empty while there is a data set to evaluate.
    """
    myResults = {}
    le = pre.LabelEncoder()
    for ds in dataSets:
        if not Classifiers:
            raise ValueError("Classifiers must contain at least one model")
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesBinClass')
        myVal = le.fit_transform(myVal)
        # The original computed (len(myData) * .20) / len(myData), which is
        # just 0.20 but divides by zero on an empty data set.
        splits = sss(n_splits=10, test_size=0.20, random_state=42)

        best_auc = -1.0 * float("inf")
        best_index = -1
        best_scores = None
        for count, clf in enumerate(Classifiers):
            # Callers receive the classifiers fitted on the full data set.
            clf.fit(myTrain, myVal)
            cvsScores = cvs(clf, myTrain, myVal, cv=splits, scoring='roc_auc')
            meanAUC = cvsScores.mean()
            print(ClassifiersNames[names[count]], meanAUC)
            if meanAUC > best_auc:
                # Keep the winner's own split scores, not whichever
                # classifier happened to be evaluated last.
                best_auc = meanAUC
                best_index = count
                best_scores = cvsScores

        best_name = ClassifiersNames[names[best_index]]
        print(filesBinClass[ds], best_name, best_auc)
        myResults[filesBinClass[ds]] = {1: best_name, 2: best_scores,
                                        3: best_auc}
        print('\n')
    return myResults
def cross_fold_val(self, model_list):
    """Cross-validate every model and chart the average absolute errors.

    ``model_list`` is an iterable of ``(name, estimator)`` pairs. Each
    estimator is scored on ``self.A`` / ``self.C`` with 5-fold
    ``neg_mean_absolute_error``; the absolute value of the fold scores is
    averaged. Names, averages and standard deviations are stored on
    ``self.model_names``, ``self.avg_scores`` and ``self.std_dev``, one
    summary line is printed per model, and a bar chart is shown at the end.
    """
    self.model_list = model_list
    self.avg_scores = []
    self.std_dev = []
    self.model_names = []

    for label, estimator in tqdm(self.model_list):
        # The scorer reports negated MAE; take the magnitude.
        fold_mae = np.abs(cvs(estimator, self.A, self.C, cv=5,
                              scoring='neg_mean_absolute_error'))
        mean_mae, spread = np.mean(fold_mae), np.std(fold_mae)
        self.avg_scores.append(mean_mae)
        self.std_dev.append(spread)
        self.model_names.append(label)
        print("%s: %f (%f)" % (label, mean_mae, spread))

    _, axes = plt.subplots(figsize=(15, 7))
    plt.title(' Models with Cross Validation Scores comparison', size=20)
    plt.ylabel('Avg_scores', fontsize=15, fontweight='bold')
    plt.xlabel('model_list', fontsize=15, fontweight='bold')
    plt.xticks(fontsize=12, fontweight='bold')
    plt.yticks(fontsize=12, fontweight='bold')
    axes.bar(self.model_names, self.avg_scores)
    plt.show()
def runBestRegressionModelKFoldwFS(dataSets=[], regModels=[], names=[]):
    """Pick the best regressor per data set after univariate feature selection.

    Every data set is loaded through ``dataEncoding(ds, taskID='filesReg')``
    and reduced to 5 features with ``skb(f_regression, k=5)``. Each model is
    fitted on the reduced data and cross-validated with a shuffled 10-fold
    split (``random_state=42``) using ``neg_mean_squared_error``; the model
    with the lowest RMSE wins.

    Parameters
    ----------
    dataSets : iterable
        Keys understood by ``dataEncoding`` and ``filesReg``.
    regModels : sequence of estimators
        Candidates to compare; must not be empty. The winner must expose
        ``intercept_`` and ``coef_`` after fitting.
    names : sequence
        ``names[i]`` is the key into ``regsNames`` for ``regModels[i]``.

    Returns
    -------
    dict
        ``{filesReg[ds]: {1: best model name, 2: its intercept,
        3: its coefficients, 4: exp(coefficients), 5: its CV scores,
        6: its RMSE}}``.

    Raises
    ------
    ValueError
        If ``regModels`` is empty while there is a data set to evaluate.
    """
    myResults = {}
    for ds in dataSets:
        if not regModels:
            raise ValueError("regModels must contain at least one model")
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        myTrain = skb(f_regression, k=5).fit_transform(myTrain, myVal)
        splits = kf(n_splits=10, shuffle=True, random_state=42)

        best_rmse = float("inf")
        best_index = -1
        best_reg = None
        best_scores = None
        for count, reg in enumerate(regModels):
            reg.fit(myTrain, myVal)
            cvsScores = cvs(reg, myTrain, myVal, cv=splits,
                            scoring='neg_mean_squared_error')
            meanSquareRootError = np.sqrt(-1 * cvsScores.mean())
            print(regsNames[names[count]], meanSquareRootError)
            if meanSquareRootError < best_rmse:
                # Remember the winning estimator and its scores so the report
                # does not describe whichever model was evaluated last.
                best_rmse = meanSquareRootError
                best_index = count
                best_reg = reg
                best_scores = cvsScores

        best_name = regsNames[names[best_index]]
        print(filesReg[ds], best_name, best_rmse)
        myResults[filesReg[ds]] = {
            1: best_name,
            2: best_reg.intercept_,
            3: best_reg.coef_,
            4: np.exp(best_reg.coef_),
            5: best_scores,
            6: best_rmse,
        }
        print('\n')
    return myResults
def classification(data, labels, trials=3):
    """Evaluate a one-vs-rest polynomial SVM and return three scores.

    Three independent copies of the same unfitted classifier
    (``SVC(kernel='poly', C=1, degree=6, probability=True)`` wrapped in
    ``OneVsRestClassifier``) are used, one per measurement.

    Parameters
    ----------
    data : List[List[float]]
    labels : List[int]
    trials : int
        Passed as ``cv`` to ``cvs`` and as the last argument of
        ``getROCAUCScore``.

    Returns
    -------
    tuple
        ``(training accuracy, mean cross-validation score, ROC AUC)`` as
        produced by ``getTrainingAccuracy``, ``cvs`` and ``getROCAUCScore``.
    """
    template = OneVsRestClassifier(
        svm.SVC(kernel='poly', C=1, degree=6, probability=True))
    cv_model = clone(template)
    roc_model = clone(template)

    training_accuracy = getTrainingAccuracy(template, data, labels)
    fold_scores = cvs(cv_model, data, labels, cv=trials)
    roc_auc = getROCAUCScore(roc_model, data, labels, trials)

    return training_accuracy, np.mean(fold_scores), roc_auc
def show_score(self):
    """Print the mean cross-validation score of ``self.model`` (``cv=5``).

    The model is scored on ``self.X`` / ``self.targets``.
    ``cross_val_score`` is taken from ``sklearn.model_selection`` when that
    module exists; the legacy ``sklearn.cross_validation`` location is only
    used as a fallback for old installations.
    """
    try:
        from sklearn.model_selection import cross_val_score as cvs
    except ImportError:
        # Pre-0.18 scikit-learn: model_selection does not exist yet.
        from sklearn.cross_validation import cross_val_score as cvs
    scores = cvs(self.model, self.X, self.targets, cv=5)
    print("mean of scores is: " + str(scores.mean()))
def automatic_dt_pruning(dt_classifier, data, label):
    """Return the ``ccp_alpha`` in {0.00, 0.01, ..., 0.99} with the best CV score.

    NumPy's global random state is seeded with 42, then every candidate is
    set on ``dt_classifier`` (which is modified in place and left at the last
    candidate) and scored with ``cvs(..., cv=5)``. The first candidate that
    reaches the highest mean score is returned.
    """
    np.random.seed(42)
    candidates = [step / 100 for step in range(0, 100)]
    mean_scores = []
    for candidate in candidates:
        dt_classifier.set_params(ccp_alpha=candidate)
        mean_scores.append(cvs(dt_classifier, data, label, cv=5).mean())
    return candidates[mean_scores.index(max(mean_scores))]
def rfr_fillna(df_all):
    '''
    Fill the missing values of a table with random-forest regressors.

    Column layout read from ``df_all`` (by position): column 0 is the user
    id, the last column is the label, and everything in between is the
    feature block. Inside the feature block, column 0 is ``sex``, columns
    1-42 are min-max scaled and used as predictors, and columns 43 onward are
    the columns to impute.

    For every column to impute, ``min_samples_split`` is chosen from 4..29 by
    3-fold cross-validation on the rows where the column is present; a forest
    with the best value is then fitted and used to predict the missing rows.
    The imputed block is min-max scaled afterwards.

    params:
        df_all: the original table to fill (not modified).
    return:
        df_adda (the new table), model (dict: column -> fitted forest),
        MinMax_1st (scaler of the predictor block),
        MinMax_2nd (scaler of the imputed block)
    '''
    # Split the table into id / features / label.
    user_id = df_all.iloc[:, 0]
    X = df_all.iloc[:, 1:-1]
    Y = df_all.iloc[:, -1]
    sex = X.iloc[:, 0]
    # Independent copy: the imputed values are written into this frame, and
    # writing into a slice view is not guaranteed to stick.
    Y2 = X.iloc[:, 43:].copy()
    X2 = X.iloc[:, 1:43]

    # Scale the predictor block.
    MinMax_1st = MinMaxScaler().fit(X2)
    X2 = pd.DataFrame(MinMax_1st.transform(X2),
                      index=X2.index, columns=X2.columns)
    X2 = pd.concat([sex, X2], axis=1)

    # Model selection and imputation, one target column at a time.
    model = {}
    krange = range(4, 30)
    for k in tqdm(list(Y2)):
        missing = Y2[k].isnull().values
        X_train = X2[~missing]
        Y_train = Y2[k][~missing]
        score = []
        for i in krange:
            rfr = RFR(min_samples_split=i, n_jobs=-1)
            score.append(cvs(rfr, X_train, Y_train, cv=3, n_jobs=-1).mean())
        best_choose = krange[int(np.argmax(score))]
        rfr = RFR(min_samples_split=best_choose, n_jobs=-1)
        rfr = rfr.fit(X_train, Y_train)
        model[k] = rfr
        # Nothing to predict when the column is already complete; predicting
        # on an empty frame would raise.
        if missing.any():
            Y2.loc[missing, k] = rfr.predict(X2[missing])

    # Scale the imputed block as well.
    MinMax_2nd = MinMaxScaler().fit(Y2)
    Y2 = pd.DataFrame(MinMax_2nd.transform(Y2),
                      index=Y2.index, columns=Y2.columns)
    df_adda = pd.concat([X2, Y2], axis=1)
    df_adda = pd.concat([user_id, df_adda, Y], axis=1)
    return df_adda, model, MinMax_1st, MinMax_2nd
def build_classifier(self):
    """
    Build both an LDA and an SVM classifier for offline training.

    Both classifiers are fitted on ``self.features`` / ``self.labels`` and
    stored in ``self.clf`` (LDA first, SVM second) so they can later be
    reused, e.g. through ``predict``, ``predict_log_proba`` or
    ``predict_proba``. Each one is also cross-validated with ``cv=10``; the
    score arrays are stored in ``self.cv_scores`` (same order) and their
    means are printed.
    :return: None
    """
    self.clf = [LDA(n_components=None, priors=None, shrinkage='auto',
                    solver='eigen', store_covariance=False, tol=0.0001),
                SVM(kernel='rbf', shrinking=True, probability=True,
                    gamma='scale')]
    # Plain loops: these calls are executed for their side effects only.
    for c in self.clf:
        c.fit(self.features, self.labels)

    self.cv_scores = [
        cvs(estimator=c, X=self.features, y=self.labels, cv=10, n_jobs=-1)
        for c in self.clf
    ]
    for i, cv in enumerate(self.cv_scores):
        print('mean cv score of clf {:d} is'.format(i), np.mean(cv))
def build_abr(training_x, training_y, holdout_x, holdout_y, rounds):
    '''
    Train an AdaBoost regressor and report training and holdout RMSE.

    INPUT: training features, training target, holdout features,
           holdout target, number of random-search iterations (rounds)
    OUTPUT: fitted random-search object, holdout RMSE, cross-validated
            training RMSE
    '''
    # Training error: RMSE of a default AdaBoost regressor, i.e. the square
    # root of the magnitude of the mean negated MSE over 4 CV folds.
    _abr = a_br()
    fold_neg_mse = cvs(_abr,
                       training_x,
                       training_y,
                       cv=4,
                       n_jobs=-1,
                       verbose=False,
                       scoring='neg_mean_squared_error')
    abr_train = sqrt(abs(np.array(fold_neg_mse).mean()))
    print('Adaboost_cross_val_score = ', abr_train)

    # Test error: tune the regressor with a random search (4-fold CV,
    # `rounds` sampled candidates), then score it on the holdout set.
    search_space = {
        "loss": ['linear', "square", "exponential"],
        "learning_rate": [.15, .25, .29, .33, .5, .6, .9],
        "n_estimators": sp_randint(250, 1500)
    }
    _abr = r_search(_abr,
                    param_distributions=search_space,
                    n_iter=rounds,
                    n_jobs=-1,
                    cv=4,
                    verbose=1)
    _abr.fit(training_x, training_y)

    holdout_predictions = _abr.predict(holdout_x)
    abr_test = sqrt(mse(holdout_y, holdout_predictions))
    print('holdout_score = ', abr_test)

    return _abr, abr_test, abr_train
def lr_model(X_train, y_train, X_test, y_test):
    '''
    Fit an L1-penalised logistic regression and print its accuracies.

    Input: train and test matrices
    Output: nothing is returned; the mean 5-fold CV accuracy on the training
            data and the accuracy on the test data are printed.
    '''
    # liblinear supports the L1 penalty; the current default solver (lbfgs)
    # rejects penalty='l1'. liblinear was the default in older releases, so
    # pinning it keeps the original behaviour there.
    classifier = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
    classifier.fit(X_train, y_train)
    cv_score = np.mean(
        cvs(classifier, X_train, y_train, scoring='accuracy', cv=5,
            n_jobs=-1))
    y_hat = classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, y_hat)
    print('LR CV Accuracy: {:.2f}'.format(cv_score))
    print('LR Test Accuracy: {:.2f}'.format(score))
def linear_model(x_train='x_train.csv',
                 y_train='y_train.csv',
                 x_test='x_test.csv',
                 y_test='y_test.csv'):
    """Fit a linear regression, print its CV score and log-RMSE, and predict.

    The model is fitted on ``x_train`` / ``y_train``, scored with
    ``cvs(..., cv=10)`` (default scorer) and used to predict ``x_test``. The
    printed RMSE is computed between ``log(prediction)`` and ``log(y_test)``,
    so both must be strictly positive.

    NOTE(review): the defaults are file names, but the arguments are passed
    straight to ``fit``/``predict`` without being loaded -- callers
    presumably pass array-like data; confirm.

    Returns
    -------
    The predictions for ``x_test``.
    """
    Linear_model = LinearRegression()
    Linear_model.fit(x_train, y_train)
    scores = cvs(Linear_model, x_train, y_train, cv=10)
    print("accuracy of linearRegressor " + str(scores.mean()))

    # Predict first: the RMSE below needs the predictions (the original
    # computed it before y_predict was assigned).
    y_predict = Linear_model.predict(x_test)
    log_error = np.asarray(np.log(y_predict) - np.log(y_test))
    rms = np.sqrt(np.square(log_error).sum() / float(len(y_predict)))
    print('RMSE = {}'.format(rms))
    return y_predict
def trainModel(clf, Algorithm, tr_x, tr_y, tr_te_x, tr_te_y):
    """Fit ``clf`` and report its train, validation and cross-validation scores.

    ``clf`` is fitted on ``tr_x`` / ``tr_y``, scored on that same data and on
    the held-out split ``tr_te_x`` / ``tr_te_y``, and finally cross-validated
    on the training split with ``cv=5``. ``Algorithm`` is only used as a
    label in the printed messages.

    Returns ``(train score, held-out score, mean CV score)``.
    """
    clf.fit(tr_x, tr_y)

    train_score = clf.score(tr_x, tr_y)
    print('训练集中切分的训练数据_%s score is %.4f' % (Algorithm, train_score), end=' ')

    heldout_score = clf.score(tr_te_x, tr_te_y)
    print('训练集中切分的测试数据_%s score is %.4f' % (Algorithm, heldout_score), end=' ')

    cv_mean = np.array(cvs(clf, tr_x, tr_y, cv=5)).mean()
    print('训练集中切分的训练数据交叉验证:_%s mena score is %.4f' % (Algorithm, cv_mean))

    return train_score, heldout_score, cv_mean
def _cv_score(self,
              model: Pipeline,
              p_grid: dict,
              X: list,
              y: list,
              metric: make_scorer,
              nest: bool = True,
              smote: bool = True) -> (Pipeline, float, dict):
    """Big evaluation function, handles oversampling, and cross-val.

    Parameters
    ----------
    model : pipeline to tune and/or evaluate (its ``steps`` are printed).
    p_grid : parameter grid for ``GridSearchCV``; falsy skips the search.
    X, y : training data; replaced by ``self._oversample(X, y, factor=3)``
        when ``smote`` is true.
    metric : scorer used by the grid search and the outer cross-validation.
    nest : if true, the selected estimator is additionally scored with a
        3-fold outer cross-validation. Requires ``p_grid``.
    smote : whether to oversample before anything else.

    Returns
    -------
    tuple
        ``(clf, outer_scores, grid)``: the estimator refitted on all of
        ``X, y``; the outer CV scores (``0`` when ``nest`` is false); and the
        fitted ``GridSearchCV`` (``None`` without ``p_grid``).

    Raises
    ------
    ValueError
        If ``nest`` is true but no ``p_grid`` is provided.
    """
    neural = self.neural
    if smote:
        X, y = self._oversample(X, y, factor=3)

    if p_grid:
        # If nested add a layer of 3 splits, else just cross-validate with
        # 10. If neural apply a simple split only.
        n, _n = (10, 3) if nest else (10 if not neural else 2, 0)
        print(f"running {_n} outer, {n} inner...")
        # No random_state: the folds are not shuffled, so a seed has no
        # effect, and newer scikit-learn raises when one is given anyway.
        cv = StratifiedKFold(n_splits=n)
        if nest:
            _cv = StratifiedKFold(n_splits=_n)
        # Non_nested parameter search and scoring
        grid = GridSearchCV(estimator=model, param_grid=p_grid, cv=cv,
                            scoring=metric,
                            n_jobs=1 if nest or neural else -1)
        # NOTE: n_jobs sometimes needs to be tweaked (depending on where
        # multi-threading happens). Above is the safest default config.
        grid.fit(X, y)
        print("\n> Inner CV F1:", grid.best_score_)  # Score of inner CV
        clf = grid.best_estimator_
    else:
        # Explicit check instead of assert, which is stripped under -O.
        if nest:
            raise ValueError("Set nest to false if no p_grid is provided.")
        grid, clf = None, model

    print("\n\n> Final model:\n")
    for step in clf.steps:
        print(step)

    clf.fit(X, y)  # Refit best_estimator_ on the entire train set
    # Outer CV of the selected estimator -- only if nested.
    return clf, cvs(clf, X, y, cv=_cv, scoring=metric) if nest else 0, grid
def rf_model(X_train, y_train, X_test, y_test):
    '''
    Fit a random forest classifier and print its accuracies.

    Input: train and test matrices
    Output: nothing is returned; the mean 5-fold CV accuracy on the training
            data and the accuracy on the test data are printed.
    '''
    forest = RandomForestClassifier(n_estimators=500,
                                    min_samples_leaf=4,
                                    min_samples_split=3,
                                    max_features='sqrt')
    forest.fit(X_train, y_train)

    fold_accuracies = cvs(forest, X_train, y_train, scoring='accuracy',
                          cv=5, n_jobs=-1)
    cv_score = np.mean(fold_accuracies)

    test_predictions = forest.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, test_predictions)

    print('RF CV Accuracy: {:.2f}'.format(cv_score))
    print('RF Test Accuracy: {:.2f}'.format(test_accuracy))
def classification():
    """Compare three classifiers on the ``Gender`` target and print accuracies.

    Reads the module-level ``df`` and ``features``: the feature columns are
    the inputs, ``Gender`` is the target. A random forest, a logistic
    regression and an SVM are each fitted on a 75% training split
    (``random_state=23``) and then scored with ``cvs(..., cv=5)`` on the full
    data; the mean and twice the standard deviation are printed.

    Side effect: the global ``feat_val`` is set to the feature importances of
    the fitted random forest.
    """
    global feat_val
    cols = features + ["Gender"]
    data = df[cols]
    x = np.array(data.iloc[:, :-1])
    y = np.array(data.iloc[:, -1])
    x_train, _, y_train, _ = tts(x, y, test_size=0.25, random_state=23)

    models = [
        ("Random Forest Classifier", rfc(n_estimators=100, random_state=23)),
        ("Logistic Regression", lr(random_state=23)),
        ("Support Vector Machines", SVC(random_state=23)),
    ]
    print(
        "We are using the K-fold cross-validation for estimating accuracy of the models "
    )
    print(
        "The accuracy mean with its 95% confidence interval for different methods is as follows"
    )
    for position, (method, clf) in enumerate(models):
        print("Classifier : {}".format(method))
        clf.fit(x_train, y_train)
        if position == 0:
            # Only the first model (the random forest) exposes importances.
            feat_val = clf.feature_importances_
        acc = cvs(clf, x, y, cv=5)
        print("Accuracy: %0.4f (+/- %0.2f)" % (acc.mean(), acc.std() * 2))
def fit(name, dataset, target_col, feature_cols=None, exclude_cols=None):
    """Cross-validate the model registered as ``name`` and return a JSON report.

    The data is loaded with ``load_dataset``; the model class and its
    parameters come from ``MODELS[name]``. The model is scored with 12-fold
    ``r2`` cross-validation.

    Returns
    -------
    str
        A JSON object. On success it holds the model name, hyperparameters,
        ``repr`` of the model, the R2 scores, the elapsed seconds, the column
        selection and the data shapes. On any failure it holds the inputs
        plus an ``error`` message instead -- the function never raises.
    """
    try:
        # Load Dataset
        X, y = load_dataset(dataset, target_col, feature_cols, exclude_cols)

        # Fetch model to fit and evaluate
        spec = MODELS[name]

        # Fit and evaluate model. perf_counter is monotonic, unlike the wall
        # clock, so the elapsed time cannot be skewed by clock adjustments.
        start = time.perf_counter()
        model = spec['Model'](**spec['params'])
        r2_scores = cvs(model, X, y, scoring='r2', cv=12)
        delta = time.perf_counter() - start

        # Construct report
        report = {
            'model': name,
            'hyperparameters': model.get_params(),
            'repr': str(model),
            'r2_scores': list(r2_scores),
            'elapsed': delta,
            'target_col': target_col,
            'feature_cols': feature_cols,
            'exclude_cols': exclude_cols,
            'X_shape': X.shape,
            'y_shape': y.shape,
        }
        # Hyperparameters may hold arbitrary objects (e.g. nested
        # estimators); fall back to their repr rather than turning a
        # successful run into an error report.
        return json.dumps(report, default=repr)
    except Exception as e:
        # Deliberate catch-all: callers always get a JSON document back.
        return json.dumps({
            'model': name,
            'dataset': dataset,
            'target_col': target_col,
            'feature_cols': feature_cols,
            'exclude_cols': exclude_cols,
            'error': str(e),
        }, default=repr)
def gb_model(X_train, y_train, X_test, y_test):
    '''
    Fit a gradient boosting classifier and print its accuracies.

    Input: train and test matrices
    Output: nothing is returned; the mean 5-fold CV accuracy on the training
            data and the accuracy on the test data are printed.
    '''
    booster = GradientBoostingClassifier(learning_rate=0.1,
                                         loss='exponential',
                                         max_depth=2,
                                         max_features=None,
                                         min_samples_leaf=2,
                                         min_samples_split=2,
                                         n_estimators=100,
                                         subsample=0.5)
    booster.fit(X_train, y_train)

    fold_accuracies = cvs(booster, X_train, y_train, scoring='accuracy',
                          cv=5, n_jobs=-1)
    cv_score = np.mean(fold_accuracies)

    test_predictions = booster.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, test_predictions)

    print('GB CV Accuracy: {:.2f}'.format(cv_score))
    print('GB Test Accuracy: {:.2f}'.format(test_accuracy))
# Fit a logistic regression, report 5-fold CV accuracy / ROC AUC on the
# training data, then compute the ROC curve on the test data.
y = y.flatten()
x = x.T
# for i in range(y.shape[0]):
#     if y[i] == -1:
#         y[i] =0
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)
# NOTE(review): the split above appears to be commented out, so x_train,
# x_test, y_train and y_test must already be defined before this point --
# confirm.
# fit a model
model = LogisticRegression(C=1.0, solver='newton-cg')
model.fit(x_train, y_train)
# Cross-validated accuracy and ROC AUC on the training data.
score = cvs(model,x_train,y_train,cv = 5,scoring='accuracy')
auc1 = cvs(model,x_train,y_train,cv = 5,scoring='roc_auc')
print('score = {0}'.format(score.mean()))
print('auc = {0}'.format(auc1.mean()))
# predict probabilities
probs = model.predict_proba(x_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate roc auc
auc = roc_auc_score(y_test, probs)
print(auc)
fpr, tpr, thresholds = roc_curve(y_test, probs)
# Run the grid search and keep its best parameters and score.
gSearch = grid.fit(input_variables, output)
best_params = gSearch.best_params_
best_accuracy = gSearch.best_score_
# summarize results
print("Best score: %f using params %s" %
      (gSearch.best_score_, gSearch.best_params_))
means = gSearch.cv_results_['mean_test_score']
stds = gSearch.cv_results_['std_test_score']
params = gSearch.cv_results_['params']
# One line per evaluated parameter combination: mean (std) with: params.
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# evaluate using 10-fold cross validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
results = cvs(classifier, input_variables, output, cv=kfold)
print(results.mean())
'''Check point of ANN model improvements while training by max mode'''
# The checkpoint file name is a template filled with the epoch number and the
# monitored 'val_acc' value; only improvements are saved (save_best_only).
filepath = 'weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = ModelCheckpoint(filepath,
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True,
                             mode='max')
callbacks_list = [checkpoint]
# Fit the model
# NOTE(review): this call continues past the end of the visible source.
classifier.fit(xTrain, yTrain, validation_split=0.33, epochs=150,
def build_gbr(training_x, training_y, holdout_x, holdout_y, _abr, abr_test,
              rounds):
    '''
    Train a gradient boosted regressor that must beat the previous model.

    INPUT: training features, training targets, holdout features,
           holdout target, previous model, test score from previous model,
           number of random-search iterations (rounds)
    OUTPUT: final model, final_score, final model training score

    NOTE(review): the search is repeated until the holdout score is no
    longer greater than abr_test; nothing bounds the number of attempts.
    NOTE(review): loss='lad' was renamed to 'absolute_error' in newer
    scikit-learn releases -- confirm against the pinned version.
    '''
    # > BUILD GRADIENT BOOSTED REGRESSOR
    # Training error: RMSE of a fixed configuration, i.e. the square root of
    # the magnitude of the mean negated MSE over 5 CV folds.
    _gbr = g_br(loss='lad',
                learning_rate=.1,
                n_estimators=500,
                warm_start=False,
                verbose=False)
    final_train = sqrt(
        abs(
            np.array(
                cvs(_gbr,
                    training_x,
                    training_y,
                    cv=5,
                    n_jobs=-1,
                    verbose=False,
                    scoring='neg_mean_squared_error')).mean()))
    print('Gradient_Boosted_score_cross_val_score = ', final_train)

    # Test error. Start from +inf so the search always runs at least once;
    # a finite sentinel would skip the loop whenever abr_test is at least
    # that large and leave final_model undefined.
    final_test = float('inf')

    # The search space does not change between attempts.
    param_distribution = {
        "max_depth": [3, 4, 5],
        "learning_rate": [.2, .3, .4],
        "n_estimators": sp_randint(500, 3000)
    }
    n_iter_search = rounds

    # Iteratively train a random-searched gradient boosted model on the
    # training data until it beats the previous model's score.
    while final_test > abr_test:
        # make simple Gradient Boosted Regressor
        _gbr = g_br(loss='lad', verbose=True)
        # implement Random Search
        final_model = r_search(_gbr,
                               param_distributions=param_distribution,
                               n_iter=n_iter_search,
                               n_jobs=-1,
                               cv=4,
                               verbose=1)
        # fit to training set
        final_model.fit(training_x, training_y)
        # get score for holdout set, reset variable
        final_test = predict_on_holdout(_abr, final_model, holdout_x,
                                        holdout_y)
        # if threshold not met, try again
    print('final_score = ', final_test)
    # return model and scores
    return final_model, final_test, final_train
# compute accuracy of the classifier accuracy = 100.0 * (y_test == y_test_pred).sum() / x_test.shape[0] print("Accuracy of the classifier =", round(accuracy, 2), "%") plot_classifier(classifier_gaussiannb_new, x_test, y_test) ############################################### # Cross validation and scoring functions num_validations = 5 # https://scikit-learn.org/stable/modules/model_evaluation.html accuracy = cvs(classifier_gaussiannb, x, y, scoring='accuracy', cv=num_validations) print("Accuracy: " + str(round(100 * accuracy.mean(), 2)) + "%") # precision is calculated by total number of correct identifications # divided by the total number of identifications. precision = cvs(classifier_gaussiannb, x, y, scoring='precision_weighted', cv=num_validations) print("Precision: " + str(round(100 * precision.mean(), 2)) + "%") # recall is calculated by total number of correct identifications # divided by the total number of interesting items in the dataset
# @Author : AlwaysDazz
# @Time : 2021/5/9 10:58
# @IDE: : PyCharm
# @Project : pythonProject
# @Comment : Regression-tree model on the Boston housing-price data set
from sklearn.tree import DecisionTreeRegressor as regressor  # regression tree model
from sklearn.model_selection import cross_val_score as cvs  # cross-validation helper
from sklearn.datasets import load_boston  # Boston housing prices
import pandas as pd

# Take a look at the data.
# NOTE(review): load_boston has been removed from recent scikit-learn
# releases; this script needs a version that still ships it.
boston = load_boston()
pd_data = pd.concat([pd.DataFrame(boston.data),
                     pd.DataFrame(boston.target)],
                    axis=1)
data = pd.DataFrame(boston.data)  # 506 rows x 13 columns
# Label the columns with the feature names. The original indexed
# data.columns with this tuple of names, which raises instead of naming
# anything.
data.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT'
]
data_col = data.columns
print(data_col)

# Instantiate the regression tree; no tuning yet, parameters can be adjusted
# later.
reg = regressor(random_state=0)
# Cross-validation arguments: model, data, target, number of folds, scoring.
# A regression tree scores with R^2 by default; negated mean squared error
# is requested here instead.
res = cvs(reg,
          boston.data,
          boston.target,
          cv=10,
          scoring="neg_mean_squared_error")
print(res)
# Restore the column labels lost when the imputer output was wrapped in a
# new DataFrame.
imputed_encoded_x_train_plus.columns = encoded_x_train_plus.columns
# NOTE(review): the imputer is re-fitted on the test data here
# (fit_transform) rather than reusing what was fitted before -- confirm that
# this is intended.
imputed_encoded_x_test_plus = pd.DataFrame(
    imputer.fit_transform(encoded_x_test_plus))
imputed_encoded_x_test_plus.columns = encoded_x_test_plus.columns
#Align testing and training data sets
# (inner join on columns: only columns present in both frames are kept)
final_train, final_test = imputed_encoded_x_train_plus.align(
    imputed_encoded_x_test_plus, join='inner', axis=1)
#Create model and fit
my_model = GradientBoostingRegressor()
my_model.fit(final_train, y)
#Use cross validation to evaluate model
# (the scorer returns negated MAE, hence the sign flip when printing)
scores = cvs(my_model, final_train, y, scoring='neg_mean_absolute_error')
print('Mean Absolute Error using Cross Validation is: ', (-1 * scores.mean()))
#Plot some partial dependences
# (for the features at column positions 2 and 5 of final_train)
my_graphs = plot_partial_dependence(my_model,
                                    X=final_train,
                                    features=[2, 5],
                                    feature_names=final_train.columns,
                                    grid_resolution=10)
#Predict saleprice of test data
predictions = my_model.predict(final_test)
#Create a submission dataframe and export to a csv file
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions})
output.to_csv('submission.csv', index=False)
#predicting the test set results y_pred=classifier.predict(x_test) from sklearn.metrics import confusion_matrix, classification_report cm=confusion_matrix(y_test, y_pred) plt.figure(figsize = (5,5)) sns.heatmap(cm, annot=True) plt.xlabel('Predicted') plt.ylabel('Truth') print(classification_report(y_test, y_pred)) #applying k-fold cross validation from sklearn.model_selection import cross_val_score as cvs accuracies = cvs(estimator=classifier,X=x_train,y=y_train,cv=10) print(accuracies.mean()) print(accuracies.std()) """Logistic Regression""" #fitting logistic regression to the training set from sklearn.linear_model import LogisticRegression classifier = LogisticRegression(random_state=0) classifier.fit(x_train, y_train) #predicting the test set results y_pred=classifier.predict(x_test) from sklearn.metrics import confusion_matrix, classification_report
return data  # NOTE(review): tail of a definition that begins outside this view


def make_matrix(l):
    """Build a (ROW, COL) matrix from an iterable of (row, col, value) items.

    The matrix starts filled with 0; for every item ``d`` the cell
    ``[d[0]][d[1]]`` is set to ``d[2]``. ``ROW`` and ``COL`` are module-level
    constants.
    """
    matrix = np.full((ROW, COL), 0)
    for d in l:
        matrix[d[0]][d[1]] = d[2]
    return matrix


if __name__ == "__main__":
    train = read_csv(TRAIN)
    gender = read_csv(GENDER)
    year = read_csv(YEAR)
    # X1 is the matrix built from the training triples; X2 is its transpose.
    X1 = make_matrix(train)
    X2 = X1.T
    # Targets: the first column of the gender / year tables.
    Y1 = np.asarray(gender).T[0]
    Y2 = np.asarray(year).T[0]
    # Task 1: predict Y1 from X1; report 1 minus the best of the 10 fold
    # scores.
    clf1 = logr()
    scores = cvs(clf1, X1, Y1, cv=10)
    print("Min CV error: {}".format(1 - max(scores)))
    # Task 2: predict Y2 from X2 with 10-fold predictions from `cvp`
    # (presumably cross_val_predict -- confirm) and compare the MSE with that
    # of always predicting the mean of Y2.
    clf2 = logr(solver="saga", multi_class="multinomial")
    pred = cvp(clf2, X2, Y2, cv=10)
    mse1 = mse(Y2, pred)
    mse2 = mse(Y2, np.full_like(Y2, np.mean(Y2)))
    print("Regression MSE: {}".format(mse1))
    print("Naive MSE: {}".format(mse2))
# print("RMSE: ", rmse) # 預測誤差 68628.19819848923 美元 # 因此對於擬合不足的模型, 可以選用更強大的模型或是提供更好的特徵, 減少限制等等 # 這裡則更換更強大的模型 DecisionTreeRegressor from sklearn.tree import DecisionTreeRegressor as DTR tree_reg = DTR() tree_reg.fit(housing_prepared, housing_labels) predicted = tree_reg.predict(housing_prepared) # mse = mean_squared_error(housing_labels, predicted) # rmse = np.sqrt(mse) # print("RMSE: ", rmse) # RMSE: 0.0 # 這裡的結果正確率是100%, 但要考慮是否過度擬合 Overfitting # 因此, 要使用交叉驗證來進行更好的評估模型 from sklearn.model_selection import cross_val_score as cvs scores = cvs(tree_reg, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10) rmse_scores = np.sqrt(-scores) # Scikit-Learn 交叉驗證更傾向於效用函數(越大越好), 而不是成本函數(越小越好), 所以計算分數實際上是負的MSE # 來查看結果 def display_score(scores): print("Score: ", scores) print("Mean:", scores.mean()) print("Stardard deviation: ", scores.std()) # display_score(rmse_scores) # Mean: 71227.31692492112 # Stardard deviation: 2926.49161963209 # 和LR的交叉驗證評分做個比較 lr_scores = cvs(lr, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10) lr_rmse = np.sqrt(-lr_scores)
# Bundle every sample as [features, activation label, valence label, speaker]
# so the four parallel lists can be sorted together.
final = []
for i in range(len(featureList)):
    l = [featureList[i]]
    l.append(labelListAct[i])
    l.append(labelListVal[i])
    l.append(speakerList[i])
    final.append(l)
# Sort with the getFourth key (presumably the speaker, i.e. the fourth entry
# of each bundle -- confirm).
final.sort(key = getFourth)
# Rebuild the parallel lists in the sorted order.
featureList = [] #list of lists used to store the extracted features of each training sample
labelListAct = [] #list of strings used to store the labels(emotions) for each training sample
labelListVal = []
for i in range(len(final)):
    featureList.append(final[i][0])
    labelListAct.append(final[i][1])
    labelListVal.append(final[i][2])
# Cross-validate one SVM per label set with cv = 24.
# NOTE: despite the names, predictionsAct / predictionsVal hold the values
# returned by cvs, not predictions.
clf = svm.SVC(gamma = 'auto')
predictionsAct = cvs(clf, featureList, labelListAct, cv = 24)
predictionsVal = cvs(clf, featureList, labelListVal, cv = 24)
print('Binary Activation')
print(predictionsAct)
print(np.mean(predictionsAct))
print('Binary Valence')
print(predictionsVal)
print(np.mean(predictionsVal))
seaborn.regplot(x='petal_length', y='petal_width', data=iris)
plt.show()
# Train a linear regression model
lm = linear_model.LinearRegression()
features = ['petal_length']
X = iris[features]
y = iris['petal_width']
model = lm.fit(X, y)
# Print the intercept and the coefficients
print(model.intercept_, model.coef_)
# Predict petal_width for petal_length = 4.
# predict() needs a 2-D input (samples x features); a bare scalar is
# rejected.
predict = model.predict([[4]])
print("petal_width's value : ", predict)
# Evaluate predictive performance with 5-fold cross-validation
# (the scorer returns negated MAE, hence the sign flip)
scores = -cvs(lm, X, y, cv=5, scoring='neg_mean_absolute_error')
# Mean of the mean absolute errors
ave_score = numpy.mean(scores)
print(ave_score)
# Switch to 2 features
features = ['petal_length', 'sepal_length']
X = iris[features]
y = iris['petal_width']
model = lm.fit(X, y)
print(model.intercept_, model.coef_)
predict = model.predict([[1, 2]])
print("petal_width's value : ", predict)
scores = -cvs(lm, X, y, cv=5, scoring='neg_mean_absolute_error')
ave_score = numpy.mean(scores)
print(ave_score)