def test_cross_val_score_fit_params():
    """Exercise ``cross_val_score`` with a mixed bag of ``fit_params``.

    Dense arrays, the ``W_sparse`` / ``P_sparse`` fixtures and plain Python
    values (int, str, object) are all handed over.  ``check_plain_values`` is
    itself passed under the ``callback`` key and compares the plain values
    found on the classifier with the ones supplied here.
    """
    clf = MockClassifier()
    DUMMY_INT = 42
    DUMMY_STR = "42"
    DUMMY_OBJ = object()
    plain_values = (
        ("dummy_int", DUMMY_INT),
        ("dummy_str", DUMMY_STR),
        ("dummy_obj", DUMMY_OBJ),
    )

    def check_plain_values(clf):
        # Non-array arguments must show up on the classifier unchanged.
        for attr_name, expected in plain_values:
            assert_equal(getattr(clf, attr_name), expected)

    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    fit_params = dict(plain_values)
    fit_params.update(
        sample_weight=np.ones(n_samples),
        class_prior=np.ones(n_classes) / n_classes,
        sparse_sample_weight=W_sparse,
        sparse_param=P_sparse,
        callback=check_plain_values,
    )
    cval.cross_val_score(clf, X, y, fit_params=fit_params)
def test_cross_val_score_fit_params():
    """Pass array, sparse and scalar ``fit_params`` through ``cross_val_score``.

    The scalar values are verified by ``verify_scalars``, which is supplied to
    the classifier under the ``'callback'`` key.
    """
    DUMMY_INT, DUMMY_STR, DUMMY_OBJ = 42, '42', object()

    def verify_scalars(clf):
        # The non-array parameters are compared with the values given below.
        for actual, wanted in ((clf.dummy_int, DUMMY_INT),
                               (clf.dummy_str, DUMMY_STR),
                               (clf.dummy_obj, DUMMY_OBJ)):
            assert_equal(actual, wanted)

    clf = MockClassifier()
    sample_count = X.shape[0]
    class_count = len(np.unique(y))

    # Array-like parameters first, then the plain values and the checker.
    fit_params = {}
    fit_params['sample_weight'] = np.ones(sample_count)
    fit_params['class_prior'] = np.ones(class_count) / class_count
    fit_params['sparse_sample_weight'] = W_sparse
    fit_params['sparse_param'] = P_sparse
    fit_params['dummy_int'] = DUMMY_INT
    fit_params['dummy_str'] = DUMMY_STR
    fit_params['dummy_obj'] = DUMMY_OBJ
    fit_params['callback'] = verify_scalars

    cval.cross_val_score(clf, X, y, fit_params=fit_params)
def test_cross_val_score_with_score_func_regression():
    """Check 5-fold regression scores of ``Ridge`` under several scorers.

    The default estimator score, the ``"r2"`` scorer and the legacy
    ``score_func=explained_variance_score`` path are all compared with the
    same expected values; ``"mean_squared_error"`` is compared with its own
    (negative) expected values.
    """
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()
    n_folds = 5
    decimals = 2
    expected_r2 = [0.94, 0.97, 0.97, 0.99, 0.92]

    # Default score of the Ridge regression estimator
    default_scores = cval.cross_val_score(reg, X, y, cv=n_folds)
    assert_array_almost_equal(default_scores, expected_r2, decimals)

    # R2 score (determination coefficient): expected to match the default
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=n_folds)
    assert_array_almost_equal(r2_scores, expected_r2, decimals)

    # Mean squared error is a loss, so the reported "scores" are negative
    mse_scores = cval.cross_val_score(reg, X, y, cv=n_folds,
                                      scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, decimals)

    # Explained variance through the score_func argument; warnings raised by
    # this call are recorded rather than shown.
    with warnings.catch_warnings(record=True):
        ev_scores = cval.cross_val_score(reg, X, y, cv=n_folds,
                                         score_func=explained_variance_score)
    assert_array_almost_equal(ev_scores, expected_r2, decimals)
def getClfScore(classifier, features, labels, cv):
    '''Evaluate every classifier by cross-validated precision and recall.

    param:
        classifier : mapping of classifier name -> estimator
        features : data to fit
        labels : target values for ``features``
        cv : cross validation iterator (forwarded to ``cross_val_score``)
    return:
        test_score : dict mapping each classifier name to
                     ``{'precision': mean, 'recall': mean}`` over the folds
    '''
    test_score = {}
    for clfname in sorted(classifier):
        clf = classifier[clfname]
        # ``scoring`` and ``cv`` are passed by keyword: their positional slots
        # differ between scikit-learn releases, so positional use can bind the
        # scorer name to the wrong parameter.
        precision = cross_val_score(clf, features, labels,
                                    scoring='precision', cv=cv)
        recall = cross_val_score(clf, features, labels,
                                 scoring='recall', cv=cv)
        test_score[clfname] = {
            'precision': np.mean(precision),
            'recall': np.mean(recall),
        }
    return test_score
def cv(self, parameters, scoring="roc_auc"):
    """Evaluate ``self.estimator`` by 3-fold cross validation and print the scores.

    ``self.data`` and ``self.label`` supply the samples and targets through
    their ``.values`` attribute.  ``parameters`` is not used by this method.
    ``scoring`` is forwarded to ``cross_val_score``.
    """
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin yields the same float64 array.
    X = self.data.values.astype(float)
    y = self.label.values
    print(cross_val_score(self.estimator, X, y, scoring=scoring, cv=3))
def test_cross_val_score():
    """Compare ``cross_val_score`` with ``clf.score`` on several inputs.

    For each ``a`` in ``range(-10, 10)`` the cross-validated scores must equal
    ``clf.score`` for dense ``X``, sparse ``X_sparse`` and a multi-output
    target (``X`` used as ``y``).  Afterwards a list-valued ``X`` is scored
    with ``MockListClassifier`` and an unknown ``scoring`` string must raise
    ``ValueError``.
    """
    clf = MockClassifier()
    # (data, target) pairs in the order they are checked; the multi-output
    # pair (X_sparse, X) appears twice in the sequence.
    cases = (
        (X, y),          # smoke test
        (X_sparse, X),   # multioutput y
        (X_sparse, y),
        (X_sparse, X),   # multioutput y
    )
    for a in range(-10, 10):
        clf.a = a
        for data, target in cases:
            scores = cval.cross_val_score(clf, data, target)
            assert_array_equal(scores, clf.score(data, target))

    # test with X as list
    clf = MockListClassifier()
    scores = cval.cross_val_score(clf, X.tolist(), y)

    assert_raises(ValueError, cval.cross_val_score, clf, X, y,
                  scoring="sklearn")
def test_cross_val_score_fit_params():
    """Smoke-test ``cross_val_score`` with array-valued ``fit_params``.

    A unit sample weight per row of ``X`` and a uniform class prior over the
    distinct values of ``y`` are forwarded.
    """
    estimator = MockClassifier()
    sample_count = X.shape[0]
    class_count = len(np.unique(y))
    unit_weights = np.ones(sample_count)
    uniform_prior = np.ones(class_count) / class_count
    cval.cross_val_score(
        estimator, X, y,
        fit_params=dict(sample_weight=unit_weights, class_prior=uniform_prior),
    )
def classifierExperiments(dataset): svc = svm.SVC(kernel='linear') scores = cross_validation.cross_val_score(svc, dataset.data, dataset.targets, cv=10) print "Linear kernel" for score in scores: print "Score: {0}".format(score) print "Mean {0:.2f} +/- {1:.2f}".format(scores.mean(), scores.std()/2) svc = svm.SVC(kernel='rbf') scores = cross_validation.cross_val_score(svc, dataset.data, dataset.targets, cv=10) ## print "\nRBF kernel" ## for score in scores: ## print "Score: {0}".format(score) ## ## print "Mean {0:.2f} +/- {1:.2f}".format(scores.mean(), scores.std()/2) print "\nDecision trees!" clf = tree.DecisionTreeClassifier() scores = cross_validation.cross_val_score(clf, dataset.data, dataset.targets, cv=10) for score in scores: print "Score: {0}".format(score) print "Mean {0:.2f} +/- {1:.2f}".format(scores.mean(), scores.std()/2)
def crossValidation(): data2010, labels2010 = read_tac('2010') #classifiers gnb = naive_bayes.GaussianNB() Svm = svm.SVC(kernel = "linear") logReg = linear_model.LogisticRegression() GNBscores = cross_validation.cross_val_score(gnb, data2010, labels2010, cv=2) SVMscores = cross_validation.cross_val_score(Svm, data2010, labels2010, cv=2) logRegscores = cross_validation.cross_val_score(logReg, data2010, labels2010, cv=2) print "Results:" print "Gaussian Naive Bayes: " print str(GNBscores.mean()) print "Support Vector Machine: " print str(SVMscores.mean()) print "Logistic Regression: " print str(logRegscores.mean()) fh.write("Results:" + "\n") fh.write("Gaussian Naive Bayes: " + "\n") fh.write(str(GNBscores.mean()) + "\n") fh.write("Support Vector Machine: " + "\n") fh.write(str(SVMscores.mean()) + "\n") fh.write("Logistic Regression: " + "\n") fh.write(str(logRegscores.mean()) + "\n") fh.write("-------------------------------------------------\n") fh.write("\n\n")
def get_score(clf, aX, aY, bX, bY, cX, cY, dX, dY): ''' Get the scores for this datasets: - Stratified sample - All our data - group B data - group C data - group D data It was useful to test our best model among other people's data ''' selX, selY = pick_random_values_stratified(aX, aY) scores1=cross_validation.cross_val_score(clf, selX, selY, cv=5) print "Strat %f" % (scores1.mean()) scores2=cross_validation.cross_val_score(clf, aX, aY, cv=5) print "All %f" % (scores2.mean()) clf=clf.fit(aX, aY) print "Group B", if (bX!=None and bY!=None): print clf.score(bX, bY) else: print "NaN" print "Group C", if (cX!=None and cX!=None): print clf.score(cX, cY) else: print "NaN" print "Group D", if (dX!=None and dY!=None): print clf.score(dX, dY) else: print "NaN"
def importData(datadirectory): #categories = ['n','u', 'y'] categories = ['n', 'y'] data = load_files(datadirectory,categories=categories, shuffle=True, random_state=42, encoding='latin-1') X_train, X_test, y_train, y_test = cross_validation.train_test_split(data.data, data.target, test_size = 0.4, random_state=0) print X_train # count_vect = CountVectorizer() # X_train_vec = count_vect.fit_transform(X_train) # X_test_vec = count_vect.fit_transform(X_test) # clf = svm.SVC(kernel='linear', C=1).fit(X_train_vec, y_train) # clf.score(X_test_vec, y_test) text_clf = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB())]) #print text_clf.named_steps['clf'] print str(sum(cross_val_score(text_clf, data.data,data.target ))/3.0) + ' Tfidf NB' #array([ 0.62376238, 0.57 , 0.6122449 ]) text_clf = Pipeline([('vect', CountVectorizer()),('clf', MultinomialNB()),]) print str(sum(cross_val_score(text_clf, data.data,data.target ))/3.0) + ' CountVec NB' #array([ 0.56435644, 0.5 , 0.57142857]) clf = Pipeline([('vect', CountVectorizer()), ('svm', LinearSVC())]) print str(sum(cross_val_score(clf, data.data,data.target ))/3.0) + ' CountVec SVM' #array([ 0.55445545, 0.48 , 0.54081633]) clf = Pipeline([('vect', TfidfVectorizer()), ('svm', LinearSVC())]) print str(sum(cross_val_score(clf, data.data,data.target ))/3.0) + ' Tfidf SVM' #array([ 0.62376238, 0.57 , 0.6122449 ]) clf_sgdc = Pipeline([('vect', CountVectorizer()),('clf', linear_model.SGDClassifier()),]) print str(sum(cross_val_score(clf_sgdc, data.data,data.target ))/3.0) + ' SGDC'
def svmByPackageMachineLearning(xList, yList):
    '''
    Print 10-fold cross-validation accuracy for three SVC kernels.

    RBF, sigmoid and linear kernels are evaluated on (xList, yList), each
    with C=10.0 and one-vs-one decision function shape.  Every result is
    printed as "mean (+/- 2 * std)".  A polynomial variant existed but was
    disabled by the author.

    Example of the underlying calls:
        iris = datasets.load_iris()
        clf = svm.SVC(kernel='linear', C=1)
        score = cross_validation.cross_val_score(clf, iris.data, iris.target, cv=10)
        print("Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
    '''
    shared = dict(decision_function_shape='ovo', C=10.0)

    # (report template, estimator) in evaluation order
    candidates = [
        ("rbf: %0.2f (+/- %0.2f)",
         svm.SVC(kernel='rbf', gamma=5, **shared)),
        ("sig: %0.2f (+/- %0.2f)",
         svm.SVC(kernel='sigmoid', gamma=5, coef0=100.0, **shared)),
        ("lin: %0.2f (+/- %0.2f)",
         svm.SVC(kernel='linear', **shared)),
    ]

    # Run all cross validations first, then print the summaries together.
    evaluated = [
        (template,
         cross_validation.cross_val_score(model, xList, yList, cv=10))
        for template, model in candidates
    ]
    for template, score in evaluated:
        print(template % (score.mean(), score.std() * 2))
def test_cross_val_score_filter_feature_selection_threshold():
    """Compare thresholding inside cross validation with thresholding up front.

    For a threshold, features are trimmed either inside each fold (via
    ``cv.cross_val_score_filter_feature_selection``) or once before a plain
    ``cross_val_score``.  The two mean scores are compared for thresholds
    1.0 and .05.
    """
    def mean_scores(threshold):
        # Trimming performed inside the cross-validation loop.
        scikit_data, scikit_target = dfm.get_expression_scikit_data_target(
            expression_file, ic50_file, normalized=True, trimmed=True,
            threshold=None)
        model = classify.construct_svc_model(kernel='linear')
        inside_cv = cv.cross_val_score_filter_feature_selection(
            model, cv.trim_X_threshold, threshold, scikit_data, scikit_target,
            cv=5)

        # Trimming performed once, before cross validation.
        m = classify.construct_svc_model(kernel='linear')
        s_data, s_target = dfm.get_expression_scikit_data_target(
            expression_file, ic50_file, normalized=True, trimmed=True,
            threshold=threshold)
        up_front = cross_val_score(m, s_data, s_target, cv=5)
        return inside_cv.mean(), up_front.mean()

    non_thresholded_1, non_thresholded_2 = mean_scores(1.0)
    thresholded_1, thresholded_2 = mean_scores(.05)

    # With threshold 1.0 nothing is expected to be trimmed, so it should not
    # matter where the trimming step runs.
    assert(math.fabs(non_thresholded_1 - non_thresholded_2) < .001)
    # Thresholding within the cross validation should report lower accuracy,
    # because it reduces cross-validation overfitting.
    assert(thresholded_1 - thresholded_2 < 0)
def training(matrix, Y, SVM):
    """
    Cross-validate a classifier with micro/macro precision and recall.

    matrix: the train data
    Y: the labels in array
    SVM: boolean; if true an ``svm.SVC`` is used, otherwise an
         ``AdaBoostClassifier`` with 300 estimators
    return: dict with keys "micro" and "macro", each holding the tuple
            (precision scores, recall scores) from 10-fold cross validation
    """
    classifier = svm.SVC() if SVM else AdaBoostClassifier(n_estimators=300)

    # Scorers are built first, in the order the results are unpacked below.
    metric_functions = (
        custom_precision_micro_score,
        custom_precision_macro_score,
        custom_recall_micro_score,
        custom_recall_macro_score,
    )
    scorers = [metrics.make_scorer(fn) for fn in metric_functions]

    precision_micro, precision_macro, recall_micro, recall_macro = [
        cross_val_score(classifier, matrix, Y, cv=10, scoring=scorer)
        for scorer in scorers
    ]
    return {
        "micro": (precision_micro, recall_micro),
        "macro": (precision_macro, recall_macro),
    }
def dofitSVMstd(X_train, Y_train, X_test):
    """Fit a random forest on std/entropy summaries and self-train on the test set.

    Each sample ``X[j]`` is reduced to the 16 largest per-channel standard
    deviations followed by the 16 smallest per-channel ``getEntropy`` values
    (channels are indexed on the third axis).  The forest is fitted on the
    training summaries, used to label the test summaries, and then refitted
    on training plus self-labelled test data.

    Returns ``[scores, predictions]``: ``scores`` comes from cross-validating
    the classifier on the training summaries, and ``predictions`` holds the
    training predictions of the first fit followed by the test predictions of
    the refit.  Despite the name, no SVM is involved.
    """
    def summarise(samples):
        dims = samples.shape
        rows = []
        for j in range(dims[0]):
            spreads = sorted(
                [np.std(samples[j, :, i]) for i in range(dims[2])],
                reverse=True)
            entropies = sorted(
                [getEntropy(list(samples[j, :, i].astype(int)))
                 for i in range(dims[2])])
            rows.append(spreads[0:16] + entropies[0:16])
        return np.array(rows)

    x1 = summarise(X_train)
    clf = RandomForestClassifier()
    clf.fit(x1, Y_train)
    # This result is overwritten below; the call is kept so the sequence of
    # operations is unchanged.
    scores = cross_validation.cross_val_score(clf, x1, Y_train)
    p1 = clf.predict(x1)

    x2 = summarise(X_test)
    y2 = clf.predict(x2)

    # Refit on training data plus the self-labelled test data.
    clf.fit(np.concatenate((x1, x2)), np.concatenate((Y_train, y2)))
    p2 = clf.predict(x2)
    scores = cross_validation.cross_val_score(clf, x1, Y_train)
    return [scores, np.concatenate((p1, p2))]
def test_cross_val_score_allow_nans():
    """Check that ``cross_val_score`` allows input data with NaNs.

    One row of ``X`` is set to NaN and a pipeline of a mean ``Imputer``
    followed by ``MockClassifier`` is cross-validated with 5 folds.
    """
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    # Floor division keeps the repeat count an integer under true division
    # (Python 3 or ``from __future__ import division``); np.repeat rejects a
    # float count.
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([
        ("imputer", Imputer(strategy="mean", missing_values="NaN")),
        ("classifier", MockClassifier()),
    ])
    cval.cross_val_score(p, X, y, cv=5)
def analytics(): trainer_data = get_thing_from_file("training_dataset.txt") tester_data = get_thing_from_file("test_dataset.txt") bayes_clf = get_thing_from_file("bayes_model.txt") svm_clf = get_thing_from_file("svm_model.txt") # we load the fitted models from file so we don't need these lines # bayes_clf.fit(trainer_data.data, trainer_data.target) # svm_clf.fit(trainer_data.data, trainer_data.target) test = tester_data.data predicted_bayes = bayes_clf.predict(test) predicted_svm = svm_clf.predict(test) print "** ACCURACIES **" print numpy.mean(predicted_bayes == tester_data.target) print numpy.mean(predicted_svm == tester_data.target) print "** K-FOLD VALIDATION ACCURACY" bayes_scores = cross_validation.cross_val_score(bayes_clf, tester_data.data, tester_data.target, cv=10) svm_scores = cross_validation.cross_val_score(svm_clf, tester_data.data, tester_data.target, cv=10) print max(bayes_scores) print max(svm_scores) print "**"
def svm_classify(std_features, surf_features, labels):
    """Cross-validate an SVC on three feature sets and plot the accuracies.

    Mean 5-fold scores are computed for the standard features, the SURF
    features and their horizontal concatenation.  Each is printed as a
    percentage and the three are plotted to
    'img_classifying_graph_svm.png'.
    """
    def mean_accuracy(features):
        return cross_validation.cross_val_score(
            svm.SVC(), features, labels, cv=5).mean()

    def as_percent(fraction):
        # Percentage rounded to one decimal place.
        return 0.1* round(1000*fraction)

    mean_std = mean_accuracy(std_features)
    print('Accuracy (5 fold x-val) with svm [std features]: %s%%' % (as_percent(mean_std)))

    # SVM on the SURF features
    print('predicting...')
    mean_surf = mean_accuracy(surf_features)
    print('Accuracy (5 fold x-val) with svm [SURF features]: %s%%' % (as_percent(mean_surf)))

    # SVM on the combined features (the message text still says log. regression)
    print('Performing log. regression using combined features...')
    allfeatures = np.hstack([surf_features, std_features])
    mean_combined = mean_accuracy(allfeatures)
    print('Accuracy (5 fold x-val) with svm [All features]: %s%%' % (as_percent(mean_combined)))

    # plotting: a thick line with ring markers drawn on top
    positions = [0,1,2]
    heights = 100*np.array([mean_std, mean_surf, mean_combined])
    fig = plt.figure()
    fig.suptitle('SVM', fontsize=20)
    plt.plot(positions, heights, 'k-', lw=8)
    plt.plot(positions, heights, 'o', mec='#cccccc', mew=12, mfc='white')
    plt.xlim(-.5,2.5)
    plt.ylim(mean_std*90., mean_combined*110)
    plt.xticks(positions, ["Standard", "SURF", "Combined"])
    plt.ylabel('Accuracy (%)')
    plt.savefig('img_classifying_graph_svm.png')
def crossValidateRegression(X, y, C, epsilon):
    """Run a 10-fold CV of a linear SVR, then fit on all data and predict it.

    ``C`` and ``epsilon`` configure the ``svm.SVR``.  The folds are built over
    ``NUM_COUNTRYS`` samples.  Returns the predictions of the fully fitted
    model on ``X`` itself.
    """
    folds = cross_validation.KFold(NUM_COUNTRYS, n_folds = 10)
    regressor = svm.SVR(kernel='linear', C=C, epsilon=epsilon)

    # NOTE(review): the cross-validation scores are computed but not used or
    # returned; confirm whether callers were meant to receive them.
    cross_validation.cross_val_score(regressor, X, y, cv=folds,
                                     scoring='mean_squared_error')

    regressor.fit(X, y)
    return regressor.predict(X)
def rf_cross_val(x,y): X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.33, random_state = 42) random_forest_grid = {'n_estimators': [100], 'n_jobs': [-1]} rf_gridsearch = GridSearchCV(RandomForestRegressor(), random_forest_grid, n_jobs=-1, verbose=True, cv=3) rf_gridsearch.fit(X_train, y_train) print "best parameters:", rf_gridsearch.best_params_ best_rf_model = rf_gridsearch.best_estimator_ y_pred = best_rf_model.predict(X_test) print "Accuracy with best rf:", cross_val_score(best_rf_model, X_test, y_test).mean() rf = RandomForestRegressor(n_estimators=10, n_jobs = -1) print "Accuracy with default param rf:", cross_val_score(rf, X_test, y_test).mean() return best_rf_model
def crossValidation(self, classification): print "########## Cross Validating and Testing ##########\n\n\n" print "########## Results. ##########\n\n\n" scores = cross_validation.cross_val_score(classification, self.X_train, self.y_train, metrics.f1_score, cv=5, n_jobs=1) print "F1-score: %.2f (+/- %.2f)" % (scores.mean(), scores.std()/2) scores = cross_validation.cross_val_score(classification, self.X_train, self.y_train, metrics.accuracy_score, cv=5, n_jobs=1) print "Accuracy: %.2f (+/- %.2f)" % (scores.mean(), scores.std()/2)
def lda_run(self, k_folds = 5): self.r_forest_lda = RandomForestClassifier(n_estimators=2000,n_jobs=5, max_depth=None, min_samples_split=1, random_state =0) self.lda_scores = cross_validation.cross_val_score(self.r_forest_lda, self.lda_iss_features, self.labels, cv=k_folds,n_jobs=5) print("Cross validation Random Forest performance LDA: Accuracy: %0.2f (std %0.2f)" % (self.lda_scores.mean()*100, self.lda_scores.std()*100)) self.r_forest_lda.fit(self.lda_iss_features,self.labels) print self.r_forest_lda.score(self.lda_iss_validation_features, self.validation_labels)*100, 'LDA test-set performance \n' ''' C_range = np.logspace(-2, 10, 13) gamma_range = np.logspace(-9, 3, 13) param_grid = dict(gamma=gamma_range, C=C_range) cv = StratifiedShuffleSplit(self.labels, n_iter=5, test_size=0.2, random_state=42) grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv) grid.fit(self.lda_iss_features, self.labels) print("The best parameters are %s with a score of %0.2f"% (grid.best_params_, grid.best_score_)) ''' self.svc_lda = SVC(kernel='rbf',C = 1,gamma = 'auto') self.svc_lda_scores = cross_validation.cross_val_score(self.svc_lda, self.lda_iss_features, self.labels, cv=k_folds,n_jobs=5) print("Cross validation SVM performance LDA: Accuracy: %0.2f (std %0.2f)" % (self.svc_lda_scores.mean()*100, self.svc_lda_scores.std()*100)) self.svc_lda.fit(self.lda_iss_features,self.labels) print self.svc_lda.score(self.lda_iss_validation_features, self.validation_labels)*100, 'LDA test-set performance \n' y_true = self.validation_labels y_pred = self.svc_lda.predict(self.lda_iss_validation_features) target_names = ['S1','S2','S3','S4'] t = classification_report(y_true, y_pred, target_names=target_names) print 'Support vector report lda' print t
def validate_model(model, features, labels): accuracy = cross_val_score(model, features, labels, scoring='accuracy', cv=4).mean() precision = cross_val_score(model, features, labels, scoring='precision', cv=4).mean() recall = cross_val_score(model, features, labels, scoring='recall', cv=4).mean() f1 = cross_val_score(model, features, labels, scoring='f1', cv=4).mean() print "\n(METRICS) Accuracy: {:.3f} Precision: {:.3f} Recall: {:.3f} F1-Score: {:.3f}".\ format(accuracy,precision, recall, f1)
def run_model(data): """Do some label bucketing, print model output.""" features = data.ix[:, :-1] # more categories <--> less accuracy # labels = data.ix[:, -1].map(lambda k: 1 if k > 10 else 0) labels = data.ix[:, -1].map(lambda k: int(k / 5)) # bucketing trick print 'num classes = {}\n'.format(len(set(labels))) # weak (base) classifier print 'fitting weak classifier...' weak_clf = DecisionTreeClassifier(max_depth=MAX_DEPTH) weak_cv_results = cross_val_score(weak_clf, features, labels, cv=N_FOLDS) print 'weak_cv_results = {}'.format(weak_cv_results) print 'avg accuracy = {}\n'.format(weak_cv_results.mean()) # strong (ensemble) classifier print 'fitting strong classifier...' strong_clf = RandomForestClassifier( max_depth=MAX_DEPTH, n_estimators=N_TREES, n_jobs=N_JOBS) strong_cv_results = cross_val_score(strong_clf, features, labels, cv=N_FOLDS) print 'strong_cv_results = {}'.format(strong_cv_results) print 'avg accuracy = {}'.format(strong_cv_results.mean())
def coeff_of_deterimination(classifier, X, y, K=10): # Perform a cross-validation estimate of the coefficient of determination using # the cross_validation module using all CPUs available on the machine R21 = cross_val_score(classifier, X, y=y, n_jobs=1).mean() R2 = cross_val_score(classifier, X, y=y, cv=KFold(y.size, K), n_jobs=1).mean() print "The %d-Folds est coeff. of determ. R2 = %s" % (K, R2) print "basic cross val ", R21
def _run_classifier(X, Y, parent, child, max_depth): X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.333, random_state=0) clf = tree.DecisionTreeClassifier(min_samples_split=parent, min_samples_leaf=child, max_depth=max_depth) clf = clf.fit(X_train, y_train) print 'model score on train data data:' print clf.score(X_train, y_train) print 'ten fold cross-validation results on train data:' scores = cross_val_score(clf, X_train, y_train, cv=10) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) print 'model score on test data' print clf.score(X_test, y_test) print 'ten fold cross-validation results on test data:' scores = cross_val_score(clf, X_test, y_test, cv=10) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) print 'Gini Importance' print clf.feature_importances_ 'Classification Report' y_true, y_pred = y_test, clf.predict(X_test) print(classification_report(y_true, y_pred)) 'Confusion Matrix' print(confusion_matrix(y_true, y_pred)) cm = confusion_matrix(y_true, y_pred) print _calc_error_rate_conf_int(cm) return _calc_error_rate_conf_int(cm) + [len(y_test)]
def run_conventional_linkage(x, y, n_comps, linker_model, verbose=0, k_folds=3): print "---->Cross validating" cvs = cross_val_score(linker_model, x, y, cv=k_folds, scoring='r2', verbose=verbose) mse = cross_val_score(linker_model, x, y, cv=k_folds, scoring='mean_squared_error', verbose=verbose) print '---->R2: ', np.mean(cvs) print '---->MSE: ', np.mean(mse) return np.mean(cvs), np.std(cvs), np.mean(mse), np.std(mse)
def experiment_zero(data,company): print '___Experiment One___' # Experiment Parameters finance_datatype = 0 # finance_datatype: Integer 2 = Stock price change, 1 = Percentage stock price change, 0 = Only direction finance_n = 2 # finance_n: Integer >=0 Number of days of finance data to include sentiment_datatype = 1 # sentiment_datatype: Boolean 1 = all sentiment featues, 0 = Total sentiment_n = 1 # sentiment_n: Integer >=0 Number of days of sentiment data to include day = 0 # day: Boolean 1 = Include day of the week, 0 = do not target = 0 # target: Boolean 1 = Amount, 0 = Direction volume = 0 # volume: boolean 1 = Yes, 0 = No if (finance_n + sentiment_n + day + volume) == 0: print 'Insufficient parameters set' return # Data Processing feature_vector_meaning(company, finance_datatype, finance_n, sentiment_datatype, sentiment_n, day, target, volume) matrix = create_feature_matrix(company, data, finance_datatype, finance_n, sentiment_datatype, sentiment_n, day, target, volume) end = len(matrix[0]) train_x = matrix[:,0:end-1] train_y = matrix[:,end-1] # Classifier training scaler = preprocessing.StandardScaler().fit(train_x) train_x = scaler.transform(train_x) clf = direction_classifier(train_x,train_y) cv = cross_validation.ShuffleSplit(len(train_x), n_iter=5, test_size=0.2, random_state=0) print ' _ _ _Evaluation_ _ _' if target == 0: scores = cross_validation.cross_val_score(clf, train_x, train_y, cv=cv, scoring='accuracy') print(" Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) elif target == 1: scores = cross_validation.cross_val_score(clf, train_x, train_y, cv=cv, scoring='mean_squared_error') print(" MSE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) print '====================='
def fit_from_prep(self, infile): H, y, w = self._da.load_from_file(infile) self._vq = VQ(w, hist=w.shape[0]) self._cl.fit(H, y) if self._verbose: print cross_validation.cross_val_score( self._cl, H, y, cv=3).mean()
def randomforest_info(self, max_trees = 1000, step = 40, k_folds = 5): print 'Characterising R_forest. Looping through trees: ', self.treedata = np.zeros((max_trees/step, 10)) for i,n_trees in enumerate(np.arange(0, max_trees,step)): if n_trees == 0: n_trees = 1 print n_trees, r_forest = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0) scores = cross_validation.cross_val_score(r_forest, self.iss_features, self.labels, cv=k_folds,n_jobs=5) r_forest_full = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0) r_forest_full.fit(self.iss_features,self.labels) self.treedata[i,0] = n_trees self.treedata[i,1] = scores.mean() self.treedata[i,2] = scores.std() # now add the test dataset - score self.treedata[i,3] = r_forest_full.score(self.iss_validation_features, self.validation_labels) r_forest_lda = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0) r_forest_lda_full = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0) r_forest_lda_full.fit(self.lda_iss_features,self.labels) lda_scores = cross_validation.cross_val_score(r_forest_lda, self.lda_iss_features, self.labels, cv=k_folds,n_jobs=5) self.treedata[i,4] = lda_scores.mean() self.treedata[i,5] = lda_scores.std() self.treedata[i,6] = r_forest_lda_full.score(self.lda_iss_validation_features, self.validation_labels) print self.treedata[i,6] r_forest_pca = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0) r_forest_pca_full = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0) r_forest_pca_full.fit(self.pca_iss_features,self.labels) pca_scores = cross_validation.cross_val_score(r_forest_pca, self.pca_iss_features, self.labels, cv=k_folds,n_jobs=5) self.treedata[i,7] = pca_scores.mean() 
self.treedata[i,8] = pca_scores.std() self.treedata[i,9] = r_forest_pca_full.score(self.pca_iss_validation_features, self.validation_labels)
# TN += 1 # elif (y_actual[i] != y_hat[i]): # if (y_actual[i] == 1): # FN += 1 # else: # FP += 1 # else: # print "Actual", y_actual[i], y_hat[i] return (TP/len(y_hat), FP/len(y_hat), TN/len(y_hat), FN/len(y_hat)) for i in xrange(itiration): # clf = Random(strategy='uniform') clf = LR() # 0.60 +/- 0.24 np.random.shuffle(dataSet) # print "Dataset", dataSet scores = cross_validation.cross_val_score(clf, dataSet[:, 2:], dataSet[:, 1], cv=10) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) predict = cross_validation.cross_val_predict(clf, dataSet[:, 2:], dataSet[:, 1], cv=10) print zip(predict,dataSet[:, 1], dataSet[:, 0]) dTP, dFP, dTN, dFN = perf_measure(dataSet[:, 1], predict) TP += dTP FP += dFP TN += dTN FN += dFN av_mean += scores.mean() print "Average accuracy is", av_mean/itiration print "Rate", TP/itiration, FP/itiration, TN/itiration, FN/itiration
'variance': pca.explained_variance_, 'principal component': pca_df.columns.tolist() }) # adding one to pricnipal componetns (since there is no 0th compeonet) variance_df['principal component'] = variance_df['principal component'] + 1 variance_df.plot(x='principal component', y='variance') # looks like variance stops getting explained after first two components pca_df_small = pca_df.ix[:, 0:1] # getting a cross val score of transformed data rf = ensemble.RandomForestClassifier(n_estimators=500) roc_scores_rf_pca = cross_val_score(rf, pca_df_small, response_series, cv=10, scoring='roc_auc') print roc_scores_rf_pca.mean() # 74% accuracy roc_scores_rf = cross_val_score(rf, explanatory_df, response_series, cv=10, scoring='roc_auc') print roc_scores_rf.mean() # PCA created significant information loss in this case ############################
numpy.random.seed(seed) estimators = [] estimators.append(('standardize', StandardScaler())) estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, nb_epoch=300, batch_size=16, verbose=0))) pipeline = Pipeline(estimators) kfold = StratifiedKFold(y=encoded_Y, n_folds=10, shuffle=True, random_state=seed) results = cross_val_score(pipeline, X, encoded_Y, cv=kfold) print("Accuracy: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100)) # Accuracy: 82.68% (3.90%) # dropout in the input layer with weight constraint def create_model1(): # create model model = Sequential() model.add(Dropout(0.2, input_shape=(60, ))) model.add( Dense(60, init='normal', activation='relu', W_constraint=maxnorm(3))) model.add( Dense(30, init='normal', activation='relu', W_constraint=maxnorm(3)))
def _classification_report(y_true, y_pred, cv_scores):
    """Build the accuracy dictionary that is stored for a fitted classifier.

    The dictionary is persisted as ``str(report)``, so the key texts are part
    of the stored output and must not be renamed.

    Args:
        y_true: Labels the classifier was fitted on.
        y_pred: Predictions of the fitted classifier for the same rows.
        cv_scores: Array of per-fold scores returned by ``cross_val_score``.

    Returns:
        dict mapping metric name to metric value.
    """
    return {
        "Accuracy Classification Score":
        metrics.accuracy_score(y_true, y_pred),
        "Precision Score":
        metrics.precision_score(y_true, y_pred, average="macro"),
        "Recall Score":
        metrics.recall_score(y_true, y_pred, average="macro"),
        "F-score":
        metrics.f1_score(y_true, y_pred, average="macro"),
        "Cross Val. Accuracy (+/- %0.2f)" % (cv_scores.std() * 2):
        cv_scores.mean(),
    }


def _save_result(model_type, pickled_model, report):
    """Insert one ``Results`` row (pickled model + report) and commit it."""
    db.session.add(
        Results(model_type=model_type,
                model=pickled_model,
                accuracy=str(report)))
    db.session.commit()


def _evaluate_classifier(model_type, estimator, X, y):
    """Fit *estimator* on all of (X, y), score it and store the result.

    The model is pickled right after fitting; the stored metrics are computed
    on the training data plus a 5-fold cross-validation of the same estimator.
    """
    fitted = estimator.fit(X, y)
    pickled_model = pickle.dumps(fitted)
    predictions = fitted.predict(X)
    cv_scores = cross_val_score(estimator, X, y, cv=5)
    _save_result(model_type, pickled_model,
                 _classification_report(y, predictions, cv_scores))


def _pick_by_holdout(build_estimator, candidates, X, y):
    """Return the candidate value that scores best on a 40% hold-out split.

    Scores are rounded to two decimals before comparison and the first
    candidate reaching the best rounded score wins.

    Args:
        build_estimator: Callable mapping a candidate value to an unfitted
            estimator.
        candidates: Indexable sequence of hyper-parameter values to try.
        X: Explanatory variables.
        y: Dependent variable.
    """
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=42)
    scores = [
        round(
            build_estimator(value).fit(X_train, y_train).score(
                X_test, y_test), 2) for value in candidates
    ]
    return candidates[scores.index(max(scores))]


def analysis():
    """Retrain all five models and replace the rows of the ``Results`` table.

    Every existing ``Results`` row is deleted first. Then a linear regression,
    a logistic regression, a Gaussian naive Bayes, a k-nearest-neighbour and
    an SVM model are fitted, and each one is stored (pickled) together with a
    stringified dictionary of its accuracy measures.
    """
    # Delete existing data in table
    db.session.query(Results).delete()
    db.session.commit()

    # ABTs for each model.
    # normal is False for the classifiers because it removes a category group.
    df_lr = abt(normal=False).copy()
    df_log = abt(normal=False, convert=True).copy()
    df_gnb = abt(normal=False, convert=True).copy()
    df_knn = abt(normal=False, convert=True).copy()
    df_svm = abt(normal=False, convert=True).copy()

    ####################################################
    ##### Model 1: Simple Linear Regression ############
    ####################################################
    lr = LinearRegression()  # model object
    X = lin_exp_var(df_lr)  # Explanatory Variable
    y = df_lr["min_occ_reg"]  # Dependent Variable

    # Fit model, predict and cross eval
    clf = lr.fit(X, y)
    lr_model = pickle.dumps(clf)
    pred_lr = clf.predict(X)
    cross_val_scores = cross_val_score(lr, X.astype(int), y.astype(int), cv=5)

    # Create dictionary of accuracy measures
    mse = metrics.mean_squared_error(y, pred_lr)
    lr_dict = {
        "Mean Squared Error":
        mse,
        "Root Mean Squared Error":
        np.sqrt(mse),
        "R squared":
        lr.score(X, y),
        # The spread is reported as 2 * std, the same convention the four
        # classifiers below use, so the stored "+/-" figures are comparable.
        "Cross Val. Accuracy (+/- %0.2f)" % (cross_val_scores.std() * 2):
        cross_val_scores.mean(),
    }
    _save_result("Simple Linear Regression", lr_model, lr_dict)

    ####################################################
    ##### Model 2: Multinomial Logistic Regression #####
    ####################################################
    _evaluate_classifier("Multinomial Logistic Regression",
                         LogisticRegression(), log_exp_var(df_log),
                         df_log["occupancy"])

    ####################################################
    ##### Model 3: Gaussian Naive Bayes ################
    ####################################################
    _evaluate_classifier("Gaussian Naive Bayes", GaussianNB(),
                         gnb_exp_var(df_gnb), df_gnb["occupancy"])

    ####################################################
    ##### Model 4: k-Nearest Neighbor ##################
    ####################################################
    X = knn_exp_var(df_knn)  # Explanatory Variable
    y = df_knn["occupancy"]  # Dependent Variable

    # Find best number of neighbours (k = 2..14) on a hold-out split
    neighbours = _pick_by_holdout(
        lambda k: KNeighborsClassifier(n_neighbors=k), list(range(2, 15)), X,
        y)
    _evaluate_classifier("k-Nearest Neighbor",
                         KNeighborsClassifier(n_neighbors=neighbours), X, y)

    ####################################################
    ##### Model 5: Support Vector Machines #############
    ####################################################
    X = svm_exp_var(df_svm)  # Explanatory Variable
    y = df_svm["occupancy"]  # Dependent Variable

    # Find best gamma (0..19) on a hold-out split.
    # NOTE(review): gamma=0 is among the candidates; how SVC treats a zero
    # gamma depends on the installed scikit-learn version -- confirm.
    gamma = _pick_by_holdout(lambda g: SVC(gamma=g), list(range(0, 20)), X, y)
    _evaluate_classifier("Support Vector Machines", SVC(gamma=gamma), X, y)
df.corr()

# Let's go and get our hands dirty: reload the red-wine data and split it
# into explanatory columns (all but the last) and the 'quality' response.
df = pd.read_csv('winequality-red.csv', sep=';')
X = df[list(df.columns)[:-1]]
y = df['quality']

# train_test_split randomly partitions the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_predictions = regressor.predict(X_test)
# Single-argument print(...) emits the same text on Python 2 and Python 3;
# the bare print statement used before is a syntax error on Python 3.
print('R-squared: %s' % (regressor.score(X_test, y_test),))

# let's get the cross-validation score (10 folds on the full data)
scores = cross_val_score(regressor, X, y, cv=10)
print('%s %s' % (scores.mean(), scores))

# plot predicted quality against true quality
plt.scatter(y_predictions, y_test)
plt.xlabel('Predicted quality')
plt.ylabel('True quality')
plt.title('Predicted quality Vs True Quality')
plt.show()

# Fitting models using gradient descent.
# SGDRegressor is an implementation of SGD that can be used even for
# regression problems with more features. It can be used to optimize
# different cost functions to fit different linear models. By default, it
# will optimize the residual sum of squares.
# Class labels: last column of the events array, shifted down by 2.
labels = epochs.events[:, -1] - 2

# cross validation splitter shared by both classifiers below
cv = KFold(len(labels), 10, shuffle=True, random_state=42)

# epochs data, multiplied by 1e6
epochs_data_train = epochs.get_data() * 1e6

# covariance matrices computed from the epochs
cov_data_train = Covariances().transform(epochs_data_train)

###############################################################################
# Classification with Minimum distance to mean
mdm = MDM(metric={'mean': 'riemann', 'distance': 'riemann'})

# Cross-validated scores of the MDM classifier
scores = cross_val_score(mdm, cov_data_train, labels, cv=cv, n_jobs=1)

# Chance level = share of the larger of the two groups
# "equal to labels[0]" / "different from labels[0]".
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("MDM Classification accuracy: %f / Chance level: %f" %
      (np.mean(scores), class_balance))

###############################################################################
# Classification with Tangent Space Logistic Regression
clf = TSclassifier()

# Cross-validated scores of the tangent-space classifier
scores = cross_val_score(clf, cov_data_train, labels, cv=cv, n_jobs=1)

# Printing the results
class_balance = np.mean(labels == labels[0])
# Using GridSearchCV to find the best values for C and gamma
C_range = 10.0**np.arange(-4, 4)
gamma_range = 10.0**np.arange(-10, 1)
param_grid = dict(gamma=gamma_range, C=C_range)
skf = cv.StratifiedKFold(y=y_train, n_folds=3)
grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=skf)
grid.fit(X_train, y_train)

# Refit an SVC with the best parameters (probability estimates enabled for
# the ROC curve below) and report its scores.
# Every print uses the single-argument call form, which emits the same text
# on Python 2 and Python 3.
crossclf = svm.SVC(probability=True, **grid.best_params_)
y_pred = crossclf.fit(X_train, y_train).predict(X_test)
print(crossclf)
# This line prints y_train; it used to be mislabelled 'y_pred: '.
print('y_train:  %s' % (y_train,))
print('y_pred:  %s' % (y_pred,))
print("Best parameter %s" % (grid.best_params_,))  # {'C': 10.0, 'gamma': 0.001}
print("Cross-Validation score %s" %
      (cv.cross_val_score(crossclf, X_train, y_train).mean(),))
print("Independent accuracy score %s" % (accuracy_score(y_test, y_pred),))
print("Independent precision score %s" % (precision_score(y_test, y_pred),))
print("Independent recall score %s" % (recall_score(y_test, y_pred),))
print("Independent f1 score %s" % (f1_score(y_test, y_pred),))

## 7.0 Plot ROC curve
# Compute roc and auc
print("Step 7")
probas_ = crossclf.predict_proba(X_test)
print(probas_)
# Relabel class 2 as 0 in place so roc_curve sees a 0/1 target.
# NOTE(review): probas_[:, 1] is the probability of the second entry of
# crossclf.classes_; if the original labels are {1, 2} that is the class just
# relabelled to 0, which would invert the curve -- confirm the label set.
y_test[y_test == 2] = 0
fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
print('%s %s' % (fpr, tpr))
roc_auc = auc(fpr, tpr)
print("Area under the curve %s" % (roc_auc,))
name_text_map[line[0]] = line[4] name_type_map[line[0]] = line[2] line = f.readline().split(",") success = 0 count = 0 with open("result/resultSVM" + str(int(datetime.now().timestamp())) + ".txt", "w", encoding="utf-8") as f: for n in name_text_map: predicted = idx_type_map[clf.predict(w2v.vectorize( name_text_map[n]))[0]] print(n + " : " + name_type_map[n] + " -> " + predicted) f.write(n + " : " + name_type_map[n] + " -> " + predicted + "\n") count = count + 1 if predicted == name_type_map[n]: success = success + 1 print(str(success) + "/" + str(count)) f.write(str(success) + "/" + str(count) + "\n") # leave-1-out で検定してみよう scores = cross_validation.cross_val_score( clf, datas, labels, cv=5, ) print("scores: " + str(scores)) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import cross_validation


def _load_dense_csv(path):
    """Read a headerless comma-separated file into (samples, labels).

    Every line is split on ','; the last field is parsed as the integer
    label and all preceding fields as float features.

    Args:
        path: Path of the file to read.

    Returns:
        Tuple ``(samples, labels)``: a list of float lists and a list of ints.

    Raises:
        ValueError: If a field cannot be parsed (this includes blank lines).
    """
    samples = []
    labels = []
    with open(path) as csvdata:
        for line in csvdata:
            vector = line.split(',')
            labels.append(int(vector[-1]))
            samples.append([float(feature) for feature in vector[:-1]])
    return samples, labels


samples, labels = _load_dense_csv('../data/susy_10000_dense.csv.train')
testsamples, testlabels = _load_dense_csv('../data/susy_10000_dense.csv.test')

model = LogisticRegression()
model.fit(samples, labels)

# Single-argument print(...) emits the same text on Python 2 and Python 3.
# accuracy_score takes (y_true, y_pred); the value is the same either way
# round, but the conventional order keeps the call readable.
print(accuracy_score(testlabels, model.predict(testsamples)))
print(cross_validation.cross_val_score(model, samples, labels))
def main():
    """Compare several classifiers on the Titanic training data.

    Loads ``titanic_train.csv``, prints the training error of the majority
    vote, random, decision tree and k-nearest-neighbour classifiers, prints
    the average training/test error reported by ``error`` for each of them,
    cross-validates k for k-NN, and finally saves learning curves for k-NN
    and a depth-6 decision tree to ``h.pdf``.
    """
    # load Titanic dataset
    titanic = load_data("titanic_train.csv", header=1, predict_col=0)
    X = titanic.X
    Xnames = titanic.Xnames
    y = titanic.y
    yname = titanic.yname
    n, d = X.shape  # n = number of examples, d = number of features

    #========================================
    # part a: plot histograms of each feature
    '''
    print('Plotting...')
    for i in range(d) :
        plot_histogram(X[:,i], y, Xname=Xnames[i], yname=yname)
    '''

    #========================================
    # train Majority Vote classifier on data
    print('Classifying using Majority Vote...')
    # create MajorityVote classifier, which includes all model parameters
    clf = MajorityVoteClassifier()
    clf.fit(X, y)  # fit training data using the classifier
    y_pred = clf.predict(X)  # run the classifier on the training data
    train_error = 1 - metrics.accuracy_score(y, y_pred, normalize=True)
    print('\t-- training error: %.3f' % train_error)

    ### ========== TODO : START ========== ###
    # part b: evaluate training error of Random classifier
    print('Classifying using Random...')
    clfRand = RandomClassifier()
    clfRand.fit(X, y)
    y_predRand = clfRand.predict(X)
    train_errorRand = 1 - metrics.accuracy_score(
        y, y_predRand, normalize=True)
    print('\t-- training error for random: %.3f' % train_errorRand)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part c: evaluate training error of Decision Tree classifier
    # use criterion of "entropy" for Information gain
    print('Classifying using Decision Tree...')
    clf_tree = DecisionTreeClassifier(criterion="entropy")
    clf_tree = clf_tree.fit(X, y)
    y_pred_tree = clf_tree.predict(X)
    train_error_tree = 1 - metrics.accuracy_score(
        y, y_pred_tree, normalize=True)
    print('\t-- training error for decision tree: %.3f' % train_error_tree)
    ### ========== TODO : END ========== ###

    # note: uncomment out the following lines to output the Decision Tree graph
    # save the classifier -- requires GraphViz and pydot
    '''
    from pydot import graph_from_dot_data
    from io import StringIO
    from sklearn import tree
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data, feature_names=Xnames)
    graph = graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("dtree.pdf")
    '''

    ### ========== TODO : START ========== ###
    # part d: evaluate training error of k-Nearest Neighbors classifier
    # use k = 3, 5, 7 for n_neighbors
    print('Classifying using k-Nearest Neighbors...')
    for k in (3, 5, 7):
        clf_neigh = KNeighborsClassifier(n_neighbors=k)
        clf_neigh.fit(X, y)
        train_error_neigh = 1 - metrics.accuracy_score(
            y, clf_neigh.predict(X), normalize=True)
        print('\t-- training error for %d nearest neighbors: %.3f' %
              (k, train_error_neigh))
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part e: use cross-validation to compute average training and test error of classifiers
    print('Investigating various classifiers...')
    # All four classifiers are constructed before the first call to error(),
    # in the same order as they are evaluated.
    candidates = (
        ('majority', MajorityVoteClassifier()),
        ('random', RandomClassifier()),
        ('decision tree', DecisionTreeClassifier(criterion="entropy")),
        ('5 nearest neighbors', KNeighborsClassifier(n_neighbors=5)),
    )
    for label, candidate in candidates:
        avg_train_err, avg_test_err = error(candidate, X, y)
        print('\t-- Average training error for %s: %.3f' %
              (label, avg_train_err))
        print('\t-- Average test error for %s: %.3f' % (label, avg_test_err))
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part f: use 10-fold cross-validation to find the best value of k for k-Nearest Neighbors classifier
    print('Finding the best k for KNeighbors classifier...')
    plot_error = []
    plot_neigh = []
    for i in range(1, 50, 2):
        clf_knn = KNeighborsClassifier(n_neighbors=i)
        err = 1 - np.mean(cross_val_score(clf_knn, X, y, cv=10))
        plot_neigh.append(i)
        plot_error.append(err)
    # plt.plot(plot_neigh, plot_error, marker='o')
    # plt.ylabel('validation error')
    # plt.xlabel('# of neighbors')
    # plt.savefig("crossVal.pdf")
    ### ========== TODO : END ========== ###

    '''
    ### ========== TODO : START ========== ###
    # part g: investigate decision tree classifier with various depths
    print('Investigating depths...')
    plot_train_err = []
    plot_test_err = []
    plot_tree = []
    for i in range(1, 21, 1):
        train_err, test_err = error(DecisionTreeClassifier(criterion="entropy", max_depth=i), X, y)
        print(i, " ", test_err)
        plot_tree.append(i)
        plot_train_err.append(train_err)
        plot_test_err.append(test_err)
    red_patch = mpl.patches.Patch(color='red', label='training error')
    green_patch = mpl.patches.Patch(color='blue', label='test error')
    plt.plot(plot_tree, plot_train_err, 'r', plot_tree, plot_test_err, marker='.')
    plt.ylabel('average error')
    plt.xlabel('max depth of tree')
    plt.legend(handles=[red_patch, green_patch])
    plt.savefig("decisionTreeVal.pdf")
    ### ========== TODO : END ========== ###
    '''

    ### ========== TODO : START ========== ###
    # part h: investigate Decision Tree and k-Nearest Neighbors classifier with various training set sizes
    print('Investigating training set sizes...')
    plt_knn_trerr = []
    plt_knn_tserr = []
    plt_tree_trerr = []
    plt_tree_tserr = []
    plt_amt_training = []
    trials = 100
    # 10% of the data is held out once as the fixed test set.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=1234)
    # i tenths of the remaining 90% are dropped, so (10 - i) tenths train.
    # NOTE(review): i == 0 passes test_size=0.0 to train_test_split; whether
    # that is accepted depends on the installed scikit-learn -- confirm.
    for i in [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]:
        knn_test_sum = 0
        knn_train_sum = 0
        tree_train_sum = 0
        tree_test_sum = 0
        for j in range(trials):
            h_knn_clf = KNeighborsClassifier(n_neighbors=7)
            h_tree_clf = DecisionTreeClassifier(criterion="entropy",
                                                max_depth=6)
            X_tr, X_ts, y_tr, y_ts = train_test_split(X_train,
                                                      y_train,
                                                      test_size=(i * 0.1))
            h_knn_clf.fit(X_tr, y_tr)
            y_tr_pred_knn = h_knn_clf.predict(X_tr)
            y_ts_pred_knn = h_knn_clf.predict(X_test)
            h_tree_clf.fit(X_tr, y_tr)
            y_tr_pred_tree = h_tree_clf.predict(X_tr)
            y_ts_pred_tree = h_tree_clf.predict(X_test)
            knn_test_sum += (
                1 - metrics.accuracy_score(y_test, y_ts_pred_knn,
                                           normalize=True))
            knn_train_sum += (
                1 - metrics.accuracy_score(y_tr, y_tr_pred_knn,
                                           normalize=True))
            tree_train_sum += (
                1 - metrics.accuracy_score(y_tr, y_tr_pred_tree,
                                           normalize=True))
            tree_test_sum += (
                1 - metrics.accuracy_score(y_test, y_ts_pred_tree,
                                           normalize=True))
        plt_knn_tserr.append(knn_test_sum / trials)
        plt_knn_trerr.append(knn_train_sum / trials)
        plt_tree_trerr.append(tree_train_sum / trials)
        plt_tree_tserr.append(tree_test_sum / trials)
        # Divide by a float: with two ints, Python 2 floors i / 10 to 0 and
        # every point would land on x = 1.
        plt_amt_training.append(1 - (i / 10.0))

    curve_specs = [
        (plt_knn_trerr, 'red', 'KNN training Error', '.'),
        (plt_knn_tserr, 'blue', 'KNN test Error', '.'),
        (plt_tree_trerr, 'green', 'Decision Tree training Error', '1'),
        (plt_tree_tserr, 'purple', 'Decision Tree test Error', '1'),
    ]
    curves = [
        mpl.lines.Line2D(plt_amt_training,
                         errors,
                         color=color,
                         label=label,
                         marker=marker)
        for errors, color, label, marker in curve_specs
    ]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for curve in curves:
        ax.add_line(curve)
    ax.set_xlim(0, 1.1)
    ax.set_ylim(0, 0.3)
    plt.ylabel('error')
    plt.xlabel('fraction of 90% used to train')
    plt.legend(handles=curves)
    fig.savefig("h.pdf")
    ### ========== TODO : END ========== ###

    print('Done')
print('Included features are: %s' % features) sys.stdout.flush() rfr = RandomForestRegressor(n_estimators=n_estimators, n_jobs=n_jobs, min_samples_leaf=msl, min_samples_split=mss, verbose=1) do_sklearn_cv = False if do_sklearn_cv: X = train[features].values y = train['elo'] msg("CROSS VALIDATING") cvs = cross_val_score(rfr, X, y, cv=n_cv_groups, n_jobs=n_jobs, scoring='mean_absolute_error') print(cvs, np.mean(cvs)) sys.stdout.flush() do_semimanual_cv = False if do_semimanual_cv: msg("fold") kf = KFold(train.shape[0], n_folds=n_cv_groups, shuffle=True) ins = [] outs = [] for train_index, test_index in kf: msg("fit") foo = rfr.fit(train.iloc[train_index][features], train.iloc[train_index]['elo'])
# Grid search over `params` for a logistic regression, scored by ROC AUC.
lr = linear_model.LogisticRegression(random_state=13)
clf_grid_search = grid_search.GridSearchCV(lr,
                                           params,
                                           verbose=0,
                                           scoring='roc_auc')
clf_grid_search.fit(model.X, model.Y)
best_params = clf_grid_search.best_params_
print('格点搜索得到最优参数:', best_params)

#%% Cross-validation to obtain the average scores
# A fresh estimator is configured with a fixed parameter set, which replaces
# the grid-search result stored in best_params above.
lr = linear_model.LogisticRegression(random_state=13)
best_params = {'C': 0.1, 'penalty': 'l2', 'class_weight': 'balanced'}
lr.set_params(**best_params)
ks_scoring = metrics.make_scorer(ModelEvaluate.cal_ks,
                                 needs_proba=True,
                                 return_split=False,
                                 decimals=4)
print(
    '五折交叉验证AUC值:',
    cross_validation.cross_val_score(lr,
                                     model.X,
                                     model.Y,
                                     scoring='roc_auc',
                                     cv=5))
print(
    '五折交叉验证KS值:',
    cross_validation.cross_val_score(lr,
                                     model.X,
                                     model.Y,
                                     scoring=ks_scoring,
                                     cv=5))

#%% Final model evaluation
lr.fit(model.X, model.Y)
# Column 0 of predict_proba is used as the score; both series below are
# re-indexed to line up with model.Y.
score = pd.Series(lr.predict_proba(model.X)[:, 0])
score.index = model.Y.index
y_pred = pd.Series(lr.predict(model.X))
y_pred.index = model.Y.index
result_ks = ModelEvaluate.plot_ks_cdf(model.Y, score, decimals=4, close=False)
# replace the null data with -99999
df_features.replace(np.nan, -99999, inplace=True)
df_occ_slot = pd.DataFrame()
df_occ_slot["occupancy"] = pd.Series(occupancy_slot_list)

# define features (X) and labels (Y)
X = np.array(df_features)
Y = np.array(df_occ_slot["occupancy"])

# Feature importances of the already-fitted clf, rescaled to percentages.
# The total is computed once instead of once per element.
coef_importance = clf.feature_importances_
importance_total = sum(coef_importance)
norm_coef_importance = [
    100 * float(i) / importance_total for i in coef_importance
]
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("norm_coef_importance:")
print(norm_coef_importance)

# y holds the predictions, Y the true labels.
y = clf.predict(X)
cnf_matrix = confusion_matrix(Y, y)

# plot unnormalized confusion matrix
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=["unoccupied", "Occupied"],
    title=
    'unnormalization confusion matrix (Use house#02 Random Forest Model to predict house#03)'
)

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=["unoccupied", "Occupied"],
    normalize=True,
    title=
    'Normalized confusion matrix (Use house#02 Random Forest Model to predict house#03)'
)

scores = cross_validation.cross_val_score(clf, X, Y, cv=10)
f4_MLP = f1_score(Y, y, average='micro')
print("scores by RandomForestClassifier: ")
# A separator is inserted between the two fields; they used to be printed
# run together ("...<mean>score4.var: ...").
print("score4.mean: " + str(np.mean(scores)) + ", score4.var: " +
      str(np.var(scores)))
print("f1 score for RandomForest:")
print(f4_MLP)
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# Load the data: the first eight columns are the inputs, the ninth ('class')
# is the target.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:, :8]
Y = array[:, 8]

# Feature union: 3 principal components side by side with the 6 best
# features chosen by SelectKBest.
features = [
    ('pca', PCA(n_components=3)),
    ('select_best', SelectKBest(k=6)),
]
feature_union = FeatureUnion(features)

# Pipeline: the feature union feeds a logistic regression.
estimators = [
    ('feature_union', feature_union),
    ('logistic', LogisticRegression()),
]
model = Pipeline(estimators)

# Evaluate the whole pipeline with 10-fold cross-validation and print the
# mean score.
num_folds = 10
num_instances = len(X)
seed = 7
kfold = KFold(n=num_instances, n_folds=num_folds, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
def _scaled_logreg_auc(features, y, c):
    """Standardize *features* and cross-validate an L2 logistic regression.

    Prints the time spent in the 5-fold ROC-AUC cross-validation and then the
    pair ``(c, mean AUC)``.

    Args:
        features: Feature matrix (DataFrame or 2-D array).
        y: Target values; ``y.size`` sets the number of samples for KFold.
        c: Inverse regularization strength passed to LogisticRegression.

    Returns:
        The standardized feature matrix produced by StandardScaler.
    """
    scaler = StandardScaler()
    scaler.fit(features)
    scaled = scaler.transform(features)

    kf = KFold(y.size, n_folds=5, shuffle=True)
    clf = LogisticRegression(penalty='l2', C=c)
    clf.fit(scaled, y)

    start_time = datetime.datetime.now()
    scores = cross_validation.cross_val_score(clf,
                                              scaled,
                                              y,
                                              cv=kf,
                                              scoring='roc_auc')
    # Single-argument print(...) emits the same text on Python 2 and 3; the
    # bare print statement used before is a syntax error on Python 3.
    print('Time elapsed: %s' % (datetime.datetime.now() - start_time,))
    mean = scores.mean()
    print(c, mean)
    return scaled


def log_regr(dataframe):
    """Evaluate logistic regression on match features, then with hero picks.

    Works on a copy of *dataframe*. The target is the ``radiant_win`` column.
    First the numeric features alone are standardized and cross-validated,
    then a signed hero-pick matrix read from ``data/data/features.csv`` is
    appended and the evaluation is repeated. Results are printed; nothing is
    returned.

    Args:
        dataframe: Match features including ``radiant_win`` and the outcome
            columns that are dropped below.
    """
    df = dataframe.copy()
    # Print the first 5 rows of df
    print(df.head())
    y = df.radiant_win

    # Drop the fields that are absent from the test sample
    delete_cols = [
        'radiant_win', 'duration', 'tower_status_radiant',
        'tower_status_dire', 'barracks_status_radiant',
        'barracks_status_dire'
    ]
    df.drop(delete_cols, inplace=True, axis=1)

    # Check whether the data has missing values
    rows = len(df)
    counts = df.count()
    data_with_skip = counts[counts != rows]
    print('Rows with empty values:')
    print(data_with_skip)

    # Fill the missing values
    df = df.fillna(0)

    # Delete categorical columns
    del_list = [
        'lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
        'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'
    ]
    df.drop(del_list, inplace=True, axis=1)

    # Scale and evaluate the numeric features only
    X = _scaled_logreg_auc(df, y, 0.01)

    # Find unique heroes
    data = pd.read_csv('data/data/features.csv', index_col='match_id')
    heroes_cols_list = [
        'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero',
        'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'
    ]
    uniq_heroes_list = set()
    for col in heroes_cols_list:
        uniq_heroes_list.update(data[col].unique())
    print(uniq_heroes_list)
    print('Number of uniq heroes:{}'.format(len(uniq_heroes_list)))

    # Coding information about heroes
    # N is the number of different heroes in the sample
    N = 113  # count heroes in heroes.csv
    X_pick = np.zeros((df.shape[0], N))
    # NOTE(review): the rows of `data` are assumed to be in the same order as
    # the rows of `dataframe` -- confirm against the caller.
    for i, match_id in enumerate(data.index):
        for p in range(5):
            # +1 marks a radiant pick, -1 a dire pick; hero ids are 1-based.
            # .loc is the label-based lookup (.ix was removed from pandas).
            X_pick[i, data.loc[match_id, 'r%d_hero' % (p + 1)] - 1] = 1
            X_pick[i, data.loc[match_id, 'd%d_hero' % (p + 1)] - 1] = -1
    X_2 = np.concatenate([X, X_pick], axis=1)

    # Scale and evaluate again with the hero picks included
    _scaled_logreg_auc(X_2, y, 0.01)
import pandas
from sklearn import cross_validation
from sklearn.linear_model import ElasticNet

# Whitespace-delimited table without a header row; the column names are
# supplied here.
url = "https://goo.gl/sXleFv"
names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)

# The first 13 columns are the inputs, column 13 (MEDV) is the target.
array = dataframe.values
X = array[:, :13]
Y = array[:, 13]

# 10-fold cross-validation of an ElasticNet model, scored with the
# 'mean_squared_error' scorer; the mean score is printed.
num_folds = 10
num_instances = len(X)
seed = 7
kfold = cross_validation.KFold(n=num_instances,
                               n_folds=num_folds,
                               random_state=seed)
model = ElasticNet()
scoring = 'mean_squared_error'
results = cross_validation.cross_val_score(model,
                                           X,
                                           Y,
                                           cv=kfold,
                                           scoring=scoring)
print(results.mean())
# -*- coding: utf-8 -*-
__author__ = 'pratapdangeti'

import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split

data = load_boston()
print(data)

x_train, x_test, y_train, y_test = train_test_split(data.data, data.target)

# Features and target are standardized with separate scalers; both scalers
# are fitted on the training split only and then applied to the test split.
x_scaler = StandardScaler()
y_scaler = StandardScaler()
x_train = x_scaler.fit_transform(x_train)
x_test = x_scaler.transform(x_test)
# StandardScaler works on 2-D input: the 1-D target is reshaped to a single
# column for scaling and flattened again for the regressor.
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()

regressor = SGDRegressor(loss='squared_loss')
scores = cross_val_score(regressor, x_train, y_train, cv=5)
print('Cross Validation r-squared scores:', scores)
print('Average cross validation r-squared score', np.mean(scores))

# fit() is the call that trains the regressor; fit_transform() was a
# deprecated feature-selection shortcut and its return value was discarded.
regressor.fit(x_train, y_train)
print('Test set r-squared score', regressor.score(x_test, y_test))
SC_X = StandardScaler() X_Train = SC_X.fit_transform(X_Train) X_Test = SC_X.fit_transform(X_Test) # Fitting SVC SVC_Model = SVC(kernel='rbf', random_state=0) SVC_Model.fit(X_Train, Y_Train) # Predicting the result sets Y_Pred = SVC_Model.predict(X_Test) # Confusion Matrix cm = confusion_matrix(Y_Test, Y_Pred) # Applying K-Fold Cross Validation accuracies = cross_val_score(estimator=SVC_Model, X=X_Train, y=Y_Train, cv=10) accuracies.mean() accuracies.std() # Applying grid search to find best parameters parameteres = [{ 'C': [1, 10, 100, 1000], 'kernel': ['linear'] }, { 'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.5, 0.1, 0.01] }] GridSearch = GridSearchCV(param_grid=parameteres, estimator=SVC_Model,
import numpy as np
from sklearn import datasets
from sklearn.cross_validation import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import scale

data = datasets.load_boston()
x = scale(data['data'])
y = data['target']

kf = KFold(len(x), n_folds=5, shuffle=True, random_state=42)

# Try 200 values of the Minkowski exponent p between 1 and 10 and keep the
# mean cross-validation score of each one. `res` maps the formatted p to the
# formatted mean score.
res = {}
best_p = None
best_score = None
for i in np.linspace(1.0, 10.0, num=200):
    scores = cross_val_score(KNeighborsRegressor(n_neighbors=5,
                                                 weights='distance',
                                                 p=i,
                                                 metric='minkowski'),
                             x,
                             y,
                             scoring='mean_squared_error',
                             cv=kf)
    mean_score = sum(scores) / len(scores)
    res[str(format(i, '.2f'))] = format(mean_score, '.2f')
    # The 'mean_squared_error' scorer reports negated MSE, so the best p is
    # the one with the numerically largest score. The comparison is done on
    # the float values: taking min() over the formatted strings compares
    # them character by character and only agrees with this when every score
    # has the same number of digits.
    if best_score is None or mean_score > best_score:
        best_p = i
        best_score = mean_score

# Same ('p', 'score') pair of formatted strings as the entries of `res`.
val = (format(best_p, '.2f'), format(best_score, '.2f'))
print(val)
# Random Forest Regressor print "testing Regressor" #print "size of training set: "+str(len(train_Y_R_decile)) for j in [10]: for i in [12]: # clf = AdaBoostRegressor(n_estimators=j) clf = RandomForestRegressor(n_estimators=j, max_depth=i, min_samples_split=10, random_state=0) print "\tmax_depth= " + str(i) print "\tn_estimators= " + str(j) scores = cross_val_score( clf, X, Y_R_decile, scoring=lambda clf, X, Y: mean_squared_error(Y, clf.predict(X))) print "Random Forest Regressio mean cross validation score: " + str( scores.mean()) print "\n\n" # clf=GradientBoostingRegressor(n_estimators=100,learning_rate=i) clf.fit(train_X, train_Y_R_decile) #clf.fit(train_X,train_Y_R) Y_R_decile_predicted = list(clf.predict(test_X)) MSE = mean_squared_error(Y_R_decile_predicted, test_Y_R_decile) print "\tMSE=" + str(MSE) heat_matrix = regression_acc_heat_map(Y_R_decile_predicted, test_Y_R_decile, num_buckets)
# We have d dimensions, d=64 # We have z classes, z=6, [digit0, digit1, digit2, digit7, digit8, digit9] lbl = preprocessing.LabelEncoder() y_train = lbl.fit_transform(y[np.where((y == 0) | (y == 1) | (y == 2) | (y == 7) | (y == 8) | (y == 9))]) X_train = X[np.where((y == 0) | (y == 1) | (y == 2) | (y == 7) | (y == 8) | (y == 9))] # We have Weight matrix, W, d x z model = linear_model.LogisticRegression(random_state=1) model.fit(X_train, y_train) W = model.coef_.T print cross_validation.cross_val_score(model, X_train, y_train, scoring=make_scorer(accuracy_score)) # We have a attributes, a=4 [pca_d1, pca_d2, lle_d1, lle_d2] # We have Signature matrix, S a x z pca = decomposition.PCA(n_components=2) lle = manifold.LocallyLinearEmbedding(n_components=2, random_state=1) X_pca = pca.fit_transform(X_train) X_lle = lle.fit_transform(X_train) for i, ys in enumerate(np.unique(y_train)): if i == 0: S = np.r_[np.mean(X_pca[y_train == ys], axis=0), np.mean(X_lle[y_train == ys], axis=0)] else: S = np.c_[S, np.r_[np.mean(X_pca[y_train == ys], axis=0),
y, test_size=.25, random_state=1) # Take a look at the shape print('Taking a look at training and testing data shape') print(x_train.shape, y_train.shape) print('\n') # Decision Tree Classifier print('Decision Tree Classifier') clf = DecisionTreeClassifier(random_state=1) # Run 10 fold cross validation print('Run 10 fold cross validation') cvs = cross_val_score(clf, x, y, cv=5) print(cvs) # Show cross validation score mean and std print('Show cross validation score mean and std') print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std() * 2)) # Fit the model with data clf.fit(x_train, y_train) # Accuracy acc_decision_tree = round(clf.score(x_train, y_train), 4) print("Accuracy: %0.4f" % (acc_decision_tree)) # Predict y given validation set print('Predict y given validation set')
best_poly=0 for j in range(0, len(poly)): for times in range(0,3): results=np.zeros(len(exploreC)) for i in range(0,len(exploreC)): svm_poly_model=svm.SVC(exploreC[i], kernel='poly', degree=poly[j],shrinking=True, probability=False , cache_size=2000, verbose=False, max_iter=-1, decision_function_shape='ovr' , random_state=0) scores_cv=cross_validation.cross_val_score(svm_poly_model, X, target_train, scoring='f1_weighted', cv=5, n_jobs=-1) results[i]=float(scores_cv.mean()) print "run for C="+str(exploreC[i])+", poly="+str(poly[j])+" is "+str(results[i]) print results best_val=np.max(results) print "Best result of iteration was "+str(best_val) results=results.tolist() index_val = results.index(best_val) if best_val>best_err: best_err=best_val best_c=exploreC[index_val] best_poly=poly[j] print "CV averages for values "+str(exploreC)+" are:"+str(results) print "Best C is"+str( exploreC[index_val])
# Now we switch to scikit learn
# The inverse regularizer C is set to a huge value so that the fit is
# effectively plain maximum likelihood.
#http://stackoverflow.com/questions/24924755/logit-estimator-in-statsmodels-and-sklearn
y = np.ravel(y)
model = LogisticRegression(fit_intercept=False, C=1e9).fit(X, y)
coef_patsy = np.ravel(model.coef_)
# Table of (column name, coefficient rounded to 3 decimals); built as a bare
# expression and not assigned.
pd.DataFrame([(name, round(c, 3)) for name, c in zip(X.columns, coef_patsy)])

# Evaluate the same model specification using 10-fold cross-validation
scores = cross_val_score(LogisticRegression(fit_intercept=False, C=1e9),
                         X,
                         y,
                         scoring='accuracy',
                         cv=10)
print(scores)
#[ 0.805 0.61 0.732 0.725 0.675 0.7 0.7 0.692 0.744 0.667]
print('average CV accuracy = {0:.2f}'.format(scores.mean()))  #0.70
print('baseline accuracy = {0:.2f}'.format(1 - y.mean()))  # 0.68

# Split data into train and test sets.
# We first shuffle the order of the rows (although this is done inside the
# train_test_split function, so is not strictly necessary).
np.random.seed(42)  # ensure reproducibility
#X = pd.DataFrame(np.random.randn(5,2))
#y = np.random.rand(5)
N = len(X)
#,'Parch' #,'Mother' ]] deck_train_df = all_deck_train_df.loc[all_deck_train_df['DeckId'] != 0].copy() deck_null_df = all_deck_train_df.loc[all_deck_train_df['DeckId'] == 0].copy() deck_target_df = deck_train_df['DeckId'].copy() deck_train_df.drop(['DeckId'], axis=1, inplace=True) deck_null_df.drop(['DeckId'], axis=1, inplace=True) # Linear Regression print 'Training Deck model...' deck_train_model = RandomForestClassifier(n_estimators=100) # Cross validation scores = cross_validation.cross_val_score(deck_train_model, deck_train_df, deck_target_df, cv=5) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # Predict print 'Predicting Deck...' deck_train_model = deck_train_model.fit(deck_train_df, deck_target_df) deck_train_result = deck_train_model.predict(deck_null_df) main_all_df.loc[main_all_df['DeckId'] == 0, 'DeckId'] = deck_train_result print 'Done.' plt.figure(figsize=(8, 4)) plt.title('DeckId (after)') plt.xlabel('Value') main_all_df['DeckId'].plot.hist()
# Encode the training labels for the classifier.
encoded_y_train = label_encoder.fit_transform(y_train)

# Classifier settings: the tunable ones come from the parsed command-line
# arguments, the rest are fixed here.
xgb_params = dict(
    max_depth=args.max_depth,
    learning_rate=args.learning_rate,
    n_estimators=args.n_estimators,
    objective="multi:softprob",
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=args.subsample,
    colsample_bytree=args.colsample_bytree,
    colsample_bylevel=args.colsample_bylevel,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    missing=None,
    silent=True,
    nthread=-1,
    seed=42,
)
xgb = XGBClassifier(**xgb_params)

# 10-fold cross-validation scored with ndcg_scorer; the classifier
# parameters are printed together with the mean score.
kf = KFold(len(x_train), n_folds=10, random_state=42)
score = cross_val_score(xgb,
                        x_train,
                        encoded_y_train,
                        cv=kf,
                        scoring=ndcg_scorer)
print(xgb.get_params(), score.mean())
def run_rf(data, lebals):
    """Cross-validate a 10-tree random forest and print its mean score.

    The fold count and the scoring metric are whatever ``cross_val_score``
    uses by default, since neither is passed explicitly.

    Args:
        data: Feature matrix handed to ``cross_val_score`` as ``X``.
        lebals: Targets handed to ``cross_val_score`` as ``y``
            (the name is presumably a misspelling of "labels").

    Returns:
        None. The mean of the per-fold scores is printed, not returned.
    """
    forest = RandomForestClassifier(n_estimators=10)
    mean_score = cross_val_score(forest, data, lebals).mean()
    print(mean_score)
# NOTE(review): `score` is not bound anywhere in the visible code before this
# assignment - presumably it sits inside a loop over the scoring names whose
# header precedes this excerpt. Confirm before relying on the indentation.
cross_val_data[score] = {'mean': [] , 'std': []}

# One result dict per forest size for each kind of curve (ROC, PR, DET).
roc_data, pr_data, det_data = dict(), dict(), dict()
for n in CONFIG['n_estimators']:
    roc_data[n], pr_data[n], det_data[n] = dict(), dict(), dict()

for n in CONFIG['n_estimators']:  # loop over the number of trees
    # Cross-validation under each of the configured metrics.
    model_rfc = RandomForestClassifier(n_estimators = n)  #,max_depth = n)
    for score in CONFIG['scorings']:  # loop over the metrics
        values = cross_validation.cross_val_score(
            model_rfc, X, y, cv=kf, scoring=score)
        mean, std = values.mean(), values.std()
        cross_val_data[score]['mean'].append(mean)
        cross_val_data[score]['std'].append(std)
        print('N estimators = %d, scoring = \'%s\', mean value = %f, std value = %f' % (n,score,mean,std))

    # Train the classifier itself and get class probabilities on the test set.
    proba = model_rfc.fit(X_train, y_train).predict_proba(X_test)

    # ROC curve, built from the second probability column.
    fpr, tpr, thresholds = roc_curve(y_test, proba[:, 1])
    print("Y: ", y_test,"PROBA: ", proba)
    roc_auc = auc(fpr, tpr)
    roc_data[n].update({'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc})

    # PR curve
# Build one candidate model per hyperparameter value in C. What C holds
# depends on the model family: the LogisticRegression C values, the
# SGDClassifier alpha values, or the random-forest tree counts.
# NOTE(review): the next two statements presumably belong to an
# `if modelname == ...:` branch whose header precedes this excerpt - confirm.
C = np.linspace(300, 5000, num = 10)[::-1]
models = [lm.LogisticRegression(penalty = "l1", C = c) for c in C]
if modelname == "sgd":
    C = np.linspace(0.00005, .01, num = 5)
    models = [lm.SGDClassifier(loss = "log", penalty = "l2", alpha = c,
                               warm_start = False) for c in C]
if modelname == "randomforest":
    C = np.linspace(50, 300, num = 10)
    models = [RandomForestClassifier(n_estimators = int(c)) for c in C]

# Every print below uses the single-argument parenthesised form, which emits
# the same text under Python 2 and Python 3 (the bare print statement is a
# syntax error on Python 3).
print("calculating cv scores")
cv_scores = [0] * len(models)
for i, model in enumerate(models):
    # for all of the models, save the mean 5-fold cross-validation score
    # into the list cv_scores
    cv_scores[i] = np.mean(cross_validation.cross_val_score(
        model, X, y, cv=5, scoring = auc_scorer))
    #cv_scores[i] = np.mean(cross_validation.cross_val_score(model, X, y, cv=5, score_func = auc))
    print(" (%d/%d) C = %f: CV = %f" % (i + 1, len(C), C[i], cv_scores[i]))

# find which model and C is the best (first index of the highest mean score)
best = cv_scores.index(max(cv_scores))
best_model = models[best]
best_cv = cv_scores[best]
best_C = C[best]
print("BEST %f: %f" % (best_C, best_cv))

print("training on full data")
# fit the best model on the full data
best_model.fit(X, y)
print("prediction")
X_sample=samples_features_and_labels[sample]["X"] print "\t" +sample+": "+str(len(X_sample))+" points" tr_size= int(float(len(X_sample))*0.8) te_size= len(X_sample)-tr_size Y_C_sample_tr=Y_C_sample[0:tr_size] Y_C_sample_te=Y_C_sample[tr_size:tr_size+te_size] X_sample_tr=X_sample[0:tr_size] X_sample_te=X_sample[tr_size:tr_size+te_size] clf.fit(X_sample_tr,Y_C_sample_tr) pred=clf.predict(X_sample_te) accuracy=(100*(float(sum(Y_C_sample_te==pred)))/len(Y_C_sample_te)) mean_acc.append(accuracy*(float(len(X_sample)/float(num_points)))) print "\t"+sample+" test accuracy: "+str(accuracy) print "average sample accuracy:"+str(sum(mean_acc)) ''' scores = cross_val_score(clf, X, Y_C) print "RandomForest mean cross validation score: " + str(scores.mean()) #if only two dimensions are being used, we can plot two dimentionally if False: # Plot decision bondary based on first two dimensions. def extrema(lol, index, reverse): # list of lists return sorted(lol, key=operator.itemgetter(index), reverse=reverse)[0][index] sorted_feature_importances = sorted(feature_importances, reverse=True) x_idx = np.where( feature_importances == sorted_feature_importances[0])[0][0] y_idx = np.where( feature_importances == sorted_feature_importances[1])[0][0]