def test_cross_val_generator_mask_indices_same():
    """Mask mode and index mode of each CV generator must agree.

    Every generator is built twice with identical arguments, once with
    ``indices=False`` and once with ``indices=True``; the positions set in
    each boolean mask must equal the index array of the matching split.
    """
    y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
    labels = np.array([1, 1, 2, 3, 3, 3, 4])

    # (generator class, positional arguments, extra keyword arguments)
    specs = [
        (cval.LeaveOneOut, (5,), {}),
        (cval.LeavePOut, (10, 2), {}),
        (cval.KFold, (10, 5), {'shuffle': True, 'random_state': 1}),
        (cval.StratifiedKFold, (y, 3), {}),
        (cval.LeaveOneLabelOut, (labels,), {}),
        (cval.LeavePLabelOut, (labels, 2), {}),
    ]

    # Build every (mask, index) pair up front, mask variant first.
    generator_pairs = [
        (factory(*args, indices=False, **extra),
         factory(*args, indices=True, **extra))
        for factory, args, extra in specs
    ]

    for masked_cv, indexed_cv in generator_pairs:
        for mask_split, index_split in zip(masked_cv, indexed_cv):
            train_mask, test_mask = mask_split
            train_ind, test_ind = index_split
            assert_array_equal(np.where(train_mask)[0], train_ind)
            assert_array_equal(np.where(test_mask)[0], test_ind)
def test_cross_val_generator_with_mask():
    """Smoke test: splits produced with ``indices=False`` can index X and y."""
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])

    mask_generators = [
        cval.LeaveOneOut(4, indices=False),
        cval.LeavePOut(4, 2, indices=False),
        cval.KFold(4, 2, indices=False),
        cval.StratifiedKFold(y, 2, indices=False),
        cval.LeaveOneLabelOut(labels, indices=False),
        cval.LeavePLabelOut(labels, 2, indices=False),
        cval.ShuffleSplit(4, indices=False),
    ]

    for splitter in mask_generators:
        for train_mask, test_mask in splitter:
            # The indexing itself is the check; the results are discarded.
            X[train_mask]
            X[test_mask]
            y[train_mask]
            y[test_mask]
def test_cross_indices_exception():
    """``check_cv`` must raise ValueError for mask-mode CV on a COO matrix."""
    X = coo_matrix(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])

    # All generators are created with indices=False (boolean-mask mode).
    mask_generators = (
        cval.LeaveOneOut(4, indices=False),
        cval.LeavePOut(4, 2, indices=False),
        cval.KFold(4, 2, indices=False),
        cval.StratifiedKFold(y, 2, indices=False),
        cval.LeaveOneLabelOut(labels, indices=False),
        cval.LeavePLabelOut(labels, 2, indices=False),
    )

    for mask_cv in mask_generators:
        assert_raises(ValueError, cval.check_cv, mask_cv, X, y)
def test_cross_val_generator_with_default_indices():
    """CV generators built with default arguments must not yield bool masks.

    For every split, both the train and the test selector are checked to
    have a non-boolean dtype, and both are then used to index ``X`` and
    ``y`` as a smoke test.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            # Previously this line re-checked `train`, so the test selector
            # was never validated.
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # Indexing must work; the results themselves are not needed.
            X[train], X[test]
            y[train], y[test]
def test_cross_val_generator_with_default_indices():
    """CV generators built with default arguments must not yield bool masks.

    For every split, both the train and the test selector are checked to
    have a non-boolean dtype, and both are then used to index ``X`` and
    ``y`` as a smoke test.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            # Previously this line re-checked `train`, so the test selector
            # was never validated.
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # Indexing must work; the results themselves are not needed.
            X[train], X[test]
            y[train], y[test]
def show_cross_val(method):
    """Print every train/test split of the selected cross-validation scheme.

    ``method`` selects the splitter:

    * ``"lolo"`` -- LeaveOneLabelOut over a fixed five-element season array
    * ``"lplo"`` -- LeavePLabelOut (p=2) over the same season array
    * ``"loo"``  -- LeaveOneOut over ``len(y)`` samples
    * ``"lpo"``  -- LeavePOut (p=3) over ``len(y)`` samples

    The data is read from the module-level ``X`` and ``y``.

    Raises:
        ValueError: if ``method`` is not one of the names above (previously
            this surfaced later as an unbound-variable error on ``cv``).
    """
    if method in ("lolo", "lplo"):
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        if method == "lolo":
            cv = cross_validation.LeaveOneLabelOut(labels)
        else:
            cv = cross_validation.LeavePLabelOut(labels, p=2)
    elif method == "loo":
        cv = cross_validation.LeaveOneOut(n=len(y))
    elif method == "lpo":
        cv = cross_validation.LeavePOut(n=len(y), p=3)
    else:
        raise ValueError("unknown cross-validation method: %r" % (method,))

    for train_index, test_index in cv:
        # Single-argument print(...) behaves the same as a statement on
        # Python 2 and as a function on Python 3; the multi-argument form
        # used before printed a tuple on Python 2.
        print("TRAIN: %s TEST: %s" % (train_index, test_index))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print("X_train: %s" % (X_train,))
        print("y_train: %s" % (y_train,))
        print("X_test: %s" % (X_test,))
        print("y_test: %s" % (y_test,))
def tenFoldCV_onChicagoCrimeData(features=['corina'], CVmethod='10Fold', P=10, NUM_ITER=20, SHUFFLE=True):
    """
    Use different years data to train the NB model

    Stacks per-year data for 2003-2013 into one design matrix, then
    cross-validates two regressors on it: a negative-binomial (NB) model
    run through an external R script and a linear regression.

    Parameters
    ----------
    features : list of str
        Feature groups appended after the intercept column. Recognised
        entries are 'corina', 'sociallag', 'spatiallag' and 'temporallag'.
    CVmethod : str
        '10Fold', 'leaveOneOut' or 'leavePOut'.
        NOTE(review): any other value leaves `splt` unassigned, so the fold
        loop fails with an unbound-name error.
    P : int
        Number of samples left out per split; used only for 'leavePOut'.
    NUM_ITER : int
        Upper bound on the number of splits that are evaluated.
    SHUFFLE : bool
        When true, `shuffle(f, Y)` is applied before the splitter is built.

    Returns
    -------
    For 'leaveOneOut': (y_gnd, y_lr), lists holding, per evaluated split,
    the test target and the linear-regression prediction.
    Otherwise: (mae1, mae2), lists of per-split mean absolute error for the
    NB model and the linear regression respectively.

    Side effects: writes Y_train.csv, Y_test.csv, f_train.csv and f_test.csv
    in the working directory on every split, runs `Rscript nbr_eval_kfold.R`,
    and prints summary statistics.
    """
    # range() excludes its end point, so this covers 2003 through 2013.
    YEARS = range(2003, 2014)

    # Per-year pieces, concatenated after the loop.
    Y = []    # target: crime rate of the current year
    C = []    # c[1] from generate_corina_features()
    FL = []   # W . Yhat   (the 'sociallag' feature)
    GL = []   # W2 . Yhat  (the 'spatiallag' feature)
    T = []    # previous-year crime rate (the 'temporallag' feature)
    for year in YEARS:
        W = generate_transition_SocialLag(year, lehd_type=0)
        Yhat = retrieve_crime_count(year - 1, ['total'])
        y = retrieve_crime_count(year, ['total'])
        c = generate_corina_features()
        # First column of the feature matrix, reshaped to a 77x1 column.
        # Presumably 77 is the number of areas -- TODO confirm.
        popul = c[1][:, 0].reshape((77, 1))

        # crime count is normalized by the total population as crime rate
        # here we use the crime count per 10 thousand residents
        y = np.divide(y, popul) * 10000
        Yhat = np.divide(Yhat, popul) * 10000

        W2 = generate_geographical_SpatialLag_ca()

        f1 = np.dot(W, Yhat)
        f2 = np.dot(W2, Yhat)

        FL.append(f1)
        GL.append(f2)
        Y.append(y)
        T.append(Yhat)
        C.append(c[1])

    Y = np.concatenate(Y, axis=0)
    columnName = ['intercept']
    # The design matrix starts as a column of ones with the shape of Y.
    f = np.ones(Y.shape)
    if 'corina' in features:
        C = np.concatenate(C, axis=0)
        f = np.concatenate((f, C), axis=1)
        # `c` is whatever the last loop iteration left behind; c[0] supplies
        # the column names for the c[1] columns.
        columnName += c[0]
    if 'sociallag' in features:
        FL = np.concatenate(FL, axis=0)
        f = np.concatenate((f, FL), axis=1)
        columnName += ['sociallag']
    if 'spatiallag' in features:
        GL = np.concatenate(GL, axis=0)
        f = np.concatenate((f, GL), axis=1)
        columnName += ['spatiallag']
    if 'temporallag' in features:
        T = np.concatenate(T, axis=0)
        f = np.concatenate((f, T), axis=1)
        columnName += ['temporallag']

    if SHUFFLE:
        f, Y = shuffle(f, Y)

    # Choose the splitter over the rows of the design matrix.
    if CVmethod == '10Fold':
        splt = cross_validation.KFold(n=f.shape[0], n_folds=10, shuffle=True)
    elif CVmethod == 'leaveOneOut':
        splt = cross_validation.LeaveOneOut(n=f.shape[0])
    elif CVmethod == 'leavePOut':
        splt = cross_validation.LeavePOut(n=f.shape[0], p=P)

    # Suffix 1 = NB regression, suffix 2 = linear regression.
    # mae*/mre*       : per-split mean absolute / relative error
    # sd_mae*/sd_mre* : per-split standard deviation of those errors
    # med_mae*/med_mre*: every individual error, pooled across splits
    mae1 = []
    mae2 = []
    mre1 = []
    mre2 = []
    sd_mae1 = []
    sd_mae2 = []
    sd_mre1 = []
    sd_mre2 = []
    med_mae1 = []
    med_mae2 = []
    med_mre1 = []
    med_mre2 = []
    cnt = 0

    if CVmethod == 'leaveOneOut':
        y_gnd = []
        y_lr = []

    for train_idx, test_idx in splt:
        cnt += 1
        # Stop once NUM_ITER splits have been evaluated.
        if cnt > NUM_ITER:
            break
        f_train, f_test = f[train_idx, :], f[test_idx, :]
        Y_train, Y_test = Y[train_idx, :], Y[test_idx, :]

        # write file for invoking NB regression in R
        np.savetxt("Y_train.csv", Y_train, delimiter=",")
        np.savetxt("Y_test.csv", Y_test, delimiter=",")
        pd.DataFrame(f_train, columns=columnName).to_csv("f_train.csv", sep=",", index=False)
        pd.DataFrame(f_test, columns=columnName).to_csv("f_test.csv", sep=",", index=False)

        # NB regression
        # The script's stdout is parsed as space-separated floats and turned
        # into a column vector of predictions.
        nbres = subprocess.check_output(['Rscript', 'nbr_eval_kfold.R']).split(" ")
        y1 = np.array([float(e) for e in nbres])
        y1 = y1.reshape((y1.shape[0], 1))
        a = np.abs(Y_test - y1)

        mae1.append(np.mean(a))
        sd_mae1.append(np.std(a))
        med_mae1 += a.tolist()
        # NOTE(review): relative error divides by Y_test; a zero target
        # would not give a finite value here -- confirm zeros cannot occur.
        r = a / Y_test
        mre1.append(np.mean(r))
        sd_mre1.append(np.std(r))
        med_mre1 += r.tolist()

        # Linear regression
        r2 = linearRegression(f_train, Y_train)
        y2 = r2.predict(f_test)
        y2 = y2.reshape((y2.shape[0], 1))
        ae = np.abs(Y_test - y2)
        mae2.append(np.mean(ae))
        sd_mae2.append(np.std(ae))
        med_mae2 += ae.tolist()
        re = ae / Y_test
        mre2.append(np.mean(re))
        sd_mre2.append(np.std(re))
        med_mre2 += re.tolist()

        if CVmethod == 'leaveOneOut':
            y_gnd.append(Y_test)
            y_lr.append(y2)

    # The trailing comma on the first print of each pair suppresses the
    # newline (Python 2 print statement), so NB and linear-regression
    # statistics come out on one line.
    if CVmethod == 'leaveOneOut':
        print np.mean(mae1), np.median(mae1), np.mean(mre1), np.median(mre1),
        print np.mean(mae2), np.median(mae2), np.mean(mre2), np.median(mre2)
        return y_gnd, y_lr
    else:
        print np.mean(mae1), np.mean(sd_mae1), np.median(med_mae1), np.mean(
            mre1), np.mean(sd_mre1), np.median(med_mre1),
        print np.mean(mae2), np.mean(sd_mae2), np.median(med_mae2), np.mean(
            mre2), np.mean(sd_mre2), np.median(med_mre2)
        return mae1, mae2
# Initializing the classifiers (all are tree-based classifiers)
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=51)
extree = ExtraTreeClassifier()
classifier_list = [dt, rf, extree]
classifier_name_list = ["Decision Tree", "Random Forests", "Extra Trees"]
data = dataFrame.values

# Initializing cross-validation models.
# NOTE(review): these four splitters are only collected in the list below;
# the scoring loop passes cv=10 and never reads them.
kf = cross_validation.KFold(len(labels), n_folds=5)
stratifiedkf = cross_validation.StratifiedKFold(labels, n_folds=4)
labeledkf = cross_validation.LabelKFold(labels, n_folds=4)
leavePout = cross_validation.LeavePOut(len(labels), p=100)
cross_validation_model_list = [kf, stratifiedkf, labeledkf, leavePout]
cross_validation_model_names = [
    "K-Fold", "Stratified K-fold", "Labeled K-Fold", "Leave P Out"
]

# Cross-validating each given classifier and reporting its scores.
# Each print takes a single pre-formatted string so the statement form
# (Python 2) and the function form (Python 3) emit the same text; the
# doubled spaces reproduce the separator the old comma-style prints added.
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    scores = cross_validation.cross_val_score(classifier, data, labels, cv=10)
    print("-------- For Classifier :  %s  ---------------" % (classifier_name,))
    print("Score Array :  %s" % (scores,))
    print("Mean Score :  %s" % (scores.mean(),))
    print("Standard Deviation :  %s" % (scores.std(),))
    print("------------------------------------------------------")
# boston_df.plot(kind="scatter", x="LSTAT", y="target") #pd plot
# pyl.show()

# One seed shared by every component that accepts a random_state.
myseed = 1234

# Candidate regressors.
lm = linear_model.LinearRegression()
en = linear_model.ElasticNet(random_state=myseed)
rid = linear_model.Ridge(random_state=myseed)
svr_lin = svm.SVR(kernel='linear')
svr_rbf = svm.SVR(kernel='rbf', gamma=0.7)
svr_pl3 = svm.SVR(kernel='poly', degree=3)

models = [lm, en, rid, svr_lin, svr_rbf, svr_pl3]

# Cross-validation schemes to compare.
cv_kf5 = cross_validation.KFold(n=n_samples, n_folds=5, shuffle=True, random_state=myseed)
cv_loo = cross_validation.LeaveOneOut(n=n_samples)
# NOTE: despite the "10" in the name, this leaves out p=2 samples per split.
cv_lpo10 = cross_validation.LeavePOut(n=n_samples, p=2)

cvs = [cv_kf5, cv_loo, cv_lpo10]

for model in models:
    for validator in cvs:
        scores = cross_validation.cross_val_score(model, boston_data_scaled, boston_target, cv=validator)
        # No `scoring` argument is passed, so the value reported is each
        # estimator's default score (see the cross_val_score docs), not a
        # classification accuracy; the label is kept so output is unchanged.
        # The parenthesised single-argument print runs on Python 2 and 3.
        print("Accuracy: %0.2f (+/- %0.2f), cvs: %0.2f" % (
            scores.mean(), scores.std() * 2, len(scores)))
    # Separator between models.
    print("--")
# K-fold: the most basic CV algorithm, and the strategy used by default.
# It has two main parameters: the number of samples and the number of
# folds to split them into.
kf = cv.KFold(n=iris.data.shape[0], n_folds=10, random_state=1)
scores2 = cv.cross_val_score(clf, iris.data, iris.target, cv=kf)

# Stratified k-fold: like k-fold, the data set is split into k parts.
# The difference is that within each part the proportion of every class
# matches the proportion of that class in the original data set.
sf = cv.StratifiedKFold(y=iris.target, n_folds=10, random_state=1)
scores3 = cv.cross_val_score(clf, iris.data, iris.target, cv=sf)

# Leave-one-out
leaveOneOut = cv.LeaveOneOut(n=iris.data.shape[0])
scores4 = cv.cross_val_score(clf, iris.data, iris.target, cv=leaveOneOut)

# Leave-P-out: each split removes p samples from the full set to serve as
# the test set. With n samples in total this yields C(n, p) train/test
# pairs. Unlike LOO and KFold, the p-sample test sets overlap.
lpo = cv.LeavePOut(n=iris.data.shape[0], p=2)  # more than eleven thousand splits
scores5 = cv.cross_val_score(clf, iris.data, iris.target, cv=lpo)

# Leave-one-label-out
# (English rendering of the note below) This strategy splits the samples
# according to integer sample labels supplied by a third party. For each
# split, the samples belonging to one label form the test set and the
# remaining samples form the training set.
"""
这种策略划分样本时,会根据第三方提供的整数型样本类标号进行划分。
每次划分数据集时,取出某个属于某个类标号的样本作为测试集,剩余的作为训练集。
"""

# Leave-P-Label-Out
# (English rendering of the note below) Similar to Leave-One-Label-Out,
# but each split takes the data of p labels as the test set and uses the
# rest as the training set.
"""
与Leave-One-Label-Out类似,但这种策略每次取p种类标号的数据作为测试集,
其余作为训练集。
"""