Example #1
def test_cross_val_generator_mask_indices_same():
    # Test that the cross validation generators return the same results when
    # indices=True and when indices=False
    y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
    labels = np.array([1, 1, 2, 3, 3, 3, 4])

    loo_mask = cval.LeaveOneOut(5, indices=False)
    loo_ind = cval.LeaveOneOut(5, indices=True)
    lpo_mask = cval.LeavePOut(10, 2, indices=False)
    lpo_ind = cval.LeavePOut(10, 2, indices=True)
    kf_mask = cval.KFold(10, 5, indices=False, shuffle=True, random_state=1)
    kf_ind = cval.KFold(10, 5, indices=True, shuffle=True, random_state=1)
    skf_mask = cval.StratifiedKFold(y, 3, indices=False)
    skf_ind = cval.StratifiedKFold(y, 3, indices=True)
    lolo_mask = cval.LeaveOneLabelOut(labels, indices=False)
    lolo_ind = cval.LeaveOneLabelOut(labels, indices=True)
    lopo_mask = cval.LeavePLabelOut(labels, 2, indices=False)
    lopo_ind = cval.LeavePLabelOut(labels, 2, indices=True)

    for cv_mask, cv_ind in [(loo_mask, loo_ind), (lpo_mask, lpo_ind),
                            (kf_mask, kf_ind), (skf_mask, skf_ind),
                            (lolo_mask, lolo_ind), (lopo_mask, lopo_ind)]:
        for (train_mask, test_mask), (train_ind, test_ind) in \
                zip(cv_mask, cv_ind):
            assert_array_equal(np.where(train_mask)[0], train_ind)
            assert_array_equal(np.where(test_mask)[0], test_ind)
Example #2
def test_cross_val_generator_with_mask():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4, indices=False)
    lpo = cval.LeavePOut(4, 2, indices=False)
    kf = cval.KFold(4, 2, indices=False)
    skf = cval.StratifiedKFold(y, 2, indices=False)
    lolo = cval.LeaveOneLabelOut(labels, indices=False)
    lopo = cval.LeavePLabelOut(labels, 2, indices=False)
    ss = cval.ShuffleSplit(4, indices=False)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss]:
        for train, test in cv:
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
Example #3
def test_cross_indices_exception():
    X = coo_matrix(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4, indices=False)
    lpo = cval.LeavePOut(4, 2, indices=False)
    kf = cval.KFold(4, 2, indices=False)
    skf = cval.StratifiedKFold(y, 2, indices=False)
    lolo = cval.LeaveOneLabelOut(labels, indices=False)
    lopo = cval.LeavePLabelOut(labels, 2, indices=False)

    assert_raises(ValueError, cval.check_cv, loo, X, y)
    assert_raises(ValueError, cval.check_cv, lpo, X, y)
    assert_raises(ValueError, cval.check_cv, kf, X, y)
    assert_raises(ValueError, cval.check_cv, skf, X, y)
    assert_raises(ValueError, cval.check_cv, lolo, X, y)
    assert_raises(ValueError, cval.check_cv, lopo, X, y)
Example #4
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
Example #5
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
Example #6
def show_cross_val(method):
    if method == "lolo":
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        cv = cross_validation.LeaveOneLabelOut(labels)
    elif method == "lplo":
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        cv = cross_validation.LeavePLabelOut(labels, p=2)
    elif method == "loo":
        cv = cross_validation.LeaveOneOut(n=len(y))
    elif method == "lpo":
        cv = cross_validation.LeavePOut(n=len(y), p=3)
    for train_index, test_index in cv:
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print "X_train:", X_train
        print "y_train:", y_train
        print "X_test:", X_test
        print "y_test:", y_test
Example #7
def tenFoldCV_onChicagoCrimeData(features=['corina'],
                                 CVmethod='10Fold',
                                 P=10,
                                 NUM_ITER=20,
                                 SHUFFLE=True):
    """
    Use different years data to train the NB model
    """
    YEARS = range(2003, 2014)

    Y = []
    C = []
    FL = []
    GL = []
    T = []
    for year in YEARS:
        W = generate_transition_SocialLag(year, lehd_type=0)
        Yhat = retrieve_crime_count(year - 1, ['total'])
        y = retrieve_crime_count(year, ['total'])
        c = generate_corina_features()
        popul = c[1][:, 0].reshape((77, 1))

        # crime count is normalized by the total population as crime rate
        # here we use the crime count per 10 thousand residents
        y = np.divide(y, popul) * 10000
        Yhat = np.divide(Yhat, popul) * 10000

        W2 = generate_geographical_SpatialLag_ca()

        f1 = np.dot(W, Yhat)
        f2 = np.dot(W2, Yhat)

        FL.append(f1)
        GL.append(f2)
        Y.append(y)
        T.append(Yhat)
        C.append(c[1])

    Y = np.concatenate(Y, axis=0)
    columnName = ['intercept']
    f = np.ones(Y.shape)
    if 'corina' in features:
        C = np.concatenate(C, axis=0)
        f = np.concatenate((f, C), axis=1)
        columnName += c[0]
    if 'sociallag' in features:
        FL = np.concatenate(FL, axis=0)
        f = np.concatenate((f, FL), axis=1)
        columnName += ['sociallag']
    if 'spatiallag' in features:
        GL = np.concatenate(GL, axis=0)
        f = np.concatenate((f, GL), axis=1)
        columnName += ['spatiallag']
    if 'temporallag' in features:
        T = np.concatenate(T, axis=0)
        f = np.concatenate((f, T), axis=1)
        columnName += ['temporallag']

    if SHUFFLE:
        f, Y = shuffle(f, Y)

    if CVmethod == '10Fold':
        splt = cross_validation.KFold(n=f.shape[0], n_folds=10, shuffle=True)
    elif CVmethod == 'leaveOneOut':
        splt = cross_validation.LeaveOneOut(n=f.shape[0])
    elif CVmethod == 'leavePOut':
        splt = cross_validation.LeavePOut(n=f.shape[0], p=P)

    mae1 = []
    mae2 = []
    mre1 = []
    mre2 = []
    sd_mae1 = []
    sd_mae2 = []
    sd_mre1 = []
    sd_mre2 = []
    med_mae1 = []
    med_mae2 = []
    med_mre1 = []
    med_mre2 = []
    cnt = 0

    if CVmethod == 'leaveOneOut':
        y_gnd = []
        y_lr = []

    for train_idx, test_idx in splt:
        cnt += 1
        if cnt > NUM_ITER:
            break
        f_train, f_test = f[train_idx, :], f[test_idx, :]
        Y_train, Y_test = Y[train_idx, :], Y[test_idx, :]

        # write file for invoking NB regression in R
        np.savetxt("Y_train.csv", Y_train, delimiter=",")
        np.savetxt("Y_test.csv", Y_test, delimiter=",")
        pd.DataFrame(f_train, columns=columnName).to_csv("f_train.csv",
                                                         sep=",",
                                                         index=False)
        pd.DataFrame(f_test, columns=columnName).to_csv("f_test.csv",
                                                        sep=",",
                                                        index=False)

        # NB regression
        nbres = subprocess.check_output(
            ['Rscript', 'nbr_eval_kfold.R']).decode().split(" ")
        y1 = np.array([float(e) for e in nbres])
        y1 = y1.reshape((y1.shape[0], 1))
        a = np.abs(Y_test - y1)

        mae1.append(np.mean(a))
        sd_mae1.append(np.std(a))
        med_mae1 += a.tolist()
        r = a / Y_test
        mre1.append(np.mean(r))
        sd_mre1.append(np.std(r))
        med_mre1 += r.tolist()

        # Linear regression
        r2 = linearRegression(f_train, Y_train)
        y2 = r2.predict(f_test)
        y2 = y2.reshape((y2.shape[0], 1))
        ae = np.abs(Y_test - y2)
        mae2.append(np.mean(ae))
        sd_mae2.append(np.std(ae))
        med_mae2 += ae.tolist()
        re = ae / Y_test
        mre2.append(np.mean(re))
        sd_mre2.append(np.std(re))
        med_mre2 += re.tolist()

        if CVmethod == 'leaveOneOut':
            y_gnd.append(Y_test)
            y_lr.append(y2)

    if CVmethod == 'leaveOneOut':
        print(np.mean(mae1), np.median(mae1), np.mean(mre1), np.median(mre1),
              np.mean(mae2), np.median(mae2), np.mean(mre2), np.median(mre2))
        return y_gnd, y_lr
    else:
        print(np.mean(mae1), np.mean(sd_mae1), np.median(med_mae1),
              np.mean(mre1), np.mean(sd_mre1), np.median(med_mre1),
              np.mean(mae2), np.mean(sd_mae2), np.median(med_mae2),
              np.mean(mre2), np.mean(sd_mre2), np.median(med_mre2))

    return mae1, mae2
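
# A minimal usage sketch (hypothetical call; requires the Chicago crime data
# helpers and the nbr_eval_kfold.R script referenced above):
# mae_nb, mae_lr = tenFoldCV_onChicagoCrimeData(features=['corina', 'sociallag'],
#                                               CVmethod='10Fold')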
Example #8
# Initializing the classifiers (all are tree-based classifiers)

dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=51)
extree = ExtraTreeClassifier()
classifier_list = [dt, rf, extree]
classifier_name_list = ["Decision Tree", "Random Forests", "Extra Trees"]

data = dataFrame.values

# Initializing Cross Validation Models

kf = cross_validation.KFold(len(labels), n_folds=5)
stratifiedkf = cross_validation.StratifiedKFold(labels, n_folds=4)
labeledkf = cross_validation.LabelKFold(labels, n_folds=4)
leavePout = cross_validation.LeavePOut(len(labels), p=100)
cross_validation_model_list = [kf, stratifiedkf, labeledkf, leavePout]
cross_validation_model_names = [
    "K-Fold", "Stratified K-fold", "Labeled K-Fold", "Leave P Out"
]

# Cross Validating each given classifier

for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    scores = cross_validation.cross_val_score(classifier, data, labels, cv=10)
    print "-------- For Classifier : ", classifier_name, " ---------------"
    print "Score Array : ", scores
    print "Mean Score : ", scores.mean()
    print "Standard Deviation : ", scores.std()
    print "------------------------------------------------------"
Example #9
# boston_df.plot(kind="scatter", x="LSTAT", y="target")  #pd plot
# pyl.show()

myseed = 1234

lm = linear_model.LinearRegression()
en = linear_model.ElasticNet(random_state=myseed)
rid = linear_model.Ridge(random_state=myseed)
svr_lin = svm.SVR(kernel='linear')
svr_rbf = svm.SVR(kernel='rbf', gamma=0.7)
svr_pl3 = svm.SVR(kernel='poly', degree=3)
models = [lm, en, rid, svr_lin, svr_rbf, svr_pl3]

cv_kf5 = cross_validation.KFold(n=n_samples,
                                n_folds=5,
                                shuffle=True,
                                random_state=myseed)
cv_loo = cross_validation.LeaveOneOut(n=n_samples)
cv_lpo2 = cross_validation.LeavePOut(n=n_samples, p=2)
cvs = [cv_kf5, cv_loo, cv_lpo2]

for model in models:
    for validator in cvs:
        scores = cross_validation.cross_val_score(model,
                                                  boston_data_scaled,
                                                  boston_target,
                                                  cv=validator)
        print "Accuracy: %0.2f (+/- %0.2f), cvs: %0.2f" % (
            scores.mean(), scores.std() * 2, len(scores))
    print "--"
    # The most basic CV algorithm, and the default CV strategy.
    # Its two main parameters are the number of samples and the number of
    # folds k.
    kf = cv.KFold(iris.data.shape[0], n_folds=10, random_state=1)
    scores2 = cv.cross_val_score(clf, iris.data, iris.target, cv=kf)
    # Similar to k-fold: the data set is split into k parts, but each part
    # preserves the class proportions of the original data set.
    sf = cv.StratifiedKFold(iris.target, n_folds=10, random_state=1)
    scores3 = cv.cross_val_score(clf, iris.data, iris.target, cv=sf)
    # Leave-One-Out
    leaveOneOut = cv.LeaveOneOut(iris.data.shape[0])
    scores4 = cv.cross_val_score(clf, iris.data, iris.target, cv=leaveOneOut)

    # Leave-P-Out: each split removes p samples from the full set to serve as
    # the test set. With n samples this generates C(n, p) train/test pairs.
    # Unlike LOO and KFold, the test sets of this strategy overlap.
    lpo = cv.LeavePOut(iris.data.shape[0], 2)  # over 11,000 pairs for iris
    scores5 = cv.cross_val_score(clf, iris.data, iris.target, cv=lpo)
    
    # Leave-One-Label-Out
    """
    This strategy splits the samples according to integer labels supplied by
    a third party. Each split takes the samples belonging to one label value
    as the test set and uses the rest as the training set.
    """
    
    # Leave-P-Label-Out
    """
    Like Leave-One-Label-Out, but each split takes the data of p label values
    as the test set and uses the rest as the training set.
    """