def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
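A note on API drift: these generators come from the old sklearn.cross_validation module, deprecated in 0.18 and removed in 0.20. A minimal sketch of the same loop against the modern sklearn.model_selection API (an assumption of scikit-learn >= 0.20; the label-based splitters became group-based ones):

import numpy as np
from sklearn.model_selection import (KFold, LeaveOneGroupOut, LeaveOneOut,
                                     LeavePGroupsOut, LeavePOut,
                                     PredefinedSplit, ShuffleSplit,
                                     StratifiedKFold)

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 1, 2, 2])
groups = np.array([1, 2, 3, 4])  # formerly called "labels"

# Splitters are now built without the data; split() yields integer index arrays.
cvs = [LeaveOneOut(), LeavePOut(2), KFold(2), StratifiedKFold(2),
       LeaveOneGroupOut(), LeavePGroupsOut(2), ShuffleSplit(2),
       PredefinedSplit([1, 1, 2, 2])]
for cv in cvs:
    for train, test in cv.split(X, y, groups):
        assert np.asarray(train).dtype.kind != 'b'
        X[train], X[test]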
Example #2
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
Example #3
def _cross_val_score_loo_r0(lm, X, y):
    """
	mean_square_error metric is used from sklearn.metric.

	Return 
	--------
	The mean squared error values are returned. 
	"""

    if len(y.shape) == 1:
        y = np.array([y]).T

    kf = cross_validation.LeaveOneOut(y.shape[0])
    score_l = list()
    for tr, te in kf:
        lm.fit(X[tr, :], y[tr, :])
        yp = lm.predict(X[te, :])
        score_l.append(metrics.mean_squared_error(y[te, :], yp))

    return score_l
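A hypothetical usage sketch for the helper above (toy data invented here; assumes a scikit-learn old enough to still ship sklearn.cross_validation, imported as in the function's module):

import numpy as np
from sklearn import cross_validation, metrics
from sklearn.linear_model import LinearRegression

X_toy = np.arange(10.0).reshape(-1, 1)
y_toy = 3.0 * X_toy.ravel() + 2.0   # noiseless line
scores = _cross_val_score_loo_r0(LinearRegression(), X_toy, y_toy)
print(np.mean(scores))              # ~0 for this noiseless toy problem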
Example #4
 def LOOCV(self, labels, values, details=False):
     '''
     Performs leave-one-out cross-validation.
     Uses the input labels and values and cross-validates over every sample.
     RETURNS: array of length num_samples of cross-validation scores;
     if details is True, also returns detailedResults, an array of length
     num_samples containing the predicted classes.
     '''
     num_samples = values.shape[0]
     scores = np.zeros(num_samples)
     detailedResults = np.zeros(num_samples)
     # get training and testing set, train on training set, score on test set
     for train, test in cross_validation.LeaveOneOut(num_samples):
         values_test = values[test]
         label_test = labels[test]
         self.Train(labels[train], values[train], fout=None)
         scores[test] = self.classifier.score(values_test, label_test)
         if details:
             detailedResults[test] = self.Predict(values_test)
     if details:
         return scores, detailedResults
     return scores
Example #5
    def do_LOOCV(all_x, all_y):
        loo = cross_validation.LeaveOneOut(len(all_x))
        tot_cnt = np.zeros(num_cls)
        hit_cnt = np.zeros(num_cls)

        cancer_prob = []
        labels = []
        cnt = 0
        for train_index, test_index in tqdm(loo):
            train_val_x, test_x = all_x[train_index], all_x[test_index]
            train_val_y, test_y = all_y[train_index], all_y[test_index]

            train_val_x, train_val_y = shuffle(train_val_x,
                                               train_val_y,
                                               random_state=RANDOM_STATE)
            n_trn = len(train_val_x)
            n_dev = int(n_trn * VAL_RATIO)
            n_trn = n_trn - n_dev
            train_x = train_val_x[0:n_trn]
            train_y = train_val_y[0:n_trn]
            val_x = train_val_x[n_trn:]
            val_y = train_val_y[n_trn:]

            prob = classifier_LOOCV(train_x,
                                    train_y,
                                    val_x,
                                    val_y,
                                    test_x,
                                    test_y,
                                    method_clf=method_clf,
                                    verbose=verbose,
                                    num_cls=num_cls)

            for i in range(0, num_cls):
                if test_y[0] == i:
                    tot_cnt[i] += 1
                    if np.argmax(prob) == i:
                        hit_cnt[i] += 1

        return (tot_cnt, hit_cnt)
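do_LOOCV returns per-class counts rather than a score; a short follow-up sketch that turns them into accuracies (assumes the same all_x/all_y in scope; the guard avoids division by zero for classes that never occur):

tot_cnt, hit_cnt = do_LOOCV(all_x, all_y)
per_class_acc = hit_cnt / np.maximum(tot_cnt, 1)   # hit rate per class
overall_acc = hit_cnt.sum() / tot_cnt.sum()        # plain LOO accuracy
print(per_class_acc, overall_acc)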
Example #6
def LR_training_python(lrf, Y, verboseoutput):
    Y = Y.reshape((len(Y), ))
    loo = cross_validation.LeaveOneOut(len(Y))
    mae2 = 0
    errors2 = []
    for train_idx, test_idx in loo:
        f_train, f_test = lrf[train_idx], lrf[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        if not np.any(np.isnan(f_train)) and np.all(np.isfinite(f_train)):
            r2 = linearRegression(f_train, Y_train)
            y2 = r2.predict(f_test)
            errors2.append(np.abs(Y_test - y2))
            if verboseoutput:
                print(Y_test[0], y2[0])
        else:
            print('nan or infinite')

    mae2 = np.mean(errors2)
    var2 = np.sqrt(np.var(errors2))
    mre2 = mae2 / Y.mean()
    return mae2, var2, mre2
Example #7
def validation(X,Y):
    repeats = 20
    metric_list = [] 
    parameters = {'estimator__n_estimators':np.arange(5,40)}
        
    for i in np.arange(repeats):
        # split train test
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.2,random_state=i)   
        loocv = cross_validation.LeaveOneOut(n=len(X_train))
        kfold = cross_validation.KFold(n=len(X_train), n_folds=6)  # defined but unused; the search below uses LOO
        cv = GridSearchCV(clf, param_grid=parameters, n_jobs=-1, cv=loocv)
        cv.fit(X_train, y_train)
        tuned_model = cv.best_estimator_
        model = tuned_model.fit(X_train,y_train)
        y_score = model.predict_proba(X_test)
        roc_auc = get_roc(y_test, y_score, y_train)
        roc_df = pd.DataFrame(roc_auc, index=['auc'])
        roc_df = pd.melt(roc_df, value_name='AUC',var_name='classes')
        roc_df['repeat'] = np.repeat(i,len(roc_df))
        metric_list.append(roc_df)
        print('Trained %i times' % i)
    return pd.concat(metric_list,axis=0)
Example #8
 def GetOptimalBandwidth(self,datalabel,bandlims,numbands):
     '''Optimize the bandwidth using leave-one-out cross-validation.  
     Essentially, for a single bandwidth, a PDF is made with all 
     points except one, and the unused point is tested against the model. 
     This is done many times, and an average error is computed.  This is
     done for each bandwidth, and the bandwidth with the lowest average
     error is returned.
     Example follows that at jakevdp.github.io/PythonDataScienceHandbook.
     Args
         datalabel: string
             string describing which datalabel in the dataframe to find
             the bandwidth for
         bandlims: array (length 2)
             limits to search for the optimal bandwidth in
         numbands: int
             number of bandwidths to test between the limits
     '''
     if bandlims[1] < 0 or bandlims[0] < 0:
         print("Bandwidth must be greater than zero")
         return
     bandwidths = np.linspace(bandlims[0],bandlims[1],numbands)
     data = self.df[datalabel]
     if isinstance(self.df[datalabel][0],np.ndarray):
         print("WERE IN HERE")
         data_arr = []
         for i in range(len(self.df[datalabel])):
             data_arr = data_arr + list(self.df[datalabel][i])
         data=np.array(data_arr)
     if len(data)>500:
         print("This may take some time depending on your data length.")
         print("numbands > 10 with len(data)>500 starts to take a bit")
     grid = sgs.GridSearchCV(skn.KernelDensity(kernel='gaussian'),
                         {'bandwidth': bandwidths},
                         cv=cv.LeaveOneOut(len(data)),
                         verbose=1)
     grid.fit(data[:,None])
     thebandwidth = grid.best_params_['bandwidth']
     return thebandwidth
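For reference, the same leave-one-out bandwidth search as a self-contained sketch against the modern API (assumes scikit-learn >= 0.20; GridSearchCV scores each candidate bandwidth by the log-likelihood of the held-out point, as in the jakevdp example the docstring cites):

import numpy as np
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
x = rng.normal(size=100)            # toy 1-D sample
bandwidths = np.linspace(0.1, 1.0, 30)
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': bandwidths},
                    cv=LeaveOneOut())
grid.fit(x[:, None])                # KernelDensity expects a 2-D array
print(grid.best_params_['bandwidth'])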
Example #9
    def DoLOOCV(all_x, all_y):
        loo = cross_validation.LeaveOneOut(len(all_x))
        acc = []
        for train_index, test_index in loo:
            train_val_x, test_x = all_x[train_index], all_x[test_index]
            train_val_y, test_y = all_y[train_index], all_y[test_index]

            train_val_x, train_val_y = shuffle(train_val_x,
                                               train_val_y,
                                               random_state=RANDOM_STATE)
            n_trn = len(train_val_x)
            n_dev = int(n_trn * VAL_RATIO)
            n_trn = n_trn - n_dev
            train_x = train_val_x[0:n_trn]
            train_y = train_val_y[0:n_trn]
            val_x = train_val_x[n_trn:]
            val_y = train_val_y[n_trn:]

            is_correct = ClassifierLoocv(train_x, train_y, val_x, val_y,
                                         test_x, test_y)
            acc.append(is_correct)

        return acc
Example #10
def test_cross_val_predict():
    boston = load_boston()
    X, y = boston.data, boston.target
    cv = cval.KFold(len(boston.target))

    est = Ridge()

    # Naive loop (should be same as cross_val_predict):
    preds2 = np.zeros_like(y)
    for train, test in cv:
        est.fit(X[train], y[train])
        preds2[test] = est.predict(X[test])

    preds = cval.cross_val_predict(est, X, y, cv=cv)
    assert_array_almost_equal(preds, preds2)

    preds = cval.cross_val_predict(est, X, y)
    assert_equal(len(preds), len(y))

    cv = cval.LeaveOneOut(len(y))
    preds = cval.cross_val_predict(est, X, y, cv=cv)
    assert_equal(len(preds), len(y))

    Xsp = X.copy()
    Xsp *= (Xsp > np.median(Xsp))
    Xsp = coo_matrix(Xsp)
    preds = cval.cross_val_predict(est, Xsp, y)
    assert_equal(len(preds), len(y))

    preds = cval.cross_val_predict(KMeans(), X)
    assert_equal(len(preds), len(y))

    def bad_cv():
        for i in range(4):
            yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8])

    assert_raises(ValueError, cval.cross_val_predict, est, X, y, cv=bad_cv())
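The same round trip under the modern API, as a sketch with synthetic data (load_boston was removed from recent scikit-learn, so make_regression stands in for it here):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import LeaveOneOut, cross_val_predict

X, y = make_regression(n_samples=50, n_features=4, random_state=0)
preds = cross_val_predict(Ridge(), X, y, cv=LeaveOneOut())
assert len(preds) == len(y)   # one out-of-sample prediction per sample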
Example #11
def runOneOut():
    num_folds = 2
    num_instances = len(X)
    num_trees = 100
    loocv = cross_validation.LeaveOneOut(n=num_instances)
    model = LogisticRegression()
    #results = cross_validation.cross_val_score(model, X, Y.ravel(), cv=loocv,n_jobs=-1)
    #print("LogisticRegression Accuracy: %.3f%% (%.3f%%)") % (results.mean() * 100.0, results.std() * 100.0)

    model = DecisionTreeClassifier(max_depth=5)
    #results = cross_validation.cross_val_score(model, X, Y.ravel(), cv=loocv,n_jobs=-1)
    #print("Decision Tree Accuracy: %.3f%% (%.3f%%)") % (results.mean() * 100.0, results.std() * 100.0)

    #model = GradientBoostingClassifier(learning_rate=0.005,n_estimators=num_trees, random_state=seed,max_depth=5, min_samples_split=1600,min_samples_leaf=50, subsample=0.8)
    #results = cross_validation.cross_val_score(model, X, Y.ravel(), cv=loocv,n_jobs=-1)

    model = XGBClassifier()
    results = cross_validation.cross_val_score(model,
                                               X,
                                               Y.ravel(),
                                               cv=loocv,
                                               n_jobs=-1)
    print("Xboost Accuracy: %.3f%% (%.3f%%)") % (results.mean() * 100.0,
                                                 results.std() * 100.0)
Example #12
def use_grid_search_svm(traindata_features, traindata_labels, nfolds):
    X = traindata_features[:, 0:2]
    numobs = X.shape[0]

    hyperparam_grid = [{
        'C': np.logspace(-12, 12, 100, base=2),
        'gamma': np.logspace(-12, 12, 100, base=2)
    }]
    y = np.array(traindata_labels)

    #1. first shuffle the data
    inds = list(range(numobs))
    shuffle(inds)
    X = X[inds, :]
    y = y[inds]

    cv = cross_validation.LeaveOneOut(numobs)
    grid_searcher = grid_search.GridSearchCV(svm.SVC(kernel='rbf'),
                                             hyperparam_grid,
                                             cv=cv)
    grid_searcher.fit(X, y)

    print(grid_searcher.best_score_)
    return grid_searcher.best_estimator_
Example #13
def use_grid_search_LogisticRegression(traindata_features, traindata_labels,
                                       nfolds):
    print("in")
    X = traindata_features
    numobs = X.shape[0]

    hyperparam_grid = [{'C': np.logspace(-12, 12, 10, base=2)}]
    y = np.array(traindata_labels)
    print("in 1")
    # 1. first shuffle the data
    inds = list(range(numobs))
    shuffle(inds)
    X = X[inds, :]
    y = y[inds]

    print("in 2")
    cv = cross_validation.LeaveOneOut(numobs)
    grid_searcher = grid_search.GridSearchCV(linear_model.LogisticRegression(),
                                             hyperparam_grid,
                                             cv=cv)
    grid_searcher.fit(X, y)
    print("in 3")
    print(grid_searcher.best_score_)
    return grid_searcher.best_estimator_
Example #14
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets, cross_validation

data = datasets.load_svmlight_file("a9a")
X = data[0]
print(X.shape)
y = data[1]

loo = cross_validation.LeaveOneOut(X.shape[0])
errors = []
# Train and find best k
# k = [1, 2, 5, 10, 20]
# for idx, k in enumerate(k):
#     score = 0
#     i = 0
#     for train_index, test_index in loo:
#         i += 1
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]
#         clf = KNeighborsClassifier(k)
#         clf.fit(X_train, y_train)
#         score += clf.score(X_test, y_test)
#         if i % 1000 == 0:
#             print "Progress - i = " + str(i) + " k = " + str(k)
#     sc = score/float(X.shape[0])
#     print "Finale score for k = " + str(k) + " " + str(sc)
#     errors.append(1-sc)

clf = KNeighborsClassifier(20)
clf.fit(X, y)
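For reference, the commented-out manual loop is equivalent to a cross_val_score call; a sketch reusing the X, y, and loo defined above (leave-one-out over the tens of thousands of a9a rows is very slow either way):

for k in [1, 2, 5, 10, 20]:
    scores = cross_validation.cross_val_score(KNeighborsClassifier(k), X, y, cv=loo)
    print("Final score for k = " + str(k) + " " + str(scores.mean()))
    errors.append(1 - scores.mean())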
Example #15
for i in inds:
    t = read('store/tmp/%d_err.txt'%i)
    if t:
        m,s = t.split('elapsed')[0].split(' ')[-1].split(':')
        y.append(float(m)*60+float(s))
        f = [float(v) for v in t.split('Features: ')[-1].split('\n')[0].split(' ')]
        p = []
        print(f, y[-1])
        # build all pairwise products plus the raw features
        # (loop variables renamed so the outer index i is not shadowed)
        for j in range(len(f)):
            for k in range(j):
                p.append(f[j]*f[k])
            p.append(f[j])
        # ...which are then discarded in favor of a hand-picked subset
        p = [f[0], f[3], f[0]*f[3]]
        x.append(p)

"""loo = cross_validation.LeaveOneOut(len(y))
regr = linear_model.LinearRegression()
scores = cross_validation.cross_val_score(regr, x, y, scoring='neg_mean_squared_error', cv=loo,)
print(10**((-scores.mean())**.5))"""
model = linear_model.LinearRegression()
model.fit(x, y)
r_sq = model.score(x, y)

loo = cross_validation.LeaveOneOut(len(y))
scores = cross_validation.cross_val_score(model, x, y, scoring='neg_mean_squared_error', cv=loo,)
print(((-scores.mean())**.5))

print('coefficient of determination:', r_sq)
print('intercept:', model.intercept_)
print('slope:', model.coef_)
B_enabled = True
# IRIS (arff) - load datasets
data, meta = arff.loadarff(open("datasets/iris.arff", "r"))
y_train = np.array(data['class'])
X_train = np.array([list(x) for x in data[meta._attrnames[0:-1]]])
X_train = X_train.toarray() if sps.issparse(
    X_train) else X_train  # avoid sparse data
class_names = np.unique(y_train)
# IRIS (arff) - cross validation example
clf = WisardClassifier(nobits=16,
                       bleaching=B_enabled,
                       notics=256,
                       mapping='linear',
                       debug=True,
                       default_bleaching=3)
kf = cross_validation.LeaveOneOut(len(y_train))  # one fold per training sample, not per class
predicted = cross_validation.cross_val_score(clf,
                                             X_train,
                                             y_train,
                                             cv=kf,
                                             n_jobs=1)
print("Accuracy Avg: %.2f" % predicted.mean())

# IRIS (libsvm) - load datasets
X_train, y_train = load_svmlight_file(open("datasets/iris.libsvm", "r"))
class_names = np.unique(y_train)
X_train = X_train.toarray() if sps.issparse(
    X_train) else X_train  # avoid sparse data
# IRIS - cross validation example (with fixed seed)
clf = WisardClassifier(nobits=16,
                       notics=1024,
Example #17
plot(np.arange(1, kMax), scoreK)
xlabel('k')
ylabel('accuracy')
title('k-NN 5-fold cross validation accuracy vs k')
grid(True)
savefig('q1b.png')
print('q1b done')

#### q1a : leave one out accuracy

#take a random n size sample from the dataset
n = 10000
sampleIndices = np.random.randint(N, size=n)
X = X[sampleIndices, :]
y = y[sampleIndices]
loo = cross_validation.LeaveOneOut(n)

scoreK = []
for k in range(1, kMax):
    neigh = KNeighborsClassifier(n_neighbors=k)
    scores = cross_validation.cross_val_score(neigh, X, y, cv=loo)
    scoreK.append(np.mean(scores))

print(scoreK)
plot(np.arange(1, kMax), scoreK)
xlabel('k')
ylabel('accuracy')
title('k-NN leave one out accuracy vs k')
grid(True)
savefig('q1a.png')
print('q1a done')
Example #18
    chistogram.append(chist(imc))
    labels.append(fname[:-len('xx.jpg')])

print('Finished computing features.')

haralick = np.array(haralick)
chistogram = np.array(chistogram)
labels = np.array(labels)

haralick_plus_chist = np.hstack([chistogram, haralick])

clf = Pipeline([('preproc', StandardScaler()),
                ('classifier', LogisticRegression())])

from sklearn import cross_validation
cv = cross_validation.LeaveOneOut(len(images))
scores = cross_validation.cross_val_score(clf, haralick, labels, cv=cv)
print(
    'Accuracy (Leave-one-out) with Logistic Regression [haralick features]: {:.1%}'
    .format(scores.mean()))

scores = cross_validation.cross_val_score(clf, chistogram, labels, cv=cv)
print(
    'Accuracy (Leave-one-out) with Logistic Regression [color histograms]: {:.1%}'
    .format(scores.mean()))

scores = cross_validation.cross_val_score(clf,
                                          haralick_plus_chist,
                                          labels,
                                          cv=cv)
print(
Example #19
def RF_Model(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    RFModel = RandomForestRegressor()
    RFModel.fit(Scaled_Input_Data, Output_Data)
    RF_Time = time.time() - T0
    print('The computational time of Random Forest Regression for ', n, ' examples is: ', RF_Time)
    # scoring="mean_absolute_error" returns negated MAE values in this sklearn era
    MAEs_RF = cross_validation.cross_val_score(RFModel, Scaled_Input_Data, Output_Data, cv=cross_validation.LeaveOneOut(n), scoring="mean_absolute_error")
    MeanMAE_RF = np.mean(list(MAEs_RF))
    print('The average MAE of Random Forest Regression for ', n, ' examples is: ', (-1*MeanMAE_RF))
    return(MeanMAE_RF, RFModel)
Example #20
words = [k for k, v in filtered_word_dict.items()]

wiki_requests_data = np.zeros((len(binary_wiki_requests),len(words)))
wiki_requests_target = np.zeros(len(binary_wiki_requests))

request_count = 0
for request in binary_wiki_requests:
    for word in request['words']:
        if word in words:
            idx = words.index(word)
            wiki_requests_data[request_count][idx] += 1
    wiki_requests_target[request_count] = request['class']
    request_count += 1

loo = cross_validation.LeaveOneOut(len(binary_wiki_requests))
print(len(loo))
print(len(binary_wiki_requests))

accuracy = 0.0
count = 0

for train_index, test_index in loo:
    X_train, X_test = wiki_requests_data[train_index], wiki_requests_data[test_index]
    y_train, y_test = wiki_requests_target[train_index], wiki_requests_target[test_index]

    clf = svm.LinearSVC()
    clf.fit(X_train,y_train)

    prediction = clf.predict(X_test)
    
Example #21
 def setup_indices(self, train_data, test_data):
     self.indices = skl_cross_validation.LeaveOneOut(len(test_data))
Example #22
def region_CV_fits_and_errors(X,
                              Y,
                              P_X,
                              P_Y,
                              P_Y_dag,
                              err_fun,
                              Omega=None,
                              rel_type=2):
    n_inj = X.shape[1]
    outer_sets = cross_validation.LeaveOneOut(n_inj)
    err_reg = np.zeros((len(outer_sets), ))
    err_homog = np.zeros((len(outer_sets), ))
    rel_err_reg = np.zeros((len(outer_sets), ))
    rel_err_homog = np.zeros((len(outer_sets), ))
    GOF_reg = np.zeros((len(outer_sets), ))
    GOF_homog = np.zeros((len(outer_sets), ))
    for i, (train, test) in enumerate(outer_sets):
        # compare models in outer sets only, same as eventual test errors in the
        # nested cross-validation procedure
        X_train = X[:, train]
        X_test = X[:, test]
        Y_train = Y[:, train]
        Y_test = Y[:, test]
        if Omega is not None:
            Omega_train = Omega[:, train]
            Omega_test = Omega[:, test]
            # W = fit_linear_model_proj(X_train, Y_train, P_Y_dag, P_X,
            #                           Omega_train)
            W = fit_linear_model(P_X.dot(X_train), P_Y.dot(Y_train))
        else:
            W = fit_linear_model(P_X.dot(X_train), P_Y.dot(Y_train))
        # W = fit_linear_model(P_X.dot(X_train), P_Y.dot(Y_train))
        Y_pred = W.dot(P_X.dot(X_test))
        Y_pred_homog = P_Y_dag.dot(Y_pred)
        Y_test_reg = P_Y.dot(Y_test)
        resid_reg = Y_pred - Y_test_reg  # regional matrix
        resid_homog = Y_pred_homog - Y_test  # voxel-homogeneous matrix
        err_reg[i] = err_fun(resid_reg)
        if Omega is not None:
            err_homog[i] = err_fun(proj_Omega(resid_homog, Omega_test))
        else:
            err_homog[i] = err_fun(resid_homog)
        if rel_type == 1:
            rel_err_reg[i] = err_reg[i] / err_fun(Y_test_reg)
            rel_err_homog[i] = err_homog[i] / err_fun(Y_test)
            GOF_reg[i] = err_fun(W.dot(P_X.dot(X_train))-P_Y.dot(Y_train))/\
              err_fun(P_Y.dot(Y_train))
            GOF_homog[i] = err_fun(P_Y_dag.dot(W.dot(P_X.dot(X_train)))-Y_train)/\
              err_fun(Y_train)
        elif rel_type == 2:
            rel_err_reg[i] = \
              2*err_reg[i] / (err_fun(Y_test_reg) + err_fun(Y_pred))
            GOF_reg[i] = 2*err_fun(W.dot(P_X.dot(X_train))-P_Y.dot(Y_train))/\
              (err_fun(P_Y.dot(Y_train))+err_fun(W.dot(P_X.dot(X_train))))
            if (Omega is not None) and proj_errors:  # proj_errors: flag assumed defined at module scope
                rel_err_homog[i] = 2*err_homog[i] / \
                  (err_fun(proj_Omega(Y_test, Omega_test))+
                   err_fun(proj_Omega(Y_pred_homog, Omega_test)))
                GOF_homog[i] = \
                  2*err_fun(P_Y_dag.dot(W.dot(P_X.dot(X_train)))-Y_train) / \
                  (err_fun(proj_Omega(Y_train, Omega_train)) +
                   err_fun(proj_Omega(P_Y_dag.dot(W.dot(P_X.dot(X_train))),
                                      Omega_train)))
            else:
                rel_err_homog[i] = 2*err_homog[i]/\
                  (err_fun(Y_test) + err_fun(Y_pred_homog))
                GOF_homog[i] = \
                  2*err_fun(P_Y_dag.dot(W.dot(P_X.dot(X_train)))-Y_train)/\
                  (err_fun(Y_train) +
                   err_fun(P_Y_dag.dot(W.dot(P_X.dot(X_train)))))
        #if i == 2:
        #    import pdb
        #    pdb.set_trace()
    return (err_reg, err_homog, rel_err_reg, rel_err_homog, GOF_reg, GOF_homog)
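Note that LeaveOneOut here splits injections, i.e. columns of X and Y, not rows. The modern splitter takes data rather than a count, so the equivalent column-wise split would read (a sketch; old CV objects support len(), modern ones use get_n_splits()):

import numpy as np
from sklearn.model_selection import LeaveOneOut

n_inj = X.shape[1]
for train, test in LeaveOneOut().split(np.arange(n_inj)):
    X_train, X_test = X[:, train], X[:, test]
    Y_train, Y_test = Y[:, train], Y[:, test]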
Example #23
def compute_acc_conf(x,
                     y,
                     confounds,
                     verbose=False,
                     balanced=True,
                     loo=False,
                     optimize=True,
                     C=.01):
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)

    # remove intra matrix mean and var
    #x = ts.normalize_data(x)
    #cv = cross_validation.KFold(len(y),n_folds=10)
    if loo:
        cv = cross_validation.LeaveOneOut(len(y))
    else:
        cv = StratifiedKFold(y=encoder.transform(y), n_folds=10)

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    total_test_score = []
    y_pred = []
    #clf_array = []
    bc_all = []

    prec = []
    recall = []

    if len(np.unique(y)) == 1:
        print('Unique class: 100%', np.sum(encoder.transform(y) == 0) / len(y))
        return (1., 0., len(y))

    for i, (train, test) in enumerate(cv):

        select_x = x.copy()

        #betacluster = bc.BetaCluster(crm.transform(confounds[train,:],select_x[train,:]),encoder.transform(y[train]),100,k_feature=200)
        #bc_all.append(betacluster)

        if balanced:
            clf = SVC(kernel='linear', class_weight='auto', C=C)
        else:
            clf = SVC(kernel='linear', C=C)

        if len(confounds) == 0:
            xtrain = select_x[train, :]
            xtest = select_x[test, :]
        else:
            crm = ConfoundsRm(confounds[train, :], select_x[train, :])
            xtrain = crm.transform(confounds[train, :], select_x[train, :])
            xtest = crm.transform(confounds[test, :], select_x[test, :])

        ytrain = encoder.transform(y[train])
        ytest = encoder.transform(y[test])

        #clf.probability = True
        if optimize:
            clf, score = plib.grid_search(clf,
                                          xtrain,
                                          ytrain,
                                          n_folds=10,
                                          verbose=verbose)

        clf.fit(xtrain, ytrain)
        total_test_score.append(clf.score(xtest, ytest))
        #clf_array.append(clf)

        prec.append(metrics.precision_score(ytest, clf.predict(xtest)))
        recall.append(metrics.recall_score(ytest, clf.predict(xtest)))

        if loo:
            y_pred.append(clf.predict(xtest))
        if verbose:
            print('nSupport: ', clf.n_support_)
            print "Train:", clf.score(xtrain, ytrain)
            print "Test :", clf.score(xtest, ytest)
            print "Prediction :", clf.predict(xtest)
            print "Real Labels:", ytest
            print('Precision:', prec[-1], 'Recall:', recall[-1])

    if loo:
        total_std_test_score = estimate_std(
            metrics.accuracy_score(encoder.transform(y), np.array(y_pred)),
            len(y))
        print('Mean:', np.mean(total_test_score), 'Std:', total_std_test_score,
              'AvgPrecision:', np.mean(prec), 'AvgRecall:', np.mean(recall))
        return (np.mean(total_test_score), total_std_test_score, len(y))
    else:
        print('Mean:', np.mean(total_test_score), 'Std:',
              np.std(total_test_score), 'AvgPrecision:', np.mean(prec),
              'AvgRecall:', np.mean(recall))
        return (np.mean(total_test_score), np.std(total_test_score))
Example #24
def First_Model_SVR(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    Grid_Dict = {"C": [1e-2, 1e-1,1e0, 1e1, 1e2],"gamma": np.logspace(-4, 2, 6)}
    svr_Tuned = GridSearchCV(SVR(kernel='rbf', gamma=0.1, tol = 0.005), cv=5,param_grid=Grid_Dict, scoring="mean_absolute_error")
    svr_Tuned.fit(Scaled_Input_Data, Output_Data)
    SVR_MSE = SVR(kernel='rbf', C=svr_Tuned.best_params_['C'], gamma=svr_Tuned.best_params_['gamma'], tol = 0.01)
    SVR_Time = time.time() - T0
    print('The computational time of Radial based Support Vector Regression for ', n, ' examples is: ', SVR_Time)
    # scoring="mean_absolute_error" returns negated MAE values in this sklearn era
    MAEs_SVR = cross_validation.cross_val_score(SVR_MSE, Scaled_Input_Data, Output_Data, cv=cross_validation.LeaveOneOut(n), scoring="mean_absolute_error")
    MeanMAE_SVR = np.mean(list(MAEs_SVR))
    print('The average MAE of Radial based Support Vector Regression for ', n, ' examples is: ', (-1*MeanMAE_SVR))
    return(MeanMAE_SVR, svr_Tuned)
Example #25
def tenFoldCV_onChicagoCrimeData(features=['corina'],
                                 CVmethod='10Fold',
                                 P=10,
                                 NUM_ITER=20,
                                 SHUFFLE=True):
    """
    Use different years' data to train the NB model.
    """
    YEARS = range(2003, 2014)

    Y = []
    C = []
    FL = []
    GL = []
    T = []
    for year in YEARS:
        W = generate_transition_SocialLag(year, lehd_type=0)
        Yhat = retrieve_crime_count(year - 1, ['total'])
        y = retrieve_crime_count(year, ['total'])
        c = generate_corina_features()
        popul = c[1][:, 0].reshape((77, 1))

        # crime count is normalized by the total population as crime rate
        # here we use the crime count per 10 thousand residents
        y = np.divide(y, popul) * 10000
        Yhat = np.divide(Yhat, popul) * 10000

        W2 = generate_geographical_SpatialLag_ca()

        f1 = np.dot(W, Yhat)
        f2 = np.dot(W2, Yhat)

        FL.append(f1)
        GL.append(f2)
        Y.append(y)
        T.append(Yhat)
        C.append(c[1])

    Y = np.concatenate(Y, axis=0)
    columnName = ['intercept']
    f = np.ones(Y.shape)
    if 'corina' in features:
        C = np.concatenate(C, axis=0)
        f = np.concatenate((f, C), axis=1)
        columnName += c[0]
    if 'sociallag' in features:
        FL = np.concatenate(FL, axis=0)
        f = np.concatenate((f, FL), axis=1)
        columnName += ['sociallag']
    if 'spatiallag' in features:
        GL = np.concatenate(GL, axis=0)
        f = np.concatenate((f, GL), axis=1)
        columnName += ['spatiallag']
    if 'temporallag' in features:
        T = np.concatenate(T, axis=0)
        f = np.concatenate((f, T), axis=1)
        columnName += ['temporallag']

    if SHUFFLE:
        f, Y = shuffle(f, Y)

    if CVmethod == '10Fold':
        splt = cross_validation.KFold(n=f.shape[0], n_folds=10, shuffle=True)
    elif CVmethod == 'leaveOneOut':
        splt = cross_validation.LeaveOneOut(n=f.shape[0])
    elif CVmethod == 'leavePOut':
        splt = cross_validation.LeavePOut(n=f.shape[0], p=P)

    mae1 = []
    mae2 = []
    mre1 = []
    mre2 = []
    sd_mae1 = []
    sd_mae2 = []
    sd_mre1 = []
    sd_mre2 = []
    med_mae1 = []
    med_mae2 = []
    med_mre1 = []
    med_mre2 = []
    cnt = 0

    if CVmethod == 'leaveOneOut':
        y_gnd = []
        y_lr = []

    for train_idx, test_idx in splt:
        cnt += 1
        if cnt > NUM_ITER:
            break
        f_train, f_test = f[train_idx, :], f[test_idx, :]
        Y_train, Y_test = Y[train_idx, :], Y[test_idx, :]

        # write file for invoking NB regression in R
        np.savetxt("Y_train.csv", Y_train, delimiter=",")
        np.savetxt("Y_test.csv", Y_test, delimiter=",")
        pd.DataFrame(f_train, columns=columnName).to_csv("f_train.csv",
                                                         sep=",",
                                                         index=False)
        pd.DataFrame(f_test, columns=columnName).to_csv("f_test.csv",
                                                        sep=",",
                                                        index=False)

        # NB regression
        nbres = subprocess.check_output(['Rscript',
                                         'nbr_eval_kfold.R']).split(" ")
        y1 = np.array([float(e) for e in nbres])
        y1 = y1.reshape((y1.shape[0], 1))
        a = np.abs(Y_test - y1)

        mae1.append(np.mean(a))
        sd_mae1.append(np.std(a))
        med_mae1 += a.tolist()
        r = a / Y_test
        mre1.append(np.mean(r))
        sd_mre1.append(np.std(r))
        med_mre1 += r.tolist()

        # Linear regression
        r2 = linearRegression(f_train, Y_train)
        y2 = r2.predict(f_test)
        y2 = y2.reshape((y2.shape[0], 1))
        ae = np.abs(Y_test - y2)
        mae2.append(np.mean(ae))
        sd_mae2.append(np.std(ae))
        med_mae2 += ae.tolist()
        re = ae / Y_test
        mre2.append(np.mean(re))
        sd_mre2.append(np.std(re))
        med_mre2 += re.tolist()

        if CVmethod == 'leaveOneOut':
            y_gnd.append(Y_test)
            y_lr.append(y2)

    if CVmethod == 'leaveOneOut':
        print(np.mean(mae1), np.median(mae1), np.mean(mre1), np.median(mre1),
              np.mean(mae2), np.median(mae2), np.mean(mre2), np.median(mre2))
        return y_gnd, y_lr
    else:
        print(np.mean(mae1), np.mean(sd_mae1), np.median(med_mae1),
              np.mean(mre1), np.mean(sd_mre1), np.median(med_mre1),
              np.mean(mae2), np.mean(sd_mae2), np.median(med_mae2),
              np.mean(mre2), np.mean(sd_mre2), np.median(med_mre2))

    return mae1, mae2
Example #26
# exercise 7.1.2

from pylab import *
from scipy.io import loadmat
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation

# requires data from exercise 4.1.1
from ex4_1_1 import *

# Maximum number of neighbors
L=40

CV = cross_validation.LeaveOneOut(N)
errors = np.zeros((N,L))
for i, (train_index, test_index) in enumerate(CV):
    print('Crossvalidation fold: {0}/{1}'.format(i+1,N))

    # extract training and test set for current CV fold
    X_train = X[train_index,:]
    y_train = y[train_index,:]
    X_test = X[test_index,:]
    y_test = y[test_index,:]

    # Fit classifier and classify the test points (consider 1 to 40 neighbors)
    for l in range(1,L+1):
        knclassifier = KNeighborsClassifier(n_neighbors=l)
        knclassifier.fit(X_train, ravel(y_train))
        y_est = knclassifier.predict(X_test)
        errors[i,l-1] = np.sum(y_est[0]!=y_test[0,0])
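One way to finish the exercise (a sketch using the errors matrix and N defined above): aggregate the per-fold 0/1 errors into an error rate for each k and pick the minimizer.

error_rate = errors.sum(axis=0) / N      # fraction of misclassified LOO folds per k
best_k = int(error_rate.argmin()) + 1    # column l-1 holds the k = l result
print('Best k: {0} (error rate {1:.3f})'.format(best_k, error_rate.min()))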
Example #27
valid_param_vals = []
print "Checking for errors for classifier %s"  % (classifier_to_print)
print "Params: %s" % (classifier_varying_params)
for param_val in classifier_varying_params[key]:
    try:
        alt_classifier = base.clone(classifier)
        alt_classifier = alt_classifier.set_params(**{key:param_val})
        alt_classifier.fit(population, training_labels)
        valid_param_vals.append(param_val)
    except ValueError as e:
        info_fh.write("The parameter %s:%s for SVM %s for antigen %s errored: '%s'\n" % (key, param_val, classifier_to_print, antigen_type, e))
if len(valid_param_vals) > 0:
    try:
        print "Running grid for classifier %s"  % (classifier_to_print)
        print "Params: %s" % (classifier_varying_params)
        cv = cross_validation.LeaveOneOut(len(training_labels))
        grid = grid_search.GridSearchCV(classifier, {key:valid_param_vals}, cv=cv, refit=False, verbose=3)
        t0 = time.time()
        grid.fit(population, training_labels)
        t1 = time.time()
        time_logging_fh.write('Fitting grid with SVM %s for antigen %s took %f seconds\n' % (classifier_to_print, antigen_type, t1-t0))
        print "The best parameters for antigen %s, SVM %s are:"  % (antigen_type, classifier_to_print)
        print "%s with a score of %f" % (grid.best_params_, grid.best_score_)
        print "Grid scores:"
        for thing in grid.grid_scores_:
            print thing
        info_fh.write("The best parameters for SVM %s for antigen %s are %s with a score of %f\n" % (classifier_to_print, antigen_type, grid.best_params_, grid.best_score_))
        info_fh.write("Grid scores:\n")
        info_fh.write("%s\n" % (grid.grid_scores_))
        plt.figure()
        plt.errorbar(
Example #28
import pandas
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
url = "https://goo.gl/vhm1eU"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
num_folds = 10
num_instances = len(X)
loocv = cross_validation.LeaveOneOut(n=num_instances)
model = LogisticRegression()
results = cross_validation.cross_val_score(model, X, Y, cv=loocv)
print("Accuracy: %.3f%% (%.3f%%)") % (results.mean()*100.0, results.std()*100.0)# -*- coding: utf-8 -*-

Example #29
def benchmark(df):

    predictors = [
        "LinearRegression", "Lasso", "AdaBoostRegressor",
        "RandomForestRegressor", "DecisionTreeRegressor"
    ]
    predictorsMapper = {
        'LinearRegression': linear_model.LinearRegression(),
        'Lasso': linear_model.Lasso(alpha=0.1, max_iter=1000),
        'AdaBoostRegressor': ensemble.AdaBoostRegressor(),
        'RandomForestRegressor': ensemble.RandomForestRegressor(),
        'DecisionTreeRegressor': tree.DecisionTreeRegressor()
    }

    #Separate data by operators
    sumData = df[(df.Operator == 1)]
    sumTarget = sumData.Time
    sumData = sumData.drop(sumData.columns[[1, 3]], axis=1)

    subData = df[(df.Operator == 2)]
    subTarget = subData.Time
    subData = subData.drop(subData.columns[[1, 3]], axis=1)

    mulData = df[(df.Operator == 3)]
    mulTarget = mulData.Time
    mulData = mulData.drop(mulData.columns[[1, 3]], axis=1)

    divData = df[(df.Operator == 4)]
    divTarget = divData.Time
    divData = divData.drop(divData.columns[[1, 3]], axis=1)

    sumLoo = cross_validation.LeaveOneOut(len(sumTarget))
    subLoo = cross_validation.LeaveOneOut(len(subTarget))
    mulLoo = cross_validation.LeaveOneOut(len(mulTarget))
    divLoo = cross_validation.LeaveOneOut(len(divTarget))

    for p in predictors:
        print("Benchmarking " + p + "...")
        scoreTotal = 0
        sumRegr = predictorsMapper.get(p, False)
        subRegr = predictorsMapper.get(p, False)
        mulRegr = predictorsMapper.get(p, False)
        divRegr = predictorsMapper.get(p, False)
        scoreSum = abs(
            cross_validation.cross_val_score(sumRegr,
                                             sumData,
                                             sumTarget,
                                             scoring='mean_squared_error',
                                             cv=sumLoo).mean())
        scoreSub = abs(
            cross_validation.cross_val_score(subRegr,
                                             subData,
                                             subTarget,
                                             scoring='mean_squared_error',
                                             cv=subLoo).mean())
        scoreMul = abs(
            cross_validation.cross_val_score(mulRegr,
                                             mulData,
                                             mulTarget,
                                             scoring='mean_squared_error',
                                             cv=mulLoo).mean())
        scoreDiv = abs(
            cross_validation.cross_val_score(divRegr,
                                             divData,
                                             divTarget,
                                             scoring='mean_squared_error',
                                             cv=divLoo).mean())
        scoreTotal = scoreSum + scoreSub + scoreMul + scoreDiv
        print("Mean Squared Error (by operator):")
        print("\tSum regressor: " + str(scoreSum))
        print("\tSubstraction regressor: " + str(scoreSub))
        print("\tMultiplication regressor: " + str(scoreMul))
        print("\tDivision regressor: " + str(scoreDiv))
        print("\tTotal: " + str(scoreTotal))
Example #30
def Second_Model_KRR(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    Grid_Dict = {"alpha": [1e0, 1e-1, 1e-2],"gamma": np.logspace(-2, 1, 3)}
    krr_Tuned = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5 ,param_grid=Grid_Dict, scoring="mean_absolute_error")
    krr_Tuned.fit(Scaled_Input_Data, Output_Data)
    KRR_MSE = KernelRidge(kernel='rbf', alpha=krr_Tuned.best_params_['alpha'], gamma=krr_Tuned.best_params_['gamma'])
    KRR_Time = time.time() - T0
    print('The computational time of Kernel Ridge Regression for ', n, ' examples is: ', KRR_Time)
    # scoring="mean_absolute_error" returns negated MAE values in this sklearn era
    MAEs_KRR = cross_validation.cross_val_score(KRR_MSE, Scaled_Input_Data, Output_Data, cv=cross_validation.LeaveOneOut(n), scoring="mean_absolute_error")
    MeanMAE_KRR = np.mean(list(MAEs_KRR))
    print('The average MAE of Kernel Ridge Regression for ', n, ' examples is: ', (-1*MeanMAE_KRR))
    return(MeanMAE_KRR, krr_Tuned)