Beispiel #1
0
def test_dtype_match_cholesky():
    """Check dtype handling of the cholesky solver with one alpha per target.

    Kept separate from ``test_dtype_match`` for clarity: two different alphas
    are passed for the two targets so that this path of the cholesky solver is
    covered as well.
    """
    random_state = np.random.RandomState(0)
    per_target_alpha = (1.0, 0.5)
    n_samples, n_features, n_target = 6, 7, 2

    # Draw the problem in float64 and derive the float32 copy from it.
    X_64 = random_state.randn(n_samples, n_features)
    y_64 = random_state.randn(n_samples, n_target)
    X_32, y_32 = X_64.astype(np.float32), y_64.astype(np.float32)

    # 32-bit fit
    ridge_32 = Ridge(alpha=per_target_alpha, solver='cholesky')
    ridge_32.fit(X_32, y_32)

    # 64-bit fit
    ridge_64 = Ridge(alpha=per_target_alpha, solver='cholesky')
    ridge_64.fit(X_64, y_64)

    # All checks are grouped at the end, which makes a failure easier to debug.
    assert ridge_32.coef_.dtype == X_32.dtype
    assert ridge_64.coef_.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5)
def test_dtype_match(solver):
    """Check that Ridge keeps the input dtype when fitted with ``solver``.

    The same random problem is fitted once in float32 and once in float64.
    Coefficients and predictions must carry the dtype of the data they were
    computed from, and both coefficient vectors must agree to ``rtol=1e-4``.
    """
    random_state = np.random.RandomState(0)
    n_samples, n_features = 6, 5

    X_64 = random_state.randn(n_samples, n_features)
    y_64 = random_state.randn(n_samples)
    X_32, y_32 = X_64.astype(np.float32), y_64.astype(np.float32)

    # Both precisions use exactly the same hyper-parameters.
    ridge_params = dict(alpha=1.0, solver=solver, max_iter=500, tol=1e-10)

    # 32-bit fit
    ridge_32 = Ridge(**ridge_params)
    ridge_32.fit(X_32, y_32)

    # 64-bit fit
    ridge_64 = Ridge(**ridge_params)
    ridge_64.fit(X_64, y_64)

    # All checks are grouped at the end, which makes a failure easier to debug.
    assert ridge_32.coef_.dtype == X_32.dtype
    assert ridge_64.coef_.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4)
Beispiel #3
0
def test_dtype_match():
    """Check that Ridge keeps the input dtype for each solver in the list.

    One random problem is generated in float64 and cast to float32; every
    solver is then fitted on both versions and checked for dtype consistency
    and for agreement of the coefficients to 5 decimals.
    """
    random_state = np.random.RandomState(0)
    n_samples, n_features = 6, 5

    X_64 = random_state.randn(n_samples, n_features)
    y_64 = random_state.randn(n_samples)
    X_32, y_32 = X_64.astype(np.float32), y_64.astype(np.float32)

    for solver in ("svd", "sparse_cg", "cholesky", "lsqr"):
        ridge_params = dict(alpha=1.0, solver=solver)

        # 32-bit fit
        ridge_32 = Ridge(**ridge_params)
        ridge_32.fit(X_32, y_32)

        # 64-bit fit
        ridge_64 = Ridge(**ridge_params)
        ridge_64.fit(X_64, y_64)

        # All checks are grouped together, which makes a failure easier to debug.
        assert ridge_32.coef_.dtype == X_32.dtype
        assert ridge_64.coef_.dtype == X_64.dtype
        assert ridge_32.predict(X_32).dtype == X_32.dtype
        assert ridge_64.predict(X_64).dtype == X_64.dtype
        assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5)
Beispiel #4
0
def _test_multi_ridge_diabetes(filter_):
    """Check that a two-target Ridge fit matches two stacked single-target fits.

    ``filter_`` is applied to ``X_diabetes`` before every fit and predict call.
    """
    n_features = X_diabetes.shape[1]
    # simulate several responses by duplicating the target
    Y = np.vstack((y_diabetes, y_diabetes)).T

    model = Ridge(fit_intercept=False)

    # multi-target fit
    model.fit(filter_(X_diabetes), Y)
    assert_equal(model.coef_.shape, (2, n_features))
    multi_pred = model.predict(filter_(X_diabetes))

    # single-target fit with the same estimator
    model.fit(filter_(X_diabetes), y_diabetes)
    single_pred = model.predict(filter_(X_diabetes))

    stacked_single = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(stacked_single, multi_pred, decimal=3)
Beispiel #5
0
def _test_ridge_loo(filter_):
    """Compare _RidgeGCV's efficient leave-one-out with a brute-force version.

    ``filter_`` is applied to ``X_diabetes`` before each ``fit``/``predict`` of
    the GCV estimator, so the same checks can run on dense or sparse input.
    Returns a one-element list holding the alpha selected by ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]
    selected_alphas = []

    gcv = _RidgeGCV(fit_intercept=False)
    plain_ridge = Ridge(fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    K, v, Q = gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = gcv._errors(v, Q, y_diabetes, 1.0)
    values, c = gcv._values(K, v, Q, y_diabetes, 1.0)

    # brute-force leave-one-out: refit with each sample held out in turn
    loo_values = []
    for held_out in range(n_samples):
        keep = np.arange(n_samples) != held_out
        plain_ridge.fit(X_diabetes[keep], y_diabetes[keep])
        loo_values.append(plain_ridge.predict([X_diabetes[held_out]])[0])
    loo_errors = [(y_diabetes[idx] - value) ** 2
                  for idx, value in enumerate(loo_values)]

    # efficient and brute-force LOO must give the same results
    assert_almost_equal(errors, loo_errors)
    assert_almost_equal(values, loo_values)

    # best alpha found by the GCV estimator
    gcv.fit(filter_(X_diabetes), y_diabetes)
    best_alpha = gcv.best_alpha
    selected_alphas.append(best_alpha)

    # a custom loss_func must select the same alpha
    gcv_custom_loss = _RidgeGCV(fit_intercept=False,
                                loss_func=mean_squared_error)
    gcv_custom_loss.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(gcv_custom_loss.best_alpha, best_alpha)

    # unit sample weights must select the same alpha
    unit_weights = np.ones(n_samples)
    gcv.fit(filter_(X_diabetes), y_diabetes, sample_weight=unit_weights)
    assert_equal(gcv.best_alpha, best_alpha)

    # simulate several responses by duplicating the target
    Y = np.vstack((y_diabetes, y_diabetes)).T

    gcv.fit(filter_(X_diabetes), Y)
    multi_pred = gcv.predict(filter_(X_diabetes))
    gcv.fit(filter_(X_diabetes), y_diabetes)
    single_pred = gcv.predict(filter_(X_diabetes))

    stacked_single = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(stacked_single, multi_pred, decimal=5)

    return selected_alphas
def test_fit_simple_backupsklearn():
    """Fit Ridge on simple.txt with h2o4gpu and scikit-learn and compare them.

    Three estimators are fitted on the same float32 data: ``h2o4gpu.Ridge``
    with ``glm_stop_early=False``, ``h2o4gpu.Ridge`` with scikit-learn style
    arguments, and scikit-learn's own ``Ridge``.  Predictions, scores and
    fitted attributes are printed; only the coefficients and the intercept of
    the last two estimators are asserted to be close.
    """
    frame = pd.read_csv("./open_data/simple.txt", delim_whitespace=True)
    # every column but the last holds the features, the last one the target
    X = np.array(frame.iloc[:, :-1], dtype='float32', order='C')
    y = np.array(frame.iloc[:, -1], dtype='float32', order='C')
    Solver = h2o4gpu.Ridge

    native_model = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    native_model.fit(X, y)
    print("h2o4gpu predict()")
    print(native_model.predict(X))
    print("h2o4gpu score()")
    print(native_model.score(X,y))

    wrapper_model = Solver(normalize=True, random_state=1234)
    print("h2o4gpu scikit wrapper fit()")
    wrapper_model.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(wrapper_model.predict(X))
    print("h2o4gpu scikit wrapper score()")
    print(wrapper_model.score(X, y))

    from sklearn.linear_model.ridge import Ridge
    sklearn_model = Ridge(normalize=True, random_state=1234)
    print("Scikit fit()")
    sklearn_model.fit(X, y)
    print("Scikit predict()")
    print(sklearn_model.predict(X))
    print("Scikit score()")
    print(sklearn_model.score(X, y))

    # Round-trip through a float32 sparse matrix to get a dense 2-d array.
    sklearn_coef = csr_matrix(sklearn_model.coef_, dtype=np.float32).toarray()

    print(sklearn_model.coef_)

    print(sklearn_coef)

    print(wrapper_model.coef_)

    print(sklearn_model.intercept_)
    print(wrapper_model.intercept_)

    print(sklearn_model.n_iter_)
    print(wrapper_model.n_iter_)

    # The message mentions n_iters, but only coef_ and intercept_ are asserted.
    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(wrapper_model.coef_, sklearn_coef)
    assert np.allclose(wrapper_model.intercept_, sklearn_model.intercept_)
Beispiel #7
0
def test_toy_ridge_object():
    """Fit Ridge on a two-sample toy problem and check predictions and shapes.

    With a single target the coefficients are 1-d and the intercept is a
    ``np.float64``; with two targets the coefficients are 2-d and the
    intercept is an ``np.ndarray``.
    """
    # TODO: test also n_samples > n_features
    X = np.array([[1], [2]])
    y_single = np.array([1, 2])
    clf = Ridge(alpha=0.0)
    clf.fit(X, y_single)
    X_test = [[1], [2], [3], [4]]
    assert_almost_equal(clf.predict(X_test), [1., 2, 3, 4])

    # single target
    assert_equal(clf.coef_.ndim, 1)
    assert_equal(type(clf.intercept_), np.float64)

    # two identical targets, refitted on the same estimator
    y_double = np.vstack((y_single, y_single)).T
    clf.fit(X, y_double)

    assert_equal(clf.coef_.ndim, 2)
    assert_equal(type(clf.intercept_), np.ndarray)
def eval_aggr_shifts(X, y, ignore_rows):
    """Leave-one-out evaluation of a ridge model on logit-transformed targets.

    The rows listed in ``ignore_rows`` are dropped first.  Then, for every
    remaining row, a ``Ridge(alpha=.2, normalize=True)`` model is fitted on all
    other rows with the targets mapped through the logit function, and the
    held-out row is predicted and mapped back through the logistic function.

    Parameters
    ----------
    X : array-like, indexable by row; features.
    y : array-like of targets.  Values are clipped to ``[1e-6, 1 - 1e-6]``
        before the logit transform, so they are expected to lie in [0, 1].
    ignore_rows : iterable of int
        Indices of rows to drop, all referring to positions in the ORIGINAL
        ``X``/``y``.  Order does not matter.

    Returns
    -------
    pred : list of held-out predictions, mapped back to the (0, 1) scale.
    real : list of the corresponding unmodified targets.
    coef : ``coef_`` of a final model fitted on all kept rows.  Note that this
        final fit uses the untransformed ``y``, unlike the leave-one-out fits.
    """
    eps = 1e-6

    def _logit(values):
        # Clip away exact 0/1 so that the log-odds stay finite.
        clipped = np.clip(values, eps, 1 - eps)
        return np.log(clipped / (1 - clipped))

    # Drop all ignored rows in a single step.  Removing them one at a time
    # (as was done before) shifts the position of every later row after each
    # removal, so ascending index lists ended up deleting the wrong rows.
    ignore_rows = list(ignore_rows)
    if ignore_rows:
        X = np.delete(X, ignore_rows, axis=0)
        y = np.delete(y, ignore_rows, axis=0)

    pred = []
    real = []

    n = X.shape[0]
    for inst_n in range(n):
        x_i = X[inst_n]
        y_i = y[inst_n]

        # Training set: every kept row except the held-out one.
        X_train = np.delete(X, inst_n, axis=0)
        y_train = _logit(np.delete(y, inst_n, axis=0))

        model = Ridge(alpha=.2, fit_intercept=True, normalize=True)
        model.fit(X_train, y_train)

        y_hat = model.predict(x_i.reshape(1, -1))[0]

        # Error is reported on the logit scale, the scale the model was fit on.
        y_i1 = _logit(y_i)
        print('inst: ' + str(inst_n) + ', prediction: ' + str(y_hat) + ', err: ' + str(y_hat - y_i1))

        # Inverse of the logit transform; np.exp is used so the block does not
        # rely on a bare ``exp`` being imported elsewhere in the file.
        pred.append(1 / (1 + np.exp(-y_hat)))
        real.append(y_i)

    model = Ridge(alpha=.2, fit_intercept=True, normalize=True)
    model.fit(X, y)

    return pred, real, model.coef_
Beispiel #9
0
def _test_ridge_loo(filter_):
    """Compare the efficient leave-one-out paths with a brute-force version.

    Both the default and the SVD based decomposition of ``_RidgeGCV`` are
    checked against refitting ``Ridge`` with one sample held out at a time.
    ``filter_`` is applied to ``X_diabetes`` before each ``fit``/``predict`` of
    the CV estimators, so the checks can run on dense or sparse input.
    Returns a one-element list holding the alpha selected by ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]
    selected_alphas = []

    gcv = _RidgeGCV(fit_intercept=False)
    plain_ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = gcv._errors(1.0, y_diabetes, *decomp)
    values, c = gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: refit with each sample held out in turn
    loo_values = []
    for held_out in range(n_samples):
        keep = np.arange(n_samples) != held_out
        plain_ridge.fit(X_diabetes[keep], y_diabetes[keep])
        loo_values.append(plain_ridge.predict([X_diabetes[held_out]])[0])
    loo_errors = [(y_diabetes[idx] - value) ** 2
                  for idx, value in enumerate(loo_values)]

    # efficient and brute-force LOO must give the same results
    assert_almost_equal(errors, loo_errors)
    assert_almost_equal(values, loo_values)

    # generalized cross-validation (efficient leave-one-out, SVD variation)
    svd_decomp = gcv._pre_compute_svd(X_diabetes, y_diabetes)
    svd_errors, c = gcv._errors_svd(plain_ridge.alpha, y_diabetes, *svd_decomp)
    svd_values, c = gcv._values_svd(plain_ridge.alpha, y_diabetes, *svd_decomp)

    # both efficient variants must give the same results
    assert_almost_equal(errors, svd_errors)
    assert_almost_equal(values, svd_values)

    # best alpha found by the GCV estimator
    gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = gcv.alpha_
    selected_alphas.append(alpha_)

    # a scorer built from a loss function must select the same alpha
    loss_scoring = make_scorer(mean_squared_error, greater_is_better=False)
    cv_with_loss = RidgeCV(fit_intercept=False, scoring=loss_scoring)
    ignore_warnings(cv_with_loss.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_loss.alpha_, alpha_)

    # a scorer built from a custom score function must select the same alpha
    def negated_mse(x, y):
        return -mean_squared_error(x, y)

    score_scoring = make_scorer(negated_mse)
    cv_with_score = RidgeCV(fit_intercept=False, scoring=score_scoring)
    ignore_warnings(cv_with_score.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_score.alpha_, alpha_)

    # a scorer looked up by name must select the same alpha
    named_scorer = get_scorer('mean_squared_error')
    cv_with_named = RidgeCV(fit_intercept=False, scoring=named_scorer)
    cv_with_named.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_named.alpha_, alpha_)

    # unit sample weights must select the same alpha
    unit_weights = np.ones(n_samples)
    gcv.fit(filter_(X_diabetes), y_diabetes, sample_weight=unit_weights)
    assert_equal(gcv.alpha_, alpha_)

    # simulate several responses by duplicating the target
    Y = np.vstack((y_diabetes, y_diabetes)).T

    gcv.fit(filter_(X_diabetes), Y)
    multi_pred = gcv.predict(filter_(X_diabetes))
    gcv.fit(filter_(X_diabetes), y_diabetes)
    single_pred = gcv.predict(filter_(X_diabetes))

    stacked_single = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(stacked_single, multi_pred, decimal=5)

    return selected_alphas
        model2_train1 += virtual_test1

    print "now saving the result"

    ff = open('virtual_train_data.json', 'w')
    ff.write(json.dumps([model2_train0, model2_train1]))
    ff.close()

if sys.argv[1] == "second":
    ff = open('virtual_train_data.json', 'r')
    model2_train0, model2_train1 = json.loads(ff.read())
    ff.close()
    print "opened train0 and train1 with each length", len(model2_train0), len(model2_train1)
    print model2_train0[0]
    print model2_train1[0]
    ff = open('intermediate_result.json', 'r')
    model2_test0, _ = json.loads(ff.read())
    print model2_test0[0]
    model2 = Ridge()
    print "start fitting 2nd model"
    model2.fit(model2_train0, model2_train1)
    print "start predicting"
    predictions=model2.predict(model2_test0)
    print "saving the predicted result into the file"
    f = open('result.csv', 'w')
    f.write("ID;COTIS\n");
    for ind, prd in enumerate(predictions):
        f.write(my_ids[ind] + ';' + str(prd) + '\n')
    f.close()
    print "all tasks completed"
train1 = extract_target(train_dataset)
test0 = extract_predictor(test_dataset, False)

results = []
for cnt in range(1000):
    projected0 = []
    projected1 = []
    for i in xrange(len(train0)):
        if random.random() < 0.4:
            continue
        projected0.append(train0[i])
        projected1.append(train1[i])
    print "now fitting the model", cnt, "with len", len(projected0)
    model = Ridge()
    model.fit(projected0, projected1)
    predictions=model.predict(test0)
    results.append(list(predictions))

final_result = []
for ind in xrange(len(results[0])):
    cand = []
    for i in xrange(len(results)):
        cand.append(results[i][ind])
    final_result.append(sum(sorted(cand)[100:-100])*1.0/(len(cand)-200))

#predictions=model.predict(valid_dataset)

#Evaluate the quality of the prediction
#print sklearn.metrics.mean_absolute_error(predictions,valid_target)

print "saving the predicted result into the file"
def trainModel(param,feat_folder,feat_name):
    #read data from folder
    print 'now we read data from folder:%s'%(feat_folder)
   
    #start cv
    print 'now we need to generate cross_validation'
    accuracy_cv = []
  
    for i in range(0,2):
        print 'this is the run:%d cross-validation'%(i+1)
        testIndex = loadCVIndex("%s/test.run%d.txt"%("../data/feat/combine",(i+1)))
        #if we use xgboost to train model ,we need to use svmlib format
        if param['task'] in ['regression']:
            #with xgb we will dump the file with CV,and we will read data 
            train_data = xgb.DMatrix("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            valid_data = xgb.DMatrix("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            watchlist = [(train_data,'train'),(valid_data,'valid')]
            bst = xgb.train(param,train_data,int(param['num_round']),watchlist)
            pred = bst.predict(valid_data)
        
        elif param['task'] in ['clf_skl_lr']:
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            train_data  = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data,train_label)
            pred = clf.predict(test_data)
        
        elif param['task'] == "reg_skl_rf":
                    ## regression with sklearn random forest regressor
                    train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
                    test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
                    rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                               max_features=param['max_features'],
                                               n_jobs=param['n_jobs'],
                                               random_state=param['random_state'])
                    rf.fit(train_data, test_label)
                    pred = rf.predict(test_data)
        
        elif param['task'] == "reg_skl_etr":
                    ## regression with sklearn extra trees regressor
                    train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
                    test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
                    etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                              max_features=param['max_features'],
                                              n_jobs=param['n_jobs'],
                                              random_state=param['random_state'])
                    etr.fit(train_data,test_label)
                    pred = etr.predict(test_data)
                    
        elif param['task'] in ['reg_skl_gbm'] :
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data,train_label)
            pred = gbm.predict(test_data) 
        
        elif param['task'] in ['reg_skl_ridge']:
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            train_data  = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data,train_label)
            
            predraw = ridge.predict(test_data)
            print predraw
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt"%("../data/feat/combine",(i+1)))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank,cdf)
            print pred
            
        """
        elif param['task'] in ['regression']:
            
            
        
        elif param['task'] in ['reg_skl_gbm'] :
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data[feat_names],train_data['cid'])
            pred = gbm.predict(valid_data[feat_names])
        elif param['task'] in ['reg_skl_ridge']:
            feat_names.remove('cid')
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data[feat_names],train_data['cid'])
            pred = ridge.predict(valid_data[feat_names])
        """
        #now we use the the accuracy to limit our model
        acc = accuracy_model(pred,train.iloc[testIndex]['cid'])
        print "the model accurary:%s"%(acc)
        accuracy_cv.append(acc)

    #here we will count the 
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print 'the accuracy for %.6f'%(accuracy_cv_mean)
    return {'loss':-accuracy_cv_mean,'attachments':{'std':accuracy_cv_std},'status': STATUS_OK}
Beispiel #13
0
def _test_ridge_loo(filter_):
    """Compare the efficient leave-one-out paths with a brute-force version.

    An intercept is fitted only when ``filter_`` is ``DENSE_FILTER``; in that
    case the design matrix handed to the private decomposition helpers is
    centred first.  Both the default and the SVD based decomposition of
    ``_RidgeGCV`` are checked against refitting ``Ridge`` with one sample held
    out at a time.  Returns a one-element list holding the alpha selected by
    ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]
    selected_alphas = []

    fit_intercept = filter_ == DENSE_FILTER
    # centre the data when an intercept is fitted
    X_diabetes_ = X_diabetes - X_diabetes.mean(0) if fit_intercept else X_diabetes

    gcv = _RidgeGCV(fit_intercept=fit_intercept)
    plain_ridge = Ridge(alpha=1.0, fit_intercept=fit_intercept)

    # generalized cross-validation (efficient leave-one-out)
    decomp = gcv._pre_compute(X_diabetes_, y_diabetes, fit_intercept)
    errors, c = gcv._errors(1.0, y_diabetes, *decomp)
    values, c = gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: refit with each sample held out in turn
    loo_values = []
    for held_out in range(n_samples):
        keep = np.arange(n_samples) != held_out
        plain_ridge.fit(X_diabetes_[keep], y_diabetes[keep])
        loo_values.append(plain_ridge.predict([X_diabetes_[held_out]])[0])
    loo_errors = [(y_diabetes[idx] - value)**2
                  for idx, value in enumerate(loo_values)]

    # efficient and brute-force LOO must give the same results
    assert_almost_equal(errors, loo_errors)
    assert_almost_equal(values, loo_values)

    # generalized cross-validation (efficient leave-one-out, SVD variation)
    svd_decomp = gcv._pre_compute_svd(X_diabetes_, y_diabetes, fit_intercept)
    svd_errors, c = gcv._errors_svd(plain_ridge.alpha, y_diabetes, *svd_decomp)
    svd_values, c = gcv._values_svd(plain_ridge.alpha, y_diabetes, *svd_decomp)

    # both efficient variants must give the same results
    assert_almost_equal(errors, svd_errors)
    assert_almost_equal(values, svd_values)

    # best alpha found by the GCV estimator
    gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = gcv.alpha_
    selected_alphas.append(alpha_)

    # a scorer built from a loss function must select the same alpha
    loss_scoring = make_scorer(mean_squared_error, greater_is_better=False)
    cv_with_loss = RidgeCV(fit_intercept=False, scoring=loss_scoring)
    ignore_warnings(cv_with_loss.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_loss.alpha_, alpha_)

    # a scorer built from a custom score function must select the same alpha
    def negated_mse(x, y):
        return -mean_squared_error(x, y)

    score_scoring = make_scorer(negated_mse)
    cv_with_score = RidgeCV(fit_intercept=False, scoring=score_scoring)
    ignore_warnings(cv_with_score.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_score.alpha_, alpha_)

    # a scorer looked up by name must select the same alpha
    named_scorer = get_scorer('neg_mean_squared_error')
    cv_with_named = RidgeCV(fit_intercept=False, scoring=named_scorer)
    cv_with_named.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_named.alpha_, alpha_)

    # unit sample weights must select the same alpha
    unit_weights = np.ones(n_samples)
    gcv.fit(filter_(X_diabetes), y_diabetes, sample_weight=unit_weights)
    assert_equal(gcv.alpha_, alpha_)

    # simulate several responses by duplicating the target
    Y = np.vstack((y_diabetes, y_diabetes)).T

    gcv.fit(filter_(X_diabetes), Y)
    multi_pred = gcv.predict(filter_(X_diabetes))
    gcv.fit(filter_(X_diabetes), y_diabetes)
    single_pred = gcv.predict(filter_(X_diabetes))

    stacked_single = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(stacked_single, multi_pred, decimal=5)

    return selected_alphas
Beispiel #14
0
 def predict_prices(self,
                    predict_start,
                    num_previous_dates,
                    num_successive_dates,
                    include_training_dates=False,
                    method=Regression.LINEAR):
     '''
     Fit a regression of price against date and predict prices for later dates.

     Supported methods:
     Linear Regression (LINEAR): Standard linear regression.
     Ridge Regression (RIDGE): Ridge() with its default settings.
     Support Vector Regression (SVR_RBF, SVR_POLY) is an extension of Support
     Vector Machines used to solve regression problems, here with an 'rbf' or
     a 'poly' kernel.
     An explanation and example of its usage are available here:
     http://scikit-learn.org/dev/modules/svm.html#regression
     http://scikit-learn.org/dev/auto_examples/svm/plot_svm_regression.html#example-svm-plot-svm-regression-py

     @param predict_start: string The date to start the prediction.
     @param num_previous_dates: int Number of dates to use prior to predict_start as the training data.
     @param num_successive_dates: int Number of dates to predict after predict_start.
     @param include_training_dates: bool Include the training dates in the predicted prices.
     @param method: Method used for regression. One of the Regression.methods values.

     @return: A Series with a DatetimeIndex with the dates and predicted prices.
     @raise ValueError: If method is not recognized, or if fitting fails; in the
         latter case the message includes the underlying error.
     '''
     predict_dates = next_n_business_days(predict_start, num_successive_dates, include_start=True)
     training_dates = prev_n_business_days(predict_start, num_previous_dates, include_start=False)
     training_prices_series = self.get_prices_range(str(training_dates[-1]), str(training_dates[0]))
     # Train on the dates actually present in the price series.
     training_date_index_ls = training_prices_series.index.values

     if include_training_dates:
         # Include training data in predictions.
         predict_dates = np.hstack((training_dates,predict_dates))

     td_ordinals = datetime64_to_ordinal_arr(training_date_index_ls)
     p = datetime64_to_ordinal_arr(predict_dates)

     # Have to reshape into a 2d array from a 1d array.
     p = p.reshape(p.shape[0], 1)
     X = td_ordinals.reshape(td_ordinals.shape[0], 1)

     # Normalize dates
     A = np.vstack((X, p))  # Have to stack to normalize training and test data as one.
     A = scale(A, axis=0)
     # Split the scaled stack back into its training and prediction parts.
     n = p.shape[0]
     X = A[:-n]
     p = A[-n:]

     y = training_prices_series.values

     if method == Regression.SVR_RBF:
         regressor = svm.SVR(kernel='rbf')
     elif method == Regression.SVR_POLY:
         regressor = svm.SVR(kernel='poly')
     elif method == Regression.RIDGE:
         regressor = Ridge()
     elif method == Regression.LINEAR:
         regressor = LinearRegression()
     else:
         raise ValueError('Unrecognized regression method %s' % (method))
     try:
         regressor.fit(X, y)  # Train using the training X and y data.
     except ValueError as err:
         # Same exception type as before, but keep the reason for the failure
         # instead of discarding it.
         raise ValueError('Issue fitting, re-throwing. Cause: %s' % (err,))
     predictions = regressor.predict(p)
     index = pd.DatetimeIndex(predict_dates)
     series = pd.Series(predictions,index=index)
     return series
Beispiel #15
0
def main(num_pts, num_children, learning_rate=1.5, learning_scale=0.8, rand_seed=0):
    """Run online boosting over a stream of points and compare with batch Ridge.

    A root node and ``num_children`` child nodes are created.  A Ridge model
    fitted on the whole batch serves as the reference error.  Then, for every
    point of ``dataset(num_pts, seed=rand_seed)``, the current children are
    evaluated on a fixed validation set and each child takes one gradient step.

    Parameters:
        num_pts: number of points drawn from ``dataset`` for training.
        num_children: number of child nodes under the root node.
        learning_rate: exponent applied to the iteration count in the
            step-size schedule.
        learning_scale: numerator of the step-size schedule.
        rand_seed: seed for the training stream; the validation set uses
            ``rand_seed + 1``.

    Returns:
        Tuple ``(validation_set, validation_preds, batch_pred, batch_set,
        per_iter_learner_weights)``.  ``validation_preds`` and
        ``per_iter_learner_weights`` hold one entry per iteration, both
        recorded before that iteration's gradient step.
    """
    top_node = Node(SqLoss, parent=None, name="root", input_dim=0)
    child_nodes = [Node(SqLoss, parent=top_node, input_dim=FEATURE_DIM, name='Child {:d}'.format(i))
                   for i in xrange(num_children)]
    #child_nodes = []
    # for i in xrange(num_children):
    #    func = linear_features
    #    if i % 2 == 0:
    #        func = square_features
    #    child_nodes.append(Node(None, parent=top_node, input_dim=FEATURE_DIM, predict_func=func,
    #        name='Child {:d}'.format(i)))

    # Fixed validation set of 500 points, drawn with a different seed than the
    # training stream.
    validation_set = [pt for pt in dataset(500, seed=rand_seed + 1)]

    # Batch baseline: Ridge with a tiny penalty and no intercept, fitted on all
    # training points at once and scored on the validation set.
    batch_set = [pt for pt in dataset(num_pts, seed=rand_seed)]
    from sklearn.linear_model.ridge import Ridge
    batch_learner = Ridge(alpha=1e-15, fit_intercept=False)
    batch_learner.fit(np.vstack([pt.x for pt in batch_set]), np.array([pt.y for pt in batch_set]))
    batch_pred = batch_learner.predict(np.vstack([pt.x for pt in validation_set]))
    Yval = np.array([pt.y for pt in validation_set])
    # THIS HAS TO BE THE SAME LOSS AS THE TOP NODE!
    mean_batch_err = np.mean([top_node.loss(pred, val) for (pred, val) in zip(batch_pred, Yval)])
    #err = batch_pred - Yval; mean_batch_err = np.mean(0.5*err*err)
    print('Batch err: {:.4g}'.format(mean_batch_err))

    # Compact array formatting for the progress output.
    npprint = partial(np.array_str, precision=3)

    # With 75 or more children the validation pass is mapped over a process pool.
    multiprocess = num_children >= 75
    if multiprocess:
        from pathos.multiprocessing import ProcessingPool as Pool
        from pathos.multiprocessing import cpu_count
        #p = Pool(int(ceil(0.75*cpu_count())))
        # NOTE(review): the pool is never closed explicitly in this function.
        p = Pool(cpu_count())
        val_helper = partial(predict_layer, child_nodes=child_nodes, top_node=top_node)

    learner_weights = np.array([node.w for node in child_nodes])
    # Per-child details are only printed when there are fewer children than this.
    disp_num_child = 15
    if num_children < disp_num_child:
        print('Child learner weights: {}'.format(npprint(learner_weights.ravel())))

    validation_preds = []
    per_iter_learner_weights = []
    print 'Starting Online Boosting...'
    # The stream is regenerated with the same seed that produced batch_set.
    for i, pt in enumerate(dataset(num_pts, seed=rand_seed)):
        # Weights as they are BEFORE this iteration's update.
        per_iter_learner_weights.append(learner_weights)
        # Compute loss on Validation set
        if multiprocess:
            val_results = p.map(val_helper, validation_set)
        else:
            val_results = [predict_layer(val_pt, child_nodes, top_node) for val_pt in validation_set]
        val_psums, val_losses = zip(*val_results)
        # The last partial sum of each validation point is kept as its prediction.
        val_preds = [psum[-1] for psum in val_psums]
        validation_preds.append(val_preds)
        avg_val_loss = np.mean(val_losses)
        # Compute the partial sums, loss on current data point
        partial_sums, top_loss = predict_layer(pt, child_nodes, top_node)

        # get the gradient of the top loss at each partial sum
        true_val = pt.y
        # Shift the partial sums by one position so that each child is paired
        # with the sum of the children before it (0 for the first child).
        offset_partials = partial_sums.copy()
        offset_partials[1:] = partial_sums[:-1]
        offset_partials[0] = 0
        dlosses = [node.dloss(pred_val, true_val) for pred_val, node in zip(offset_partials, child_nodes)]
        # Step size decays as learning_scale / (i + 1) ** learning_rate.
        step_size = learning_scale / np.power((i + 1), learning_rate)
        learner_weights = np.array([node.grad_step(pt.x, loss, step_size)
                                    for (node, loss) in zip(child_nodes, dlosses)])
        # Progress is printed on the first and last iteration, during the first
        # num_children iterations when there are few children, periodically,
        # and whenever the validation loss exceeds 1e3.
        if  i < 1 or i == num_pts - 1 or (i < num_children and num_children < disp_num_child)\
                or i % min(int(ceil(num_pts * 0.05)), 25) == 0 or avg_val_loss > 1e3:
            print('Iteration {:d}/{:d}: (x={:.2g},y={:.2g})'.format(i + 1, num_pts, pt.x, pt.y))
            print(' Avg validation loss on pt: {:.4g} vs Batch: {:.4g}'.format(avg_val_loss,
                                                                               mean_batch_err))
            print('  Top layer loss on pt: {:.4g}'.format(top_loss))
            if num_children < disp_num_child:
                print('  Child learner weights: {}'.format(npprint(learner_weights.ravel())))
                print('  Partial sums: {}'.format(npprint(partial_sums)))
            print('  Took descent step of step size {:.4g}...'.format(step_size))
    # endfor

    return validation_set, validation_preds, batch_pred, batch_set, per_iter_learner_weights
Beispiel #16
0
def _test_ridge_loo(filter_):
    """Compare the efficient leave-one-out paths with a brute-force version.

    Both the default and the SVD based decomposition of ``_RidgeGCV`` are
    checked against refitting ``Ridge`` with one sample held out at a time.
    ``filter_`` is applied to ``X_diabetes`` before each ``fit``/``predict`` of
    the GCV estimator, so the checks can run on dense or sparse input.
    Returns a one-element list holding the alpha selected by ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]
    selected_alphas = []

    gcv = _RidgeGCV(fit_intercept=False)
    plain_ridge = Ridge(fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = gcv._errors(1.0, y_diabetes, *decomp)
    values, c = gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: refit with each sample held out in turn
    loo_values = []
    for held_out in range(n_samples):
        keep = np.arange(n_samples) != held_out
        plain_ridge.fit(X_diabetes[keep], y_diabetes[keep])
        loo_values.append(plain_ridge.predict([X_diabetes[held_out]])[0])
    loo_errors = [(y_diabetes[idx] - value) ** 2
                  for idx, value in enumerate(loo_values)]

    # efficient and brute-force LOO must give the same results
    assert_almost_equal(errors, loo_errors)
    assert_almost_equal(values, loo_values)

    # generalized cross-validation (efficient leave-one-out, SVD variation)
    svd_decomp = gcv._pre_compute_svd(X_diabetes, y_diabetes)
    svd_errors, c = gcv._errors_svd(1.0, y_diabetes, *svd_decomp)
    svd_values, c = gcv._values_svd(1.0, y_diabetes, *svd_decomp)

    # both efficient variants must give the same results
    assert_almost_equal(errors, svd_errors)
    assert_almost_equal(values, svd_values)

    # best alpha found by the GCV estimator
    gcv.fit(filter_(X_diabetes), y_diabetes)
    best_alpha = gcv.best_alpha
    selected_alphas.append(best_alpha)

    # a custom loss_func must select the same alpha
    gcv_custom_loss = _RidgeGCV(fit_intercept=False,
                                loss_func=mean_squared_error)
    gcv_custom_loss.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(gcv_custom_loss.best_alpha, best_alpha)

    # unit sample weights must select the same alpha
    unit_weights = np.ones(n_samples)
    gcv.fit(filter_(X_diabetes), y_diabetes, sample_weight=unit_weights)
    assert_equal(gcv.best_alpha, best_alpha)

    # simulate several responses by duplicating the target
    Y = np.vstack((y_diabetes, y_diabetes)).T

    gcv.fit(filter_(X_diabetes), Y)
    multi_pred = gcv.predict(filter_(X_diabetes))
    gcv.fit(filter_(X_diabetes), y_diabetes)
    single_pred = gcv.predict(filter_(X_diabetes))

    stacked_single = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(stacked_single, multi_pred, decimal=5)

    return selected_alphas