def test_dtype_match_cholesky():
    # Test different alphas in cholesky solver to ensure full coverage.
    # This test is separated from test_dtype_match for clarity.
    rng = np.random.RandomState(0)
    alpha = (1.0, 0.5)

    n_samples, n_features, n_target = 6, 7, 2
    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples, n_target)
    X_32 = X_64.astype(np.float32)
    y_32 = y_64.astype(np.float32)

    # Check type consistency 32 bits
    ridge_32 = Ridge(alpha=alpha, solver='cholesky')
    ridge_32.fit(X_32, y_32)
    coef_32 = ridge_32.coef_

    # Check type consistency 64 bits
    ridge_64 = Ridge(alpha=alpha, solver='cholesky')
    ridge_64.fit(X_64, y_64)
    coef_64 = ridge_64.coef_

    # Do all the checks at once; this makes failures easier to debug
    assert coef_32.dtype == X_32.dtype
    assert coef_64.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5)

# NOTE: parametrize decorator assumed (solver list inferred from the sibling
# test below) so that pytest can supply the `solver` argument.
@pytest.mark.parametrize('solver', ['svd', 'sparse_cg', 'cholesky', 'lsqr'])
def test_dtype_match(solver):
    rng = np.random.RandomState(0)
    alpha = 1.0

    n_samples, n_features = 6, 5
    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples)
    X_32 = X_64.astype(np.float32)
    y_32 = y_64.astype(np.float32)

    # Check type consistency 32 bits
    ridge_32 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10)
    ridge_32.fit(X_32, y_32)
    coef_32 = ridge_32.coef_

    # Check type consistency 64 bits
    ridge_64 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10)
    ridge_64.fit(X_64, y_64)
    coef_64 = ridge_64.coef_

    # Do the actual checks at once for easier debugging
    assert coef_32.dtype == X_32.dtype
    assert coef_64.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4)

def test_dtype_match():
    rng = np.random.RandomState(0)
    alpha = 1.0

    n_samples, n_features = 6, 5
    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples)
    X_32 = X_64.astype(np.float32)
    y_32 = y_64.astype(np.float32)

    solvers = ["svd", "sparse_cg", "cholesky", "lsqr"]
    for solver in solvers:
        # Check type consistency 32 bits
        ridge_32 = Ridge(alpha=alpha, solver=solver)
        ridge_32.fit(X_32, y_32)
        coef_32 = ridge_32.coef_

        # Check type consistency 64 bits
        ridge_64 = Ridge(alpha=alpha, solver=solver)
        ridge_64.fit(X_64, y_64)
        coef_64 = ridge_64.coef_

        # Do the actual checks at once for easier debugging
        assert coef_32.dtype == X_32.dtype
        assert coef_64.dtype == X_64.dtype
        assert ridge_32.predict(X_32).dtype == X_32.dtype
        assert ridge_64.predict(X_64).dtype == X_64.dtype
        assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5)

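# Aside (not from the original suite): what the three dtype tests above pin
# down is that Ridge keeps the input dtype end to end instead of silently
# upcasting to float64. A minimal self-contained sketch, assuming a
# scikit-learn version recent enough that Ridge supports float32 inputs
# (exactly the behavior the tests verify):
def _sketch_dtype_preservation():
    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X = rng.randn(8, 3).astype(np.float32)  # float32 inputs ...
    y = rng.randn(8).astype(np.float32)

    model = Ridge(alpha=1.0).fit(X, y)
    # ... should give float32 coefficients and predictions back.
    assert model.coef_.dtype == np.float32
    assert model.predict(X).dtype == np.float32
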
def _test_multi_ridge_diabetes(filter_):
    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T
    n_features = X_diabetes.shape[1]

    ridge = Ridge(fit_intercept=False)
    ridge.fit(filter_(X_diabetes), Y)
    assert_equal(ridge.coef_.shape, (2, n_features))
    Y_pred = ridge.predict(filter_(X_diabetes))
    ridge.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge.predict(filter_(X_diabetes))
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
                              Y_pred, decimal=3)

def _test_ridge_loo(filter_):
    # test that it works with both dense and sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    K, v, Q = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(v, Q, y_diabetes, 1.0)
    values, c = ridge_gcv._values(K, v, Q, y_diabetes, 1.0)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    for i in range(n_samples):
        sel = np.arange(n_samples) != i
        X_new = X_diabetes[sel]
        y_new = y_diabetes[sel]
        ridge.fit(X_new, y_new)
        value = ridge.predict([X_diabetes[i]])[0]
        error = (y_diabetes[i] - value) ** 2
        errors2.append(error)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    best_alpha = ridge_gcv.best_alpha
    ret.append(best_alpha)

    # check that we get same best alpha with custom loss_func
    ridge_gcv2 = _RidgeGCV(fit_intercept=False, loss_func=mean_squared_error)
    ridge_gcv2.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.best_alpha, best_alpha)

    # check that we get same best alpha with sample weights
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.best_alpha, best_alpha)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
                              Y_pred, decimal=5)

    return ret

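# Aside (illustration only, not part of the original suite): the efficient LOO
# that _RidgeGCV implements rests on the hat-matrix identity for linear
# smoothers: with H = X (X^T X + alpha * I)^-1 X^T, the leave-one-out residual
# at sample i is (y_i - yhat_i) / (1 - H_ii). A minimal NumPy sketch (no
# intercept, dense X; function name is illustrative), mirroring what the
# errors/values pairs above return:
def _sketch_ridge_loo(X, y, alpha=1.0):
    import numpy as np
    n_features = X.shape[1]
    H = X.dot(np.linalg.solve(X.T.dot(X) + alpha * np.eye(n_features), X.T))
    residuals = y - H.dot(y)            # in-sample residuals
    d = 1.0 - np.diag(H)
    loo_errors = (residuals / d) ** 2   # squared LOO errors
    loo_values = y - residuals / d      # LOO predictions
    return loo_errors, loo_values
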
def test_fit_simple_backupsklearn():
    df = pd.read_csv("./open_data/simple.txt", delim_whitespace=True)
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    Solver = h2o4gpu.Ridge

    enet = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    enet.fit(X, y)
    print("h2o4gpu predict()")
    print(enet.predict(X))
    print("h2o4gpu score()")
    print(enet.score(X, y))

    enet_wrapper = Solver(normalize=True, random_state=1234)
    print("h2o4gpu scikit wrapper fit()")
    enet_wrapper.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet_wrapper.predict(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet_wrapper.score(X, y))

    from sklearn.linear_model.ridge import Ridge
    enet_sk = Ridge(normalize=True, random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit score()")
    print(enet_sk.score(X, y))

    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()
    print(enet_sk.coef_)
    print(enet_sk_coef)
    print(enet_wrapper.coef_)
    print(enet_sk.intercept_)
    print(enet_wrapper.intercept_)
    print(enet_sk.n_iter_)
    print(enet_wrapper.n_iter_)

    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet_wrapper.coef_, enet_sk_coef)
    assert np.allclose(enet_wrapper.intercept_, enet_sk.intercept_)

def test_toy_ridge_object():
    # Test the Ridge regression object on a simple toy problem
    # TODO: test also n_samples > n_features
    X = np.array([[1], [2]])
    Y = np.array([1, 2])
    clf = Ridge(alpha=0.0)
    clf.fit(X, Y)
    X_test = [[1], [2], [3], [4]]
    assert_almost_equal(clf.predict(X_test), [1., 2, 3, 4])

    assert_equal(len(clf.coef_.shape), 1)
    assert_equal(type(clf.intercept_), np.float64)

    Y = np.vstack((Y, Y)).T

    clf.fit(X, Y)
    X_test = [[1], [2], [3], [4]]

    assert_equal(len(clf.coef_.shape), 2)
    assert_equal(type(clf.intercept_), np.ndarray)

def eval_aggr_shifts(X, y, ignore_rows):
    eps = 1e-6
    pred = []
    real = []

    # drop the ignored rows (indices refer to the already-shrunken arrays)
    for row_n in ignore_rows:
        X = np.concatenate((X[:row_n], X[row_n + 1:]))
        y = np.concatenate((y[:row_n], y[row_n + 1:]))

    n = X.shape[0]
    for inst_n in range(n):
        x_i = X[inst_n]
        y_i = y[inst_n]

        # leave-one-out training set
        X_train = np.concatenate((X[:inst_n], X[inst_n + 1:]))
        y_train = np.concatenate((y[:inst_n], y[inst_n + 1:]))

        # clip the targets away from {0, 1}, then map them through the logit
        y_train = np.array([max(eps, min(1 - eps, val)) for val in y_train])
        y_train = np.log(y_train / (1 - y_train))

        model = Ridge(alpha=.2, fit_intercept=True, normalize=True)
        # model = Lasso(alpha=.001, fit_intercept=True, normalize=True)
        model.fit(X_train, y_train)

        y_hat = model.predict(x_i.reshape(1, -1))[0]

        y_i1 = max(eps, min(1 - eps, y_i))
        y_i1 = np.log(y_i1 / (1 - y_i1))
        print('inst: ' + str(inst_n) + ', prediction: ' + str(y_hat) +
              ', err: ' + str(y_hat - y_i1))

        # map the prediction back to [0, 1] through the sigmoid
        pred.append(1 / (1 + exp(-y_hat)))
        real.append(y_i)

    model = Ridge(alpha=.2, fit_intercept=True, normalize=True)
    model.fit(X, y)

    return pred, real, model.coef_

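# Aside (illustration only): eval_aggr_shifts fits the ridge model in logit
# space and maps predictions back with the sigmoid. The transform pair as a
# minimal NumPy sketch (helper names are illustrative; the eps clipping keeps
# log() finite at 0 and 1):
import numpy as np

def _logit(p, eps=1e-6):
    p = np.clip(p, eps, 1 - eps)
    return np.log(p / (1 - p))

def _sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# round trip: _sigmoid(_logit(p)) == p for p away from the clipped boundaries
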
def _test_ridge_loo(filter_):
    # test that it works with both dense and sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    for i in range(n_samples):
        sel = np.arange(n_samples) != i
        X_new = X_diabetes[sel]
        y_new = y_diabetes[sel]
        ridge.fit(X_new, y_new)
        value = ridge.predict([X_diabetes[i]])[0]
        error = (y_diabetes[i] - value) ** 2
        errors2.append(error)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out,
    # SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # check that efficient and SVD efficient LOO give same results
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # check that we get same best alpha with sample weights
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.alpha_, alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
                              Y_pred, decimal=5)

    return ret

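# Aside (illustration only): the alpha checks above lean on make_scorer's sign
# convention: with greater_is_better=False the scorer returns the negated loss,
# so "higher is better" holds uniformly during model selection. A small sketch
# (function name is illustrative):
def _sketch_scorer_sign():
    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.metrics import make_scorer, mean_squared_error

    X = np.arange(10, dtype=float).reshape(-1, 1)
    y = 2 * X.ravel()
    scorer = make_scorer(mean_squared_error, greater_is_better=False)
    model = Ridge(alpha=0.1).fit(X, y)
    # scorers are called as scorer(estimator, X, y); this returns -MSE,
    # so values closer to zero indicate a better fit
    return scorer(model, X, y)
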
model2_train1 += virtual_test1

print "now saving the result"
ff = open('virtual_train_data.json', 'w')
ff.write(json.dumps([model2_train0, model2_train1]))
ff.close()

if sys.argv[1] == "second":
    ff = open('virtual_train_data.json', 'r')
    model2_train0, model2_train1 = json.loads(ff.read())
    ff.close()
    print "opened train0 and train1 with each length", len(model2_train0), len(model2_train1)
    print model2_train0[0]
    print model2_train1[0]

    ff = open('intermediate_result.json', 'r')
    model2_test0, _ = json.loads(ff.read())
    print model2_test0[0]

    model2 = Ridge()
    print "start fitting 2nd model"
    model2.fit(model2_train0, model2_train1)
    print "start predicting"
    predictions = model2.predict(model2_test0)

    print "saving the predicted result into the file"
    f = open('result.csv', 'w')
    f.write("ID;COTIS\n")
    for ind, prd in enumerate(predictions):
        f.write(my_ids[ind] + ';' + str(prd) + '\n')
    f.close()

print "all tasks completed"

train1 = extract_target(train_dataset)
test0 = extract_predictor(test_dataset, False)

results = []
for cnt in range(1000):
    projected0 = []
    projected1 = []
    # randomly drop ~40% of the training rows for each bagged model
    for i in xrange(len(train0)):
        if random.random() < 0.4:
            continue
        projected0.append(train0[i])
        projected1.append(train1[i])
    print "now fitting the model", cnt, "with len", len(projected0)
    model = Ridge()
    model.fit(projected0, projected1)
    predictions = model.predict(test0)
    results.append(list(predictions))

# aggregate with a trimmed mean: drop the 100 lowest and 100 highest
# predictions per row, then average the rest
final_result = []
for ind in xrange(len(results[0])):
    cand = []
    for i in xrange(len(results)):
        cand.append(results[i][ind])
    final_result.append(sum(sorted(cand)[100:-100]) * 1.0 / (len(cand) - 200))

# predictions = model.predict(valid_dataset)
# Evaluate the quality of the prediction
# print sklearn.metrics.mean_absolute_error(predictions, valid_target)
print "saving the predicted result into the file"

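# Aside (illustration only): the aggregation above is a trimmed mean over the
# bagged ridge predictions. The same idea, vectorized with NumPy (function name
# is illustrative; requires more than 2*k models):
import numpy as np

def trimmed_mean(preds, k=100):
    # preds: array of shape (n_models, n_rows)
    s = np.sort(preds, axis=0)     # sort each row's predictions across models
    return s[k:-k].mean(axis=0)    # drop k smallest / k largest, then average
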
def trainModel(param, feat_folder, feat_name):
    # read data from folder
    print 'now we read data from folder: %s' % (feat_folder)

    # start cross-validation
    print 'now we need to generate cross-validation'
    accuracy_cv = []
    for i in range(0, 2):
        print 'this is run %d of the cross-validation' % (i + 1)
        testIndex = loadCVIndex("%s/test.run%d.txt" % ("../data/feat/combine", (i + 1)))

        # if we use xgboost to train the model, we need the svmlight format
        if param['task'] in ['regression']:
            # with xgb we dump one file per CV run and read the data back
            train_data = xgb.DMatrix("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            valid_data = xgb.DMatrix("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            watchlist = [(train_data, 'train'), (valid_data, 'valid')]
            bst = xgb.train(param, train_data, int(param['num_round']), watchlist)
            pred = bst.predict(valid_data)

        elif param['task'] in ['clf_skl_lr']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data, train_label)
            pred = clf.predict(test_data)

        elif param['task'] == "reg_skl_rf":
            # regression with sklearn random forest regressor
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            rf.fit(train_data, train_label)
            pred = rf.predict(test_data)

        elif param['task'] == "reg_skl_etr":
            # regression with sklearn extra trees regressor
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            etr.fit(train_data, train_label)
            pred = etr.predict(test_data)

        elif param['task'] in ['reg_skl_gbm']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            gbm.fit(train_data, train_label)
            pred = gbm.predict(test_data)

        elif param['task'] in ['reg_skl_ridge']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data, train_label)
            predraw = ridge.predict(test_data)
            print predraw
            # map raw predictions to ranks, then to scores via the training CDF
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt" % ("../data/feat/combine", (i + 1)))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank, cdf)
            print pred

        """
        elif param['task'] in ['regression']:

        elif param['task'] in ['reg_skl_gbm']:
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data[feat_names], train_data['cid'])
            pred = gbm.predict(valid_data[feat_names])
        elif param['task'] in ['reg_skl_ridge']:
            feat_names.remove('cid')
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data[feat_names], train_data['cid'])
            pred = ridge.predict(valid_data[feat_names])
        """

        # now we use the accuracy to evaluate the model
        acc = accuracy_model(pred, train.iloc[testIndex]['cid'])
        print "the model accuracy: %s" % (acc)
        accuracy_cv.append(acc)

    # here we aggregate the CV results
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print 'the mean accuracy: %.6f' % (accuracy_cv_mean)
    return {'loss': -accuracy_cv_mean,
            'attachments': {'std': accuracy_cv_std},
            'status': STATUS_OK}

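# Aside (illustration only): the reg_skl_ridge branch above turns raw
# predictions into ranks with the double-argsort idiom before mapping them
# through a CDF. Why argsort().argsort() yields ranks, on a tiny example:
import numpy as np

scores = np.array([30.0, 10.0, 20.0])
order = scores.argsort()   # [1, 2, 0]: indices that would sort the array
ranks = order.argsort()    # [2, 0, 1]: the rank of each original element
# i.e. 30.0 is the largest (rank 2), 10.0 the smallest (rank 0)
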
def _test_ridge_loo(filter_):
    # test that it works with both dense and sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    fit_intercept = filter_ == DENSE_FILTER
    if fit_intercept:
        X_diabetes_ = X_diabetes - X_diabetes.mean(0)
    else:
        X_diabetes_ = X_diabetes
    ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)
    ridge = Ridge(alpha=1.0, fit_intercept=fit_intercept)
    # because fit_intercept is applied

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes_, y_diabetes, fit_intercept)
    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    for i in range(n_samples):
        sel = np.arange(n_samples) != i
        X_new = X_diabetes_[sel]
        y_new = y_diabetes[sel]
        ridge.fit(X_new, y_new)
        value = ridge.predict([X_diabetes_[i]])[0]
        error = (y_diabetes[i] - value) ** 2
        errors2.append(error)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out,
    # SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes_, y_diabetes, fit_intercept)
    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # check that efficient and SVD efficient LOO give same results
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('neg_mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # check that we get same best alpha with sample weights
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.alpha_, alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
                              Y_pred, decimal=5)

    return ret

def predict_prices(self, predict_start, num_previous_dates,
                   num_successive_dates, include_training_dates=False,
                   method=Regression.LINEAR):
    '''
    Linear Regression (LINEAR): standard linear regression.

    Support Vector Regression (SVR_RBF) is an extension of Support Vector
    Machines used to solve regression problems. An explanation and example
    of its usage are available here:
    http://scikit-learn.org/dev/modules/svm.html#regression
    http://scikit-learn.org/dev/auto_examples/svm/plot_svm_regression.html#example-svm-plot-svm-regression-py

    @param predict_start: string
        The date to start the prediction.
    @param num_previous_dates: int
        Number of dates to use prior to predict_start as the training data.
    @param num_successive_dates: int
        Number of dates to predict after predict_start.
    @param include_training_dates: bool
        Include the training dates in the predicted prices.
    @param method:
        Method used for regression. One of the Regression.methods values.
    @return: A Series with a DatetimeIndex with the dates and predicted prices.
    '''
    predict_dates = next_n_business_days(predict_start, num_successive_dates,
                                         include_start=True)
    training_dates = prev_n_business_days(predict_start, num_previous_dates,
                                          include_start=False)
    training_prices_series = self.get_prices_range(str(training_dates[-1]),
                                                   str(training_dates[0]))
    training_date_index_ls = training_prices_series.index.values

    if include_training_dates:
        # Include training data in predictions.
        predict_dates = np.hstack((training_dates, predict_dates))

    td_ordinals = datetime64_to_ordinal_arr(training_date_index_ls)
    p = datetime64_to_ordinal_arr(predict_dates)

    # Have to reshape into a 2d array from a 1d array.
    p = p.reshape(p.shape[0], 1)
    X = td_ordinals.reshape(td_ordinals.shape[0], 1)

    # Normalize dates. Stack so training and test data are scaled as one.
    A = np.vstack((X, p))
    A = scale(A, axis=0)
    n = p.shape[0]
    X = A[:-n]
    p = A[-n:]

    y = training_prices_series.values

    if method == Regression.SVR_RBF:
        regressor = svm.SVR(kernel='rbf')
    elif method == Regression.SVR_POLY:
        regressor = svm.SVR(kernel='poly')
    elif method == Regression.RIDGE:
        regressor = Ridge()
    elif method == Regression.LINEAR:
        regressor = LinearRegression()
    else:
        raise ValueError('Unrecognized regression method %s' % (method))

    try:
        regressor.fit(X, y)  # Train using the training X and y data.
    except ValueError:
        raise ValueError('Issue fitting, re-throwing.')

    predictions = regressor.predict(p)
    index = pd.DatetimeIndex(predict_dates)
    series = pd.Series(predictions, index=index)
    return series

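# Aside (illustration only): predict_prices regresses price on the date's
# ordinal value, scaling training and prediction dates together so both live
# on the same normalized axis. A minimal self-contained sketch of that date
# handling, assuming pandas and scikit-learn (all names here are illustrative;
# scaling the two sets jointly is workable in this case because the "features"
# are calendar dates known in advance):
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import scale

train_dates = pd.date_range("2024-01-01", periods=30, freq="B")
pred_dates = pd.date_range(train_dates[-1] + pd.Timedelta(days=1),
                           periods=5, freq="B")

X = np.array([d.toordinal() for d in train_dates], dtype=float).reshape(-1, 1)
P = np.array([d.toordinal() for d in pred_dates], dtype=float).reshape(-1, 1)

A = scale(np.vstack((X, P)), axis=0)   # normalize both sets on one axis
X_s, P_s = A[:len(X)], A[len(X):]

y = np.linspace(100, 110, len(X))      # toy price series
preds = Ridge().fit(X_s, y).predict(P_s)
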
def main(num_pts, num_children, learning_rate=1.5, learning_scale=0.8,
         rand_seed=0):
    top_node = Node(SqLoss, parent=None, name="root", input_dim=0)
    child_nodes = [Node(SqLoss, parent=top_node, input_dim=FEATURE_DIM,
                        name='Child {:d}'.format(i))
                   for i in xrange(num_children)]
    # child_nodes = []
    # for i in xrange(num_children):
    #     func = linear_features
    #     if i % 2 == 0:
    #         func = square_features
    #     child_nodes.append(Node(None, parent=top_node, input_dim=FEATURE_DIM,
    #                             predict_func=func,
    #                             name='Child {:d}'.format(i)))

    validation_set = [pt for pt in dataset(500, seed=rand_seed + 1)]
    batch_set = [pt for pt in dataset(num_pts, seed=rand_seed)]

    from sklearn.linear_model.ridge import Ridge
    batch_learner = Ridge(alpha=1e-15, fit_intercept=False)
    batch_learner.fit(np.vstack([pt.x for pt in batch_set]),
                      np.array([pt.y for pt in batch_set]))
    batch_pred = batch_learner.predict(np.vstack([pt.x for pt in validation_set]))
    Yval = np.array([pt.y for pt in validation_set])

    # THIS HAS TO BE THE SAME LOSS AS THE TOP NODE!
    mean_batch_err = np.mean([top_node.loss(pred, val)
                              for (pred, val) in zip(batch_pred, Yval)])
    # err = batch_pred - Yval; mean_batch_err = np.mean(0.5 * err * err)
    print('Batch err: {:.4g}'.format(mean_batch_err))

    npprint = partial(np.array_str, precision=3)

    multiprocess = num_children >= 75
    if multiprocess:
        from pathos.multiprocessing import ProcessingPool as Pool
        from pathos.multiprocessing import cpu_count
        # p = Pool(int(ceil(0.75 * cpu_count())))
        p = Pool(cpu_count())
        val_helper = partial(predict_layer, child_nodes=child_nodes,
                             top_node=top_node)

    learner_weights = np.array([node.w for node in child_nodes])
    disp_num_child = 15
    if num_children < disp_num_child:
        print('Child learner weights: {}'.format(npprint(learner_weights.ravel())))

    validation_preds = []
    per_iter_learner_weights = []
    print 'Starting Online Boosting...'

    for i, pt in enumerate(dataset(num_pts, seed=rand_seed)):
        per_iter_learner_weights.append(learner_weights)

        # Compute loss on validation set
        if multiprocess:
            val_results = p.map(val_helper, validation_set)
        else:
            val_results = [predict_layer(val_pt, child_nodes, top_node)
                           for val_pt in validation_set]
        val_psums, val_losses = zip(*val_results)
        val_preds = [psum[-1] for psum in val_psums]
        validation_preds.append(val_preds)
        avg_val_loss = np.mean(val_losses)

        # Compute the partial sums and loss on the current data point
        partial_sums, top_loss = predict_layer(pt, child_nodes, top_node)

        # get the gradient of the top loss at each partial sum
        true_val = pt.y
        offset_partials = partial_sums.copy()
        offset_partials[1:] = partial_sums[:-1]
        offset_partials[0] = 0
        dlosses = [node.dloss(pred_val, true_val)
                   for pred_val, node in zip(offset_partials, child_nodes)]

        # polynomially decaying step size
        step_size = learning_scale / np.power((i + 1), learning_rate)
        learner_weights = np.array([node.grad_step(pt.x, loss, step_size)
                                    for (node, loss) in zip(child_nodes, dlosses)])

        if i < 1 or i == num_pts - 1 \
                or (i < num_children and num_children < disp_num_child) \
                or i % min(int(ceil(num_pts * 0.05)), 25) == 0 \
                or avg_val_loss > 1e3:
            print('Iteration {:d}/{:d}: (x={:.2g},y={:.2g})'.format(i + 1, num_pts, pt.x, pt.y))
            print('  Avg validation loss on pt: {:.4g} vs Batch: {:.4g}'.format(avg_val_loss, mean_batch_err))
            print('  Top layer loss on pt: {:.4g}'.format(top_loss))
            if num_children < disp_num_child:
                print('  Child learner weights: {}'.format(npprint(learner_weights.ravel())))
                print('  Partial sums: {}'.format(npprint(partial_sums)))
            print('  Took descent step of step size {:.4g}...'.format(step_size))
    # endfor

    return validation_set, validation_preds, batch_pred, batch_set, \
        per_iter_learner_weights

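# Aside (illustration only): the loop above takes one gradient step per data
# point with a polynomially decaying step size, eta_i = scale / i**rate. A
# minimal self-contained sketch of that update for squared loss on a plain
# linear learner (function name is illustrative):
import numpy as np

def online_sq_loss_steps(points, dim, scale=0.8, rate=1.5):
    w = np.zeros(dim)
    for i, (x, y) in enumerate(points, start=1):
        eta = scale / np.power(i, rate)   # decaying step size
        grad = (w.dot(x) - y) * x         # d/dw of 0.5 * (w.x - y)^2
        w = w - eta * grad
    return w
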
def _test_ridge_loo(filter_):
    # test that it works with both dense and sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    for i in range(n_samples):
        sel = np.arange(n_samples) != i
        X_new = X_diabetes[sel]
        y_new = y_diabetes[sel]
        ridge.fit(X_new, y_new)
        value = ridge.predict([X_diabetes[i]])[0]
        error = (y_diabetes[i] - value) ** 2
        errors2.append(error)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out,
    # SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, c = ridge_gcv._errors_svd(1.0, y_diabetes, *decomp)
    values3, c = ridge_gcv._values_svd(1.0, y_diabetes, *decomp)

    # check that efficient and SVD efficient LOO give same results
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    best_alpha = ridge_gcv.best_alpha
    ret.append(best_alpha)

    # check that we get same best alpha with custom loss_func
    ridge_gcv2 = _RidgeGCV(fit_intercept=False, loss_func=mean_squared_error)
    ridge_gcv2.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.best_alpha, best_alpha)

    # check that we get same best alpha with sample weights
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.best_alpha, best_alpha)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
                              Y_pred, decimal=5)

    return ret