def svr_main(X, Y):
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]
    clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    #clf.fit(X_train, Y_train)
    #y_pred = clf.predict(X_test)
    #plt.plot(X_test, y_pred, linestyle='-', color='red')
    #clf = GradientBoostingRegressor(n_estimators=100, max_depth=1)
    #clf = DecisionTreeRegressor(max_depth=25)
    #clf = ExtraTreesRegressor(n_estimators=2000, max_depth=14)
    #clf = xgb.XGBRegressor(n_estimators=2000, max_depth=25)
    #clf = RandomForestRegressor(n_estimators=1000, max_depth=26, n_jobs=7)
    predict_list = []
    for i in range(TEST_SIZE):
        # Refit on a sliding window of the series and predict one step ahead.
        X_window = [[x] for x in range(i, TRAIN_SIZE + i)]
        clf.fit(X_window, Y[i:TRAIN_SIZE + i])
        y_pred = clf.predict([[TRAIN_SIZE + 1 + i]])  # predict() expects a 2D array
        predict_list.append(y_pred)
    print("mean_squared_error:%s" % mean_squared_error(Y_test, predict_list))
    print("sqrt of mean_squared_error:%s" % np.sqrt(mean_squared_error(Y_test, predict_list)))
    origin_data = Y_test
    print("origin data:%s" % origin_data)
    plt.plot([x for x in range(TRAIN_SIZE + 1, TRAIN_SIZE + TEST_SIZE + 1)],
             predict_list, linestyle='-', color='red', label='prediction model')
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model')
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
def SVM(train, test, tunings=None, smoteit=True, bin=True, regress=False):
    "SVM"
    if not isinstance(train, pd.core.frame.DataFrame):
        train = csv2DF(train, as_mtx=False, toBin=bin)
    if not isinstance(test, pd.core.frame.DataFrame):
        test = csv2DF(test, as_mtx=False, toBin=True)
    if smoteit:
        train = SMOTE(train, resample=True)
        # except: set_trace()
    # Note: `tunings` is currently unused; both branches built the same model.
    if regress:
        clf = SVR()
    else:
        clf = SVC()
    features = train.columns[:-1]
    klass = train[train.columns[-1]]
    # set_trace()
    clf.fit(train[features], klass)
    actual = test[test.columns[-1]].values  # .as_matrix() is deprecated
    try:
        preds = clf.predict(test[test.columns[:-1]])
    except Exception:
        set_trace()
    return actual, preds
def test_check_is_fitted():
    # Check that ValueError is raised when a non-estimator instance is passed
    assert_raises(ValueError, check_is_fitted, ARDRegression, "coef_")
    assert_raises(TypeError, check_is_fitted, "SVR", "support_")

    ard = ARDRegression()
    svr = SVR()

    try:
        assert_raises(NotFittedError, check_is_fitted, ard, "coef_")
        assert_raises(NotFittedError, check_is_fitted, svr, "support_")
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError
    try:
        check_is_fitted(ard, "coef_", "Random message %(name)s, %(name)s")
    except ValueError as e:
        assert_equal(str(e), "Random message ARDRegression, ARDRegression")

    try:
        check_is_fitted(svr, "support_", "Another message %(name)s, %(name)s")
    except AttributeError as e:
        assert_equal(str(e), "Another message SVR, SVR")

    ard.fit(*make_blobs())
    svr.fit(*make_blobs())

    assert_equal(None, check_is_fitted(ard, "coef_"))
    assert_equal(None, check_is_fitted(svr, "support_"))
def getError1(signal, normedDay, period, phase):
    '''
    Gets the error for a list of points across a normed day given a sklearn
    model, the period, and the phase of the fitted signal. Here I'm using the
    Euclidean distance as the error measurement. This requires a little more
    computation due to the need to fit an inverse model, but provides better
    fits. Returns the squared Euclidean error.
    '''
    if ndim(normedDay.index[0]) > 0:  # np.rank() was removed; ndim() is its replacement
        t0 = around((array(normedDay.index.get_level_values(0)) - phase) % period, 3)
    else:
        t0 = around((array(normedDay.index, dtype=float) - phase) % period, 3)
    nD = Series(normedDay, index=t0)
    tUp = array([arange(0, period + .1, .1)]).T
    invSignal = SVR(kernel='rbf', C=signal.C, gamma=signal.gamma,
                    epsilon=signal.epsilon)
    invSignal.fit(array([signal.predict(tUp)]).T, tUp.flatten())
    xDiff = nD - signal.predict(array([array(nD)]).T)
    yDiff = nD - signal.predict(array([nD.index]).T)
    error = sum(pow(xDiff / period, 2) + pow(yDiff / 2, 2))
    return error
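# For context, a minimal usage sketch of getError1 on toy sinusoidal data.
# This is an assumption-laden illustration, not part of the original source:
# it presumes the module's star imports from numpy (array, arange, ndim,
# around) and pandas (Series), and that phase-folded times equal the index.
from numpy import *  # array, arange, ndim, around, sin, pi (as in the module above)
from pandas import Series
from sklearn.svm import SVR

period, phase = 24.0, 0.0
t = arange(0, 24, 0.5)
values = sin(2 * pi * t / period)

# Fit the "signal" model on one day of phase-folded times (hypothetical settings).
signal = SVR(kernel='rbf', C=1.0, gamma=0.1, epsilon=0.05)
signal.fit(t.reshape(-1, 1), values)

normedDay = Series(values, index=t)
print(getError1(signal, normedDay, period, phase))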
def train_svm(train_file):
    test_X, test_Y, weight = load_data(train_file, get_avg(train_file))
    svr = SVR(kernel='rbf', C=100, gamma=1)
    print("start train")
    svr.fit(test_X, test_Y)
    print("train finish")
    return svr
def train_svm(data):
    test_X, test_Y = load_data(data)
    svr = SVR(kernel='rbf', C=100, gamma=1)
    print("start train")
    svr.fit(test_X, test_Y)
    print("train finish")
    return svr
def RunSVRScikit():
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)

    # Use the last row of the training set as the responses.
    X, y = SplitTrainData(self.dataset)

    # Get all the parameters.
    opts = {}
    if "c" in options:
        opts["C"] = float(options.pop("c"))
    if "epsilon" in options:
        opts["epsilon"] = float(options.pop("epsilon"))
    if "gamma" in options:
        opts["gamma"] = float(options.pop("gamma"))
    opts["kernel"] = "rbf"

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            # Perform SVR.
            model = SSVR(**opts)
            model.fit(X, y)
    except Exception as e:
        return -1

    return totalTimer.ElapsedTime()
class HotTweets:
    '''
    Train and get tweet hotness
    '''

    def __init__(self, kernel='rbf', C=1e3, gamma=0.1, epsilon=0.1, n_comp=100):
        '''
        Prepare support vector regression
        '''
        self.svr = SVR(kernel=kernel, C=C, gamma=gamma, epsilon=epsilon,
                       verbose=True)
        #self.svr = LogisticRegression(random_state=42, verbose=0)
        self.n_comp = n_comp

    def fit_scaler(self, dev, i_dev):
        '''
        Train normalizers for features and importances
        '''
        # importance scaler
        self.std_scaler_i = sklearn.preprocessing.StandardScaler()
        self.std_scaler_i.fit(i_dev)
        self.norm = sklearn.preprocessing.StandardScaler()
        self.norm.fit(dev[:, 0:self.n_comp])

    def train(self, features, importances):
        '''
        Train regression
        '''
        importances = self.std_scaler_i.transform(importances)
        features = self.norm.transform(features[:, 0:self.n_comp])
        self.svr.fit(features, importances)

    def predict(self, features):
        '''
        Predict importances
        '''
        features = self.norm.transform(features[:, 0:self.n_comp])
        results = self.svr.predict(features)
        #print(results[0:100:5])
        results = self.std_scaler_i.inverse_transform(results)
        #print(results[0:100:5])
        return results
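# A minimal usage sketch for the class above, on random arrays; the data and
# shapes are hypothetical and `import sklearn.preprocessing` is assumed to be
# in scope, as the class requires. Note that predict() passes a 1-D array to
# StandardScaler.inverse_transform, which newer scikit-learn versions may
# reject; the sketch therefore only exercises fitting and training.
import numpy as np

rng = np.random.RandomState(0)
dev = rng.randn(200, 100)    # hypothetical feature matrix
i_dev = rng.rand(200, 1)     # hypothetical importance scores (2D for StandardScaler)

ht = HotTweets(n_comp=50)
ht.fit_scaler(dev, i_dev)
ht.train(dev, i_dev)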
def fit(self, start_date, end_date):
    for ticker in self.tickers:
        self.stocks[ticker] = Stock(ticker)
    params_svr = [{
        'kernel': ['rbf', 'sigmoid', 'linear'],
        'C': [0.01, 0.1, 1, 10, 100],
        'epsilon': [0.0000001, 0.000001, 0.00001]
    }]
    params = ParameterGrid(params_svr)

    # Find the split for training and CV
    mid_date = train_test_split(start_date, end_date)
    for ticker, stock in self.stocks.items():
        X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
        # X_train = self.pca.fit_transform(X_train.values)
        X_train = X_train.values
        # pdb.set_trace()
        X_cv, y_cv = stock.get_data(mid_date, end_date)
        # X_cv = self.pca.transform(X_cv.values)
        X_cv = X_cv.values
        lowest_mse = np.inf
        for i, param in enumerate(params):
            svr = SVR(**param)
            # ada = AdaBoostRegressor(svr)
            svr.fit(X_train, y_train.values)
            mse = mean_squared_error(y_cv, svr.predict(X_cv))
            if mse <= lowest_mse:
                lowest_mse = mse  # track the best score, not just the last fit
                self.models[ticker] = svr
    return self
def train_single_model(train_data, train_labels, algo):
    """ Train the model for a single label dimension """
    if algo == 'svr_rbf':
        """ SVM regression, RBF kernel """
        svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
        svr_rbf.fit(train_data, train_labels)
        return svr_rbf
    if algo == 'svr_lin':
        """ SVM regression, linear """
        svr_lin = SVR(kernel='linear')
        svr_lin.fit(train_data, train_labels)
        return svr_lin
    if algo == 'ridge':
        """ Ridge regression """
        clf = Ridge(alpha=0.5)
        clf.fit(train_data, train_labels)
        return clf
    # No matching algorithm
    print("unimplemented model type")
    return None
def train(self, x, y, param_names, random_search=100,
          kernel_cache_size=2000, **kwargs):
    if self._debug:
        print("First training sample\n", x[0])
    start = time.time()
    scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

    # Check that each input is between 0 and 1
    self._check_scaling(scaled_x=scaled_x)

    if self._debug:
        print("Shape of training data: ", scaled_x.shape)
        print("Param names: ", self._used_param_names)
        print("First training sample\n", scaled_x[0])
        print("Encode: ", self._encode)

    # Do a random search
    c, gamma = self._random_search(random_iter=random_search, x=scaled_x,
                                   y=y, kernel_cache_size=kernel_cache_size)

    # Now train model
    try:
        # random_state was accepted by the older scikit-learn SVR this
        # code targets; current releases no longer take that parameter.
        svr = SVR(gamma=gamma, C=c, random_state=self._rng,
                  cache_size=kernel_cache_size)
        svr.fit(scaled_x, y)
        self._model = svr
    except Exception as e:  # "except Exception, e" is Python 2 syntax
        print("Training failed", e)
        svr = None
def predict_device_byday_SVR():
    X, Y_unique, Y_all, X_raw = load_device_counter_byday()
    from sklearn.svm import SVR
    model = SVR()
    # model = SVR(kernel='linear')
    training_size = 160
    # model.fit(X[:training_size], Y_unique[:training_size])
    model.fit(X[:training_size], Y_all[:training_size])

    start_index = 180
    end_index = 190
    X_to_predict = X[start_index:end_index]
    # X_to_predict.append([date_str_toordinal('2017-04-18')])
    # X_to_predict.append([date_str_toordinal('2017-03-27')])
    print(X_to_predict)
    # Y_real = Y_unique[start_index:end_index]
    Y_real = Y_all[start_index:end_index]
    print(X_raw[start_index:end_index])

    y_predicted = model.predict(X_to_predict)
    # print(y_predicted)
    y_predicted = np.array(y_predicted).astype(int)
    print(y_predicted)
    print(Y_real)
    # print(y_predicted - np.array(Y_real))

    # plt.subplot(111)
    # plt.scatter(X_to_predict, Y_real, c='r')
    plt.scatter(X_to_predict, y_predicted)
    # plt.plot(X_to_predict, y_predicted)
    plt.show()
def main(args):
    (training_file, label_file, test_file, test_label, c, e) = args
    svr = SVR(C=float(c), epsilon=float(e), kernel='rbf')
    X = load_feat(training_file)
    y = [float(line.strip()) for line in open(label_file)]
    X = np.asarray(X)
    y = np.asarray(y)
    test_X = load_feat(test_file)
    test_X = np.asarray(test_X)
    test_X[np.isnan(test_X)] = 0
    svr.fit(X, y)
    pred = svr.predict(test_X)
    if test_label != 'none':
        test_y = [float(line.strip()) for line in open(test_label)]
        test_y = np.asarray(test_y)
        print('MAE: ', mean_absolute_error(test_y, pred))
        print('RMSE: ', sqrt(mean_squared_error(test_y, pred)))
        print('corrpearson: ', sp.stats.pearsonr(test_y, pred))
        print('r-sqr: ', sp.stats.linregress(test_y, pred)[2] ** 2)
        print(mquantiles(test_y, prob=[0.10, 0.90]))
        print(mquantiles(pred, prob=[0.10, 0.90]))
    with open(test_file + '.svr.pred', 'w') as output:
        for p in pred:
            print(p, file=output)
    return
def train_learning_model_svm(df):
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)
    regressor = SVR()
    regressor.fit(X_train, y_train)
    calculate_results(regressor, X_train, X_test, y_train, y_test)
def train_SVR(viper):
    from sklearn.svm import SVR
    model = SVR(C=10, kernel='rbf', shrinking=False, verbose=True)
    model.fit(viper.train_feat, viper.train_y)
    return model
def svr(self, X, y):
    """
    Train support vector regression model

    Parameters
    ----------
    X : numpy ndarray with numeric values
        Array containing input parameters for the model. Model will try to
        learn the output y[i] in terms of inputs X[i]
    y : columnar numpy array with numeric values
        Array containing single column of output values. Entry at y[i]
        corresponds to value of the underlying experiment for input
        parameters X[i]

    Returns
    -------
    result : model
        Model learnt from incoming input inputs and outputs
    """
    clf = SVR(C=1.0, epsilon=0.2)
    clf.fit(X, y)
    return clf
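# A minimal sketch of what this method does, shown standalone on synthetic
# data (the arrays and coefficients here are hypothetical, not from the
# original project); it uses the same SVR settings as the method above.
import numpy as np
from sklearn.svm import SVR

rng = np.random.RandomState(0)
X = rng.rand(50, 3)                                  # 50 experiments, 3 input parameters
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(50)

clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X, y)
print(clf.predict(X[:3]))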
def draw_svr_single(real_data, name):
    history = []
    for i in range(1, 32):
        h = [i]
        history.append(h)
    from sklearn.svm import SVR
    svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    svr_lin = SVR(kernel='linear', C=1e3)
    svr_poly = SVR(kernel='poly', C=1e3, degree=2)
    y_rbf = svr_rbf.fit(history, real_data).predict(history)
    y_lin = svr_lin.fit(history, real_data).predict(history)
    y_poly = svr_poly.fit(history, real_data).predict(history)

    import pylab as pl
    pl.scatter(history, real_data, c='k', label='data')
    # pl.hold('on')  # hold() was removed from matplotlib; overplotting is now the default
    pl.plot(history, y_rbf, c='g', label='RBF model')
    pl.plot(history, y_lin, c='r', label='Linear model')
    pl.plot(history, y_poly, c='b', label='Polynomial model')
    pl.xlabel('data')
    pl.ylabel('target')
    pl.title('Support Vector Regression: ' + name)
    pl.legend()
    pl.show()
def machinelearning(csv_file):
    # parse CSV
    d = {}
    d['date'] = []
    d['radiation'] = []
    d['humidity'] = []
    d['temperature'] = []
    d['wind'] = []
    d['demand'] = []
    dictreader = csv.DictReader(csv_file,
                                fieldnames=['date', 'radiation', 'humidity',
                                            'temperature', 'wind', 'demand'],
                                delimiter=',')
    next(dictreader)
    for row in dictreader:
        for key in row:
            d[key].append(row[key])

    # interpolate weather data
    interpolate(d['radiation'])
    interpolate(d['humidity'])
    interpolate(d['temperature'])
    interpolate(d['wind'])

    # train machine learning algorithm
    # zip() returns an iterator in Python 3, so materialize it before slicing
    samples = list(zip(d['radiation'], d['humidity'], d['temperature'], d['wind']))
    training_x = np.array(samples[:32])
    training_y = np.array(d['demand'][:32])
    poly_svr = SVR(kernel='poly', degree=2)
    poly_svr.fit(training_x, training_y)

    prediction_x = np.array(samples[32:])
    demand_predictions = poly_svr.predict(prediction_x)
    return demand_predictions
class SVR(PlayerModel):
    ### a wrapper for support vector regression using scikit-learn for this project
    def __init__(self):
        PlayerModel.__init__(self)
        # configure support vector regression and start training
        self.regr = SupportVectorRegression(kernel='linear', C=1000)
        self.regr.fit(self.dataset_X_train, self.dataset_Y_train)
        print("Finish building player model.")
        print("Parameters: ", self.regr.get_params())
        print("============================================================")

    def testScore(self, test_X):
        score = self.regr.predict(self.normalizeTest(test_X))
        return np.mean(score)

    def getParams(self):
        return self.regr.get_params()

    def visualize(self):
        x = np.zeros((10, self.col - 1))
        mean = self.dataset_X_train.mean(0)
        for i in range(10):
            x[i, :] = mean
        x[:, 0:1] = np.array([np.arange(0.0, 1.1, 0.11)]).T
        # print(x)
        y = self.regr.predict(x)
        # print(y)
        pyplot.scatter(self.dataset_X_train[:, 0:1], self.dataset_Y_train,
                       c='k', label='data')
        # pyplot.hold('on')  # hold() was removed from matplotlib
        pyplot.plot(x[:, 0:1], y, c='r', label='Support Vector Regression')
        pyplot.xlabel('data collect from player')
        pyplot.ylabel('score')
        pyplot.title('Support Vector Regression')
        pyplot.legend()
        pyplot.show()
def CaSVRModel(X_train, Y_train, X_test, Y_test, cv_iterator):
    # param_grid = {'C': [10000],
    #               'epsilon': [0.001, 0.01, 0.05, 0.1, 0.15, 1]}
    #
    # svr = SVR(random_state=42, cache_size=1000, verbose=2)
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error",
    #                       n_jobs=1, iid=True, cv=cv_iterator)
    # search.fit(X_train, Y_train["Ca"])
    # # search.grid_scores_
    # model = search.best_estimator_

    # scaler = StandardScaler()
    model = SVR(C=10000, epsilon=0.01, cache_size=1000)
    model.fit(X_train, Y_train["Ca"])
    # test = cross_val_score(svr, X_train.astype('float64'),
    #                        Y_train["Ca"].astype('float64'),
    #                        scoring="mean_squared_error", cv=cv_iterator)
    yhat_svr = model.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Ca"], yhat_svr))
    return model, test_error
def train_svm(train_file, avg={}):
    test_X, test_Y, weight = load_data(train_file, avg)
    svr = SVR(kernel='rbf', C=100, gamma=1, verbose=True, cache_size=1024)
    print("start train")
    svr.fit(test_X, test_Y)
    print("train finish")
    return svr
def Sand_SVR(X_train, Y_train, X_test, Y_test, cv_iterator):
    #===========================================================================
    # param_grid = {'C': [100, 500, 1000, 5000, 10000, 100000],
    #               'epsilon': [0.075, 0.1, 0.125]}
    #
    # svr = SVR(cache_size=1000, random_state=42)
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error",
    #                       cv=cv_iterator)
    #===========================================================================
    # search.fit(X_train, Y_train["Sand"])
    # search.grid_scores_
    # svr = search.best_estimator_
    # svr.fit(X_train, Y_train["SAND"])
    # test = cross_val_score(svr, X_train.astype('float64'),
    #                        Y_train["Ca"].astype('float64'),
    #                        scoring="mean_squared_error", cv=cv_iterator)

    svr = SVR(C=10000)
    svr.fit(X_train, Y_train["Sand"])
    yhat_svr = svr.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Sand"], yhat_svr))
    return svr, test_error
def RunSVRScikit(q):
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)

    # Use the last row of the training set as the responses.
    X, y = SplitTrainData(self.dataset)

    # Get all the parameters.
    c = re.search("-c (\d+\.\d+)", options)
    e = re.search("-e (\d+\.\d+)", options)
    g = re.search("-g (\d+\.\d+)", options)

    C = 1.0 if not c else float(c.group(1))
    epsilon = 1.0 if not e else float(e.group(1))
    gamma = 0.1 if not g else float(g.group(1))

    try:
        with totalTimer:
            # Perform SVR.
            model = SSVR(kernel='rbf', C=C, epsilon=epsilon, gamma=gamma)
            model.fit(X, y)
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def train_model(train, test, labels):
    clf = SVR(C=1.0, epsilon=0.2)
    clf.fit(train, labels)
    # clf = GaussianNB()
    # clf.fit(train, labels)
    print("Good!")
    predictions = clf.predict(test)
    print(predictions.shape)
    predictions = pd.DataFrame(predictions, columns=['relevance'])
    print("Good again!")
    print("Predictions head -------")
    print(predictions.head())
    print(predictions.shape)
    print("TEST head -------")
    print(test.head())
    print(test.shape)
    test['id'].to_csv("TEST_TEST.csv", index=False)
    predictions.to_csv("PREDICTIONS.csv", index=False)
    # test = test.reset_index()
    # predictions = predictions.reset_index()
    # test = test.groupby(level=0).first()
    # predictions = predictions.groupby(level=0).first()
    predictions = pd.concat([test['id'], predictions], axis=1,
                            verify_integrity=False)
    print(predictions)
    return predictions
def learn(X, y):
    # do pca
    pca = PCA(n_components=6)
    pca_6 = pca.fit(X)
    print('variance ratio')
    print(pca_6.explained_variance_ratio_)
    X = pca.fit_transform(X)
    # X = np.concatenate((X_pca[:, 0].reshape(X.shape[0], 1),
    #                     X_pca[:, 5].reshape(X.shape[0], 1)), axis=1)

    # do svr
    svr_rbf = SVR(kernel='rbf', C=1)
    svr_rbf.fit(X, y)
    # print(model_rbf)
    y_rbf = svr_rbf.predict(X)
    print(y_rbf)
    print(y)

    # see difference
    y_rbf = np.transpose(y_rbf)
    deviation(y, y_rbf)

    # pickle model
    with open('rbfmodel.pkl', 'wb') as f:
        pickle.dump(svr_rbf, f)
    with open('pcamodel.pkl', 'wb') as f:
        pickle.dump(pca_6, f)
class SVMLearner(object):
    def __init__(self, kernel="linear", C=1e3, gamma=0.1, degree=2,
                 verbose=False):
        self.name = "{} Support Vector Machine Learner".format(kernel.capitalize())
        self.kernel = kernel
        if kernel == "linear":
            self.svr = SVR(kernel=kernel, C=C)
        elif kernel == "rbf":
            self.svr = SVR(kernel=kernel, C=C, gamma=gamma)
        elif kernel == "poly":
            self.svr = SVR(kernel=kernel, C=C, degree=degree)

    def addEvidence(self, dataX, dataY):
        """
        @summary: Add training data to learner
        @param dataX: X values of data to add
        @param dataY: the Y training values
        """
        # build and save the model
        self.svr.fit(dataX, dataY)

    def query(self, points):
        """
        @summary: Estimate a set of test points given the model we built.
        @param points: should be a numpy array with each row corresponding to a specific query.
        @returns the estimated values according to the saved model.
        """
        return self.svr.predict(points)
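# A minimal usage sketch for SVMLearner on synthetic data (the arrays below
# are hypothetical, not from the original project):
import numpy as np

rng = np.random.RandomState(42)
dataX = rng.rand(100, 4)
dataY = dataX.sum(axis=1) + 0.05 * rng.randn(100)

learner = SVMLearner(kernel="rbf", C=1e3, gamma=0.1)
learner.addEvidence(dataX, dataY)
print(learner.query(dataX[:5]))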
def test_regression_custom_mse():
    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    svm = SVR(kernel='rbf', gamma='auto')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric=mean_squared_error,
        num_rounds=1,
        seed=123)

    norm_imp_vals = imp_vals / np.abs(imp_vals).max()

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert norm_imp_vals[0] == -1.
def train(self, pairings):
    X, Y = self.getXY(pairings)
    self.svms = []
    for i in range(self.wine_feat_len):
        svm = SVR(kernel='rbf')
        svm.fit(X, Y[:, i])
        self.svms.append(svm)
def test_regression():
    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    svm = SVR(kernel='rbf')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='r2',
        num_rounds=1,
        seed=123)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert sum(imp_vals[3:]) <= 0.01
Y = Y.reshape(-1, 1)

# In[37]:

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_Y = StandardScaler()
X = sc_X.fit_transform(X)
Y = sc_Y.fit_transform(Y)

# In[39]:

from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, Y)

# In[40]:

# Note: recent scikit-learn versions require a 2D array for inverse_transform,
# so the prediction may need a .reshape(-1, 1) first.
Y_Pred = sc_Y.inverse_transform(
    regressor.predict(sc_X.transform(np.array([6.5]).reshape(-1, 1))))

# In[42]:

import matplotlib.pyplot as plt
plt.scatter(X, Y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Regression Results')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
def run_vary_cutoff(arg):
    k, cat = arg
    ibp = IBP(cutoff=k, enable_cluster=True, n_max_iter=10000)
    ibp.fit(training_votes[2], training_votes[1], cats=training_cats[:, 0])
    y_per = training_votes[1] / training_votes[2].astype(float)
    y_ibp = ibp(training_votes[2], training_votes[1], training_cats[:, 0])
    clf_per = SVR(**cat_parameters['PER'][cat])
    clf_ibp = SVR(**cat_parameters['IBP'][cat])

    def select(cats, values):
        # keep only the entries belonging to the current category
        return np.array([v for c, v in zip(cats, values) if c == cat])

    X = select(training_cats[:, 0], training_x)
    X_tsb = select(tsb_cats[:, 0], tsb_x)
    y_tsb = select(tsb_cats[:, 0], tsb_truth)

    clf_per.fit(X, select(training_cats[:, 0], y_per))
    clf_ibp.fit(X, select(training_cats[:, 0], y_ibp))

    tsb_y_hat_per = clf_per.predict(X_tsb)
    tsb_y_hat_ibp = clf_ibp.predict(X_tsb)

    mse_tsb_per = ((tsb_y_hat_per - y_tsb) ** 2).mean()
    mae_tsb_per = abs(tsb_y_hat_per - y_tsb).mean()
    rmse_tsb_per = mse_tsb_per ** 0.5
    mse_tsb_ibp = ((tsb_y_hat_ibp - y_tsb) ** 2).mean()
    mae_tsb_ibp = abs(tsb_y_hat_ibp - y_tsb).mean()
    rmse_tsb_ibp = mse_tsb_ibp ** 0.5

    row = [
        2, cat, 'tsb',
        (training_cats[:, 0] == cat).astype(int).sum(),
        (tsb_cats[:, 0] == cat).astype(int).sum(),
        mse_tsb_per, mse_tsb_ibp, (mse_tsb_per - mse_tsb_ibp) / mse_tsb_per,
        mae_tsb_per, mae_tsb_ibp, (mae_tsb_per - mae_tsb_ibp) / mae_tsb_per,
        rmse_tsb_per, rmse_tsb_ibp,
        (rmse_tsb_per - rmse_tsb_ibp) / rmse_tsb_per,
    ]
    print(*row)
    return [row + [ttest_rel(tsb_y_hat_per, tsb_y_hat_ibp).pvalue]]
def home():
    """Renders the home page."""
    if request.method == 'POST':
        ticker = request.form.get("ticker")
        ticker = ticker.upper()
        sns.set_style("whitegrid")
        files = []
        files_SMA = []
        # ticker_list = ticker.split()
        ticker_list = ticker
        today = date.today()
        start_date = "2016-01-01"
        end_date = today

        def getData(ticker):
            # downloading data
            try:
                data = pdr.get_data_yahoo(ticker, start=start_date, end=end_date)
                # dataname = ticker + '_' + str(start_date) + '-' + str(end_date)
                data['SMA_200'] = data.iloc[:, 5].rolling(window=200).mean()
                data['SMA_50'] = data.iloc[:, 5].rolling(window=50).mean()
                files_SMA.append(ticker)
                files.append(ticker)
                SaveData(data, ticker)
            except RemoteDataError:
                pass

        def SaveData(df, filename):
            # df.to_csv('./data/' + filename + ".csv")
            dnew = df.iloc[200:]
            dnew.to_csv(filename + '.csv')

        # def SMA(filename):
        #     df = pd.read_csv(filename + ".csv")
        #     df['SMA_200'] = df.iloc[:, 5].rolling(window=200).mean()
        #     df['SMA_50'] = df.iloc[:, 5].rolling(window=50).mean()
        #     files_SMA.append(filename)
        #     SaveData(df, filename)

        # for tik in ticker_list:
        #     getData(tik)
        getData(ticker_list)
        # for i in files:
        #     SMA(i)

        filename = ticker
        # filename = input('enter ticker symbol: ')
        df = pd.read_csv(filename + '.csv')

        # Remove the date
        del df['Date']
        # A variable for predicting 'n' days out into the future
        forecast_out = 30  # 'n=30' days
        # Create another column (the target) shifted 'n' units up
        df['Prediction'] = df[['Adj Close']].shift(-forecast_out)
        # print(df.tail())

        # Convert the dataframe to a numpy array
        X = np.array(df.drop(['Prediction'], 1))
        # Remove the last '30' rows
        X = X[:-forecast_out]
        # Convert the data frame to a numpy array
        y = np.array(df['Prediction'])
        # Get all of the y values except the last '30' rows
        y = y[:-forecast_out]

        # Split the data into 80% training and 20% testing
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Create and train the Support Vector Machine (Regressor)
        svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
        svr_rbf.fit(x_train, y_train)
        svr_confidence = 0
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        stop = 1
        # while loop to get the best results
        while svr_confidence <= stop:
            x_train, x_test, y_train, y_test = train_test_split(X, y,
                                                                test_size=0.2)
            svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
            svr_rbf.fit(x_train, y_train)
            svr_confidence = svr_rbf.score(x_test, y_test)
            stop -= 0.01

        # Create and train the Linear Regression Model
        lr = LinearRegression()
        lr.fit(x_train, y_train)
        lr_confidence = 0
        stop = 1
        # while loop to get the best results
        while lr_confidence <= stop:
            lr.fit(x_train, y_train)
            lr_confidence = lr.score(x_test, y_test)
            stop -= 0.01

        # Set x_forecast equal to the last 30 rows of the original data set
        # from the Adj. Close column
        x_forecast = np.array(df.drop(['Prediction'], 1))[-forecast_out:]

        # Print linear regression model predictions for the next '30' days
        lr_prediction = lr.predict(x_forecast)
        old = df[['Adj Close']]
        for x in lr_prediction:
            new_row = {
                'Open': 0,
                'High': 0,
                'Low': 0,
                'Close': 0,
                'Adj Close': x,
                'Volume': 0
            }
            df = df.append(new_row, ignore_index=True)
        # svm_prediction = svr_rbf.predict(x_forecast)

        plt.rcParams.update({'font.size': 18})
        plt.figure(figsize=(15, 11))
        plt.xlim([len(df) - 100, len(df) - 1])
        plt.ylim([(df['Adj Close'].tail(100).min()
                   - df['Adj Close'].tail(100).min() * 0.1),
                  (df['Adj Close'].tail(100).max()
                   + df['Adj Close'].tail(100).max() * 0.1)])
        plt.plot(df['Adj Close'], color='red', label='Predicted Price')
        plt.plot(old['Adj Close'], color='k', label="Past Data")
        plt.plot(df['SMA_200'], color='b', label='SMA 200')
        plt.plot(df['SMA_50'], color='g', label='SMA 50')
        # plt.yticks(np.arange(int(df['Adj Close'].tail(100).min() * 1.1),
        #                      int(df['Adj Close'].tail(100).max() * 1.1),
        #                      step=(int(df['Adj Close'].tail(100).max() * 1.1)) / 10))
        plt.title("30 Day Prediction of " + filename)
        plt.xlabel("Days")
        plt.ylabel("Adj. Close Price $")
        plt.legend()
        plt.savefig("FlaskWebProject1\\static\\images\\_graph.png")
        # plt.show()

        return render_template(
            'index.html',
            title='Home Page',
            url='static/images/_graph.png',
            year=datetime.now().year,
        )
    else:
        return render_template(
            'index.html',
            title='Home Page',
            year=datetime.now().year,
        )
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1)).ravel()  # StandardScaler expects a 2D array

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Predicting a new result (scale the input, then un-scale the prediction)
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_X.transform([[6.5]])).reshape(-1, 1))

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
def clip_to_100(val):
    if val < 0:
        return 0
    if val > 100:
        return 100
    return val

# In[6]:

train_df[input_var_names] = train_df.word.apply(get_features)

# In[7]:

valid_df[input_var_names] = valid_df.word.apply(get_features)

# In[11]:

predict_df = valid_df.copy()
for feat_name in output_var_names:
    #model = LinearRegression()
    model = SVR()
    model.fit(train_df[input_var_names], train_df[feat_name])
    predict_df[feat_name] = model.predict(predict_df[input_var_names])
    predict_df[feat_name] = predict_df[feat_name].apply(clip_to_100)

# In[12]:

src.eval_metric.evaluate(predict_df, valid_df)
# Inspecting and describing the dataset
df.describe()
df.head(5)

# Defining the independent and dependent variables
X = df.loc[:, 'LotArea'].values.reshape(-1, 1)
y = df.loc[:, 'SalePrice'].values.reshape(-1, 1)

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Feature scaling
X_train = feature_scaling(X_train)
X_test = feature_scaling(X_test)

# Training the regression model on the training set
regressor = SVR(kernel='rbf')
regressor.fit(X_train, y_train)

# Evaluating the model with the r2 metric
regressor.score(X_test, y_test)

# Predicting the results on the test set
y_pred = regressor.predict(X_test)

# Visualizing the training set results
plot_results_reg(X_train, y_train, regressor, 'SVR (training set)')

# Visualizing the test set results
plot_results_reg(X_test, y_test, regressor, 'SVR (test set)')
target = 'G3'
X = np.array(df_new.drop([target], 1))
y = np.array(df_new[target])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)

linear = linear_model.LinearRegression()
tree = DecisionTreeRegressor()
svr = SVR(kernel='rbf', C=8)

linear.fit(X_train, y_train)
tree.fit(X_train, y_train)
svr.fit(X_train, y_train)

linear_predict = pd.DataFrame(linear.predict(X_test))
tree_predict = pd.DataFrame(tree.predict(X_test))
svr_predict = pd.DataFrame(svr.predict(X_test))
y_real = pd.DataFrame(y_test)

new_df = pd.DataFrame()
new_df[['y_real']] = y_real
new_df[['linear_predict']] = linear_predict
new_df[['tree_predict']] = tree_predict
new_df[['svr_predict']] = svr_predict
print(new_df.head())
def apply_regression(filename, inCol, outCol, predValues):
    # read input/output columns from filename
    degree, rank, isString = load_all(filename, inCol, outCol)
    printV(list(zip(degree, rank)))
    # TODO: check that output column can't be string
    if len(degree) == 0 or len(rank) == 0:
        print('ERROR : Input or Output data is empty.')
        return []

    # REPLACE STRINGS with numbers
    # degree, uniqDegree = removeStrings(degree, isString)
    degree = array(degree)
    rank = array(rank)
    # degree = addStrings(degree, uniqDegree)

    degree, Dscalers = normalizeColumns(degree)
    print(degree)
    rank, Rscalers = normalizeColumns(rank)

    # generate prediction inputs
    order = []
    pv = []
    for e in predValues:
        v = linspace(e[1], (e[1] + (e[2] - 1) * e[3]), e[2])
        v = v.reshape(-1, 1)
        if len(pv) == 0:
            pv = v
        else:
            pv = hstack((pv, v))
        order.append(e[0])

    runs = 1
    # convert both degree, rank to log scale for better prediction accuracy
    # rank = [log(x) for x in rank]
    # degree = [log(x) for x in degree]
    degree = array(degree)
    rank = array(rank)

    if pri > 3:
        printV(list(zip(degree, rank)))
    print(degree.shape, rank.shape)
    N = len(degree)
    AvgErr = 0
    AvgWerr = 0
    for rr in range(0, runs):
        degree, rank = doShuffle(degree, rank)

    if len(inCol) == 1:
        degree = degree.reshape(-1, 1)
    if len(rank.shape) == 1:
        rank = rank.reshape(-1, 1)
    printV(list(zip(degree, rank)))

    # split data into training and testing instances; splitRatio determines
    # how many instances are used for training vs. testing (0.9 = 90% train)
    splitRatio = 0.9
    spl = int(splitRatio * N)  # split location for train-test
    trI = array(degree[:spl])  # trI - training instances
    trL = array(rank[:spl])    # trL - training labels
    teI = array(degree[spl:])  # teI - testing instances
    teL = array(rank[spl:])    # teL - testing labels
    trI = trI.astype('float')
    teI = teI.astype('float')

    print(trI.shape, trL.shape)
    print("Train : ", int(splitRatio * N), "\t Test: ",
          int((1.0 - splitRatio) * N))

    useSVM = 1
    NoInputs = 1
    ignoreExtra = 1
    svr = SVR()
    # An MLPRegressor alternative for useSVM == 0 was disabled in the
    # original source.
    # gamma is a fitting parameter: small gamma -> simpler curve,
    # larger gamma -> more complex curve
    svr = SVR(C=100, cache_size=200, epsilon=0.00001, gamma=3, kernel='rbf',
              max_iter=-1, shrinking=True, tol=0.000001, verbose=False)

    # train the regression model
    svr.fit(trI, trL)
    # test model to get accuracy; 'res' represents how well the regression
    # model is learned.
    res = svr.score(teI, teL)
    # It is defined as (1 - u/v), where u is the residual sum of squares
    # ((y_true - y_pred) ** 2).sum() and v is the total sum of squares
    # ((y_true - y_true.mean()) ** 2).sum()
    if pri > 2:
        print('Accuracy measure: ', res)

    # predict label/rank for test instances/degrees for calculating error
    yres = svr.predict(teI)
    sum = 0
    wsum = 0
    if pri > 2:
        print('Predicted', '\t', 'Actual Rank')
    linSum = 0
    # (the per-instance deviation report and average-error accounting that
    #  followed here was disabled in the original source)

    pv = normalizeValues(pv, Dscalers)
    Pred = svr.predict(pv)

    if len(Pred.shape) == 1:
        Pred = Pred.reshape(-1, 1)
    Pred = deNormalizeColumns(Pred, Rscalers)
    trI = deNormalizeColumns(trI, Dscalers)
    trL = deNormalizeColumns(trL, Rscalers)
    teI = deNormalizeColumns(teI, Dscalers)
    teL = deNormalizeColumns(teL, Rscalers)
    pv = deNormalizeColumns(pv, Dscalers)

    # show plot of predicted rank (crosses) and actual rank (dots).
    # NOTE: x-axis is degree; both axes were normalized above.
    # z = array(sorted(zip(teL, yres, teI[:, 0])))
    # plt.plot(z[:, 2], z[:, 1], 'x', ms=3)
    # plt.plot(z[:, 2], z[:, 0], '-', ms=2)
    plt.plot(trI[:, 0], trL, 'o', ms=2)
    # plt.plot(teI[:, 0], teL, 'o', ms=5)
    plt.plot(pv, Pred, 'x', ms=3)
    # plt.xlabel('Year')
    # plt.ylabel('Mortality Rate')
    # plt.show()
    savefig('C:\\xampp1\\htdocs\\ogd\\visual.jpg')

    pv = pv.tolist()
    Pred = [e[0] for e in Pred]
    pv = [e[0] for e in pv]
    result = [[e[0], e[1]] for e in zip(pv, Pred)]
    return result

# Pr = apply_regression('data.txt', [0], [1], [[0, 2013, 20, 1]])
# printV(Pr)
def impute_regression(x_train, y_train, x_test):
    svr_rbf = SVR(kernel='rbf', C=1, gamma=0.15)
    model = svr_rbf.fit(x_train, y_train)
    y_test = model.predict(x_test)
    return y_test
# The mathematical definition of "kernels" and "support vector machines" is
# beyond the scope of this course. We encourage interested readers with a
# mathematical training to have a look at the scikit-learn [documentation on
# SVMs](https://scikit-learn.org/stable/modules/svm.html) for more details.
#
# For the rest of us, let us just develop some intuitions on the relative
# expressive power of support vector machines with linear and non-linear
# kernels by fitting them on the same dataset.
#
# First, consider a support vector machine with a linear kernel:

# %%
from sklearn.svm import SVR

svr = SVR(kernel="linear")
svr.fit(data, target)
target_predicted = svr.predict(data)
mse = mean_squared_error(target, target_predicted)

# %%
ax = sns.scatterplot(data=full_data, x="input_feature", y="target")
ax.plot(data, target_predicted, color="tab:orange")
_ = ax.set_title(f"Mean squared error = {mse:.2f}")

# %% [markdown]
#
# The predictions of our SVR with a linear kernel are all aligned on a straight
# line. `SVR(kernel="linear")` is indeed yet another example of a linear model.
#
# The estimator can also be configured to use a non-linear kernel. Then, it can
# learn a prediction function that computes non-linear interaction between
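# %% [markdown]
# Picking up that thread, here is a minimal sketch of the non-linear variant
# the text introduces, assuming the same `data`, `target`, and `full_data`
# objects as above (the degree-3 polynomial kernel is an illustrative choice,
# not necessarily the one the original notebook used):

# %%
svr_poly = SVR(kernel="poly", degree=3)
svr_poly.fit(data, target)
target_predicted_poly = svr_poly.predict(data)
mse_poly = mean_squared_error(target, target_predicted_poly)

# %%
ax = sns.scatterplot(data=full_data, x="input_feature", y="target")
ax.plot(data, target_predicted_poly, color="tab:orange")
_ = ax.set_title(f"Mean squared error = {mse_poly:.2f}")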
def train(DO_x, DO_y):
    DO_net = SVR(kernel='rbf')
    DO_net.fit(DO_x, DO_y)
    return DO_net
sc_y_DR.fit(y_DR_train)
sc_y_VT.fit(y_VT_train)
sc_y_VV.fit(y_VV_train)

# transform training dataset
y_DR_train = sc_y_DR.transform(y_DR_train)
y_VT_train = sc_y_VT.transform(y_VT_train)
y_VV_train = sc_y_VV.transform(y_VV_train)

# transform test dataset
y_DR_test = sc_y_DR.transform(y_DR_test)
y_VT_test = sc_y_VT.transform(y_VT_test)
y_VV_test = sc_y_VV.transform(y_VV_test)

regr = SVR(kernel='rbf', gamma='scale', C=100., epsilon=0.01, coef0=0.0)
regr = MultiOutputRegressor(estimator=regr)

regr.fit(x_DR_train, y_DR_train)
y_DR_regr = regr.predict(x_DR_test)

regr.fit(x_VT_train, y_VT_train)
y_VT_regr = regr.predict(x_VT_test)

regr.fit(x_VV_train, y_VV_train)
y_VV_regr = regr.predict(x_VV_test)

# open a file to append
#outF = open("output_MO.txt", "a")
#print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit, file=outF)
#print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict), file=outF)
#print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_regr), file=outF)
#print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_regr), file=outF)
#print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_regr)), file=outF)
#outF.close()
y_train = target[:480]
x_test = data[480:]
y_true = target[480:]

line = LinearRegression()
lasso = Lasso()
ridge = Ridge()
tree = DecisionTreeRegressor()
svr = SVR()

line.fit(x_train, y_train)
lasso.fit(x_train, y_train)
ridge.fit(x_train, y_train)
tree.fit(x_train, y_train)
svr.fit(x_train, y_train)

line_y_pre = line.predict(x_test)
lasso_y_pre = lasso.predict(x_test)
ridge_y_pre = ridge.predict(x_test)
tree_y_pre = tree.predict(x_test)
svr_y_pre = svr.predict(x_test)

# Use plain variables for the scores; assigning to estimator.score would
# shadow each estimator's score() method.
line_score = r2_score(y_true, line_y_pre)
lasso_score = r2_score(y_true, lasso_y_pre)
ridge_score = r2_score(y_true, ridge_y_pre)
tree_score = r2_score(y_true, tree_y_pre)
svr_score = r2_score(y_true, svr_y_pre)

print(line_score)
print(lasso_score)
print(ridge_score)
# Feature scaling (SVR does not apply it automatically)
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y)

# fitting SVR
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
#regressor = SVR(kernel='linear')
regressor.fit(x, y)

y_pred = regressor.predict(x)
print(y)
print(y_pred)

plt.scatter(x, y, c='green', label="data points")
plt.plot(x, y_pred, label="predicted line")
plt.xlabel("X parameters")
plt.ylabel("Y parameters")
plt.legend()
plt.show()
save_fig("svm_regression_plot") plt.show() # In[30]: np.random.seed(42) m = 100 X = 2 * np.random.rand(m, 1) - 1 y = (0.2 + 0.1 * X + 0.5 * X**2 + np.random.randn(m, 1) / 10).ravel() # In[31]: from sklearn.svm import SVR svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1) svm_poly_reg.fit(X, y) # In[32]: from sklearn.svm import SVR svm_poly_reg1 = SVR(kernel="poly", degree=2, C=100, epsilon=0.1) svm_poly_reg2 = SVR(kernel="poly", degree=2, C=0.01, epsilon=0.1) svm_poly_reg1.fit(X, y) svm_poly_reg2.fit(X, y) # In[33]: plt.figure(figsize=(9, 4)) plt.subplot(121) plot_svm_regression(svm_poly_reg1, X, y, [-1, 1, 0, 1])
X_data = data_after_lag.iloc[:, 2:]
y_data = transformation_fn(tran_type=tran_type,
                           data=data_after_lag["Total_Daily_Trnx"])
X_train, X_test, y_train, y_test = split_train_test(
    X_data=X_data, y_data=y_data, split_type=split_type, test_size=test_size)

for k in ker:
    for g in gam:
        for costi in cost:
            svmFit = SVR(kernel=str(k), gamma=g, C=costi, verbose=False)
            svmFit.fit(X_train, y_train)
            MAPE = accuracy_metric(metric="MAPE",
                                   actual=y_test,
                                   pred=svmFit.predict(X_test),
                                   tran_type=tran_type)
            result_MAPE.append([
                s_type, fc, l, svmFit.kernel, svmFit.gamma, svmFit.C,
                svmFit.epsilon, svmFit.tol, svmFit.degree, MAPE
            ])
            print(
                str(ind) + "/" + str(Total_run),
                [
                    s_type, fc, l, svmFit.kernel, svmFit.gamma, svmFit.C,
                    svmFit.epsilon, svmFit.tol, svmFit.degree, MAPE
                ])
            ind = ind + 1
class TimeSeriesSVR(TimeSeriesSVMMixin, RegressorMixin, TimeSeriesBaseEstimator):
    """Time-series specific Support Vector Regressor.

    Parameters
    ----------
    C : float, optional (default=1.0)
        Penalty parameter C of the error term.

    kernel : string, optional (default='gak')
        Specifies the kernel type to be used in the algorithm.
        It must be one of 'gak' or a kernel accepted by ``sklearn.svm.SVC``.
        If none is given, 'gak' will be used. If a callable is given it is
        used to pre-compute the kernel matrix from data matrices; that matrix
        should be an array of shape ``(n_samples, n_samples)``.

    degree : int, optional (default=3)
        Degree of the polynomial kernel function ('poly').
        Ignored by all other kernels.

    gamma : float, optional (default='auto')
        Kernel coefficient for 'gak', 'rbf', 'poly' and 'sigmoid'.
        If gamma is 'auto' then:

        - for 'gak' kernel, it is computed based on a sampling of the training
          set (cf :ref:`tslearn.metrics.gamma_soft_dtw <fun-tslearn.metrics.gamma_soft_dtw>`)
        - for other kernels (eg. 'rbf'), 1/n_features will be used.

    coef0 : float, optional (default=0.0)
        Independent term in kernel function.
        It is only significant in 'poly' and 'sigmoid'.

    tol : float, optional (default=1e-3)
        Tolerance for stopping criterion.

    epsilon : float, optional (default=0.1)
        Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within
        which no penalty is associated in the training loss function with
        points predicted within a distance epsilon from the actual value.

    shrinking : boolean, optional (default=True)
        Whether to use the shrinking heuristic.

    cache_size : float, optional (default=200.0)
        Specify the size of the kernel cache (in MB).

    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel for GAK cross-similarity matrix
        computations.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See scikit-learns'
        `Glossary <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
        for more details.

    verbose : int, default: 0
        Enable verbose output. Note that this setting takes advantage of a
        per-process runtime setting in libsvm that, if enabled, may not work
        properly in a multithreaded context.

    max_iter : int, optional (default=-1)
        Hard limit on iterations within solver, or -1 for no limit.

    Attributes
    ----------
    support_ : array-like, shape = [n_SV]
        Indices of support vectors.

    support_vectors_ : array of shape [n_SV, sz, d]
        Support vectors in tslearn dataset format

    dual_coef_ : array, shape = [1, n_SV]
        Coefficients of the support vector in the decision function.

    coef_ : array, shape = [1, n_features]
        Weights assigned to the features (coefficients in the primal problem).
        This is only available in the case of a linear kernel.
        `coef_` is readonly property derived from `dual_coef_` and
        `support_vectors_`.

    intercept_ : array, shape = [1]
        Constants in decision function.

    sample_weight : array-like, shape = [n_samples]
        Individual weights for each sample

    svm_estimator_ : sklearn.svm.SVR
        The underlying sklearn estimator

    Examples
    --------
    >>> from tslearn.generators import random_walk_blobs
    >>> X, y = random_walk_blobs(n_ts_per_blob=10, sz=64, d=2, n_blobs=2)
    >>> import numpy
    >>> y = y.astype(numpy.float) + numpy.random.randn(20) * .1
    >>> reg = TimeSeriesSVR(kernel="gak", gamma="auto")
    >>> reg.fit(X, y).predict(X).shape
    (20,)
    >>> sv = reg.support_vectors_
    >>> sv.shape  # doctest: +ELLIPSIS
    (..., 64, 2)
    >>> sv.shape[0] <= 20
    True

    References
    ----------
    Fast Global Alignment Kernels.
    Marco Cuturi.
    ICML 2011.
""" def __init__(self, C=1.0, kernel="gak", degree=3, gamma="auto", coef0=0.0, tol=0.001, epsilon=0.1, shrinking=True, cache_size=200, n_jobs=None, verbose=0, max_iter=-1): self.C = C self.kernel = kernel self.degree = degree self.gamma = gamma self.coef0 = coef0 self.tol = tol self.epsilon = epsilon self.shrinking = shrinking self.cache_size = cache_size self.n_jobs = n_jobs self.verbose = verbose self.max_iter = max_iter @property def n_iter_(self): warnings.warn('n_iter_ is always set to 1 for TimeSeriesSVR, since ' 'it is non-trivial to access the underlying libsvm') return 1 @deprecated def support_vectors_time_series_(self, X=None): """Support vectors as time series. Parameters ---------- X : array-like of shape=(n_ts, sz, d) Training time series dataset. """ if X is not None: warnings.warn('The use of ' '`support_vectors_time_series_` is deprecated in ' 'tslearn v0.4 and will be removed in v0.6. Use ' '`support_vectors_` property instead.') check_is_fitted(self, '_X_fit') return self._X_fit[self.svm_estimator_.support_] @property def support_vectors_(self): check_is_fitted(self, '_X_fit') return self._X_fit[self.svm_estimator_.support_] def fit(self, X, y, sample_weight=None): """Fit the SVM model according to the given training data. Parameters ---------- X : array-like of shape=(n_ts, sz, d) Time series dataset. y : array-like of shape=(n_ts, ) Time series labels. sample_weight : array-like of shape (n_samples,), default=None Per-sample weights. Rescale C per sample. Higher weights force the classifier to put more emphasis on these points. """ sklearn_X, y = self._preprocess_sklearn(X, y, fit_time=True) self.svm_estimator_ = SVR(C=self.C, kernel=self.estimator_kernel_, degree=self.degree, gamma=self.gamma_, coef0=self.coef0, shrinking=self.shrinking, tol=self.tol, cache_size=self.cache_size, verbose=self.verbose, max_iter=self.max_iter) self.svm_estimator_.fit(sklearn_X, y, sample_weight=sample_weight) return self def predict(self, X): """Predict class for a given set of time series. Parameters ---------- X : array-like of shape=(n_ts, sz, d) Time series dataset. Returns ------- array of shape=(n_ts, ) or (n_ts, dim_output), depending on the shape of the target vector provided at training time. Predicted targets """ sklearn_X = self._preprocess_sklearn(X, fit_time=False) return self.svm_estimator_.predict(sklearn_X) def _more_tags(self): return { 'non_deterministic': True, 'allow_nan': True, 'allow_variable_length': True }
def filter_genes_dispersion(data, flavor='seurat', min_disp=None, max_disp=None,
                            min_mean=None, max_mean=None, n_bins=20,
                            n_top_genes=None, log=True, copy=False):
    """Extract highly variable genes.

    The normalized dispersion is obtained by scaling with the mean and
    standard deviation of the dispersions for genes falling into a given bin
    for mean expression of genes. This means that for each bin of mean
    expression, highly variable genes are selected.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows
        correspond to cells and columns to genes.
    flavor : {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing
        'seurat', this expects non-logarithmized data - the logarithm of mean
        and dispersion is taken internally when `log` is at its default value
        `True`. For 'cell_ranger', this is usually called for logarithmized
        data - in this case you should set `log` to `False`. In their default
        workflows, Seurat passes the cutoffs whereas Cell Ranger passes
        `n_top_genes`.
    min_mean=0.0125, max_mean=3, min_disp=0.5, max_disp=`None` : `float`, optional
        If `n_top_genes` unequals `None`, these cutoffs for the means and the
        normalized dispersions are ignored.
    n_bins : `int` (default: 20)
        Number of bins for binning the mean gene expression. Normalization is
        done with respect to each bin. If just a single gene falls into a bin,
        the normalized dispersion is artificially set to 1. You'll be informed
        about this if you set `settings.verbosity = 4`.
    n_top_genes : `int` or `None` (default: `None`)
        Number of highly-variable genes to keep.
    log : `bool`, optional (default: `True`)
        Use the logarithm of the mean to variance ratio.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    If an AnnData `adata` is passed, returns or updates `adata` depending on \
    `copy`. It filters the `adata` and adds the annotations
    """
    adata = data.copy() if copy else data
    set_initial_size(adata)

    if n_top_genes is not None and adata.n_vars < n_top_genes:
        logg.info('Skip filtering by dispersion since number of variables are '
                  'less than `n_top_genes`')
    else:
        if flavor == 'svr':  # compare strings with ==; `is` only works by chance
            mu = adata.X.mean(0).A1 if issparse(adata.X) else adata.X.mean(0)
            sigma = (np.sqrt(adata.X.multiply(adata.X).mean(0).A1 - mu**2)
                     if issparse(adata.X) else adata.X.std(0))
            log_mu = np.log2(mu)
            log_cv = np.log2(sigma / mu)

            from sklearn.svm import SVR
            clf = SVR(gamma=150. / len(mu))
            clf.fit(log_mu[:, None], log_cv)
            score = log_cv - clf.predict(log_mu[:, None])
            nth_score = np.sort(score)[::-1][n_top_genes]
            adata._inplace_subset_var(score >= nth_score)
        else:
            from scanpy.api.pp import filter_genes_dispersion
            filter_genes_dispersion(adata, flavor=flavor, min_disp=min_disp,
                                    max_disp=max_disp, min_mean=min_mean,
                                    max_mean=max_mean, n_bins=n_bins,
                                    n_top_genes=n_top_genes, log=log)
    return adata if copy else None
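# The 'svr' flavor above regresses log CV on log mean and keeps the genes with
# the largest positive residuals. A standalone sketch of that scoring step on
# a synthetic dense matrix (all names and the gamma divided evenly here are
# taken from the function above; the data itself is made up, and no AnnData
# object is involved):
import numpy as np
from sklearn.svm import SVR

rng = np.random.RandomState(0)
X = rng.gamma(2.0, 1.0, size=(500, 1000))   # cells x genes, synthetic counts

mu = X.mean(0)
sigma = X.std(0)
log_mu = np.log2(mu)
log_cv = np.log2(sigma / mu)

clf = SVR(gamma=150. / len(mu))
clf.fit(log_mu[:, None], log_cv)
score = log_cv - clf.predict(log_mu[:, None])   # residual variability per gene

n_top_genes = 100
nth_score = np.sort(score)[::-1][n_top_genes]
highly_variable = score >= nth_score
print(highly_variable.sum())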
# feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)

# splitting data into train_test_split
"""
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3,
                                                    random_state=0)
"""

# fitting SVR on the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y.ravel())

# predict with the model
# Note: scale the data back to the original representation with inverse_transform
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_X.transform(np.array([6.5]).reshape(1, -1))))

# visualize the SVR
plt.scatter(X, y, c='red')
plt.plot(X, regressor.predict(X), c='green')
plt.title('truth or dare(SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
print(type(y_val))
print(y_val.shape)

# Change value
#x_val = countryGDI
#y_val = countryHDI_f

# Define Regression Function
svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=3)
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
rvm = RVR(kernel='rbf', gamma=1)  ### CHANGE TO RVR

# Proceed regression using Support Vector Regression (SVR)
t1 = time.time()
y_rbf = svr_rbf.fit(x_val, y_val).predict(x_val)
t2 = time.time()
t_svr_rbf = t2 - t1
print('Support Vector Regression with RBF kernel takes {} s'.format(t_svr_rbf))

t1 = time.time()
y_lin = svr_lin.fit(x_val, y_val).predict(x_val)
t2 = time.time()
t_svr_lin = t2 - t1
print('Support Vector Regression with linear kernel takes {} s'.format(
    t_svr_lin))

t1 = time.time()
y_poly = svr_poly.fit(x_val, y_val).predict(x_val)
t2 = time.time()
t_svr_poly = t2 - t1
                   cv=cross_validation)  # set up the grid search
gs_cv.fit(autoscaled_x_train, autoscaled_y_train)  # run grid search + cross-validation
optimal_linear_svr_c = gs_cv.best_params_['C']  # optimal C
optimal_linear_svr_epsilon = gs_cv.best_params_['epsilon']  # optimal epsilon

# check the results
print('Optimized C : {0} (log(C)={1})'.format(
    optimal_linear_svr_c, np.log2(optimal_linear_svr_c)))
print('Optimized epsilon : {0} (log(epsilon)={1})'.format(
    optimal_linear_svr_epsilon, np.log2(optimal_linear_svr_epsilon)))

# build the model
model = SVR(kernel='linear', C=optimal_linear_svr_c,
            epsilon=optimal_linear_svr_epsilon)  # declare the SVR model
model.fit(autoscaled_x_train, autoscaled_y_train)  # fit the model

# standardized regression coefficients
standard_regression_coefficients = pd.DataFrame(
    model.coef_.T, index=x_train.columns,
    columns=['standard_regression_coefficients'])  # convert to a pandas DataFrame
standard_regression_coefficients.to_csv(
    'standard_regression_coefficients_svr_linear.csv'
)  # save to a csv file; note that an existing file with the same name is overwritten

# estimate y for the training data
autoscaled_estimated_y_train = model.predict(autoscaled_x_train)  # estimate y
estimated_y_train = autoscaled_estimated_y_train * y_train.std() \
    + y_train.mean()  # rescale back to the original units
estimated_y_train = pd.DataFrame(estimated_y_train,
test_x = np.reshape(test_x, (3 * 480, 2))
test_y = np.reshape(test_y, (3 * 480, 1))
train_x = test_x
train_y = test_y
"""
train_x.shape : (1261, 3)
train_y.shape : (1261, 1)
test_x.shape : (620, 3)
test_y.shape : (620, 1)
"""

# todo: svr method
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.01)
#svr_lin = SVR(kernel='linear', C=1e3)
#svr_poly = SVR(kernel='poly', C=1e3, degree=2)
y_rbf = svr_rbf.fit(train_x, train_y.ravel()).predict(test_x)  # SVR expects a 1-D target
#y_lin = svr_lin.fit(train_x, train_y).predict(test_x)
#y_poly = svr_poly.fit(train_x, train_y).predict(test_x)

# undo the standardization
#y_rbf = np.reshape(y_rbf, (len(y_rbf), -1))
#y_rbf = scaler.inverse_transform(y_rbf)
#test_y = scaler.inverse_transform(test_y)

plt.plot(y_rbf, label='y_rbf')
#plt.plot(y_lin, label='y_lin')
#plt.plot(y_poly, label='y_poly')
plt.plot(test_y, label='true')
plt.legend(loc='upper right')
plt.show()
mse = mean_squared_error(test_label, predict_r)
sgd_score = np.sqrt(mse)
sgd_score

# cross_val_Stochastic_gradient
sgd = SGDRegressor(penalty='l2', n_iter_no_change=100, alpha=0.05)
score = cross_val_score(sgd, train, train_label, cv=10,
                        scoring='neg_mean_squared_error')
sgd_score_cross = np.sqrt(-score)
np.mean(sgd_score_cross), np.std(sgd_score_cross)

from sklearn.svm import SVR

svm = SVR(epsilon=15, kernel='linear')
svm.fit(train, train_label)
predict_r = svm.predict(test)
mse = mean_squared_error(test_label, predict_r)
svm_score = np.sqrt(mse)
svm_score

# cross_val_SVR
svm = SVR(epsilon=15, kernel='linear')
score = cross_val_score(svm, train, train_label, cv=10,
                        scoring='neg_mean_squared_error')
svm_score_cross = np.sqrt(-score)
np.mean(svm_score_cross), np.std(svm_score_cross)

from sklearn.tree import DecisionTreeRegressor
# import dataset
dataset = pa.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))  # reshape is important

# Regressor
from sklearn.svm import SVR
Regressor = SVR(kernel='rbf')
Regressor.fit(X, y)

y_pred = Regressor.predict(sc_X.transform([[6.5]]))
y_pred = sc_y.inverse_transform(y_pred)  # feature scaling must be undone for the prediction

# plot
X_grid = np.arange(min(X), max(X), 0.1)
X_grid = X_grid.reshape(len(X_grid), 1)
plt.scatter(X, y, color='red')
plt.plot(X_grid, Regressor.predict(X_grid), color='blue')
plt.show()
"""from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)""" # Feature Scaling from sklearn.preprocessing import StandardScaler sc_X = StandardScaler() sc_y = StandardScaler() X = sc_X.fit_transform(X) y = sc_y.fit_transform(y) # Fitting SVR to the dataset from sklearn.svm import SVR regressor = SVR(kernel='rbf') regressor.fit(X, y) #train the SVR machine on the dataset # Predicting a new result #un-scale the salary y_pred = sc_y.inverse_transform( regressor.predict(sc_X.transform(np.array([[6.5]])))) # Visualising the SVR results plt.scatter(X, y, color='red') plt.plot(X, regressor.predict(X), color='blue') plt.title('Truth or Bluff (SVR)') plt.xlabel('Position level') plt.ylabel('Salary') plt.show() # Visualising the Regression results (for higher resolution and smoother curve)
forest_rmse
# forest_rmse = 21933.31414779769

# CrossValueScore_RandomForest
from sklearn.model_selection import cross_val_score
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

# SupportVectorRegression_SVR
from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse
# rmse 111094.6308539982 - not great, a very high rmse

# __________________________________________________________________________________________
# Fine-tune the model - GridSearch RandomForest
# GridSearch: searching for the hyperparameters (instead of manually) using RandomForest
from sklearn.model_selection import GridSearchCV

# These hyperparameters are stated within dictionaries
param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVR


def createSupportVectorMachineModel(mode, ticker, startDate, endDate, days):
    df = mode.createInitialDataFrame(ticker, startDate, endDate)
    df.fillna(value=-99999, inplace=True)

    # Fit one RBF-kernel SVR per indicator column
    df, xTrain, yTrain = mode.trainingData(df, "Close", 1)
    svrClose = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrClose.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "MACD", 1)
    svrMACD = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrMACD.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "WR", 1)
    svrWR = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrWR.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "UO", 1)
    svrUO = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrUO.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "SMA", 1)
    svrSMA = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrSMA.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "ROCP", 1)
    svrROCP = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrROCP.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "ROCV", 1)
    svrROCV = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrROCV.fit(xTrain, yTrain)

    # Roll the models forward one day at a time, appending each prediction
    df = mode.createInitialDataFrame(ticker, startDate, endDate)
    lastDate = df.iloc[-1].name
    predictionData = np.array(df.iloc[-1])
    predictionData = predictionData.reshape(1, len(predictionData))
    for i in range(days):
        lastClosePrediction = svrClose.predict(predictionData)
        lastMACDPrediction = svrMACD.predict(predictionData)
        lastWRPrediction = svrWR.predict(predictionData)
        lastUOPrediction = svrUO.predict(predictionData)
        lastSMAPrediction = svrSMA.predict(predictionData)
        lastROCPPrediction = svrROCP.predict(predictionData)
        lastROCVPrediction = svrROCV.predict(predictionData)
        newDate = lastDate + datetime.timedelta(days=1)
        newRow = pd.Series(data={"Close": float(lastClosePrediction[0]),
                                 "MACD": float(lastMACDPrediction[0]),
                                 # the original wrote lastMACDPrediction here -- a copy-paste bug
                                 "WR": float(lastWRPrediction[0]),
                                 "UO": float(lastUOPrediction[0]),
                                 "SMA": float(lastSMAPrediction[0]),
                                 "ROCP": float(lastROCPPrediction[0]),
                                 "ROCV": float(lastROCVPrediction[0])},
                           name=newDate)
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        df = pd.concat([df, newRow.to_frame().T])
        df, predictionData, svrClose, svrMACD, svrWR, svrUO, svrSMA, \
            svrROCP, svrROCV = mode.updateSVMModel(df)
        lastDate = newDate

    plt.plot(df["Close"][:-days])
    plt.plot(df["Close"][-days - 1:])
    axi = plt.gca()
    axi.xaxis.set_major_locator(plt.MaxNLocator(4))
    axi.tick_params(axis='x', labelsize=8)  # tick.label.set_fontsize is gone in newer matplotlib
    plt.xticks(rotation=30)
    plt.show()
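# A hedged refactoring sketch (an addition, not from the original): the seven
# per-indicator fits above follow one pattern and could be collapsed into a
# dict keyed by column name; the same applies to updateSVMModel below.
# `mode.trainingData` is assumed to behave exactly as in the original.
def fitIndicatorModels(mode, df,
                       columns=("Close", "MACD", "WR", "UO", "SMA", "ROCP", "ROCV")):
    models = {}
    for col in columns:
        df, xTrain, yTrain = mode.trainingData(df, col, 1)
        models[col] = SVR(kernel="rbf", C=1e3, gamma=0.1)
        models[col].fit(xTrain, yTrain)
    return df, models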
# Requires: pandas as pd, matplotlib.pyplot as plt, sklearn's SVR,
# MinMaxScaler and mean_squared_error, plus the project-specific helpers
# get_multifeature_data, diff_multifeature, undifference and create_timeseries.
class SVM(object):

    def __split(self):
        """
        Splits data into training and test data for the SVM
        :return: Xtrain, ytrain, Xtest, ytest
        """
        Xtrain = self.scaledData[:self.startDay]
        ytrain = pd.DataFrame(self.scaledData[1:self.startDay + 1]).iloc[:, 4]
        Xtest = self.scaledData[self.startDay:-1]
        ytest = pd.DataFrame(self.scaledData[self.startDay + 1:]).iloc[:, 4]
        return Xtrain, ytrain.values, Xtest, ytest.values

    def __init__(self, C, epsilon, ticker, manager, startDay, kernel='rbf'):
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.model = SVR(C=C, epsilon=epsilon, kernel=kernel)
        self.ticker = ticker
        self.manager = manager
        self.startDay = startDay
        self.data = get_multifeature_data(manager, ticker)
        self.stationaryData = diff_multifeature(self.data)
        self.scaledData = self.scale(self.stationaryData)
        self.Xtrain, self.ytrain, self.Xtest, self.ytest = self.__split()
        self.raw_prices = list(self.data['vwap'])

    def scale(self, df):
        """
        Normalizes the data between 0 and 1
        :param df: dataframe
        :return: scaled dataframe
        """
        values = df.values
        scaled = self.scaler.fit_transform(values)
        return scaled

    def unscale(self, series):
        """
        Unnormalizes the data from the output
        :param series: series of scaled points
        :return: unscaled series
        """
        # Pad the single predicted column back into a 6-column frame so the
        # fitted MinMaxScaler can invert it, then keep only column 4.
        padded = pd.DataFrame()
        reshaped = series.reshape(1, len(series))[0]
        for i in range(4):
            padded[i] = [0 for j in range(len(series))]
        padded['unscaled'] = reshaped
        padded[5] = [0 for j in range(len(series))]
        unscaled = pd.DataFrame(self.scaler.inverse_transform(padded.values))
        unscaled = unscaled.iloc[:, 4]
        return list(unscaled)

    def test_and_error(self):
        """
        Used in development for deciding the architecture
        :return: None
        """
        self.fit()
        raw_predictions = self.model.predict(self.Xtest)
        unscaled_predictions = self.unscale(raw_predictions)
        predictions = undifference(self.data.iloc[self.startDay, 4],
                                   unscaled_predictions)
        print(mean_squared_error(self.ytest, raw_predictions))
        days = create_timeseries(self.manager, self.ticker)[1]
        days = [days[x] for x in range(0, len(days), 2)]
        actual = list(self.data['vwap'])
        plt.plot(days, actual, color='black', label='Actual')
        plt.plot(days[self.startDay + 3:], predictions[1:], color='red',
                 label='SVM predictions')  # the original label said 'LSTM', a leftover
        plt.xlabel('day')
        plt.title(self.ticker)
        plt.ylabel('price')
        plt.legend(loc=2)
        plt.savefig('plots/SVM/SVM_{0}_predictions.pdf'.format(self.ticker))
        plt.show()

    def fit(self):
        """
        Trains the model
        :return: None
        """
        self.model.fit(self.Xtrain, self.ytrain)

    def predict(self, D):
        """
        Predicts the next price
        :param D: day index
        :return: prediction
        """
        d = D - len(self.Xtrain) - 1
        if d == -1:
            x = self.Xtrain[len(self.Xtrain) - 1].reshape(1, 6)
        else:
            x = self.Xtest[d].reshape(1, 6)
        previousPrice = self.raw_prices[D - 1]
        diff_pred = self.unscale(self.model.predict(x))
        prediction = previousPrice + diff_pred[0]
        return prediction
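# Hypothetical usage sketch (the ticker, hyperparameters, and `manager` object
# below are placeholders, not values from the source):
# model = SVM(C=10, epsilon=0.01, ticker='AAPL', manager=manager, startDay=200)
# model.fit()
# print(model.predict(201))  # one-step-ahead price prediction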
import numpy as np
from sklearn.svm import SVR


def updateSVMModel(mode, df):
    # Refit one RBF-kernel SVR per indicator column on the extended frame
    df, xTrain, yTrain = mode.trainingData(df, "Close", 1)
    svrClose = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrClose.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "MACD", 1)
    svrMACD = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrMACD.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "WR", 1)
    svrWR = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrWR.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "UO", 1)
    svrUO = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrUO.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "SMA", 1)
    svrSMA = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrSMA.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "ROCP", 1)
    svrROCP = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrROCP.fit(xTrain, yTrain)
    df, xTrain, yTrain = mode.trainingData(df, "ROCV", 1)
    svrROCV = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svrROCV.fit(xTrain, yTrain)

    if "result" in df.columns:
        df = df.drop(["result"], axis=1)  # the positional axis argument is deprecated
    predictionData = np.array(df.iloc[-1])
    predictionData = predictionData.reshape(1, len(predictionData))
    return df, predictionData, svrClose, svrMACD, svrWR, svrUO, svrSMA, \
        svrROCP, svrROCV
import numpy as np
from scipy.stats import ttest_rel
from sklearn.svm import SVR


def run_particular(arg):
    # `IBP` and the training_*/tsb_* arrays are module-level globals from the
    # surrounding project.
    i, cat = arg
    ibp = IBP(enable_cluster=False)
    ibp.fit(training_votes[2], training_votes[1])
    y_per = training_votes[1] / training_votes[2].astype(float)
    y_ibp = ibp(training_votes[2], training_votes[1])

    # Keep only the rows whose i-th category equals `cat`
    X = np.array([x for c, x in zip(training_cats[:, i], training_x) if c == cat])
    X_tsb = np.array([x for c, x in zip(tsb_cats[:, i], tsb_x) if c == cat])
    y_tsb = np.array([y for c, y in zip(tsb_cats[:, i], tsb_truth) if c == cat])

    clf_per = SVR(C=1, gamma=0.001)
    clf_ibp = SVR(C=1000, gamma=0.0001)
    clf_per.fit(X, np.array([y for c, y in zip(training_cats[:, i], y_per) if c == cat]))
    clf_ibp.fit(X, np.array([y for c, y in zip(training_cats[:, i], y_ibp) if c == cat]))

    tsb_y_hat_per = clf_per.predict(X_tsb)
    tsb_y_hat_ibp = clf_ibp.predict(X_tsb)

    mse_tsb_per = ((tsb_y_hat_per - y_tsb) ** 2).mean()
    mae_tsb_per = abs(tsb_y_hat_per - y_tsb).mean()
    rmse_tsb_per = mse_tsb_per ** 0.5
    mse_tsb_ibp = ((tsb_y_hat_ibp - y_tsb) ** 2).mean()
    mae_tsb_ibp = abs(tsb_y_hat_ibp - y_tsb).mean()
    rmse_tsb_ibp = mse_tsb_ibp ** 0.5

    print(2 ** (i + 1), cat, 'tsb',
          (training_cats[:, i] == cat).astype(int).sum(),
          (tsb_cats[:, i] == cat).astype(int).sum(),
          mse_tsb_per, mse_tsb_ibp, (mse_tsb_per - mse_tsb_ibp) / mse_tsb_per,
          mae_tsb_per, mae_tsb_ibp, (mae_tsb_per - mae_tsb_ibp) / mae_tsb_per,
          rmse_tsb_per, rmse_tsb_ibp, (rmse_tsb_per - rmse_tsb_ibp) / rmse_tsb_per)
    return [[
        2 ** (i + 1), cat, 'tsb',
        (training_cats[:, i] == cat).astype(int).sum(),
        (tsb_cats[:, i] == cat).astype(int).sum(),
        mse_tsb_per, mse_tsb_ibp, (mse_tsb_per - mse_tsb_ibp) / mse_tsb_per,
        mae_tsb_per, mae_tsb_ibp, (mae_tsb_per - mae_tsb_ibp) / mae_tsb_per,
        rmse_tsb_per, rmse_tsb_ibp, (rmse_tsb_per - rmse_tsb_ibp) / rmse_tsb_per,
        ttest_rel(tsb_y_hat_per, tsb_y_hat_ibp).pvalue,
    ]]
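# A small hedged helper (an addition, not in the original) that would collapse
# the duplicated error arithmetic above into one call per model:
def error_triple(y_hat, y_true):
    """Return (mse, mae, rmse) for a pair of prediction/truth arrays."""
    mse = ((y_hat - y_true) ** 2).mean()
    mae = abs(y_hat - y_true).mean()
    return mse, mae, mse ** 0.5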
import numpy as np
from sklearn.svm import SVR


def analyze(data, label, num_folds):
    # Partition data into folds
    n = len(data) // num_folds
    data_folds = [data[i:i + n] for i in range(0, len(data), n)]
    label_folds = [label[i:i + n] for i in range(0, len(label), n)]

    lin_reg_error = 0
    cs = [4 ** c for c in range(-10, 0, 1)]
    svm_error = [0] * len(cs)
    svm_std = [0] * len(cs)

    # The cross-validation sweep below was left commented out. Note two bugs
    # in the original draft: the linear model was fit on the full data rather
    # than train_data, and an early `return model` cut the loop short.
    # for i in range(0, num_folds):
    #     test_data = data_folds[i]
    #     test_label = label_folds[i]
    #     train_data = []
    #     train_label = []
    #     for j in range(num_folds):
    #         if i != j:
    #             train_data += data_folds[j]
    #             train_label += label_folds[j]
    #     model = linear_model.LinearRegression()
    #     model.fit(train_data, train_label)
    #     lin_reg_error += np.mean(abs(model.predict(test_data) - test_label))
    #
    #     for i2 in range(len(cs)):
    #         svm_classifier = SVR(gamma=cs[i2])
    #         svm_classifier.fit(train_data, train_label)
    #         svm_error[i2] += np.mean(abs(svm_classifier.predict(test_data) - test_label))
    #         svm_std[i2] += np.std(abs(svm_classifier.predict(test_data) - test_label))

    # Final model: a single SVR with a gamma picked from the sweep above
    svm_c = SVR(gamma=4 ** -7)
    svm_c.fit(data, label)
    return svm_c
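# Hypothetical usage sketch on synthetic data (an illustration, not from the
# source): fit the returned SVR on a noisy linear target.
rng = np.random.default_rng(0)
demo_data = rng.uniform(-1, 1, size=(200, 3))
demo_label = demo_data @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=200)
model = analyze(demo_data, demo_label, num_folds=5)
print(model.predict(demo_data[:5]))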